Replace KanjiFinder with a better asset bundle string searcher
This commit is contained in:
121
scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift
Normal file
121
scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Handle used for diagnostics. Declared `var` so it can be passed as
/// `&standardError` to `print(_:to:)`.
var standardError = FileHandle.standardError

/// Allows a `FileHandle` to be the target of `print(_:to:)`.
extension FileHandle : TextOutputStream {
    /// Encodes `string` as UTF-8 and writes the resulting bytes to this handle.
    /// Nothing is written if the string cannot be encoded.
    public func write(_ string: String) {
        if let encoded = string.data(using: .utf8) {
            self.write(encoded)
        }
    }
}
|
||||||
|
|
||||||
|
// With no arguments besides the program name there is nothing to scan:
// print the usage text to stderr and exit with a failure status.
if CommandLine.arguments.count <= 1 {
    let programName = CommandLine.arguments[0]
    print("""
        Usage: \(programName) assetBundle1.assets [assetBundle2.assets ...]
        Use - to read from stdin
        Finds strings in Unity asset bundles
        """, to: &standardError)
    exit(EXIT_FAILURE)
}
|
||||||
|
|
||||||
|
#if !swift(>=4.2)
// Back-port for compilers older than Swift 4.2: forwards to `index(where:)`.
extension Collection {
    /// Returns the index of the first element for which `predicate` returns `true`,
    /// or `nil` if no element matches.
    func firstIndex(where predicate: (Element) throws -> Bool) rethrows -> Index? {
        let match = try index(where: predicate)
        return match
    }
}
#endif
|
||||||
|
|
||||||
|
/// Input paths: every command-line argument after the program name.
let inFiles: [String] = Array(CommandLine.arguments.dropFirst())
|
||||||
|
|
||||||
|
extension Collection {
    /// Accesses the element `offset` positions after `startIndex`.
    ///
    /// Unlike a plain integer subscript, this is independent of the collection's
    /// own index values, so it works on slices whose `startIndex` is not zero.
    subscript(offset offset: Int) -> Element {
        return self[index(startIndex, offsetBy: offset)]
    }
}
|
||||||
|
|
||||||
|
/// Returns `true` when `byte` has the bit pattern `10xxxxxx`,
/// i.e. it lies in `0x80...0xBF`.
func isUTF8Continuation(_ byte: UInt8) -> Bool {
    return (0x80...0xBF).contains(byte)
}
|
||||||
|
|
||||||
|
/// Checks whether `data` is well-formed UTF-8 that is also free of most ASCII
/// control characters.
///
/// Decoding is delegated to the standard library's `UTF8` codec, so truncated
/// sequences, stray continuation bytes, overlong encodings, encoded surrogates
/// and values above U+10FFFF are all rejected rather than being accepted here
/// and later replaced with U+FFFD by `String(decoding:as:)`.
///
/// - Parameter data: The candidate bytes.
/// - Returns: `true` if every byte sequence decodes to a Unicode scalar and no
///   scalar is in `0x00...0x08` or `0x0E...0x1F`. The characters in
///   `0x09...0x0D` (tab, line feed, vertical tab, form feed, carriage return)
///   are allowed. Empty input is considered valid.
func isValidUTF8<C: Collection>(data: C) -> Bool where C.Element == UInt8 {
    var decoder = UTF8()
    var bytes = data.makeIterator()
    while true {
        switch decoder.decode(&bytes) {
        case .emptyInput:
            // Consumed everything without hitting a problem.
            return true
        case .error:
            // Ill-formed sequence.
            return false
        case .scalarValue(let scalar):
            // Strings shouldn't contain ASCII control chars
            let value = scalar.value
            if value <= 8 || (0xe...0x1f).contains(value) {
                return false
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// Heuristically extracts length-prefixed UTF-8 strings from `data`.
///
/// Every 4-byte-aligned offset is treated as a possible little-endian `UInt32`
/// length prefix. A candidate is accepted when:
///  * the length is greater than 1 and less than `maxStringLength`,
///  * the string plus its zero padding up to the next 4-byte boundary fits in
///    the remaining data,
///  * every padding byte is zero, and
///  * the string bytes pass `isValidUTF8`.
///
/// Scanning continues at the next 4-byte offset regardless of whether a string
/// was found, so overlapping candidates can be reported.
///
/// - Parameters:
///   - data: The bytes to scan. Its size must be a multiple of 4; otherwise the
///     process is terminated with `fatalError`.
///   - maxStringLength: Exclusive upper bound on the byte length of reported strings.
/// - Returns: The strings found, in the order of their length prefixes.
func stringFinder(data: Data, maxStringLength: Int = 100) -> [String] {
    guard data.count % 4 == 0 else { fatalError("Input file wasn't 4-byte aligned, it's probably not an asset bundle") }
    return data.withUnsafeBytes { (ptr: UnsafePointer<UInt8>) -> [String] in
        // Work on plain bytes so nothing depends on the storage being aligned for UInt32.
        let bytes = UnsafeBufferPointer(start: ptr, count: data.count)
        var out: [String] = []
        for prefixOffset in stride(from: 0, to: bytes.count, by: 4) {
            // Assemble the little-endian length prefix.
            var rawLength: UInt32 = 0
            for byteOffset in (0..<4).reversed() {
                rawLength = (rawLength << 8) | UInt32(bytes[prefixOffset + byteOffset])
            }
            guard rawLength > 1 && rawLength < maxStringLength else {
                continue
            }
            let length = Int(rawLength)
            let stringStart = prefixOffset + 4
            let stringEnd = stringStart + length
            // The string is followed by zero bytes up to the next 4-byte boundary
            // (none at all when the length is already a multiple of 4).
            let paddedEnd = stringStart + (length + 3) / 4 * 4
            guard paddedEnd <= bytes.count else { continue }
            // Ensure padding is all 0s
            guard !bytes[stringEnd..<paddedEnd].contains(where: { $0 != 0 }) else { continue }
            let candidate = bytes[stringStart..<stringEnd]
            guard isValidUTF8(data: candidate) else { continue }
            out.append(String(decoding: candidate, as: UTF8.self))
        }
        return out
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Entry point: print every string found in each input, one string per line.
// A lone "-" argument means "read the bundle from stdin".
if inFiles == ["-"] {
    print(stringFinder(data: FileHandle.standardInput.readDataToEndOfFile()).joined(separator: "\n"))
} else {
    // Check every path before producing any output, so a typo in a later
    // argument doesn't leave partial results on stdout.
    for path in inFiles where !FileManager.default.fileExists(atPath: path) {
        print("No file exists at \(path)", to: &standardError)
        exit(EXIT_FAILURE)
    }
    for path in inFiles {
        do {
            let data = try Data(contentsOf: URL(fileURLWithPath: path))
            print(stringFinder(data: data).joined(separator: "\n"))
        } catch {
            // Report read failures like other user errors instead of crashing
            // with an uncaught top-level error.
            print("Couldn't read \(path): \(error)", to: &standardError)
            exit(EXIT_FAILURE)
        }
    }
}
|
||||||
@@ -1,90 +0,0 @@
|
|||||||
import Foundation
|
|
||||||
|
|
||||||
/// Handle used for diagnostics. Declared `var` so it can be passed as
/// `&standardError` to `print(_:to:)`.
var standardError = FileHandle.standardError

/// Allows a `FileHandle` to be the target of `print(_:to:)`.
extension FileHandle : TextOutputStream {
    /// Encodes `string` as UTF-8 and writes the resulting bytes to this handle.
    /// Nothing is written if the string cannot be encoded.
    public func write(_ string: String) {
        if let encoded = string.data(using: .utf8) {
            self.write(encoded)
        }
    }
}
|
|
||||||
|
|
||||||
// With no arguments besides the program name there is nothing to scan:
// print the usage text to stderr and exit with a failure status.
if CommandLine.arguments.count <= 1 {
    let programName = CommandLine.arguments[0]
    print("""
        Usage: \(programName) [-filter filterFile.txt] assetBundle1.assets [assetBundle2.assets ...]
        Use - to read from stdin
        Finds 3-byte unicode characters (like kanji) in files
        If a filter is supplied, only characters also in the filter will be outputted
        """, to: &standardError)
    exit(EXIT_FAILURE)
}
|
|
||||||
|
|
||||||
#if !swift(>=4.2)
// Back-port for compilers older than Swift 4.2: forwards to `index(where:)`.
extension Collection {
    /// Returns the index of the first element for which `predicate` returns `true`,
    /// or `nil` if no element matches.
    func firstIndex(where predicate: (Element) throws -> Bool) rethrows -> Index? {
        let match = try index(where: predicate)
        return match
    }
}
#endif
|
|
||||||
|
|
||||||
/// Contents of the file named after `-filter`, or `nil` when no filter was given.
var filter: String? = nil
/// Command-line arguments after the program name; the `-filter <file>` pair is
/// removed below, leaving only the inputs to scan.
var inFiles: [String] = Array(CommandLine.arguments.dropFirst())

// Extract `-filter <file>` (flag matched case-insensitively). A `-filter` that
// is the last argument has no file after it and is left in `inFiles` untouched.
if let flagIndex = inFiles.firstIndex(where: { $0.lowercased() == "-filter" }) {
    let fileIndex = flagIndex + 1
    if fileIndex < inFiles.endIndex {
        filter = try String(contentsOf: URL(fileURLWithPath: inFiles[fileIndex]))
        inFiles.removeSubrange(flagIndex...fileIndex)
    }
}

/// Raw contents of every input: stdin when the only argument is "-",
/// otherwise each named file in order.
let bundles: [Data]
if inFiles.count == 1 && inFiles[0] == "-" {
    bundles = [FileHandle.standardInput.readDataToEndOfFile()]
} else {
    bundles = try inFiles.map { path in
        try Data(contentsOf: URL(fileURLWithPath: path))
    }
}
|
|
||||||
|
|
||||||
extension UTF8.CodeUnit {
    /// `true` when the top four bits are `1110`, the lead-byte pattern of a
    /// three-byte UTF-8 sequence.
    var isStart3: Bool {
        return self >> 4 == 0b1110
    }
    /// `true` when the top two bits are `10`, the pattern of a UTF-8
    /// continuation byte.
    var isContinuation: Bool {
        return self >> 6 == 0b10
    }
}
|
|
||||||
|
|
||||||
/// Collects every run of at least `minLength` consecutive three-byte UTF-8
/// sequences found in `data` and returns them concatenated as one string.
///
/// A three-byte sequence is recognised purely by bit pattern: a `1110xxxx`
/// lead byte followed by two `10xxxxxx` continuation bytes. Runs may start at
/// any byte offset. A lead byte too close to the end of `data` to have both
/// continuation bytes is not part of a run.
///
/// - Parameters:
///   - data: The bytes to scan.
///   - minLength: Minimum number of consecutive three-byte sequences a run
///     needs in order to be included.
/// - Returns: The bytes of all qualifying runs, in order, decoded as UTF-8.
func unicodeFinder(data: [UInt8], minLength: Int = 2) -> String {
    // Whether a complete three-byte sequence starts at `index`.
    func isThreeByteSequence(at index: Int) -> Bool {
        guard index + 2 < data.endIndex else { return false }
        let lead = data[index] & 0b1111_0000 == 0b1110_0000
        let first = data[index + 1] & 0b1100_0000 == 0b1000_0000
        let second = data[index + 2] & 0b1100_0000 == 0b1000_0000
        return lead && first && second
    }

    var out = [UInt8]()
    var position = data.startIndex
    while position < data.endIndex {
        // Extend the run for as long as whole three-byte sequences follow.
        var runEnd = position
        while isThreeByteSequence(at: runEnd) {
            runEnd += 3
        }
        guard runEnd > position else {
            // No sequence starts here; try the next byte.
            position += 1
            continue
        }
        if (runEnd - position) / 3 >= minLength {
            out.append(contentsOf: data[position..<runEnd])
        }
        // Always advance past the run, so the scan terminates even when the
        // run reaches the end of the data.
        position = runEnd
    }
    return String(decoding: out, as: UTF8.self)
}
|
|
||||||
|
|
||||||
// Extract the three-byte-character runs from each bundle.
let unicodeStrings = bundles.map { bundle in unicodeFinder(data: Array(bundle)) }

// Gather the distinct Unicode scalars seen across all bundles.
var chars = Set<Unicode.Scalar>()
for found in unicodeStrings {
    chars.formUnion(found.unicodeScalars)
}

// When a filter was supplied, keep only the scalars that also occur in it.
if let allowed = filter {
    chars.formIntersection(allowed.unicodeScalars)
}

// Emit the scalars in sorted order as a single string, with no trailing newline.
print(String(chars.sorted().lazy.map(Character.init)), terminator: "")
|
|
||||||
Reference in New Issue
Block a user