From b7305d85883e44fa16fcb099448388125577c021 Mon Sep 17 00:00:00 2001 From: Tellow Krinkle Date: Fri, 14 Sep 2018 04:58:40 -0500 Subject: [PATCH] Replace KanjiFinder with a better asset bundle string searcher --- .../AssetBundleStringExtractor.swift | 121 ++++++++++++++++++ .../CharacterInfoExtraction/KanjiFinder.swift | 90 ------------- 2 files changed, 121 insertions(+), 90 deletions(-) create mode 100644 scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift delete mode 100644 scripts/CharacterInfoExtraction/KanjiFinder.swift diff --git a/scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift b/scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift new file mode 100644 index 0000000..b2a659c --- /dev/null +++ b/scripts/CharacterInfoExtraction/AssetBundleStringExtractor.swift @@ -0,0 +1,121 @@ +import Foundation + +var standardError = FileHandle.standardError + +extension FileHandle : TextOutputStream { + public func write(_ string: String) { + guard let data = string.data(using: .utf8) else { return } + self.write(data) + } +} + +guard CommandLine.arguments.count > 1 else { + print(""" + Usage: \(CommandLine.arguments[0]) assetBundle1.assets [assetBundle2.assets ...] + Use - to read from stdin + Finds strings in Unity asset bundles + """, to: &standardError) + exit(EXIT_FAILURE) +} + +#if !swift(>=4.2) +extension Collection { +func firstIndex(where predicate: (Element) throws -> Bool) rethrows -> Index? { +return try self.index(where: predicate) +} +} +#endif + +let inFiles: [String] = Array(CommandLine.arguments[1...]) + +extension Collection { + subscript(offset offset: Int) -> Element { + get { + let i = index(startIndex, offsetBy: offset) + return self[i] + } + } +} + +func isUTF8Continuation(_ byte: UInt8) -> Bool { + return byte & 0b1100_0000 == 0b1000_0000 +} + +func isValidUTF8(data: C) -> Bool where C.Element == UInt8 { + var slice = data[...] + while !slice.isEmpty { + let byte = slice.first! + slice = slice.dropFirst() + if byte & 0b1000_0000 == 0 { // Single byte utf-8 + // Strings shouldn't contain ASCII controll chars + guard byte > 8 else { return false } + guard !(0xe...0x1f).contains(byte) else { return false } + continue + } + else if byte & 0b1110_0000 == 0b1100_0000 { // Two byte utf-8 + guard let cont = slice.first else { return false } + guard isUTF8Continuation(cont) else { return false } + slice = slice.dropFirst() + } + else if byte & 0b1111_0000 == 0b1110_0000 { // Three byte utf-8 + guard slice.count > 1 else { return false } + guard isUTF8Continuation(slice[offset: 0]) else { return false } + guard isUTF8Continuation(slice[offset: 1]) else { return false } + slice = slice.dropFirst(2) + } + else if byte & 0b1111_1000 == 0b1111_0000 { // Four byte utf-8 + guard slice.count > 2 else { return false } + guard isUTF8Continuation(slice[offset: 0]) else { return false } + guard isUTF8Continuation(slice[offset: 1]) else { return false } + guard isUTF8Continuation(slice[offset: 2]) else { return false } + slice = slice.dropFirst(3) + } + else { + return false + } + } + return true +} + +func stringFinder(data: Data, maxStringLength: Int = 100) -> [String] { + return data.withUnsafeBytes { (ptr: UnsafePointer) -> [String] in + let buffer = UnsafeRawBufferPointer(start: UnsafeRawPointer(ptr), count: data.count) + guard data.count % 4 == 0 else { fatalError("Input file wasn't 4-byte aligned, it's probably not an asset bundle") } + var out: [String] = [] + let ints = buffer.bindMemory(to: UInt32.self) + for (index, int) in ints.lazy.map({ $0.littleEndian }).enumerated() { + guard int > 1 && int < maxStringLength && int < ((ints.count - index - 1) * 4) else { + continue + } + let uint32Length = Int((int &+ 3) / 4) + let padding = (4 - (int % 4)) + let unicode = UnsafeBufferPointer(rebasing: ints[(index + 1)...].prefix(uint32Length)) + // Ensure padding is all 0s + guard unicode.last!.littleEndian &>> (padding * 8) == 0 else { continue } + let optionalStr = unicode.withMemoryRebound(to: UInt8.self) { (unicode) -> String? in + let stringUnicode = unicode[.. 1 else { - print(""" - Usage: \(CommandLine.arguments[0]) [-filter filterFile.txt] assetBundle1.assets [assetBundle2.assets ...] - Use - to read from stdin - Finds 3-byte unicode characters (like kanji) in files - If a filter is supplied, only characters also in the filter will be outputted - """, to: &standardError) - exit(EXIT_FAILURE) -} - -#if !swift(>=4.2) -extension Collection { - func firstIndex(where predicate: (Element) throws -> Bool) rethrows -> Index? { - return try self.index(where: predicate) - } -} -#endif - -var filter: String? = nil -var inFiles: [String] = Array(CommandLine.arguments[1...]) - -if let filterIndex = inFiles.firstIndex(where: { $0.lowercased() == "-filter" }) { - if filterIndex + 1 < inFiles.endIndex { - filter = try String(contentsOf: URL(fileURLWithPath: inFiles[filterIndex + 1])) - inFiles[filterIndex...filterIndex+1] = [] - } -} - -let bundles: [Data] -if inFiles == ["-"] { - bundles = [FileHandle.standardInput.readDataToEndOfFile()] -} else { - bundles = try inFiles.map { try Data(contentsOf: URL(fileURLWithPath: $0)) } -} - -extension UTF8.CodeUnit { - var isStart3: Bool { - return self & 0b11110000 == 0b11100000 - } - var isContinuation: Bool { - return self & 0b11000000 == 0b10000000 - } -} - -func unicodeFinder(data: [UInt8], minLength: Int = 2) -> String { - var out = [UInt8]() - var left = data[...] - while true { - guard let index = left.firstIndex(where: { ($0 & 0b11110000) == 0b11100000 }) else { break } - left = left[index...] - guard left.count > 5 else { break } - var good = 0 - for i in stride(from: left.startIndex, to: left.endIndex, by: 3) { - if left[i].isStart3 && left[i+1].isContinuation && left[i+2].isContinuation { - good += 1 - } - else { - if good >= minLength { - out.append(contentsOf: left[..= minLength { - out.append(contentsOf: left.prefix(left.count / 3 * 3)) - } - } - return String(decoding: out, as: UTF8.self) -} - -let unicodeStrings = bundles.map({ unicodeFinder(data: Array($0)) }) -var chars = unicodeStrings.map({ Set($0.unicodeScalars) }).reduce(Set(), { $0.union($1) }) -if let filter = filter { - chars.formIntersection(filter.unicodeScalars) -} - -print(String(chars.sorted().lazy.map(Character.init)), terminator: "")