/// Associativity of a binary operator.
enum Associativity {
    case left, right
}

/// Names of the built-in scalar types.
var typeNames: Set<String> = ["bool", "char", "short", "int", "long", "float", "double"]

/// Groups of type names; two names are convertible only when one group contains both.
var convertibleTypes: [Set<String>] = [["bool", "char", "short", "int", "long"], ["float", "double"]]

/// Reports whether `left` and `right` belong to the same group in `convertibleTypes`.
///
/// - Parameters:
///   - left: Name of the first type.
///   - right: Name of the second type.
/// - Returns: `true` when a single group contains both names; `false` otherwise,
///   including when either name is unknown.
func isConvertible(left: String, right: String) -> Bool {
    return convertibleTypes.contains { $0.contains(left) && $0.contains(right) }
}

/// The type of an operand or result in the operator tables below.
enum Type: CustomStringConvertible {
    case any, noType, specific(String)

    /// The type name for `.specific`, `"None"` for `.noType`, `"Any"` for `.any`.
    var description: String {
        switch self {
        case .specific(let name):
            return name
        case .noType:
            return "None"
        case .any:
            return "Any"
        }
    }
}

/// Binary operators grouped under an integer key together with their associativity.
/// NOTE(review): the keys look like precedence levels (larger binds tighter) — confirm against the parser.
let binaryOperators: [Int: (Associativity, Set<String>)] = [
    160: (.left, ["<<", ">>"]),
    150: (.left, ["*", "/", "%", "&"]),
    140: (.left, ["+", "-", "|", "^"]),
    130: (.left, ["<", "<=", ">", ">=", "==", "!="]),
    120: (.left, ["&&"]),
    110: (.left, ["||"])
]

// Operand/result signatures shared by the entries of `binaryOperatorTypes`.
let allIntegersBinaryOperator: [(left: Type, right: Type, out: Type)] = [
    (left: .specific("int"), right: .specific("int"), out: .specific("int"))
]
let allNumbersBinaryOperator: [(left: Type, right: Type, out: Type)] = [
    (left: .specific("int"), right: .specific("int"), out: .specific("int")),
    (left: .specific("double"), right: .specific("double"), out: .specific("double")),
    (left: .specific("float"), right: .specific("float"), out: .specific("float"))
]
let allBooleansBinaryOperator: [(left: Type, right: Type, out: Type)] = [
    (left: .specific("bool"), right: .specific("bool"), out: .specific("bool"))
]
let comparisonBinaryOperator: [(left: Type, right: Type, out: Type)] = [
    (left: .any, right: .any, out: .specific("bool"))
]

/// Accepted operand/result signatures for every operator listed in `binaryOperators`.
let binaryOperatorTypes: [String: [(left: Type, right: Type, out: Type)]] = [
    "<<": allIntegersBinaryOperator,
    ">>": allIntegersBinaryOperator,
    "*": allNumbersBinaryOperator,
    "/": allNumbersBinaryOperator,
    "%": allIntegersBinaryOperator,
    "&": allIntegersBinaryOperator,
    "+": allNumbersBinaryOperator,
    "-": allNumbersBinaryOperator,
    "|": allIntegersBinaryOperator,
    "^": allIntegersBinaryOperator,
    "<": comparisonBinaryOperator,
    "<=": comparisonBinaryOperator,
    ">": comparisonBinaryOperator,
    ">=": comparisonBinaryOperator,
    "==": comparisonBinaryOperator,
    "!=": comparisonBinaryOperator,
    "&&": allBooleansBinaryOperator,
    "||": allBooleansBinaryOperator
]

/// Every assignment operator spelling.
let assignmentOperators: Set<String> = ["=", "*=", "/=", "%=", "+=", "-=", "<<=", ">>=", "&=", "^=", "|="]

/// Types each assignment operator accepts.
let assignmentOperatorTypes: [String: [Type]] = [
    "=": [.any],
    "*=": [.specific("int"), .specific("double"), .specific("float")],
    "/=": [.specific("int"), .specific("double"), .specific("float")],
    "%=": [.specific("int")],
    "+=": [.specific("int"), .specific("double"), .specific("float")],
    "-=": [.specific("int"), .specific("double"), .specific("float")],
    "<<=": [.specific("int")],
    ">>=": [.specific("int")],
    "&=": [.specific("int")],
    "^=": [.specific("int")],
    "|=": [.specific("int")]
]

let prefixOperators: Set<String> = ["!", "~", "++", "--", "+", "-"]
let postfixOperators: Set<String> = ["++", "--"]

// Operand/result signatures shared by the entries of `unaryOperatorTypes`.
let allIntegersUnaryOperator: [(in: Type, out: Type)] = [
    (in: .specific("int"), out: .specific("int"))
]
let allNumbersUnaryOperator: [(in: Type, out: Type)] = [
    (in: .specific("int"), out: .specific("int")),
    (in: .specific("double"), out: .specific("double")),
    (in: .specific("float"), out: .specific("float"))
]
let allBooleansUnaryOperator: [(in: Type, out: Type)] = [
    (in: .specific("bool"), out: .specific("bool"))
]

/// Accepted operand/result signatures for every prefix and postfix operator.
let unaryOperatorTypes: [String: [(in: Type, out: Type)]] = [
    "!": allBooleansUnaryOperator,
    "~": allIntegersUnaryOperator,
    "++": allNumbersUnaryOperator,
    "--": allNumbersUnaryOperator,
    "+": allNumbersUnaryOperator,
    "-": allNumbersUnaryOperator
]

let otherPunctuation: Set<String> = ["(", ")", "{", "}", "[", "]", ";", ",", "."]
let commentPunctuation: Set<String> = ["//", "/*"]

/// Every multi- or single-scalar punctuation spelling: operators, brackets,
/// separators and comment openers.
let allPunctuation: Set<String> = prefixOperators
    .union(postfixOperators)
    .union(assignmentOperators)
    .union(binaryOperators.values.flatMap({ $0.1 }))
    .union(otherPunctuation)
    .union(commentPunctuation)

/// Every unicode scalar that occurs in some entry of `allPunctuation`.
let punctuationCharacters: Set<UnicodeScalar> = Set(allPunctuation.flatMap({ $0.unicodeScalars }))

/// Length, in characters, of the longest entry of `allPunctuation` (0 if the set were empty).
let longestPunctuation: Int = allPunctuation.map({ $0.count }).max() ?? 0

/// Punctuation scalars plus the ASCII digits.
let nonIdentifierCharacters: Set<UnicodeScalar> = punctuationCharacters.union(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
/// A window `[back, front)` over the unicode scalars of a string that also
/// tracks the 1-based row and column of both ends.
///
/// Moving either end past the start or the end of the string is a no-op, so
/// callers may over-advance without trapping. `back` is expected to stay at or
/// before `front`; `currentRange` requires it.
struct MovingStringRange {
    let string: String.UnicodeScalarView
    private(set) var back: String.UnicodeScalarIndex
    private(set) var front: String.UnicodeScalarIndex
    /// Number of scalars between `back` and `front`.
    private(set) var length: Int
    private(set) var backRow: Int
    private(set) var backColumn: Int
    private(set) var frontRow: Int
    private(set) var frontColumn: Int

    /// Creates an empty window at the start of `string`, or at its end when `atEnd` is true.
    init(_ string: String.UnicodeScalarView, atEnd: Bool = false) {
        self.string = string
        self.length = 0
        self.back = atEnd ? string.endIndex : string.startIndex
        self.front = back
        var row = 1
        var column = 1
        if atEnd {
            // Rows and columns must describe the real end position so that
            // stepping backwards from it stays consistent.
            for scalar in string where scalar == "\n" {
                row += 1
            }
            column = MovingStringRange.column(of: string.endIndex, in: string)
        }
        self.backRow = row
        self.backColumn = column
        self.frontRow = row
        self.frontColumn = column
    }

    init(_ string: String, atEnd: Bool = false) {
        self.init(string.unicodeScalars, atEnd: atEnd)
    }

    /// 1-based column of `index`: one more than the number of scalars between
    /// the preceding "\n" (or the start of the string) and `index`.
    private static func column(of index: String.UnicodeScalarIndex, in string: String.UnicodeScalarView) -> Int {
        var column = 1
        var cursor = index
        while cursor > string.startIndex {
            cursor = string.index(before: cursor)
            if string[cursor] == "\n" { break }
            column += 1
        }
        return column
    }

    /// Moves `front` one scalar forward; does nothing at the end of the string.
    private mutating func stepFrontForward() {
        guard front < string.endIndex else { return }
        if string[front] == "\n" {
            frontColumn = 1
            frontRow += 1
        }
        else {
            frontColumn += 1
        }
        front = string.index(after: front)
        length += 1
    }

    /// Moves `front` one scalar backward; does nothing at the start of the string.
    private mutating func stepFrontBackward() {
        guard front > string.startIndex else { return }
        front = string.index(before: front)
        length -= 1
        if string[front] == "\n" {
            // Stepped back across a line break: the column has to be recounted.
            frontRow -= 1
            frontColumn = MovingStringRange.column(of: front, in: string)
        }
        else {
            frontColumn -= 1
        }
    }

    /// Moves `front` by `by` scalars; negative values move it backward.
    mutating func advanceFront(by: Int = 1) {
        if by > 0 {
            for _ in 0..<by { stepFrontForward() }
        }
        else if by < 0 {
            for _ in 0..<(-by) { stepFrontBackward() }
        }
    }

    /// Moves `back` one scalar forward (shrinking the window); no-op at the end.
    private mutating func stepBackForward() {
        guard back < string.endIndex else { return }
        if string[back] == "\n" {
            backColumn = 1
            backRow += 1
        }
        else {
            backColumn += 1
        }
        back = string.index(after: back)
        length -= 1
    }

    /// Moves `back` one scalar backward (growing the window); no-op at the start.
    private mutating func stepBackBackward() {
        guard back > string.startIndex else { return }
        back = string.index(before: back)
        length += 1
        if string[back] == "\n" {
            backRow -= 1
            backColumn = MovingStringRange.column(of: back, in: string)
        }
        else {
            backColumn -= 1
        }
    }

    /// Moves `back` by `by` scalars; negative values move it backward.
    mutating func advanceBack(by: Int = 1) {
        if by > 0 {
            for _ in 0..<by { stepBackForward() }
        }
        else if by < 0 {
            for _ in 0..<(-by) { stepBackBackward() }
        }
    }

    /// Collapses the window to zero length at `front`.
    mutating func setBackToFront() {
        back = front
        backRow = frontRow
        backColumn = frontColumn
        length = 0
    }

    /// The text between `back` (inclusive) and `front` (exclusive).
    var currentRange: String {
        return String(string[back..<front])
    }
    /// The scalar at `front`. Must not be read when `frontIsEnd`.
    var frontChar: UnicodeScalar {
        return string[front]
    }
    /// The scalar at `back`. Must not be read when `backIsEnd`.
    var backChar: UnicodeScalar {
        return string[back]
    }
    var backIsBeginning: Bool {
        return back <= string.startIndex
    }
    var backIsEnd: Bool {
        return back >= string.endIndex
    }
    var frontIsBeginning: Bool {
        return front <= string.startIndex
    }
    var frontIsEnd: Bool {
        return front >= string.endIndex
    }
}

extension UnicodeScalar {
    /// LF, VT, FF, CR, NEL, LINE SEPARATOR or PARAGRAPH SEPARATOR.
    var isNewline: Bool {
        switch value {
        case 0x0a...0x0d, 0x85, 0x2028, 0x2029:
            return true
        default:
            return false
        }
    }
    /// Horizontal whitespace: TAB plus the space-separator code points.
    var isWhitespace: Bool {
        switch value {
        case 0x09, 0x20, 0xa0, 0x1680, 0x2000...0x200a, 0x202f, 0x205f, 0x3000:
            return true
        default:
            return false
        }
    }
    var isNewlineOrWhitespace: Bool {
        return isNewline || isWhitespace
    }
}
    /// Splits `input` into identifier, punctuation, string-literal and
    /// character-literal tokens.
    ///
    /// A `MovingStringRange` window is walked over the input: its back marks where
    /// the pending token starts and its front is the scalar being inspected.
    /// Whitespace and comments end the pending token but produce no token
    /// themselves. Anything that is not whitespace, a punctuation scalar or a
    /// quote accumulates into an `.identifier` token (digits included — there is
    /// no separate numeric token type).
    ///
    /// - Parameter input: The source text to scan.
    /// - Returns: The tokens in source order, each carrying the row and column
    ///   recorded when it was emitted.
    /// - Throws: `TokenizationError.badPunctuation` when a run of punctuation
    ///   scalars cannot be shortened to any entry of `allPunctuation`;
    ///   `TokenizationError.unclosedString` when a quoted literal reaches a
    ///   newline or the end of input before its closing quote.
    static func tokenize(input: String) throws -> [Token] {
        var inputRange = MovingStringRange(input)
        var tokens: [Token] = []

        while !inputRange.backIsEnd {
            //print("Currently looking from row \(inputRange.backRow) column \(inputRange.backColumn) to row \(inputRange.frontRow) column \(inputRange.frontColumn), \(inputRange.currentRange)")
            if inputRange.frontIsEnd { // If this is the end of the file
                // Flush whatever identifier is still pending, then let the loop condition end the scan.
                if (inputRange.length > 0) {
                    tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
                }
                inputRange.setBackToFront()
                continue
            }
            else if inputRange.frontChar.isNewlineOrWhitespace { // Whitespace, end of token
                if inputRange.length > 0 { // If there's multiple whitespace chars in a row, don't add empty tokens
                    tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
                }
                // Step over the whitespace scalar and start a fresh, empty window after it.
                inputRange.advanceFront()
                inputRange.setBackToFront()
            }
            else if punctuationCharacters.contains(inputRange.frontChar) {
                if inputRange.length > 0 { // Add the previous identifier if it exists
                    tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
                }
                inputRange.setBackToFront()
                while !inputRange.frontIsEnd && (punctuationCharacters.contains(inputRange.frontChar) || inputRange.length > 0) {
                    // Keep going until we reach the end of the file and have parsed it all
                    inputRange.advanceFront()
                    // Once the window is longer than any known punctuation, or the run of
                    // punctuation scalars has ended, resolve the window into one token.
                    if inputRange.length > longestPunctuation || inputRange.frontIsEnd || !punctuationCharacters.contains(inputRange.frontChar) {
                        var punctuationToken = inputRange.currentRange
                        // Longest match: give scalars back one at a time until the window is a known spelling.
                        while inputRange.length > 1 && !allPunctuation.contains(punctuationToken) {
                            inputRange.advanceFront(by: -1)
                            punctuationToken = inputRange.currentRange
                        }
                        if commentPunctuation.contains(punctuationToken) {
                            if punctuationToken == "//" {
                                // Line comment: skip up to (not including) the next newline scalar.
                                while !inputRange.frontIsEnd && !inputRange.frontChar.isNewline {
                                    inputRange.advanceFront()
                                }
                            }
                            else {
                                // Block comment: look for a "*" that is directly followed by "/".
                                // NOTE(review): the outer test runs before any "*" has been seen, so
                                // "/*/" is treated as a complete comment — confirm this is acceptable.
                                while !inputRange.frontIsEnd && inputRange.frontChar != "/" {
                                    while !inputRange.frontIsEnd && inputRange.frontChar != "*" {
                                        inputRange.advanceFront()
                                    }
                                    inputRange.advanceFront()
                                }
                            }
                            // Step over the terminating newline or "/".
                            // NOTE(review): for a comment that runs to the end of input this is called
                            // with front already at the end — confirm advanceFront(by:) tolerates that.
                            inputRange.advanceFront()
                            inputRange.setBackToFront()
                            continue
                        }
                        if allPunctuation.contains(punctuationToken) {
                            tokens.append(Token(type: .punctuation, value: punctuationToken, row: inputRange.backRow, column: inputRange.backColumn))
                            inputRange.setBackToFront()
                        }
                        else {
                            // Even the single leading scalar is not a known punctuation spelling.
                            throw TokenizationError.badPunctuation(row: inputRange.backRow, column: inputRange.backColumn, character: inputRange.backChar)
                        }
                    }
                }
            }
            else if inputRange.frontChar == "\"" || inputRange.frontChar == "'" {
                // The opening quote also decides which scalar closes the literal.
                let quoteType = inputRange.frontChar
                if inputRange.length > 0 { // Add the previous identifier if it exists
                    tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
                }
                // Start the window just after the opening quote.
                inputRange.advanceFront()
                inputRange.setBackToFront()
                while true {
                    if inputRange.frontIsEnd || inputRange.frontChar.isNewline {
                        // Pull the back end onto the opening quote so the error reports where the literal began.
                        inputRange.advanceBack(by: -1)
                        throw TokenizationError.unclosedString(row: inputRange.backRow, column: inputRange.backColumn, string: inputRange.currentRange)
                    }
                    else if inputRange.frontChar == "\\" {
                        // Skip the backslash and the scalar it escapes; escapes are not decoded here.
                        inputRange.advanceFront(by: 2)
                    }
                    else if inputRange.frontChar == quoteType {
                        let type: TokenType
                        if quoteType == "'" {
                            type = .characterLiteral
                        }
                        else {
                            type = .stringLiteral
                        }
                        // The window starts after the opening quote and front sits on the closing one,
                        // so the value presumably excludes both quotes (assumes currentRange stops
                        // before front — TODO confirm). The column is shifted back by one to point
                        // at the opening quote.
                        tokens.append(Token(type: type, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn - 1))
                        inputRange.advanceFront()
                        inputRange.setBackToFront()
                        break
                    }
                    else {
                        inputRange.advanceFront()
                    }
                }
            }
            else {
                // Ordinary scalar: extend the pending identifier.
                inputRange.advanceFront()
            }
        }
        return tokens
    }
/// Decodes the backslash escapes of a script string literal in one left-to-right pass.
///
/// `\"` becomes `"`, `\n` becomes a newline and `\\` becomes a single backslash.
/// Any other escape is kept verbatim (backslash included). Decoding in a single
/// pass keeps an escaped backslash that happens to be followed by `n` or `"`
/// from being misread as a second escape.
private func unescapeLiteralText(_ raw: String) -> String {
    var decoded = String.UnicodeScalarView()
    var pendingBackslash = false
    for scalar in raw.unicodeScalars {
        if pendingBackslash {
            pendingBackslash = false
            switch scalar {
            case "n":
                decoded.append("\n")
            case "\"", "\\":
                decoded.append(scalar)
            default:
                decoded.append("\\")
                decoded.append(scalar)
            }
        }
        else if scalar == "\\" {
            pendingBackslash = true
        }
        else {
            decoded.append(scalar)
        }
    }
    if pendingBackslash {
        // A lone trailing backslash has nothing to escape; keep it.
        decoded.append("\\")
    }
    return String(decoded)
}

/// Returns the text carried by a command argument.
///
/// - Parameter literal: A string-literal token, or the bare identifier `NULL`.
/// - Returns: The literal's text with escapes decoded, or `""` for `NULL`.
/// - Note: Any other token is treated as a programming error and stops the
///   program with `fatalError`.
func stringFromLiteral(literal: Token) -> String {
    guard literal.type == .stringLiteral else {
        if literal.value == "NULL" { return "" }
        fatalError("\(literal) wasn't a string literal!")
    }
    return unescapeLiteralText(literal.value)
}

// Collect the text of every output command. Commands in `ignore` are skipped;
// unknown commands are only reported in verbose mode.
for command in commands where !ignore.contains(command.name) {
    let arguments = command.arguments
    switch command.name {
    case "OutputLine":
        // Argument 1 feeds the Japanese text and argument 3 the English text.
        guard arguments.count > 3 else {
            print("Skipping OutputLine with only \(arguments.count) argument(s): \(command)", to: &standardError)
            continue
        }
        japanese += stringFromLiteral(literal: arguments[1])
        english += stringFromLiteral(literal: arguments[3])
    case "OutputLineAll":
        // A single text shared by both languages.
        guard arguments.count > 1 else {
            print("Skipping OutputLineAll with only \(arguments.count) argument(s): \(command)", to: &standardError)
            continue
        }
        let line = stringFromLiteral(literal: arguments[1])
        japanese += line
        english += line
    case "ClearMessage":
        japanese += "\n\n"
        english += "\n\n"
    default:
        if verbose { print(command, to: &standardError) }
    }
}

// `mode` is a bit mask: bit 0 selects Japanese, bit 1 selects English.
if mode & 1 > 0 { print(japanese) }
if mode & 2 > 0 { print(english) }
extension UTF8.CodeUnit {
    /// Whether this byte is the lead byte of a three-byte UTF-8 sequence (`1110xxxx`).
    var isStart3: Bool {
        return self & 0b11110000 == 0b11100000
    }
    /// Whether this byte is a UTF-8 continuation byte (`10xxxxxx`).
    var isContinuation: Bool {
        return self & 0b11000000 == 0b10000000
    }
}

/// Extracts every run of consecutive three-byte UTF-8 sequences from `data`.
///
/// A sequence is a lead byte (`isStart3`) followed by two continuation bytes.
/// Runs shorter than `minLength` sequences are discarded, which filters out
/// bytes in binary data that only look like text by chance.
///
/// - Parameters:
///   - data: Raw bytes to scan; may be empty or end in the middle of a sequence.
///   - minLength: Minimum number of back-to-back sequences a run needs to be kept.
/// - Returns: The kept runs, concatenated in order of appearance and decoded as
///   UTF-8 (`String(decoding:as:)` repairs anything that is not valid UTF-8).
func unicodeFinder(data: [UInt8], minLength: Int = 2) -> String {
    var out = [UInt8]()
    let end = data.endIndex
    var index = data.startIndex
    while index < end {
        guard data[index].isStart3 else {
            index += 1
            continue
        }
        // Extend the run while a complete, well-formed sequence is available.
        var runEnd = index
        var good = 0
        while end - runEnd >= 3,
              data[runEnd].isStart3,
              data[runEnd + 1].isContinuation,
              data[runEnd + 2].isContinuation {
            good += 1
            runEnd += 3
        }
        if good >= minLength {
            out.append(contentsOf: data[index..<runEnd])
        }
        // Resume after the run, or after a lead byte that had no valid continuation.
        index = good > 0 ? runEnd : index + 1
    }
    return String(decoding: out, as: UTF8.self)
}
import Foundation

/// Lets `print(_:to:)` write to a file handle; text is emitted as UTF-8.
extension FileHandle : TextOutputStream {
    public func write(_ string: String) {
        if let encoded = string.data(using: .utf8) {
            write(encoded)
        }
    }
}

var standardError = FileHandle.standardError

// Without an input argument, print the usage text to stderr and fail.
if CommandLine.arguments.count <= 1 {
    print("Usage: \(CommandLine.arguments[0]) file\nUse - to read from stdin", to: &standardError)
    exit(EXIT_FAILURE)
}

// "-" selects standard input (decoded as UTF-8); anything else is a file path.
let input: String
switch CommandLine.arguments[1] {
case "-":
    input = String(decoding: FileHandle.standardInput.readDataToEndOfFile(), as: UTF8.self)
default:
    input = try String(contentsOf: URL(fileURLWithPath: CommandLine.arguments[1]))
}

// Emit each distinct unicode scalar once, in ascending order, without a trailing newline.
let chars = Set(input.unicodeScalars)
var out = ""
out.unicodeScalars.append(contentsOf: chars.sorted())
print(out, terminator: "")