Update otherlang charset and add a Python script for extracting characters

rei: 凸; hou: ▲ and ▼. Note: there is already a Swift script, but this rudimentary Python script should be good enough. The missing 凸 was reported in https://github.com/07th-mod/higurashi-rei/issues/22
2024-01-14 19:08:04 +11:00
parent 526acf5fe7
commit 68f8bdd9ae
2 changed files with 66 additions and 1 deletions
--- a/scripts/CharacterInfoExtraction/PythonTextExtractor/extract.py
+++ b/scripts/CharacterInfoExtraction/PythonTextExtractor/extract.py
@@ -0,0 +1,65 @@
+from pathlib import Path
+import re
+
+# Matches an OutputLine(...) call and captures its fourth argument: three
+# comma-terminated arguments are skipped, then everything up to (but not
+# including) the next comma is captured as group 1.
+# NOTE(review): every argument is matched with [^,]*, so a comma inside an
+# argument ends it early and the capture stops at the first comma of the
+# fourth argument - confirm the scanned scripts never rely on text after it.
+en_regex = re.compile(r'OutputLine\([^,]*,\s*[^,]*,\s*[^,]*,\s*([^,]*)')
+
+def load_existing_list(path):
+    with open(path, encoding='utf-8', newline='') as f:
+        return f.read()
+
+
+existing_char_list = Path('C:/drojf/large_projects/umineko/ui-editing-scripts/scripts/CharacterInfoExtraction/msgothic_2_charset_OtherLang.txt')
+out_char_list = existing_char_list.with_suffix(existing_char_list.suffix + '.out')
+source_directory = Path('C:/drojf/large_projects/umineko/HIGURASHI_REPOS')
+
+existing_char_list_text = load_existing_list(existing_char_list)
+existing_font_set = set(existing_char_list_text)
+
+all_chars = set()
+
+for file in source_directory.rglob("*.txt"):
+    print(file)
+    with open(file, encoding='utf-8') as f:
+        whole_file_string = f.read()
+        for match in en_regex.finditer(whole_file_string):
+            if match:
+                outputline_english_arg = match.group(1)
+                for c in outputline_english_arg:
+                    all_chars.add(c)
+
+all_chars_list = list(all_chars)
+all_chars_list.sort()
+
+chars_to_add = []
+new_char_found = False
+for char in all_chars_list:
+    if char not in existing_font_set:
+        print(f'NEW CHAR: {char}')
+        new_char_found = True
+        chars_to_add.append(char)
+
+if not new_char_found:
+    print("No new characters found!")
+
+final_list = list(existing_font_set.union(all_chars))
+final_list.sort()
+
+for c in final_list:
+    print(c, end='')
+
+print()
+
+
+with open(out_char_list, 'w', encoding='utf-8', newline='') as f:
+    for i, c in enumerate(existing_char_list_text):
+        f.write(c)
+
+        # This is very bad for performance if there are lots of new chars found, but it works for now to maintain ordering
+        for new_character in chars_to_add:
+            if new_character < c:
+                f.write(new_character)
+                chars_to_add.remove(new_character)
+                print(f"Inserting new character {new_character} at position {i} as it is less than {c}")
+
+if chars_to_add:
+    raise Exception(f"One or more characters were not added {chars_to_add}")