# ui-editing-scripts/scripts/CharacterInfoExtraction/PythonTextExtractor/extract.py

from pathlib import Path
import re

# One OutputLine(...) argument: preferably a complete double-quoted string
# (which may itself contain commas and backslash-escaped characters) that is
# directly followed by ',' or ')'; otherwise a plain run of non-comma characters.
_OUTPUTLINE_ARG = r'(?:"(?:[^"\\]|\\.)*"(?=\s*[,)])|[^,]*)'

# Matches an `OutputLine(` call and captures its fourth argument in group 1
# (surrounding double quotes, if any, are part of the capture).
# Trying the quoted-string form first keeps a comma inside a string literal
# from being mistaken for an argument separator, which would otherwise cut the
# captured text short at its first comma or shift which argument is captured.
en_regex = re.compile(
    r'OutputLine\(\s*'
    + _OUTPUTLINE_ARG + r'\s*,\s*'
    + _OUTPUTLINE_ARG + r'\s*,\s*'
    + _OUTPUTLINE_ARG + r'\s*,\s*'
    + '(' + _OUTPUTLINE_ARG + ')'
)

def load_existing_list(path):
    """Return the entire contents of the UTF-8 text file at *path* as one string.

    The file is opened with ``newline=''`` so no newline translation takes
    place: line endings come back exactly as they are stored in the file.
    """
    with open(path, mode='r', encoding='utf-8', newline='') as list_file:
        contents = list_file.read()
    return contents


# Inputs: the character list to extend, and the directory tree that is searched for *.txt files.
existing_char_list = Path('C:/drojf/large_projects/umineko/ui-editing-scripts/scripts/CharacterInfoExtraction/msgothic_2_charset_OtherLang.txt')
source_directory = Path('C:/drojf/large_projects/umineko/HIGURASHI_REPOS')

# The result goes next to the input file, with '.out' appended to its full file name.
out_char_list = existing_char_list.with_name(existing_char_list.name + '.out')

# Keep both the raw text (original order, used when writing the output) and a
# set of its characters (for membership tests).
existing_char_list_text = load_existing_list(existing_char_list)
existing_font_set = set(existing_char_list_text)

# Every distinct character seen in the text captured by en_regex, across all scanned files.
all_chars = set()

for script_path in source_directory.rglob("*.txt"):
    print(script_path)
    with open(script_path, encoding='utf-8') as script_file:
        script_text = script_file.read()

    # finditer only yields successful matches, so each one can be used directly;
    # updating the set with a string adds each of its characters.
    for hit in en_regex.finditer(script_text):
        all_chars.update(hit.group(1))

# Go through the scanned characters in sorted order so that new ones are
# reported, and collected in chars_to_add, in that same order.
all_chars_list = sorted(all_chars)

chars_to_add = [candidate for candidate in all_chars_list if candidate not in existing_font_set]
new_char_found = len(chars_to_add) > 0

for candidate in chars_to_add:
    print(f'NEW CHAR: {candidate}')

if not new_char_found:
    print("No new characters found!")

# Show the complete merged character set (existing plus scanned), sorted, on a single line.
final_list = sorted(existing_font_set | all_chars)

# sep='' prints the characters back to back; print's default end supplies the trailing newline.
print(*final_list, sep='')


# Copy the existing list to the output file character by character, slotting
# each new character in immediately before the first existing character that
# sorts after it, so the existing order is kept and new characters land in order.
with open(out_char_list, 'w', encoding='utf-8', newline='') as f:
    for i, c in enumerate(existing_char_list_text):
        # Split the pending characters into "insert now" and "still pending"
        # rather than calling list.remove() while iterating the same list:
        # removing during iteration skips the element that follows each removal.
        insert_now = [new_character for new_character in chars_to_add if new_character < c]
        if insert_now:
            chars_to_add = [new_character for new_character in chars_to_add if not new_character < c]
            for new_character in insert_now:
                # Written before c, because new_character sorts before it.
                f.write(new_character)
                print(f"Inserting new character {new_character} at position {i} as it is less than {c}")

        f.write(c)

# Anything still pending was not smaller than any existing character, so it was
# never written to the output; fail loudly instead of silently dropping it.
if chars_to_add:
    raise Exception(f"One or more characters were not added {chars_to_add}")