# ui-editing-scripts/scripts/CharacterInfoExtraction/PythonTextExtractor/merge_scripts.py

from pathlib import Path
import re

def load_existing_list(path):
    """Return the entire contents of the UTF-8 text file at ``path``.

    The file is opened with ``newline=''`` so line endings are returned
    exactly as stored, without newline translation.
    """
    with open(path, encoding='utf-8', newline='') as source:
        contents = source.read()
    return contents

# Merge the characters of the Japanese list into the multi-language list and
# write the combined list to a new file.
#
# The characters of the multi-language list are written in their original
# order. Each character that only appears in the Japanese list is inserted
# immediately before the first multi-language character that is greater than
# it (by code point); any that are greater than every multi-language character
# are appended at the end.
japanese_list = '../msgothic_0_charset_Japanese.txt'
multilang_list = '../msgothic_2_charset_OtherLang.txt'
out_char_list = '../msgothic_2_charset_JP_and_OtherLang.txt'

jp = load_existing_list(japanese_list)
multi = load_existing_list(multilang_list)

chars_to_add = set(jp)
existing_chars = set(multi)

# Characters already present in the multi-language list must never be written
# a second time, so drop them before merging rather than only checking for
# them after the main loop.
for char in sorted(chars_to_add & existing_chars):
    print(f"WARNING: character {char} already exists, skipping")
chars_to_add -= existing_chars

# Sort the pending characters so the insertion order is deterministic (set
# iteration order varies between runs) and so a single forward cursor can
# replace rescanning the whole set for every existing character.
pending = sorted(chars_to_add)
next_pending = 0

with open(out_char_list, 'w', encoding='utf-8', newline='') as f:
    for i, c in enumerate(multi):
        # Flush every pending character smaller than c *before* writing c,
        # so the new characters end up in front of it.
        while next_pending < len(pending) and pending[next_pending] < c:
            new_character = pending[next_pending]
            f.write(new_character)
            chars_to_add.remove(new_character)
            print(f"Inserting new character {new_character} at position {i} as it is less than {c}")
            next_pending += 1

        f.write(c)

    # Whatever is left is greater than every existing character: append it.
    for char in pending[next_pending:]:
        f.write(char)
        chars_to_add.remove(char)


# Sanity check: every new character must have been written exactly once.
if chars_to_add:
    raise Exception(f"One or more characters were not added {chars_to_add}")