From 5888b5a70762923c39f53520c1c5e7135aebb621 Mon Sep 17 00:00:00 2001 From: Urban Wallasch Date: Fri, 2 Jul 2021 03:22:47 +0200 Subject: [PATCH] * Fixed kanjidic2.xml parsing. * Support kanjidic2.xml additional (JIS X 0212/0213) kanji. --- README.md | 1 + kanjidic.py | 103 +- kradfile2.utf8 | 5842 ++++++++++++++++++++++++++++++++++++++++++++++++ radkfile2.utf8 | 1213 ++++++++++ 4 files changed, 7109 insertions(+), 50 deletions(-) create mode 100644 kradfile2.utf8 create mode 100644 radkfile2.utf8 diff --git a/README.md b/README.md index 05505a5..135db35 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ available by the EDRDG: * [KANJIDIC2](http://ftp.edrdg.org/pub/Nihongo/kanjidic2.xml.gz) *(alternative)* * Kanji dictionary; XML format; UTF-8 encoding + * contains additional kanji from JIS X 0212/0213 supplementary sets * download file, unpack * install via Edit->Preferences diff --git a/kanjidic.py b/kanjidic.py index b46a6a6..9e5313f 100755 --- a/kanjidic.py +++ b/kanjidic.py @@ -29,8 +29,8 @@ from PyQt5.QtGui import * _KANJIDIC_VERSION = '0.0.10' _KANJIDIC_NAME = 'KanjiDic' _KANJIDIC_DIR = 'jiten-pai' -_KANJIDIC_RADK = 'radkfile.utf8' -_KANJIDIC_KRAD = 'kradfile.utf8' +_KANJIDIC_RADK = ['radkfile.utf8', 'radkfile2.utf8'] +_KANJIDIC_KRAD = ['kradfile.utf8', 'kradfile2.utf8'] _JITENPAI_CFG = 'jiten-pai.conf' @@ -119,41 +119,42 @@ _krad = dict() # format: { 'kanji': 'radical_list', ... } def _rad_load(): res = True - radk_name = _KANJIDIC_RADK - if not os.access(radk_name, os.R_OK): - radk_name = _get_dfile_path(os.path.join(_KANJIDIC_DIR, _KANJIDIC_RADK), mode=os.R_OK) - try: - with open(radk_name) as radk_file: - re_radic = re.compile(r'^\$\s+(.)\s+(\d+)') - re_kanji = re.compile(r'^([^#$]\S*)') - radical = '?' - for line in radk_file: - m = re_kanji.search(line) - if m: - _radk[radical][1] += m.group(1) - continue - m = re_radic.search(line) - if m: - radical = m.group(1) - stroke = int(m.group(2)) - _radk[radical] = [stroke, ''] - _srad[stroke] += radical - except Exception as e: - eprint('_rad_load:', radk_name, str(e)) - res = False - krad_name = _KANJIDIC_KRAD - if not os.access(krad_name, os.R_OK): - krad_name = _get_dfile_path(os.path.join(_KANJIDIC_DIR, _KANJIDIC_KRAD), mode=os.R_OK) - try: - with open(krad_name) as krad_file: - re_krad = re.compile(r'^([^#\s]) : (.+)$') - for line in krad_file: - m = re_krad.search(line) - if m: - _krad[m.group(1)] = m.group(2).replace(' ', '') - except Exception as e: - eprint('_rad_load:', krad_name, str(e)) - res = False + for radk_name in _KANJIDIC_RADK: + if not os.access(radk_name, os.R_OK): + radk_name = _get_dfile_path(os.path.join(_KANJIDIC_DIR, radk_name), mode=os.R_OK) + try: + with open(radk_name) as radk_file: + re_radic = re.compile(r'^\$\s+(.)\s+(\d+)') + re_kanji = re.compile(r'^([^#$]\S*)') + radical = '?' + for line in radk_file: + m = re_kanji.search(line) + if m: + _radk[radical][1] += m.group(1) + continue + m = re_radic.search(line) + if m: + radical = m.group(1) + if radical not in _radk: + stroke = int(m.group(2)) + _radk[radical] = [stroke, ''] + _srad[stroke] += radical + except Exception as e: + eprint('_rad_load:', radk_name, str(e)) + res = False + for krad_name in _KANJIDIC_KRAD: + if not os.access(krad_name, os.R_OK): + krad_name = _get_dfile_path(os.path.join(_KANJIDIC_DIR, krad_name), mode=os.R_OK) + try: + with open(krad_name) as krad_file: + re_krad = re.compile(r'^([^#\s]) : (.+)$') + for line in krad_file: + m = re_krad.search(line) + if m: + _krad[m.group(1)] = m.group(2).replace(' ', '') + except Exception as e: + eprint('_rad_load:', krad_name, str(e)) + res = False return res def _rad2k(rad): @@ -251,10 +252,6 @@ def _kanjidic2_load(dict_fname): try: for char in ET.parse(dict_fname).iterfind('character'): kanji = char.find('literal').text - if not kanji in _krad: - # REVISIT: for now we drop all the obscure kanji not in kradfile - # (which should coincide with the JIS X 0212/0213 Supplementary Kanji sets) - continue info = { 'strokes': '', 'readings': '', @@ -265,12 +262,15 @@ def _kanjidic2_load(dict_fname): 'grade': '', } misc = char.find('misc') - strokes = misc.find('stroke_count') - info['strokes'] = strokes.text - freq = misc.find('freq') - info['freq'] = freq.text if freq else '' - grade = misc.find('grade') - info['grade'] = grade.text if grade else '' + for strokes in misc.findall('stroke_count'): + info['strokes'] = strokes.text + break + for freq in misc.findall('freq'): + info['freq'] = freq.text + break + for grade in misc.findall('grade'): + info['grade'] = grade.text + break rm = char.find('reading_meaning') if rm: rm_group = rm.find('rmgroup') @@ -309,6 +309,9 @@ def _kanjidic_load(dict_fname): tag_line = f.readline() if '