* Added support for kanjidic2.xml.
author Urban Wallasch <urban.wallasch@freenet.de>
Thu, 1 Jul 2021 15:13:50 +0000 (17:13 +0200)
committer Urban Wallasch <urban.wallasch@freenet.de>
Thu, 1 Jul 2021 18:28:10 +0000 (20:28 +0200)
README.md
kanjidic.py

index 8a1c40b426432766dc0026129e008638474fb0fb..05505a513a0d944b2effb58a8d44b279ae46d02a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -50,10 +50,15 @@ part of Jiten-pai requires installation of the `kanjidic` file, also made
 available by the EDRDG:
 
 * [KANJIDIC](http://ftp.edrdg.org/pub/Nihongo/kanjidic.gz) *(recommended)*
-    * Kanji dictionary; EUC-JP encoding
+    * Kanji dictionary; plain text format; EUC-JP encoding
     * download file, unpack and convert to UTF-8 *(see above)*
     * install via Edit->Preferences
 
+* [KANJIDIC2](http://ftp.edrdg.org/pub/Nihongo/kanjidic2.xml.gz) *(alternative)*
+    * Kanji dictionary; XML format; UTF-8 encoding
+    * download file, unpack
+    * install via Edit->Preferences
+
 The [EDRDG licence page](http://www.edrdg.org/edrdg/licence.html) provides
 dictionary copyright information and licensing terms.
 
index 55f8ffa9ed839a535117aaab71bcc67543646da7..b46a6a63a49ef719aa12fa868049c9fcb8a9521d 100755 (executable)
@@ -171,7 +171,7 @@ def _k2rad(kanji):
 # load kanjidic
 # See: http://www.edrdg.org/kanjidic/kanjidic_doc_legacy.html#IREF02
 #
-# kanjidic example lines:
+# kanjidic1 example lines:
 #
 # 心 3F34 U5fc3 B61 G2 S4 XJ13D38 F157 J3 N1645 V1780 H11 DP11 DK4 DL4 L595
 # DN639 K139 O49 DO80 MN10295 MP4.0937 E147 IN97 DA97 DS95 DF172 DH164 DT96
@@ -188,7 +188,7 @@ def _k2rad(kanji):
 
 _kanjidic = dict()     # format: { 'kanji': {info}, ...}
 
-def _kanjidic_load(dict_fname):
+def _kanjidic1_load(dict_fname):
     ktable = [
         ['F', 'freq'],
         ['G', 'grade'],
@@ -238,10 +238,82 @@ def _kanjidic_load(dict_fname):
                 info['readings'] = line.strip().replace(' ', ', ').replace('T2,', 'T2').replace('T1,', 'T1')
                 _kanjidic[kanji] = info
     except Exception as e:
-        eprint('_kanjidic_load:', dict_fname, str(e))
+        eprint('_kanjidic1_load:', dict_fname, str(e))
         return False
     return True
 
+# load kanjidic2.xml
+# See: http://www.edrdg.org/wiki/index.php/KANJIDIC_Project#Content_.26_Format
+
+import xml.etree.ElementTree as ET
+
+def _kanjidic2_load(dict_fname):
+    """Load a kanjidic2.xml file into the module-level _kanjidic table.
+
+    Only kanji that are also present in _krad are stored.  Each entry is a
+    dict with the keys 'strokes', 'readings', 'r_korean', 'r_pinyin',
+    'meaning', 'freq' and 'grade'; all values are strings, empty if the
+    information is absent.  'readings' lists the Japanese readings, followed
+    by the nanori (introduced by 'T1') and the radical names (introduced by
+    'T2'), separated by ', '.
+
+    Returns True on success.  On any error the problem is reported via
+    eprint() and False is returned; entries stored before the error
+    occurred remain in _kanjidic.
+    """
+    try:
+        for char in ET.parse(dict_fname).iterfind('character'):
+            kanji = char.find('literal').text
+            if kanji not in _krad:
+                # REVISIT: for now we drop all the obscure kanji not in kradfile
+                # (which should coincide with the JIS X 0212/0213 Supplementary Kanji sets)
+                continue
+            misc = char.find('misc')
+            # NOTE: the truth value of an Element reflects its number of
+            # children, so a leaf element like <freq> is always falsy; use
+            # findtext() or an explicit `is not None` test instead.
+            info = {
+                'strokes': misc.findtext('stroke_count', ''),
+                'readings': '',
+                'r_korean': '',
+                'r_pinyin': '',
+                'meaning': '',
+                'freq': misc.findtext('freq', ''),
+                'grade': misc.findtext('grade', ''),
+            }
+            readings = []
+            rm = char.find('reading_meaning')
+            if rm is not None:
+                # Only the first rmgroup is evaluated; it may be absent.
+                rm_group = rm.find('rmgroup')
+                if rm_group is not None:
+                    for rd in rm_group.findall('reading'):
+                        r_type = rd.get('r_type', '')
+                        # For Korean and Pinyin the last matching reading wins.
+                        if r_type == 'korean_r':
+                            info['r_korean'] = rd.text or ''
+                        elif r_type == 'pinyin':
+                            info['r_pinyin'] = rd.text or ''
+                        elif r_type.startswith('ja_'):
+                            readings.append(rd.text or '')
+                    for m in rm_group.findall('meaning'):
+                        # A missing m_lang attribute is treated as English.
+                        # Each meaning is terminated by '; ', including the last.
+                        if m.get('m_lang', 'en') == 'en' and m.text:
+                            info['meaning'] += '%s; ' % m.text
+                nanori = [n.text or '' for n in rm.findall('nanori')]
+                if nanori:
+                    readings.append('T1 ' + ', '.join(nanori))
+            rad_names = [n.text or '' for n in misc.findall('rad_name')]
+            if rad_names:
+                readings.append('T2 ' + ', '.join(rad_names))
+            info['readings'] = ', '.join(readings)
+            _kanjidic[kanji] = info
+    except Exception as e:
+        eprint('_kanjidic2_load:', dict_fname, str(e))
+        return False
+    return True
+
+def _kanjidic_load(dict_fname):
+    """Load a kanji dictionary file, auto-detecting its format.
+
+    The first line of dict_fname is inspected: if it contains an XML
+    declaration ('<?xml'), the file is passed to _kanjidic2_load(),
+    otherwise to _kanjidic1_load().
+
+    Returns the result of the selected loader, or False (after reporting
+    the problem via eprint()) if the file could not be opened or read.
+    """
+    try:
+        # Sniff in binary mode, so that format detection does not depend on
+        # the platform's default text encoding.
+        with open(dict_fname, 'rb') as f:
+            tag_line = f.readline()
+    except Exception as e:
+        eprint('_kanjidic_load:', dict_fname, str(e))
+        return False
+    # The loaders report their own errors and return False on failure.
+    if b'<?xml' in tag_line:
+        return _kanjidic2_load(dict_fname)
+    return _kanjidic1_load(dict_fname)
+
 def _kanjidic_lookup(kanji):
     try:
         kanji = kanji[0]