From c3f75e9969fae10ddd0a268efcc1636ed57b8219 Mon Sep 17 00:00:00 2001
From: Urban Wallasch <urban.wallasch@freenet.de>
Date: Sat, 3 Jul 2021 16:27:14 +0200
Subject: [PATCH] * Moved word class REs to vconj file to keep them in close
 proximity to the rules. * Slightly restructured the verb de-inflection
 procedure. * The vconj file does not originate in Gjiten, but in fact first
 appeared in XJDIC.

---
 README.md    |  4 +--
 jiten-pai.py | 54 ++++++++++------------------------------
 vconj.utf8   | 69 ++++++++++++++++++++++++++--------------------------
 3 files changed, 49 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index c57d7aa..7d0185b 100644
--- a/README.md
+++ b/README.md
@@ -129,8 +129,8 @@ Jiten-pai incorporates parts taken from other projects, namely:
 * Kana conversion code adapted from [jaconv](https://github.com/ikegami-yukino/jaconv);
   Copyright (c) 2014 Yukino Ikegami; MIT License
 
-* VCONJ verb de-inflection rule file adapted from [Gjiten](http://gjiten.sourceforge.net/);
-  Copyright (c) 1999-2005 Botond Botyanszki; GNU General Public License v2.0
+* VCONJ verb de-inflection rule file adapted from XJDIC;
+  Copyright (c) 1998-2003 J.W. Breen; GNU General Public License v2.0
 
 * RADKFILE and KRADFILE kanji radical cross-reference adapted from
   [The KRADFILE/RADKFILE Project](http://www.edrdg.org/krad/kradinf.html);
diff --git a/jiten-pai.py b/jiten-pai.py
index bb37a34..ca3fde2 100755
--- a/jiten-pai.py
+++ b/jiten-pai.py
@@ -163,39 +163,11 @@ except Exception as e:
 ############################################################
 # verb de-inflection
 
-# REs for word classes a specific inflection rule may generally be
-# applicable to, as tagged in the gloss part of dictionary entries.
-_vconj_wclass = { # TODO: adjust these REs to best fit word classes to inflection rule.
-     0: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # plain, negative, nonpast
-     1: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, non-past
-     2: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # conditional
-     3: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # volitional
-     4: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # te-form
-     5: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # plain, past
-     6: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # plain, negative, past
-     7: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # passive
-     8: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # causative
-     9: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # potential or imperative
-    10: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # imperative
-    11: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, past
-    12: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, negative, non-past
-    13: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, negative, past
-    14: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, volitional
-    15: re.compile(r'\((adj|adv|aux|n-adv)'),                  # adj. -> adverb
-    16: re.compile(r'\((adj|adv|aux|n-adv)'),                  # adj., past
-    17: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite
-    18: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, volitional
-    19: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # passive or potential
-    20: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # passive (or potential if Grp 2)
-    21: re.compile(r'\((adj|adv|aux|n-adv)'),                  # adj., negative
-    22: re.compile(r'\((adj|adv|aux|n-adv)'),                  # adj., negative, past
-    23: re.compile(r'\((adj|adv|aux|n-adv)'),                  # adj., past
-    24: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # plain verb
-    25: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'),  # polite, te-form
-}
+Vtype = namedtuple('Vtype', 'wclass label')
+Vconj = namedtuple('Vconj', 'regex conj infi rule')
 
-_vconj_type = dict()
-_vconj_deinf = []
+_vconj_type = dict()  # format: { rule_no: (wclass, label), ... }
+_vconj_deinf = []     # format: [ (regex, conj, infinitive, rule_no), ... ]
 _vconj_loaded = False
 
 def _get_dfile_path(fname, mode=os.R_OK):
@@ -215,8 +187,6 @@ def _get_dfile_path(fname, mode=os.R_OK):
     return fname
 
 # load and parse VCONJ rule file
-Vconj = namedtuple('Vconj', 'regex conj infi rule')
-
 def _vconj_load():
     global _vconj_loaded
     vcname = _JITENPAI_VCONJ
@@ -224,24 +194,25 @@ def _vconj_load():
         vcname = _get_dfile_path(os.path.join(_JITENPAI_DIR, _JITENPAI_VCONJ), mode=os.R_OK)
     try:
         with open(vcname) as vcfile:
-            re_type = re.compile(r'^(\d+)\s+(.+)$')
+            re_type = re.compile(r'^(\d+)\s+"(\S+)"\s+(.+)$')
             re_deinf = re.compile(r'^\s*([^#\s]+)\s+(\S+)\s+(\d+)\s*$')
             for line in vcfile:
                 match = re_type.match(line)
                 if match:
-                    _vconj_type[match.group(1)] = match.group(2)
+                    wclass = re.compile(match.group(2))
+                    _vconj_type[int(match.group(1))] = Vtype(wclass, match.group(3))
                     continue
                 match = re_deinf.match(line)
                 if match:
                     regex = re.compile('%s$' % match.group(1))
-                    _vconj_deinf.append(Vconj(regex, match.group(1), match.group(2), match.group(3)))
+                    _vconj_deinf.append(Vconj(regex, match.group(1), match.group(2), int(match.group(3))))
                     continue
         _vconj_loaded = len(_vconj_deinf) > 0
     except Exception as e:
         eprint('_vconj_load:', vcname, str(e))
 
 # collect inflection rules potentially applicable to a verb(-candidate)
-Vinf = namedtuple('Vinf', 'infi blurb rule')
+Vinf = namedtuple('Vinf', 'infi blurb wclass')
 
 def _vconj_deinflect(verb):
     inf = []
@@ -249,8 +220,9 @@ def _vconj_deinflect(verb):
     for deinf in _vconj_deinf:
         verb_inf = deinf.regex.sub(deinf.infi, verb)
         if verb_inf != verb:
-            blurb = '%s %s â %s' % (_vconj_type[deinf.rule], deinf.conj, deinf.infi)
-            inf.append(Vinf(verb_inf, blurb, int(deinf.rule)))
+            blurb = '%s %s â %s' % (_vconj_type[deinf.rule].label, deinf.conj, deinf.infi)
+            wclass = _vconj_type[deinf.rule].wclass
+            inf.append(Vinf(verb_inf, blurb, wclass))
     return inf
 
 
@@ -1355,7 +1327,7 @@ class jpMainWindow(QMainWindow):
             # keep only results belonging to a suitable word class and
             # attach the inflection info; reject everything else
             for r in res:
-                if _vconj_wclass[inf.rule].search(r.gloss):
+                if inf.wclass.search(r.gloss):
                     result.append(EntryEx(r.headword, r.reading, r.gloss, inf))
                     limit -= 1
             if limit <= 0 or not ok:
diff --git a/vconj.utf8 b/vconj.utf8
index 95894b5..1b58519 100644
--- a/vconj.utf8
+++ b/vconj.utf8
@@ -1,44 +1,43 @@
 #
 # V C O N J - control file for verb and adjective deinflection
 #
-# the following section sets up the labels which are used for the
-# various inflections. These are displayed by the program.
-# The initial labels can be edited by the user.
+# Adapted from xjdic, Copyright (c) 2003 J.W. Breen
 #
-#  First there are the labels for the types of conjugations
+# First there are the labels for the types of conjugations, along with
+# regular expressions to filter potential dictionary matches based on
+# the tags in the gloss to only include words of suitable classes,
+# depending on the nature of the rule.
 #
-0	plain, negative, nonpast
-1	polite, non-past
-2	conditional
-3	volitional
-4	te-form
-5	plain, past
-6	plain, negative, past
-7	passive
-8	causative
-9	potential or imperative
-10	imperative
-11	polite, past
-12	polite, negative, non-past
-13	polite, negative, past
-14	polite, volitional
-15	adj. -> adverb
-16	adj., past
-17	polite
-18	polite, volitional
-19	passive or potential
-20	passive (or potential if Grp 2)
-21	adj., negative
-22	adj., negative, past
-23	adj., past
-24	plain verb
-25	polite, te-form
+# TODO: adjust these REs to best fit word classes to inflection rules.
 #
-#  and these are the conjugations/inflections, and their dictionary forms
-#	(please note that these are scanned from the top, so the order is
-#	critical if the correct guess is to be made.)
+0	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    plain, negative, nonpast
+1	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, non-past
+2	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    conditional
+3	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    volitional
+4	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    te-form
+5	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    plain, past
+6	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    plain, negative, past
+7	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    passive
+8	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    causative
+9	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    potential or imperative
+10	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    imperative
+11	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, past
+12	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, negative, non-past
+13	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, negative, past
+14	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, volitional
+15	"\((adj|adv|aux|n-adv)"                    adj. -> adverb
+16	"\((adj|adv|aux|n-adv)"                    adj., past
+17	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite
+18	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, volitional
+19	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    passive or potential
+20	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    passive (or potential if Grp 2)
+21	"\((adj|adv|aux|n-adv)"                    adj., negative
+22	"\((adj|adv|aux|n-adv)"                    adj., negative, past
+23	"\((adj|adv|aux|n-adv)"                    adj., past
+24	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    plain verb
+25	"\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))"    polite, te-form
 #
-$   this line flags the start of them
+# And these are the conjugations/inflections, and their dictionary forms.
 #
 ã	ã	5
 ã¦	ã	4
@@ -189,7 +188,7 @@ $   this line flags the start of them
 ã¾ããã	ã	18
 ãã°	ã	2
 ãã	ã	3
-#ã¦	ã	4    # 2021-06-19 disabled here, appears in line 44
+#ã¦	ã	4    # 2021-06-19 disabled here, already present above
 ã	ã	5
 ãã	ã	20
 ãã	ã	8
-- 
2.30.2