From c3f75e9969fae10ddd0a268efcc1636ed57b8219 Mon Sep 17 00:00:00 2001 From: Urban Wallasch Date: Sat, 3 Jul 2021 16:27:14 +0200 Subject: [PATCH] * Moved word class REs to vconj file to keep them in close proximity to the rules. * Slightly restructured the verb de-inflection procedure. * The vconj file does not originate in Gjiten, but in fact first appeared in XJDIC. --- README.md | 4 +-- jiten-pai.py | 54 ++++++++++------------------------------ vconj.utf8 | 69 ++++++++++++++++++++++++++-------------------------- 3 files changed, 49 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index c57d7aa..7d0185b 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,8 @@ Jiten-pai incorporates parts taken from other projects, namely: * Kana conversion code adapted from [jaconv](https://github.com/ikegami-yukino/jaconv); Copyright (c) 2014 Yukino Ikegami; MIT License -* VCONJ verb de-inflection rule file adapted from [Gjiten](http://gjiten.sourceforge.net/); - Copyright (c) 1999-2005 Botond Botyanszki; GNU General Public License v2.0 +* VCONJ verb de-inflection rule file adapted from XJDIC; + Copyright (c) 1998-2003 J.W. Breen; GNU General Public License v2.0 * RADKFILE and KRADFILE kanji radical cross-reference adapted from [The KRADFILE/RADKFILE Project](http://www.edrdg.org/krad/kradinf.html); diff --git a/jiten-pai.py b/jiten-pai.py index bb37a34..ca3fde2 100755 --- a/jiten-pai.py +++ b/jiten-pai.py @@ -163,39 +163,11 @@ except Exception as e: ############################################################ # verb de-inflection -# REs for word classes a specific inflection rule may generally be -# applicable to, as tagged in the gloss part of dictionary entries. -_vconj_wclass = { # TODO: adjust these REs to best fit word classes to inflection rule. 
- 0: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # plain, negative, nonpast - 1: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, non-past - 2: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # conditional - 3: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # volitional - 4: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # te-form - 5: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # plain, past - 6: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # plain, negative, past - 7: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # passive - 8: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # causative - 9: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # potential or imperative - 10: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # imperative - 11: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, past - 12: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, negative, non-past - 13: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, negative, past - 14: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, volitional - 15: re.compile(r'\((adj|adv|aux|n-adv)'), # adj. 
-> adverb - 16: re.compile(r'\((adj|adv|aux|n-adv)'), # adj., past - 17: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite - 18: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, volitional - 19: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # passive or potential - 20: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # passive (or potential if Grp 2) - 21: re.compile(r'\((adj|adv|aux|n-adv)'), # adj., negative - 22: re.compile(r'\((adj|adv|aux|n-adv)'), # adj., negative, past - 23: re.compile(r'\((adj|adv|aux|n-adv)'), # adj., past - 24: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # plain verb - 25: re.compile(r'\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))'), # polite, te-form -} +Vtype = namedtuple('Vtype', 'wclass label') +Vconj = namedtuple('Vconj', 'regex conj infi rule') -_vconj_type = dict() -_vconj_deinf = [] +_vconj_type = dict() # format: { rule_no: (wclass, label), ... } +_vconj_deinf = [] # format: [ (regex, conj, infinitive, rule_no), ... 
] _vconj_loaded = False def _get_dfile_path(fname, mode=os.R_OK): @@ -215,8 +187,6 @@ def _get_dfile_path(fname, mode=os.R_OK): return fname # load and parse VCONJ rule file -Vconj = namedtuple('Vconj', 'regex conj infi rule') - def _vconj_load(): global _vconj_loaded vcname = _JITENPAI_VCONJ @@ -224,24 +194,25 @@ def _vconj_load(): vcname = _get_dfile_path(os.path.join(_JITENPAI_DIR, _JITENPAI_VCONJ), mode=os.R_OK) try: with open(vcname) as vcfile: - re_type = re.compile(r'^(\d+)\s+(.+)$') + re_type = re.compile(r'^(\d+)\s+"(\S+)"\s+(.+)$') re_deinf = re.compile(r'^\s*([^#\s]+)\s+(\S+)\s+(\d+)\s*$') for line in vcfile: match = re_type.match(line) if match: - _vconj_type[match.group(1)] = match.group(2) + wclass = re.compile(match.group(2)) + _vconj_type[int(match.group(1))] = Vtype(wclass, match.group(3)) continue match = re_deinf.match(line) if match: regex = re.compile('%s$' % match.group(1)) - _vconj_deinf.append(Vconj(regex, match.group(1), match.group(2), match.group(3))) + _vconj_deinf.append(Vconj(regex, match.group(1), match.group(2), int(match.group(3)))) continue _vconj_loaded = len(_vconj_deinf) > 0 except Exception as e: eprint('_vconj_load:', vcname, str(e)) # collect inflection rules potentially applicable to a verb(-candidate) -Vinf = namedtuple('Vinf', 'infi blurb rule') +Vinf = namedtuple('Vinf', 'infi blurb wclass') def _vconj_deinflect(verb): inf = [] @@ -249,8 +220,9 @@ def _vconj_deinflect(verb): for deinf in _vconj_deinf: verb_inf = deinf.regex.sub(deinf.infi, verb) if verb_inf != verb: - blurb = '%s %s → %s' % (_vconj_type[deinf.rule], deinf.conj, deinf.infi) - inf.append(Vinf(verb_inf, blurb, int(deinf.rule))) + blurb = '%s %s → %s' % (_vconj_type[deinf.rule].label, deinf.conj, deinf.infi) + wclass = _vconj_type[deinf.rule].wclass + inf.append(Vinf(verb_inf, blurb, wclass)) return inf @@ -1355,7 +1327,7 @@ class jpMainWindow(QMainWindow): # keep only results belonging to a suitable word class and # attach the inflection info; reject 
everything else for r in res: - if _vconj_wclass[inf.rule].search(r.gloss): + if inf.wclass.search(r.gloss): result.append(EntryEx(r.headword, r.reading, r.gloss, inf)) limit -= 1 if limit <= 0 or not ok: diff --git a/vconj.utf8 b/vconj.utf8 index 95894b5..1b58519 100644 --- a/vconj.utf8 +++ b/vconj.utf8 @@ -1,44 +1,43 @@ # # V C O N J - control file for verb and adjective deinflection # -# the following section sets up the labels which are used for the -# various inflections. These are displayed by the program. -# The initial labels can be edited by the user. +# Adapted from xjdic, Copyright (c) 2003 J.W. Breen # -# First there are the labels for the types of conjugations +# First there are the labels for the types of conjugations, along with +# regular expressions to filter potential dictionary matches based on +# the tags in the gloss to only include words of suitable classes, +# depending on the nature of the rule. # -0 plain, negative, nonpast -1 polite, non-past -2 conditional -3 volitional -4 te-form -5 plain, past -6 plain, negative, past -7 passive -8 causative -9 potential or imperative -10 imperative -11 polite, past -12 polite, negative, non-past -13 polite, negative, past -14 polite, volitional -15 adj. -> adverb -16 adj., past -17 polite -18 polite, volitional -19 passive or potential -20 passive (or potential if Grp 2) -21 adj., negative -22 adj., negative, past -23 adj., past -24 plain verb -25 polite, te-form +# TODO: adjust these REs to best fit word classes to inflection rule. # -# and these are the conjugations/inflections, and their dictionary forms -# (please note that these are scanned from the top, so the order is -# critical if the correct guess is to be made.) 
+0 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" plain, negative, nonpast +1 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, non-past +2 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" conditional +3 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" volitional +4 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" te-form +5 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" plain, past +6 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" plain, negative, past +7 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" passive +8 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" causative +9 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" potential or imperative +10 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" imperative +11 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, past +12 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, negative, non-past +13 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, negative, past +14 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, volitional +15 "\((adj|adv|aux|n-adv)" adj. -> adverb +16 "\((adj|adv|aux|n-adv)" adj., past +17 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite +18 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, volitional +19 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" passive or potential +20 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" passive (or potential if Grp 2) +21 "\((adj|adv|aux|n-adv)" adj., negative +22 "\((adj|adv|aux|n-adv)" adj., negative, past +23 "\((adj|adv|aux|n-adv)" adj., past +24 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" plain verb +25 "\((adj|adv|aux|n-adv|v(?!ulg|idg|ie))" polite, te-form # -$ this line flags the start of them +# And these are the conjugations/inflections, and their dictionary forms. # た る 5 て る 4 @@ -189,7 +188,7 @@ $ this line flags the start of them ましょう る 18 れば る 2 よう る 3 -#て る 4 # 2021-06-19 disabled here, appears in line 44 +#て る 4 # 2021-06-19 disabled here, already present above た る 5 られ る 20 させ る 8 -- 2.30.2