* Parse dict lines using split() and strip() to get rid of one slow regex. poc
author Urban Wallasch <urban.wallasch@freenet.de>
Sun, 13 Jun 2021 08:40:30 +0000 (10:40 +0200)
committer Urban Wallasch <urban.wallasch@freenet.de>
Sun, 13 Jun 2021 08:40:30 +0000 (10:40 +0200)
* Added a C reference implementation.

cdemo.sh [new file with mode: 0755]
jiten-pai.py
jitenshi.c [new file with mode: 0644]

diff --git a/cdemo.sh b/cdemo.sh
new file mode 100755 (executable)
index 0000000..d571385
--- /dev/null
+++ b/cdemo.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Build the C reference implementation, then run a few sample searches with it.
+cc -W -Wall -Wextra -o jitenshi jitenshi.c || exit 1   # never run a stale or missing binary
+
+dict=/usr/share/gjiten/dics/edict
+
+terms=(
+    '^こころ$'
+    '^心$'
+    'heart.*spirit'
+    'ビール'
+)
+
+for t in "${terms[@]}"; do
+    echo "search term: '$t'"
+    ./jitenshi -d "$dict" "$t"
+    echo "---------------------------"
+done
diff --git a/jiten-pai.py b/jiten-pai.py
index 4b250280e2917dea4d11e8b75361d9b94beecb3f..5c48dcdceb30b292e0bf65fff2e64458d9fc582f 100755 (executable)
--- a/jiten-pai.py
+++ b/jiten-pai.py
@@ -39,16 +39,22 @@ def parse_cmdline():
     if args.term:
         cfg['term'] = args.term
 
-
 def dict_lookup(dict_fname, term):
     with open(dict_fname) as dict_file:
         # edict example line:
         # 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/
-        re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$')
+        #re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$')
         re_term = re.compile(term)
         for line in dict_file:
             try:
-                kanji, kana, trans = re_split.match(line.strip()).groups()
+                #kanji, kana, trans = re_split.match(line.strip()).groups()
+                # splitting the line manually is twice as fast as the regex, but
+                # still about 3 times slower than the C reference implementation
+                p1 = line.split('[', 1)
+                p2 = p1[1].split(']', 1)
+                kanji = p1[0].strip()
+                kana = p2[0].strip()
+                trans = p2[1].strip('/ \t\r\n')
             except:
                 continue
             # for now promiscuously try to match anything anywhere
diff --git a/jitenshi.c b/jitenshi.c
new file mode 100644 (file)
index 0000000..8755e61
--- /dev/null
+++ b/jitenshi.c
@@ -0,0 +1,92 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <sys/types.h>
+#include <regex.h>
+#include <unistd.h>
+
+
+/*
+ * Split one edict line, in place, into its kanji, kana and translation parts.
+ *
+ * Expected layout (extra whitespace around the fields is tolerated):
+ *     KANJI [KANA] /TRANSLATION/
+ * e.g.
+ *     〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/
+ *
+ * line   NUL-terminated buffer; it is modified: the delimiters and the
+ *        whitespace trailing each field are overwritten with '\0'.
+ * kanji  receives the text in front of '['.
+ * kana   receives the text between '[' and ']'.
+ * trans  receives the text between the first '/' after ']' and the final '/'
+ *        of the line; slashes in between are left untouched.
+ *
+ * The three result pointers point into line and are only meaningful when
+ * 0 is returned.
+ *
+ * Returns 0 on success, -1 if the line does not have the expected layout.
+ */
+static inline int split_line(char *line, char **kanji, char **kana, char **trans) {
+    char *p;
+    // POSIX regexes are refreshingly useless for our line splitting needs.
+    // Commence ye olde spaghetti parser!
+    // NOTE: SKIP_SPACE and MATCH return -1 from split_line() on a parse error.
+
+    // Advance P to the next non-space character; fail at end of string.
+#define SKIP_SPACE(P)   while ( *P && isspace((unsigned char)*P) )     \
+                            ++P;                                       \
+                        if (!*P) return -1
+    // Terminate the string at P, then wipe the whitespace in front of P.
+    // The P > line guard keeps the backward scan inside the buffer when the
+    // field in front of P is empty (e.g. a line starting with '[').
+#define BACK_SPACE(P)   *P = '\0';                                     \
+                        while ( P > line && isspace((unsigned char)*--P) ) \
+                            *P = '\0'
+    // Advance P to the next occurrence of C; fail if there is none.
+#define MATCH(P, C)     while ( *P && *P != C )                        \
+                            ++P;                                       \
+                        if (!*P) return -1
+    // edict example line:
+    // 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/
+    p = line;
+    SKIP_SPACE(p);
+    *kanji = p;
+    MATCH(p, '[');
+    *kana = p + 1;
+    BACK_SPACE(p);          // cut kanji at '[' and trim it
+    SKIP_SPACE(*kana);
+    p = *kana;
+    MATCH(p, ']');
+    *trans = p + 1;
+    BACK_SPACE(p);          // cut kana at ']' and trim it
+    p = *trans;
+    MATCH(p, '/');
+    *trans = p + 1;
+    SKIP_SPACE(*trans);
+    // Find the end of the line; a missing newline (last line of a file that
+    // does not end in '\n') is accepted instead of rejecting the entry.
+    while ( *p && *p != '\n' )
+        ++p;
+    BACK_SPACE(p);          // p now sits on the last non-space character
+    MATCH(p, '/');          // ... which has to be the closing '/'
+    BACK_SPACE(p);          // cut trans at the closing '/' and trim it
+#undef SKIP_SPACE
+#undef BACK_SPACE
+#undef MATCH
+    return 0;
+}
+
+/*
+ * Print every entry of an edict file that matches a search term.
+ *
+ * dict_fname  path of the dictionary file to read.
+ * term        POSIX extended regular expression, matched case-insensitively
+ *             against the kanji, kana and translation part of each line.
+ *
+ * Matching entries are written to stdout as "KANJI (KANA) TRANSLATION".
+ * Lines that split_line() cannot parse are skipped silently.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or term is not a
+ * valid regular expression.
+ */
+static int dict_lookup(const char *dict_fname, const char *term) {
+    FILE *dict_file;
+    static char line[16000];
+    char *kanji, *kana, *trans;
+    regex_t re_term;
+
+    if ( NULL == (dict_file = fopen(dict_fname, "r")) )
+        return -1;
+    if ( 0 != regcomp(&re_term, term, REG_EXTENDED | REG_ICASE | REG_NOSUB) ) {
+        // do not leak the open stream when the pattern is invalid
+        fclose(dict_file);
+        return -1;
+    }
+    while ( NULL != fgets(line, sizeof line, dict_file) ) {
+        if ( 0 != split_line(line, &kanji, &kana, &trans) )
+            continue;
+        // for now promiscuously try to match anything anywhere
+        if ( 0 == regexec(&re_term, kanji, 0, NULL, 0)
+            || 0 == regexec(&re_term, kana, 0, NULL, 0)
+            || 0 == regexec(&re_term, trans, 0, NULL, 0) )
+            printf("%s (%s) %s\n", kanji, kana, trans);
+    }
+    // release the compiled pattern; main() may call us once per search term
+    regfree(&re_term);
+    fclose(dict_file);
+    return 0;
+}
+
+/*
+ * Usage: jitenshi [-d DICTFILE] TERM...
+ *
+ * Looks up every TERM in DICTFILE via dict_lookup() and stops at the first
+ * lookup that fails. Exits with EXIT_SUCCESS if all lookups succeeded (or no
+ * TERM was given), EXIT_FAILURE otherwise.
+ */
+int main(int argc, char *argv[]) {
+    const char *dict_fname = "";
+    int status = 0;
+    int opt;
+
+    // Only -d takes effect; missing arguments and unknown options are ignored.
+    for (;;) {
+        opt = getopt( argc, argv, "+:d:" );
+        if ( -1 == opt )
+            break;
+        if ( 'd' == opt )
+            dict_fname = optarg;
+    }
+    // Everything after the options is a search term.
+    while ( 0 == status && optind < argc ) {
+        const char *term = argv[optind++];
+        status = dict_lookup(dict_fname, term);
+    }
+    if ( 0 != status )
+        exit(EXIT_FAILURE);
+    exit(EXIT_SUCCESS);
+}