From: Urban Wallasch Date: Sun, 13 Jun 2021 08:40:30 +0000 (+0200) Subject: * Parse dict lines using split() and strip() to get rid of one slow regex. X-Git-Url: https://git.packet-gain.de/?a=commitdiff_plain;h=667ce1e6b7f08c95b62bd6186e828227000c3a22;p=jiten-pai.git * Parse dict lines using split() and strip() to get rid of one slow regex. * Added a C reference implementation. --- diff --git a/cdemo.sh b/cdemo.sh new file mode 100755 index 0000000..d571385 --- /dev/null +++ b/cdemo.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cc -W -Wall -Wextra -o jitenshi jitenshi.c + +dict=/usr/share/gjiten/dics/edict + +terms=( + '^こころ$' + '^心$' + 'heart.*spirit' + 'ビール' +) + +for t in "${terms[@]}"; do + echo "search term: '$t'" + ./jitenshi -d "$dict" "$t" + echo "---------------------------" +done diff --git a/jiten-pai.py b/jiten-pai.py index 4b25028..5c48dcd 100755 --- a/jiten-pai.py +++ b/jiten-pai.py @@ -39,16 +39,22 @@ def parse_cmdline(): if args.term: cfg['term'] = args.term - def dict_lookup(dict_fname, term): with open(dict_fname) as dict_file: # edict example line: # 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/ - re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$') + #re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$') re_term = re.compile(term) for line in dict_file: try: - kanji, kana, trans = re_split.match(line.strip()).groups() + #kanji, kana, trans = re_split.match(line.strip()).groups() + # splitting the line manually is twice as fast as the regex, but + # still about 3 times slower than the C reference implementation + p1 = line.split('[', 1) + p2 = p1[1].split(']', 1) + kanji = p1[0].strip() + kana = p2[0].strip() + trans = p2[1].strip('/ \t\r\n') except: continue # for now promiscuously try to match anything anywhere diff --git a/jitenshi.c b/jitenshi.c new file mode 100644 index 0000000..8755e61 --- /dev/null +++ b/jitenshi.c @@ -0,0 +1,92 @@ +#include +#include +#include + +#include +#include +#include + + +static inline int split_line(char *line, char **kanji, char **kana, char **trans) { + char *p; + // POSIX regexes are refreshingly useless for our line splitting needs. + // Commence ye olde spaghetti parser! +#define SKIP_SPACE(P) while ( *P && isspace((unsigned char)*P) ) \ + ++P; \ + if (!*P) return -1 +#define BACK_SPACE(P) *P = '\0'; \ + while ( isspace((unsigned char)*--P) ) \ + *P = '\0' +#define MATCH(P, C) while ( *P && *P != C ) \ + ++P; \ + if (!*P) return -1 + // edict example line: + // 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/ + p = line; + SKIP_SPACE(p); + *kanji = p; + MATCH(p, '['); + *kana = p + 1; + BACK_SPACE(p); + SKIP_SPACE(*kana); + p = *kana; + MATCH(p, ']'); + *trans = p + 1; + BACK_SPACE(p); + p = *trans; + MATCH(p, '/'); + *trans = p + 1; + SKIP_SPACE(*trans); + MATCH(p, '\n'); + BACK_SPACE(p); + MATCH(p, '/'); + BACK_SPACE(p); +#undef SKIP_SPACE +#undef BACK_SPACE +#undef MATCH + return 0; +} + +static int dict_lookup(const char *dict_fname, const char *term) { + FILE *dict_file; + static char line[16000]; + char *kanji, *kana, *trans; + regex_t re_term; + + if ( NULL == (dict_file = fopen(dict_fname, "r")) ) + return -1; + if ( 0 != regcomp(&re_term, term, REG_EXTENDED | REG_ICASE | REG_NOSUB) ) + return -1; + while ( NULL != fgets(line, sizeof line, dict_file) ) { + if ( 0 != split_line(line, &kanji, &kana, &trans) ) + continue; + // for now promiscuously try to match anything anywhere + if ( 0 == regexec(&re_term, kanji, 0, NULL, 0) + || 0 == regexec(&re_term, kana, 0, NULL, 0) + || 0 == regexec(&re_term, trans, 0, NULL, 0) ) + printf("%s (%s) %s\n", kanji, kana, trans); + } + fclose(dict_file); + return 0; +} + +int main(int argc, char *argv[]) { + int rc = 0, c; + const char *dict_fname = ""; + + while ( ( c = getopt( argc, argv, "+:d:" ) ) != -1 ) { + switch (c) { + case 'd': + dict_fname = optarg; + break; + case ':': + case '?': + default: + break; + } + } + while ( optind < argc && 0 == rc ) { + rc = dict_lookup(dict_fname, argv[optind++]); + } + exit(rc ? EXIT_FAILURE : EXIT_SUCCESS); +}