if args.term:
cfg['term'] = args.term
-
def dict_lookup(dict_fname, term):
with open(dict_fname) as dict_file:
# edict example line:
# 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/
- re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$')
+ #re_split = re.compile(r'^\s*(.*?)\s*\[\s*(.*?)\s*\]\s*/\s*(.*?)\s*/\s*$')
re_term = re.compile(term)
for line in dict_file:
try:
- kanji, kana, trans = re_split.match(line.strip()).groups()
+ #kanji, kana, trans = re_split.match(line.strip()).groups()
+ # splitting the line manually is twice as fast as the regex, but
+ # still about 3 times slower than the C reference implementation
+ p1 = line.split('[', 1)
+ p2 = p1[1].split(']', 1)
+ kanji = p1[0].strip()
+ kana = p2[0].strip()
+ trans = p2[1].strip('/ \t\r\n')
except:
continue
# for now promiscuously try to match anything anywhere
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <sys/types.h>
+#include <regex.h>
+#include <unistd.h>
+
+
// Return a pointer to the first non-whitespace character at or after s
// (this may be the terminating NUL).
static inline char *split_skip_space(char *s) {
    while (*s && isspace((unsigned char)*s))
        ++s;
    return s;
}

// Return a pointer to the first occurrence of c at or after s, or NULL if
// the string ends first.
static inline char *split_find(char *s, char c) {
    while (*s && *s != c)
        ++s;
    return *s ? s : NULL;
}

// Terminate a field by overwriting *end with NUL, then blank out any
// whitespace directly in front of it.  The walk never goes below `floor`,
// so an empty field cannot make it run off the front of the buffer.
static inline void split_terminate(char *floor, char *end) {
    *end = '\0';
    while (end > floor && isspace((unsigned char)end[-1]))
        *--end = '\0';
}

// Split one edict line, in place, into its three fields.
//
// edict example line:
// 〆日 [しめび] /(n) time limit/closing day/settlement day (payment)/deadline/
//
// On success returns 0 and stores pointers into `line` (which is modified
// by writing NUL terminators):
//   *kanji - text before '[', surrounding whitespace removed (may be empty)
//   *kana  - text between '[' and ']', surrounding whitespace removed
//   *trans - text after the first '/' following ']', with the final '/'
//            of the line and surrounding whitespace removed
// Returns -1 if the line does not have this shape; in that case `line` and
// the output pointers are left untouched.
//
// The line may end with '\n' or simply at the terminating NUL, so a last
// line without a trailing newline is still accepted.
static inline int split_line(char *line, char **kanji, char **kana, char **trans) {
    char *head, *open, *reading, *close, *slash, *gloss, *end;

    // Locate every delimiter first; nothing is written until the whole
    // line is known to be well-formed.
    head = split_skip_space(line);
    if (!*head)
        return -1;
    if (NULL == (open = split_find(head, '[')))
        return -1;
    reading = split_skip_space(open + 1);
    if (!*reading)
        return -1;
    if (NULL == (close = split_find(reading, ']')))
        return -1;
    if (NULL == (slash = split_find(close + 1, '/')))
        return -1;
    gloss = split_skip_space(slash + 1);
    if (!*gloss)
        return -1;

    // Find the end of the line, then step back over trailing whitespace.
    // gloss[0] is known to be non-space, so `end` stays above `gloss`.
    end = gloss;
    while (*end && *end != '\n')
        ++end;
    while (end > gloss && isspace((unsigned char)end[-1]))
        --end;
    // The last visible character must be the closing '/'.
    if (end[-1] != '/')
        return -1;

    // Cut the fields apart, back to front.
    split_terminate(gloss, end - 1);
    split_terminate(reading, close);
    split_terminate(head, open);

    *kanji = head;
    *kana = reading;
    *trans = gloss;
    return 0;
}
+
// Scan the dictionary file `dict_fname` line by line and print every entry
// in which `term` matches the kanji, kana or translation field.
//
// `term` is compiled as a POSIX extended regular expression and matched
// case-insensitively.  Lines that split_line() rejects are skipped.
// Matching entries are written to stdout as "kanji (kana) translation".
//
// Returns 0 on success, -1 if the file cannot be opened, the pattern does
// not compile, or reading the file fails.
static int dict_lookup(const char *dict_fname, const char *term) {
    FILE *dict_file;
    // static: keeps the large line buffer off the stack
    static char line[16000];
    char *kanji, *kana, *trans;
    regex_t re_term;
    int rc = 0;

    if ( NULL == (dict_file = fopen(dict_fname, "r")) )
        return -1;
    if ( 0 != regcomp(&re_term, term, REG_EXTENDED | REG_ICASE | REG_NOSUB) ) {
        // do not leak the already opened file on a bad pattern
        fclose(dict_file);
        return -1;
    }
    while ( NULL != fgets(line, sizeof line, dict_file) ) {
        if ( 0 != split_line(line, &kanji, &kana, &trans) )
            continue;
        // for now promiscuously try to match anything anywhere
        if ( 0 == regexec(&re_term, kanji, 0, NULL, 0)
          || 0 == regexec(&re_term, kana, 0, NULL, 0)
          || 0 == regexec(&re_term, trans, 0, NULL, 0) )
            printf("%s (%s) %s\n", kanji, kana, trans);
    }
    // fgets() returns NULL for both EOF and I/O errors; tell them apart
    if ( ferror(dict_file) )
        rc = -1;
    // release the compiled pattern; this function may be called once per term
    regfree(&re_term);
    fclose(dict_file);
    return rc;
}
+
+int main(int argc, char *argv[]) {
+ int rc = 0, c;
+ const char *dict_fname = "";
+
+ while ( ( c = getopt( argc, argv, "+:d:" ) ) != -1 ) {
+ switch (c) {
+ case 'd':
+ dict_fname = optarg;
+ break;
+ case ':':
+ case '?':
+ default:
+ break;
+ }
+ }
+ while ( optind < argc && 0 == rc ) {
+ rc = dict_lookup(dict_fname, argv[optind++]);
+ }
+ exit(rc ? EXIT_FAILURE : EXIT_SUCCESS);
+}