#!/usr/bin/python3

"""
    eucjp_to_utf8.py

    Convert EUC-JP encoded text to UTF-8.
    Input files with .gz extension are automatically decompressed.
    Without infile, input is read from stdin; without outfile, output
    is written to stdout.

    USAGE: eucjp_to_utf8.py [infile [outfile]]

    EXAMPLE: ./eucjp_to_utf8.py enamdict.gz enamdict

    Copyright (c) 2021 Urban Wallasch
    Modified ("3-clause") BSD License
"""

import codecs
import contextlib
import sys


def transcode(ifile, ofile):
    """Re-encode EUC-JP bytes from *ifile* as UTF-8 bytes on *ofile*.

    ifile -- any iterable yielding bytes objects (e.g. a binary file,
             which yields lines).  Chunk boundaries are arbitrary; they
             need not coincide with character boundaries.
    ofile -- binary writable object with a write(bytes) method.

    Raises UnicodeDecodeError if the input is not valid EUC-JP, including
    input that ends in the middle of a multi-byte character.
    """
    # An incremental decoder buffers a trailing partial multi-byte
    # sequence until the next chunk arrives, so any chunking is safe.
    decoder = codecs.getincrementaldecoder('euc_jp')()
    for chunk in ifile:
        ofile.write(decoder.decode(chunk).encode('utf-8'))
    # final=True makes the decoder complain about a truncated trailing
    # sequence instead of silently dropping it.
    ofile.write(decoder.decode(b'', final=True).encode('utf-8'))


def main(argv=None):
    """Command line entry point; *argv* defaults to sys.argv.

    argv[1], if present, names the input file (gunzipped on the fly when
    the name ends in '.gz'); otherwise stdin is read.
    argv[2], if present, names the output file; otherwise stdout is used.
    """
    if argv is None:
        argv = sys.argv

    # ExitStack closes every file we opened ourselves, also on error,
    # while leaving the process' stdin/stdout alone.
    with contextlib.ExitStack() as stack:
        if len(argv) > 1:
            iname = argv[1]
            ifile = stack.enter_context(open(iname, 'rb'))
            if iname.endswith('.gz'):
                import gzip
                ifile = stack.enter_context(gzip.GzipFile(fileobj=ifile))
        else:
            ifile = sys.stdin.buffer

        # Output is always opened in binary mode and fed explicitly
        # UTF-8 encoded bytes: text mode would apply the locale's
        # preferred encoding and platform newline translation.
        if len(argv) > 2:
            ofile = stack.enter_context(open(argv[2], 'wb'))
        else:
            ofile = sys.stdout.buffer

        transcode(ifile, ofile)
        ofile.flush()


if __name__ == '__main__':
    main()