From: Urban Wallasch Date: Sun, 6 Jun 2021 00:45:15 +0000 (+0200) Subject: * Fixed bug which prevented scanning multiple directories. X-Git-Url: https://git.packet-gain.de/?a=commitdiff_plain;h=506ed16c0b3d9c8aad4a048a70589476be35ab43;p=imgdupe.git * Fixed bug which prevented scanning multiple directories. * Implemented -I include option and default output lead-in. * Implemented -d option to limit directory tree traversal depth. * Amended README.md. * Several minor fixes and improvements, removed cruft and debug code. --- diff --git a/Makefile b/Makefile index b2a62db..f5fedc0 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,6 @@ SELF := $(lastword $(MAKEFILE_LIST)) CC ?= gcc CFLAGS := -W -Wall -Wextra -O2 -std=gnu99 -pthread -MMD -MP CFLAGS += $(shell pkg-config --cflags GraphicsMagickWand) -CFLAGS += -DDEBUG LD := $(CC) LDFLAGS := diff --git a/README.md b/README.md index f4cf95e..f8e2ee6 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,96 @@ # Imdupe -Imdupe is a tool to find potentially duplicate images in a directory tree. +Imdupe is a command line tool to find potential duplicate images in a +directory tree by comparing perceptual hash values of image files. It +writes a shell script to stdout which can in turn be used to further +process the scan results. + +The perceptual hash for an image is calculated by: + + * flattening the image (merging layers/frames) + * transforming the color space to gray scale + * resizing to 16 by 16 pixels while applying Gaussian blur + * normalizing contrast and brightness + * converting to a monochromatic image + * interpreting the pixel values as a 256-bit hash value + + +## Usage +``` +imgdupe [OPTIONS] DIR ... 
+ +OPTIONS: + -d n max directory recursion depth; default: 0 (unlimited) + -f file database file + -m file merge additional database file + -p prune missing files from database + -t n similarity threshold in bits (0..256) or percent; default: 256 + -b float blur factor; add -r when changed between runs; default: 2.5 + -r re-scan files already in database + -I file use file contents as lead-in for output + -T n number of scan threads (default: 4) + -h display this help text and exit +``` +**Notes** + + * `-d` limits the recursion depth when scanning a directory tree; a + value of 1 causes `imdupe` to only scan files in the directories + specified on the command line and not descend into any sub-directories, + 0 (the default) means unlimited depth. + + * `-f` specifies the main database to load (if it exists) before starting + the scan and write back afterwards; it should appear at most once per + invocation, `-m` may be used to merge additional database files; if + this option is not present no database file will be written. + + * `-m` specifies an additional database file to merge, it may appear + multiple times. + + * `-p` helps prevent database bloat by removing stale entries. + + * `-t` is the main option affecting the similarity threshold when + comparing image hashes; however, in most cases it should be kept close + to maximum (256 or, equivalently, 100%) to avoid generating too many + false positives. + + * `-b` can be used to fine-tune the perceptual hash calculation; when + this parameter is changed between invocations a full re-scan should be + performed by additionally specifying `-r`. + + * `-r` initiates a full re-scan, even for files already present in the + database. + + * `-I` allows including a custom lead-in in the output, which is + particularly useful for replacing the default dummy versions of the `VIEW()` + and `END()` functions which are used by `imdupe` to structure its + output. 
+ + * `-T` sets the number of hashing threads to use; for optimal + performance this should be equal to the number of (physical) processor + cores. + + +## Build + +The [GraphicsMagick](http://www.graphicsmagick.org/) library must be +installed as a prerequisite prior to building `imdupe`, as it is required +to manipulate images during perceptual hash calculation. + +Run `make` in the project directory to build the `imdupe` executable. + +Though it was only tested on GNU/Linux, it should work with few (if any) +modifications on other Unix-like systems. + + +## Caveat + +Due to the overly simplistic handling of numerical values, database files +are only portable between systems of equal endianness. ## License -FFpreview is distributed under the Modified ("3-clause") BSD License. +Imdupe is distributed under the Modified ("3-clause") BSD License. See `LICENSE` file for more information. ---------------------------------------------------------------------- diff --git a/db.c b/db.c index e2cbf97..644a350 100644 --- a/db.c +++ b/db.c @@ -89,8 +89,7 @@ db_entry_t *db_entry_new(const char *fname) { db_t *db_init(void) { db_t *db = s_malloc(sizeof *db); - memset(db->p_ent, 0, sizeof db->p_ent); - memset(db->n_ent, 0, sizeof db->n_ent); + memset(db, 0, sizeof *db); pthread_mutex_init(&db->mtx, NULL); InitializeMagick(NULL); return db; @@ -212,7 +211,6 @@ int db_read(db_t *db, const char *dbf) { if ( NULL == (fp = fopen(dbf, "rb")) ) return -1; - db_lock(db); while ( !feof(fp) ) { p = db_entry_new(NULL); fread((void *)&p->phash, sizeof p->phash, 1, fp); @@ -228,14 +226,19 @@ int db_read(db_t *db, const char *dbf) { s_free(p); break; } + if ( NULL != db_find(db, buf) ) { + s_free(p); + continue; + } p->fname = s_strdup(buf); + db_lock(db); p->inext = db->n_ent[p->nhash]; db->n_ent[p->nhash] = p; p->pnext = db->p_ent[p->hamw]; db->p_ent[p->hamw] = p; + db_unlock(db); ++cnt; } - db_unlock(db); return cnt; } diff --git a/db.h b/db.h index 6a6b3b1..dce19fa 100644 --- 
a/db.h +++ b/db.h @@ -13,16 +13,11 @@ enum db_entry_flags { DB_ENTRY_FLAG_DEL = 1, - DB_ENTRY_FLAG_MARK = 2, }; #define DB_ENTRY_DELETE(p) ((p)->flags |= DB_ENTRY_FLAG_DEL) #define DB_ENTRY_ISDELETED(p) ((p)->flags & DB_ENTRY_FLAG_DEL) -#define DB_ENTRY_MARK(p) ((p)->flags |= DB_ENTRY_FLAG_MARK) -#define DB_ENTRY_UNMARK(p) ((p)->flags &= ~DB_ENTRY_FLAG_MARK) -#define DB_ENTRY_ISMARKED(p) ((p)->flags & DB_ENTRY_FLAG_MARK) - typedef struct db_entry_struct diff --git a/main.c b/main.c index ee2db12..7d5d1b3 100644 --- a/main.c +++ b/main.c @@ -24,6 +24,7 @@ static struct { int rescan; double blur; int thresh; + int maxdepth; int max_fd; int nthreads; } cfg = { @@ -32,6 +33,7 @@ static struct { .rescan = 0, .blur = 2.5, .thresh = 256, + .maxdepth = 0, .max_fd = 1000, .nthreads = 4, }; @@ -76,8 +78,9 @@ void *worker(void *arg) { int scan_dir_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) { (void)sb; - (void)ftwbuf; - if (FTW_F == typeflag) { + if ( cfg.maxdepth && cfg.maxdepth < ftwbuf->level ) + return 0; + if ( FTW_F == typeflag ) { db_entry_t *entry; entry = db_entry_new(fpath); q_enq(q, entry); @@ -121,7 +124,13 @@ static int find_dupes_cb(db_entry_t *dupes) { printf("VIEW"); db_entry_t *p; for ( p = dupes; NULL != p; p = p->aux ) { - printf(" '%s'", p->fname); + printf(" '"); + for ( const char *cp = p->fname; '\0' != *cp; ++cp ) { + if ( '\'' == *cp ) + putchar('\\'); + putchar(*cp); + } + putchar('\''); } puts(""); return 0; @@ -132,14 +141,16 @@ static void usage(char *pname, int ec) { printf("%s - find potentially duplicate images\n", prog); printf("USAGE: %s [OPTIONS] DIR ...\n", prog); printf("OPTIONS:\n" - " -h display this help text and exit\n" - " -b float blur factor; needs -r when changed between runs; default: 1.5\n" - " -f file fingerprint database file\n" - " -m file merge additional fingerprint file\n" + " -d n max directory recursion depth; default: 0 (unlimited)\n" + " -f file database file\n" + " -m file merge 
additional database file\n" " -p prune missing files from database\n" - " -r rescan files already in database\n" " -t n similarity threshold in bits (0..256) or percent; default: 256\n" - " -T num number of scan threads (default: 4)\n" + " -b float blur factor; add -r when changed between runs; default: 2.5\n" + " -r re-scan files already in database\n" + " -I file use file contents as lead-in for output\n" + " -T n number of scan threads (default: 4)\n" + " -h display this help text and exit\n" ); exit(ec); } @@ -147,18 +158,32 @@ static void usage(char *pname, int ec) { /* main function */ int main(int argc, char *argv[]) { int c, rc = 0, n; + char *cp; db_t *db ; + const char *lead_in = + "#!\\bin\\sh\n" + "VIEW(){ for arg in \"$@\"; do echo \"$arg\"; done; echo '-----'; }\n" + "END(){ echo 'Done'; }\n\n"; db = db_init(); - while ( ( c = getopt( argc, argv, "+:hb:f:m:prt:T:" ) ) != -1 ) { + while ( ( c = getopt( argc, argv, "+:b:d:hI:f:m:prt:T:" ) ) != -1 ) { switch (c) { - case 'h': - usage(argv[0], EXIT_SUCCESS); - break; case 'b': n = atoi(optarg); cfg.blur = n < 1 ? 1 : n; break; + case 'd': + n = atoi(optarg); + cfg.maxdepth = n < 0 ? 0 : n; + break; + case 'h': + usage(argv[0], EXIT_SUCCESS); + break; + case 'I': + n = cat_file(optarg); + lead_in = ""; + dprintf("written %d bytes lead-in (%s)\n", n, optarg); + break; case 'f': cfg.db_outfile = optarg; /* fall through */ @@ -173,15 +198,13 @@ int main(int argc, char *argv[]) { case 'r': cfg.rescan = 1; break; - case 't': { - char *ep; - n = strtol(optarg, &ep, 10); - if ( '%' == *ep ) { - n = n < 0 ? 0 : n > 100 ? 100 : n; - n = 256 * n / 100; - } - cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n; + case 't': + n = strtol(optarg, &cp, 10); + if ( '%' == *cp ) { + n = n < 0 ? 0 : n > 100 ? 100 : n; + n = 256 * n / 100; } + cfg.thresh = n < 0 ? 0 : n > 256 ? 
256 : n; break; case 'T': n = atoi(optarg); @@ -211,8 +234,9 @@ int main(int argc, char *argv[]) { dprintf("scanning '.'\n"); rc = scan_dir(".", q, db); } else { - while ( optind < argc && 0 == rc ) { + while ( optind < argc ){//&& 0 == rc ) { dprintf("scanning '%s'\n", argv[optind]); + q_reset_complete(q); rc = scan_dir(argv[optind++], q, db); } } @@ -230,8 +254,7 @@ int main(int argc, char *argv[]) { } dprintf("searching dupes ...\n"); - printf("#!/bin/bash\n"); - printf(". ~/bin/imgmultiview.inc\n"); + printf("%s", lead_in); rc = db_find_dupes(db, cfg.thresh, find_dupes_cb); printf("END\n"); diff --git a/queue.c b/queue.c index 240c9c6..e53e284 100644 --- a/queue.c +++ b/queue.c @@ -80,4 +80,10 @@ void q_set_complete(queue_t *q) { q_unlock(q); } +void q_reset_complete(queue_t *q) { + q_lock(q); + q->complete = 0; + q_unlock(q); +} + /* EOF */ diff --git a/queue.h b/queue.h index 11ab79d..cd262ae 100644 --- a/queue.h +++ b/queue.h @@ -22,6 +22,7 @@ extern int q_enq(queue_t *q, db_entry_t *entry); extern db_entry_t *q_deq(queue_t *q); extern int q_complete(queue_t *q); extern void q_set_complete(queue_t *q); +extern void q_reset_complete(queue_t *q); #endif //ndef QUEUE_H_INCLUDED diff --git a/util.c b/util.c index f43cef6..3c944a0 100644 --- a/util.c +++ b/util.c @@ -25,8 +25,18 @@ void *s_strdup(const char *s) { return d; } -void s_free(void *p){ +void s_free(void *p) { free(p); } +int cat_file(const char *fname) { + FILE *fp; + int c, cnt = 0; + if ( NULL == (fp = fopen(fname, "rb")) ) + return -1; + while ( EOF != (c = fgetc(fp)) ) + ++cnt, fputc(c, stdout); + return cnt; +} + /* EOF */ diff --git a/util.h b/util.h index 7890c17..631d026 100644 --- a/util.h +++ b/util.h @@ -18,4 +18,6 @@ extern void *s_malloc(size_t sz); extern void *s_strdup(const char *s); extern void s_free(void *p); +extern int cat_file(const char *fname); + #endif //ndef UTIL_H_INCLUDED