From e423448d4758befed8c79b31f1ba2d39e4bcd8ff Mon Sep 17 00:00:00 2001 From: Urban Wallasch Date: Mon, 7 Jun 2021 16:32:09 +0200 Subject: [PATCH] * Fixed duplicate search being to insensitive for low threshold values. * Introduced '-x' option to suppress redundant reports at the cost of reduced accuracy. * Echo hashes in output script for debug builds. --- README.md | 6 ++++++ db.c | 52 +++++++++++++++++++++++++++++++--------------------- db.h | 7 ++++++- main.c | 24 +++++++++++++++++++----- 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 079b174..fbb3f7f 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ OPTIONS: -m file merge additional database file -p prune missing files from database -t n similarity threshold in bits (0..256) or percent; default: 256 + -x ignore images already listed in this run -b float blur factor; add -r when changed between runs; default: 2.5 -r re-scan files already in database -i text add text as lead-in to output @@ -58,6 +59,11 @@ OPTIONS: to maximum (256 or, equivalent, 100%) to avoid generating too many false positives. + * `-x` marks images that have already been reported as duplicates so + they are ignored for the remainder of the run. While this eliminates + redundant reports especially for lower threshold values it may lead + to some duplicates being missed. Marks are reset prior to each run. + * `-b` can be used to fine-tune the perceptual hash calculation; when this parameter is changed between invocations a full re-scan should be performed by additionally specifying `-r`. diff --git a/db.c b/db.c index 4990d8e..def9831 100644 --- a/db.c +++ b/db.c @@ -243,31 +243,41 @@ int db_read(db_t *db, const char *dbf) { } /* find duplicate or similar images */ -int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) ) { +static inline db_entry_t *check(db_entry_t *p, db_entry_t *cmp, + db_entry_t *dupes, int tdelta, int mark) { + while ( NULL != cmp ) { + if ( !DB_ENTRY_ISDELETED(cmp) && (!mark || !DB_ENTRY_ISMARKED(cmp)) + && hdist(cmp->phash, p->phash) <= tdelta ) { + DB_ENTRY_MARK(cmp); + cmp->aux = dupes; + dupes = cmp; + } + cmp = cmp->pnext; + } + return dupes; +} + +int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) ) { + int tdelta = PHASH_BITS - thresh; + db_entry_t *p, *dupes; + db_lock(db); - for ( size_t i = 0; i < ARR_SIZE(db->p_ent); ++i ) { - db_entry_t *p, *cmp, *dupes; + /* reset all check marks */ + for ( int i = 0; i < PHASH_BITS; ++i ) { + for ( p = db->p_ent[i]; NULL != p; p = p->pnext ) + DB_ENTRY_UNMARK(p); + } + /* find image sets with similar perceptual hash */ + for ( int i = 0; i <= PHASH_BITS; ++i ) { for ( p = db->p_ent[i]; NULL != p; p = p->pnext ) { if ( DB_ENTRY_ISDELETED(p) ) continue; - dupes = NULL; - for ( cmp = p->pnext; NULL != cmp; cmp = cmp->pnext ) { - if ( !DB_ENTRY_ISDELETED(cmp) - && thresh <= hdist(cmp->phash, p->phash) ) { - cmp->aux = dupes; - dupes = cmp; - } - } - if ( thresh < 256 && i < PHASH_BITS ) { - /* for lower thresh also look in the next bucket over */ - for ( cmp = db->p_ent[i+1]; NULL != cmp; cmp = cmp->pnext ) { - if ( !DB_ENTRY_ISDELETED(cmp) - && thresh <= 256 - hdist(cmp->phash, p->phash) ) { - cmp->aux = dupes; - dupes = cmp; - } - } - } + /* look in remainder of same bucket first */ + dupes = check( p, p->pnext, NULL, tdelta, mark ); + /* for thresh < max also look in the next buckets */ + for ( int j = i + 1; j <= i + tdelta && j <= PHASH_BITS; ++j ) + dupes = check( p, db->p_ent[j], dupes, tdelta, mark ); + /* report found duplicates, if any */ if ( NULL != dupes ) { p->aux = dupes; dupes = p; diff --git a/db.h b/db.h index dce19fa..5c105ba 100644 --- a/db.h +++ b/db.h @@ -13,11 +13,16 @@ enum db_entry_flags { DB_ENTRY_FLAG_DEL = 1, + DB_ENTRY_FLAG_MARK = 2, }; #define DB_ENTRY_DELETE(p) ((p)->flags |= DB_ENTRY_FLAG_DEL) #define DB_ENTRY_ISDELETED(p) ((p)->flags & DB_ENTRY_FLAG_DEL) +#define DB_ENTRY_MARK(p) ((p)->flags |= DB_ENTRY_FLAG_MARK) +#define DB_ENTRY_UNMARK(p) ((p)->flags &= ~DB_ENTRY_FLAG_MARK) +#define DB_ENTRY_ISMARKED(p) ((p)->flags & DB_ENTRY_FLAG_MARK) + typedef struct db_entry_struct @@ -56,7 +61,7 @@ extern int db_insert(db_t *db, db_entry_t *entry, int replace, double blur); extern int db_update(db_t *db, db_entry_t *entry); extern int db_write(db_t *db, const char *dbf); extern int db_read(db_t *db, const char *dbf); -extern int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) ); +extern int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) ); #endif //ndef DB_H_INCLUDED diff --git a/main.c b/main.c index e2f8eaf..0ad71f8 100644 --- a/main.c +++ b/main.c @@ -24,6 +24,7 @@ static struct { int rescan; double blur; int thresh; + int mark; int maxdepth; int max_fd; int nthreads; @@ -32,7 +33,8 @@ static struct { .db_prune = 0, .rescan = 0, .blur = 2.5, - .thresh = 256, + .thresh = PHASH_BITS, + .mark = 0, .maxdepth = 0, .max_fd = 1000, .nthreads = 4, @@ -121,8 +123,16 @@ int scan_dir(const char *dir, queue_t *q, db_t *db) { /* callback for db_find_dupes() */ static int find_dupes_cb(db_entry_t *dupes) { - printf("VIEW"); db_entry_t *p; +#ifdef DEBUG + for ( p = dupes; NULL != p; p = p->aux ) { + printf("echo '"); + for ( int i = 0; i < 4; ++i) + printf(" %016"PRIx64, p->phash[i]); + puts("'"); + } +#endif + printf("VIEW"); for ( p = dupes; NULL != p; p = p->aux ) { printf(" '"); for ( const char *cp = p->fname; '\0' != *cp; ++cp ) { @@ -146,6 +156,7 @@ static void usage(char *pname, int ec) { " -m file merge additional database file\n" " -p prune missing files from database\n" " -t n similarity threshold in bits (0..256) or percent; default: 256\n" + " -x ignore images already listed in this run\n" " -b float blur factor; add -r when changed between runs; default: 2.5\n" " -r re-scan files already in database\n" " -i text add text as lead-in to output\n" @@ -223,14 +234,17 @@ int main(int argc, char *argv[]) { n = strtol(optarg, &cp, 10); if ( '%' == *cp ) { n = n < 0 ? 0 : n > 100 ? 100 : n; - n = 256 * n / 100; + n = PHASH_BITS * n / 100; } - cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n; + cfg.thresh = n < 0 ? 0 : n > PHASH_BITS ? PHASH_BITS : n; break; case 'T': n = atoi(optarg); cfg.nthreads = n > 0 ? n : 1; break; + case 'x': + cfg.mark = 1; + break; case ':': eprintf("ERROR: missing argument for option '-%c'\n", optopt); usage(argv[0], EXIT_FAILURE); @@ -276,7 +290,7 @@ int main(int argc, char *argv[]) { dprintf("searching for potential duplicates ...\n"); printf("%s", lead_in); - db_find_dupes(db, cfg.thresh, find_dupes_cb); + db_find_dupes(db, cfg.thresh, cfg.mark, find_dupes_cb); printf("END\n"); db_destroy(&db); -- 2.30.2