* Introduced '-x' option to suppress redundant duplicate reports at the cost of possibly missing some duplicates.
* Echo hashes in output script for debug builds.
-m file merge additional database file
-p prune missing files from database
-t n similarity threshold in bits (0..256) or percent; default: 256
+ -x ignore images already listed in this run
-b float blur factor; add -r when changed between runs; default: 2.5
-r re-scan files already in database
-i text add text as lead-in to output
to the maximum (256 or, equivalently, 100%) to avoid generating too many
false positives; see the sketch after this list for how a percent value
maps onto hash bits.
+ * `-x` marks images that have already been reported as duplicates so
+   that they are skipped when searching for duplicates of images scanned
+   later in the same run. While this eliminates redundant reports,
+   especially at lower threshold values, it may cause some duplicates to
+   be missed. Marks are reset prior to each run.
+
* `-b` can be used to fine-tune the perceptual hash calculation; when
this parameter is changed between invocations, a full re-scan should be
performed by additionally specifying `-r`.
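
For illustration, here is a minimal sketch of how a threshold given in
percent translates into hash bits and into the maximum Hamming distance
(`tdelta`) allowed between two perceptual hashes; it mirrors the clamping
done by the option parser, and `PHASH_BITS` is assumed to be 256 (four
64-bit hash words):

    #include <stdlib.h>

    #define PHASH_BITS 256  /* assumed: 4 x 64-bit hash words */

    /* convert a "-t" argument ("230" or "90%") into the maximum Hamming
     * distance (in bits) by which two hashes may differ and still be
     * reported as duplicates */
    static int thresh_to_tdelta(const char *arg) {
        char *cp;
        long n = strtol(arg, &cp, 10);
        if ( '%' == *cp ) {
            n = n < 0 ? 0 : n > 100 ? 100 : n;
            n = PHASH_BITS * n / 100;    /* e.g. 90% -> 230 bits */
        }
        n = n < 0 ? 0 : n > PHASH_BITS ? PHASH_BITS : n;
        return PHASH_BITS - (int)n;      /* e.g. 230 bits -> tdelta of 26 */
    }

With the default of 256 (100%), `tdelta` is 0 and only bit-identical
hashes are reported as duplicates.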
}
/* find duplicate or similar images */
-int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) ) {
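+/* helper: scan the entry list cmp for images whose perceptual hash is
+ * within tdelta bits (Hamming distance, hdist()) of p's hash and prepend
+ * them to the dupes chain linked through aux; matches are marked, and
+ * when mark is set, entries that are already marked are skipped */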
+static inline db_entry_t *check(db_entry_t *p, db_entry_t *cmp,
+ db_entry_t *dupes, int tdelta, int mark) {
+ while ( NULL != cmp ) {
+ if ( !DB_ENTRY_ISDELETED(cmp) && (!mark || !DB_ENTRY_ISMARKED(cmp))
+ && hdist(cmp->phash, p->phash) <= tdelta ) {
+ DB_ENTRY_MARK(cmp);
+ cmp->aux = dupes;
+ dupes = cmp;
+ }
+ cmp = cmp->pnext;
+ }
+ return dupes;
+}
+
+int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) ) {
+ int tdelta = PHASH_BITS - thresh;
+ db_entry_t *p, *dupes;
+
db_lock(db);
- for ( size_t i = 0; i < ARR_SIZE(db->p_ent); ++i ) {
- db_entry_t *p, *cmp, *dupes;
+ /* reset all check marks */
+    for ( int i = 0; i <= PHASH_BITS; ++i ) {
+ for ( p = db->p_ent[i]; NULL != p; p = p->pnext )
+ DB_ENTRY_UNMARK(p);
+ }
+ /* find image sets with similar perceptual hash */
+ for ( int i = 0; i <= PHASH_BITS; ++i ) {
for ( p = db->p_ent[i]; NULL != p; p = p->pnext ) {
if ( DB_ENTRY_ISDELETED(p) )
continue;
- dupes = NULL;
- for ( cmp = p->pnext; NULL != cmp; cmp = cmp->pnext ) {
- if ( !DB_ENTRY_ISDELETED(cmp)
- && thresh <= hdist(cmp->phash, p->phash) ) {
- cmp->aux = dupes;
- dupes = cmp;
- }
- }
- if ( thresh < 256 && i < PHASH_BITS ) {
- /* for lower thresh also look in the next bucket over */
- for ( cmp = db->p_ent[i+1]; NULL != cmp; cmp = cmp->pnext ) {
- if ( !DB_ENTRY_ISDELETED(cmp)
- && thresh <= 256 - hdist(cmp->phash, p->phash) ) {
- cmp->aux = dupes;
- dupes = cmp;
- }
- }
- }
+ /* look in remainder of same bucket first */
+ dupes = check( p, p->pnext, NULL, tdelta, mark );
+ /* for thresh < max also look in the next buckets */
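+            /* assuming buckets are indexed by the popcount of the hash:
+             * two popcounts can differ by at most the Hamming distance
+             * between the hashes, so matches within tdelta bits can only
+             * sit in buckets i..i+tdelta; lower buckets were already
+             * covered when they were the outer bucket */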
+ for ( int j = i + 1; j <= i + tdelta && j <= PHASH_BITS; ++j )
+ dupes = check( p, db->p_ent[j], dupes, tdelta, mark );
+ /* report found duplicates, if any */
if ( NULL != dupes ) {
p->aux = dupes;
dupes = p;
enum db_entry_flags {
DB_ENTRY_FLAG_DEL = 1,
+ DB_ENTRY_FLAG_MARK = 2,
};
#define DB_ENTRY_DELETE(p) ((p)->flags |= DB_ENTRY_FLAG_DEL)
#define DB_ENTRY_ISDELETED(p) ((p)->flags & DB_ENTRY_FLAG_DEL)
+#define DB_ENTRY_MARK(p) ((p)->flags |= DB_ENTRY_FLAG_MARK)
+#define DB_ENTRY_UNMARK(p) ((p)->flags &= ~DB_ENTRY_FLAG_MARK)
+#define DB_ENTRY_ISMARKED(p) ((p)->flags & DB_ENTRY_FLAG_MARK)
+
typedef
struct db_entry_struct
extern int db_update(db_t *db, db_entry_t *entry);
extern int db_write(db_t *db, const char *dbf);
extern int db_read(db_t *db, const char *dbf);
-extern int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) );
+extern int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) );
#endif //ndef DB_H_INCLUDED
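
As a usage sketch (not part of this patch): the callback passed to
db_find_dupes() receives the head of a NULL-terminated chain of similar
entries linked through their aux pointers, with the scanned entry itself
at the head. The callback name, the return-value convention, and the
"db.h" header name below are assumptions:

    #include <stdio.h>
    #include "db.h"   /* assumed header name for db_t/db_entry_t */

    /* hypothetical callback: print each set of similar images, one file
     * name per line, with a blank line between sets */
    static int list_dupes(db_entry_t *dupes) {
        for ( db_entry_t *p = dupes; NULL != p; p = p->aux )
            printf("%s\n", p->fname);
        printf("\n");
        return 0;   /* 0 assumed to mean "continue" */
    }

Such a callback would be passed to db_find_dupes() in place of the
find_dupes_cb() used by main() below.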
int rescan;
double blur;
int thresh;
+ int mark;
int maxdepth;
int max_fd;
int nthreads;
.db_prune = 0,
.rescan = 0,
.blur = 2.5,
- .thresh = 256,
+ .thresh = PHASH_BITS,
+ .mark = 0,
.maxdepth = 0,
.max_fd = 1000,
.nthreads = 4,
/* callback for db_find_dupes() */
static int find_dupes_cb(db_entry_t *dupes) {
- printf("VIEW");
db_entry_t *p;
+#ifdef DEBUG
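+    /* debug builds only: echo each entry's 256-bit perceptual hash so the
+     * generated script prints it ahead of the corresponding VIEW command */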
+ for ( p = dupes; NULL != p; p = p->aux ) {
+ printf("echo '");
+ for ( int i = 0; i < 4; ++i)
+ printf(" %016"PRIx64, p->phash[i]);
+ puts("'");
+ }
+#endif
+ printf("VIEW");
for ( p = dupes; NULL != p; p = p->aux ) {
printf(" '");
for ( const char *cp = p->fname; '\0' != *cp; ++cp ) {
" -m file merge additional database file\n"
" -p prune missing files from database\n"
" -t n similarity threshold in bits (0..256) or percent; default: 256\n"
+ " -x ignore images already listed in this run\n"
" -b float blur factor; add -r when changed between runs; default: 2.5\n"
" -r re-scan files already in database\n"
" -i text add text as lead-in to output\n"
n = strtol(optarg, &cp, 10);
if ( '%' == *cp ) {
n = n < 0 ? 0 : n > 100 ? 100 : n;
- n = 256 * n / 100;
+ n = PHASH_BITS * n / 100;
}
- cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n;
+ cfg.thresh = n < 0 ? 0 : n > PHASH_BITS ? PHASH_BITS : n;
break;
case 'T':
n = atoi(optarg);
cfg.nthreads = n > 0 ? n : 1;
break;
+ case 'x':
+ cfg.mark = 1;
+ break;
case ':':
eprintf("ERROR: missing argument for option '-%c'\n", optopt);
usage(argv[0], EXIT_FAILURE);
dprintf("searching for potential duplicates ...\n");
printf("%s", lead_in);
- db_find_dupes(db, cfg.thresh, find_dupes_cb);
+ db_find_dupes(db, cfg.thresh, cfg.mark, find_dupes_cb);
printf("END\n");
db_destroy(&db);