* Fixed duplicate search being to insensitive for low threshold values.

author Urban Wallasch <urban.wallasch@freenet.de>

Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)

committer Urban Wallasch <urban.wallasch@freenet.de>

Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)
author Urban Wallasch <urban.wallasch@freenet.de>
Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)
committer Urban Wallasch <urban.wallasch@freenet.de>
Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)
diff --git a/README.md b/README.md

index 079b17488946ec448a14311415eabd3ac386fa5b..fbb3f7fcf87aa53efc7c1e86d73c862e80ed02c7 100644 (file)
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ OPTIONS:
    -m file   merge additional database file
    -p        prune missing files from database
    -t n      similarity threshold in bits (0..256) or percent; default: 256
+  -x        ignore images already listed in this run
    -b float  blur factor; add -r when changed between runs; default: 2.5
    -r        re-scan files already in database
    -i text   add text as lead-in to output
@@ -58,6 +59,11 @@ OPTIONS:
      to maximum (256 or, equivalent, 100%) to avoid generating too many
      false positives.
  
+  * `-x` marks images that have already been reported as duplicates so
+    they are ignored for the remainder of the run. While this eliminates
+    redundant reports especially for lower threshold values it may lead
+    to some duplicates being missed. Marks are reset prior to each run.
+
    * `-b` can be used to fine-tune the perceptual hash calculation; when
      this parameter is changed between invocations a full re-scan should be
      performed by additionally specifying `-r`.
diff --git a/db.c b/db.c

index 4990d8ec1615e0d2e0396d2c3a4e7b1e1f539047..def983139f28b148e673dfdd86a600a0d777a535 100644 (file)
--- a/db.c
+++ b/db.c
@@ -243,31 +243,41 @@ int db_read(db_t *db, const char *dbf) {
  }
  
  /* find duplicate or similar images */
-int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) ) {
+static inline db_entry_t *check(db_entry_t *p, db_entry_t *cmp,
+                                db_entry_t *dupes, int tdelta, int mark) {
+    while ( NULL != cmp ) {
+        if ( !DB_ENTRY_ISDELETED(cmp) && (!mark || !DB_ENTRY_ISMARKED(cmp))
+          && hdist(cmp->phash, p->phash) <= tdelta ) {
+            DB_ENTRY_MARK(cmp);
+            cmp->aux = dupes;
+            dupes = cmp;
+        }
+        cmp = cmp->pnext;
+    }
+    return dupes;
+}
+
+int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) ) {
+    int tdelta = PHASH_BITS - thresh;
+    db_entry_t *p, *dupes;
+
      db_lock(db);
-    for ( size_t i = 0; i < ARR_SIZE(db->p_ent); ++i )  {
-        db_entry_t *p, *cmp, *dupes;
+    /* reset all check marks */
+    for ( int i = 0; i < PHASH_BITS; ++i ) {
+        for ( p = db->p_ent[i]; NULL != p; p = p->pnext )
+            DB_ENTRY_UNMARK(p);
+    }
+    /* find image sets with similar perceptual hash */
+    for ( int i = 0; i <= PHASH_BITS; ++i )  {
          for ( p = db->p_ent[i]; NULL != p; p = p->pnext ) {
              if ( DB_ENTRY_ISDELETED(p) )
                  continue;
-            dupes = NULL;
-            for ( cmp = p->pnext; NULL != cmp; cmp = cmp->pnext ) {
-                if ( !DB_ENTRY_ISDELETED(cmp)
-                  && thresh <= hdist(cmp->phash, p->phash) ) {
-                    cmp->aux = dupes;
-                    dupes = cmp;
-                }
-            }
-            if ( thresh < 256  && i < PHASH_BITS ) {
-                /* for lower thresh also look in the next bucket over */
-                for ( cmp = db->p_ent[i+1]; NULL != cmp; cmp = cmp->pnext ) {
-                    if ( !DB_ENTRY_ISDELETED(cmp)
-                      && thresh <= 256 - hdist(cmp->phash, p->phash) ) {
-                        cmp->aux = dupes;
-                        dupes = cmp;
-                    }
-                }
-            }
+            /* look in remainder of same bucket first */
+            dupes = check( p, p->pnext, NULL, tdelta, mark );
+            /* for thresh < max also look in the next buckets */
+            for ( int j = i + 1; j <= i + tdelta && j <= PHASH_BITS; ++j )
+                dupes = check( p, db->p_ent[j], dupes, tdelta, mark );
+            /* report found duplicates, if any */
              if ( NULL != dupes ) {
                  p->aux = dupes;
                  dupes = p;
diff --git a/db.h b/db.h

index dce19fa5c3535066297e7d48db512677351c9e96..5c105ba9c9280c0801b1e792e17771a69fc31490 100644 (file)
--- a/db.h
+++ b/db.h
@@ -13,11 +13,16 @@
  
  enum db_entry_flags {
      DB_ENTRY_FLAG_DEL = 1,
+    DB_ENTRY_FLAG_MARK = 2,
  };
  
  #define DB_ENTRY_DELETE(p) ((p)->flags |= DB_ENTRY_FLAG_DEL)
  #define DB_ENTRY_ISDELETED(p) ((p)->flags & DB_ENTRY_FLAG_DEL)
  
+#define DB_ENTRY_MARK(p) ((p)->flags |= DB_ENTRY_FLAG_MARK)
+#define DB_ENTRY_UNMARK(p) ((p)->flags &= ~DB_ENTRY_FLAG_MARK)
+#define DB_ENTRY_ISMARKED(p) ((p)->flags & DB_ENTRY_FLAG_MARK)
+
  
  typedef
      struct db_entry_struct
@@ -56,7 +61,7 @@ extern int db_insert(db_t *db, db_entry_t *entry, int replace, double blur);
  extern int db_update(db_t *db, db_entry_t *entry);
  extern int db_write(db_t *db, const char *dbf);
  extern int db_read(db_t *db, const char *dbf);
-extern int db_find_dupes(db_t *db, int thresh, int(*cb)(db_entry_t *dupes) );
+extern int db_find_dupes(db_t *db, int thresh, int mark, int(*cb)(db_entry_t *) );
  
  #endif //ndef DB_H_INCLUDED
  
diff --git a/main.c b/main.c

index e2f8eaf277a3935837a99b4d96012e13181e7286..0ad71f8fcb11c5f8f15b4f416c47469e2afbcfc4 100644 (file)
--- a/main.c
+++ b/main.c
@@ -24,6 +24,7 @@ static struct {
      int rescan;
      double blur;
      int thresh;
+    int mark;
      int maxdepth;
      int max_fd;
      int nthreads;
@@ -32,7 +33,8 @@ static struct {
      .db_prune = 0,
      .rescan = 0,
      .blur = 2.5,
-    .thresh = 256,
+    .thresh = PHASH_BITS,
+    .mark = 0,
      .maxdepth = 0,
      .max_fd = 1000,
      .nthreads = 4,
@@ -121,8 +123,16 @@ int scan_dir(const char *dir, queue_t *q, db_t *db) {
  
  /* callback for db_find_dupes() */
  static int find_dupes_cb(db_entry_t *dupes) {
-    printf("VIEW");
      db_entry_t *p;
+#ifdef DEBUG
+    for ( p = dupes; NULL != p; p = p->aux ) {
+        printf("echo '");
+        for ( int i = 0; i < 4; ++i)
+            printf(" %016"PRIx64, p->phash[i]);
+        puts("'");
+    }
+#endif
+    printf("VIEW");
      for ( p = dupes; NULL != p; p = p->aux ) {
          printf(" '");
          for ( const char *cp = p->fname; '\0' != *cp; ++cp ) {
@@ -146,6 +156,7 @@ static void usage(char *pname, int ec) {
             "  -m file   merge additional database file\n"
             "  -p        prune missing files from database\n"
             "  -t n      similarity threshold in bits (0..256) or percent; default: 256\n"
+           "  -x        ignore images already listed in this run\n"
             "  -b float  blur factor; add -r when changed between runs; default: 2.5\n"
             "  -r        re-scan files already in database\n"
             "  -i text   add text as lead-in to output\n"
@@ -223,14 +234,17 @@ int main(int argc, char *argv[]) {
              n = strtol(optarg, &cp, 10);
              if ( '%' == *cp ) {
                  n = n < 0 ? 0 : n > 100 ? 100 : n;
-                n = 256 * n / 100;
+                n = PHASH_BITS * n / 100;
              }
-            cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n;
+            cfg.thresh = n < 0 ? 0 : n > PHASH_BITS ? PHASH_BITS : n;
              break;
          case 'T':
              n = atoi(optarg);
              cfg.nthreads = n > 0 ? n : 1;
              break;
+        case 'x':
+            cfg.mark = 1;
+            break;
          case ':':
              eprintf("ERROR: missing argument for option '-%c'\n", optopt);
              usage(argv[0], EXIT_FAILURE);
@@ -276,7 +290,7 @@ int main(int argc, char *argv[]) {
  
      dprintf("searching for potential duplicates ...\n");
      printf("%s", lead_in);
-    db_find_dupes(db, cfg.thresh, find_dupes_cb);
+    db_find_dupes(db, cfg.thresh, cfg.mark, find_dupes_cb);
      printf("END\n");
  
      db_destroy(&db);
author	Urban Wallasch <urban.wallasch@freenet.de>
	Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)
committer	Urban Wallasch <urban.wallasch@freenet.de>
	Mon, 7 Jun 2021 14:32:09 +0000 (16:32 +0200)
README.md		patch \| blob \| history
db.c		patch \| blob \| history
db.h		patch \| blob \| history
main.c		patch \| blob \| history