* Implemented -I include option and default output lead-in.
* Implemented -d option to limit directory tree traversal depth.
* Amended README.md.
* Several minor fixes and improvements, removed cruft and debug code.
CC ?= gcc
CFLAGS := -W -Wall -Wextra -O2 -std=gnu99 -pthread -MMD -MP
CFLAGS += $(shell pkg-config --cflags GraphicsMagickWand)
-CFLAGS += -DDEBUG
LD := $(CC)
LDFLAGS :=
# Imdupe
-Imdupe is a tool to find potentially duplicate images in a directory tree.
+Imdupe is a command line tool to find potential duplicate images in a
+directory tree by comparing perceptual hash values of image files. It
+writes a shell script to stdout which can in turn be used to further
+process the scan results.
+
+The perceptual hash for an image is calculated by:
+
+ * flattening the image (merging layers/frames)
+ * transforming the color space to gray scale
+ * resizing to 16 by 16 pixels while applying Gaussian blur
+ * normalizing contrast and brightness
+ * converting to a monochromatic image
+ * interpreting the pixel values as a 256-bit hash value
+
+
+## Usage
+```
+imdupe [OPTIONS] DIR ...
+
+OPTIONS:
+ -d n max directory recursion depth; default: 0 (unlimited)
+ -f file database file
+ -m file merge additional database file
+ -p prune missing files from database
+ -t n similarity threshold in bits (0..256) or percent; default: 256
+ -b float blur factor; add -r when changed between runs; default: 2.5
+ -r re-scan files already in database
+ -I file use file contents as lead-in for output
+ -T n number of scan threads (default: 4)
+ -h display this help text and exit
+```
+**Notes**
+
+ * `-d` limits the recursion depth when scanning a directory tree; a
+ value of 1 causes `imdupe` to only scan files in the directories
+ specified on the command line and not descend into any sub-directories,
+ 0 (the default) means unlimited depth.
+
+ * `-f` specifies the main database to load (if it exists) before starting
+ the scan and write back afterwards; it should appear at most once per
+ invocation, `-m` may be used to merge additional database files; if
+ this option is not present no database file will be written.
+
+ * `-m` specifies an additional database file to merge, it may appear
+ multiple times.
+
+ * `-p` helps prevent database bloat by removing stale entries.
+
+ * `-t` is the main option affecting the similarity threshold when
+ comparing image hashes; however, in most cases it should be kept close
+ to maximum (256 or, equivalently, 100%) to avoid generating too many
+ false positives.
+
+ * `-b` can be used to fine-tune the perceptual hash calculation; when
+ this parameter is changed between invocations a full re-scan should be
+ performed by additionally specifying `-r`.
+
+ * `-r` initiates a full re-scan, even for files already present in the
+ database.
+
+ * `-I` allows including a custom lead-in in the output, which is
+ particularly useful for replacing the default dummy versions of the
+ `VIEW()` and `END()` functions which are used by `imdupe` to structure
+ its output.
+
+ * `-T` sets the number of hashing threads to use; for optimal
+ performance this should be equal to the number of (physical) processor
+ cores.
+
+
+## Build
+
+The [GraphicsMagick](http://www.graphicsmagick.org/) library must be
+installed as a prerequisite prior to building `imdupe`, as it is required
+to manipulate images during perceptual hash calculation.
+
+Run `make` in the project directory to build the `imdupe` executable.
+
+Though it was only tested on GNU/Linux, it should work with few (if any)
+modifications on other Unix-like systems.
+
+
+## Caveat
+
+Due to the overly simplistic handling of numerical values, database
+files are only portable between systems of equal endianness.
## License
-FFpreview is distributed under the Modified ("3-clause") BSD License.
+Imdupe is distributed under the Modified ("3-clause") BSD License.
See `LICENSE` file for more information.
----------------------------------------------------------------------
db_t *db_init(void) {
db_t *db = s_malloc(sizeof *db);
- memset(db->p_ent, 0, sizeof db->p_ent);
- memset(db->n_ent, 0, sizeof db->n_ent);
+ memset(db, 0, sizeof *db);
pthread_mutex_init(&db->mtx, NULL);
InitializeMagick(NULL);
return db;
if ( NULL == (fp = fopen(dbf, "rb")) )
return -1;
- db_lock(db);
while ( !feof(fp) ) {
p = db_entry_new(NULL);
fread((void *)&p->phash, sizeof p->phash, 1, fp);
s_free(p);
break;
}
+ if ( NULL != db_find(db, buf) ) {
+ s_free(p);
+ continue;
+ }
p->fname = s_strdup(buf);
+ db_lock(db);
p->inext = db->n_ent[p->nhash];
db->n_ent[p->nhash] = p;
p->pnext = db->p_ent[p->hamw];
db->p_ent[p->hamw] = p;
+ db_unlock(db);
++cnt;
}
- db_unlock(db);
return cnt;
}
enum db_entry_flags {
DB_ENTRY_FLAG_DEL = 1,
- DB_ENTRY_FLAG_MARK = 2,
};
#define DB_ENTRY_DELETE(p) ((p)->flags |= DB_ENTRY_FLAG_DEL)
#define DB_ENTRY_ISDELETED(p) ((p)->flags & DB_ENTRY_FLAG_DEL)
-#define DB_ENTRY_MARK(p) ((p)->flags |= DB_ENTRY_FLAG_MARK)
-#define DB_ENTRY_UNMARK(p) ((p)->flags &= ~DB_ENTRY_FLAG_MARK)
-#define DB_ENTRY_ISMARKED(p) ((p)->flags & DB_ENTRY_FLAG_MARK)
-
typedef
struct db_entry_struct
int rescan;
double blur;
int thresh;
+ int maxdepth;
int max_fd;
int nthreads;
} cfg = {
.rescan = 0,
.blur = 2.5,
.thresh = 256,
+ .maxdepth = 0,
.max_fd = 1000,
.nthreads = 4,
};
int scan_dir_cb(const char *fpath, const struct stat *sb,
int typeflag, struct FTW *ftwbuf) {
(void)sb;
- (void)ftwbuf;
- if (FTW_F == typeflag) {
+ if ( cfg.maxdepth && cfg.maxdepth < ftwbuf->level )
+ return 0;
+ if ( FTW_F == typeflag ) {
db_entry_t *entry;
entry = db_entry_new(fpath);
q_enq(q, entry);
printf("VIEW");
db_entry_t *p;
for ( p = dupes; NULL != p; p = p->aux ) {
- printf(" '%s'", p->fname);
+ /* Emit the file name as one single-quoted shell word.  Inside '...'
+  * the shell takes a backslash literally, so an embedded single quote
+  * cannot be escaped as \' -- it has to be written as '\'' (close the
+  * quoted string, add an escaped quote, reopen the quoted string). */
+ printf(" '");
+ for ( const char *cp = p->fname; '\0' != *cp; ++cp ) {
+   if ( '\'' == *cp )
+     fputs("'\\''", stdout);
+   else
+     putchar(*cp);
+ }
+ putchar('\'');
}
puts("");
return 0;
printf("%s - find potentially duplicate images\n", prog);
printf("USAGE: %s [OPTIONS] DIR ...\n", prog);
printf("OPTIONS:\n"
- " -h display this help text and exit\n"
- " -b float blur factor; needs -r when changed between runs; default: 1.5\n"
- " -f file fingerprint database file\n"
- " -m file merge additional fingerprint file\n"
+ " -d n max directory recursion depth; default: 0 (unlimited)\n"
+ " -f file database file\n"
+ " -m file merge additional database file\n"
" -p prune missing files from database\n"
- " -r rescan files already in database\n"
" -t n similarity threshold in bits (0..256) or percent; default: 256\n"
- " -T num number of scan threads (default: 4)\n"
+ " -b float blur factor; add -r when changed between runs; default: 2.5\n"
+ " -r re-scan files already in database\n"
+ " -I file use file contents as lead-in for output\n"
+ " -T n number of scan threads (default: 4)\n"
+ " -h display this help text and exit\n"
);
exit(ec);
}
/* main function */
int main(int argc, char *argv[]) {
int c, rc = 0, n;
+ char *cp;
db_t *db ;
+ /* Default lead-in written ahead of the scan results: a shebang line
+  * plus dummy VIEW() and END() shell functions.  It is replaced by an
+  * empty string when -I supplies a custom lead-in.  The interpreter
+  * path must use forward slashes to be a valid shebang. */
+ const char *lead_in =
+ "#!/bin/sh\n"
+ "VIEW(){ for arg in \"$@\"; do echo \"$arg\"; done; echo '-----'; }\n"
+ "END(){ echo 'Done'; }\n\n";
db = db_init();
- while ( ( c = getopt( argc, argv, "+:hb:f:m:prt:T:" ) ) != -1 ) {
+ while ( ( c = getopt( argc, argv, "+:b:d:hI:f:m:prt:T:" ) ) != -1 ) {
switch (c) {
- case 'h':
- usage(argv[0], EXIT_SUCCESS);
- break;
case 'b':
n = atoi(optarg);
cfg.blur = n < 1 ? 1 : n;
break;
+ case 'd':
+ n = atoi(optarg);
+ cfg.maxdepth = n < 0 ? 0 : n;
+ break;
+ case 'h':
+ usage(argv[0], EXIT_SUCCESS);
+ break;
+ case 'I':
+ n = cat_file(optarg);
+ lead_in = "";
+ dprintf("written %d bytes lead-in (%s)\n", n, optarg);
+ break;
case 'f':
cfg.db_outfile = optarg;
/* fall through */
case 'r':
cfg.rescan = 1;
break;
- case 't': {
- char *ep;
- n = strtol(optarg, &ep, 10);
- if ( '%' == *ep ) {
- n = n < 0 ? 0 : n > 100 ? 100 : n;
- n = 256 * n / 100;
- }
- cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n;
+ case 't':
+ n = strtol(optarg, &cp, 10);
+ if ( '%' == *cp ) {
+ n = n < 0 ? 0 : n > 100 ? 100 : n;
+ n = 256 * n / 100;
}
+ cfg.thresh = n < 0 ? 0 : n > 256 ? 256 : n;
break;
case 'T':
n = atoi(optarg);
dprintf("scanning '.'\n");
rc = scan_dir(".", q, db);
} else {
- while ( optind < argc && 0 == rc ) {
+ while ( optind < argc ){//&& 0 == rc ) {
dprintf("scanning '%s'\n", argv[optind]);
+ q_reset_complete(q);
rc = scan_dir(argv[optind++], q, db);
}
}
}
dprintf("searching dupes ...\n");
- printf("#!/bin/bash\n");
- printf(". ~/bin/imgmultiview.inc\n");
+ printf("%s", lead_in);
rc = db_find_dupes(db, cfg.thresh, find_dupes_cb);
printf("END\n");
q_unlock(q);
}
+void q_reset_complete(queue_t *q) { /* clear the 'complete' flag of queue q */
+ q_lock(q);
+ q->complete = 0; /* written while holding the queue lock */
+ q_unlock(q);
+}
+
/* EOF */
extern db_entry_t *q_deq(queue_t *q);
extern int q_complete(queue_t *q);
extern void q_set_complete(queue_t *q);
+extern void q_reset_complete(queue_t *q);
#endif //ndef QUEUE_H_INCLUDED
return d;
}
-void s_free(void *p){
+void s_free(void *p) {
free(p);
}
+/*
+ * Copy the contents of the file named `fname` byte by byte to stdout.
+ *
+ * Returns the number of bytes copied, or -1 if the file could not be
+ * opened.  The stream is closed again before returning.
+ */
+int cat_file(const char *fname) {
+  FILE *fp;
+  int c, cnt = 0;
+  if ( NULL == (fp = fopen(fname, "rb")) )
+    return -1;
+  while ( EOF != (c = fgetc(fp)) ) {
+    ++cnt;
+    fputc(c, stdout);
+  }
+  /* Release the stream; without this every call leaks a FILE. */
+  fclose(fp);
+  return cnt;
+}
+
/* EOF */
extern void *s_strdup(const char *s);
extern void s_free(void *p);
+extern int cat_file(const char *fname);
+
#endif //ndef UTIL_H_INCLUDED