#include "queue.h"
-/* configuration */
+/* configuration singleton: file-scope settings read throughout this file */
static struct {
- char *db_outfile;
- int db_prune;
- int rescan;
- double blur;
- int thresh;
- int mark;
- int maxdepth;
- int max_fd;
- int nthreads;
+ char *db_outfile; /* file to (load and finally) save db to; NULL by default, db is only written when set */
+ int db_prune; /* flag to activate pruning of stale entries; 0 by default */
+ int rescan; /* flag for forced re-scan; handed to db_insert() */
+ double blur; /* blur factor during perceptual hash generation; handed to db_insert() */
+ int thresh; /* duplicate assessment threshold; handed to db_find_dupes() */
+ int mark; /* flag to mark already reported db entries; handed to db_find_dupes() */
+ int maxdepth; /* maximum recursion depth during directory walk; values <= 0 mean no limit */
+ int max_fd; /* maximum open file descriptors for nftw() */
+ int nthreads; /* number of file hashing threads; 4 by default */
} cfg = {
.db_outfile = NULL,
.db_prune = 0,
.nthreads = 4,
};
-/* queue used during directory scan; static for scan_dir_cb() */
+/* queue used during directory scan; module scope for scan_dir_cb(), whose
+ * nftw() callback signature has no parameter to pass it through; created
+ * and destroyed by scan_dirs() */
static queue_t *q;
thread_info_t *thrinf = arg;
db_entry_t *entry = NULL;
- while ( NULL != entry || !q_complete(thrinf->q) ) {
+ while ( NULL != entry || !q_is_complete(thrinf->q) ) {
entry = q_deq(thrinf->q);
if ( NULL == entry ) {
+ /* The queue fills up at a much faster pace than we can drain
+ * it, thus we can get away with this simplistic polling. */
sched_yield();
continue;
}
- /* insert entry in db hash tables */
+ /* analyze file and insert entry in db hash tables */
rc = db_insert(thrinf->db, entry, cfg.rescan, cfg.blur);
if ( 0 != rc ) {
if ( 0 > rc )
return thrinf;
}
-/* callback for nftw directory tree walker */
+/* callback for nftw() directory tree walker, see below: for every regular
+ * file (FTW_F) a new db entry holding its path and st_mtime is enqueued on
+ * q; entries deeper than cfg.maxdepth are ignored when cfg.maxdepth is
+ * positive; always returns 0 */
int scan_dir_cb(const char *fpath, const struct stat *sb,
int typeflag, struct FTW *ftwbuf) {
- if ( cfg.maxdepth && cfg.maxdepth < ftwbuf->level )
+ if ( 0 < cfg.maxdepth && cfg.maxdepth < ftwbuf->level )
return 0;
if ( FTW_F == typeflag )
q_enq(q, db_entry_new(fpath, sb->st_mtime));
return 0;
}
-/* scan directory and build db */
-int scan_dir(const char *dir, queue_t *q, db_t *db) {
- int rc = -1;
- char *dirpath;
+/* scan directories and control worker threads
+ *
+ * Creates the module-scope queue q, starts up to cfg.nthreads worker
+ * threads, walks every directory of the NULL-terminated array dirs with
+ * nftw() on the calling thread, then marks the queue complete, joins the
+ * workers that were actually started and destroys the queue.
+ *
+ * dirs: NULL-terminated array of directory paths to scan
+ * db: database handed to every worker thread
+ * returns: number of errors encountered, 0 if there were none */
+int scan_dirs(char * const dirs[], db_t *db) {
+ int i, nstarted, rc, err = 0;
+ q = q_init();
/* start worker threads */
thread_info_t thrinf[cfg.nthreads];
- pthread_attr_t attr;
- pthread_attr_init(&attr);
- for ( int i = 0; i < cfg.nthreads; ++i ) {
+ for ( i = 0; i < cfg.nthreads; ++i ) {
thrinf[i].i = i;
thrinf[i].q = q;
thrinf[i].db = db;
- pthread_create(&thrinf[i].tid, &attr, worker, &thrinf[i]);
+ rc = pthread_create(&thrinf[i].tid, NULL, worker, &thrinf[i]);
+ if ( 0 != rc ) {
+ eprintf("ERROR: starting thread %d: %s\n", i, strerror(rc) );
+ ++err;
+ break;
+ }
}
- pthread_attr_destroy(&attr);
- /* main thread: walk directory tree */
- errno = 0;
- if ( NULL != (dirpath = realpath(dir, NULL)) ) {
- rc = nftw(dirpath, scan_dir_cb, cfg.max_fd, FTW_PHYS);
- s_free(dirpath);
+ /* remember how many threads really exist: i is reused by the directory
+ * loop below, and only successfully created threads may be joined */
+ nstarted = i;
+ if ( 1 > nstarted )
+ die("no worker threads started");
+ dprintf("have %d/%d worker threads\n", nstarted, cfg.nthreads);
+ /* main thread: scan directories */
+ for ( i = 0; NULL != dirs[i]; ++i ) {
+ char *dirpath;
+ dprintf("scanning '%s'\n", dirs[i]);
+ if ( NULL != (dirpath = realpath(dirs[i], NULL)) ) {
+ /* walk directory tree */
+ rc = nftw(dirpath, scan_dir_cb, cfg.max_fd, FTW_PHYS);
+ s_free(dirpath);
+ if ( 0 != rc ) {
+ eprintf("ERROR: scanning '%s' failed\n", dirs[i]);
+ ++err;
+ }
+ }
+ else {
+ eprintf("ERROR: '%s': %s\n", dirs[i], strerror(errno));
+ ++err;
+ }
}
- else
- eprintf("ERROR: '%s': %s\n", dir, strerror(errno));
- /* wait for workers to finish */
+ /* wait for worker threads to finish; join only the threads that were
+ * started, the tid of any other slot was never initialized */
q_set_complete(q);
- for ( int i = 0; i < cfg.nthreads; ++i )
+ for ( i = 0; i < nstarted; ++i )
pthread_join(thrinf[i].tid, NULL);
- return rc;
+ q_destroy(&q);
+ return err;
}
-/* callback for db_find_dupes() */
+/* callback for db_find_dupes(): generate output */
static int find_dupes_cb(db_entry_t *dupes) {
db_entry_t *p;
#ifdef DEBUG
for ( p = dupes; NULL != p; p = p->aux ) {
printf(" '");
for ( const char *cp = p->fname; '\0' != *cp; ++cp ) {
+ /* escape single quotes in file names */
if ( '\'' == *cp )
putchar('\\');
putchar(*cp);
/* main function */
int main(int argc, char *argv[]) {
- int c, rc = 0, n, err = 0;
+ int c, n, err = 0;
char **dirs = NULL;
char *cp;
- db_t *db ;
+ db_t *db;
const char *lead_in =
- "#!\\bin\\sh\n"
+ /* interpreter line of the generated shell script; the path needs forward slashes */
+ "#!/bin/sh\n"
"VIEW(){ for arg in \"$@\"; do echo \"$arg\"; done; echo '-----'; }\n"
cfg.mark = 1;
break;
case ':':
- eprintf("ERROR: missing argument for option '-%c'\n", optopt);
+ eprintf("FATAL: missing argument for option '-%c'\n", optopt);
usage(argv[0], EXIT_FAILURE);
break;
case '?':
- eprintf("ERROR: unknown option '-%c'\n", optopt);
+ eprintf("FATAL: unknown option '-%c'\n", optopt);
usage( argv[0], EXIT_FAILURE );
break;
default:
db_prune(db);
}
- q = q_init();
+ /* scan directories */
dirs = optind < argc ? &argv[optind] : (char *[]){".", NULL};
- for ( int i = 0; NULL != dirs[i]; ++i ) {
- dprintf("scanning '%s'\n", dirs[i]);
- q_reset_complete(q);
- rc = scan_dir(dirs[i], q, db);
- if ( 0 != rc ) {
- eprintf("ERROR: scanning '%s' failed\n", dirs[i]);
- ++err;
- }
- }
- q_destroy(&q);
+ err += scan_dirs(dirs, db);
+ /* write database file */
if ( NULL != cfg.db_outfile ) {
int cnt;
cnt = db_write(db, cfg.db_outfile);
}
}
+ /* scan db for potential duplicates */
dprintf("searching for potential duplicates ...\n");
printf("%s", lead_in);
db_find_dupes(db, cfg.thresh, cfg.mark, find_dupes_cb);
printf("END\n");
+ /* cleanup and exit */
db_destroy(&db);
dprintf("done, encountered %d error%s\n", err, err==1?"":"s");
exit(err ? EXIT_FAILURE : EXIT_SUCCESS);