From 07b540ad71fd944a19811792fbfda858ae3dcee1 Mon Sep 17 00:00:00 2001 From: Urban Wallasch Date: Thu, 10 Jun 2021 16:22:03 +0200 Subject: [PATCH] * Added lots of comments. * Improved error handling. * Refactoring, redundant code removal, improved thread handling. --- Makefile | 6 ++- db.h | 24 ++++++------ main.c | 113 +++++++++++++++++++++++++++++++------------------------ queue.c | 14 ++----- queue.h | 11 +++--- util.c | 9 ++--- util.h | 2 +- 7 files changed, 93 insertions(+), 86 deletions(-) diff --git a/Makefile b/Makefile index 468119d..f32f17d 100644 --- a/Makefile +++ b/Makefile @@ -18,9 +18,11 @@ LIBS := -lpthread $(shell pkg-config --libs GraphicsMagickWand) STRIP := strip RM := rm -f -.PHONY: all debug clean +.PHONY: all release debug clean -all: $(BIN) +all: release + +release: $(BIN) $(STRIP) $(BIN) debug: CFLAGS += -g -DDEBUG diff --git a/db.h b/db.h index b70a7c3..565494f 100644 --- a/db.h +++ b/db.h @@ -29,15 +29,15 @@ typedef db_entry_t; struct db_entry_struct { - uint64_t phash[4]; - uint16_t nhash; - uint16_t flags; - uint16_t hamw; - uint64_t mtime; - char *fname; - db_entry_t *pnext; - db_entry_t *inext; - db_entry_t *aux; + uint64_t phash[4]; /* PHASH_BITS perceptual hash */ + uint16_t nhash; /* filename hash */ + uint16_t flags; /* see enum above */ + uint16_t hamw; /* phash hamming weight (# of set bits) */ + uint64_t mtime; /* file modification time */ + char *fname; /* file name */ + db_entry_t *pnext; /* phash table bucket linkage */ + db_entry_t *inext; /* name hash bucket linkage */ + db_entry_t *aux; /* auxiliary linkage (queue, dupes list) */ }; @@ -46,9 +46,9 @@ typedef db_t; struct db_struct { - db_entry_t *p_ent[PHASH_BITS + 1]; - db_entry_t *n_ent[NHASH_SIZE]; - pthread_mutex_t mtx; + db_entry_t *p_ent[PHASH_BITS + 1]; /* perceptual hash weight table */ + db_entry_t *n_ent[NHASH_SIZE]; /* file name hash table */ + pthread_mutex_t mtx; /* thread sync */ }; diff --git a/main.c b/main.c index c3cc348..6677ab1 100644 --- a/main.c +++ b/main.c @@ -17,17 +17,17 @@ #include "queue.h" -/* configuration */ +/* configuration singleton */ static struct { - char *db_outfile; - int db_prune; - int rescan; - double blur; - int thresh; - int mark; - int maxdepth; - int max_fd; - int nthreads; + char *db_outfile; /* file to (load and finally) save db to */ + int db_prune; /* flag to activate pruning of stale entries */ + int rescan; /* flag for forced re-scan */ + double blur; /* blur factor during perceptual hash generation */ + int thresh; /* duplicate assessment threshold */ + int mark; /* flag to mark already reported db entries */ + int maxdepth; /* maximum recursion depth during directory walk */ + int max_fd; /* maximum open file descriptors for nftw() */ + int nthreads; /* number of file hashing threads */ } cfg = { .db_outfile = NULL, .db_prune = 0, @@ -40,7 +40,7 @@ static struct { .nthreads = 4, }; -/* queue used during directory scan; static for scan_dir_cb() */ +/* queue used during directory scan; module scope for scan_dir_cb() */ static queue_t *q; @@ -58,13 +58,15 @@ void *worker(void *arg) { thread_info_t *thrinf = arg; db_entry_t *entry = NULL; - while ( NULL != entry || !q_complete(thrinf->q) ) { + while ( NULL != entry || !q_is_complete(thrinf->q) ) { entry = q_deq(thrinf->q); if ( NULL == entry ) { + /* The queue fills up at a much faster pace than we can drain + * it, thus we can get away with this simplistic polling. */ sched_yield(); continue; } - /* insert entry in db hash tables */ + /* analyze file and insert entry in db hash tables */ rc = db_insert(thrinf->db, entry, cfg.rescan, cfg.blur); if ( 0 != rc ) { if ( 0 > rc ) @@ -76,48 +78,64 @@ void *worker(void *arg) { return thrinf; } -/* callback for nftw directory tree walker */ +/* callback for nftw() directory tree walker, see below */ int scan_dir_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) { - if ( cfg.maxdepth && cfg.maxdepth < ftwbuf->level ) + if ( 0 < cfg.maxdepth && cfg.maxdepth < ftwbuf->level ) return 0; if ( FTW_F == typeflag ) q_enq(q, db_entry_new(fpath, sb->st_mtime)); return 0; } -/* scan directory and build db */ -int scan_dir(const char *dir, queue_t *q, db_t *db) { - int rc = -1; - char *dirpath; +/* scan directories and control worker threads */ +int scan_dirs(char * const dirs[], db_t *db) { + int i, rc, err = 0; + q = q_init(); /* start worker threads */ thread_info_t thrinf[cfg.nthreads]; - pthread_attr_t attr; - pthread_attr_init(&attr); - for ( int i = 0; i < cfg.nthreads; ++i ) { + for ( i = 0; i < cfg.nthreads; ++i ) { thrinf[i].i = i; thrinf[i].q = q; thrinf[i].db = db; - pthread_create(&thrinf[i].tid, &attr, worker, &thrinf[i]); + rc = pthread_create(&thrinf[i].tid, NULL, worker, &thrinf[i]); + if ( 0 != rc ) { + eprintf("ERROR: starting thread %d: %s\n", i, strerror(rc) ); + ++err; + break; + } } - pthread_attr_destroy(&attr); - /* main thread: walk directory tree */ - errno = 0; - if ( NULL != (dirpath = realpath(dir, NULL)) ) { - rc = nftw(dirpath, scan_dir_cb, cfg.max_fd, FTW_PHYS); - s_free(dirpath); + if ( 1 > i ) + die("no worker threads started"); + dprintf("have %d/%d worker threads\n", i, cfg.nthreads); + /* main thread: scan directories */ + for ( i = 0; NULL != dirs[i]; ++i ) { + char *dirpath; + dprintf("scanning '%s'\n", dirs[i]); + if ( NULL != (dirpath = realpath(dirs[i], NULL)) ) { + /* walk directory tree */ + rc = nftw(dirpath, scan_dir_cb, cfg.max_fd, FTW_PHYS); + s_free(dirpath); + if ( 0 != rc ) { + eprintf("ERROR: scanning '%s' failed\n", dirs[i]); + ++err; + } + } + else { + eprintf("ERROR: '%s': %s\n", dirs[i], strerror(errno)); + ++err; + } } - else - eprintf("ERROR: '%s': %s\n", dir, strerror(errno)); - /* wait for workers to finish */ + /* wait for worker threads to finish */ q_set_complete(q); - for ( int i = 0; i < cfg.nthreads; ++i ) + for ( i = 0; i < cfg.nthreads; ++i ) pthread_join(thrinf[i].tid, NULL); - return rc; + q_destroy(&q); + return err; } -/* callback for db_find_dupes() */ +/* callback for db_find_dupes(): generate output */ static int find_dupes_cb(db_entry_t *dupes) { db_entry_t *p; #ifdef DEBUG @@ -132,6 +150,7 @@ static int find_dupes_cb(db_entry_t *dupes) { for ( p = dupes; NULL != p; p = p->aux ) { printf(" '"); for ( const char *cp = p->fname; '\0' != *cp; ++cp ) { + /* escape single quotes in file names */ if ( '\'' == *cp ) putchar('\\'); putchar(*cp); @@ -165,10 +184,10 @@ static void usage(char *pname, int ec) { /* main function */ int main(int argc, char *argv[]) { - int c, rc = 0, n, err = 0; + int c, n, err = 0; char **dirs = NULL; char *cp; - db_t *db ; + db_t *db; const char *lead_in = "#!\\bin\\sh\n" "VIEW(){ for arg in \"$@\"; do echo \"$arg\"; done; echo '-----'; }\n" @@ -242,11 +261,11 @@ int main(int argc, char *argv[]) { cfg.mark = 1; break; case ':': - eprintf("ERROR: missing argument for option '-%c'\n", optopt); + eprintf("FATAL: missing argument for option '-%c'\n", optopt); usage(argv[0], EXIT_FAILURE); break; case '?': - eprintf("ERROR: unknown option '-%c'\n", optopt); + eprintf("FATAL: unknown option '-%c'\n", optopt); usage( argv[0], EXIT_FAILURE ); break; default: @@ -260,19 +279,11 @@ int main(int argc, char *argv[]) { db_prune(db); } - q = q_init(); + /* scan directories */ dirs = optind < argc ? &argv[optind] : (char *[]){".", NULL}; - for ( int i = 0; NULL != dirs[i]; ++i ) { - dprintf("scanning '%s'\n", dirs[i]); - q_reset_complete(q); - rc = scan_dir(dirs[i], q, db); - if ( 0 != rc ) { - eprintf("ERROR: scanning '%s' failed\n", dirs[i]); - ++err; - } - } - q_destroy(&q); + err += scan_dirs(dirs, db); + /* write database file */ if ( NULL != cfg.db_outfile ) { int cnt; cnt = db_write(db, cfg.db_outfile); @@ -284,11 +295,13 @@ int main(int argc, char *argv[]) { } } + /* scan db for potential duplicates */ dprintf("searching for potential duplicates ...\n"); printf("%s", lead_in); db_find_dupes(db, cfg.thresh, cfg.mark, find_dupes_cb); printf("END\n"); + /* cleanup and exit */ db_destroy(&db); dprintf("done, encountered %d error%s\n", err, err==1?"":"s"); exit(err ? EXIT_FAILURE : EXIT_SUCCESS); diff --git a/queue.c b/queue.c index e53e284..f642a55 100644 --- a/queue.c +++ b/queue.c @@ -66,12 +66,12 @@ db_entry_t *q_deq(queue_t *q) { return entry; } -int q_complete(queue_t *q) { - int qcpl = 0; +int q_is_complete(queue_t *q) { + int complete = 0; q_lock(q); - qcpl = q->complete; + complete = q->complete; q_unlock(q); - return qcpl; + return complete; } void q_set_complete(queue_t *q) { @@ -80,10 +80,4 @@ void q_set_complete(queue_t *q) { q_unlock(q); } -void q_reset_complete(queue_t *q) { - q_lock(q); - q->complete = 0; - q_unlock(q); -} - /* EOF */ diff --git a/queue.h b/queue.h index cd262ae..de58a08 100644 --- a/queue.h +++ b/queue.h @@ -10,19 +10,18 @@ typedef queue_t; struct queue_struct { - db_entry_t *penq; - db_entry_t *pdeq; - int complete; - pthread_mutex_t mtx; + db_entry_t *penq; /* current enqueue position (tail) */ + db_entry_t *pdeq; /* current dequeue position (head) */ + int complete; /* thread sync: signal enqueuing finished */ + pthread_mutex_t mtx; /* thread sync */ }; extern queue_t *q_init(void); extern void q_destroy(queue_t **pq); extern int q_enq(queue_t *q, db_entry_t *entry); extern db_entry_t *q_deq(queue_t *q); -extern int q_complete(queue_t *q); +extern int q_is_complete(queue_t *q); extern void q_set_complete(queue_t *q); -extern void q_reset_complete(queue_t *q); #endif //ndef QUEUE_H_INCLUDED diff --git a/util.c b/util.c index 3c944a0..074e1c3 100644 --- a/util.c +++ b/util.c @@ -5,23 +5,22 @@ #include "util.h" -void die(int eno) { - if ( eno ) - eprintf("%s\n", strerror(eno)); +void die(const char *msg) { + eprintf("FATAL: %s (aborting)\n", msg); exit(EXIT_FAILURE); } void *s_malloc(size_t sz) { void *m = malloc(sz); if ( NULL == m ) - die(errno); + die(strerror(errno)); return m; } void *s_strdup(const char *s) { void *d = strdup(s); if ( NULL == d ) - die(errno); + die(strerror(errno)); return d; } diff --git a/util.h b/util.h index 631d026..08f7663 100644 --- a/util.h +++ b/util.h @@ -13,7 +13,7 @@ #define dprintf(...) #endif -extern void die(int eno); +extern void die(const char *msg); extern void *s_malloc(size_t sz); extern void *s_strdup(const char *s); extern void s_free(void *p); -- 2.30.2