From bd7bcf4f7d0fc5b4d7b04aeb8bf8c992ab4bd9bf Mon Sep 17 00:00:00 2001
From: Jorrit Schaap <schaap@astron.nl>
Date: Mon, 12 Sep 2016 14:21:59 +0000
Subject: [PATCH] Task #8721: only insert new files/dirs

---
 LTA/ltastorageoverview/lib/scraper.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/LTA/ltastorageoverview/lib/scraper.py b/LTA/ltastorageoverview/lib/scraper.py
index 5249428d606..597fe447132 100755
--- a/LTA/ltastorageoverview/lib/scraper.py
+++ b/LTA/ltastorageoverview/lib/scraper.py
@@ -275,13 +275,30 @@ class ResultGetterThread(threading.Thread):
 
                 with lock:
                     if result.files:
-                        self.db.insertFileInfos([(file.filename, file.size, file.created_at, dir_id) for file in result.files])
+                        result_file_tuple_set = set( (file.filename, file.size, file.created_at, dir_id) for file in result.files)
+
+                        #check for already known files in the db
+                        known_file_tuple_set = set((file['filename'], file['size'], file['created_at'], dir_id) for file in self.db.filesInDirectory(dir_id))
+                        new_file_tuple_set = result_file_tuple_set - known_file_tuple_set;
+
+                        logger.info("%d out of %d files are new, and %d are already known", len(new_file_tuple_set), len(result_file_tuple_set), len(known_file_tuple_set))
+
+                        if new_file_tuple_set:
+                            self.db.insertFileInfos(new_file_tuple_set)
 
                     subDirectoryNames = [loc.directory for loc in result.subDirectories]
 
                     if subDirectoryNames:
-                        not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000)
-                        self.db.insertSubDirectories(subDirectoryNames, dir_id, not_visited_yet_timestamp)
+                        #check for already known subdirectories in the db
+                        known_subDirectoryNames_set = set(subdir['name'] for subdir in self.db.subDirectories(dir_id))
+
+                        new_subdir_name_set = set(subDirectoryNames) - known_subDirectoryNames_set;
+
+                        logger.info("%d out of %d subdirs are new, and %d are already known", len(new_subdir_name_set), len(subDirectoryNames), len(known_subDirectoryNames_set))
+
+                        if new_subdir_name_set:
+                            not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000)
+                            self.db.insertSubDirectories(new_subdir_name_set, dir_id, not_visited_yet_timestamp)
 
             except (SrmlsException, ParseException) as e:
                 logger.error('Error while scanning %s\n%s' % (location.path(), str(e)))
-- 
GitLab