From bd7bcf4f7d0fc5b4d7b04aeb8bf8c992ab4bd9bf Mon Sep 17 00:00:00 2001 From: Jorrit Schaap <schaap@astron.nl> Date: Mon, 12 Sep 2016 14:21:59 +0000 Subject: [PATCH] Task #8721: only insert new files/dirs --- LTA/ltastorageoverview/lib/scraper.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/LTA/ltastorageoverview/lib/scraper.py b/LTA/ltastorageoverview/lib/scraper.py index 5249428d606..597fe447132 100755 --- a/LTA/ltastorageoverview/lib/scraper.py +++ b/LTA/ltastorageoverview/lib/scraper.py @@ -275,13 +275,30 @@ class ResultGetterThread(threading.Thread): with lock: if result.files: - self.db.insertFileInfos([(file.filename, file.size, file.created_at, dir_id) for file in result.files]) + result_file_tuple_set = set( (file.filename, file.size, file.created_at, dir_id) for file in result.files) + + #check for already known files in the db + known_file_tuple_set = set((file['filename'], file['size'], file['created_at'], dir_id) for file in self.db.filesInDirectory(dir_id)) + new_file_tuple_set = result_file_tuple_set - known_file_tuple_set; + + logger.info("%d out of %d files are new, and %d are already known", len(new_file_tuple_set), len(result_file_tuple_set), len(known_file_tuple_set)) + + if new_file_tuple_set: + self.db.insertFileInfos(new_file_tuple_set) subDirectoryNames = [loc.directory for loc in result.subDirectories] if subDirectoryNames: - not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000) - self.db.insertSubDirectories(subDirectoryNames, dir_id, not_visited_yet_timestamp) + #check for already known subdirectories in the db + known_subDirectoryNames_set = set(subdir['name'] for subdir in self.db.subDirectories(dir_id)) + + new_subdir_name_set = set(subDirectoryNames) - known_subDirectoryNames_set; + + logger.info("%d out of %d subdirs are new, and %d are already known", len(new_subdir_name_set), len(subDirectoryNames), len(known_subDirectoryNames_set)) + + if new_subdir_name_set: + not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000) + self.db.insertSubDirectories(new_subdir_name_set, dir_id, not_visited_yet_timestamp) except (SrmlsException, ParseException) as e: logger.error('Error while scanning %s\n%s' % (location.path(), str(e))) -- GitLab