diff --git a/LTA/ltastorageoverview/lib/scraper.py b/LTA/ltastorageoverview/lib/scraper.py index 5249428d6060d89c858cc1bc4d19010250966b35..597fe4471321fc03ef3818c1bdab8eea1dd54e09 100755 --- a/LTA/ltastorageoverview/lib/scraper.py +++ b/LTA/ltastorageoverview/lib/scraper.py @@ -275,13 +275,30 @@ class ResultGetterThread(threading.Thread): with lock: if result.files: - self.db.insertFileInfos([(file.filename, file.size, file.created_at, dir_id) for file in result.files]) + result_file_tuple_set = set( (file.filename, file.size, file.created_at, dir_id) for file in result.files) + + #check for already known files in the db + known_file_tuple_set = set((file['filename'], file['size'], file['created_at'], dir_id) for file in self.db.filesInDirectory(dir_id)) + new_file_tuple_set = result_file_tuple_set - known_file_tuple_set; + + logger.info("%d out of %d files are new, and %d are already known", len(new_file_tuple_set), len(result_file_tuple_set), len(known_file_tuple_set)) + + if new_file_tuple_set: + self.db.insertFileInfos(new_file_tuple_set) subDirectoryNames = [loc.directory for loc in result.subDirectories] if subDirectoryNames: - not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000) - self.db.insertSubDirectories(subDirectoryNames, dir_id, not_visited_yet_timestamp) + #check for already known subdirectories in the db + known_subDirectoryNames_set = set(subdir['name'] for subdir in self.db.subDirectories(dir_id)) + + new_subdir_name_set = set(subDirectoryNames) - known_subDirectoryNames_set; + + logger.info("%d out of %d subdirs are new, and %d are already known", len(new_subdir_name_set), len(subDirectoryNames), len(known_subDirectoryNames_set)) + + if new_subdir_name_set: + not_visited_yet_timestamp = datetime.datetime.utcnow() - datetime.timedelta(days=1000) + self.db.insertSubDirectories(new_subdir_name_set, dir_id, not_visited_yet_timestamp) except (SrmlsException, ParseException) as e: logger.error('Error while scanning %s\n%s' % (location.path(), str(e)))