Skip to content
Snippets Groups Projects
Commit bc02a5fc authored by Jorrit Schaap's avatar Jorrit Schaap
Browse files

ROHD-692: only persist the most important subset of the cache to disk, because...

ROHD-692: only persist the most important subset of the cache to disk, because (de)serialization of large dicts is just too slow and memory consuming
parent d0a93a43
No related branches found
No related tags found
No related merge requests found
...@@ -103,6 +103,10 @@ class CacheManager: ...@@ -103,6 +103,10 @@ class CacheManager:
self._cache = eval(file.read().strip()) self._cache = eval(file.read().strip())
if not isinstance(self._cache, dict): if not isinstance(self._cache, dict):
self._cache = {'path_du_results': {}, 'otdb_id2path': {} } self._cache = {'path_du_results': {}, 'otdb_id2path': {} }
if 'path_du_results' not in self._cache:
self._cache['path_du_results'] = {}
if 'otdb_id2path' not in self._cache:
self._cache['otdb_id2path'] = {}
except Exception as e: except Exception as e:
logger.error("Error while reading in du cache: %s", e) logger.error("Error while reading in du cache: %s", e)
with self._cacheLock: with self._cacheLock:
...@@ -111,11 +115,20 @@ class CacheManager: ...@@ -111,11 +115,20 @@ class CacheManager:
def _writeCacheToDisk(self): def _writeCacheToDisk(self):
try: try:
if datetime.datetime.utcnow() - self._last_cache_write_timestamp > datetime.timedelta(minutes=5): # only persist (a subset of) the cache to disk every once in a while.
if datetime.datetime.utcnow() - self._last_cache_write_timestamp > datetime.timedelta(minutes=0.2):
tmp_path = '/tmp/tmp_storagequery_cache.py' tmp_path = '/tmp/tmp_storagequery_cache.py'
cache_str = '' cache_str = ''
with self._cacheLock: with self._cacheLock:
cache_str = str(self._cache) # Take a subset of the entire cache
# only the path_du_results for paths at project level (like /data/projects, /data/projects/LC9_001)
# Do not store path_du_results for deeper levels on disk, because that makes the disk read/write too slow,
# and the deeper levels can be obtained via rhb-du calls quite fast anyway.
# Furthermore, once a deeper level du results is stored in the memory cache, then it is also available for fast lookup.
# We just don't store these deep levels on disk.
sub_cache = { path:du_result for path,du_result in self._cache['path_du_results'].items()
if self.getDepthToProjectsDir(path) <= 1 }
cache_str = str(sub_cache)
with open(tmp_path, 'w') as file: with open(tmp_path, 'w') as file:
file.write(cache_str) file.write(cache_str)
...@@ -179,10 +192,17 @@ class CacheManager: ...@@ -179,10 +192,17 @@ class CacheManager:
return result return result
def getDepthToProjectsDir(self, path):
    """Return the directory depth of *path* relative to the projects dir.

    depth 0: <projects_path> itself
    depth 1: <projects_path>/<project>
    depth 2: <projects_path>/<project>/<obs>
    depth 3: <projects_path>/<project>/<obs>/<sub_dir>

    NOTE(review): assumes *path* starts with projects_path — str.replace would
    also strip matches elsewhere in the string; verify against callers.
    """
    relative_path = path.replace(self.disk_usage.path_resolver.projects_path, '').strip('/')
    # An empty relative path means *path* is the projects dir itself (depth 0).
    # Without this guard, ''.split('/') == [''] would yield depth 1, which
    # contradicts the documented depth semantics used by the scan cutoff.
    return len(relative_path.split('/')) if relative_path else 0
def _scanProjectsTree(self): def _scanProjectsTree(self):
try: try:
def addSubDirectoriesToCache(directory): def addSubDirectoriesToCache(directory):
depth = len(directory.replace(self.disk_usage.path_resolver.projects_path, '').strip('/').split('/')) depth = self.getDepthToProjectsDir(directory)
#depth=0 : projects
#depth=1 : projects/<project>
#depth=2 : projects/<project>/<obs>
#depth=3 : projects/<project>/<obs>/<sub_dir>
if depth > 3: if depth > 3:
return return
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment