From bc02a5fc1d0cfb3c2559b862db9fc262d00efa95 Mon Sep 17 00:00:00 2001
From: Jorrit Schaap <schaap@astron.nl>
Date: Wed, 4 Apr 2018 10:20:01 +0000
Subject: [PATCH] ROHD-692: only persist the most important subset of the cache
 to disk, because (de)serialization of large dicts is just too slow and memory
 consuming

---
 .../StorageQueryService/cache.py              | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/SAS/DataManagement/StorageQueryService/cache.py b/SAS/DataManagement/StorageQueryService/cache.py
index 43bd5fdbd41..45b61063f80 100644
--- a/SAS/DataManagement/StorageQueryService/cache.py
+++ b/SAS/DataManagement/StorageQueryService/cache.py
@@ -103,6 +103,10 @@ class CacheManager:
                         self._cache = eval(file.read().strip())
                         if not isinstance(self._cache, dict):
                             self._cache = {'path_du_results': {}, 'otdb_id2path': {} }
+                        if 'path_du_results' not in self._cache:
+                            self._cache['path_du_results'] = {}
+                        if 'otdb_id2path' not in self._cache:
+                            self._cache['otdb_id2path'] = {}
         except Exception as e:
             logger.error("Error while reading in du cache: %s", e)
             with self._cacheLock:
@@ -111,11 +115,20 @@ class CacheManager:
 
     def _writeCacheToDisk(self):
         try:
-            if datetime.datetime.utcnow() - self._last_cache_write_timestamp > datetime.timedelta(minutes=5):
+            # only persist (a subset of) the cache to disk every once in a while.
+            if datetime.datetime.utcnow() - self._last_cache_write_timestamp > datetime.timedelta(minutes=0.2):
                 tmp_path = '/tmp/tmp_storagequery_cache.py'
                 cache_str = ''
                 with self._cacheLock:
-                    cache_str = str(self._cache)
+                    # Take a subset of the entire cache
+                    # only the path_du_results for paths at project level (like /data/projects, /data/projects/LC9_001)
+                    # Do not store path_du_results for deeper levels on disk, because that makes the disk read/write too slow,
+                    # and the deeper levels can be obtained via rhb-du calls quite fast anyway.
+                    # Furthermore, once a deeper-level du result is stored in the memory cache, it is also available for fast lookup.
+                    # We just don't store these deep levels on disk.
+                    sub_cache = { path:du_result for path,du_result in self._cache['path_du_results'].items()
+                                  if self.getDepthToProjectsDir(path) <= 1 }
+                    cache_str = str(sub_cache)
 
                 with open(tmp_path, 'w') as file:
                     file.write(cache_str)
@@ -179,10 +192,17 @@ class CacheManager:
 
         return result
 
+    def getDepthToProjectsDir(self, path):
+        return len(path.replace(self.disk_usage.path_resolver.projects_path, '').strip('/').split('/'))
+
     def _scanProjectsTree(self):
         try:
             def addSubDirectoriesToCache(directory):
-                depth = len(directory.replace(self.disk_usage.path_resolver.projects_path, '').strip('/').split('/'))
+                depth = self.getDepthToProjectsDir(directory)
+                #depth=0 : projects
+                #depth=1 : projects/<project>
+                #depth=2 : projects/<project>/<obs>
+                #depth=3 : projects/<project>/<obs>/<sub_dir>
                 if depth > 3:
                     return
 
-- 
GitLab