Added chunked data download for Zooniverse client plugin. Incremented minor...

Added chunked data download for Zooniverse client plugin. Incremented minor version number. Added installation instructions to README.

Added chunked data download for Zooniverse client plugin. Incremented minor...
5e31eeaa · Hugh Dickinson · 47f7812b · 5e31eeaa · 5e31eeaa · 5e31eeaa
Commit 5e31eeaa authored 4 years ago by Hugh Dickinson
--- a/README.md
+++ b/README.md
@@ -4,6 +4,14 @@ A Python client for the ESCAPE ESAP User Profile REST API.
 The `shopping_client` module, which communicates with the ESCAPE ESAP User Profile REST API is very lightweight. Archive-specific functionality is delegated to "connector" modules like the `zooniverse` module.
+### Installation
+The client and the Zooniverse client can be installed using pip:
+```sh
+$ pip install git+https://git.astron.nl/astron-sdc/esap-userprofile-python-client.git
+```
 ### Example - Using the Shopping Client with the Zooniverse connector
 ```python

--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
 setuptools.setup(
    name="esap-userprofile-python-client",
-    version="0.0.2",
+    version="0.0.3",
    author="Hugh Dickinson",
    author_email="hugh.dickinson@open.ac.uk",
    description="Python client for ESAP Data Discovery Shoipping Basket",

--- a/zooniverse/zooniverse.py
+++ b/zooniverse/zooniverse.py
@@ -48,11 +48,7 @@ class zooniverse:
            return False
    def generate(
-        self,
+        self, item: Union[dict, pd.Series], wait: bool = False, **read_csv_args
-        item: Union[dict, pd.Series],
-        wait: bool = False,
-        convert_to_pandas: bool = True,
-        **read_csv_args
    ) -> Union[requests.Response, pd.DataFrame, None]:
        """Generate an export of data from the Zooniverse panoptes database
        specified by an item from the shopping basket.
@@ -64,8 +60,6 @@ class zooniverse:
            or a converted `pd.Series`.
        wait : bool
            If `True` blocks until the requested item has been generated.
-        convert_to_pandas : bool
-            If `True` the retrieved, generated data are parsed into a pd.DataFrame.
        **read_csv_args : type
            Extra arguments passed to `pd.read_csv()` when parsing the retrieved
            data.
@@ -84,19 +78,7 @@ class zooniverse:
        response = self._get_entity(item).get_export(
            self._get_item_entry(item, "category"), generate=True, wait=wait
        )
-        if response.ok:
+        if response.ok and wait:
-            if convert_to_pandas:
-                return (
-                    pd.read_csv(
-                        io.BytesIO(response.content),
-                        converters=zooniverse.category_converters[
-                            self._get_item_entry(item, "category")
-                        ],
-                    )
-                    if wait
-                    else response
-                )
-            else:
            return response
        else:
            return None
@@ -107,7 +89,9 @@ class zooniverse:
        generate: bool = False,
        wait: bool = False,
        convert_to_pandas: bool = True,
-        **read_csv_args
+        chunked_retrieve: bool = False,
+        chunk_size: int = int(1e5),
+        **read_csv_args,
    ) -> Union[requests.Response, pd.DataFrame, None]:
        """Retrieve data specified by an item from the shopping basket from the
        Zooniverse panoptes database. Optionally (re)generate the requested
@@ -128,6 +112,12 @@ class zooniverse:
            has no effect.
        convert_to_pandas : bool
            If `True` the retrieved data are parsed into a pd.DataFrame.
+        chunked_retrieve : bool
+            If `True` read the requested data objects in chunks to avoid
+            exhausting memory.
+        chunk_size : int
+            The number of lines of returned data in each chunk if
+            `chunked_retrieve` is `True`.
        **read_csv_args : type
            Extra arguments passed to `pd.read_csv()` when parsing the retrieved
            data.
@@ -150,30 +140,60 @@ class zooniverse:
                return None
            else:
                print("Generating requested export...")
-                if wait:
+                response = self.generate(item, wait)
-                    print("\t\tWaiting for generation to complete...")
+        if response is None:
-                else:
+            warning("No data immediately available. Returning NoneType")
-                    print("\t\tNot waiting for generation to complete...")
+            return None
-                response = self._get_entity(item).get_export(
-                    self._get_item_entry(item, "category"), generate=True, wait=wait
-                )
        if response.ok:
            if convert_to_pandas:
                return (
-                    pd.read_csv(
+                    self._chunked_content(item, response, chunk_size=chunk_size)
+                    if chunked_retrieve
+                    else pd.read_csv(
                        io.BytesIO(response.content),
                        converters=zooniverse.category_converters[
                            self._get_item_entry(item, "category")
                        ],
                    )
-                    if wait
-                    else response
                )
            else:
                return response
        else:
            return None
+    def _chunked_content(
+        self,
+        item: Union[dict, pd.Series],
+        response: requests.Response,
+        chunk_size: int = int(1e5),
+    ) -> pd.DataFrame:
+        """Parse the body of `response` into a `pd.DataFrame` chunk by chunk.
+
+        The response body is consumed line by line. Up to `chunk_size` lines
+        at a time are joined and parsed with `pd.read_csv()`, and the
+        resulting frames are concatenated into a single frame.
+
+        Parameters
+        ----------
+        item : Union[dict, pd.Series]
+            Shopping basket item; its "category" entry selects the column
+            converters from `zooniverse.category_converters`.
+        response : requests.Response
+            Response whose body contains the CSV data to parse.
+        chunk_size : int
+            Maximum number of lines read from the response for each chunk.
+
+        Returns
+        -------
+        pd.DataFrame
+            All parsed chunks concatenated with a fresh integer index. An
+            empty frame is returned if the response holds no non-empty lines.
+
+        """
+        # The converters depend only on `item`, so look them up once.
+        converters = zooniverse.category_converters[
+            self._get_item_entry(item, "category")
+        ]
+        response_iterator = response.iter_lines(1)
+        chunk_frames = []
+        # Column labels taken from the first chunk's header row; later chunks
+        # contain data rows only and reuse these labels.
+        column_names = None
+        # NOTE(review): splitting on raw lines assumes no CSV record contains
+        # a newline embedded in a quoted field - confirm for these exports.
+        while True:
+            # `range` is the first argument to `zip` so that `zip` stops
+            # before pulling (and discarding) an extra line once the chunk
+            # is full.
+            chunk = b"\n".join(
+                line
+                for _, line in zip(range(chunk_size), response_iterator)
+                if line
+            )
+            if not chunk:
+                # response_iterator exhausted
+                print("All data received.")
+                break
+            frame = pd.read_csv(
+                io.BytesIO(chunk),
+                converters=converters,
+                header=0 if column_names is None else None,
+                names=column_names,
+            )
+            if column_names is None:
+                column_names = frame.columns
+            chunk_frames.append(frame)
+        if not chunk_frames:
+            # `pd.concat` raises ValueError when given no frames.
+            return pd.DataFrame()
+        return pd.concat(chunk_frames, axis=0, ignore_index=True)
    def _get_entity(self, item):
        entity = zooniverse.entity_types[self._get_item_entry(item, "catalog")].find(
            int(self._get_item_entry(item, self._catalogue_to_id_string(item)))