Skip to content
Snippets Groups Projects
Commit 5e31eeaa authored by Hugh Dickinson's avatar Hugh Dickinson
Browse files

Added chunked data download for Zooniverse client plugin. Incremented minor...

Added chunked data download for Zooniverse client plugin. Incremented minor version number. Added installation instructions to README.
parent 47f7812b
Branches
No related tags found
No related merge requests found
...@@ -4,6 +4,14 @@ A Python client for the ESCAPE ESAP User Profile REST API. ...@@ -4,6 +4,14 @@ A Python client for the ESCAPE ESAP User Profile REST API.
The `shopping_client` module, which communicates with the ESCAPE ESAP User Profile REST API is very lightweight. Archive-specific functionality is delegated to "connector" modules like the `zooniverse` module. The `shopping_client` module, which communicates with the ESCAPE ESAP User Profile REST API is very lightweight. Archive-specific functionality is delegated to "connector" modules like the `zooniverse` module.
### Installation
The client and the Zooniverse client can be installed using pip:
```sh
$ pip install git+https://git.astron.nl/astron-sdc/esap-userprofile-python-client.git
```
### Example - Using the Shopping Client with the Zooniverse connector ### Example - Using the Shopping Client with the Zooniverse connector
```python ```python
......
...@@ -5,7 +5,7 @@ with open("README.md", "r") as fh: ...@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup( setuptools.setup(
name="esap-userprofile-python-client", name="esap-userprofile-python-client",
version="0.0.2", version="0.0.3",
author="Hugh Dickinson", author="Hugh Dickinson",
author_email="hugh.dickinson@open.ac.uk", author_email="hugh.dickinson@open.ac.uk",
description="Python client for ESAP Data Discovery Shoipping Basket", description="Python client for ESAP Data Discovery Shoipping Basket",
......
...@@ -48,11 +48,7 @@ class zooniverse: ...@@ -48,11 +48,7 @@ class zooniverse:
return False return False
def generate( def generate(
self, self, item: Union[dict, pd.Series], wait: bool = False, **read_csv_args
item: Union[dict, pd.Series],
wait: bool = False,
convert_to_pandas: bool = True,
**read_csv_args
) -> Union[requests.Response, pd.DataFrame, None]: ) -> Union[requests.Response, pd.DataFrame, None]:
"""Generate an export of data from the Zooniverse panoptes database """Generate an export of data from the Zooniverse panoptes database
specified by an item from the shopping basket. specified by an item from the shopping basket.
...@@ -64,8 +60,6 @@ class zooniverse: ...@@ -64,8 +60,6 @@ class zooniverse:
or a converted `pd.Series`. or a converted `pd.Series`.
wait : bool wait : bool
If `True` blocks until the requested item has been generated. If `True` blocks until the requested item has been generated.
convert_to_pandas : bool
If `True` the retrieved, generated data are parsed into a pd.DataFrame.
**read_csv_args : type **read_csv_args : type
Extra arguments passed to `pd.read_csv()` when parsing the retrieved Extra arguments passed to `pd.read_csv()` when parsing the retrieved
data. data.
...@@ -84,19 +78,7 @@ class zooniverse: ...@@ -84,19 +78,7 @@ class zooniverse:
response = self._get_entity(item).get_export( response = self._get_entity(item).get_export(
self._get_item_entry(item, "category"), generate=True, wait=wait self._get_item_entry(item, "category"), generate=True, wait=wait
) )
if response.ok: if response.ok and wait:
if convert_to_pandas:
return (
pd.read_csv(
io.BytesIO(response.content),
converters=zooniverse.category_converters[
self._get_item_entry(item, "category")
],
)
if wait
else response
)
else:
return response return response
else: else:
return None return None
...@@ -107,7 +89,9 @@ class zooniverse: ...@@ -107,7 +89,9 @@ class zooniverse:
generate: bool = False, generate: bool = False,
wait: bool = False, wait: bool = False,
convert_to_pandas: bool = True, convert_to_pandas: bool = True,
**read_csv_args chunked_retrieve: bool = False,
chunk_size: int = int(1e5),
**read_csv_args,
) -> Union[requests.Response, pd.DataFrame, None]: ) -> Union[requests.Response, pd.DataFrame, None]:
"""Retrieve data specified by an item from the shopping basket from the """Retrieve data specified by an item from the shopping basket from the
Zooniverse panoptes database. Optionally (re)generate the requested Zooniverse panoptes database. Optionally (re)generate the requested
...@@ -128,6 +112,12 @@ class zooniverse: ...@@ -128,6 +112,12 @@ class zooniverse:
has no effect. has no effect.
convert_to_pandas : bool convert_to_pandas : bool
If `True` the retrieved data are parsed into a pd.DataFrame. If `True` the retrieved data are parsed into a pd.DataFrame.
chunked_retrieve : bool
If `True` read the requested data objects in chunks to avoid
exhausting memory.
chunk_size : int
The number of lines of returned data in each chunk if
`chunked_retrieve` is `True`.
**read_csv_args : type **read_csv_args : type
Extra arguments passed to `pd.read_csv()` when parsing the retrieved Extra arguments passed to `pd.read_csv()` when parsing the retrieved
data. data.
...@@ -150,30 +140,60 @@ class zooniverse: ...@@ -150,30 +140,60 @@ class zooniverse:
return None return None
else: else:
print("Generating requested export...") print("Generating requested export...")
if wait: response = self.generate(item, wait)
print("\t\tWaiting for generation to complete...") if response is None:
else: warning("No data immediately available. Returning NoneType")
print("\t\tNot waiting for generation to complete...") return None
response = self._get_entity(item).get_export(
self._get_item_entry(item, "category"), generate=True, wait=wait
)
if response.ok: if response.ok:
if convert_to_pandas: if convert_to_pandas:
return ( return (
pd.read_csv( self._chunked_content(item, response, chunk_size=chunk_size)
if chunked_retrieve
else pd.read_csv(
io.BytesIO(response.content), io.BytesIO(response.content),
converters=zooniverse.category_converters[ converters=zooniverse.category_converters[
self._get_item_entry(item, "category") self._get_item_entry(item, "category")
], ],
) )
if wait
else response
) )
else: else:
return response return response
else: else:
return None return None
def _chunked_content(
    self,
    item: Union[dict, pd.Series],
    response: requests.Response,
    chunk_size: int = int(1e5),
) -> pd.DataFrame:
    """Parse the CSV body of ``response`` into a single DataFrame, batch by batch.

    Lines are pulled from ``response.iter_lines()`` in batches of at most
    ``chunk_size``. Each batch is parsed with ``pd.read_csv`` using the
    converters registered for the item's category, and the per-batch frames
    are concatenated at the end.

    Parameters
    ----------
    item : Union[dict, pd.Series]
        Shopping basket item; its "category" entry selects the converters
        from ``zooniverse.category_converters``.
    response : requests.Response
        Response whose body is the CSV export to parse.
    chunk_size : int
        Maximum number of lines parsed per ``pd.read_csv`` call.

    Returns
    -------
    pd.DataFrame
        All parsed rows with a fresh integer index. An empty DataFrame is
        returned when the response contains no non-blank lines.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    import itertools

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # The converters depend only on the item, so look them up once.
    converters = zooniverse.category_converters[
        self._get_item_entry(item, "category")
    ]

    # Use the library's default read size; a 1-byte read size makes
    # iter_lines() step through the payload byte by byte.
    line_iterator = response.iter_lines()

    chunk_frames = []
    column_names = None  # taken from the header row of the first batch
    while True:
        raw_lines = list(itertools.islice(line_iterator, chunk_size))
        if not raw_lines:
            # Only a batch with no lines at all means the iterator is
            # exhausted; a batch of blank lines must not end the loop.
            print("All data received.")
            break

        # iter_lines() can yield empty lines (e.g. around "\r\n"); drop them.
        lines = [line for line in raw_lines if line]
        if not lines:
            continue

        # NOTE(review): batching by physical lines assumes that no quoted
        # CSV field spans several lines - confirm for the export formats.
        frame = pd.read_csv(
            io.BytesIO(b"\n".join(lines)),
            converters=converters,
            # Only the first batch carries the header row; later batches
            # reuse its column names.
            header=0 if column_names is None else None,
            names=column_names,
        )
        if column_names is None:
            column_names = frame.columns
        chunk_frames.append(frame)

    if not chunk_frames:
        # pd.concat() raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(chunk_frames, axis=0, ignore_index=True)
def _get_entity(self, item): def _get_entity(self, item):
entity = zooniverse.entity_types[self._get_item_entry(item, "catalog")].find( entity = zooniverse.entity_types[self._get_item_entry(item, "catalog")].find(
int(self._get_item_entry(item, self._catalogue_to_id_string(item))) int(self._get_item_entry(item, self._catalogue_to_id_string(item)))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment