rewrite + fix export_dataset

59df96e7 · Eelco van der Wel · 6cdd832f · 59df96e7 · 59df96e7 · 59df96e7
Commit 59df96e7 authored 3 years ago by Eelco van der Wel 💬
Expand all Hide whitespace changes
Inline Side-by-side

Showing

with 789 additions and 12 deletions
+789 -12
--- a/nbs/exporters.query.ipynb
+++ b/nbs/exporters.query.ipynb
--- a/pymemri/_nbdev.py
+++ b/pymemri/_nbdev.py
@@ -18,7 +18,10 @@ index = {"read_file": "basic.ipynb",
         "DEFAULT_ENCODING": "data.photo.ipynb",
         "show_images": "data.photo.ipynb",
         "Photo": "data.photo.ipynb",
-         "Query": "exporters.query.ipynb",
+         "Query": "exporters.exporters.ipynb",
+         "Dataset": "exporters.exporters.ipynb",
+         "filter_missing": "exporters.exporters.ipynb",
+         "export_dataset": "exporters.exporters.ipynb",
         "ALL_EDGES": "itembase.ipynb",
         "Edge": "itembase.ipynb",
         "ItemBase": "itembase.ipynb",
@@ -94,7 +97,7 @@ index = {"read_file": "basic.ipynb",
 modules = ["data/basic.py",
           "cvu/utils.py",
           "data/photo.py",
-           "exporters/query.py",
+           "exporters/exporters.py",
           "data/itembase.py",
           "plugin/authenticators/credentials.py",
           "plugin/authenticators/oauth.py",

--- a/pymemri/exporters/__init__.py
+++ b/pymemri/exporters/__init__.py
+from .exporters import export_dataset
+
+__all__ = ["export_dataset"]
\ No newline at end of file
--- a/pymemri/exporters/query.py
+++ b/pymemri/exporters/query.py
-# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/exporters.query.ipynb (unless otherwise specified).
+# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/exporters.exporters.ipynb (unless otherwise specified).

-__all__ = ['Query']
+__all__ = ['Query', 'Dataset', 'filter_missing', 'export_dataset']

 # Cell
 # hide
@@ -27,9 +27,12 @@ class Query:
        items = items.copy()

        for edge in edges:
-            items_to_query = dict()
+            ids_to_query = list()
+            query_item_idx = list()
            for i in range(len(items)):
                item = items[i]
+                if item is None:
+                    continue
                # Replace item with target item. If the edge is empty, it has to be queried again.
                try:
                    if edge not in item.edges:
@@ -37,17 +40,17 @@ class Query:
                    else:
                        items[i] = getattr(item, edge)[0]
                except Exception:
-                    items_to_query[i] = item
+                    ids_to_query.append(item.id)
+                    query_item_idx.append(i)
                    items[i] = None

-            # TODO Pod can't currently get multiple items by ID, API call for each item is required for now.
-            for i, item in items_to_query.items():
+
+            new_items = client.search({"ids": ids_to_query})
+            for i, new_item in zip(query_item_idx, new_items):
                try:
-                    result = client.get(item.id)
-                    items[i] = getattr(result, edge)[0]
+                    items[i] = getattr(new_item, edge)[0]
                except Exception:
                    items[i] = None
-
        return items

    def get_property_values(
@@ -80,4 +83,43 @@ class Query:
        result = {
            prop: self.get_property_values(client, prop, items) for prop in self.properties
        }
-        return self.convert_dtype(result, dtype)
\ No newline at end of file
+        return self.convert_dtype(result, dtype)
+
+# Cell
+class Dataset(Item):
+    """Provisional dataset schema; to be removed once MVP2 is done.
+
+    Adds a ``queryStr`` property and an ``item`` edge on top of ``Item``.
+    """
+
+    properties = Item.properties + ["queryStr"]
+    edges = Item.edges + ["item"]
+
+    def __init__(self, queryStr: str = None, item: list = None, **kwargs):
+        super().__init__(**kwargs)
+        self.queryStr = queryStr
+        # Give every instance its own list when no items are supplied.
+        if item is None:
+            item = []
+        self.item: list = item
+
+# Cell
+def filter_missing(dataset: dict) -> dict:
+    """Remove every row that holds ``None`` in at least one column.
+
+    ``dataset`` maps column names to sequences of values. A row index is
+    dropped from all columns as soon as any single column has ``None`` at
+    that index. The input mapping is not modified; a new dict of lists is
+    returned.
+    """
+    rows_to_drop = {
+        row
+        for values in dataset.values()
+        for row, value in enumerate(values)
+        if value is None
+    }
+
+    filtered = {}
+    for name, values in dataset.items():
+        filtered[name] = [
+            value for row, value in enumerate(values) if row not in rows_to_drop
+        ]
+    return filtered
+
+def export_dataset(
+    client: PodClient,
+    dataset: Dataset,
+    content_fields: List[str] = None,
+    label_field: str = "label.value",
+    missing_values: bool = False,
+    dtype: str = "dict",
+):
+    """Export the items of ``dataset`` as queried property values.
+
+    Builds a ``Query`` over ``"id"``, every entry of ``content_fields`` and
+    ``label_field``, and executes it against ``dataset.item``.
+
+    Args:
+        client: Pod client handed to ``Query.execute``.
+        dataset: Dataset whose ``item`` edge holds the items to export.
+        content_fields: Names of the properties to query as content.
+            Defaults to ``["content"]`` when omitted.
+        label_field: Name of the property to query as the label.
+        missing_values: When False, rows with ``None`` in any column are
+            removed via ``filter_missing``; when True they are kept.
+        dtype: Output format forwarded to ``Query.convert_dtype``.
+
+    Returns:
+        The query result as produced by ``Query.convert_dtype`` for ``dtype``.
+    """
+    # A None sentinel avoids sharing one mutable default list across calls.
+    if content_fields is None:
+        content_fields = ["content"]
+
+    query = Query("id", *content_fields, label_field)
+    result = query.execute(client, dataset.item)
+
+    if not missing_values:
+        result = filter_missing(result)
+
+    return query.convert_dtype(result, dtype)
\ No newline at end of file