Commit 59df96e7 authored by Eelco van der Wel
Browse files

rewrite + fix export_dataset

parent 6cdd832f
Showing with 789 additions and 12 deletions
+789 -12
......@@ -18,7 +18,10 @@ index = {"read_file": "basic.ipynb",
"DEFAULT_ENCODING": "data.photo.ipynb",
"show_images": "data.photo.ipynb",
"Photo": "data.photo.ipynb",
"Query": "exporters.query.ipynb",
"Query": "exporters.exporters.ipynb",
"Dataset": "exporters.exporters.ipynb",
"filter_missing": "exporters.exporters.ipynb",
"export_dataset": "exporters.exporters.ipynb",
"ALL_EDGES": "itembase.ipynb",
"Edge": "itembase.ipynb",
"ItemBase": "itembase.ipynb",
......@@ -94,7 +97,7 @@ index = {"read_file": "basic.ipynb",
modules = ["data/basic.py",
"cvu/utils.py",
"data/photo.py",
"exporters/query.py",
"exporters/exporters.py",
"data/itembase.py",
"plugin/authenticators/credentials.py",
"plugin/authenticators/oauth.py",
......
from .exporters import export_dataset
__all__ = ["export_dataset"]
\ No newline at end of file
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/exporters.query.ipynb (unless otherwise specified).
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/exporters.exporters.ipynb (unless otherwise specified).
__all__ = ['Query']
__all__ = ['Query', 'Dataset', 'filter_missing', 'export_dataset']
# Cell
# hide
......@@ -27,9 +27,12 @@ class Query:
items = items.copy()
for edge in edges:
items_to_query = dict()
ids_to_query = list()
query_item_idx = list()
for i in range(len(items)):
item = items[i]
if item is None:
continue
# Replace item with target item. If the edge is empty, it has to be queried again.
try:
if edge not in item.edges:
......@@ -37,17 +40,17 @@ class Query:
else:
items[i] = getattr(item, edge)[0]
except Exception:
items_to_query[i] = item
ids_to_query.append(item.id)
query_item_idx.append(i)
items[i] = None
# TODO Pod can't currently get multiple items by ID, API call for each item is required for now.
for i, item in items_to_query.items():
new_items = client.search({"ids": ids_to_query})
for i, new_item in zip(query_item_idx, new_items):
try:
result = client.get(item.id)
items[i] = getattr(result, edge)[0]
items[i] = getattr(new_item, edge)[0]
except Exception:
items[i] = None
return items
def get_property_values(
......@@ -80,4 +83,43 @@ class Query:
result = {
prop: self.get_property_values(client, prop, items) for prop in self.properties
}
return self.convert_dtype(result, dtype)
\ No newline at end of file
return self.convert_dtype(result, dtype)
# Cell
class Dataset(Item):
    """
    Temporary dataset schema; to be removed once MVP2 is done.

    Extends `Item` with one extra property, `queryStr`, and one extra edge,
    `item`.
    """

    properties = Item.properties + ["queryStr"]
    edges = Item.edges + ["item"]

    def __init__(self, queryStr: str = None, item: list = None, **kwargs):
        super().__init__(**kwargs)
        self.queryStr = queryStr
        # Fall back to a fresh list so the edge is always a list, never None.
        self.item: list = [] if item is None else item
# Cell
def filter_missing(dataset: dict) -> dict:
    """
    Drop every row of a column-oriented dataset that has a missing value.

    A row is the set of values sharing one index across all columns. If any
    column holds `None` at an index, that index is removed from every column.

    Args:
        dataset: Mapping of column name to a sequence of values.

    Returns:
        A new dict with the same keys, each mapped to a new list that omits
        the rows containing `None`. The input mapping is not modified.
    """
    missing_idx = set()
    for column in dataset.values():
        # Feed the set from a generator instead of a throwaway list.
        missing_idx.update(i for i, val in enumerate(column) if val is None)
    return {
        k: [item for i, item in enumerate(v) if i not in missing_idx]
        for k, v in dataset.items()
    }
def export_dataset(
    client: PodClient,
    dataset: Dataset,
    content_fields: List[str] = None,
    label_field: str = "label.value",
    missing_values: bool = False,
    dtype: str = "dict",
):
    """
    Export the items of a dataset as columns of id, content and label values.

    Builds a `Query` over `"id"`, the content fields and the label field, and
    executes it against the items on the dataset's `item` edge.

    Args:
        client: Pod client the query is executed with.
        dataset: Dataset whose `item` edge holds the items to export.
        content_fields: Properties to export as content columns. Defaults to
            `["content"]` when omitted.
        label_field: Property to export as the label column.
        missing_values: When false, rows with `None` in any column are
            removed with `filter_missing` before conversion.
        dtype: Output format, forwarded to `Query.convert_dtype`.

    Returns:
        The (optionally filtered) query result converted by
        `Query.convert_dtype`.
    """
    # None sentinel instead of a mutable list default, which would be shared
    # across calls.
    if content_fields is None:
        content_fields = ["content"]

    query = Query("id", *content_fields, label_field)
    result = query.execute(client, dataset.item)
    if not missing_values:
        result = filter_missing(result)
    return query.convert_dtype(result, dtype)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment