refactor notelistindexer

06edc574 · Koen van der Veen · b7138fd9 · 06edc574 · 06edc574 · 06edc574
Commit 06edc574 authored 4 years ago by Koen van der Veen
Expand all Hide whitespace changes
Inline Side-by-side

Showing

with 395 additions and 430 deletions
+395 -430
--- a/integrators/_nbdev.py
+++ b/integrators/_nbdev.py
@@ -31,18 +31,16 @@ index = {"read_file": "basic.ipynb",
         "ISpan": "indexers.NoteListIndexer.NoteList.ipynb",
         "get_span": "indexers.NoteListIndexer.NoteList.ipynb",
         "HTMLListParser": "indexers.NoteListIndexer.Parser.ipynb",
+         "LISTTYPE_VERBS": "indexers.NoteListIndexer.Parser.ipynb",
+         "LIST_PREFIXES": "indexers.NoteListIndexer.Parser.ipynb",
         "NotesListIndexer": "indexers.NoteListIndexer.ipynb",
         "ListTypePredictor": "indexers.NoteListIndexer.ipynb",
-         "get_toplevel_elements": "indexers.NoteListIndexer.util.ipynb",
         "remove_html": "indexers.NoteListIndexer.util.ipynb",
         "remove_prefix_chars": "indexers.NoteListIndexer.util.ipynb",
         "is_newline": "indexers.NoteListIndexer.util.ipynb",
-         "is_newline_div": "indexers.NoteListIndexer.util.ipynb",
-         "is_newline_paragraph": "indexers.NoteListIndexer.util.ipynb",
-         "div_is_unstructured_list_title": "indexers.NoteListIndexer.util.ipynb",
+         "is_newline_par": "indexers.NoteListIndexer.util.ipynb",
+         "is_title": "indexers.NoteListIndexer.util.ipynb",
         "trim_till_newline": "indexers.NoteListIndexer.util.ipynb",
-         "find_till_double_br": "indexers.NoteListIndexer.util.ipynb",
-         "get_children": "indexers.NoteListIndexer.util.ipynb",
         "contains": "indexers.NoteListIndexer.util.ipynb",
         "load_spacy_model": "indexers.NoteListIndexer.util.ipynb",
         "HTML_LINEBREAK_REGEX": "indexers.NoteListIndexer.util.ipynb",

--- a/integrators/indexers/notelist/notelist.py
+++ b/integrators/indexers/notelist/notelist.py
@@ -5,6 +5,7 @@ __all__ = ['LIST_CLASSES', 'INote', 'INoteList', 'ULNoteList', 'ISpan', 'get_spa
 # Cell
 from ...data.schema import *
 from .util import *
+import bs4

 # Cell
 TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN = "todo","towatch", "toread", "tolisten", "tobuy", "unknown"
@@ -59,9 +60,10 @@ class ULNoteList(INoteList):
    def get_items(self, remove_html_=False, skip_nested=False):

        if self.content is None: return [self.textContent]
-
-        result = [i for i in get_toplevel_elements(str(self.content), "li")
-                  if len(i("ul")) == 0]
+        parsed = bs4.BeautifulSoup(self.content, "html.parser").ul
+        result = [x for x in parsed.find_all("li", recursive=False) if len(x("ul")) == 0]
+#         result = [i for i in get_toplevel_elements(str(self.content), "li")
+#                   if len(i("ul")) == 0]

        if remove_html_: result = [remove_html(str(x)) for x in result]
        result = [str(x) for x in result if x != ""]

--- a/integrators/indexers/notelist/parser.py
+++ b/integrators/indexers/notelist/parser.py
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.Parser.ipynb (unless otherwise specified).

-__all__ = ['HTMLListParser']
+__all__ = ['HTMLListParser', 'LISTTYPE_VERBS', 'LIST_PREFIXES']

 # Cell
 import bs4
@@ -12,94 +12,89 @@ from ...data.basic import *
 from ...imports import *

 # Cell
+
+LISTTYPE_VERBS = ["do", "read", "watch", "buy", "listen"]
+LIST_PREFIXES = ["to", "to-", "to ", ""]
+
 class HTMLListParser():
    '''Extracts lists from HTML data, generated by an HTML text editor like evernote'''

    def __init__(self):
+        self.single_item_list_patterns = [p+v for v in LISTTYPE_VERBS for p in LIST_PREFIXES]

-        self.one_line_list_pa = ["buy", "read", "watch"]
-
-        words    = ["do", "read", "watch", "buy", "listen"]
-        prefixes = ["to", "to-", "to ", ""]
-
-        self.single_item_list_patterns = [prefix+word for word in words for prefix in prefixes]
+    def get_html_lists(self, note, parsed):
+        html_lists = parsed.find_all("ul", recursive=False) + parsed.find_all("ol", recursive=False)
+        return [ULNoteList.from_data(title=None, content=str(x), textContent=x.get_text(),
+                                     note=note, span=get_span(x, parsed)) for x in html_lists]

    def get_lists(self, note):
        """Extracts lists from a note"""
-
-        text = note.content
-        parsed = bs4.BeautifulSoup(text, 'html.parser')
+        parsed = bs4.BeautifulSoup(note.content, 'html.parser')
        note.content=str(parsed)

-        uls = get_toplevel_elements(text, "ul", parsed=parsed)
-        ols = get_toplevel_elements(text, "ol", parsed=parsed)
-
-        html_lists = [ULNoteList.from_data(title=None, content=str(x),
-         textContent=remove_html(str(x)), note=note, span=get_span(note, x, parsed))
-         for x in uls + ols]
-
-        unformatted_lists = self.get_unformatted_lists(note, text, parsed)
-        all_lists = html_lists + unformatted_lists
-
-        for l in all_lists:
-            note.add_edge("noteList", l)
+        all_lists = self.get_html_lists(note, parsed) + \
+                    self.get_unformatted_lists(note, parsed)
+        for l in all_lists: note.add_edge("noteList", l)

        return all_lists

-    def get_single_line_list(self, elem):
+    def parse(self, x, tag=None):
+        if isinstance(x, bs4.BeautifulSoup): return x.find(tag) if tag is not None else x
+        elif isinstance(x, bs4.element.Tag): return x
+        else:
+            res =  bs4.BeautifulSoup(x, 'html.parser')
+            return res.find(tag) if tag is not None else res
+
+    def get_single_line_list(self, par):
        """Get single list lists. An example could be: '<strong>read</strong>: great book title'"""
-        ps = ["read", "buy", "watch", "do"]
-        pat = "|".join([f"(<strong>|<em>|<u>)?{p}(</strong>|</em>|</u>)?:? ?" for p in ps])
-        match = re.search(pat, str(elem), re.IGNORECASE)
-        if match is None: return None, None
+        par = self.parse(par, "p")
+        par_html = "".join(mapped(str, par.contents))

-        cleaned_elem = remove_html(str(elem))
+        pat = "|".join([f"(<strong>|<em>|<u>)?{v}:? ?(</strong>|</em>|</u>)?:? ?"
+                        for v in LISTTYPE_VERBS])
+        match = re.search(pat, par_html, re.IGNORECASE)
+        if match is None: return None, None

-        cleaned_title = remove_html(match.group()) if match is not None else None
+        title_html = match.group() if match is not None else None

-        if len(cleaned_elem) > len(cleaned_title) + 2:
+        if len(par.get_text()) > len(remove_html(title_html)) + 2:
            title = match.group()
-            content = str(elem)[len(title):]
+            content = par_html[par_html.index(title) + len(title):]
            return title, content
        else:
            return None, None

-    def get_unformatted_lists(self, note, txt, parsed):
+    def get_unformatted_lists(self, note, parsed):
        """retrieve lists without <ul></ul> tags. We have two options:
                1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:")
                2) single element single line lists"""

-        toplevel_div = get_toplevel_elements(txt, "div")[0]
-        ls = []
-
-        for elem in toplevel_div.children:
-            if elem.name == "div" and not is_newline_div(elem):
-
-                children = get_children(elem)
-                for i, child in enumerate(children):
-                    # this extracts the lists that have a title and are not on a single line
-                    if div_is_unstructured_list_title(child):
-                        print(child)
-                        title = child
-                        successors = list(children)[i+1:]
-                        if len(successors) == 0:
-                            continue
-                        items = [x for x in find_till_double_br(successors) if not is_newline(str(x))]
-                        items_str = [str(x) for x in items]
-                        items_span = [get_span(note, x, parsed) for x in items_str]
-
-                        span1 = get_span(note, title, parsed)
-                        span2 = get_span(note, items[-1], parsed)
-                        span = Span.from_data(startIdx=span1.startIdx, endIdx=span2.endIdx)
-                        html_content = "".join(items_str)
-                        l = INoteList.from_data(note=note, title=title, content=str(html_content), itemSpan=items_span, span=span)
-                        ls.append(l)
-
-                    else:
-                        title, html_content = self.get_single_line_list(child)
-                        if title is not None:
-                            span = get_span(note, child, parsed)
-                            itemSpan = [Span.from_data(startIdx=span.startIdx + len(str(title)), endIdx=span.endIdx)]
-                            l = INoteList.from_data(note=note, title=title, content=str(html_content), itemSpan=itemSpan, span=span)
-                            ls.append(l)
-        return ls
\ No newline at end of file
+        parsed = parsed if parsed is not None else self.parse(note.content)
+        toplevel_paragraphs = parsed.find_all("p", recursive=False)
+        res = []
+
+
+        for i, par in enumerate(toplevel_paragraphs):
+            if is_title(par):
+                # this extracts the lists that have a title and are not on a single line
+                items = trim_till_newline(list(toplevel_paragraphs)[i+1:])
+                if len(items) == 0: continue
+                list_span  = Span.from_data(startIdx=get_span(title, parsed).startIdx,
+                                            endIdx=get_span(items[-1], parsed).endIdx)
+
+                l = INoteList.from_data(note=note,span=list_span,
+                                        title=str(par.contents[0]),
+                                        content="".join(mapped(str,items)),
+                                        itemSpan=[get_span(x, parsed) for x in items])
+                res.append(l)
+
+            else:
+                title, html_content = self.get_single_line_list(par)
+                if title is not None:
+                    span = get_span(str(par), parsed)
+                    itemSpans = [Span.from_data(startIdx=span.startIdx + len(str(title)),
+                                                endIdx=span.endIdx)]
+                    l = INoteList.from_data(note=note, title=title, content=str(html_content),
+                                            itemSpan=itemSpans, span=get_span(par, parsed))
+                    res.append(l)
+        return res
\ No newline at end of file
--- a/integrators/indexers/notelist/util.py
+++ b/integrators/indexers/notelist/util.py
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.util.ipynb (unless otherwise specified).

-__all__ = ['get_toplevel_elements', 'remove_html', 'remove_prefix_chars', 'is_newline', 'is_newline_div',
-           'is_newline_paragraph', 'div_is_unstructured_list_title', 'trim_till_newline', 'find_till_double_br',
-           'get_children', 'contains', 'load_spacy_model', 'HTML_LINEBREAK_REGEX']
+__all__ = ['remove_html', 'remove_prefix_chars', 'is_newline', 'is_newline_par', 'is_newline', 'is_title',
+           'trim_till_newline', 'contains', 'load_spacy_model', 'HTML_LINEBREAK_REGEX']

 # Cell
 import re, bs4, spacy
@@ -10,19 +9,6 @@ import re, bs4, spacy
 # Cell
 HTML_LINEBREAK_REGEX = "<br[^<]*/>"

-def get_toplevel_elements(str_, element, parsed=None):
-    if parsed is None:
-        parsed = bs4.BeautifulSoup(str_, "html.parser")
-
-    skip, result = [], []
-
-    for l in parsed(element):
-        if str(l) not in skip:
-            result.append(l)
-
-        skip += [str(l_nested) for l_nested in l(element)]
-    return result
-
 def remove_html(str_):
    return re.sub('<[^<]+?>', '', str_)

@@ -36,24 +22,24 @@ def is_newline(str_):
    if res.group() == str_: return True
    else: return False

-def is_newline_div(div):
-    c = div.contents
-    if is_newline_paragraph(div): return True
+def is_newline_par(par):
+    c = par.contents
+    if is_newline(par): return True
    elif len(c) == 0: return False
    elif len(c) == 1 and is_newline(str(c[0])): return True
    else: return False

-def is_newline_paragraph(p):
+def is_newline(p):
    return str(p) == "<p></p>"

-def div_is_unstructured_list_title(div):
+def is_title(par):
    p = "read"
    title_regex = f"(?<!<li>){p}|(?<!<li>)buy"

-    match = re.search(title_regex, str(div), re.IGNORECASE)
+    match = re.search(title_regex, str(par), re.IGNORECASE)
    if match is None: return False

-    cleaned_div = remove_html(str(div))
+    cleaned_div = remove_html(str(par))
    cleaned_title = remove_html(match.group()) if match is not None else None

    # the title should be the bulk of the div
@@ -68,34 +54,34 @@ def trim_till_newline(pars):
        end_idx = i
        if i == len(pars)-1:
            break
-        if is_newline_paragraph(par):
+        if is_newline_par(par):
            break
    return pars[:end_idx]

-def find_till_double_br(divs):
-    end_idx = 0
-    for i, s in enumerate(divs):
-        end_idx = i
-        if i == len(divs)-1:
-            break
-        if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
-#             print("FOUND ", successors[:i])
-            break
-    return divs[:end_idx]
-
-def get_children(elem):
-    """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
-    res = []
-    inc_str = ""
-
-    for c in elem.children:
-        c = str(c)
-        inc_str += c
-
-        if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
-            res.append(inc_str)
-            inc_str = ""
-    return res
+# def find_till_double_br(divs):
+#     end_idx = 0
+#     for i, s in enumerate(divs):
+#         end_idx = i
+#         if i == len(divs)-1:
+#             break
+#         if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
+# #             print("FOUND ", successors[:i])
+#             break
+#     return divs[:end_idx]
+
+# def get_children(elem):
+#     """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
+#     res = []
+#     inc_str = ""
+
+#     for c in elem.children:
+#         c = str(c)
+#         inc_str += c
+
+#         if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
+#             res.append(inc_str)
+#             inc_str = ""
+#     return res

 def contains(str_, pat):
    '''case insensitive match'''

--- a/nbs/indexers.NoteListIndexer.NoteList.ipynb
+++ b/nbs/indexers.NoteListIndexer.NoteList.ipynb
@@ -19,7 +19,8 @@
   "source": [
    "# export\n",
    "from integrators.data.schema import *\n",
-    "from integrators.indexers.notelist.util import *"
+    "from integrators.indexers.notelist.util import *\n",
+    "import bs4"
   ]
  },
  {
@@ -126,9 +127,10 @@
    "    def get_items(self, remove_html_=False, skip_nested=False):\n",
    "        \n",
    "        if self.content is None: return [self.textContent]\n",
-    "\n",
-    "        result = [i for i in get_toplevel_elements(str(self.content), \"li\")\n",
-    "                  if len(i(\"ul\")) == 0]\n",
+    "        parsed = bs4.BeautifulSoup(self.content, \"html.parser\").ul\n",
+    "        result = [x for x in parsed.find_all(\"li\", recursive=False) if len(x(\"ul\")) == 0]\n",
+    "#         result = [i for i in get_toplevel_elements(str(self.content), \"li\")\n",
+    "#                   if len(i(\"ul\")) == 0]\n",
    "        \n",
    "        if remove_html_: result = [remove_html(str(x)) for x in result]\n",
    "        result = [str(x) for x in result if x != \"\"]\n",
@@ -149,7 +151,7 @@
    {
     "data": {
      "text/plain": [
-       "# Awesome title \n",
+       "ULNoteList # Awesome title \n",
       "\n"
      ]
     },
@@ -159,7 +161,7 @@
    }
   ],
   "source": [
-    "ULNoteList.from_data(title=\"Awesome title\", content=\"Awesome content\")"
+    "ULNoteList.from_data(title=\"Awesome title\", content=\"<ul>Awesome content</ul>\")"
   ]
  },
  {

 %% Cell type:code id: tags:

 ``` python
 %load_ext autoreload
 %autoreload 2
 # default_exp indexers.notelist.notelist
 ```

 %% Cell type:code id: tags:

 ``` python
 # export
 from integrators.data.schema import *
 from integrators.indexers.notelist.util import *
+import bs4
 ```

 %% Cell type:code id: tags:

 ``` python
 # export
 TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN = "todo","towatch", "toread", "tolisten", "tobuy", "unknown"
 LIST_CLASSES = [TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN]
 ```

 %% Cell type:markdown id: tags:

 # Note
 A note is an HTML-formatted piece of text.

 %% Cell type:code id: tags:

 ``` python
 # export
 class INote(Note):
    """A note, i.e. an HTML-formatted piece of text."""

    def __repr__(self):
        """Return a one-line summary: the uid plus at most 50 characters of content."""
        limit = 50
        content = self.content
        # Append the ellipsis only when content is actually cut, so that
        # content of `limit` characters or fewer is shown verbatim.
        if len(content) > limit:
            content = content[:limit] + " ..."
        return f"INote (#{self.uid}) {content}"

    def show(self):
        """Print the uid followed by the complete, untruncated content."""
        print(f"INote (#{self.uid}) {self.content}")
 ```

 %% Cell type:markdown id: tags:

 # NoteLists

 A notelist object denotes a list contained in a written html note.

 %% Cell type:code id: tags:

 ``` python
 # export
 class INoteList(NoteList):
    """A list contained in a written HTML note."""

    def __str__(self):
        """Return the list content as a string."""
        return str(self.content)

    def infer_cat_from_title(self):
        """Map the title onto one of the list categories.

        The keywords are tried in a fixed order (do, read, watch, listen, buy)
        using the case-insensitive `contains` helper; the first hit wins.
        Returns None when there is no title or no keyword is found.
        """
        if self.title is None:
            return None

        keyword_categories = (
            ("do", TODO),
            ("read", TOREAD),
            ("watch", TOWATCH),
            ("listen", TOLISTEN),
            ("buy", TOBUY),
        )
        for keyword, category in keyword_categories:
            if contains(self.title, keyword):
                return category
        return None

    def get_items(self, remove_html_=False, skip_nested=True):
        """Return the slices of the note content addressed by `itemSpan`.

        Returns an empty list when `itemSpan` is None. `remove_html_` and
        `skip_nested` are accepted but not used by this implementation.
        """
        if self.itemSpan is None:
            return []
        return [str(self.note.content)[s.startIdx:s.endIdx] for s in self.itemSpan]

    def __repr__(self):
        """Return a multi-line summary: title (or "Untitled"), category, content."""
        if self.category is not None:
            cat_str = f"({self.category})"
        else:
            cat_str = ""
        if self.title is not None:
            title = remove_html(self.title)
        else:
            title = "Untitled"
        return f"(INoteList) # {title} {cat_str}\n{self.content}\n\n"

    def __eq__(self, other):
        """Two lists are equal when their uids are equal.

        Objects without a `uid` attribute (e.g. None) yield NotImplemented, so
        the comparison evaluates to False instead of raising AttributeError.
        """
        # NOTE(review): defining __eq__ without __hash__ normally makes
        # instances unhashable -- confirm that is intended.
        if not hasattr(other, "uid"):
            return NotImplemented
        return self.uid == other.uid

 ```

 %% Cell type:markdown id: tags:

 ## ULNoteList
 A ULNoteList is the most vanilla kind of list. It is a list of items encapsulated by \<ul> \</ul> tags.

 %% Cell type:code id: tags:

 ``` python
 # export
 class ULNoteList(INoteList):
    '''A <ul> </ul> list extracted from a note. '''

    def get_items(self, remove_html_=False, skip_nested=False):

        if self.content is None: return [self.textContent]
-
-        result = [i for i in get_toplevel_elements(str(self.content), "li")
-                  if len(i("ul")) == 0]
+        parsed = bs4.BeautifulSoup(self.content, "html.parser").ul
+        result = [x for x in parsed.find_all("li", recursive=False) if len(x("ul")) == 0]
+#         result = [i for i in get_toplevel_elements(str(self.content), "li")
+#                   if len(i("ul")) == 0]

        if remove_html_: result = [remove_html(str(x)) for x in result]
        result = [str(x) for x in result if x != ""]
        return result

    def __repr__(self):
        items = "\n".join(self.get_items(remove_html_=True))
        cat_str = f"({self.category})" if self.category is not None else ""
        title = remove_html(self.title) if self.title is not None else "Untitled"
        return f"ULNoteList # {title} {cat_str}\n{items}\n\n"
 ```

 %% Cell type:code id: tags:

 ``` python
-ULNoteList.from_data(title="Awesome title", content="Awesome content")
+ULNoteList.from_data(title="Awesome title", content="<ul>Awesome content</ul>")
 ```

 %% Output

-    # Awesome title
+    ULNoteList # Awesome title
    

 %% Cell type:markdown id: tags:

 ## Span
 We use spans to specify a range within a piece of text. For instance, given the text "Memri solves all your problems", a span with startIdx=6 and endIdx=16 points to "solves all".

 %% Cell type:code id: tags:

 ``` python
 # export
 class ISpan(Span):
    '''A span of an element in a piece of text'''

    def __eq__(self, other):
        """Spans are equal when both startIdx and endIdx are equal.

        Objects lacking either attribute yield NotImplemented, so the
        comparison evaluates to False instead of raising AttributeError.
        """
        if not (hasattr(other, "startIdx") and hasattr(other, "endIdx")):
            return NotImplemented
        same_start = self.startIdx == other.startIdx
        return same_start and self.endIdx == other.endIdx

    def __repr__(self):
        """Return the span as "ISpan [start, end]"."""
        return f"ISpan [{self.startIdx}, {self.endIdx}]"
 ```

 %% Cell type:code id: tags:

 ``` python
 # export
 def get_span(elem, parsed):
    """Locate the string form of `elem` inside the string form of `parsed`.

    Returns an ISpan covering the first occurrence.
    NOTE(review): `str.find` returns -1 when there is no occurrence, in which
    case the resulting span starts at -1 -- callers may want to check for it.
    """
    needle, haystack = str(elem), str(parsed)
    start = haystack.find(needle)
    return ISpan.from_data(startIdx=start, endIdx=start + len(needle))
 ```

 %% Cell type:markdown id: tags:

 # Export -

 %% Cell type:code id: tags:

 ``` python
 # hide
 from nbdev.export import *
 notebook2script()
 ```

 %% Output

    Converted basic.ipynb.
    Converted importers.EmailImporter.ipynb.
    Converted importers.Importer.ipynb.
    Converted importers.util.ipynb.
    Converted index.ipynb.
    Converted indexers.FaceRecognitionIndexer.ipynb.
    Converted indexers.FacerecognitionIndexer.Photo.ipynb.
    Converted indexers.GeoIndexer.ipynb.
    Converted indexers.NoteListIndexer.NoteList.ipynb.
    Converted indexers.NoteListIndexer.Parser.ipynb.
    Converted indexers.NoteListIndexer.ipynb.
    Converted indexers.NoteListIndexer.util.ipynb.
    Converted indexers.indexer.ipynb.
    Converted itembase.ipynb.
    Converted pod.client.ipynb.

 %% Cell type:code id: tags:

 ``` python
 ```

--- a/nbs/indexers.NoteListIndexer.Parser.ipynb
+++ b/nbs/indexers.NoteListIndexer.Parser.ipynb
--- a/nbs/indexers.NoteListIndexer.util.ipynb
+++ b/nbs/indexers.NoteListIndexer.util.ipynb
@@ -37,19 +37,6 @@
    "# export\n",
    "HTML_LINEBREAK_REGEX = \"<br[^<]*/>\"\n",
    "\n",
-    "def get_toplevel_elements(str_, element, parsed=None):\n",
-    "    if parsed is None:\n",
-    "        parsed = bs4.BeautifulSoup(str_, \"html.parser\")\n",
-    "\n",
-    "    skip, result = [], []\n",
-    "\n",
-    "    for l in parsed(element):\n",
-    "        if str(l) not in skip:\n",
-    "            result.append(l)\n",
-    "\n",
-    "        skip += [str(l_nested) for l_nested in l(element)]\n",
-    "    return result\n",
-    "\n",
    "def remove_html(str_):\n",
    "    return re.sub('<[^<]+?>', '', str_)\n",
    "\n",
@@ -63,24 +50,24 @@
    "    if res.group() == str_: return True\n",
    "    else: return False\n",
    "\n",
-    "def is_newline_div(div):\n",
-    "    c = div.contents\n",
-    "    if is_newline_paragraph(div): return True\n",
+    "def is_newline_par(par):\n",
+    "    c = par.contents\n",
+    "    if is_newline(par): return True\n",
    "    elif len(c) == 0: return False\n",
    "    elif len(c) == 1 and is_newline(str(c[0])): return True\n",
    "    else: return False\n",
    "    \n",
-    "def is_newline_paragraph(p):\n",
+    "def is_newline(p):\n",
    "    return str(p) == \"<p></p>\"\n",
    "\n",
-    "def div_is_unstructured_list_title(div):\n",
+    "def is_title(par):\n",
    "    p = \"read\"\n",
    "    title_regex = f\"(?<!<li>){p}|(?<!<li>)buy\"\n",
    "\n",
-    "    match = re.search(title_regex, str(div), re.IGNORECASE)\n",
+    "    match = re.search(title_regex, str(par), re.IGNORECASE)\n",
    "    if match is None: return False\n",
    "    \n",
-    "    cleaned_div = remove_html(str(div))    \n",
+    "    cleaned_div = remove_html(str(par))    \n",
    "    cleaned_title = remove_html(match.group()) if match is not None else None\n",
    "\n",
    "    # the title should be the bulk of the div\n",
@@ -95,34 +82,34 @@
    "        end_idx = i\n",
    "        if i == len(pars)-1:\n",
    "            break\n",
-    "        if is_newline_paragraph(par):\n",
+    "        if is_newline_par(par):\n",
    "            break\n",
    "    return pars[:end_idx]\n",
    "\n",
-    "def find_till_double_br(divs):\n",
-    "    end_idx = 0\n",
-    "    for i, s in enumerate(divs):\n",
-    "        end_idx = i\n",
-    "        if i == len(divs)-1:\n",
-    "            break\n",
-    "        if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):\n",
-    "#             print(\"FOUND \", successors[:i])\n",
-    "            break\n",
-    "    return divs[:end_idx]\n",
+    "# def find_till_double_br(divs):\n",
+    "#     end_idx = 0\n",
+    "#     for i, s in enumerate(divs):\n",
+    "#         end_idx = i\n",
+    "#         if i == len(divs)-1:\n",
+    "#             break\n",
+    "#         if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):\n",
+    "# #             print(\"FOUND \", successors[:i])\n",
+    "#             break\n",
+    "#     return divs[:end_idx]\n",
    "\n",
-    "def get_children(elem):\n",
-    "    \"\"\"Fetches children of an element, put combines children when they are style element like <strong>example</strong>\"\"\"\n",
-    "    res = []\n",
-    "    inc_str = \"\"\n",
+    "# def get_children(elem):\n",
+    "#     \"\"\"Fetches children of an element, put combines children when they are style element like <strong>example</strong>\"\"\"\n",
+    "#     res = []\n",
+    "#     inc_str = \"\"\n",
    "    \n",
-    "    for c in elem.children:\n",
-    "        c = str(c)\n",
-    "        inc_str += c\n",
+    "#     for c in elem.children:\n",
+    "#         c = str(c)\n",
+    "#         inc_str += c\n",
    "        \n",
-    "        if not(c.startswith(\"<strong>\") | c.startswith(\"<em>\") | c.startswith(\"<u>\")):\n",
-    "            res.append(inc_str)\n",
-    "            inc_str = \"\"\n",
-    "    return res\n",
+    "#         if not(c.startswith(\"<strong>\") | c.startswith(\"<em>\") | c.startswith(\"<u>\")):\n",
+    "#             res.append(inc_str)\n",
+    "#             inc_str = \"\"\n",
+    "#     return res\n",
    "\n",
    "def contains(str_, pat):\n",
    "    '''case insensitive match'''\n",

 %% Cell type:code id: tags:

 ``` python
 %load_ext autoreload
 %autoreload 2
 # default_exp indexers.notelist.util
 ```

 %% Cell type:code id: tags:

 ``` python
 # export
 import re, bs4, spacy
 ```

 %% Cell type:markdown id: tags:

 # Util

 %% Cell type:code id: tags:

 ``` python
 # export
 HTML_LINEBREAK_REGEX = "<br[^<]*/>"

-def get_toplevel_elements(str_, element, parsed=None):
-    if parsed is None:
-        parsed = bs4.BeautifulSoup(str_, "html.parser")
-
-    skip, result = [], []
-
-    for l in parsed(element):
-        if str(l) not in skip:
-            result.append(l)
-
-        skip += [str(l_nested) for l_nested in l(element)]
-    return result
-
 def remove_html(str_):
    """Return `str_` with every `<...>` tag-like span deleted."""
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', str_)

 def remove_prefix_chars(s, chars):
    """Return `s` without its leading run of characters found in `chars`.

    `chars` is anything supporting `in` for single characters (a string, a
    list, a set, ...). An empty `s`, or an `s` consisting only of prefix
    characters, yields an empty result.
    """
    start = 0
    # Check the bound before indexing so an exhausted string ends the loop
    # instead of raising IndexError.
    while start < len(s) and s[start] in chars:
        start += 1
    return s[start:]

 def is_newline(str_):
    """Return True when `str_` is, in its entirety, one HTML line-break tag.

    The first case-insensitive match of HTML_LINEBREAK_REGEX must equal the
    whole input string.
    """
    found = re.search(HTML_LINEBREAK_REGEX, str_, re.IGNORECASE)
    return found is not None and found.group() == str_

-def is_newline_div(div):
-    c = div.contents
-    if is_newline_paragraph(div): return True
+def is_newline_par(par):
+    c = par.contents
+    if is_newline(par): return True
    elif len(c) == 0: return False
    elif len(c) == 1 and is_newline(str(c[0])): return True
    else: return False

-def is_newline_paragraph(p):
+def is_newline(p):
    return str(p) == "<p></p>"

-def div_is_unstructured_list_title(div):
+def is_title(par):
    p = "read"
    title_regex = f"(?<!<li>){p}|(?<!<li>)buy"

-    match = re.search(title_regex, str(div), re.IGNORECASE)
+    match = re.search(title_regex, str(par), re.IGNORECASE)
    if match is None: return False

-    cleaned_div = remove_html(str(div))
+    cleaned_div = remove_html(str(par))
    cleaned_title = remove_html(match.group()) if match is not None else None

    # the title should be the bulk of the div
    if len(cleaned_title) > len(cleaned_div) - 2:
        return True
    else:
        return False

 def trim_till_newline(pars):
    end_idx = 0
    for i, par in enumerate(pars):
        end_idx = i
        if i == len(pars)-1:
            break
-        if is_newline_paragraph(par):
+        if is_newline_par(par):
            break
    return pars[:end_idx]

-def find_till_double_br(divs):
-    end_idx = 0
-    for i, s in enumerate(divs):
-        end_idx = i
-        if i == len(divs)-1:
-            break
-        if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
-#             print("FOUND ", successors[:i])
-            break
-    return divs[:end_idx]
-
-def get_children(elem):
-    """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
-    res = []
-    inc_str = ""
-
-    for c in elem.children:
-        c = str(c)
-        inc_str += c
-
-        if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
-            res.append(inc_str)
-            inc_str = ""
-    return res
+# def find_till_double_br(divs):
+#     end_idx = 0
+#     for i, s in enumerate(divs):
+#         end_idx = i
+#         if i == len(divs)-1:
+#             break
+#         if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
+# #             print("FOUND ", successors[:i])
+#             break
+#     return divs[:end_idx]
+
+# def get_children(elem):
+#     """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
+#     res = []
+#     inc_str = ""
+
+#     for c in elem.children:
+#         c = str(c)
+#         inc_str += c
+
+#         if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
+#             res.append(inc_str)
+#             inc_str = ""
+#     return res

 def contains(str_, pat):
    '''Case-insensitive regex search: True when `pat` matches anywhere in `str_`.'''
    found = re.search(pat, str_, flags=re.IGNORECASE)
    return found is not None

 def load_spacy_model(m):
    """Load the spaCy model named `m`.

    If loading raises OSError, the model is downloaded through
    `spacy.cli.download` and loading is attempted once more.
    """
    try:
        return spacy.load(m)
    except OSError:
        print("Downloading language model for spaCy, this will only happen once")
        from spacy.cli import download
        download(m)
        return spacy.load(m)
 ```

 %% Cell type:markdown id: tags:

 # Export -

 %% Cell type:code id: tags:

 ``` python
 # hide
 from nbdev.export import *
 notebook2script()
 ```

 %% Output

    Converted basic.ipynb.
    Converted importers.EmailImporter.ipynb.
    Converted importers.Importer.ipynb.
    Converted importers.util.ipynb.
    Converted index.ipynb.
    Converted indexers.FaceRecognitionIndexer.ipynb.
    Converted indexers.FacerecognitionIndexer.Photo.ipynb.
    Converted indexers.GeoIndexer.ipynb.
    Converted indexers.NoteListIndexer.NoteList.ipynb.
    Converted indexers.NoteListIndexer.Parser.ipynb.
    Converted indexers.NoteListIndexer.ipynb.
    Converted indexers.NoteListIndexer.util.ipynb.
    Converted indexers.indexer.ipynb.
    Converted itembase.ipynb.
    Converted pod.client.ipynb.

 %% Cell type:code id: tags:

 ``` python
 ```