Commit 06edc574 authored by Koen van der Veen's avatar Koen van der Veen
Browse files

refactor notelistindexer

parent b7138fd9
Showing with 395 additions and 430 deletions
+395 -430
......@@ -31,18 +31,16 @@ index = {"read_file": "basic.ipynb",
"ISpan": "indexers.NoteListIndexer.NoteList.ipynb",
"get_span": "indexers.NoteListIndexer.NoteList.ipynb",
"HTMLListParser": "indexers.NoteListIndexer.Parser.ipynb",
"LISTTYPE_VERBS": "indexers.NoteListIndexer.Parser.ipynb",
"LIST_PREFIXES": "indexers.NoteListIndexer.Parser.ipynb",
"NotesListIndexer": "indexers.NoteListIndexer.ipynb",
"ListTypePredictor": "indexers.NoteListIndexer.ipynb",
"get_toplevel_elements": "indexers.NoteListIndexer.util.ipynb",
"remove_html": "indexers.NoteListIndexer.util.ipynb",
"remove_prefix_chars": "indexers.NoteListIndexer.util.ipynb",
"is_newline": "indexers.NoteListIndexer.util.ipynb",
"is_newline_div": "indexers.NoteListIndexer.util.ipynb",
"is_newline_paragraph": "indexers.NoteListIndexer.util.ipynb",
"div_is_unstructured_list_title": "indexers.NoteListIndexer.util.ipynb",
"is_newline_par": "indexers.NoteListIndexer.util.ipynb",
"is_title": "indexers.NoteListIndexer.util.ipynb",
"trim_till_newline": "indexers.NoteListIndexer.util.ipynb",
"find_till_double_br": "indexers.NoteListIndexer.util.ipynb",
"get_children": "indexers.NoteListIndexer.util.ipynb",
"contains": "indexers.NoteListIndexer.util.ipynb",
"load_spacy_model": "indexers.NoteListIndexer.util.ipynb",
"HTML_LINEBREAK_REGEX": "indexers.NoteListIndexer.util.ipynb",
......
......@@ -5,6 +5,7 @@ __all__ = ['LIST_CLASSES', 'INote', 'INoteList', 'ULNoteList', 'ISpan', 'get_spa
# Cell
from ...data.schema import *
from .util import *
import bs4
# Cell
TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN = "todo","towatch", "toread", "tolisten", "tobuy", "unknown"
......@@ -59,9 +60,10 @@ class ULNoteList(INoteList):
def get_items(self, remove_html_=False, skip_nested=False):
if self.content is None: return [self.textContent]
result = [i for i in get_toplevel_elements(str(self.content), "li")
if len(i("ul")) == 0]
parsed = bs4.BeautifulSoup(self.content, "html.parser").ul
result = [x for x in parsed.find_all("li", recursive=False) if len(x("ul")) == 0]
# result = [i for i in get_toplevel_elements(str(self.content), "li")
# if len(i("ul")) == 0]
if remove_html_: result = [remove_html(str(x)) for x in result]
result = [str(x) for x in result if x != ""]
......
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.Parser.ipynb (unless otherwise specified).
__all__ = ['HTMLListParser']
__all__ = ['HTMLListParser', 'LISTTYPE_VERBS', 'LIST_PREFIXES']
# Cell
import bs4
......@@ -12,94 +12,89 @@ from ...data.basic import *
from ...imports import *
# Cell
LISTTYPE_VERBS = ["do", "read", "watch", "buy", "listen"]
LIST_PREFIXES = ["to", "to-", "to ", ""]
class HTMLListParser():
    '''Extracts lists from HTML data, generated by an HTML text editor like evernote'''

    def __init__(self):
        # Every prefix/verb combination, e.g. "todo", "to-read", "to buy", "watch".
        self.single_item_list_patterns = [p+v for v in LISTTYPE_VERBS for p in LIST_PREFIXES]
        self.one_line_list_pa = ["buy", "read", "watch"]

    def get_html_lists(self, note, parsed):
        """Wrap every direct <ul>/<ol> child of `parsed` in a ULNoteList.

        All <ul> elements come first, followed by all <ol> elements.
        """
        html_lists = parsed.find_all("ul", recursive=False) + parsed.find_all("ol", recursive=False)
        return [ULNoteList.from_data(title=None, content=str(x), textContent=x.get_text(),
                                     note=note, span=get_span(x, parsed))
                for x in html_lists]

    def get_lists(self, note):
        """Extracts lists from a note.

        Overwrites `note.content` with the re-serialised parse tree and adds
        a "noteList" edge on the note for every list that is returned.
        """
        parsed = bs4.BeautifulSoup(note.content, 'html.parser')
        # Spans are computed against str(parsed), so store exactly that string.
        note.content = str(parsed)

        all_lists = self.get_html_lists(note, parsed) + \
                    self.get_unformatted_lists(note, parsed)

        for note_list in all_lists:
            note.add_edge("noteList", note_list)
        return all_lists

    def parse(self, x, tag=None):
        """Return `x` as a parsed object.

        A BeautifulSoup document or raw markup is narrowed to its first `tag`
        element when `tag` is given (None if there is no such element); a Tag
        is returned unchanged.
        """
        # BeautifulSoup must be tested first: it is itself a subclass of Tag.
        if isinstance(x, bs4.BeautifulSoup):
            return x.find(tag) if tag is not None else x
        if isinstance(x, bs4.element.Tag):
            return x
        res = bs4.BeautifulSoup(x, 'html.parser')
        return res.find(tag) if tag is not None else res

    def get_single_line_list(self, par):
        """Get single-line lists. An example could be: '<strong>read</strong>: great book title'

        Returns a `(title, content)` pair of HTML strings, or `(None, None)`
        when the paragraph is not a single-line list.
        """
        par = self.parse(par, "p")
        if par is None:
            # Raw markup without any <p> element: nothing to inspect.
            return None, None
        par_html = "".join(str(c) for c in par.contents)

        pat = "|".join([f"(<strong>|<em>|<u>)?{v}:? ?(</strong>|</em>|</u>)?:? ?"
                        for v in LISTTYPE_VERBS])
        match = re.search(pat, par_html, re.IGNORECASE)
        if match is None:
            return None, None

        title = match.group()
        # There has to be real content besides the title keyword.
        if len(par.get_text()) > len(remove_html(title)) + 2:
            return title, par_html[match.end():]
        return None, None

    def get_unformatted_lists(self, note, parsed=None):
        """retrieve lists without <ul></ul> tags. We have two options:
        1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:")
        2) single element single line lists"""
        parsed = parsed if parsed is not None else self.parse(note.content)
        toplevel_paragraphs = parsed.find_all("p", recursive=False)

        res = []
        for i, par in enumerate(toplevel_paragraphs):
            if is_title(par):
                # this extracts the lists that have a title and are not on a single line
                items = trim_till_newline(list(toplevel_paragraphs)[i+1:])
                if len(items) == 0:
                    continue
                # The list runs from the start of the title paragraph to the end of the last item.
                list_span = Span.from_data(startIdx=get_span(par, parsed).startIdx,
                                           endIdx=get_span(items[-1], parsed).endIdx)
                res.append(INoteList.from_data(note=note, span=list_span,
                                               title=str(par.contents[0]),
                                               content="".join(str(x) for x in items),
                                               itemSpan=[get_span(x, parsed) for x in items]))
            else:
                title, html_content = self.get_single_line_list(par)
                if title is not None:
                    span = get_span(par, parsed)
                    # The single item starts right after the title inside the paragraph span.
                    item_spans = [Span.from_data(startIdx=span.startIdx + len(str(title)),
                                                 endIdx=span.endIdx)]
                    res.append(INoteList.from_data(note=note, title=title,
                                                   content=str(html_content),
                                                   itemSpan=item_spans, span=span))
        return res
\ No newline at end of file
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.util.ipynb (unless otherwise specified).
__all__ = ['get_toplevel_elements', 'remove_html', 'remove_prefix_chars', 'is_newline', 'is_newline_div',
'is_newline_paragraph', 'div_is_unstructured_list_title', 'trim_till_newline', 'find_till_double_br',
'get_children', 'contains', 'load_spacy_model', 'HTML_LINEBREAK_REGEX']
__all__ = ['remove_html', 'remove_prefix_chars', 'is_newline', 'is_newline_par', 'is_newline', 'is_title',
'trim_till_newline', 'contains', 'load_spacy_model', 'HTML_LINEBREAK_REGEX']
# Cell
import re, bs4, spacy
......@@ -10,19 +9,6 @@ import re, bs4, spacy
# Cell
HTML_LINEBREAK_REGEX = "<br[^<]*/>"
def get_toplevel_elements(str_, element, parsed=None):
if parsed is None:
parsed = bs4.BeautifulSoup(str_, "html.parser")
skip, result = [], []
for l in parsed(element):
if str(l) not in skip:
result.append(l)
skip += [str(l_nested) for l_nested in l(element)]
return result
def remove_html(str_):
return re.sub('<[^<]+?>', '', str_)
......@@ -36,24 +22,24 @@ def is_newline(str_):
if res.group() == str_: return True
else: return False
def is_newline_div(div):
c = div.contents
if is_newline_paragraph(div): return True
def is_newline_par(par):
c = par.contents
if is_newline(par): return True
elif len(c) == 0: return False
elif len(c) == 1 and is_newline(str(c[0])): return True
else: return False
def is_newline_paragraph(p):
def is_newline(p):
return str(p) == "<p></p>"
def div_is_unstructured_list_title(div):
def is_title(par):
p = "read"
title_regex = f"(?<!<li>){p}|(?<!<li>)buy"
match = re.search(title_regex, str(div), re.IGNORECASE)
match = re.search(title_regex, str(par), re.IGNORECASE)
if match is None: return False
cleaned_div = remove_html(str(div))
cleaned_div = remove_html(str(par))
cleaned_title = remove_html(match.group()) if match is not None else None
# the title should be the bulk of the div
......@@ -68,34 +54,34 @@ def trim_till_newline(pars):
end_idx = i
if i == len(pars)-1:
break
if is_newline_paragraph(par):
if is_newline_par(par):
break
return pars[:end_idx]
def find_till_double_br(divs):
end_idx = 0
for i, s in enumerate(divs):
end_idx = i
if i == len(divs)-1:
break
if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
# print("FOUND ", successors[:i])
break
return divs[:end_idx]
def get_children(elem):
"""Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
res = []
inc_str = ""
for c in elem.children:
c = str(c)
inc_str += c
if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
res.append(inc_str)
inc_str = ""
return res
# def find_till_double_br(divs):
# end_idx = 0
# for i, s in enumerate(divs):
# end_idx = i
# if i == len(divs)-1:
# break
# if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
# # print("FOUND ", successors[:i])
# break
# return divs[:end_idx]
# def get_children(elem):
# """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
# res = []
# inc_str = ""
# for c in elem.children:
# c = str(c)
# inc_str += c
# if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
# res.append(inc_str)
# inc_str = ""
# return res
def contains(str_, pat):
'''case insensitive match'''
......
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
# default_exp indexers.notelist.notelist
```
%% Cell type:code id: tags:
``` python
# export
from integrators.data.schema import *
from integrators.indexers.notelist.util import *
import bs4
```
%% Cell type:code id: tags:
``` python
# export
TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN = "todo","towatch", "toread", "tolisten", "tobuy", "unknown"
LIST_CLASSES = [TODO, TOWATCH, TOREAD, TOLISTEN, TOBUY, UNKOWN]
```
%% Cell type:markdown id: tags:
# Note
A note is an HTML-formatted piece of text.
%% Cell type:code id: tags:
``` python
# export
class INote(Note):
    """Note with a compact `repr` and a `show` helper that prints the full content."""

    def __repr__(self):
        """Return the uid plus the content, shortened to 50 characters when it is longer."""
        limit = 50
        content = self.content
        # Only mark the text as shortened when something was actually cut off.
        if content is not None and len(content) > limit:
            content = content[:limit] + " ..."
        return f"INote (#{self.uid}) {content}"

    def show(self):
        """Print the uid together with the complete, untruncated content."""
        print(f"INote (#{self.uid}) {self.content}")
```
%% Cell type:markdown id: tags:
# NoteLists
A notelist object denotes a list contained in a written html note.
%% Cell type:code id: tags:
``` python
# export
class INoteList(NoteList):
    """A list found inside a written HTML note."""

    def __str__(self):
        return str(self.content)

    def infer_cat_from_title(self):
        """Return the category whose keyword occurs in the title, or None.

        Keywords are tried in a fixed order and the first hit wins.
        """
        if self.title is None:
            return None
        keyword_to_category = (("do", TODO), ("read", TOREAD), ("watch", TOWATCH),
                               ("listen", TOLISTEN), ("buy", TOBUY))
        for keyword, category in keyword_to_category:
            if contains(self.title, keyword):
                return category
        return None

    def get_items(self, remove_html_=False, skip_nested=True):
        """Return the slice of the note content for every item span.

        `remove_html_` and `skip_nested` are accepted but not used here.
        Returns an empty list when no item spans are set.
        """
        if self.itemSpan is None:
            return []
        note_content = str(self.note.content)
        return [note_content[s.startIdx:s.endIdx] for s in self.itemSpan]

    def __repr__(self):
        cat_str = f"({self.category})" if self.category is not None else ""
        title = remove_html(self.title) if self.title is not None else "Untitled"
        return f"(INoteList) # {title} {cat_str}\n{self.content}\n\n"

    def __eq__(self, other):
        """Two lists are equal when their uids are equal."""
        # Let Python fall back to its default comparison for unrelated objects
        # (e.g. None) instead of raising AttributeError.
        if not hasattr(other, "uid"):
            return NotImplemented
        return self.uid == other.uid
```
%% Cell type:markdown id: tags:
## ULNoteList
A ULNoteList is the most vanilla kind of list. It is a list of items encapsulated by \<ul> \</ul> tags.
%% Cell type:code id: tags:
``` python
# export
class ULNoteList(INoteList):
    '''A <ul> </ul> list extracted from a note. '''

    def get_items(self, remove_html_=False, skip_nested=False):
        """Return the direct <li> children of the list as strings.

        Items that contain a nested <ul> are left out. With `remove_html_`
        the tags are stripped and items that end up empty are dropped.
        `skip_nested` is accepted but not used. When there is no content the
        plain text content is returned as the single item; when the content
        holds no <ul>/<ol> element the result is empty.
        """
        if self.content is None:
            return [self.textContent]

        # The content may be rooted at <ol> as well as <ul>; take whichever comes first.
        root = bs4.BeautifulSoup(self.content, "html.parser").find(["ul", "ol"])
        if root is None:
            return []

        items = [li for li in root.find_all("li", recursive=False) if len(li("ul")) == 0]
        rendered = [remove_html(str(li)) if remove_html_ else str(li) for li in items]
        return [text for text in rendered if text != ""]

    def __repr__(self):
        items = "\n".join(self.get_items(remove_html_=True))
        cat_str = f"({self.category})" if self.category is not None else ""
        title = remove_html(self.title) if self.title is not None else "Untitled"
        return f"ULNoteList # {title} {cat_str}\n{items}\n\n"
```
%% Cell type:code id: tags:
``` python
ULNoteList.from_data(title="Awesome title", content="Awesome content")
ULNoteList.from_data(title="Awesome title", content="<ul>Awesome content</ul>")
```
%% Output
# Awesome title
ULNoteList # Awesome title
%% Cell type:markdown id: tags:
## Span
We use spans to specify a range within a piece of text. For instance, given the text "Memri solves all your problems", a span with startIdx=6 and endIdx=16 points to "solves all" (the end index is exclusive).
%% Cell type:code id: tags:
``` python
# export
class ISpan(Span):
    '''A span of an element in a piece of text'''

    def __eq__(self, other):
        """Spans are equal when both their start and end index are equal."""
        # Unrelated objects (e.g. None) are "not equal" rather than an AttributeError.
        if not (hasattr(other, "startIdx") and hasattr(other, "endIdx")):
            return NotImplemented
        return self.startIdx == other.startIdx and self.endIdx == other.endIdx

    def __repr__(self):
        return f"ISpan [{self.startIdx}, {self.endIdx}]"
```
%% Cell type:code id: tags:
``` python
# export
def get_span(elem, parsed):
    """Locate `elem` inside `parsed` and return the matching ISpan.

    Both arguments are converted with `str`; the span covers the first
    occurrence of the element string in the document string. When the element
    string does not occur, `str.find` yields -1, so the span starts at -1.
    """
    needle, haystack = str(elem), str(parsed)
    start = haystack.find(needle)
    return ISpan.from_data(startIdx=start, endIdx=start + len(needle))
```
%% Cell type:markdown id: tags:
# Export -
%% Cell type:code id: tags:
``` python
# hide
from nbdev.export import *
notebook2script()
```
%% Output
Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceRecognitionIndexer.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
%% Cell type:code id: tags:
``` python
```
......
This diff is collapsed.
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
# default_exp indexers.notelist.util
```
%% Cell type:code id: tags:
``` python
# export
import re, bs4, spacy
```
%% Cell type:markdown id: tags:
# Util
%% Cell type:code id: tags:
``` python
# export
HTML_LINEBREAK_REGEX = "<br[^<]*/>"
def get_toplevel_elements(str_, element, parsed=None):
    """Return the outermost `element` tags, skipping those nested in another one.

    `str_` is parsed with the "html.parser" backend unless an already parsed
    tree is passed as `parsed`, in which case `str_` is not used.
    """
    if parsed is None:
        parsed = bs4.BeautifulSoup(str_, "html.parser")

    # Nested nodes are remembered by identity, not by their markup, so that a
    # later top-level element with identical markup is not mistaken for one.
    nested_ids, result = set(), []
    for node in parsed(element):
        if id(node) in nested_ids:
            continue
        result.append(node)
        nested_ids.update(id(inner) for inner in node(element))
    return result
def remove_html(str_):
    """Return `str_` with everything that looks like an HTML tag removed."""
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', str_)
def remove_prefix_chars(s, chars):
    """Strip leading characters of `s` for as long as they are contained in `chars`.

    Returns an empty string when `s` is empty or consists solely of such
    characters.
    """
    start = 0
    # Bounds check first: the string may run out before a non-prefix character is found.
    while start < len(s) and s[start] in chars:
        start += 1
    return s[start:]
def is_newline(str_):
    """Return True when the first HTML_LINEBREAK_REGEX match is the whole of `str_`.

    The search is case-insensitive.
    """
    found = re.search(HTML_LINEBREAK_REGEX, str_, re.IGNORECASE)
    return found is not None and found.group() == str_
def is_newline_paragraph(p):
    """Return True when `p` renders as the empty paragraph ``<p></p>``."""
    return str(p) == "<p></p>"


def is_newline_par(par):
    """Return True when `par` is an empty paragraph or holds only a line break.

    "Only a line break" means exactly one child whose string form is
    accepted by the string-based `is_newline`.
    """
    c = par.contents
    if is_newline_paragraph(par):
        return True
    # The empty-paragraph test must not be named `is_newline`: that would
    # replace the string-based line-break check used on the single child here.
    return len(c) == 1 and is_newline(str(c[0]))


# Former name, kept so existing callers keep working.
is_newline_div = is_newline_par
def is_title(par):
    """Return True when `par` is essentially just a "read" or "buy" title.

    The keyword is searched case-insensitively in the element's markup and
    must not directly follow an ``<li>`` tag. Once tags are removed, the
    element may hold at most one character more than the keyword.
    """
    html = str(par)
    match = re.search("(?<!<li>)read|(?<!<li>)buy", html, re.IGNORECASE)
    if match is None:
        return False
    # the title should be the bulk of the element
    return len(remove_html(match.group())) > len(remove_html(html)) - 2


# Former name, kept so existing callers keep working.
div_is_unstructured_list_title = is_title
def trim_till_newline(pars):
    """Return the leading paragraphs of `pars` up to the first newline paragraph.

    The newline paragraph itself is excluded. When there is none, every
    paragraph is returned, including the last one.
    """
    for i, par in enumerate(pars):
        if is_newline_par(par):
            return pars[:i]
    # No terminating blank paragraph: the run extends to the end of the input.
    return pars[:]
def find_till_double_br(divs):
    """Return the leading elements of `divs` up to the first two consecutive line breaks.

    Both line breaks are excluded. When no such pair exists, every element is
    returned, including the last one.
    """
    for i in range(len(divs) - 1):
        if is_newline(str(divs[i])) and is_newline(str(divs[i + 1])):
            return divs[:i]
    # No double line break: the run extends to the end of the input.
    return divs[:]
def get_children(elem):
    """Fetch the children of an element as strings, merging style elements with what follows.

    A child starting with <strong>, <em> or <u> is glued to the front of the
    next child. Style elements with nothing after them are emitted as a final
    entry instead of being dropped.
    """
    style_prefixes = ("<strong>", "<em>", "<u>")
    res, pending = [], []
    for c in elem.children:
        c = str(c)
        pending.append(c)
        if not c.startswith(style_prefixes):
            res.append("".join(pending))
            pending = []
    if pending:
        # Trailing style element(s) without a following sibling.
        res.append("".join(pending))
    return res
# def find_till_double_br(divs):
# end_idx = 0
# for i, s in enumerate(divs):
# end_idx = i
# if i == len(divs)-1:
# break
# if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
# # print("FOUND ", successors[:i])
# break
# return divs[:end_idx]
# def get_children(elem):
# """Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
# res = []
# inc_str = ""
# for c in elem.children:
# c = str(c)
# inc_str += c
# if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
# res.append(inc_str)
# inc_str = ""
# return res
def contains(str_, pat):
    '''Tell whether the regex `pat` is found anywhere in `str_`, ignoring case.'''
    matcher = re.compile(pat, re.IGNORECASE)
    # A match object is always truthy, None is falsy.
    return bool(matcher.search(str_))
def load_spacy_model(m):
    """Load the spaCy model `m`, downloading it first when loading raises OSError."""
    try:
        return spacy.load(m)
    except OSError:
        print("Downloading language model for spaCy, this will only happen once")
        from spacy.cli import download
        download(m)
    # Second attempt, after the download has finished.
    return spacy.load(m)
```
%% Cell type:markdown id: tags:
# Export -
%% Cell type:code id: tags:
``` python
# hide
from nbdev.export import *
notebook2script()
```
%% Output
Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceRecognitionIndexer.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment