Skip to content
GitLab
Explore
Projects
Groups
Snippets
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Dima Gerasimov
pyIntegrators
Commits
06edc574
Commit
06edc574
authored
4 years ago
by
Koen van der Veen
Browse files
Options
Download
Email Patches
Plain Diff
refactor notelistindexer
parent
b7138fd9
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
integrators/_nbdev.py
+4
-6
integrators/_nbdev.py
integrators/indexers/notelist/notelist.py
+5
-3
integrators/indexers/notelist/notelist.py
integrators/indexers/notelist/parser.py
+61
-66
integrators/indexers/notelist/parser.py
integrators/indexers/notelist/util.py
+34
-48
integrators/indexers/notelist/util.py
nbs/indexers.NoteListIndexer.NoteList.ipynb
+8
-6
nbs/indexers.NoteListIndexer.NoteList.ipynb
nbs/indexers.NoteListIndexer.Parser.ipynb
+254
-259
nbs/indexers.NoteListIndexer.Parser.ipynb
nbs/indexers.NoteListIndexer.util.ipynb
+29
-42
nbs/indexers.NoteListIndexer.util.ipynb
with
395 additions
and
430 deletions
+395
-430
integrators/_nbdev.py
+
4
-
6
View file @
06edc574
...
...
@@ -31,18 +31,16 @@ index = {"read_file": "basic.ipynb",
"ISpan"
:
"indexers.NoteListIndexer.NoteList.ipynb"
,
"get_span"
:
"indexers.NoteListIndexer.NoteList.ipynb"
,
"HTMLListParser"
:
"indexers.NoteListIndexer.Parser.ipynb"
,
"LISTTYPE_VERBS"
:
"indexers.NoteListIndexer.Parser.ipynb"
,
"LIST_PREFIXES"
:
"indexers.NoteListIndexer.Parser.ipynb"
,
"NotesListIndexer"
:
"indexers.NoteListIndexer.ipynb"
,
"ListTypePredictor"
:
"indexers.NoteListIndexer.ipynb"
,
"get_toplevel_elements"
:
"indexers.NoteListIndexer.util.ipynb"
,
"remove_html"
:
"indexers.NoteListIndexer.util.ipynb"
,
"remove_prefix_chars"
:
"indexers.NoteListIndexer.util.ipynb"
,
"is_newline"
:
"indexers.NoteListIndexer.util.ipynb"
,
"is_newline_div"
:
"indexers.NoteListIndexer.util.ipynb"
,
"is_newline_paragraph"
:
"indexers.NoteListIndexer.util.ipynb"
,
"div_is_unstructured_list_title"
:
"indexers.NoteListIndexer.util.ipynb"
,
"is_newline_par"
:
"indexers.NoteListIndexer.util.ipynb"
,
"is_title"
:
"indexers.NoteListIndexer.util.ipynb"
,
"trim_till_newline"
:
"indexers.NoteListIndexer.util.ipynb"
,
"find_till_double_br"
:
"indexers.NoteListIndexer.util.ipynb"
,
"get_children"
:
"indexers.NoteListIndexer.util.ipynb"
,
"contains"
:
"indexers.NoteListIndexer.util.ipynb"
,
"load_spacy_model"
:
"indexers.NoteListIndexer.util.ipynb"
,
"HTML_LINEBREAK_REGEX"
:
"indexers.NoteListIndexer.util.ipynb"
,
...
...
This diff is collapsed.
Click to expand it.
integrators/indexers/notelist/notelist.py
+
5
-
3
View file @
06edc574
...
...
@@ -5,6 +5,7 @@ __all__ = ['LIST_CLASSES', 'INote', 'INoteList', 'ULNoteList', 'ISpan', 'get_spa
# Cell
from
...data.schema
import
*
from
.util
import
*
import
bs4
# Cell
TODO
,
TOWATCH
,
TOREAD
,
TOLISTEN
,
TOBUY
,
UNKOWN
=
"todo"
,
"towatch"
,
"toread"
,
"tolisten"
,
"tobuy"
,
"unknown"
...
...
@@ -59,9 +60,10 @@ class ULNoteList(INoteList):
def
get_items
(
self
,
remove_html_
=
False
,
skip_nested
=
False
):
if
self
.
content
is
None
:
return
[
self
.
textContent
]
result
=
[
i
for
i
in
get_toplevel_elements
(
str
(
self
.
content
),
"li"
)
if
len
(
i
(
"ul"
))
==
0
]
parsed
=
bs4
.
BeautifulSoup
(
self
.
content
,
"html.parser"
).
ul
result
=
[
x
for
x
in
parsed
.
find_all
(
"li"
,
recursive
=
False
)
if
len
(
x
(
"ul"
))
==
0
]
# result = [i for i in get_toplevel_elements(str(self.content), "li")
# if len(i("ul")) == 0]
if
remove_html_
:
result
=
[
remove_html
(
str
(
x
))
for
x
in
result
]
result
=
[
str
(
x
)
for
x
in
result
if
x
!=
""
]
...
...
This diff is collapsed.
Click to expand it.
integrators/indexers/notelist/parser.py
+
61
-
66
View file @
06edc574
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.Parser.ipynb (unless otherwise specified).
__all__
=
[
'HTMLListParser'
]
__all__
=
[
'HTMLListParser'
,
'LISTTYPE_VERBS'
,
'LIST_PREFIXES'
]
# Cell
import
bs4
...
...
@@ -12,94 +12,89 @@ from ...data.basic import *
from
...imports
import
*
# Cell
LISTTYPE_VERBS
=
[
"do"
,
"read"
,
"watch"
,
"buy"
,
"listen"
]
LIST_PREFIXES
=
[
"to"
,
"to-"
,
"to "
,
""
]
class
HTMLListParser
():
'''Extracts lists from HTML data, generated by an HTML text editor like evernote'''
def
__init__
(
self
):
self
.
single_item_list_patterns
=
[
p
+
v
for
v
in
LISTTYPE_VERBS
for
p
in
LIST_PREFIXES
]
self
.
one_line_list_pa
=
[
"buy"
,
"read"
,
"watch"
]
words
=
[
"do"
,
"read"
,
"watch"
,
"buy"
,
"listen"
]
prefixes
=
[
"to"
,
"to-"
,
"to "
,
""
]
self
.
single_item_list_patterns
=
[
prefix
+
word
for
word
in
words
for
prefix
in
prefixes
]
def
get_html_lists
(
self
,
note
,
parsed
):
html_lists
=
parsed
.
find_all
(
"ul"
,
recursive
=
False
)
+
parsed
.
find_all
(
"ol"
,
recursive
=
False
)
return
[
ULNoteList
.
from_data
(
title
=
None
,
content
=
str
(
x
),
textContent
=
x
.
get_text
(),
note
=
note
,
span
=
get_span
(
x
,
parsed
))
for
x
in
html_lists
]
def
get_lists
(
self
,
note
):
"""Extracts lists from a note"""
text
=
note
.
content
parsed
=
bs4
.
BeautifulSoup
(
text
,
'html.parser'
)
parsed
=
bs4
.
BeautifulSoup
(
note
.
content
,
'html.parser'
)
note
.
content
=
str
(
parsed
)
uls
=
get_toplevel_elements
(
text
,
"ul"
,
parsed
=
parsed
)
ols
=
get_toplevel_elements
(
text
,
"ol"
,
parsed
=
parsed
)
html_lists
=
[
ULNoteList
.
from_data
(
title
=
None
,
content
=
str
(
x
),
textContent
=
remove_html
(
str
(
x
)),
note
=
note
,
span
=
get_span
(
note
,
x
,
parsed
))
for
x
in
uls
+
ols
]
unformatted_lists
=
self
.
get_unformatted_lists
(
note
,
text
,
parsed
)
all_lists
=
html_lists
+
unformatted_lists
for
l
in
all_lists
:
note
.
add_edge
(
"noteList"
,
l
)
all_lists
=
self
.
get_html_lists
(
note
,
parsed
)
+
\
self
.
get_unformatted_lists
(
note
,
parsed
)
for
l
in
all_lists
:
note
.
add_edge
(
"noteList"
,
l
)
return
all_lists
def
get_single_line_list
(
self
,
elem
):
def
parse
(
self
,
x
,
tag
=
None
):
if
isinstance
(
x
,
bs4
.
BeautifulSoup
):
return
x
.
find
(
tag
)
if
tag
is
not
None
else
x
elif
isinstance
(
x
,
bs4
.
element
.
Tag
):
return
x
else
:
res
=
bs4
.
BeautifulSoup
(
x
,
'html.parser'
)
return
res
.
find
(
tag
)
if
tag
is
not
None
else
res
def
get_single_line_list
(
self
,
par
):
"""Get single list lists. An example could be: '<strong>read</strong>: great book title'"""
ps
=
[
"read"
,
"buy"
,
"watch"
,
"do"
]
pat
=
"|"
.
join
([
f
"(<strong>|<em>|<u>)?
{
p
}
(</strong>|</em>|</u>)?:? ?"
for
p
in
ps
])
match
=
re
.
search
(
pat
,
str
(
elem
),
re
.
IGNORECASE
)
if
match
is
None
:
return
None
,
None
par
=
self
.
parse
(
par
,
"p"
)
par_html
=
""
.
join
(
mapped
(
str
,
par
.
contents
))
cleaned_elem
=
remove_html
(
str
(
elem
))
pat
=
"|"
.
join
([
f
"(<strong>|<em>|<u>)?
{
v
}
:? ?(</strong>|</em>|</u>)?:? ?"
for
v
in
LISTTYPE_VERBS
])
match
=
re
.
search
(
pat
,
par_html
,
re
.
IGNORECASE
)
if
match
is
None
:
return
None
,
None
cleaned_title
=
remov
e_html
(
match
.
group
()
)
if
match
is
not
None
else
None
titl
e_html
=
match
.
group
()
if
match
is
not
None
else
None
if
len
(
cleaned_elem
)
>
len
(
cleaned_title
)
+
2
:
if
len
(
par
.
get_text
())
>
len
(
remove_html
(
title_html
)
)
+
2
:
title
=
match
.
group
()
content
=
str
(
elem
)[
len
(
title
):]
content
=
par_html
[
par_html
.
index
(
title
)
+
len
(
title
):]
return
title
,
content
else
:
return
None
,
None
def
get_unformatted_lists
(
self
,
note
,
txt
,
parsed
):
def
get_unformatted_lists
(
self
,
note
,
parsed
):
"""retrieve lists without <ul></ul> tags. We have two options:
1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:")
2) single element single line lists"""
toplevel_div
=
get_toplevel_elements
(
txt
,
"div"
)[
0
]
ls
=
[]
for
elem
in
toplevel_div
.
children
:
if
elem
.
name
==
"div"
and
not
is_newline_div
(
elem
):
children
=
get_children
(
elem
)
for
i
,
child
in
enumerate
(
children
):
# this extracts the lists that have a title and are not on a single line
if
div_is_unstructured_list_title
(
child
):
print
(
child
)
title
=
child
successors
=
list
(
children
)[
i
+
1
:]
if
len
(
successors
)
==
0
:
continue
items
=
[
x
for
x
in
find_till_double_br
(
successors
)
if
not
is_newline
(
str
(
x
))]
items_str
=
[
str
(
x
)
for
x
in
items
]
items_span
=
[
get_span
(
note
,
x
,
parsed
)
for
x
in
items_str
]
span1
=
get_span
(
note
,
title
,
parsed
)
span2
=
get_span
(
note
,
items
[
-
1
],
parsed
)
span
=
Span
.
from_data
(
startIdx
=
span1
.
startIdx
,
endIdx
=
span2
.
endIdx
)
html_content
=
""
.
join
(
items_str
)
l
=
INoteList
.
from_data
(
note
=
note
,
title
=
title
,
content
=
str
(
html_content
),
itemSpan
=
items_span
,
span
=
span
)
ls
.
append
(
l
)
else
:
title
,
html_content
=
self
.
get_single_line_list
(
child
)
if
title
is
not
None
:
span
=
get_span
(
note
,
child
,
parsed
)
itemSpan
=
[
Span
.
from_data
(
startIdx
=
span
.
startIdx
+
len
(
str
(
title
)),
endIdx
=
span
.
endIdx
)]
l
=
INoteList
.
from_data
(
note
=
note
,
title
=
title
,
content
=
str
(
html_content
),
itemSpan
=
itemSpan
,
span
=
span
)
ls
.
append
(
l
)
return
ls
\ No newline at end of file
parsed
=
parsed
if
parsed
is
not
None
else
self
.
parse
(
note
.
content
)
toplevel_paragraphs
=
parsed
.
find_all
(
"p"
,
recursive
=
False
)
res
=
[]
for
i
,
par
in
enumerate
(
toplevel_paragraphs
):
if
is_title
(
par
):
# this extracts the lists that have a title and are not on a single line
items
=
trim_till_newline
(
list
(
toplevel_paragraphs
)[
i
+
1
:])
if
len
(
items
)
==
0
:
continue
list_span
=
Span
.
from_data
(
startIdx
=
get_span
(
title
,
parsed
).
startIdx
,
endIdx
=
get_span
(
items
[
-
1
],
parsed
).
endIdx
)
l
=
INoteList
.
from_data
(
note
=
note
,
span
=
list_span
,
title
=
str
(
par
.
contents
[
0
]),
content
=
""
.
join
(
mapped
(
str
,
items
)),
itemSpan
=
[
get_span
(
x
,
parsed
)
for
x
in
items
])
res
.
append
(
l
)
else
:
title
,
html_content
=
self
.
get_single_line_list
(
par
)
if
title
is
not
None
:
span
=
get_span
(
str
(
par
),
parsed
)
itemSpans
=
[
Span
.
from_data
(
startIdx
=
span
.
startIdx
+
len
(
str
(
title
)),
endIdx
=
span
.
endIdx
)]
l
=
INoteList
.
from_data
(
note
=
note
,
title
=
title
,
content
=
str
(
html_content
),
itemSpan
=
itemSpans
,
span
=
get_span
(
par
,
parsed
))
res
.
append
(
l
)
return
res
\ No newline at end of file
This diff is collapsed.
Click to expand it.
integrators/indexers/notelist/util.py
+
34
-
48
View file @
06edc574
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/indexers.NoteListIndexer.util.ipynb (unless otherwise specified).
__all__
=
[
'get_toplevel_elements'
,
'remove_html'
,
'remove_prefix_chars'
,
'is_newline'
,
'is_newline_div'
,
'is_newline_paragraph'
,
'div_is_unstructured_list_title'
,
'trim_till_newline'
,
'find_till_double_br'
,
'get_children'
,
'contains'
,
'load_spacy_model'
,
'HTML_LINEBREAK_REGEX'
]
__all__
=
[
'remove_html'
,
'remove_prefix_chars'
,
'is_newline'
,
'is_newline_par'
,
'is_newline'
,
'is_title'
,
'trim_till_newline'
,
'contains'
,
'load_spacy_model'
,
'HTML_LINEBREAK_REGEX'
]
# Cell
import
re
,
bs4
,
spacy
...
...
@@ -10,19 +9,6 @@ import re, bs4, spacy
# Cell
HTML_LINEBREAK_REGEX
=
"<br[^<]*/>"
def
get_toplevel_elements
(
str_
,
element
,
parsed
=
None
):
if
parsed
is
None
:
parsed
=
bs4
.
BeautifulSoup
(
str_
,
"html.parser"
)
skip
,
result
=
[],
[]
for
l
in
parsed
(
element
):
if
str
(
l
)
not
in
skip
:
result
.
append
(
l
)
skip
+=
[
str
(
l_nested
)
for
l_nested
in
l
(
element
)]
return
result
def
remove_html
(
str_
):
return
re
.
sub
(
'<[^<]+?>'
,
''
,
str_
)
...
...
@@ -36,24 +22,24 @@ def is_newline(str_):
if
res
.
group
()
==
str_
:
return
True
else
:
return
False
def
is_newline_
div
(
div
):
c
=
div
.
contents
if
is_newline
_
par
agraph
(
div
):
return
True
def
is_newline_
par
(
par
):
c
=
par
.
contents
if
is_newline
(
par
):
return
True
elif
len
(
c
)
==
0
:
return
False
elif
len
(
c
)
==
1
and
is_newline
(
str
(
c
[
0
])):
return
True
else
:
return
False
def
is_newline
_paragraph
(
p
):
def
is_newline
(
p
):
return
str
(
p
)
==
"<p></p>"
def
div_is_unstructured_list
_title
(
div
):
def
is
_title
(
par
):
p
=
"read"
title_regex
=
f
"(?<!<li>)
{
p
}
|(?<!<li>)buy"
match
=
re
.
search
(
title_regex
,
str
(
div
),
re
.
IGNORECASE
)
match
=
re
.
search
(
title_regex
,
str
(
par
),
re
.
IGNORECASE
)
if
match
is
None
:
return
False
cleaned_div
=
remove_html
(
str
(
div
))
cleaned_div
=
remove_html
(
str
(
par
))
cleaned_title
=
remove_html
(
match
.
group
())
if
match
is
not
None
else
None
# the title should be the bulk of the div
...
...
@@ -68,34 +54,34 @@ def trim_till_newline(pars):
end_idx
=
i
if
i
==
len
(
pars
)
-
1
:
break
if
is_newline_par
agraph
(
par
):
if
is_newline_par
(
par
):
break
return
pars
[:
end_idx
]
def
find_till_double_br
(
divs
):
end_idx
=
0
for
i
,
s
in
enumerate
(
divs
):
end_idx
=
i
if
i
==
len
(
divs
)
-
1
:
break
if
is_newline
(
str
(
divs
[
i
]))
and
is_newline
(
str
(
divs
[
i
+
1
])):
# print("FOUND ", successors[:i])
break
return
divs
[:
end_idx
]
def
get_children
(
elem
):
"""Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
res
=
[]
inc_str
=
""
for
c
in
elem
.
children
:
c
=
str
(
c
)
inc_str
+=
c
if
not
(
c
.
startswith
(
"<strong>"
)
|
c
.
startswith
(
"<em>"
)
|
c
.
startswith
(
"<u>"
)):
res
.
append
(
inc_str
)
inc_str
=
""
return
res
#
def find_till_double_br(divs):
#
end_idx = 0
#
for i, s in enumerate(divs):
#
end_idx = i
#
if i == len(divs)-1:
#
break
#
if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
#
# print("FOUND ", successors[:i])
#
break
#
return divs[:end_idx]
#
def get_children(elem):
#
"""Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
#
res = []
#
inc_str = ""
#
for c in elem.children:
#
c = str(c)
#
inc_str += c
#
if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
#
res.append(inc_str)
#
inc_str = ""
#
return res
def
contains
(
str_
,
pat
):
'''case insensitive match'''
...
...
This diff is collapsed.
Click to expand it.
nbs/indexers.NoteListIndexer.NoteList.ipynb
+
8
-
6
View file @
06edc574
...
...
@@ -19,7 +19,8 @@
"source": [
"# export\n",
"from integrators.data.schema import *\n",
"from integrators.indexers.notelist.util import *"
"from integrators.indexers.notelist.util import *\n",
"import bs4"
]
},
{
...
...
@@ -126,9 +127,10 @@
" def get_items(self, remove_html_=False, skip_nested=False):\n",
" \n",
" if self.content is None: return [self.textContent]\n",
"\n",
" result = [i for i in get_toplevel_elements(str(self.content), \"li\")\n",
" if len(i(\"ul\")) == 0]\n",
" parsed = bs4.BeautifulSoup(self.content, \"html.parser\").ul\n",
" result = [x for x in parsed.find_all(\"li\", recursive=False) if len(x(\"ul\")) == 0]\n",
"# result = [i for i in get_toplevel_elements(str(self.content), \"li\")\n",
"# if len(i(\"ul\")) == 0]\n",
" \n",
" if remove_html_: result = [remove_html(str(x)) for x in result]\n",
" result = [str(x) for x in result if x != \"\"]\n",
...
...
@@ -149,7 +151,7 @@
{
"data": {
"text/plain": [
"# Awesome title \n",
"
ULNoteList
# Awesome title \n",
"\n"
]
},
...
...
@@ -159,7 +161,7 @@
}
],
"source": [
"ULNoteList.from_data(title=\"Awesome title\", content=\"Awesome content\")"
"ULNoteList.from_data(title=\"Awesome title\", content=\"
<ul>
Awesome content
</ul>
\")"
]
},
{
...
...
%% Cell type:code id: tags:
```
python
%
load_ext
autoreload
%
autoreload
2
# default_exp indexers.notelist.notelist
```
%% Cell type:code id: tags:
```
python
# export
from
integrators.data.schema
import
*
from
integrators.indexers.notelist.util
import
*
import
bs4
```
%% Cell type:code id: tags:
```
python
# export
TODO
,
TOWATCH
,
TOREAD
,
TOLISTEN
,
TOBUY
,
UNKOWN
=
"todo"
,
"towatch"
,
"toread"
,
"tolisten"
,
"tobuy"
,
"unknown"
LIST_CLASSES
=
[
TODO
,
TOWATCH
,
TOREAD
,
TOLISTEN
,
TOBUY
,
UNKOWN
]
```
%% Cell type:markdown id: tags:
# Note
A note is an HTML-formatted piece of text.
%% Cell type:code id: tags:
```
python
# export
class
INote
(
Note
):
def
__repr__
(
self
):
content
=
self
.
content
[:
50
]
+
" ..."
if
len
(
self
.
content
)
>
20
else
self
.
content
return
f
"INote (#
{
self
.
uid
}
)
{
content
}
"
def
show
(
self
):
print
(
f
"INote (#
{
self
.
uid
}
)
{
self
.
content
}
"
)
```
%% Cell type:markdown id: tags:
# NoteLists
A NoteList object represents a list contained in a written HTML note.
%% Cell type:code id: tags:
```
python
# export
class
INoteList
(
NoteList
):
def
__str__
(
self
):
return
str
(
self
.
content
)
def
infer_cat_from_title
(
self
):
if
self
.
title
is
None
:
return
None
if
contains
(
self
.
title
,
"do"
):
return
TODO
if
contains
(
self
.
title
,
"read"
):
return
TOREAD
if
contains
(
self
.
title
,
"watch"
):
return
TOWATCH
if
contains
(
self
.
title
,
"listen"
):
return
TOLISTEN
if
contains
(
self
.
title
,
"buy"
):
return
TOBUY
else
:
return
None
def
get_items
(
self
,
remove_html_
=
False
,
skip_nested
=
True
):
if
self
.
itemSpan
is
not
None
:
return
[
str
(
self
.
note
.
content
)[
s
.
startIdx
:
s
.
endIdx
]
for
s
in
self
.
itemSpan
]
else
:
return
[]
def
__repr__
(
self
):
cat_str
=
f
"(
{
self
.
category
}
)"
if
self
.
category
is
not
None
else
""
title
=
remove_html
(
self
.
title
)
if
self
.
title
is
not
None
else
"Untitled"
return
f
"(INoteList) #
{
title
}
{
cat_str
}
\n
{
self
.
content
}
\n\n
"
def
__eq__
(
self
,
other
):
return
self
.
uid
==
other
.
uid
```
%% Cell type:markdown id: tags:
## ULNoteList
A ULNoteList is the most vanilla kind of list: a list of items enclosed in \<ul>\</ul> tags.
%% Cell type:code id: tags:
```
python
# export
class
ULNoteList
(
INoteList
):
'''A <ul> </ul> list extracted from a note. '''
def
get_items
(
self
,
remove_html_
=
False
,
skip_nested
=
False
):
if
self
.
content
is
None
:
return
[
self
.
textContent
]
result
=
[
i
for
i
in
get_toplevel_elements
(
str
(
self
.
content
),
"li"
)
if
len
(
i
(
"ul"
))
==
0
]
parsed
=
bs4
.
BeautifulSoup
(
self
.
content
,
"html.parser"
).
ul
result
=
[
x
for
x
in
parsed
.
find_all
(
"li"
,
recursive
=
False
)
if
len
(
x
(
"ul"
))
==
0
]
# result = [i for i in get_toplevel_elements(str(self.content), "li")
# if len(i("ul")) == 0]
if
remove_html_
:
result
=
[
remove_html
(
str
(
x
))
for
x
in
result
]
result
=
[
str
(
x
)
for
x
in
result
if
x
!=
""
]
return
result
def
__repr__
(
self
):
items
=
"
\n
"
.
join
(
self
.
get_items
(
remove_html_
=
True
))
cat_str
=
f
"(
{
self
.
category
}
)"
if
self
.
category
is
not
None
else
""
title
=
remove_html
(
self
.
title
)
if
self
.
title
is
not
None
else
"Untitled"
return
f
"ULNoteList #
{
title
}
{
cat_str
}
\n
{
items
}
\n\n
"
```
%% Cell type:code id: tags:
```
python
ULNoteList
.
from_data
(
title
=
"Awesome title"
,
content
=
"Awesome content"
)
ULNoteList
.
from_data
(
title
=
"Awesome title"
,
content
=
"
<ul>
Awesome content
</ul>
"
)
```
%% Output
# Awesome title
ULNoteList
# Awesome title
%% Cell type:markdown id: tags:
## Span
We use spans to specify a range within a piece of text. For instance, given the text "Memri solves all your problems", a span with startIdx=6 and endIdx=16 points to "solves all".
%% Cell type:code id: tags:
```
python
# export
class
ISpan
(
Span
):
'''A span of an element in a piece of text'''
def
__eq__
(
self
,
other
):
return
self
.
startIdx
==
other
.
startIdx
and
self
.
endIdx
==
other
.
endIdx
def
__repr__
(
self
):
return
f
"ISpan [
{
self
.
startIdx
}
,
{
self
.
endIdx
}
]"
```
%% Cell type:code id: tags:
```
python
# export
def get_span(elem, parsed):
    """Locate the string form of ``elem`` inside the string form of ``parsed``.

    Both arguments are converted with ``str``; the first occurrence of the
    element text in the parsed text gives the start index, and the end index
    is the start plus the length of the element text.

    Returns:
        An ``ISpan`` built with ``startIdx`` and ``endIdx`` set accordingly.
    """
    needle, haystack = str(elem), str(parsed)
    start = haystack.find(needle)
    # str.find yields -1 when the text is absent; that value is passed
    # through unchanged, exactly as the start index of a real match would be.
    return ISpan.from_data(startIdx=start, endIdx=start + len(needle))
```
%% Cell type:markdown id: tags:
# Export -
%% Cell type:code id: tags:
```
python
# hide
from
nbdev.export
import
*
notebook2script
()
```
%% Output
Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceRecognitionIndexer.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
%% Cell type:code id: tags:
```
python
```
...
...
This diff is collapsed.
Click to expand it.
nbs/indexers.NoteListIndexer.Parser.ipynb
+
254
-
259
View file @
06edc574
This diff is collapsed.
Click to expand it.
nbs/indexers.NoteListIndexer.util.ipynb
+
29
-
42
View file @
06edc574
...
...
@@ -37,19 +37,6 @@
"# export\n",
"HTML_LINEBREAK_REGEX = \"<br[^<]*/>\"\n",
"\n",
"def get_toplevel_elements(str_, element, parsed=None):\n",
" if parsed is None:\n",
" parsed = bs4.BeautifulSoup(str_, \"html.parser\")\n",
"\n",
" skip, result = [], []\n",
"\n",
" for l in parsed(element):\n",
" if str(l) not in skip:\n",
" result.append(l)\n",
"\n",
" skip += [str(l_nested) for l_nested in l(element)]\n",
" return result\n",
"\n",
"def remove_html(str_):\n",
" return re.sub('<[^<]+?>', '', str_)\n",
"\n",
...
...
@@ -63,24 +50,24 @@
" if res.group() == str_: return True\n",
" else: return False\n",
"\n",
"def is_newline_
div(div
):\n",
" c =
div
.contents\n",
" if is_newline
_
par
agraph(div
): return True\n",
"def is_newline_
par(par
):\n",
" c =
par
.contents\n",
" if is_newline
(
par): return True\n",
" elif len(c) == 0: return False\n",
" elif len(c) == 1 and is_newline(str(c[0])): return True\n",
" else: return False\n",
" \n",
"def is_newline
_paragraph
(p):\n",
"def is_newline(p):\n",
" return str(p) == \"<p></p>\"\n",
"\n",
"def
div_is_unstructured_list
_title(
div
):\n",
"def
is
_title(
par
):\n",
" p = \"read\"\n",
" title_regex = f\"(?<!<li>){p}|(?<!<li>)buy\"\n",
"\n",
" match = re.search(title_regex, str(
div
), re.IGNORECASE)\n",
" match = re.search(title_regex, str(
par
), re.IGNORECASE)\n",
" if match is None: return False\n",
" \n",
" cleaned_div = remove_html(str(
div
)) \n",
" cleaned_div = remove_html(str(
par
)) \n",
" cleaned_title = remove_html(match.group()) if match is not None else None\n",
"\n",
" # the title should be the bulk of the div\n",
...
...
@@ -95,34 +82,34 @@
" end_idx = i\n",
" if i == len(pars)-1:\n",
" break\n",
" if is_newline_par
agraph
(par):\n",
" if is_newline_par(par):\n",
" break\n",
" return pars[:end_idx]\n",
"\n",
"def find_till_double_br(divs):\n",
" end_idx = 0\n",
" for i, s in enumerate(divs):\n",
" end_idx = i\n",
" if i == len(divs)-1:\n",
" break\n",
" if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):\n",
"# print(\"FOUND \", successors[:i])\n",
" break\n",
" return divs[:end_idx]\n",
"
#
def find_till_double_br(divs):\n",
"
#
end_idx = 0\n",
"
#
for i, s in enumerate(divs):\n",
"
#
end_idx = i\n",
"
#
if i == len(divs)-1:\n",
"
#
break\n",
"
#
if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):\n",
"#
#
print(\"FOUND \", successors[:i])\n",
"
#
break\n",
"
#
return divs[:end_idx]\n",
"\n",
"def get_children(elem):\n",
" \"\"\"Fetches children of an element, put combines children when they are style element like <strong>example</strong>\"\"\"\n",
" res = []\n",
" inc_str = \"\"\n",
"
#
def get_children(elem):\n",
"
#
\"\"\"Fetches children of an element, put combines children when they are style element like <strong>example</strong>\"\"\"\n",
"
#
res = []\n",
"
#
inc_str = \"\"\n",
" \n",
" for c in elem.children:\n",
" c = str(c)\n",
" inc_str += c\n",
"
#
for c in elem.children:\n",
"
#
c = str(c)\n",
"
#
inc_str += c\n",
" \n",
" if not(c.startswith(\"<strong>\") | c.startswith(\"<em>\") | c.startswith(\"<u>\")):\n",
" res.append(inc_str)\n",
" inc_str = \"\"\n",
" return res\n",
"
#
if not(c.startswith(\"<strong>\") | c.startswith(\"<em>\") | c.startswith(\"<u>\")):\n",
"
#
res.append(inc_str)\n",
"
#
inc_str = \"\"\n",
"
#
return res\n",
"\n",
"def contains(str_, pat):\n",
" '''case insensitive match'''\n",
...
...
%% Cell type:code id: tags:
```
python
%
load_ext
autoreload
%
autoreload
2
# default_exp indexers.notelist.util
```
%% Cell type:code id: tags:
```
python
# export
import
re
,
bs4
,
spacy
```
%% Cell type:markdown id: tags:
# Util
%% Cell type:code id: tags:
```
python
# export
HTML_LINEBREAK_REGEX
=
"<br[^<]*/>"
def
get_toplevel_elements
(
str_
,
element
,
parsed
=
None
):
if
parsed
is
None
:
parsed
=
bs4
.
BeautifulSoup
(
str_
,
"html.parser"
)
skip
,
result
=
[],
[]
for
l
in
parsed
(
element
):
if
str
(
l
)
not
in
skip
:
result
.
append
(
l
)
skip
+=
[
str
(
l_nested
)
for
l_nested
in
l
(
element
)]
return
result
def remove_html(str_):
    """Strip HTML-like tags from ``str_``.

    Every run that opens with ``<``, closes at the nearest ``>`` and holds at
    least one character other than ``<`` in between is deleted; all other
    text is returned untouched.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', str_)
def remove_prefix_chars(s, chars):
    """Drop leading items of ``s`` for as long as they are members of ``chars``.

    Args:
        s: The sequence (typically a string) to trim from the left.
        chars: A container of characters to strip; membership is tested
            with ``in``.

    Returns:
        ``s`` without its leading run of characters found in ``chars``.
        An empty input, or an input made up solely of such characters,
        yields an empty result instead of raising ``IndexError``.
    """
    cut = 0
    # Bounds-check before indexing so that an exhausted input stops the scan,
    # and slice once at the end rather than re-copying on every step.
    while cut < len(s) and s[cut] in chars:
        cut += 1
    return s[cut:]
def is_newline(str_):
    """Tell whether ``str_`` is exactly one HTML line-break tag.

    The string is searched, ignoring case, with ``HTML_LINEBREAK_REGEX``.
    The result is ``True`` only when a match exists and the matched text is
    the whole of ``str_``; otherwise it is ``False``.
    """
    # NOTE(review): another ``def is_newline(p)`` appears further down this
    # listing; if both live in the same module the later one replaces this
    # one -- confirm which is intended.
    found = re.search(HTML_LINEBREAK_REGEX, str_, re.IGNORECASE)
    return found is not None and found.group() == str_
def
is_newline_
div
(
div
):
c
=
div
.
contents
if
is_newline
_
par
agraph
(
div
):
return
True
def
is_newline_
par
(
par
):
c
=
par
.
contents
if
is_newline
(
par
):
return
True
elif
len
(
c
)
==
0
:
return
False
elif
len
(
c
)
==
1
and
is_newline
(
str
(
c
[
0
])):
return
True
else
:
return
False
def
is_newline
_paragraph
(
p
):
def
is_newline
(
p
):
return
str
(
p
)
==
"<p></p>"
def
div_is_unstructured_list
_title
(
div
):
def
is
_title
(
par
):
p
=
"read"
title_regex
=
f
"(?<!<li>)
{
p
}
|(?<!<li>)buy"
match
=
re
.
search
(
title_regex
,
str
(
div
),
re
.
IGNORECASE
)
match
=
re
.
search
(
title_regex
,
str
(
par
),
re
.
IGNORECASE
)
if
match
is
None
:
return
False
cleaned_div
=
remove_html
(
str
(
div
))
cleaned_div
=
remove_html
(
str
(
par
))
cleaned_title
=
remove_html
(
match
.
group
())
if
match
is
not
None
else
None
# the title should be the bulk of the div
if
len
(
cleaned_title
)
>
len
(
cleaned_div
)
-
2
:
return
True
else
:
return
False
def
trim_till_newline
(
pars
):
end_idx
=
0
for
i
,
par
in
enumerate
(
pars
):
end_idx
=
i
if
i
==
len
(
pars
)
-
1
:
break
if
is_newline_par
agraph
(
par
):
if
is_newline_par
(
par
):
break
return
pars
[:
end_idx
]
def
find_till_double_br
(
divs
):
end_idx
=
0
for
i
,
s
in
enumerate
(
divs
):
end_idx
=
i
if
i
==
len
(
divs
)
-
1
:
break
if
is_newline
(
str
(
divs
[
i
]))
and
is_newline
(
str
(
divs
[
i
+
1
])):
# print("FOUND ", successors[:i])
break
return
divs
[:
end_idx
]
def
get_children
(
elem
):
"""Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
res
=
[]
inc_str
=
""
for
c
in
elem
.
children
:
c
=
str
(
c
)
inc_str
+=
c
if
not
(
c
.
startswith
(
"<strong>"
)
|
c
.
startswith
(
"<em>"
)
|
c
.
startswith
(
"<u>"
)):
res
.
append
(
inc_str
)
inc_str
=
""
return
res
#
def find_till_double_br(divs):
#
end_idx = 0
#
for i, s in enumerate(divs):
#
end_idx = i
#
if i == len(divs)-1:
#
break
#
if is_newline(str(divs[i])) and is_newline(str(divs[i+1])):
#
# print("FOUND ", successors[:i])
#
break
#
return divs[:end_idx]
#
def get_children(elem):
#
"""Fetches children of an element, put combines children when they are style element like <strong>example</strong>"""
#
res = []
#
inc_str = ""
#
for c in elem.children:
#
c = str(c)
#
inc_str += c
#
if not(c.startswith("<strong>") | c.startswith("<em>") | c.startswith("<u>")):
#
res.append(inc_str)
#
inc_str = ""
#
return res
def contains(str_, pat):
    """Report whether the regular expression ``pat`` occurs in ``str_``, ignoring case."""
    hit = re.compile(pat, re.IGNORECASE).search(str_)
    return hit is not None
def load_spacy_model(m):
    """Load the spaCy model named ``m``, downloading it first if loading fails.

    ``spacy.load`` is tried directly. If it raises ``OSError``, a notice is
    printed, the model is fetched with ``spacy.cli.download`` and the load
    is attempted a second time.

    Returns:
        Whatever ``spacy.load(m)`` returns.
    """
    try:
        return spacy.load(m)
    except OSError:
        print("Downloading language model for spaCy, this will only happen once")
        # Imported only on this path, where the download is actually needed.
        from spacy.cli import download
        download(m)
        return spacy.load(m)
```
%% Cell type:markdown id: tags:
# Export -
%% Cell type:code id: tags:
```
python
# hide
from
nbdev.export
import
*
notebook2script
()
```
%% Output
Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceRecognitionIndexer.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
%% Cell type:code id: tags:
```
python
```
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Snippets