Skip to content
GitLab
Explore
Projects
Groups
Snippets
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Memri
plugins
RSS importer
Commits
6ebd5970
Commit
6ebd5970
authored
1 year ago
by
Alp Deniz Ogut
Browse files
Options
Download
Plain Diff
Merge branch 'dev' into 'qa'
Merge Dev into QA See merge request
!17
parents
a32378ec
84633d2d
Pipeline
#15026
passed with stages
in 1 minute and 37 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
rss_importer/app.py
+9
-7
rss_importer/app.py
rss_importer/config.py
+13
-2
rss_importer/config.py
rss_importer/plugin.py
+36
-28
rss_importer/plugin.py
rss_importer/rss.py
+1
-2
rss_importer/rss.py
with
59 additions
and
39 deletions
+59
-39
rss_importer/app.py
+
9
-
7
View file @
6ebd5970
...
...
@@ -89,7 +89,7 @@ def post_feed(
feed_or_err
=
plugin
.
setup_feed
(
url
=
url
,
name
=
name
)
match
feed_or_err
:
case
Ok
(
feed
)
if
feed
.
id
is
not
None
:
run_postprocess
(
plugin
,
delta_days
=
1
)
logger
.
info
(
"Successfully created feed"
)
case
Err
(
e
):
logger
.
error
(
e
)
raise
HTTPException
(
status_code
=
404
,
detail
=
"Could not import feed"
)
...
...
@@ -140,17 +140,19 @@ def get_feeds(
feeds
=
[
n
for
n
in
feeds
if
n
.
id
==
feed_id
]
return
feeds
class
RSSFeedChat
(
BaseModel
):
messages
:
list
[
dict
]
index_name
:
str
=
"rss_feed"
category
:
str
=
"all"
@
app
.
post
(
"/feed/chat"
)
def
feed_chat
(
re
q
:
RSSFeedChat
,
plugin
:
RSSImporter
=
Depends
(
get_plugin
)
):
return
StreamingResponse
(
plugin
.
feed_chat
(
req
.
index_name
,
req
.
messages
,
req
.
category
),
media_type
=
"text/event-stream"
)
def
feed_chat
(
req
:
RSSFeedChat
,
plugin
:
RSSImporter
=
Depends
(
get_plugin
)):
re
turn
StreamingResponse
(
plugin
.
feed_chat
(
req
.
index_name
,
req
.
messages
,
req
.
category
),
media_type
=
"text/event-stream"
,
)
@
app
.
get
(
"/update_all_feeds"
)
...
...
@@ -322,7 +324,7 @@ def get_summary_endpoint(category=None, plugin: RSSImporter = Depends(get_plugin
@
app
.
get
(
"/summaries"
)
def
get_summar
y
_endpoint
(
def
get_summar
ies
_endpoint
(
categories
:
list
[
str
]
=
Query
(
default
=
[
"business"
],
description
=
"Returns summaries for the given categories. use 'all' to get all summaries."
,
...
...
This diff is collapsed.
Click to expand it.
rss_importer/config.py
+
13
-
2
View file @
6ebd5970
...
...
@@ -61,7 +61,7 @@ DEFAULT_FEEDS = [
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
,
"CNBCE"
,
),
# Top news
(
"https://www.buzzfeed.com/index.xml"
,
"BuzzFeed"
),
# Contains :/// notation
#
("https://www.buzzfeed.com/index.xml", "BuzzFeed"), # Contains :/// notation
(
"https://techcrunch.com/feed/"
,
"TechCrunch"
),
(
"https://thenextweb.com/feed"
,
"The Next Web"
),
(
"https://venturebeat.com/feed/"
,
"VentureBeat"
),
...
...
@@ -83,4 +83,15 @@ DEFAULT_FEEDS = [
(
"https://steveblank.com/feed/"
,
"Steve Blank"
),
(
"https://financialpost.com/feed"
,
"Financial Post"
),
(
"https://cms.qz.com/feed/"
,
"Quartz"
),
]
\ No newline at end of file
# ("https://www.inc.com/rss/", "Inc.com"), # blocked by fw, requires browser-like behavior
(
"https://feeds.feedburner.com/fastcompany/headlines"
,
"Fast Company"
),
(
"https://sloanreview.mit.edu/feed/"
,
"MIT Sloan Management"
),
(
"https://moxie.foxbusiness.com/google-publisher/latest.xml"
,
"Fox Business"
),
(
"https://www.businessoffashion.com/arc/outboundfeeds/rss/?outputType=xml"
,
"Business of Fashion"
,
),
# ("https://www.economist.com/the-world-this-week/rss.xml # feedparser.bozoexception: Could not parse feed h: <unknown>:2:0: syntax error
# ("https://www.mckinsey.com/insights/rss", "McKinsey") # document declared as us-ascii, but parsed as utf-8
(
"https://smallbiztrends.com/feed"
,
"Small Business Trends"
),
]
This diff is collapsed.
Click to expand it.
rss_importer/plugin.py
+
36
-
28
View file @
6ebd5970
...
...
@@ -210,18 +210,21 @@ class RSSImporter(PluginBase):
logger
.
info
(
f
"Summarizing
{
len
(
entries
)
}
entries in batches of
{
batch_size
}
"
)
for
batch
in
chunked
(
entries
,
batch_size
):
logger
.
debug
(
f
"Summarizing
{
len
(
batch
)
}
entries"
)
documents
=
[
entry
.
plain_text
()
for
entry
in
batch
]
summaries
=
summary_request
(
documents
,
min_length
=
min_length
,
max_length
=
max_length
,
)
for
entry
,
summary
in
zip
(
batch
,
summaries
):
if
summary
:
entry
.
summary
=
bulletize_summary
(
summary
)
entry
.
summarySource
=
SUMMARY_SOURCE
logger
.
debug
(
f
"Updating
{
len
(
batch
)
}
entries"
)
self
.
client
.
bulk_action
(
update_items
=
batch
)
try
:
documents
=
[
entry
.
plain_text
()
for
entry
in
batch
]
summaries
=
summary_request
(
documents
,
min_length
=
min_length
,
max_length
=
max_length
,
)
for
entry
,
summary
in
zip
(
batch
,
summaries
):
if
summary
:
entry
.
summary
=
bulletize_summary
(
summary
)
entry
.
summarySource
=
SUMMARY_SOURCE
logger
.
debug
(
f
"Updating
{
len
(
batch
)
}
entries"
)
self
.
client
.
bulk_action
(
update_items
=
batch
)
except
Exception
as
e
:
logger
.
error
(
f
"Error while summarizing entries:
{
e
}
"
)
return
entries
def
label_rss_entries
(
...
...
@@ -234,7 +237,12 @@ class RSSImporter(PluginBase):
for
batch
in
chunked
(
entries
,
batch_size
):
logger
.
debug
(
f
"Labeling
{
len
(
batch
)
}
entries"
)
titles
=
[
entry
.
title
or
""
for
entry
in
batch
]
labels_of_docs
=
semantic_search_request
(
titles
)
try
:
labels_of_docs
=
semantic_search_request
(
titles
)
except
Exception
as
e
:
logger
.
error
(
f
"Error while labeling entries:
{
e
}
"
)
continue
create_edges
=
[]
create_items
=
[]
...
...
@@ -279,16 +287,16 @@ class RSSImporter(PluginBase):
def
generate_feed_summary
(
self
,
category_text
=
None
):
index_query
=
"impactful, affecting a wide range of people, very recent, published today or yesterday."
instructions
=
"
"
.
join
(
[
"Generate up-to 4 bullet point list of distinct signficant information from the news. Use only asteriks as bullets."
,
"Start strictly with the 1 level bulleted list, comforming to markdown format. Do not generate intro or warm up words before or after the list."
,
"
Each bullet
point
should
contain more than 5 and under 20 words. There should be no sub bullet points."
,
"Output only the one level bullet point list. Do not generate words other than the bullet point list. Do not generate sub items to the list."
,
"Include source articles in format '(source: {Item Id})' just after the each bullet point sentence."
,
"Make sure that the output is valid markdown and the list has no nested items"
,
]
)
instructions
=
"
""Generate a 4-item markdown bullet list summary from distinct news articles:
* Use asterisks as bullets, ensuring valid markdown format.
* Each bullet should be unformatted 10-20 words, representing a unique subject.
*
Each bullet should
briefly and clearly inform the user about the subject.
* Bullet items should not contain any markup or formatting.
* No introduction, conclusion, sub-bullets, or nested items.
* Append each with '(source: {Item Id})'.
Ensure the output is a one-level bullet list without additional text.
"""
if
category_text
is
not
None
and
category_text
!=
"all"
:
index_query
=
f
"
{
index_query
}
{
category_text
}
"
instructions
+=
f
" Keep the list strictly bounded by
{
category_text
}
. Do not include list items that are not closely related to the tags or categories."
...
...
@@ -313,12 +321,10 @@ class RSSImporter(PluginBase):
return
summary
def
feed_chat
(
self
,
index_name
,
messages
,
category_text
=
None
):
index_query
=
messages
[
-
1
][
'
content
'
]
index_query
=
messages
[
-
1
][
"
content
"
]
if
category_text
is
not
None
and
category_text
!=
"all"
:
index_query
=
f
"
{
index_query
}
{
category_text
}
"
print
(
f
"Sending chat request with query:
{
index_query
}
"
)
print
(
f
"Sending chat request with query:
{
index_query
}
"
)
rss_client_kwargs
=
{
"url"
:
RSS_POD_URL
,
"owner_key"
:
RSS_OWNER_KEY
,
...
...
@@ -332,6 +338,7 @@ class RSSImporter(PluginBase):
logger
.
error
(
f
"Error during chat request stream:
{
e
}
"
)
return
None
def
bulletize_summary
(
summary
:
str
)
->
str
:
"""Converts a summary to a bulletized list per sentence."""
nltk
.
download
(
"punkt"
,
quiet
=
True
)
...
...
@@ -376,6 +383,7 @@ def chat_request(index_name, messages, pod_client_kwargs, index_query=None):
logger
.
error
(
f
"JSON decoding failed for LLM reply:
{
r
}
"
)
return
reply
def
chat_request_stream
(
index_name
,
messages
,
pod_client_kwargs
,
index_query
=
None
):
data
:
dict
=
{
"index_name"
:
index_name
,
...
...
@@ -389,7 +397,7 @@ def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=Non
f
"
{
MEMRI_BOT_URL
}
/v1/memory/chat"
,
json
=
data
,
timeout
=
120
,
stream
=
True
).
iter_lines
():
if
r
:
yield
r
+
b
'
\n
'
yield
r
+
b
"
\n
"
def
semantic_index_request
(
ids
:
list
[
str
],
documents
:
list
[
str
]):
...
...
This diff is collapsed.
Click to expand it.
rss_importer/rss.py
+
1
-
2
View file @
6ebd5970
...
...
@@ -33,7 +33,7 @@ def update_feed_properties(old_feed: RSSFeed, new_feed: RSSFeed):
if
property_name
in
Item
.
properties
:
continue
# We don't want to update explicitly set property 'title'
if
property_name
==
'
title
'
:
if
property_name
==
"
title
"
:
title
=
getattr
(
old_feed
,
property_name
,
None
)
if
title
:
continue
...
...
@@ -100,7 +100,6 @@ def update_feed(
except
Exception
as
e
:
logger
.
warning
(
f
"Could not parse feed
{
feed
.
href
}
:
{
e
}
"
)
return
feed
,
[]
feed_dict
=
feedparser
.
parse
(
feed
.
href
)
if
feed_dict
.
bozo
:
logger
.
warning
(
f
"Could not parse feed
{
feed
.
href
}
:
{
feed_dict
.
bozo_exception
}
"
)
return
feed
,
[]
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Snippets