Commit 6ebd5970 authored by Alp Deniz Ogut
Browse files

Merge branch 'dev' into 'qa'

Merge Dev into QA

See merge request !17
parents a32378ec 84633d2d
Pipeline #15026 passed with stages in 1 minute and 37 seconds
Showing 4 changed files with 59 additions and 39 deletions
+59 -39
......@@ -89,7 +89,7 @@ def post_feed(
feed_or_err = plugin.setup_feed(url=url, name=name)
match feed_or_err:
case Ok(feed) if feed.id is not None:
run_postprocess(plugin, delta_days=1)
logger.info("Successfully created feed")
case Err(e):
logger.error(e)
raise HTTPException(status_code=404, detail="Could not import feed")
......@@ -140,17 +140,19 @@ def get_feeds(
feeds = [n for n in feeds if n.id == feed_id]
return feeds
# Request body model for the POST /feed/chat endpoint; its three fields are
# forwarded to the plugin's feed_chat call as (index_name, messages, category).
class RSSFeedChat(BaseModel):
# Chat history as a list of dicts; the plugin reads the "content" key of the
# last entry and uses it as the index query.
messages: list[dict]
# Name of the index handed to the plugin; defaults to "rss_feed".
index_name: str = "rss_feed"
# Category text appended to the index query by the plugin; the default "all"
# leaves the query unchanged.
category: str = "all"
@app.post("/feed/chat")
def feed_chat(
req: RSSFeedChat,
plugin: RSSImporter = Depends(get_plugin)
):
return StreamingResponse(plugin.feed_chat(req.index_name, req.messages, req.category), media_type="text/event-stream")
def feed_chat(req: RSSFeedChat, plugin: RSSImporter = Depends(get_plugin)):
return StreamingResponse(
plugin.feed_chat(req.index_name, req.messages, req.category),
media_type="text/event-stream",
)
@app.get("/update_all_feeds")
......@@ -322,7 +324,7 @@ def get_summary_endpoint(category=None, plugin: RSSImporter = Depends(get_plugin
@app.get("/summaries")
def get_summary_endpoint(
def get_summaries_endpoint(
categories: list[str] = Query(
default=["business"],
description="Returns summaries for the given categories. use 'all' to get all summaries.",
......
......@@ -61,7 +61,7 @@ DEFAULT_FEEDS = [
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114",
"CNBCE",
), # Top news
("https://www.buzzfeed.com/index.xml", "BuzzFeed"), # Contains :/// notation
# ("https://www.buzzfeed.com/index.xml", "BuzzFeed"), # Contains :/// notation
("https://techcrunch.com/feed/", "TechCrunch"),
("https://thenextweb.com/feed", "The Next Web"),
("https://venturebeat.com/feed/", "VentureBeat"),
......@@ -83,4 +83,15 @@ DEFAULT_FEEDS = [
("https://steveblank.com/feed/", "Steve Blank"),
("https://financialpost.com/feed", "Financial Post"),
("https://cms.qz.com/feed/", "Quartz"),
]
\ No newline at end of file
# ("https://www.inc.com/rss/", "Inc.com"), # blocked by fw, requires browser-like behavior
("https://feeds.feedburner.com/fastcompany/headlines", "Fast Company"),
("https://sloanreview.mit.edu/feed/", "MIT Sloan Management"),
("https://moxie.foxbusiness.com/google-publisher/latest.xml", "Fox Business"),
(
"https://www.businessoffashion.com/arc/outboundfeeds/rss/?outputType=xml",
"Business of Fashion",
),
# ("https://www.economist.com/the-world-this-week/rss.xml # feedparser.bozoexception: Could not parse feed h: <unknown>:2:0: syntax error
# ("https://www.mckinsey.com/insights/rss", "McKinsey") # document declared as us-ascii, but parsed as utf-8
("https://smallbiztrends.com/feed", "Small Business Trends"),
]
......@@ -210,18 +210,21 @@ class RSSImporter(PluginBase):
logger.info(f"Summarizing {len(entries)} entries in batches of {batch_size}")
for batch in chunked(entries, batch_size):
logger.debug(f"Summarizing {len(batch)} entries")
documents = [entry.plain_text() for entry in batch]
summaries = summary_request(
documents,
min_length=min_length,
max_length=max_length,
)
for entry, summary in zip(batch, summaries):
if summary:
entry.summary = bulletize_summary(summary)
entry.summarySource = SUMMARY_SOURCE
logger.debug(f"Updating {len(batch)} entries")
self.client.bulk_action(update_items=batch)
try:
documents = [entry.plain_text() for entry in batch]
summaries = summary_request(
documents,
min_length=min_length,
max_length=max_length,
)
for entry, summary in zip(batch, summaries):
if summary:
entry.summary = bulletize_summary(summary)
entry.summarySource = SUMMARY_SOURCE
logger.debug(f"Updating {len(batch)} entries")
self.client.bulk_action(update_items=batch)
except Exception as e:
logger.error(f"Error while summarizing entries: {e}")
return entries
def label_rss_entries(
......@@ -234,7 +237,12 @@ class RSSImporter(PluginBase):
for batch in chunked(entries, batch_size):
logger.debug(f"Labeling {len(batch)} entries")
titles = [entry.title or "" for entry in batch]
labels_of_docs = semantic_search_request(titles)
try:
labels_of_docs = semantic_search_request(titles)
except Exception as e:
logger.error(f"Error while labeling entries: {e}")
continue
create_edges = []
create_items = []
......@@ -279,16 +287,16 @@ class RSSImporter(PluginBase):
def generate_feed_summary(self, category_text=None):
index_query = "impactful, affecting a wide range of people, very recent, published today or yesterday."
instructions = " ".join(
[
"Generate up-to 4 bullet point list of distinct signficant information from the news. Use only asteriks as bullets.",
"Start strictly with the 1 level bulleted list, comforming to markdown format. Do not generate intro or warm up words before or after the list.",
"Each bullet point should contain more than 5 and under 20 words. There should be no sub bullet points.",
"Output only the one level bullet point list. Do not generate words other than the bullet point list. Do not generate sub items to the list.",
"Include source articles in format '(source: {Item Id})' just after the each bullet point sentence.",
"Make sure that the output is valid markdown and the list has no nested items",
]
)
instructions = """Generate a 4-item markdown bullet list summary from distinct news articles:
* Use asterisks as bullets, ensuring valid markdown format.
* Each bullet should be unformatted 10-20 words, representing a unique subject.
* Each bullet should briefly and clearly inform the user about the subject.
* Bullet items should not contain any markup or formatting.
* No introduction, conclusion, sub-bullets, or nested items.
* Append each with '(source: {Item Id})'.
Ensure the output is a one-level bullet list without additional text.
"""
if category_text is not None and category_text != "all":
index_query = f"{index_query} {category_text}"
instructions += f" Keep the list strictly bounded by {category_text}. Do not include list items that are not closely related to the tags or categories."
......@@ -313,12 +321,10 @@ class RSSImporter(PluginBase):
return summary
def feed_chat(self, index_name, messages, category_text=None):
index_query = messages[-1]['content']
index_query = messages[-1]["content"]
if category_text is not None and category_text != "all":
index_query = f"{index_query} {category_text}"
print(
f"Sending chat request with query: {index_query}"
)
print(f"Sending chat request with query: {index_query}")
rss_client_kwargs = {
"url": RSS_POD_URL,
"owner_key": RSS_OWNER_KEY,
......@@ -332,6 +338,7 @@ class RSSImporter(PluginBase):
logger.error(f"Error during chat request stream: {e}")
return None
def bulletize_summary(summary: str) -> str:
"""Converts a summary to a bulletized list per sentence."""
nltk.download("punkt", quiet=True)
......@@ -376,6 +383,7 @@ def chat_request(index_name, messages, pod_client_kwargs, index_query=None):
logger.error(f"JSON decoding failed for LLM reply: {r}")
return reply
def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=None):
data: dict = {
"index_name": index_name,
......@@ -389,7 +397,7 @@ def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=Non
f"{MEMRI_BOT_URL}/v1/memory/chat", json=data, timeout=120, stream=True
).iter_lines():
if r:
yield r + b'\n'
yield r + b"\n"
def semantic_index_request(ids: list[str], documents: list[str]):
......
......@@ -33,7 +33,7 @@ def update_feed_properties(old_feed: RSSFeed, new_feed: RSSFeed):
if property_name in Item.properties:
continue
# We don't want to update explicitly set property 'title'
if property_name == 'title':
if property_name == "title":
title = getattr(old_feed, property_name, None)
if title:
continue
......@@ -100,7 +100,6 @@ def update_feed(
except Exception as e:
logger.warning(f"Could not parse feed {feed.href}: {e}")
return feed, []
feed_dict = feedparser.parse(feed.href)
if feed_dict.bozo:
logger.warning(f"Could not parse feed {feed.href}: {feed_dict.bozo_exception}")
return feed, []
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment