Merge branch 'dev' into 'qa'

Merge Dev into QA. See merge request !17.

Merge branch 'dev' into 'qa'
Merge Dev into QA. See merge request !17.
6ebd5970 · Alp Deniz Ogut · a32378ec · 84633d2d · 6ebd5970 · 6ebd5970
Commit 6ebd5970 authored 1 year ago by Alp Deniz Ogut
Hide whitespace changes
Inline Side-by-side

Showing 4 changed files with 59 additions and 39 deletions
+59 -39
--- a/rss_importer/app.py
+++ b/rss_importer/app.py
@@ -89,7 +89,7 @@ def post_feed(
    feed_or_err = plugin.setup_feed(url=url, name=name)
    match feed_or_err:
        case Ok(feed) if feed.id is not None:
-            run_postprocess(plugin, delta_days=1)
+            logger.info("Successfully created feed")
        case Err(e):
            logger.error(e)
            raise HTTPException(status_code=404, detail="Could not import feed")
@@ -140,17 +140,19 @@ def get_feeds(
        feeds = [n for n in feeds if n.id == feed_id]
    return feeds

+
 class RSSFeedChat(BaseModel):
    messages: list[dict]
    index_name: str = "rss_feed"
    category: str = "all"

+
 @app.post("/feed/chat")
-def feed_chat(
-    req: RSSFeedChat,
-    plugin: RSSImporter = Depends(get_plugin)
-):
-    return StreamingResponse(plugin.feed_chat(req.index_name, req.messages, req.category), media_type="text/event-stream")
+def feed_chat(req: RSSFeedChat, plugin: RSSImporter = Depends(get_plugin)):
+    return StreamingResponse(
+        plugin.feed_chat(req.index_name, req.messages, req.category),
+        media_type="text/event-stream",
+    )


 @app.get("/update_all_feeds")
@@ -322,7 +324,7 @@ def get_summary_endpoint(category=None, plugin: RSSImporter = Depends(get_plugin


 @app.get("/summaries")
-def get_summary_endpoint(
+def get_summaries_endpoint(
    categories: list[str] = Query(
        default=["business"],
        description="Returns summaries for the given categories. use 'all' to get all summaries.",

--- a/rss_importer/config.py
+++ b/rss_importer/config.py
@@ -61,7 +61,7 @@ DEFAULT_FEEDS = [
        "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114",
        "CNBCE",
    ),  # Top news
-    ("https://www.buzzfeed.com/index.xml", "BuzzFeed"),  # Contains :/// notation
+    # ("https://www.buzzfeed.com/index.xml", "BuzzFeed"),  # Contains :/// notation
    ("https://techcrunch.com/feed/", "TechCrunch"),
    ("https://thenextweb.com/feed", "The Next Web"),
    ("https://venturebeat.com/feed/", "VentureBeat"),
@@ -83,4 +83,15 @@ DEFAULT_FEEDS = [
    ("https://steveblank.com/feed/", "Steve Blank"),
    ("https://financialpost.com/feed", "Financial Post"),
    ("https://cms.qz.com/feed/", "Quartz"),
-]
\ No newline at end of file
+    #  ("https://www.inc.com/rss/", "Inc.com"), # blocked by fw, requires browser-like behavior
+    ("https://feeds.feedburner.com/fastcompany/headlines", "Fast Company"),
+    ("https://sloanreview.mit.edu/feed/", "MIT Sloan Management"),
+    ("https://moxie.foxbusiness.com/google-publisher/latest.xml", "Fox Business"),
+    (
+        "https://www.businessoffashion.com/arc/outboundfeeds/rss/?outputType=xml",
+        "Business of Fashion",
+    ),
+    #  ("https://www.economist.com/the-world-this-week/rss.xml # feedparser.bozoexception: Could not parse feed h: <unknown>:2:0: syntax error
+    #  ("https://www.mckinsey.com/insights/rss", "McKinsey") # document declared as us-ascii, but parsed as utf-8
+    ("https://smallbiztrends.com/feed", "Small Business Trends"),
+]
--- a/rss_importer/plugin.py
+++ b/rss_importer/plugin.py
@@ -210,18 +210,21 @@ class RSSImporter(PluginBase):
        logger.info(f"Summarizing {len(entries)} entries in batches of {batch_size}")
        for batch in chunked(entries, batch_size):
            logger.debug(f"Summarizing {len(batch)} entries")
-            documents = [entry.plain_text() for entry in batch]
-            summaries = summary_request(
-                documents,
-                min_length=min_length,
-                max_length=max_length,
-            )
-            for entry, summary in zip(batch, summaries):
-                if summary:
-                    entry.summary = bulletize_summary(summary)
-                    entry.summarySource = SUMMARY_SOURCE
-            logger.debug(f"Updating {len(batch)} entries")
-            self.client.bulk_action(update_items=batch)
+            try:
+                documents = [entry.plain_text() for entry in batch]
+                summaries = summary_request(
+                    documents,
+                    min_length=min_length,
+                    max_length=max_length,
+                )
+                for entry, summary in zip(batch, summaries):
+                    if summary:
+                        entry.summary = bulletize_summary(summary)
+                        entry.summarySource = SUMMARY_SOURCE
+                logger.debug(f"Updating {len(batch)} entries")
+                self.client.bulk_action(update_items=batch)
+            except Exception as e:
+                logger.error(f"Error while summarizing entries: {e}")
        return entries

    def label_rss_entries(
@@ -234,7 +237,12 @@ class RSSImporter(PluginBase):
        for batch in chunked(entries, batch_size):
            logger.debug(f"Labeling {len(batch)} entries")
            titles = [entry.title or "" for entry in batch]
-            labels_of_docs = semantic_search_request(titles)
+            try:
+                labels_of_docs = semantic_search_request(titles)
+            except Exception as e:
+                logger.error(f"Error while labeling entries: {e}")
+                continue
+
            create_edges = []
            create_items = []

@@ -279,16 +287,16 @@ class RSSImporter(PluginBase):

    def generate_feed_summary(self, category_text=None):
        index_query = "impactful, affecting a wide range of people, very recent, published today or yesterday."
-        instructions = " ".join(
-            [
-                "Generate up-to 4 bullet point list of distinct signficant information from the news. Use only asteriks as bullets.",
-                "Start strictly with the 1 level bulleted list, comforming to markdown format. Do not generate intro or warm up words before or after the list.",
-                "Each bullet point should contain more than 5 and under 20 words. There should be no sub bullet points.",
-                "Output only the one level bullet point list. Do not generate words other than the bullet point list. Do not generate sub items to the list.",
-                "Include source articles in format '(source: {Item Id})' just after the each bullet point sentence.",
-                "Make sure that the output is valid markdown and the list has no nested items",
-            ]
-        )
+        instructions = """Generate a 4-item markdown bullet list summary from distinct news articles:
+
+* Use asterisks as bullets, ensuring valid markdown format.
+* Each bullet should be unformatted 10-20 words, representing a unique subject.
+* Each bullet should briefly and clearly inform the user about the subject.
+* Bullet items should not contain any markup or formatting.
+* No introduction, conclusion, sub-bullets, or nested items.
+* Append each with '(source: {Item Id})'.
+Ensure the output is a one-level bullet list without additional text.
+"""
        if category_text is not None and category_text != "all":
            index_query = f"{index_query} {category_text}"
            instructions += f" Keep the list strictly bounded by {category_text}. Do not include list items that are not closely related to the tags or categories."
@@ -313,12 +321,10 @@ class RSSImporter(PluginBase):
        return summary

    def feed_chat(self, index_name, messages, category_text=None):
-        index_query = messages[-1]['content']
+        index_query = messages[-1]["content"]
        if category_text is not None and category_text != "all":
            index_query = f"{index_query} {category_text}"
-        print(
-            f"Sending chat request with query: {index_query}"
-        )
+        print(f"Sending chat request with query: {index_query}")
        rss_client_kwargs = {
            "url": RSS_POD_URL,
            "owner_key": RSS_OWNER_KEY,
@@ -332,6 +338,7 @@ class RSSImporter(PluginBase):
            logger.error(f"Error during chat request stream: {e}")
            return None

+
 def bulletize_summary(summary: str) -> str:
    """Converts a summary to a bulletized list per sentence."""
    nltk.download("punkt", quiet=True)
@@ -376,6 +383,7 @@ def chat_request(index_name, messages, pod_client_kwargs, index_query=None):
                logger.error(f"JSON decoding failed for LLM reply: {r}")
    return reply

+
 def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=None):
    data: dict = {
        "index_name": index_name,
@@ -389,7 +397,7 @@ def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=Non
        f"{MEMRI_BOT_URL}/v1/memory/chat", json=data, timeout=120, stream=True
    ).iter_lines():
        if r:
-            yield r + b'\n'
+            yield r + b"\n"


 def semantic_index_request(ids: list[str], documents: list[str]):

--- a/rss_importer/rss.py
+++ b/rss_importer/rss.py
@@ -33,7 +33,7 @@ def update_feed_properties(old_feed: RSSFeed, new_feed: RSSFeed):
        if property_name in Item.properties:
            continue
        # We don't want to update explicitly set property 'title'
-        if property_name == 'title':
+        if property_name == "title":
            title = getattr(old_feed, property_name, None)
            if title:
                continue
@@ -100,7 +100,6 @@ def update_feed(
    except Exception as e:
        logger.warning(f"Could not parse feed {feed.href}: {e}")
        return feed, []
-    feed_dict = feedparser.parse(feed.href)
    if feed_dict.bozo:
        logger.warning(f"Could not parse feed {feed.href}: {feed_dict.bozo_exception}")
        return feed, []