# plugin.py
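#
# Overview: RSSImporter is a pymemri plugin that imports RSS feeds into a Memri
# Pod, scrapes and stores entries, and enriches them with summaries, labels, and
# a semantic index backed by external services. A minimal usage sketch, assuming
# the pymemri plugin runtime supplies a configured Pod client (the constructor
# kwargs shown here are illustrative, not prescribed by this module):
#
#     importer = RSSImporter(setup_on_start=True, max_entries_on_start=25)
#     importer.update_all_feeds()
#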
from enum import Enum
from functools import cache
import os
import re
import time
import json
from typing import Set
from dataclasses import dataclass
from nltk.tokenize import sent_tokenize
import nltk
from loguru import logger
from more_itertools import chunked
from pymemri.data.schema import File, Photo, CategoricalPrediction
from pymemri.plugin.pluginbase import PluginBase
import requests
from result import Err, Ok, Result
from rss_importer.config import (
    RSS_POD_URL,
    RSS_OWNER_KEY,
    RSS_DATABASE_KEY,
    SEMANTIC_SEARCH_URL,
    MEMRI_BOT_URL,
    SUMMARIZATION_URL,
    SUMMARY_MAX_LENGTH,
    SUMMARY_MIN_LENGTH,
    SUMMARY_SOURCE,
    DEFAULT_FEEDS,
)
from rss_importer.labels import get_tags
from .rss import get_feed, scrape_entries, update_feed
from .schema import RSSEntry, RSSFeed, RSSFeedSummary


class LabelSource(str, Enum):
    ZERO_SHOT = "zero-shot"
    SEMANTIC_SEARCH = "semantic-search"
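

# LabelSource records where a CategoricalPrediction came from; label_rss_entries
# below currently uses LabelSource.SEMANTIC_SEARCH. Label pairs a predicted tag
# with its score, matching the shape returned by semantic_search_request.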
@dataclass
class Label:
    label: str
    score: float


class RSSImporter(PluginBase):
    def __init__(
        self,
        setup_on_start: bool = False,
        max_entries_on_start: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.add_to_schema()
        if setup_on_start:
            self.setup_feeds(DEFAULT_FEEDS, max_entries=max_entries_on_start)
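
    # run() below is intentionally left as a no-op: feed import, summarization,
    # labeling, and indexing are invoked explicitly through the methods that
    # follow (presumably by an external scheduler or API layer not shown here).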
    def run(self):
        pass

    def _feed_url_in_pod(self, feed_url: str) -> bool:
        """
        Returns True if a feed with the same href exists in the Pod
        """
        query = """
            query {
                RSSFeed(filter: {href: {eq: "$href"}}) {
                    href
                }
            }
        """
        res = self.client.api.graphql(query=query, variables={"href": feed_url})
        return len(res["data"]) > 0

    def setup_feeds(
        self, feed_urls: list[tuple[str, str]], max_entries: int | None = None
    ) -> list[RSSFeed]:
        logger.info("Setting up default feeds...")
        feeds = []
        for feed_url, name in feed_urls:
            feed_or_err = self.setup_feed(
                url=feed_url, max_entries=max_entries, name=name
            )
            match feed_or_err:
                case Ok(feed):
                    feeds.append(feed)
                case Err(e):
                    logger.error(f"Could not setup feed {feed_url}: {e}")
        return feeds

    def setup_feed(
        self, *, url: str, max_entries: int | None = None, name: str | None = None
    ) -> Result[RSSFeed, str]:
        if self._feed_url_in_pod(url):
            return Err("A feed with the same url already exists in pod")
        try:
            feed, entries = get_feed(url)
        except Exception as e:
            logger.exception(e)
            return Err(f"Could not fetch feed: {e}")

        if max_entries is not None:
            entries = entries[:max_entries]
        if name is not None:
            feed.title = name

        logger.debug(f"Importing feed with {len(entries)} entries")
        entries = scrape_entries(entries)
        self.client.bulk_action(
            create_items=[feed] + entries,
            create_edges=[feed.add_edge("entry", entry) for entry in entries],
        )
        return Ok(feed)

    def _get_feed_existing_ids(self, feed: RSSFeed) -> Set[str]:
        if feed.id is None:
            return set()
        query = """
            query {
                RSSFeed(filter: {id: {eq: "$id"}}) {
                    id
                    entry(filter: { published: { gt: $timestamp }}) {
                        externalId
                        published
                    }
                }
            }
        """
        # Use PodAPI to not interfere with pod local DB
        res = self.client.api.graphql(
            query=query, variables={"id": feed.id, "timestamp": time.time()}
        )
        entries_json = res["data"][0].get("entry", [])
        existing_ids = set(
            [entry.get("externalId", None) for entry in entries_json]
        ) - {None}
        return existing_ids

    def update_all_feeds(self):
        feeds = self.client.search(
            {"type": "RSSFeed", "importIsActive": True}, include_edges=False
        )
        if not feeds:
            self.setup_feeds(DEFAULT_FEEDS)
        for feed in feeds:
            self.update_feed_entries(feed)

    def update_feed_entries(self, feed: RSSFeed):
        existing_ids = self._get_feed_existing_ids(feed)
        updated_feed, new_entries = update_feed(feed, existing_ids)
        new_entries = scrape_entries(new_entries)
        self.client.bulk_action(
            create_items=new_entries,
            update_items=[updated_feed],
            create_edges=[feed.add_edge("entry", entry) for entry in new_entries],
        )

    def update_feed(self, feed: RSSFeed):
        self.client.update_item(feed)

    def add_to_schema(self):
        self.client.add_to_schema(
            RSSFeed, RSSEntry, RSSFeedSummary, Photo, File, CategoricalPrediction
        )

    def delete_feed(self, feed_id: str) -> str:
        try:
            feed = self.client.get(feed_id, expanded=False)
        except ValueError:
            raise ValueError(f"Item({feed_id}) has already been deleted")
        if not isinstance(feed, RSSFeed):
            raise ValueError(f"Item({feed_id}) is not an RSSFeed")
        feed.importIsActive = False
        self.client.update_item(feed)
        if feed.id is None:
            raise ValueError("Item has no id")
        return feed.id

    @cache
    def get_feeds(self, include_deleted: bool) -> list[RSSFeed]:
        feeds = self.client.search({"type": "RSSFeed"}, include_edges=False)
        if feeds is None:
            return []
        if include_deleted:
            return feeds
        else:
            return [feed for feed in feeds if feed.importIsActive]

    def summarize_rss_entries(
        self,
        entries: list[RSSEntry],
        min_length: int = SUMMARY_MIN_LENGTH,
        max_length: int = SUMMARY_MAX_LENGTH,
        batch_size: int = 4,
    ) -> list[RSSEntry]:
        logger.info(f"Summarizing {len(entries)} entries in batches of {batch_size}")
        for batch in chunked(entries, batch_size):
            logger.debug(f"Summarizing {len(batch)} entries")
            documents = [entry.plain_text() for entry in batch]
            summaries = summary_request(
                documents,
                min_length=min_length,
                max_length=max_length,
            )
            for entry, summary in zip(batch, summaries):
                if summary:
                    entry.summary = bulletize_summary(summary)
                    entry.summarySource = SUMMARY_SOURCE
            logger.debug(f"Updating {len(batch)} entries")
            self.client.bulk_action(update_items=batch)
        return entries

    def label_rss_entries(
        self,
        entries: list[RSSEntry],
        batch_size: int = 32,
    ) -> list[RSSEntry]:
        label_source = LabelSource.SEMANTIC_SEARCH
        logger.info(f"Labeling {len(entries)} entries in batches of {batch_size}")
        for batch in chunked(entries, batch_size):
            logger.debug(f"Labeling {len(batch)} entries")
            titles = [entry.title or "" for entry in batch]
            labels_of_docs = semantic_search_request(titles)
            create_edges = []
            create_items = []
            for entry, k_labels in zip(batch, labels_of_docs):
                for label in k_labels:
                    category_item = CategoricalPrediction(
                        value=label.label,
                        score=label.score,
                        source=label_source.value,
                    )
                    edge = entry.add_edge("label", category_item)
                    create_edges.append(edge)
                    create_items.append(category_item)
            logger.debug(f"Updating {len(batch)} entries")
            self.client.bulk_action(
                update_items=batch,
                create_edges=create_edges,
                create_items=create_items,
            )
        return entries

    def index_rss_entries(
        self,
        entries: list[RSSEntry],
        batch_size: int = 32,
    ):
        logger.warning(
            f"Indexing RSS entries: {len(entries)} with batch size {batch_size}"
        )
        for batch in chunked(entries, batch_size):
            logger.info(f"Batch: {len(batch)}")
            ids = [e.id for e in batch]
            texts = [e.plain_text() for e in batch]
            try:
                semantic_index_request(ids, texts)
                # update isIndexed flag
                for entry in batch:
                    entry.isIndexed = True
                self.client.bulk_action(update_items=batch)
            except Exception as e:
                logger.error(f"Semantic index request failed: {e}")

    def generate_feed_summary(self, category_text=None):
        index_query = "impactful, affecting a wide range of people, very recent, published today or yesterday."
        instructions = """Generate a 4-item markdown bullet list from distinct news articles:
* Use asterisks as bullets, ensuring valid markdown format.
* Each bullet should be 5-20 words, representing a unique article.
* Bullet items should not contain any markup or formatting.
* No introduction, conclusion, sub-bullets, or nested items.
* Append each with '(source: {Item Id})'.
Ensure the output is a one-level bullet list without additional text.
"""
        if category_text is not None and category_text != "all":
            index_query = f"{index_query} {category_text}"
            instructions += f" Keep the list strictly bounded by {category_text}. Do not include list items that are not closely related to the tags or categories."
        print(
            f"Sending chat request with query: {index_query} and instructions: {instructions}"
        )
        messages = [{"role": "USER", "content": instructions}]
        rss_client_kwargs = {
            "url": RSS_POD_URL,
            "owner_key": RSS_OWNER_KEY,
            "database_key": RSS_DATABASE_KEY,
        }
        try:
            summary = chat_request(
                "rss_feed", messages, rss_client_kwargs, index_query=index_query
            )
        except Exception as e:
            logger.error(f"Error during chat request stream: {e}")
            return None
        # strip out the lines without bullet points (*)
        summary = re.sub(r"^[^*]+", "", summary, flags=re.MULTILINE)
        return summary

    def feed_chat(self, index_name, messages, category_text=None):
        index_query = messages[-1]["content"]
        if category_text is not None and category_text != "all":
            index_query = f"{index_query} {category_text}"
        print(f"Sending chat request with query: {index_query}")
        rss_client_kwargs = {
            "url": RSS_POD_URL,
            "owner_key": RSS_OWNER_KEY,
            "database_key": RSS_DATABASE_KEY,
        }
        try:
            return chat_request_stream(
                index_name, messages, rss_client_kwargs, index_query=index_query
            )
        except Exception as e:
            logger.error(f"Error during chat request stream: {e}")
            return None


def bulletize_summary(summary: str) -> str:
    """Converts a summary to a bulletized list per sentence."""
    nltk.download("punkt", quiet=True)
    sentences = sent_tokenize(summary)
    return "\n".join([f" • {sentence}" for sentence in sentences])


def summary_request(
    documents: list[str], min_length: int, max_length: int
) -> list[str]:
    response = requests.post(
        SUMMARIZATION_URL,
        json={
            "documents": documents,
            "min_length": min_length,
            "max_length": max_length,
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["summaries"]


def chat_request(index_name, messages, pod_client_kwargs, index_query=None):
    data: dict = {
        "index_name": index_name,
        "messages": messages,
        "pod_client_kwargs": pod_client_kwargs,
    }
    if index_query:
        data["index_query"] = index_query
    reply = ""
    for r in requests.post(
        f"{MEMRI_BOT_URL}/v1/memory/chat", json=data, timeout=120, stream=True
    ).iter_lines():
        if r:
            try:
                reply_line = json.loads(r)["choices"][0]["text"]
                reply += reply_line
            except:
                logger.error(f"JSON decoding failed for LLM reply: {r}")
    return reply


def chat_request_stream(index_name, messages, pod_client_kwargs, index_query=None):
    data: dict = {
        "index_name": index_name,
        "messages": messages,
        "pod_client_kwargs": pod_client_kwargs,
    }
    if index_query:
        data["index_query"] = index_query
    for r in requests.post(
        f"{MEMRI_BOT_URL}/v1/memory/chat", json=data, timeout=120, stream=True
    ).iter_lines():
        if r:
            yield r + b"\n"


def semantic_index_request(ids: list[str], documents: list[str]):
    endpoint = os.path.join(SEMANTIC_SEARCH_URL, "index_texts")
    response = requests.post(
        endpoint,
        json={
            "index_name": "rss_feed",
            "ids": ids,
            "texts": documents,
            "pod_client_kwargs": {
                "url": RSS_POD_URL,
                "owner_key": RSS_OWNER_KEY,
                "database_key": RSS_DATABASE_KEY,
            },
            "is_chunked": True,
        },
        timeout=60,
    )
    response.raise_for_status()
    return True


def semantic_search_request(documents: list[str]) -> list[list[Label]]:
    labels = get_tags()
    endpoint = os.path.join(SEMANTIC_SEARCH_URL, "instant_search")
    top_k = 3
    threshold = 0.2
    response = requests.post(