Skip to content

Commit

Permalink
refactor: move metadata extracting code
Browse files Browse the repository at this point in the history
  • Loading branch information
david20571015 committed Jun 8, 2024
1 parent 1d92f1d commit ef7ee81
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 13 deletions.
10 changes: 0 additions & 10 deletions sync_crawler/models/news.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections.abc import Sequence
from datetime import datetime
from typing import Any, ClassVar

import pydantic

Expand All @@ -18,15 +17,6 @@ class News(pydantic.BaseModel):

model_config = pydantic.ConfigDict(extra="forbid")

# Metadata keys that the writer passes to each Document as both
# `excluded_embed_metadata_keys` and `excluded_llm_metadata_keys`.
excluded_metadata_keys: ClassVar = ["modified_date"]

@property
def text(self) -> str:
    """Return the news content as a single string.

    The items of ``self.content`` are concatenated with one space between
    consecutive items.
    """
    separator = " "
    return separator.join(self.content)

@property
def metadata(self) -> dict[str, Any]:
    """Return the title, category and modified date as a metadata dict.

    The values are produced by ``self.model_dump`` restricted to those
    three fields.
    """
    exported_fields = {"title", "category", "modified_date"}
    # use json mode to convert `modified_date` to string
    return self.model_dump(mode="json", include=exported_fields)
14 changes: 11 additions & 3 deletions sync_crawler/writer/chromadb_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,23 @@ def __init__(
embed_model=HuggingFaceEmbedding(model_name=embedding_model),
)

@staticmethod
def _get_metadata(news: News):
    """Build the metadata dict attached to a news document.

    Args:
        news: The news item to extract metadata from.

    Returns:
        The dumped ``title`` and ``category`` fields, plus a
        ``modified_date`` entry holding ``news.modified_date.timestamp()``.
    """
    dumped_fields = news.model_dump(include={"title", "category"})
    modified_timestamp = news.modified_date.timestamp()
    return {**dumped_fields, "modified_date": modified_timestamp}

# Metadata keys passed to each Document in `write` as both
# `excluded_embed_metadata_keys` and `excluded_llm_metadata_keys`.
excluded_metadata_keys = ["modified_date"]

@override
def write(self, news_with_id):
docs = [
Document(
doc_id=str(id_),
text=ns.text,
extra_info=ns.metadata,
excluded_embed_metadata_keys=News.excluded_metadata_keys,
excluded_llm_metadata_keys=News.excluded_metadata_keys,
extra_info=self._get_metadata(ns),
excluded_embed_metadata_keys=self.excluded_metadata_keys,
excluded_llm_metadata_keys=self.excluded_metadata_keys,
)
for id_, ns in news_with_id
]
Expand Down

0 comments on commit ef7ee81

Please sign in to comment.