Skip to content

Commit

Permalink
fix(file): improve content vector generation command
Browse files Browse the repository at this point in the history
  • Loading branch information
Yelinz committed Jun 19, 2024
1 parent 422cf77 commit 145ae23
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
21 changes: 18 additions & 3 deletions alexandria/core/management/commands/generate_content_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,27 @@ def handle(self, *args, **options):
)
)

failed_files = []
for file in tqdm(
File.objects.filter(variant="original", content_vector__isnull=True)
):
file.set_content_vector()
file.save()
try:
file.save() # this will call set_content_vector
except FileNotFoundError as e:
failed_files.append(file.id)
self.stdout.write(
self.style.WARNING(f"Error processing {file.id}: {e}")
)

if virtual_memory().available < 300_000_000: # pragma: no cover
print("about to run out of memory, stopping")
self.stdout.write(
self.style.ERROR("about to run out of memory, stopping")
)
break

if failed_files:
self.stdout.write(
self.style.WARNING(
f"Failed to process {len(failed_files)} files: {failed_files}"
)
)
18 changes: 16 additions & 2 deletions alexandria/core/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def test_generate_content_vector(db, settings, file_factory):
file_with_vector.refresh_from_db()
file_without_vector.refresh_from_db()

assert tika.parser.from_buffer.called_once()
assert tika.language.from_buffer.called_once()
assert tika.parser.from_buffer.call_count == 2
assert tika.language.from_buffer.call_count == 2

assert file_with_vector.content_vector == "'import':2B 'neu':1A 'text':3B"
assert file_without_vector.content_vector == "'inhalt':4B 'old':1A"
Expand All @@ -89,3 +89,17 @@ def test_generate_content_vector_disabled(db, settings, file_factory):
"Content search is not enabled. Skipping vectorization of file contents."
in out.getvalue()
)


def test_generate_content_vector_error(db, settings, file_factory, mocker):
    """Check that the command reports a file whose vectorization fails.

    ``File.set_content_vector`` is patched to raise ``FileNotFoundError``;
    the command output must then contain the failure summary for one file.
    """
    # Create the file while content search is switched off, then switch it
    # back on before running the command.
    # NOTE(review): presumably this leaves the file without a content vector
    # so the command picks it up -- confirm against the File model.
    settings.ALEXANDRIA_ENABLE_CONTENT_SEARCH = False
    file_factory(name="old")
    settings.ALEXANDRIA_ENABLE_CONTENT_SEARCH = True

    # Every attempt to build a content vector now fails.
    mocker.patch(
        "alexandria.core.models.File.set_content_vector",
        side_effect=FileNotFoundError,
    )

    captured = StringIO()
    call_command("generate_content_vectors", stdout=captured)
    output = captured.getvalue()

    assert "Failed to process 1 file" in output

0 comments on commit 145ae23

Please sign in to comment.