Skip to content

Commit

Permalink
Added initial zimwriterfs clone
Browse files Browse the repository at this point in the history
  • Loading branch information
rgaudin committed May 27, 2024
1 parent 7d49831 commit a288082
Show file tree
Hide file tree
Showing 3 changed files with 343 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- `zimwriterfs` script that mimics zim-tools's zimwriterfs

### Changed
- Simplify type annotations by replacing Union and Optional with pipe character ("|") for improved readability and clarity

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ dev = [

[project.scripts]
fix_ogvjs_dist = "zimscraperlib.fix_ogvjs_dist:run"
zimwriterfs = "zimwriterfs:entrypoint"

[tool.hatch.version]
path = "src/zimscraperlib/__about__.py"
Expand Down
338 changes: 338 additions & 0 deletions src/zimwriterfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,338 @@
#!/usr/bin/env python3

""" zimwriterfs alternative implementation
requires libmagic1 and zimscraperlib (python)
WARN: the following zimwriterfs features are not implemented:
`--inflateHtml`
`--skip-libmagic-check`
"""

from __future__ import annotations

import argparse
import datetime
import logging
from pathlib import Path

from zimscraperlib.__about__ import __version__
from zimscraperlib.constants import (
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
RECOMMENDED_MAX_TITLE_LENGTH,
)
from zimscraperlib.logging import getLogger
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.filesystem import add_redirects_to_zim, add_to_zim

# Identifies this tool and its version; used below as the `--version` output
# and as the default value of the `--scraper` option.
SCRAPER = f"zimwriterfs(py) {__version__}"
# Module-wide logger, requested at INFO level.
logger = getLogger("zimwriterfs", level=logging.INFO)


def main(
    *,
    src_dir: str,
    dst_path: str,
    name: str,
    language: str,
    title: str,
    description: str,
    creator: str,
    publisher: str,
    illus_fname: str,
    tags: str,
    welcome: str,
    verbose: bool,
    threads: int,
    without_ft_index: bool,
    redirects_path: str | None = None,
    long_description: str | None = None,
    flavour: str | None = None,
    source: str | None = None,
    scraper: str | None = None,
    license_m: str | None = None,
    relation: str | None = None,
    cluster_size: int | None = None,
) -> None:
    """Pack the content of `src_dir` into a ZIM file written at `dst_path`.

    All parameters are keyword-only and mirror the command-line options
    declared in `entrypoint()`.

    Args:
        src_dir: directory holding the static website to pack.
        dst_path: path of the ZIM file to create.
        name, language, title, description, creator, publisher: mandatory
            ZIM metadata, forwarded as-is to `Creator.config_metadata`.
        illus_fname: path of the illustration, relative to `src_dir`.
        tags: semicolon-separated ZIM tags; an empty string means no tags.
        welcome: path of the main page, relative to `src_dir`.
        verbose: log at DEBUG level and turn on the creator's verbose mode.
        threads: number of workers handed to the creator.
        without_ft_index: when True, indexing is disabled on the creator.
        redirects_path: optional path to a TSV file listing redirects.
        long_description, flavour, source, scraper, license_m, relation:
            optional ZIM metadata, forwarded as-is.
        cluster_size: optional cluster size; only applied when truthy.

    Raises:
        OSError: if `src_dir` is not a directory, if the illustration is not
            a file, or if `redirects_path` is given but does not exist.
    """
    # Use a *local* alias. Rebinding the name `logger` inside this function
    # would make it function-local and raise UnboundLocalError on the very
    # first log call whenever `verbose` is False.
    log = getLogger("zimwriterfs", level=logging.DEBUG) if verbose else logger

    log.info(f"Dumping {src_dir} into {dst_path}")

    build_dir = Path(src_dir).expanduser().resolve()
    # is_dir() is already False for a missing path
    if not build_dir.is_dir():
        raise OSError(f"Incorrect build_dir: {build_dir}")

    illustration_fpath = build_dir / illus_fname
    # is_file() is already False for a missing path
    if not illustration_fpath.is_file():
        raise OSError(f"Incorrect illustration: {illus_fname} ({illustration_fpath})")
    illustration_data = illustration_fpath.read_bytes()

    # Validate the redirects file up-front so that a wrong path fails before
    # the creator is started rather than in the middle of the ZIM creation.
    redirects_fpath = Path(redirects_path).expanduser() if redirects_path else None
    if redirects_fpath is not None and not redirects_fpath.exists():
        raise OSError(f"Incorrect redirects file: {redirects_fpath}")

    zim_file = (
        Creator(
            filename=Path(dst_path).expanduser().resolve(),
            main_path=welcome,
            ignore_duplicates=False,
            disable_metadata_checks=False,
        )
        .config_indexing(not without_ft_index, language)
        .config_verbose(verbose)
        .config_nbworkers(threads)
    )

    if cluster_size:
        zim_file.config_clustersize(cluster_size)

    zim_file.config_metadata(
        Name=name,
        Language=language,
        Title=title,
        Description=description,
        LongDescription=long_description,
        Creator=creator,
        Publisher=publisher,
        Date=datetime.date.today(),  # noqa: DTZ011
        Illustration_48x48_at_1=illustration_data,
        # already a semicolon-separated string: forward it unchanged
        Tags=tags or None,
        Scraper=scraper,
        Flavour=flavour,
        Source=source,
        License=license_m,
        Relation=relation,
    )

    zim_file.start()
    try:
        log.debug(f"Preparing zimfile at {zim_file.filename}")

        # recursively add content from build_dir
        log.debug(f"Recursively adding files from {build_dir}")
        add_to_zim(build_dir, zim_file, build_dir)

        if redirects_fpath:
            log.debug("Creating redirects")
            add_redirects_to_zim(zim_file, redirects_file=redirects_fpath)

    # prevents .finish() which would create an incomplete .zim file
    # this would leave a .zim.tmp folder behind.
    # UPSTREAM: wait until a proper cancel() is provided
    except Exception:
        zim_file.can_finish = False  # pragma: no cover
        raise
    finally:
        # always called; `can_finish` set above is what signals a failed run
        zim_file.finish()


def entrypoint(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse the arguments and run `main()`.

    Args:
        argv: arguments to parse. Defaults to None, in which case argparse
            reads them from `sys.argv`.

    Raises:
        SystemExit: with code 1 when `main()` raises any exception (the
            exception is logged first). argparse itself also exits on
            `--help`, `--version` and invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="zimwriterfs",
        description="Create a ZIM file off a directory containing a static website",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""Notes:
- openZIM Metadata: https://wiki.openzim.org/wiki/Metadata
- Set `ZIMSCRAPERLIB_DEBUG=1` environ to see all files included in ZIM""",
    )

    # positional arguments
    parser.add_argument("src_dir")
    parser.add_argument("dst_path")

    parser.add_argument(
        "-V",
        "--version",
        help="Display version and exit",
        action="version",
        version=SCRAPER,
    )

    # mandatory options
    parser.add_argument(
        "-n",
        "--name",
        help="Human identifier for the content (see spec)",
        required=True,
    )

    parser.add_argument(
        "-t",
        "--title",
        help=f"Title of the content ({RECOMMENDED_MAX_TITLE_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-l",
        "--language",
        help="ISO-639-3 Language code of the content",
        required=True,
    )

    parser.add_argument(
        "-d",
        "--description",
        help="Description of the content "
        f"({MAXIMUM_DESCRIPTION_METADATA_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-w",
        "--welcome",
        help="Relative path to home/main page (ex: `home.html`)",
        required=True,
    )

    parser.add_argument(
        "-I",
        "--illustration",
        dest="illus_fname",
        help="Relative path to the ZIM's illustration. Must be a 48x48px PNG",
        required=True,
    )

    parser.add_argument(
        "-c",
        "--creator",
        help="Creator of the Content",
        required=True,
    )

    parser.add_argument(
        "-p",
        "--publisher",
        help="Publisher of the Content",
        required=True,
    )

    # optional options (argparse options are optional, defaulting to None,
    # unless stated otherwise)
    parser.add_argument(
        "-r",
        "--redirects",
        dest="redirects_path",
        help="Path to a TSV file containing a list of redirects (url title target_url)",
    )

    parser.add_argument(
        "-a",
        "--tags",
        help="ZIM Tags, semicolon separated",
        default="",
    )

    parser.add_argument(
        "-e",
        "--source",
        help="Content source URL",
    )

    parser.add_argument(
        "-o",
        "--flavour",
        help="custom (version independent) content flavour",
    )

    parser.add_argument(
        "-s",
        "--scraper",
        # was a copy-paste of the `--flavour` help text
        help="Name and version of the tool used to produce the content",
        default=SCRAPER,
    )

    parser.add_argument(
        "--license",
        help="License of the content",
        dest="license_m",
    )

    parser.add_argument(
        "--relation",
        help="URI of external related ressources ",
    )

    parser.add_argument(
        "-L",
        "--longDescription",
        dest="long_description",
        help="Longer description of the content "
        f"({MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH} chars max)",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        help="Whether to display processing details",
        action="store_true",
    )

    parser.add_argument(
        "-m",
        "--clusterSize",
        dest="cluster_size",
        type=int,
        help="Number of bytes per ZIM cluster (libzim defaults to 2MiB)",
    )

    parser.add_argument(
        "-J",
        "--threads",
        type=int,
        default=4,
        help="Number of threads/workers for libzim to use",
    )

    parser.add_argument(
        "-j",
        "--withoutFTIndex",
        dest="without_ft_index",
        action="store_true",
        help="Don't create and add a fulltext index of the content to the ZIM",
    )

    # NOTE: the two zimwriterfs options below are not implemented; their
    # declarations are kept disabled on purpose.

    # parser.add_argument(
    #     "-x",
    #     "--inflateHtml",
    #     dest="inflate_html",
    #     action="store_true",
    #     default=False,
    #     help="[NOT IMPLEMENTED] try to inflate HTML files before packing",
    #     required=False,
    # )

    # parser.add_argument(
    #     "--skip-libmagic-check",
    #     dest="skip_libmagic_check",
    #     action="store_true",
    #     default=False,
    #     help="Accept to run even if magic file cannot be loaded "
    #     + "(mimetypes in the zim file may be wrong)",
    #     required=False,
    # )

    args = parser.parse_args(argv)

    try:
        # every `dest` above matches a keyword-only parameter of main();
        # vars() is the public way to turn a Namespace into a dict
        main(**vars(args))
    except Exception as exc:
        logger.exception(exc)
        logger.error(f"FAILED. An error occured: {exc}")
        raise SystemExit(1) from exc


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    entrypoint()

0 comments on commit a288082

Please sign in to comment.