Skip to content

Commit

Permalink
Merge pull request #230 from hakonhagland/filter_meta
Browse files Browse the repository at this point in the history
Added script to clean up meta files
  • Loading branch information
lisajulia authored Apr 15, 2024
2 parents 70676d4 + b739463 commit 06c7c35
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 35 deletions.
1 change: 1 addition & 0 deletions scripts/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ fodt-split-subdocument = "fodt.split_subdocument:split_subdocument"
fodt-splitter = "fodt.splitter:main"
fodt-validate-document = "fodt.validate_automatic_styles:validate"
fodt-xml-sax-filter-all = "fodt.xml_sax_filter_all:xml_sax_filter_all"
fodt-xml-sax-filter-meta = "fodt.xml_filter_meta:xml_sax_filter_meta"

[build-system]
requires = ["poetry-core"]
Expand Down
61 changes: 61 additions & 0 deletions scripts/python/src/fodt/xml_filter_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
import xml.sax.saxutils
from pathlib import Path

import click

from fodt.constants import ClickOptions, Directories, FileExtensions
from fodt.xml_handlers import PassThroughFilterHandler


class FilterAll:
    """Run the xml.sax pass-through filter over the meta section files.

    The files are looked up in
    ``<maindir>/<Directories.meta>/<Directories.sections>`` and every
    ``*.xml`` file found there is rewritten in place.
    """

    def __init__(self, maindir: str) -> None:
        """Store the main directory as a ``Path``.

        :param maindir: Main directory that contains the meta directory.
        """
        self.maindir = Path(maindir)

    def run_filter(self) -> None:
        """Filter every ``*.xml`` file in the meta sections directory.

        Logs a message and returns without doing anything when the
        directory does not exist.
        """
        meta_dir = self.maindir / Directories.meta / Directories.sections
        if not meta_dir.is_dir():
            logging.info("Directory %s does not exist.", meta_dir)
            return
        # Take a sorted snapshot of the listing up front: the files are
        # rewritten while we loop, and a fixed order keeps the log output
        # reproducible between runs.
        for filename in sorted(meta_dir.glob("*.xml")):
            logging.info("Processing file: %s", filename)
            self.filter_file(filename)

    def filter_file(self, filename: Path) -> None:
        """Parse ``filename`` and overwrite it with the handler's output.

        :param filename: XML file to filter in place.
        """
        parser = xml.sax.make_parser()
        handler = PassThroughFilterHandler()
        parser.setContentHandler(handler)
        # The file is parsed completely before it is opened for writing, so
        # an exception raised by the parser leaves the file untouched.
        parser.parse(filename)
        with open(filename, "w", encoding='utf8') as f:
            f.write(handler.get_content())



# USAGE:
#
# fodt-xml-sax-filter-meta \
#    --maindir=<main directory>
#
# DESCRIPTION:
#
# Runs the xml.sax pass-through filter on all xml files in the
# parts/meta/sections directory. The files in this directory are used by,
# among others, the fodt-add-keyword script.
# This means that each xml file is read by the xml.sax parser, and
# the content is then written back to the file using xml.sax.saxutils.escape()
# to escape the content.
# This is useful to check for inconsistencies between the XML content written by
# LibreOffice and the content written by the xml.sax parser, and to initially align
# the XML content with the format written by LibreOffice.
#
@click.command()
@ClickOptions.maindir(required=False)
def xml_sax_filter_meta(maindir: str) -> None:
    """Filter all xml files in the meta dir."""
    # Configure logging first so the per-file INFO messages are shown.
    logging.basicConfig(level=logging.INFO)
    meta_filter = FilterAll(maindir)
    meta_filter.run_filter()
36 changes: 36 additions & 0 deletions scripts/python/src/fodt/xml_handlers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,45 @@
import io
import re
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
import xml.sax.saxutils

from fodt.xml_helpers import XMLHelper

class PassThroughFilterHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that re-serializes the parsed document to a string.

    Only the document start, element start/end and character data events
    are handled; other SAX events are not written to the output. The
    closing ``>`` of a start tag is deferred until the next event so that an
    element without any content can be written in the short ``<tag/>`` form.
    """

    def __init__(self) -> None:
        # Let the base class set up its own state before adding ours.
        super().__init__()
        # Buffer collecting the serialized document.
        self.content = io.StringIO()
        # True while the most recent start tag has been written without its
        # closing ">"; endElement() then closes the element with "/>".
        self.start_tag_open = False

    def _close_pending_start_tag(self) -> None:
        """Write the deferred ``>`` of the last start tag, if there is one."""
        if self.start_tag_open:
            self.content.write(">")
            self.start_tag_open = False

    def characters(self, content: str) -> None:
        """Write escaped character data, closing a pending start tag first."""
        # NOTE: characters() is only called if there is content between the start
        #   tag and the end tag. If there is no content, characters() is not called.
        self._close_pending_start_tag()
        self.content.write(XMLHelper.escape(content))

    def endElement(self, name: str) -> None:
        """Close element ``name``: ``/>`` if it had no content, else an end tag."""
        if self.start_tag_open:
            self.content.write("/>")
            self.start_tag_open = False
        else:
            self.content.write(XMLHelper.endtag(name))

    def get_content(self) -> str:
        """Return everything written to the buffer so far."""
        return self.content.getvalue()

    def startDocument(self) -> None:
        """Write the XML header at the start of the document."""
        self.content.write(XMLHelper.header)

    def startElement(self, name: str, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
        """Write the start tag for ``name`` but leave its closing ``>`` pending."""
        # A still-open parent start tag must be closed before a child starts.
        self._close_pending_start_tag()
        self.start_tag_open = True
        self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class GetUsedStylesHandler(xml.sax.handler.ContentHandler):
def __init__(self) -> None:
# The values of the dict below list the attribute-names where the style is used.
Expand Down
37 changes: 2 additions & 35 deletions scripts/python/src/fodt/xml_sax_filter_all.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import io
import logging
import xml.sax
import xml.sax.handler
Expand All @@ -9,39 +8,7 @@
import click

from fodt.constants import ClickOptions
from fodt.xml_helpers import XMLHelper

class ElementHandler(xml.sax.handler.ContentHandler):
    """Content handler that writes the SAX events it receives to a text buffer.

    A start tag is written without its final ``>``; the next event decides
    whether it is finished with ``>`` (content or a child follows) or with
    ``/>`` (the element ends immediately).
    """

    def __init__(self) -> None:
        # Set while a start tag is still waiting for its ">" or "/>".
        self.start_tag_open = False
        self.content = io.StringIO()

    def startDocument(self):
        """Emit the XML header."""
        self.content.write(XMLHelper.header)

    def startElement(self, name: str, attrs: xml.sax.xmlreader.AttributesImpl):
        """Emit the start tag of ``name`` and mark it as still open."""
        out = self.content
        if self.start_tag_open:
            # The enclosing element has a child, so finish its start tag.
            out.write(">")
        self.start_tag_open = True
        out.write(XMLHelper.starttag(name, attrs, close_tag=False))

    def characters(self, content: str):
        """Emit escaped text, finishing an open start tag first."""
        out = self.content
        if self.start_tag_open:
            # Text arrived, so the element is not empty: finish its start tag.
            out.write(">")
            self.start_tag_open = False
        out.write(XMLHelper.escape(content))

    def endElement(self, name: str):
        """Emit the end of ``name``, using ``/>`` when nothing followed its start tag."""
        if not self.start_tag_open:
            self.content.write(XMLHelper.endtag(name))
            return
        self.content.write("/>")
        self.start_tag_open = False

    def get_content(self) -> str:
        """Return the text accumulated in the buffer."""
        return self.content.getvalue()
from fodt.xml_handlers import PassThroughFilterHandler


class FilterAll:
Expand All @@ -57,7 +24,7 @@ def run_filter(self) -> None:

def filter_file(self, filename: Path) -> None:
parser = xml.sax.make_parser()
handler = ElementHandler()
handler = PassThroughFilterHandler()
parser.setContentHandler(handler)
parser.parse(filename)
with open(filename, "w", encoding='utf8') as f:
Expand Down

0 comments on commit 06c7c35

Please sign in to comment.