Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve diffs on large documents #196

Merged
merged 8 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 51 additions & 11 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -490,21 +490,61 @@ do_md_fixups() {
# TODO: Turn this into a Pandoc filter.
sed -i.bak '0,/\\tableofcontents/s/^# \(.*\)/\\section*\{\U\1\}/g' "${input}"
}
do_tex_fixups() {

# latexdiff is pretty great, but it has some incompatibilities with our template, so we
# unfortunately have to do a lot of massaging of the diff .tex file here.
# In the future, we should explore whether latexdiff can be further configured, or
# our custom extensions can be redesigned to avoid some of these problems.
do_diff_tex_fixups() {
# Rewrites, in place, the latexdiff-generated .tex file so that it works with our template.
# Arguments: $1 - path to the .tex file to modify.
# Every step edits the file named by $1 directly; the sed steps use -i.bak, and the
# perl step writes to "${input}".bak and then moves it back over the input.
# The steps are order-dependent: later patterns assume the output of earlier ones.
local input=$1
# latexdiff is appending its own generated preamble to our custom one
# (in apparent contradiction of the documentation). Strip it out.
sed -i.bak '/^% End Custom TCG/,/^%DIF END PREAMBLE EXTENSION/d' "${input}"

# latexdiff uses %DIF < and %DIF > to prefix changed lines in code environments
# prefix these lines with + and -
sed -i.bak 's/^%DIF < /%DIF <- /g' "${input}"
sed -i.bak 's/^%DIF > /%DIF >+ /g' "${input}"

# latexdiff's \DIFaddbegin absorbs a space before it.
# This is fairly common (e.g., in the case of an added sentence)
# Preserve them by inserting a space after.
# (The replacement text appends " ~" after the marker.)
sed -i.bak 's/ \\DIFaddbegin/ \\DIFaddbegin ~/g' "${input}"
# prefix these lines with + and - and replace %DIF with DIFDIFDIFDIF (inside DIFverbatim) so that
# we don't delete the verbatim diff markers when we delete comments below.
# NOTE(review): the two substitutions near the top of this function already rewrite
# "%DIF < " / "%DIF > " to "%DIF <- " / "%DIF >+ " on every line, after which these
# patterns can no longer match -- confirm whether both passes are meant to coexist.
sed -i.bak '/\\begin{DIFverbatim}/,/\\end{DIFverbatim}/s/^%DIF < /DIFDIFDIFDIF <- /g' "${input}"
sed -i.bak '/\\begin{DIFverbatim}/,/\\end{DIFverbatim}/s/^%DIF > /DIFDIFDIFDIF >+ /g' "${input}"

# Remove all block begin and end markers after the beginning of the document. See latexdiff.tex for some discussion on this.
# TL;DR: the begin and end markers get put into tricky places, and we don't need to do anything inside those commands.
# Only markers followed by a space are matched; the address range limits this to
# lines from \begin{document} through the end of the file.
sed -i.bak '/^\\begin{document}/,$s/\\DIF\(add\|del\|mod\)\(begin\|end\)\(FL\|\) //g ' "${input}"

# latexdiff erroneously puts \DIFadd inside the second argument to \multicolumn.
# Move it out.
# The \DIFadd argument may contain at most one nested {...} group for this to match.
sed -i.bak 's/\\multicolumn{\([^{}]*\)}{\\DIFadd{\([^{}]*\|[^{}]*{[^{}]*}\)}}/\\multicolumn{\1}{\2}/g' "${input}"

# Delete all lines containing only comments.
sed -i.bak '/^\s*%.*$/d' "${input}"

# Strip comments (everything after unescaped percent signs) inside of xltabular to make the below steps easier.
# The second command blanks any line (anywhere in the file) that starts with a percent sign.
sed -i.bak '/\\begin{xltabular}/,/\\end{xltabular}/s/\([^\\]\)%.*$/\1/g' "${input}"
sed -i.bak 's/^%.*$//g' "${input}"

# Combine lines inside of the xltabular environment so that (non-empty) lines all end in \\ or \\*
# The perl range operator yields a sequence number ($s) while inside the environment;
# $s > 1 skips the \begin{xltabular} line itself and $e marks the \end{xltabular} line.
# On the remaining lines that do not end in \\ or \\*, the newline becomes a space.
perl -ne 's/\n/ / if $s = /\\begin{xltabular}/ .. ($e = /\\end{xltabular}/)
and $s > 1 and !$e and !/.*\\\\$/ and !/.*\\\\\*$/;
print' < "${input}" > "${input}".bak && mv "${input}".bak "${input}"

# Put newlines after \endhead, \endfirsthead, \endfoot, and \endlastfoot
sed -i.bak 's/\(\\end\(head\|firsthead\|foot\|lastfoot\)\)/\1\n/g' "${input}"

# latexdiff inserts its markers before \multicolumn sometimes.
# The \multicolumn needs to be the first thing in the cell.
# Swap the order of any \DIF stuff and \multicolumn invocation inside a cell.
# [^&]* keeps the match from crossing a column separator.
sed -i.bak 's/\(\\DIF[^&]*\)\(\\multicolumn{[^{}]*}\({[^{}]*}\|{[^{}]*{[^{}]*}}\)\)/\2\1/g' "${input}"

# latexdiff inserts its markers before \hline sometimes.
# After the transformations above, \hline needs to be the first thing in a line of text.
# Leading whitespace is kept in front; the matched \hline / \hlineifmdframed is moved
# ahead of whatever text preceded it on the line.
sed -i.bak 's/\(\s*\)\(.*\)\(\\hline \|\\hlineifmdframed \)\(.*\)/\1\3\2\4/g' "${input}"

# latexdiff inside of \texttt breaks. Prefer \ttfamily.
sed -i.bak 's/\\texttt{/{\\ttfamily /g' "${input}"

# Delete all empty DIFadd/mod/del
# NOTE(review): as written, the optional FL suffix is matched after the empty braces
# (e.g. \DIFadd{}FL) rather than before them (\DIFaddFL{}) -- confirm which is intended.
sed -i.bak 's/\\DIF\(add\|del\|mod\){}\(FL\|\)//g' "${input}"

}

if test "${DO_GITVERSION}" == "yes"; then
Expand Down Expand Up @@ -819,15 +859,15 @@ if [ -n "${DIFFPDF_OUTPUT}" -o -n "${DIFFTEX_OUTPUT}" ]; then
do_latex "${BUILD_DIR}/${INPUT_FILE}" "${TEMP_DIFFBASE_TEX_FILE}" "${EXTRA_PANDOC_OPTIONS} -V keepstaleimages=true"
echo "Running latexdiff... (this may take a while for complex changes)"
start=$(date +%s)
latexdiff-fast --preamble /resources/templates/latexdiff.tex --config /resources/templates/latexdiff.cfg --append-safecmd /resources/templates/latexdiff.safe --exclude-safecmd /resources/templates/latexdiff.unsafe "${TEMP_DIFFBASE_TEX_FILE}" "${TEMP_TEX_FILE}" > "${TEMP_DIFF_TEX_FILE}" 2>"${TEMP_LATEXDIFF_LOG}"
latexdiff-fast --math-markup=whole --preamble /resources/templates/latexdiff.tex --config /resources/templates/latexdiff.cfg --append-safecmd /resources/templates/latexdiff.safe --exclude-safecmd /resources/templates/latexdiff.unsafe "${TEMP_DIFFBASE_TEX_FILE}" "${TEMP_TEX_FILE}" > "${TEMP_DIFF_TEX_FILE}" 2>"${TEMP_LATEXDIFF_LOG}"
end=$(date +%s)
echo "Elapsed time: $(($end-$start)) seconds"
if [ $? -ne 0 ]; then
FAILED=true
>&2 cat "${TEMP_LATEXDIFF_LOG}"
echo "latexdiff failed"
else
do_tex_fixups "${TEMP_DIFF_TEX_FILE}"
do_diff_tex_fixups "${TEMP_DIFF_TEX_FILE}"
if [ -n "${DIFFTEX_OUTPUT}" ]; then
mkdir -p "$(dirname ${SOURCE_DIR}/${DIFFTEX_OUTPUT})"
cp "${TEMP_DIFF_TEX_FILE}" "${SOURCE_DIR}/${DIFFTEX_OUTPUT}"
Expand Down
Loading