From 92c48d732db5fb07d21aba600f34de58ded9a38e Mon Sep 17 00:00:00 2001 From: Zac Spitzer Date: Fri, 8 Sep 2023 09:46:56 +0200 Subject: [PATCH] LDEV-3048 add support for action="extractImages" (#53) workaround to return extracted image in correct order --- .github/workflows/main-5.4.yml | 1 + .github/workflows/main.yml | 3 +- .../src/org/lucee/extension/pdf/tag/PDF.java | 25 +++- .../org/lucee/extension/pdf/util/PDFUtil.java | 51 +++++++- tests/LDEV3048.cfc | 120 ++++++++++++++++++ tests/LDEV967.cfc | 12 ++ 6 files changed, 207 insertions(+), 5 deletions(-) create mode 100644 tests/LDEV3048.cfc diff --git a/.github/workflows/main-5.4.yml b/.github/workflows/main-5.4.yml index 2ec2ee7..50fa7e3 100644 --- a/.github/workflows/main-5.4.yml +++ b/.github/workflows/main-5.4.yml @@ -54,6 +54,7 @@ jobs: luceeVersion: ${{ env.luceeVersion }} luceeVersionQuery: ${{ env.luceeVersionQuery }} extensionDir: ${{ github.workspace }}/dist + extensions: B737ABC4-D43F-4D91-8E8E973E37C40D1B # image-ext for tests env: testLabels: pdf testAdditional: ${{ github.workspace }}/tests diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d8bca11..0c310ca 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -53,7 +53,8 @@ jobs: execute: /bootstrap-tests.cfm luceeVersion: ${{ env.luceeVersion }} luceeVersionQuery: ${{ env.luceeVersionQuery }} - extensionDir: ${{ github.workspace }}/dist + extensionDir: ${{ github.workspace }}/dist + extensions: B737ABC4-D43F-4D91-8E8E973E37C40D1B # image-ext for tests env: testLabels: pdf testAdditional: ${{ github.workspace }}/tests diff --git a/source/java/src/org/lucee/extension/pdf/tag/PDF.java b/source/java/src/org/lucee/extension/pdf/tag/PDF.java index 01b3532..e89863d 100644 --- a/source/java/src/org/lucee/extension/pdf/tag/PDF.java +++ b/source/java/src/org/lucee/extension/pdf/tag/PDF.java @@ -34,6 +34,7 @@ import java.util.Map.Entry; import java.util.Set; +import 
org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.lucee.extension.pdf.PDFStruct; import org.lucee.extension.pdf.util.PDFUtil; @@ -82,6 +83,7 @@ public class PDF extends BodyTagImpl { private static final int ACTION_ADD_HEADER = 12; private static final int ACTION_ADD_FOOTER = 13; private static final int ACTION_OPEN = 14; + private static final int ACTION_EXTRACT_IMAGES = 15; private static final String FORMAT_JPG = "jpg"; private static final String FORMAT_TIFF = "tiff"; @@ -309,9 +311,12 @@ public void setAction(String strAction) throws PageException { else if ("extract_text".equals(strAction)) action = ACTION_EXTRACT_TEXT; else if ("addheader".equals(strAction)) action = ACTION_ADD_HEADER; else if ("addfooter".equals(strAction)) action = ACTION_ADD_FOOTER; + else if ("extractimages".equals(strAction)) action = ACTION_EXTRACT_IMAGES; + else if ("extract-images".equals(strAction)) action = ACTION_EXTRACT_IMAGES; + else if ("extract_images".equals(strAction)) action = ACTION_EXTRACT_IMAGES; else throw engine.getExceptionUtil().createApplicationException( - "Invalid PDF action [" + strAction + "], supported actions are " + "[addHeader, addFooter, addWatermark, deletePages, extractText, getInfo, merge, open, " + "Invalid PDF action [" + strAction + "], supported actions are " + "[addHeader, addFooter, addWatermark, deletePages, extractText, extractImage, getInfo, merge, open, " + "removePassword, protect, read, removeWatermark, setInfo, thumbnail, write]"); } @@ -692,9 +697,11 @@ public int doEndTag() throws PageException { else if (ACTION_PROTECT == action) doActionProtect(true); else if (ACTION_OPEN == action) doActionProtect(false); else if (ACTION_THUMBNAIL == action) doActionThumbnail(); + else if (ACTION_EXTRACT_IMAGES == action) doActionExtractImages(); else if (ACTION_EXTRACT_TEXT == action) { doActionExtractText(); } + // else if(ACTION_PROCESSDDX==action) throw // engine.getExceptionUtil().createApplicationException("action 
[processddx] not supported"); @@ -933,7 +940,7 @@ private void doActionThumbnail() throws PageException, IOException, DocumentExce Resource resource; if (imagePrefix == null) imagePrefix = (resource = doc.getResource()) != null ? getName(resource.getName()): "thumbnail"; - PDFUtil.thumbnail(pageContext, doc, destination.toString(), pageSet, format, imagePrefix, scale); + PDFUtil.thumbnail(pageContext, doc, destination.toString(), pageSet, format, imagePrefix, scale, overwrite); } finally { reader.close(); @@ -1446,6 +1453,20 @@ private void doActionExtractText() throws PageException, IOException { } } + private void doActionExtractImages() throws PageException, IOException, InvalidPasswordException { + required("pdf", "extractImages", "source", source); + required("pdf", "extractImages", "destination", destination); + required("pdf", "extractImages", "imagePrefix", imagePrefix); + required("pdf", "extractImages", "format", format); + PDFStruct doc = toPDFDocument(source, password, null); + PdfReader reader = doc.getPdfReader(); + int len = reader.getNumberOfPages(); + if (pages == null || pages.equals("*")) pages = "1-" + len + ""; + Set pageSet = PDFUtil.parsePageDefinition(pages, len); + + PDFUtil.extractImages(pageContext,doc,pageSet,destination,imagePrefix, format, overwrite); + } + private Object allowed(boolean encrypted, int permissions, int permission) { return (!encrypted || (permissions & permission) > 0) ? 
"Allowed" : "Not Allowed"; } diff --git a/source/java/src/org/lucee/extension/pdf/util/PDFUtil.java b/source/java/src/org/lucee/extension/pdf/util/PDFUtil.java index b2ea912..360d93d 100755 --- a/source/java/src/org/lucee/extension/pdf/util/PDFUtil.java +++ b/source/java/src/org/lucee/extension/pdf/util/PDFUtil.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -33,8 +34,13 @@ import java.util.Map; import java.util.Set; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import org.lucee.extension.pdf.PDFStruct; @@ -384,7 +390,7 @@ public static Object extractText(PDFStruct doc, Set pageNumbers, int ty // return pdDoc.getDocumentCatalog().getAllPages().get(2); } - public static void thumbnail(PageContext pc, PDFStruct doc, String destination, Set pageNumbers, String format, String imagePrefix, int scale) throws IOException { + public static void thumbnail(PageContext pc, PDFStruct doc, String destination, Set pageNumbers, String format, String imagePrefix, int scale, boolean overwrite) throws IOException { CFMLEngine engine = CFMLEngineFactory.getInstance(); @@ -406,7 +412,48 @@ public static void thumbnail(PageContext pc, PDFStruct doc, String destination, BufferedImage thumbnailImage = pdfRender.renderImageWithDPI(p - 1, scale); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ImageIO.write(thumbnailImage, format, baos); // this one not support .tiff format - engine.getIOUtil().copy(new 
ByteArrayInputStream(baos.toByteArray()), engine.getResourceUtil().toResourceNotExisting(pc, imageDestination), true); + Resource res = engine.getResourceUtil().toResourceNotExisting(pc, imageDestination); + if (res.exists() && !overwrite) throw new RuntimeException("Thumbnail image file already exists [" + imageDestination + "] and overwrite was false"); + engine.getIOUtil().copy(new ByteArrayInputStream(baos.toByteArray()), res, true); } } + + public static void extractImages(PageContext pc,PDFStruct doc, Set pageNumbers,Resource destination, String imagePrefix, String format, boolean overwrite) throws IOException, InvalidPasswordException,PageException { + + PDDocument pdDoc = doc.toPDDocument(); + int n = pdDoc.getNumberOfPages(); + Iterator it = pageNumbers.iterator(); + int p; + PDPageTree pages= pdDoc.getPages(); + int i = 1; + while (it.hasNext()) { + p = it.next(); + if (p > n) throw new RuntimeException("pdf page size [" + p + "] out of range, maximum page size is [" + n + "]"); + PDResources pdResources = pages.get(p - 1).getResources(); + + // workaround, getXObjectNames() returns images in reverse order + ArrayList xObjectNamesReversed = new ArrayList<>(); + for (COSName name : pdResources.getXObjectNames()) { + xObjectNamesReversed.add(name); + } + Collections.reverse(xObjectNamesReversed); + + for (COSName name : xObjectNamesReversed) { + PDXObject o = pdResources.getXObject(name); + + if (o instanceof PDImageXObject) { + PDImageXObject image = (PDImageXObject)o; + String filename = destination + "/" + imagePrefix + "-" + i + "." 
+ format; + CFMLEngine engine = CFMLEngineFactory.getInstance(); + Resource res = engine.getResourceUtil().toResourceNotExisting(pc,filename); + if (res.exists() && !overwrite) throw new RuntimeException("image file already exists [" + filename + "] and overwrite was false"); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ImageIO.write(image.getImage(), format, baos); + CFMLEngineFactory.getInstance().getIOUtil().copy(new ByteArrayInputStream(baos.toByteArray()),res.getOutputStream(),true, true); + i++; + } + } + } + + } } diff --git a/tests/LDEV3048.cfc b/tests/LDEV3048.cfc new file mode 100644 index 0000000..2c673a6 --- /dev/null +++ b/tests/LDEV3048.cfc @@ -0,0 +1,120 @@ +component extends="org.lucee.cfml.test.LuceeTestCase" labels="pdf" { + + function beforeAll() { + variables.outputDir = getDirectoryFromPath(getCurrentTemplatePath()) & "LDEV3048_images\"; + if (!directoryExists( variables.outputDir ) ) + directoryCreate( variables.outputDir ); + + var img1file = getTempFile( variables.outputDir, "ldev3048-1", "png" ); + var img2file = getTempFile( variables.outputDir, "ldev3048-2", "png" ); + var img3file = getTempFile( variables.outputDir, "ldev3048-3", "png" ); + + var img1 = ImageNew("", 111, 111, "rgb", "red"); + var img2 = ImageNew("", 222, 222, "rgb", "yellow"); + var img3 = ImageNew("", 333, 333, "rgb", "green"); + + imageWrite(img1, img1file, true); + imageWrite(img2, img2file, true); + imageWrite(img3, img3file, true); + + if (!directoryExists(variables.outputDir)) directoryCreate(variables.outputDir); + document fileName="#variables.outputDir#noImages.pdf" name="pdfVar" overwrite=true { + writeoutput("test pdf file"); + } + + document fileName="#variables.outputDir#withImages.pdf" name="pdfVar" overwrite=true { + ``` + + + + + + LDEV-3048 + + + +
+ +
+
+ + +
+ + +
+ ``` + } + fileDelete(img1File); + fileDelete(img2File); + fileDelete(img3File); + } + + + + function run( testResults , testBox ) { + describe( "testcase for LDEV-3048", function() { + + it( title="cfpdf extractImages, pdf with no images", body=function( currentSpec ) { + pdf action="extractImages" source="#outputDir#noImages.pdf" + overwrite="true" format="png" imageprefix="no-image" password="" + destination="#outputDir#"; + + var imageFiles = directoryList( path=outputDir, filter="no-image*.png" ); + + expect( len( imageFiles ) ).toBe( 0 ); + }); + + it( title="cfpdf extractImages, pdf with 2 images, 1 per page", body=function( currentSpec ) { + pdf action="extractImages" source="#outputDir#withImages.pdf" pages="*" + overwrite="true" format="png" imageprefix="two-image" password="" + destination="#outputDir#"; + + var imageFiles = directoryList( path=outputDir, filter="two-image*.png" ); + + expect( len( imageFiles ) ).toBe( 3 ); + var imgInfo = ImageInfo( outputDir & "two-image-1.png" ); + expect( imgInfo.height ).toBe( 111 ); + expect( imgInfo.width ).toBe( 111 ); + + }); + + it( title="cfpdf extractImages, pdf with 2 images, 1 per page, only from page 2", body=function( currentSpec ) { + pdf action="extractImages" source="#outputDir#withImages.pdf" pages="2" + overwrite="true" format="png" imageprefix="page-image" password="" + destination="#outputDir#"; + + var imageFiles = directoryList( path=outputDir, filter="page-image*.png" ); + + expect( len( imageFiles ) ).toBe( 2 ); + var imgInfo = ImageInfo( outputDir & "page-image-1.png" ); + expect( imgInfo.height ).toBe( 222 ); + expect( imgInfo.width ).toBe( 222 ); + + expect(function(){ + pdf action="extractImages" source="#outputDir#withImages.pdf" pages="2" + overwrite="false" format="png" imageprefix="page-image" password="" + destination="#outputDir#"; + }).toThrow(); // overwrite="false" and images already exist + }); + + it( title="cfpdf extractImages, invalid image format", body=function( currentSpec 
) { + expect(function(){ + pdf action="extractImages" source="#outputDir#withImages.pdf" pages="2" + overwrite="true" format="monkey" imageprefix="invalid-image" password="" + destination="#outputDir#"; + }).toThrow(); + }); + + }); + } + + function afterAll() { + if ( directoryExists( variables.outputDir ) ) + directoryDelete(variables.outputDir, true); + } +} diff --git a/tests/LDEV967.cfc b/tests/LDEV967.cfc index d0356c6..60cbd44 100644 --- a/tests/LDEV967.cfc +++ b/tests/LDEV967.cfc @@ -65,6 +65,18 @@ component extends = "org.lucee.cfml.test.LuceeTestCase" labels="pdf" { expect(arrayEvery(imgFiles, (e) => { return listLast(e,".") == "png"})).toBeTrue(); }); + it(title="CFPDF action=thumbnail - overwrite false", body=function( currentSpec ) { + pdf action="thumbnail" source="#res#" overwrite="true" destination="#variables.thumbnaildir#" imageprefix="thumbImage"; + var imgFiles = directoryList( path="#variables.thumbnaildir#", listInfo="name"); + imgFiles.sort("text"); + expect(imgFiles[1]).toBe("thumbImage_page_1.jpg"); + expect(arrayEvery(imgFiles, (e) => { return find("thumbImage", e)})).toBeTrue(); + + expect(function(){ + pdf action="thumbnail" source="#res#" overwrite="false" destination="#variables.thumbnaildir#" imageprefix="thumbImage"; + }).toThrow(); // overwrite is false and file exists + }); + }); }