From ebe3c5055514a2955a3459df01527d3fa89327ce Mon Sep 17 00:00:00 2001 From: Micah Snyder Date: Mon, 15 Jan 2024 23:03:02 -0500 Subject: [PATCH] PDF: Minor optimizations Store temp files with obj id and gen id so analysts know which is which. Don't dump decoded objects immediately. They'll get dumped later at the end of pdf_extract_obj(). At the end of PDF object extraction, we don't need to find out the "dumpid" (aka the object index in our list of pdf objects). It isn't actually used! So I removed the unused parameter. --- libclamav/pdf.c | 25 +++++++++-------------- libclamav/pdfdecode.c | 47 ------------------------------------------- 2 files changed, 9 insertions(+), 63 deletions(-) diff --git a/libclamav/pdf.c b/libclamav/pdf.c index ab4e08d428..059101c19c 100644 --- a/libclamav/pdf.c +++ b/libclamav/pdf.c @@ -1040,15 +1040,13 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha #define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION)) -static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid) +static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd) { int ret; struct cli_bc_ctx *bc_ctx; cli_ctx *ctx = NULL; fmap_t *map; - UNUSEDPARAM(dumpid); - if (NULL == pdf) return CL_EARG; @@ -1387,7 +1385,7 @@ static void process(struct text_norm_state *s, enum cstate *st, const char *buf, } while (length > 0); } -static int pdf_scan_contents(int fd, struct pdf_struct *pdf) +static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj) { struct text_norm_state s; char fullname[1024]; @@ -1398,7 +1396,7 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf) cl_error_t rc; enum cstate st = CSTATE_NONE; - snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf%02u_c", pdf->dir, 
(pdf->files - 1)); + snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d contents", pdf->dir, obj->id >> 8, obj->id & 0xff); fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600); if (fout < 0) { char err[128]; @@ -1481,7 +1479,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff); - snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf%02u", pdf->dir, pdf->files++); + snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d", pdf->dir, obj->id >> 8, obj->id & 0xff); fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600); if (fout < 0) { char err[128]; @@ -1839,12 +1837,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t } if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) { - unsigned int dumpid = 0; - for (dumpid = 0; dumpid < pdf->nobjs; dumpid++) { - if (pdf->objs[dumpid] == obj) - break; - } - rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, dumpid); + rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout); if (rc2 == CL_VIRUS) { rc = rc2; goto really_done; @@ -1855,7 +1848,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t lseek(fout, 0, SEEK_SET); cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff); - rc2 = pdf_scan_contents(fout, pdf); + rc2 = pdf_scan_contents(fout, pdf, obj); if (rc2 != CL_SUCCESS) { rc = rc2; goto really_done; @@ -3644,7 +3637,7 @@ static cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf) } if (CL_SUCCESS == status) { - status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1, -1); + status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1); cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status); } @@ -3880,7 +3873,7 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) pdf.startoff = offset; - rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, 
-1, -1); + rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1); if (CL_SUCCESS != rc) { cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc); @@ -3909,7 +3902,7 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) if (pdf.flags && CL_SUCCESS == rc) { cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags); - rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1, -1); + rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1); if (CL_SUCCESS == rc && SCAN_HEURISTICS && (ctx->dconf->other & OTHER_CONF_PDFNAMEOBJ)) { if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) { diff --git a/libclamav/pdfdecode.c b/libclamav/pdfdecode.c index 0893a7f596..e6c5ef6162 100644 --- a/libclamav/pdfdecode.c +++ b/libclamav/pdfdecode.c @@ -78,7 +78,6 @@ struct pdf_token { }; static size_t pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int fout, cl_error_t *status, struct objstm_struct *objstm); -static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, uint32_t lvl); static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token); static cl_error_t filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token); @@ -338,13 +337,6 @@ static size_t pdf_decodestream_internal( break; } token->success++; - - /* Dump the stream content to a text file if keeptmp is enabled. */ - if (pdf->ctx->engine->keeptmp) { - if (CL_SUCCESS != pdf_decode_dump(pdf, obj, token, i + 1)) { - cli_errmsg("pdf_decodestream_internal: failed to write decoded stream content to temp file\n"); - } - } } if ((token->success > 0) && (NULL != token->content)) { @@ -399,45 +391,6 @@ static size_t pdf_decodestream_internal( return bytes_scanned; } -/** - * @brief Dump PDF filter content such as stream contents to a temp file. - * - * Temp file is created in the pdf->dir directory. - * Filename format is "pdf<pdf->files-1>_<lvl>". - * - * @param pdf Pdf context structure. 
- * @param obj The object we found the filter content in. - * @param token The struct for the filter contents. - * @param lvl A unique index to distinguish the files from each other. - * @return cl_error_t - */ -static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, uint32_t lvl) -{ - char fname[1024]; - int ifd; - - snprintf(fname, sizeof(fname), "%s" PATHSEP "pdf%02u_%02u", pdf->dir, (pdf->files - 1), lvl); - ifd = open(fname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600); - if (ifd < 0) { - char err[128]; - - cli_errmsg("cli_pdf: can't create intermediate temporary file %s: %s\n", fname, cli_strerror(errno, err, sizeof(err))); - return CL_ETMPFILE; - } - - cli_dbgmsg("cli_pdf: decoded filter %u obj %u %u\n", lvl, obj->id >> 8, obj->id & 0xff); - cli_dbgmsg(" ... to %s\n", fname); - - if (cli_writen(ifd, token->content, token->length) != token->length) { - cli_errmsg("cli_pdf: failed to write output file\n"); - close(ifd); - return CL_EWRITE; - } - - close(ifd); - return CL_SUCCESS; -} - /* * ascii85 inflation * See http://www.piclist.com/techref/method/encode.htm (look for base85)