Skip to content

Commit

Permalink
PDF: Minor optimizations
Browse files Browse the repository at this point in the history
Name temp files after the object ID and generation ID so analysts can tell which file belongs to which PDF object.

Don't dump decoded objects immediately. They'll get dumped later at the
end of pdf_extract_obj().

At the end of PDF object extraction, we no longer need to look up the
"dumpid" (i.e. the object's index in our list of PDF objects).
It was never actually used, so I removed the unused parameter.
  • Loading branch information
micahsnyder committed Jan 22, 2024
1 parent 35f277c commit ebe3c50
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 63 deletions.
25 changes: 9 additions & 16 deletions libclamav/pdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1040,15 +1040,13 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha

/*
 * Bitmask built by OR-ing together one bit per listed object flag
 * (contents, the Flate/DCT/ASCIIHex/ASCII85 filters, embedded file,
 * JavaScript, OpenAction, LaunchAction), each at the bit position given by
 * its OBJ_* constant.
 * NOTE(review): presumably tested against an object's flags to decide whether
 * the object should be dumped -- confirm against the users of this macro.
 */
#define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION))

static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid)
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd)
{
int ret;
struct cli_bc_ctx *bc_ctx;
cli_ctx *ctx = NULL;
fmap_t *map;

UNUSEDPARAM(dumpid);

if (NULL == pdf)
return CL_EARG;

Expand Down Expand Up @@ -1387,7 +1385,7 @@ static void process(struct text_norm_state *s, enum cstate *st, const char *buf,
} while (length > 0);
}

static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj)
{
struct text_norm_state s;
char fullname[1024];
Expand All @@ -1398,7 +1396,7 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
cl_error_t rc;
enum cstate st = CSTATE_NONE;

snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf%02u_c", pdf->dir, (pdf->files - 1));
snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d contents", pdf->dir, obj->id >> 8, obj->id & 0xff);
fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
if (fout < 0) {
char err[128];
Expand Down Expand Up @@ -1481,7 +1479,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t

cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff);

snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf%02u", pdf->dir, pdf->files++);
snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d", pdf->dir, obj->id >> 8, obj->id & 0xff);
fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
if (fout < 0) {
char err[128];
Expand Down Expand Up @@ -1839,12 +1837,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
}

if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) {
unsigned int dumpid = 0;
for (dumpid = 0; dumpid < pdf->nobjs; dumpid++) {
if (pdf->objs[dumpid] == obj)
break;
}
rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, dumpid);
rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout);
if (rc2 == CL_VIRUS) {
rc = rc2;
goto really_done;
Expand All @@ -1855,7 +1848,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
lseek(fout, 0, SEEK_SET);
cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff);

rc2 = pdf_scan_contents(fout, pdf);
rc2 = pdf_scan_contents(fout, pdf, obj);
if (rc2 != CL_SUCCESS) {
rc = rc2;
goto really_done;
Expand Down Expand Up @@ -3644,7 +3637,7 @@ static cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf)
}

if (CL_SUCCESS == status) {
status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1, -1);
status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1);
cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status);
}

Expand Down Expand Up @@ -3880,7 +3873,7 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)

pdf.startoff = offset;

rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1, -1);
rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1);
if (CL_SUCCESS != rc) {
cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc);

Expand Down Expand Up @@ -3909,7 +3902,7 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)

if (pdf.flags && CL_SUCCESS == rc) {
cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1, -1);
rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1);

if (CL_SUCCESS == rc && SCAN_HEURISTICS && (ctx->dconf->other & OTHER_CONF_PDFNAMEOBJ)) {
if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
Expand Down
47 changes: 0 additions & 47 deletions libclamav/pdfdecode.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ struct pdf_token {
};

static size_t pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int fout, cl_error_t *status, struct objstm_struct *objstm);
static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, uint32_t lvl);

static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
static cl_error_t filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
Expand Down Expand Up @@ -338,13 +337,6 @@ static size_t pdf_decodestream_internal(
break;
}
token->success++;

/* Dump the stream content to a text file if keeptmp is enabled. */
if (pdf->ctx->engine->keeptmp) {
if (CL_SUCCESS != pdf_decode_dump(pdf, obj, token, i + 1)) {
cli_errmsg("pdf_decodestream_internal: failed to write decoded stream content to temp file\n");
}
}
}

if ((token->success > 0) && (NULL != token->content)) {
Expand Down Expand Up @@ -399,45 +391,6 @@ static size_t pdf_decodestream_internal(
return bytes_scanned;
}

/**
 * @brief Dump PDF filter content such as stream contents to a temp file.
 *
 * Temp file is created in the pdf->dir directory.
 * Filename format is "pdf<pdf->files-1>_<lvl>".
 *
 * The file is opened with O_CREAT | O_EXCL, so the dump fails instead of
 * overwriting a file that already exists under the same name.
 *
 * @param pdf   Pdf context structure.
 * @param obj   The object we found the filter content in.
 * @param token The struct for the filter contents.
 * @param lvl   A unique index to distinguish the files from each other.
 * @return cl_error_t CL_SUCCESS if the content was written in full,
 *                    CL_ETMPFILE if the file name does not fit in the path
 *                    buffer or the file cannot be created,
 *                    CL_EWRITE if fewer than token->length bytes were written.
 */
static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, uint32_t lvl)
{
    char fname[1024];
    int ifd;
    int len;

    len = snprintf(fname, sizeof(fname), "%s" PATHSEP "pdf%02u_%02u", pdf->dir, (pdf->files - 1), (unsigned int)lvl);
    if (len < 0 || (size_t)len >= sizeof(fname)) {
        /* A truncated path would name a different file than intended; refuse to create it. */
        cli_errmsg("cli_pdf: intermediate temporary file name is too long\n");
        return CL_ETMPFILE;
    }

    ifd = open(fname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
    if (ifd < 0) {
        char err[128];

        cli_errmsg("cli_pdf: can't create intermediate temporary file %s: %s\n", fname, cli_strerror(errno, err, sizeof(err)));
        return CL_ETMPFILE;
    }

    cli_dbgmsg("cli_pdf: decoded filter %u obj %u %u\n", (unsigned int)lvl, obj->id >> 8, obj->id & 0xff);
    cli_dbgmsg("         ... to %s\n", fname);

    /* Anything other than a complete write is treated as a failure. */
    if (cli_writen(ifd, token->content, token->length) != token->length) {
        cli_errmsg("cli_pdf: failed to write output file\n");
        close(ifd);
        return CL_EWRITE;
    }

    close(ifd);
    return CL_SUCCESS;
}

/*
* ascii85 inflation
* See http://www.piclist.com/techref/method/encode.htm (look for base85)
Expand Down

0 comments on commit ebe3c50

Please sign in to comment.