From 3a99457225327e098f532ae5be004953e1e2f324 Mon Sep 17 00:00:00 2001 From: blankie Date: Sat, 27 Jan 2024 08:47:33 +1100 Subject: [PATCH] Fix null pointer read while parsing CDX file --- src/warc.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/warc.c b/src/warc.c index 50822225..bf9d58dc 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1427,7 +1427,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid, checksum and record ID fields. */ static bool warc_parse_cdx_header (char *lineptr, int *field_num_original_url, - int *field_num_checksum, int *field_num_record_id) + int *field_num_date, int *field_num_checksum, + int *field_num_record_id) { char *token; char *save_ptr; @@ -1451,6 +1452,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, case 'a': *field_num_original_url = field_num; break; + case 'b': + *field_num_date = field_num; + break; case 'k': *field_num_checksum = field_num; break; @@ -1464,6 +1468,7 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, } return *field_num_original_url != -1 + && *field_num_date != -1 && *field_num_checksum != -1 && *field_num_record_id != -1; } @@ -1471,9 +1476,11 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, /* Parse the CDX record and add it to the warc_dedup_table hash table. */ static void warc_process_cdx_line (char *lineptr, int field_num_original_url, - int field_num_checksum, int field_num_record_id) + int field_num_date, int field_num_checksum, + int field_num_record_id) { char *original_url = NULL; + char *date = NULL; char *checksum = NULL; char *record_id = NULL; char *token; @@ -1487,6 +1494,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, char **val; if (field_num == field_num_original_url) val = &original_url; + else if (field_num == field_num_date) + val = &date; else if (field_num == field_num_checksum) val = &checksum; else if (field_num == field_num_record_id) @@ -1501,7 +1510,10 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, field_num++; } - if (original_url != NULL && checksum != NULL && record_id != NULL) + if (original_url != NULL + && date != NULL + && checksum != NULL + && record_id != NULL) { /* For some extra efficiency, we decode the base32 encoded checksum value. This should produce exactly SHA1_DIGEST_SIZE @@ -1515,12 +1527,13 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) { /* This is a valid line with a valid checksum. */ - store_warc_record(original_url, NULL, record_id, checksum_v); + store_warc_record(original_url, date, record_id, checksum_v); xfree (checksum_v); } else { xfree (original_url); + xfree (date); xfree (checksum_v); xfree (record_id); } @@ -1529,6 +1542,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, { xfree(checksum); xfree(original_url); + xfree(date); xfree(record_id); } } @@ -1543,6 +1557,7 @@ warc_load_cdx_dedup_file (void) size_t n = 0; ssize_t line_length; int field_num_original_url = -1; + int field_num_date = -1; int field_num_checksum = -1; int field_num_record_id = -1; @@ -1558,7 +1573,8 @@ warc_load_cdx_dedup_file (void) line_length = getline (&lineptr, &n, f); if (line_length != -1) warc_parse_cdx_header (lineptr, &field_num_original_url, - &field_num_checksum, &field_num_record_id); + &field_num_date, &field_num_checksum, + &field_num_record_id); /* If the file contains all three fields, read the complete file. */ if (field_num_original_url == -1 @@ -1568,6 +1584,9 @@ warc_load_cdx_dedup_file (void) if (field_num_original_url == -1) logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n")); + if (field_num_date == -1) + logprintf (LOG_NOTQUIET, +_("CDX file does not list dates. (Missing column 'b'.)\n")); if (field_num_checksum == -1) logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n")); @@ -1587,7 +1606,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); if (line_length != -1) { warc_process_cdx_line (lineptr, field_num_original_url, - field_num_checksum, field_num_record_id); + field_num_date, field_num_checksum, + field_num_record_id); } }