Skip to content

Commit

Permalink
Fix null pointer read while parsing CDX file
Browse files Browse the repository at this point in the history
  • Loading branch information
the-blank-x committed Jan 26, 2024
1 parent 8f34226 commit 3a99457
Showing 1 changed file with 26 additions and 6 deletions.
32 changes: 26 additions & 6 deletions src/warc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1427,7 +1427,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid,
checksum and record ID fields. */
static bool
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
int *field_num_checksum, int *field_num_record_id)
int *field_num_date, int *field_num_checksum,
int *field_num_record_id)
{
char *token;
char *save_ptr;
Expand All @@ -1451,6 +1452,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
case 'a':
*field_num_original_url = field_num;
break;
case 'b':
*field_num_date = field_num;
break;
case 'k':
*field_num_checksum = field_num;
break;
Expand All @@ -1464,16 +1468,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
}

return *field_num_original_url != -1
&& *field_num_date != -1
&& *field_num_checksum != -1
&& *field_num_record_id != -1;
}

/* Parse the CDX record and add it to the warc_dedup_table hash table. */
static void
warc_process_cdx_line (char *lineptr, int field_num_original_url,
int field_num_checksum, int field_num_record_id)
int field_num_date, int field_num_checksum,
int field_num_record_id)
{
char *original_url = NULL;
char *date = NULL;
char *checksum = NULL;
char *record_id = NULL;
char *token;
Expand All @@ -1487,6 +1494,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
char **val;
if (field_num == field_num_original_url)
val = &original_url;
else if (field_num == field_num_date)
val = &date;
else if (field_num == field_num_checksum)
val = &checksum;
else if (field_num == field_num_record_id)
Expand All @@ -1501,7 +1510,10 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
field_num++;
}

if (original_url != NULL && checksum != NULL && record_id != NULL)
if (original_url != NULL
&& date != NULL
&& checksum != NULL
&& record_id != NULL)
{
/* For some extra efficiency, we decode the base32 encoded
checksum value. This should produce exactly SHA1_DIGEST_SIZE
Expand All @@ -1515,12 +1527,13 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
{
/* This is a valid line with a valid checksum. */
store_warc_record(original_url, NULL, record_id, checksum_v);
store_warc_record(original_url, date, record_id, checksum_v);
xfree (checksum_v);
}
else
{
xfree (original_url);
xfree (date);
xfree (checksum_v);
xfree (record_id);
}
Expand All @@ -1529,6 +1542,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
{
xfree(checksum);
xfree(original_url);
xfree(date);
xfree(record_id);
}
}
Expand All @@ -1543,6 +1557,7 @@ warc_load_cdx_dedup_file (void)
size_t n = 0;
ssize_t line_length;
int field_num_original_url = -1;
int field_num_date = -1;
int field_num_checksum = -1;
int field_num_record_id = -1;

Expand All @@ -1558,7 +1573,8 @@ warc_load_cdx_dedup_file (void)
line_length = getline (&lineptr, &n, f);
if (line_length != -1)
warc_parse_cdx_header (lineptr, &field_num_original_url,
&field_num_checksum, &field_num_record_id);
&field_num_date, &field_num_checksum,
&field_num_record_id);

/* If the file contains all four fields, read the complete file. */
if (field_num_original_url == -1
Expand All @@ -1568,6 +1584,9 @@ warc_load_cdx_dedup_file (void)
if (field_num_original_url == -1)
logprintf (LOG_NOTQUIET,
_("CDX file does not list original urls. (Missing column 'a'.)\n"));
if (field_num_date == -1)
logprintf (LOG_NOTQUIET,
_("CDX file does not list dates. (Missing column 'b'.)\n"));
if (field_num_checksum == -1)
logprintf (LOG_NOTQUIET,
_("CDX file does not list checksums. (Missing column 'k'.)\n"));
Expand All @@ -1587,7 +1606,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
if (line_length != -1)
{
warc_process_cdx_line (lineptr, field_num_original_url,
field_num_checksum, field_num_record_id);
field_num_date, field_num_checksum,
field_num_record_id);
}

}
Expand Down

0 comments on commit 3a99457

Please sign in to comment.