diff --git a/config/enrichments/06_url.conf b/config/enrichments/06_url.conf index 076790b7..f2f16610 100644 --- a/config/enrichments/06_url.conf +++ b/config/enrichments/06_url.conf @@ -1,12 +1,13 @@ # Copyright [2021] [Cargill, Incorporated.] # SPDX-License-Identifier: Apache-2.0 -# 1. Removes all url.x fields, except for [url][full] -# 2. Copies [url][full] to [url][original] -# 3. Lowercase [url][full] -# 4. If [url][full] then parses [url][full] into subfields -# 5. if [url][domain] processes [url][domain] with Top Level Domain (TLD) filter -# 6. Rename tld fields to ECS fields -# 7. Remove the remaining TLD fields not ECS +# 1. Copies [url][original] to [url][full] if [url][full] does not exist +# 2. Removes all url.x fields, except for [url][full] and [url][original] +# 3. Copies [url][full] to [url][original] if [url][original] does not exist +# 4. Lowercase [url][full] +# 5. If [url][full] then parses [url][full] into subfields +# 6. if [url][domain] processes [url][domain] with Top Level Domain (TLD) filter +# 7. Rename tld fields to ECS fields +# 8. 
Remove the remaining TLD fields not ECS filter { if "disable_url_enrichment" in [tags] or "disable_code_reduction" in [tags] or "disable_enrichments" in [tags] { mutate { @@ -14,21 +15,26 @@ filter { } } else { ### URL enrichment + if [url][original] and [url][original] != "" and ![url][full] { + mutate { + copy => { "[url][original]" => "[url][full]" } + } + } if [url][full] and [url][full] != "" { mutate { - remove_field => [ "[url][domain]", "[url][extension]", "[url][fragment]", "[url][full][text]", "[url][original]", "[url][original][text]", "[url][password]", "[url][path]", "[url][port]", "[url][query]", "[url][registered_domain]", "[url][scheme]", "[url][top_level_domain]", "[url][username]" ] - tag_on_failure => "_mutate_error_url_en_1" + remove_field => [ "[url][domain]", "[url][extension]", "[url][fragment]", "[url][full][text]", "[url][original][text]", "[url][password]", "[url][path]", "[url][port]", "[url][query]", "[url][registered_domain]", "[url][scheme]", "[url][top_level_domain]", "[url][username]" ] } ### mutate order of operation has lowercase before copy - mutate { - copy => { "[url][full]" => "[url][original]" } - tag_on_failure => "_mutate_error_url_en_2" + if ![url][original] { + mutate { + copy => { "[url][full]" => "[url][original]" } + } } mutate { lowercase => [ "[url][full]" ] } grok { - match => {"[url][full]" => "^((?<[url][scheme]>[A-Za-z]+(\+[A-Za-z+]+)?):\/\/)?((?<[url][username]>.*?):(?<[url][password]>.*?)@)?(?<[url][domain]>\w+(\.|\-+)\w+(\.|\-?\w+)+)(:(?<[url][port]>\d+))?(\/(?<[url][path]>.*?))?(\?(?<[url][query]>.*?))?(\#(?<[url][fragment]>.*?))?$"} + match => {"[url][full]" => "^((?<[url][scheme]>.*?)://)?((?<[user][name]>.*?):(?<[user][password]>.*?)@)?(?<[url][domain]>\w+((\.\w+){1,})|\d+\.\d+\.\d+\.\d+)(:(?<[url][port]>\d+))?(/|$)((?<[url][path]>.*?))?(\?(?<[url][query]>.*?))?(\#(?<[url][fragment]>.*?))?$" } timeout_millis => 500 tag_on_failure => "_groktimeout_url_en_1" } @@ -48,12 +54,16 @@ filter { # 
https://github.com/logstash-plugins/logstash-filter-tld/issues/8 tld { source => "[url][domain]" - target => "tld_object" - } + target => "tmp" + } mutate { - rename => {"[tld_object][tld]" => "[url][top_level_domain]"} - rename => {"[tld_object][domain]" => "[url][registered_domain]"} - remove_field => ["tld_object"] + convert => { "[url][port]" => "integer" } + rename => { + "[tmp][domain]" => "[url][registered_domain]" + "[tmp][sld]" => "[url][subdomain]" + "[tmp][tld]" => "[url][top_level_domain]" + } + "remove_field" => [ "tmp" ] } } }