Skip to content

Commit

Permalink
Updated URL parsing to match additional patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
brian-grabau committed Aug 19, 2024
1 parent 3258d47 commit e135360
Showing 1 changed file with 28 additions and 18 deletions.
46 changes: 28 additions & 18 deletions config/enrichments/06_url.conf
Original file line number Diff line number Diff line change
@@ -1,34 +1,40 @@
# Copyright [2021] [Cargill, Incorporated.]
# SPDX-License-Identifier: Apache-2.0
# 1. Removes all url.x fields, except for [url][full]
# 2. Copies [url][full] to [url][original]
# 3. Lowercase [url][full]
# 4. If [url][full] then parses [url][full] into subfields
# 5. if [url][domain] processes [url][domain] with Top Level Domain (TLD) filter
# 6. Rename tld fields to ECS fields
# 7. Remove the remaining TLD fields not ECS
# 1. Copy url.original to url.full if full does not exist
# 2. Removes all url.x fields, except for [url][full] and [url][original]
# 3. Copies [url][full] to [url][original] if [url][original] does not exist
# 4. Lowercase [url][full]
# 5. If [url][full] then parses [url][full] into subfields
# 6. if [url][domain] processes [url][domain] with Top Level Domain (TLD) filter
# 7. Rename tld fields to ECS fields
# 8. Remove the remaining TLD fields not ECS
filter {
if "disable_url_enrichment" in [tags] or "disable_code_reduction" in [tags] or "disable_enrichments" in [tags] {
mutate {
remove_tag => ["disable_url_enrichment"]
}
} else {
### URL enrichment
if [url][original] and [url][original] != "" and ![url][full] {
mutate {
copy => { "[url][original]" => "[url][full]" }
}
}
if [url][full] and [url][full] != "" {
mutate {
remove_field => [ "[url][domain]", "[url][extension]", "[url][fragment]", "[url][full][text]", "[url][original]", "[url][original][text]", "[url][password]", "[url][path]", "[url][port]", "[url][query]", "[url][registered_domain]", "[url][scheme]", "[url][top_level_domain]", "[url][username]" ]
tag_on_failure => "_mutate_error_url_en_1"
remove_field => [ "[url][domain]", "[url][extension]", "[url][fragment]", "[url][full][text]", "[url][original][text]", "[url][password]", "[url][path]", "[url][port]", "[url][query]", "[url][registered_domain]", "[url][scheme]", "[url][top_level_domain]", "[url][username]" ]
}
### mutate order of operation has lowercase before copy
mutate {
copy => { "[url][full]" => "[url][original]" }
tag_on_failure => "_mutate_error_url_en_2"
if ![url][original] {
mutate {
copy => { "[url][full]" => "[url][original]" }
}
}
mutate {
lowercase => [ "[url][full]" ]
}
grok {
match => {"[url][full]" => "^((?<[url][scheme]>[A-Za-z]+(\+[A-Za-z+]+)?):\/\/)?((?<[url][username]>.*?):(?<[url][password]>.*?)@)?(?<[url][domain]>\w+(\.|\-+)\w+(\.|\-?\w+)+)(:(?<[url][port]>\d+))?(\/(?<[url][path]>.*?))?(\?(?<[url][query]>.*?))?(\#(?<[url][fragment]>.*?))?$"}
match => {"[url][full]" => "^((?<[url][scheme]>.*?)://)?((?<[user][name]>.*?):(?<[user][password]>.*?)@)?(?<[url][domain]>\w+((\.\w+){1,})|\d+\.\d+\.\d+\.\d+)(:(?<[url][port]>\d+))?(/|$)((?<[url][path]>.*?))?(\?(?<[url][query]>.*?))?(\#(?<[url][fragment]>.*?))?$" }
timeout_millis => 500
tag_on_failure => "_groktimeout_url_en_1"
}
Expand All @@ -48,12 +54,16 @@ filter {
# https://github.com/logstash-plugins/logstash-filter-tld/issues/8
tld {
source => "[url][domain]"
target => "tld_object"
}
target => "tmp"
}
mutate {
rename => {"[tld_object][tld]" => "[url][top_level_domain]"}
rename => {"[tld_object][domain]" => "[url][registered_domain]"}
remove_field => ["tld_object"]
convert => { "[url][port]" => "integer" }
rename => {
"[tmp][domain]" => "[url][registered_domain]"
"[tmp][sld]" => "[url][subdomain]"
"[tmp][tld]" => "[url][top_level_domain]"
}
"remove_field" => [ "tmp" ]
}
}
}
Expand Down

0 comments on commit e135360

Please sign in to comment.