Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new url cleaner feature #99

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 46 additions & 8 deletions BotNet.CommandHandlers/BotUpdate/Message/MessageUpdateHandler.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
using BotNet.Commands;
using System.Text.RegularExpressions;
using BotNet.Commands;
using BotNet.Commands.BotUpdate.Message;
using BotNet.Commands.CommandPrioritization;
using BotNet.Services.BotProfile;
using BotNet.Services.SocialLink;
using BotNet.Services.UrlCleaner;
using RG.Ninja;
using Telegram.Bot;
using Telegram.Bot.Types.Enums;
Expand Down Expand Up @@ -42,16 +44,17 @@ await _commandQueue.DispatchAsync(
return;
}

// Handle Social Link (better preview)
if ((update.Message.Text ?? update.Message.Caption) is { } textOrCaption) {
IEnumerable<Uri> possibleUrls = SocialLinkEmbedFixer.GetPossibleUrls(textOrCaption);

if (possibleUrls.Any()) {
// Handle Social Link (better preview)
IEnumerable<Uri> possibleSocialUrls = SocialLinkEmbedFixer.GetPossibleUrls(textOrCaption);
if (possibleSocialUrls.Any()) {
// Fire and forget
Task _ = Task.Run(async () => {
try {
foreach (Uri url in possibleUrls) {
Uri fixedUrl = SocialLinkEmbedFixer.Fix(url);
foreach (Uri url in possibleSocialUrls) {
Uri cleanedUrl = UrlCleaner.Clean(url);
Uri fixedUrl = SocialLinkEmbedFixer.Fix(cleanedUrl);
await _telegramBotClient.SendTextMessageAsync(
chatId: update.Message.Chat.Id,
text: $"Preview: {fixedUrl.OriginalString}",
Expand All @@ -65,6 +68,39 @@ await _telegramBotClient.SendTextMessageAsync(
});
return;
}

// get list of urls from message (start with http or https or www)
string pattern = @"(?i)\b((?:https?://|www\.)\S+)\b";
MatchCollection matches = Regex.Matches(textOrCaption, pattern);
List<string> urls = matches.Select(m => m.Value).ToList();

// Clean the url
if (urls.Count > 0) {
// Fire and forget
Task _ = Task.Run(async () => {
try {
foreach (string url in urls) {
Uri cleanedUrl = UrlCleaner.Clean(new Uri(url));

// if the url is same, don't send the message
if (cleanedUrl.OriginalString == new Uri(url).OriginalString) {
continue;
}

await _telegramBotClient.SendTextMessageAsync(
chatId: update.Message.Chat.Id,
text: $"Cleaned URL: {cleanedUrl.OriginalString}",
replyToMessageId: update.Message.MessageId,
cancellationToken: cancellationToken
);
}
} catch (OperationCanceledException) {
// Terminate gracefully
}
});
return;
}

}

// Handle reddit mirroring
Expand All @@ -74,7 +110,8 @@ await _telegramBotClient.SendTextMessageAsync(
Offset: var offset,
Length: var length
} && update.Message.Text?.Substring(offset, length) is { } url
&& url.StartsWith("https://www.reddit.com/", out string? remainingUrl)) {
&& UrlCleaner.Clean(new Uri(url)) is { } cleanedUrl
&& cleanedUrl.ToString().StartsWith("https://www.reddit.com/", out string? remainingUrl)) {
// Fire and forget
Task _ = Task.Run(async () => {
try {
Expand All @@ -93,7 +130,8 @@ await _telegramBotClient.SendTextMessageAsync(
} else if (update.Message?.Entities?.FirstOrDefault(entity => entity is {
Type: MessageEntityType.TextLink
}) is { Url: { } textUrl }
&& textUrl.StartsWith("https://www.reddit.com/", out string? remainingTextUrl)) {
&& UrlCleaner.Clean(new Uri(textUrl)) is { } cleanedTextUrl
&& cleanedTextUrl.ToString().StartsWith("https://www.reddit.com/", out string? remainingTextUrl)) {
// Fire and forget
Task _ = Task.Run(async () => {
try {
Expand Down
170 changes: 170 additions & 0 deletions BotNet.Services/CleanUrl/Rule.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace BotNet.Services.UrlCleaner {
/// <summary>
/// One URL-cleaning rule: a pattern that selects URLs plus the list of
/// query-parameter entries to strip from the URLs it selects.
/// </summary>
public class Rule {
    /// <summary>Human-readable label identifying the rule.</summary>
    public required string Name { get; set; }

    /// <summary>Pattern tested against a URL's text to decide whether this rule applies.</summary>
    public required Regex Match { get; set; }

    /// <summary>Query-parameter entries removed from URLs selected by <see cref="Match"/>.</summary>
    public required List<string> Rules { get; set; }

    /// <summary>Optional replacement entries; may be left unset (<c>null</c>).</summary>
    public List<string>? Replace { get; set; }
}

/// <summary>
/// Static catalogue of the URL-cleaning rules.
/// </summary>
/// <remarks>
/// Each rule's <c>Match</c> pattern is tested against the full URL text (scheme included),
/// so host patterns that need anchoring must allow for the optional <c>https://</c> prefix.
/// NOTE(review): entries in a rule's parameter list are interpolated unescaped into a
/// regular expression by the cleaner, where a leading <c>$</c> is an end-of-input anchor.
/// Entries such as <c>$deep_link</c> or <c>$3p</c> therefore presumably never match; the
/// existing unit test expects those parameters to survive cleaning, so they are left as-is.
/// </remarks>
public static class RuleData {
    /// <summary>
    /// Represents a list of rules for cleaning URLs.
    /// </summary>
    public static List<Rule> Rules = [
        new Rule
        {
            Name = "Global",
            // Catch-all: matches any non-empty URL text.
            Match = new Regex("./*"),
            Rules =
            [
                // https://en.wikipedia.org/wiki/UTM_parameters
                "utm_source", "utm_medium", "utm_term", "utm_campaign",
                "utm_content", "utm_name", "utm_cid", "utm_reader", "utm_viz_id",
                "utm_pubreferrer", "utm_swu", "utm_social-type", "utm_brand",
                "utm_team", "utm_feeditemid", "utm_id", "utm_marketing_tactic",
                "utm_creative_format", "utm_campaign_id", "utm_source_platform",
                "utm_timestamp", "utm_souce",
                // ITM parameters, a variant of UTM parameters
                "itm_source", "itm_medium", "itm_term", "itm_campaign", "itm_content",
                "itm_channel", "itm_source_s", "itm_medium_s", "itm_campaign_s",
                "itm_audience",
                // INT parameters, another variant of UTM
                "int_source", "int_cmp_name", "int_cmp_id", "int_cmp_creative",
                "int_medium", "int_campaign",
                // piwik
                "pk_campaign", "pk_cpn", "pk_source", "pk_medium",
                "pk_keyword", "pk_kwd", "pk_content", "pk_cid",
                "piwik_campaign", "piwik_cpn", "piwik_source", "piwik_medium",
                "piwik_keyword", "piwik_kwd", "piwik_content", "piwik_cid",
                // Google Ads
                "gclid", "ga_source", "ga_medium", "ga_term", "ga_content", "ga_campaign",
                "ga_place", "gclsrc",
                // hsa_* parameters
                "hsa_cam", "hsa_grp", "hsa_mt", "hsa_src", "hsa_ad", "hsa_acc",
                "hsa_net", "hsa_kw", "hsa_tgt", "hsa_ver", "hsa_la", "hsa_ol",
                // Facebook
                "fbclid",
                // Olytics
                "oly_enc_id", "oly_anon_id",
                // Vero
                "vero_id", "vero_conv",
                // Drip
                "__s",
                // HubSpot
                "_hsenc", "_hsmi", "__hssc", "__hstc", "__hsfp", "hsCtaTracking",
                // Marketo
                "mkt_tok",
                // Matomo
                "mtm_campaign", "mtm_keyword", "mtm_kwd", "mtm_source", "mtm_medium",
                "mtm_content", "mtm_cid", "mtm_group", "mtm_placement",
                // Oracle Eloqua
                "elqTrackId", "elq", "elqaid", "elqat", "elqCampaignId", "elqTrack",
                // MailChimp
                "mc_cid", "mc_eid",
                // Other
                "ncid", "cmpid", "mbid",
                // Reddit Ads
                "rdt_cid"
            ]
        },
        new Rule
        {
            Name = "audible.com",
            // Dots escaped so they match a literal '.' only.
            Match = new Regex(@"www\.audible\.com", RegexOptions.IgnoreCase),
            Rules = ["qid", "sr", "pf_rd_p", "pf_rd_r", "plink", "ref"]
        },
        new Rule
        {
            Name = "reddit.com",
            Match = new Regex(@".*\.reddit\.com", RegexOptions.IgnoreCase),
            Rules =
            [
                "ref_campaign", "ref_source", "tags", "keyword", "channel", "campaign",
                "user_agent", "domain", "base_url", "$android_deeplink_path",
                "$deeplink_path", "$og_redirect", "share_id", "correlation_id", "$deep_link", "post_index", "ref", "_branch_match_id", "post_fullname", "$3p", "_branch_referrer"
            ]
        },
        new Rule
        {
            Name = "facebook.com",
            Match = new Regex(@".*\.facebook\.com", RegexOptions.IgnoreCase),
            Rules =
            [
                "fbclid", "fb_ref", "fb_source", "referral_code", "referral_story_type", "tracking", "ref", "mibextid", "app", "_rdr", "m_entstream_source", "paipv", "locale", "eav"
            ],
        },
        new Rule
        {
            Name = "shopee.com",
            Match = new Regex(@"^(?:https?:\/\/)?(?:[^.]+\.)?shopee\.[a-z0-9]{0,3}", RegexOptions.IgnoreCase),
            Rules =
            [
                "af_siteid", "pid", "af_click_lookback", "af_viewthrough_lookback",
                "is_retargeting", "af_reengagement_window", "af_sub_siteid", "c"
            ]
        },
        new Rule
        {
            Name = "instagram.com",
            Match = new Regex(@"^(?:https?:\/\/)?(?:[^.]+\.)?instagram\.com", RegexOptions.IgnoreCase),
            Rules = ["igshid", "source"],
        },
        new Rule
        {
            Name = "twitter.com or x.com",
            // Anchored to the host: an unanchored "x.com" also hits netflix.com, box.com, etc.
            Match = new Regex(@"^(?:https?:\/\/)?(?:[^./]+\.)?(?:twitter|x)\.com", RegexOptions.IgnoreCase),
            Rules = ["s", "src", "ref_url", "ref_src"]
        },
        new Rule
        {
            Name = "youtube.com",
            Match = new Regex(@".*\.youtube\.com", RegexOptions.IgnoreCase),
            Rules = ["gclid", "feature", "app", "src", "lId", "cId", "embeds_referring_euri"],
        },
        new Rule
        {
            Name = "discord.com",
            Match = new Regex(@".*\.discord\.com", RegexOptions.IgnoreCase),
            Rules = ["source"]
        },
        new Rule
        {
            Name = "medium.com",
            Match = new Regex(@"medium\.com", RegexOptions.IgnoreCase),
            Rules = ["source"]
        },
        new Rule
        {
            Name = "apple.com",
            Match = new Regex(@".*\.apple\.com", RegexOptions.IgnoreCase),
            Rules = ["uo", "app", "at", "ct", "ls", "pt", "mt", "itsct", "itscg", "referrer", "src", "cid"]
        },
        new Rule
        {
            Name = "music.apple.com",
            Match = new Regex(@"music\.apple\.com", RegexOptions.IgnoreCase),
            Rules = ["i", "lId", "cId", "sr", "src"]
        },
        new Rule
        {
            Name = "play.google.com",
            Match = new Regex(@"play\.google\.com", RegexOptions.IgnoreCase),
            Rules = ["referrer", "pcampaignid"]
        },
        new Rule
        {
            Name = "bing.com",
            // The pattern is tested against the full URL, so the scheme must be allowed
            // before the host; a bare "^www\.bing\.com" can never match "https://www.bing.com/...".
            Match = new Regex(@"^(?:https?:\/\/)?www\.bing\.com", RegexOptions.IgnoreCase),
            // No empty entry here: an empty name turns the removal pattern into
            // "match everything after '?'", which would strip the whole query string.
            Rules = [
                "qs", "form", "sp", "pq", "sc", "sk", "cvid", "FORM",
                "ck", "simid", "thid", "cdnurl", "pivotparams", "ghsh", "ghacc",
                "ccid", "ru"
            ]
        }
    ];
}
}
26 changes: 26 additions & 0 deletions BotNet.Services/CleanUrl/UrlCleaner.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
using System;
using System.Text.RegularExpressions;

namespace BotNet.Services.UrlCleaner {
public partial class UrlCleaner {
    /// <summary>
    /// Cleans the specified URL by removing query parameters based on predefined rules.
    /// </summary>
    /// <remarks>
    /// Every rule in <see cref="RuleData.Rules"/> whose <c>Match</c> pattern matches the
    /// current URL text contributes its parameter entries. A parameter is removed only when
    /// its whole name matches an entry, so <c>ref</c> no longer removes <c>referrer</c>.
    /// The remaining parameters keep their order, the <c>?</c> separator is preserved when
    /// the first parameter is removed, and a <c>#fragment</c> is left untouched.
    /// Entries are still interpreted as regular-expression fragments (as before), so an
    /// entry that starts with <c>$</c> acts as an anchor and does not match a literal dollar sign.
    /// A trailing <c>?</c>, <c>&amp;</c> or <c>/</c> is trimmed from the result.
    /// </remarks>
    /// <param name="url">The URL to be cleaned.</param>
    /// <returns>A cleaned URI.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
    public static Uri Clean(Uri url) {
        ArgumentNullException.ThrowIfNull(url);

        // Work on the string form and build a single Uri at the end instead of
        // re-parsing the URL after every individual parameter removal.
        string current = url.ToString();

        foreach (Rule rule in RuleData.Rules) {
            // Rules are evaluated against the progressively cleaned text, as before.
            if (rule.Rules.Count == 0 || !rule.Match.IsMatch(current)) {
                continue;
            }

            // One anchored alternation per rule: the whole parameter name must match an entry.
            string namePattern = "^(?:" + string.Join("|", rule.Rules) + ")$";
            current = RemoveQueryParameters(current, namePattern);
        }

        // Remove trailing '?' or '&' if present, then a trailing '/'.
        string cleanedUrl = current.TrimEnd('?', '&').TrimEnd('/');

        return new Uri(cleanedUrl);
    }

    /// <summary>
    /// Drops every query parameter whose name matches <paramref name="namePattern"/>,
    /// leaving the path, the other parameters and any fragment as they were.
    /// </summary>
    private static string RemoveQueryParameters(string url, string namePattern) {
        // Split the fragment off first so it can never be swallowed with a parameter value.
        int fragmentStart = url.IndexOf('#');
        string fragment = fragmentStart < 0 ? string.Empty : url[fragmentStart..];
        string beforeFragment = fragmentStart < 0 ? url : url[..fragmentStart];

        int queryStart = beforeFragment.IndexOf('?');
        if (queryStart < 0) {
            return url;
        }

        string[] parameters = beforeFragment[(queryStart + 1)..].Split('&');
        string[] kept = Array.FindAll(parameters, parameter => {
            // A parameter without '=' (e.g. "_rdr") is all name.
            int equalsAt = parameter.IndexOf('=');
            string name = equalsAt < 0 ? parameter : parameter[..equalsAt];
            return !Regex.IsMatch(name, namePattern);
        });

        if (kept.Length == parameters.Length) {
            return url;
        }

        string basePart = beforeFragment[..queryStart];
        string query = kept.Length > 0 ? "?" + string.Join("&", kept) : string.Empty;
        return basePart + query + fragment;
    }
}
}
20 changes: 20 additions & 0 deletions BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using Xunit;
using BotNet.Services.UrlCleaner;

namespace BotNet.Tests.Services.CleanUrl {
/// <summary>
/// Data-driven checks that the URL cleaner strips tracking parameters from real-world links.
/// </summary>
public class UrlCleanerTests {
    // Each row pairs a link carrying tracking parameters with the text expected after cleaning.
    [Theory]
    [InlineData("https://nasional.kompas.com/read/2024/01/10/17560541/jokowi-belum-ucapkan-selamat-ultah-ke-pdi-p-ganjar-lupa-kali?utm_source=Telegram&utm_medium=Referral&utm_campaign=Top_Desktop", "https://nasional.kompas.com/read/2024/01/10/17560541/jokowi-belum-ucapkan-selamat-ultah-ke-pdi-p-ganjar-lupa-kali")]
    [InlineData("https://www.reddit.com/r/indonesia/comments/10nc28j/kerugian_udah_tembus_7m_gara2_bug_promo/?$deep_link=true&correlation_id=b1d34957-35e3-4ce1-9120-eb111509ae81&post_fullname=t3_10nc28j&post_index=1&ref=email_digest&ref_campaign=email_digest&ref_source=email&utm_content=post_title&utm_medium=Email%20Amazon%20SES&$3p=e_as&_branch_match_id=696254937267305114&_branch_referrer=H4sIAAAAAAAAA22QXWrDMBCET%2BO%2B2Yksp0kKoRRKr7GspY2jRH9IK9Ljd920fSpIMHyj3Rl0Yc71ZbMpZK3jAXMevIu3jc6v3TjpfCLA%2BiQyFbe4iB5a8afLOtXpt278kHO%2F34efeZOCgCLXRZsiVYeihQaKXEWqbTTj4SrqRqUtDiM0ixdgCnOrsA%2BwYMER5rZALimkNUJLymSJMqzdOv3OpVE3PptUCnlklyI4K3xWVk%2FH3b7XO9L9ZEj1RzVue5qVUrvtEemgZC6nynBu3kcMtK7T8NfrYUp7%2BhRnfV3oLIoCOg%2FWLVT5AcFgyOiW%2BL9bUyuGfj2BjQOYFFl%2BQuh3DDv29AW7S%2FV8gwEAAA%3D%3D", "https://www.reddit.com/r/indonesia/comments/10nc28j/kerugian_udah_tembus_7m_gara2_bug_promo/?$deep_link=true&$3p=e_as")]
    [InlineData("https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google?fbclid=IwAR2TTZgHLAAYJtZj_L5MKRGrHzrCa04_y8SMwYG-cteuyL6A5u1VVDjqh_c", "https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google")]
    [InlineData("https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168/?ref=share&mibextid=Cw5JYn", "https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168")]
    [InlineData("https://www.instagram.com/reel/CvOeEfJhG0f/?igshid=NTc4MTIwNjQ2YQ%3D%3D", "https://www.instagram.com/reel/CvOeEfJhG0f")]
    [InlineData("https://www.facebook.com/story.php?story_fbid=410430604519673&id=109479077948162&m_entstream_source=permalink&locale=ms_MY&paipv=0&eav=AfYTS-fXj_ioV3KMOuFjUx1hAV_g_FQX1W_Jfxi1SowzDV9LarINjBexw46zqBxKCTo&_rdr", "https://www.facebook.com/story.php?story_fbid=410430604519673&id=109479077948162")]
    [InlineData("https://twitter.com/petergyang/status/1573489316147306496?ref_src=twsrc%5Etfw%7Ctwcamp%5Etweetembed%7Ctwterm%5E1573489316147306496%7Ctwgr%5E9bfbec9d831b2a896ffc769afc3b65024c52850b%7Ctwcon%5Es1_&ref_url=https%3A%2F%2Fgames.ensipedia.id%2Fnews%2Fcerdas-mahasiswa-ini-manfaatkan-ai-untuk-mengerjakan-tugas-kuliah-dan-dapat-nilai-a%2F", "https://twitter.com/petergyang/status/1573489316147306496")]
    public void CleanUrl_ShouldRemoveQueryParametersBasedOnRules(string url, string result) {
        // Arrange
        System.Uri dirty = new(url);

        // Act
        System.Uri cleaned = UrlCleaner.Clean(dirty);

        // Assert
        Assert.Equal(result, cleaned.ToString());
    }
}
}
Loading