Commit

Produce first batch of classifiers
huitema committed Jul 15, 2024
1 parent f839441 commit 2d72254
Showing 4 changed files with 447 additions and 151 deletions.
182 changes: 31 additions & 151 deletions imrs/imrs_classifier.py
@@ -18,118 +18,15 @@
import numpy as np
from sklearn.linear_model import LinearRegression
import random

def file_has_header(imrs_file):
has_header = False
# get the first line
line = ""
for line in open(imrs_file, "r"):
break
if len(line) > 0:
parts = line.split(",")
if len(parts) > 1:
try:
queries = int(parts[1])
print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
has_header = False
except ValueError:
print("header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
has_header = True
return has_header

def load_imrs_to_frame(imrs_file):
if file_has_header(imrs_file):
df = pd.read_csv(imrs_file)
else:
df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col = False)
return df

def protected_ratio(v, d):
r = 0
if d > 0:
r = v/d
return r

# Count how many of the columns named in `list` exceed a threshold: zero when
# r == 0 (so this counts the non-zero columns), or r times the largest value
# among those columns when r > 0.
def protected_count(x, r, list):
s = r
if r > 0:
mx = 0
for k in list:
if x[k] > mx:
mx = x[k]
s = r*mx
count = 0
for k in list:
if x[k] > s:
count += 1
return count

def reset_d31(x, list):
s = 0
for k in list:
s += x[k]
d31 = x["queries"] - s
if d31 < 0:
d31 = 0
return d31

def compute_nb_tlds(x):
tld_count = 0
for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]:
if x[tld] > 0:
tld_count += 1
tld_count += int(x["TLDs"])
return tld_count

def compute_nb_slds(x):
sld_count = 0
for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]:
if x[sld] > 0:
sld_count += 1
sld_count += int(x["SLDs"])
if sld_count < 1:
sld_count = 1
return sld_count
import imrs_pandas
from imrs_pandas import print_stats, plot_or_save, example_and_count, print_names, print_mean

# Apply a fitted linear model by hand: start from the intercept and add
# coefficient * feature value for each feature name in n, returning the
# prediction for row x.
def compute_l10_sa(x, y, n, intercept):
d = float(intercept)
for i in range(len(y)):
d += float(x[n[i]]*y[i])
return d

def print_stats(x_df, name):
print(name)
x_des = x_df.describe()
print(x_des.transpose())
x_cor = x_df.corr()
print(x_cor)

def plot_or_save(plot_dir, image_name):
if plot_dir == "-":
plt.show()
else:
image_path = join(plot_dir, image_name)
plt.savefig(image_path)

def example_and_count(df, name):
count = df.shape[0]
all_rows = df.shape[1]
queries = 0
network = ""
sample = df.sample(13)
nb_rows = sample.shape[0]
# print(name + ": samples = " + str(nb_rows) + ", out of " + str(all_rows))
sdp = sample[["network", "queries"]]
sdp_np = sdp.to_numpy()
# print("Sample shape: " + str(sdp.shape))
# print("Sdp_np shape: " + str(np.shape(sdp_np)))
for i in range(np.shape(sdp_np)[0]):
if sdp_np[i,1] > queries:
queries = sdp_np[i,1]
network = str(sdp_np[i,0])
print(name + ": count=" + str(count) + ", network=" + network)
return count, network

# main
if len(sys.argv) != 2 and len(sys.argv) != 3:
for x in range(0, len(sys.argv)):
@@ -141,54 +38,16 @@ def example_and_count(df, name):
if len(sys.argv) == 3:
plot_dir = sys.argv[2]

full_df = load_imrs_to_frame(imrs_file)
full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
print("Loaded full")
# apply corrections for day overflow bug:
# ignore d00, it is always 0
# compute d31 = queries - sum (d01..d30)
# compute arpa = arpa0 - d31
days = [
"d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10", \
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", \
"d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", \
"d31" ]
full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1)
full_df["arpa"] = full_df["arpa0"] - full_df["d31"]

print("Computed corrections")
# compute the good column
full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1)
# compute the ratio of good over APNIC
full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1)

# compute log10 column of queries and apnic
full_df["l10_q"] = np.log10(full_df["queries"])
full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1)
full_df["l10_g"] = np.log10(2*full_df["good"] + 1)
full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1)
full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1)
# add columns for ratios
for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]:
r_d = "r_" + d
full_df[r_d] = full_df[d] / full_df["queries"]

full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"])

full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1)
full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1)
print("Computed ratios")

hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09", \
"h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19", \
"h20", "h21", "h22", "h23" ]

full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1)
full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1)
print("Computed hours")

imrs_pandas.imrs_corrections(full_df)
print("Applied corrections")

# First, study the APNIC Data
# get APNIC subset
apnic_df = full_df[full_df["l10_a"] > 0]
apnic_selected = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]

# select 4 subsets, based on 2 variables
apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1]
Expand All @@ -199,6 +58,13 @@ def example_and_count(df, name):
apnic_hineg_loap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] < 300]
apnic_hineg_hiap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] >= 300]

print_names(apnic_loneg_loap_df)
print_mean(apnic_loneg_loap_df,"apnic_loneg_loap_df", apnic_selected)
print_mean(apnic_loneg_hiap_df,"apnic_loneg_hiap_df", apnic_selected)
print_mean(apnic_hineg_loap_df,"apnic_hineg_loap_df", apnic_selected)
print_mean(apnic_hineg_hiap_df,"apnic_hineg_hiap_df", apnic_selected)


# select 4 subsets, based on 2 variables
apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1]
apnic_hineg_df = apnic_df[apnic_df["r_no_such"] >= 0.1]
@@ -222,6 +88,14 @@ def example_and_count(df, name):
apnic_hineg_hiap_df.plot.scatter(ax=axb, x="queries", y="APNIC", alpha=0.5, color="red")
plot_or_save(plot_dir, "apnic-queries.jpg")

axtld = apnic_df.plot.scatter(x="APNIC", y="TLDs", alpha=0.5, logx=True, logy=False, color="blue")
plot_or_save(plot_dir, "tlds-apnic.jpg")
axcom = apnic_df.plot.scatter(x="APNIC", y="COM", alpha=0.5, logx=True, logy=False, color="blue")
plot_or_save(plot_dir, "com-apnic.jpg")
axnosuch = full_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue")
apnic_df.plot.scatter(ax=axnosuch, x="queries", y="r_no_such", alpha=0.5, color="orange")
plot_or_save(plot_dir, "no_such-queries.jpg")

# plot APNIC/Queries/no_such
axb = apnic_loneg_loap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue")
apnic_loneg_hiap_df.plot.scatter(ax=axb, x="queries", y="r_no_such", alpha=0.5, color="green")
@@ -236,7 +110,6 @@ def example_and_count(df, name):

# study the APNIC correlations
# get a view of only the important columns
apnic_selected = [ "network", "l10_q", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l_tld", "l_sld", "l10_a" ]
full_selected_df = full_df[apnic_selected]
apnic_selected_df = full_selected_df[full_selected_df["l10_a"] > 0]

@@ -260,6 +133,7 @@ def example_and_count(df, name):
full_df["l10_sa"] = full_df.apply(lambda x: compute_l10_sa(x, lr.coef_.T, apnic_coeffs[:-1], lr.intercept_[0]), axis=1)
full_df["l10_gsa"] = full_df.apply(lambda x: x["l10_g"] - x["l10_sa"], axis=1)
#print(list(full_df))
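# The fit itself sits in lines elided from this hunk. A minimal sketch of what
# it presumably looks like, assuming apnic_coeffs names the feature columns with
# the regression target as its last entry:
#
#   lr = LinearRegression()
#   lr.fit(apnic_selected_df[apnic_coeffs[:-1]], apnic_selected_df[apnic_coeffs[-1:]])
#
# With a single-column target frame, lr.intercept_ is a length-1 array and
# lr.coef_.T has one column, which matches the lr.intercept_[0] and lr.coef_.T
# usage in compute_l10_sa() above.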

apnic_coeffs_x = [ "network", "l10_sa", "l10_gsa", "l10_q", "r_no_such", "l10_a", "queries" ]
full_data_x_df = full_df[apnic_coeffs_x]
# print(list(full_data_x_df))
@@ -274,7 +148,7 @@ def example_and_count(df, name):
#print_stats(full_df["l10_sa", "l10_a"], "full_df")

# apply regression to classify not APNIC traffic
notap_df = full_data_x_df[full_data_x_df["l10_a"] == 0]
notap_df = full_df[full_df["l10_a"] == 0]

axp = notap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.1, logx=True, logy=False, color="blue")
apnic_df.plot.scatter(ax=axp, x="queries", y="r_no_such", alpha=0.2, color="orange")
@@ -318,4 +192,10 @@ def example_and_count(df, name):
example_and_count(notap_loneg_loap_df, "notap_loneg_loap_df (blue)")
example_and_count(notap_loneg_hiap_df, "notap_loneg_hiap_df (green)")
example_and_count(notap_hineg_loap_df, "notap_hineg_loap_df (orange)")
example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)")
example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)")

print_names(notap_loneg_loap_df)
print_mean(notap_loneg_loap_df,"notap_loneg_loap_df", apnic_selected)
print_mean(notap_loneg_hiap_df,"notap_loneg_hiap_df", apnic_selected)
print_mean(notap_hineg_loap_df,"notap_hineg_loap_df", apnic_selected)
print_mean(notap_hineg_hiap_df,"notap_hineg_hiap_df", apnic_selected)
123 changes: 123 additions & 0 deletions imrs/imrs_classifier2.py
@@ -0,0 +1,123 @@
#
# Exploration of the ipstats file for each network
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join
import imrs
from imrs import parse_imrs_volume_only, apnic_record
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
import random
import imrs_pandas
from imrs_pandas import print_stats, save_stats, save_selected_stats, \
plot_or_save, plot_and_explore, example_and_count, \
print_names, print_mean


# main
if len(sys.argv) != 2 and len(sys.argv) != 3:
for x in range(0, len(sys.argv)):
print(str(x) + ":" + sys.argv[x])
print("Usage: imrs_classifier.py <imrs_ratio csv file> [<img_folder>]")
exit(1)
imrs_file = sys.argv[1]
plot_dir = "-"
out_file = sys.stdout
if len(sys.argv) == 3:
plot_dir = sys.argv[2]
csv_path = join(plot_dir, "stats.csv")
out_file = open(csv_path, "w")
out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")


full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
print("Loaded full")

imrs_pandas.imrs_corrections(full_df)
print("Applied corrections")

tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
save_selected_stats(out_file, full_df, tracked, "full_df")
plot_and_explore(full_df, plot_dir, "full", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
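# plot_and_explore() comes from imrs_pandas and is not shown here; from its
# arguments it presumably draws one scatter plot of each listed column against
# the x column ('queries', log-scaled when lx=True) and saves the images under
# plot_dir with the given name prefix, in the style of plot_or_save().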

# First, isolate the "small" nodes, defined
# as sending no more than 100 queries.
small_df = full_df[full_df["queries"] <= 100]
big_df = full_df[full_df["queries"] > 100]

save_selected_stats(out_file, small_df, tracked,"small_df")
save_selected_stats(out_file, big_df, tracked,"big_df")

# then, create three subsets of the big sites:
# ns_low: no-such < 5%
# ns_high: no-such > 90%
# ns_mid: in_between

ns_low = big_df[big_df["r_no_such"] < 0.05]
ns_other = big_df[big_df["r_no_such"] >= 0.05]
ns_high = ns_other[ns_other["r_no_such"] > 0.9]
ns_mid = ns_other[ns_other["r_no_such"] <= 0.9]

save_selected_stats(out_file, ns_other, tracked,"ns_other")
plot_and_explore(ns_other, plot_dir, "other", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

save_selected_stats(out_file, ns_low, tracked,"ns_low")
save_selected_stats(out_file, ns_high, tracked,"ns_high")
save_selected_stats(out_file, ns_mid, tracked,"ns_mid")

# At this stage, we have separated 4 groups.
# We will ignore the "small" group for now, because in the absence of
# traffic it is hard to classify anything.

plot_and_explore(ns_low, plot_dir, "low", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(ns_mid, plot_dir, "mid", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(ns_high, plot_dir, "high", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "low NS" group, the plot of TLDs versus queries shows a break
# at somewhere between 500 and 1000 TLDs seen. Above that line we find
# very few APNIC servers but many large non APNIC nodes. This could
# be nodes engaged in some kind of scanning process.

low_lt500t = ns_low[ns_low["TLDs"] <= 500]
low_gt500t = ns_low[ns_low["TLDs"] > 500]
save_selected_stats(out_file, low_lt500t, tracked,"low_lt500t")
save_selected_stats(out_file, low_gt500t, tracked,"low_gt500t")
plot_and_explore(low_lt500t, plot_dir, "low_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(low_gt500t, plot_dir, "low_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "high NS" group, there seems
# to be two interesting subgroups: more than 500 TLDs, as in the
# "low" case, and more than about 10^6 queries, which separates
# a bunch of high values from the bulk of APNNIC resolvers.

high_lt500t = ns_high[ns_high["TLDs"] <= 500]
high_gt500t = ns_high[ns_high["TLDs"] > 500]
save_selected_stats(out_file, high_lt500t, tracked,"high_lt500t")
save_selected_stats(out_file, high_gt500t, tracked,"high_gt500t")
plot_and_explore(high_lt500t, plot_dir, "high_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(high_gt500t, plot_dir, "high_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "mid" group, the pictures are murky. There seems to be
# a separation between resolvers with more than 1 million
# queries and others. (or is it 100K?)

mid_lt1Mq = ns_mid[ns_mid["queries"] <= 1000000]
mid_gt1Mq = ns_mid[ns_mid["queries"] > 1000000]

save_selected_stats(out_file, mid_lt1Mq, tracked,"mid_lt1Mq")
save_selected_stats(out_file, mid_gt1Mq, tracked,"mid_gt1Mq")
plot_and_explore(mid_lt1Mq, plot_dir, "mid_lt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(mid_gt1Mq, plot_dir, "mid_gt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

if out_file != sys.stdout:
out_file.close()