Skip to content

Commit

Permalink
Merge pull request #244 from private-octopus/extract-ip-stats
Browse files Browse the repository at this point in the history
Statistics by Instance and by Cluster
  • Loading branch information
huitema authored Apr 21, 2024
2 parents 3027744 + 354634e commit 0d81728
Show file tree
Hide file tree
Showing 5 changed files with 472 additions and 0 deletions.
99 changes: 99 additions & 0 deletions imrs/imrs_apnic_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#
# This script lists the IP addresses that are present in both
# an APNIC file and an IMRS file. For each such address, it
# writes one CSV line holding the address, the use count found
# in the APNIC file and the count found in the IMRS file.
#
# Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join

class imrs_apnic_item:
    """One CSV row pairing an IP address with its APNIC and IMRS counts."""

    def __init__(self, ip, apnic_use, imrs_use):
        """Store the IP address and the two counts as given."""
        self.ip = ip
        self.apnic_use = apnic_use
        self.imrs_use = imrs_use

    @staticmethod
    def head():
        """Return the CSV header matching text(), without a newline.

        Declared static because it takes no instance state; it can be
        called on the class or on an instance.
        """
        return "IP, apnic_use, imrs_use,"

    def text(self):
        """Return this item as one CSV row, without a newline."""
        # The values must be read from the instance; bare names would
        # raise NameError because no such module-level variables exist
        # at the time a method is called on an arbitrary item.
        return self.ip + "," + str(self.apnic_use) + "," + str(self.imrs_use) + ","

class apnic_record:
    """One line of the APNIC file, plus what was learned from the IMRS file."""

    def __init__(self):
        self.ip = ""               # first column of the APNIC line
        self.use_count = 0         # fourth column of the APNIC line
        self.seen_in_imrs = False  # set by the caller when the IP is found in the IMRS file
        self.imrs_count = 0        # count copied from the IMRS file by the caller

    def parse(self, line):
        """Fill ip and use_count from one comma separated APNIC line.

        The IP address is read from the first column and the use count
        from the fourth one. Returns True if both were parsed. Returns
        False if the line has fewer than four columns or if the count is
        not an integer; in the latter case the error is also printed.
        """
        parts = line.split(",")
        if len(parts) < 4:
            # Not enough columns: reject the line instead of reporting
            # success with an empty IP address.
            return False
        try:
            ip = parts[0].strip()
            use_count = int(parts[3].strip())
        except ValueError as e:
            traceback.print_exc()
            print("Cannot parse APNIC Record:\n" + line.strip() + "\nException: " + str(e))
            return False
        self.ip = ip
        self.use_count = use_count
        return True

def parse_imrs(line):
    """Extract the IP address and the count from one IMRS line.

    The line is split on commas; the first field is the IP address and
    the second one is parsed as an integer count.

    Returns a tuple (ok, ip, count). If parsing fails, the error is
    printed, ok is False, and ip and count keep whatever was obtained
    before the failure ("" and 0 by default).
    """
    ip, count = "", 0
    try:
        fields = line.split(",")
        ip = fields[0].strip()
        count = int(fields[1].strip())
    except Exception as e:
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
        return False, ip, count
    return True, ip, count


# main
#
# Load the APNIC file in a dictionary indexed by IP address, mark the
# entries whose address also appears in the IMRS file, then write one
# CSV line per marked entry in the output file.

if len(sys.argv) != 4:
    print("Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>")
    sys.exit(1)
imrs_file = sys.argv[1]
apnic_file = sys.argv[2]
output_file = sys.argv[3]

apnic_dict = dict()

# Input files are opened in "with" blocks so they are closed as soon
# as they have been read.
with open(apnic_file, "r") as apnic_in:
    for line in apnic_in:
        apnic = apnic_record()
        if apnic.parse(line):
            apnic_dict[apnic.ip] = apnic

with open(imrs_file, "r") as imrs_in:
    for line in imrs_in:
        ok, ip, count = parse_imrs(line)
        if ok and ip in apnic_dict:
            apnic_dict[ip].seen_in_imrs = True
            apnic_dict[ip].imrs_count = count

with open(output_file, "w") as F:
    # The header ends with a comma but the data rows do not; the
    # output format is kept as is.
    F.write("IP, apnic_use, imrs_use,\n")
    for apnic_entry in apnic_dict.values():
        if apnic_entry.seen_in_imrs:
            F.write(apnic_entry.ip + "," + str(apnic_entry.use_count) + "," + str(apnic_entry.imrs_count) + "\n")
78 changes: 78 additions & 0 deletions imrs/imrs_frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#
# This script computes the cumulative distribution of the load
# found in an IMRS file. The per address counts are sorted from
# largest to smallest, and the output lists the cumulative number
# of addresses, the cumulative number of queries, and the
# corresponding fraction of the total load. If a load step is
# specified (e.g. 1%), a line is only written when the cumulative
# load reaches the next step.
#
# Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join

def parse_imrs(line):
    """Read the IP address and the count from one line of the IMRS file.

    The first comma separated field is the IP address, the second one
    is converted to an integer count.

    Returns (ok, ip, count). On failure the exception is printed and
    ok is False; ip and count then hold the values obtained before the
    failure, "" and 0 if nothing was parsed.
    """
    ip, count = "", 0
    try:
        columns = line.split(",")
        ip = columns[0].strip()
        count = int(columns[1].strip())
    except Exception as e:
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
        return False, ip, count
    return True, ip, count


# main
#
# Read the per address counts from the IMRS file, sort them from
# largest to smallest, and write the cumulative distribution.

if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]")
    sys.exit(1)
imrs_file = sys.argv[1]
output_file = sys.argv[2]
load_step = 0
if len(sys.argv) == 4:
    s_load_step = sys.argv[3]
    if not s_load_step.endswith("%"):
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)
    try:
        load_step = float(s_load_step[:-1])/100.0
    except ValueError:
        # Something like "abc%": report it like any other bad step
        # instead of failing with a traceback.
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)

load_vec = []

total_load = 0
# The input file is closed as soon as it has been read.
with open(imrs_file, "r") as imrs_in:
    for line in imrs_in:
        ok, ip, use_count = parse_imrs(line)
        if ok:
            load_vec.append(use_count)
            total_load += use_count

load_vec.sort(reverse=True)

with open(output_file, "w") as F:
    cumulative_use = 0
    cumulative_count = 0
    # With no load step (or a step smaller than one query) the
    # threshold never moves and every address produces a line.
    delta_threshold = int(total_load*load_step)
    threshold = 0
    last_written = 0
    F.write("Count, Queries, frequency,\n")
    for use_count in load_vec:
        cumulative_count += 1
        cumulative_use += use_count
        if cumulative_use >= threshold:
            # Guard against a file in which every count is zero.
            frequency = cumulative_use/total_load if total_load != 0 else 0.0
            F.write(str(cumulative_count) + "," + str(cumulative_use) + "," + str(frequency) + ",\n")
            threshold += delta_threshold
            last_written = cumulative_count
    # Always finish with the grand total, unless it was just written.
    if last_written < cumulative_count:
        frequency = cumulative_use/total_load if total_load != 0 else 0.0
        F.write(str(cumulative_count) + "," + str(cumulative_use) + "," + str(frequency) + ",\n")
90 changes: 90 additions & 0 deletions imrs/imrs_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#
# Parsing support for IMRS records. This module defines the
# imrs_record class, which holds the fields of one line of an
# IMRS file (comma separated: IP address, query volume, hourly
# and daily volumes, and further counters), together with the
# helper functions and the imrs_hyperloglog class that are used
# to parse these fields.
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join


def imrs_parse_one_number(parts, parsed):
    """Parse the integer found at index `parsed` in the list `parts`.

    Returns a tuple (value, next_index) where next_index is parsed + 1,
    so that calls can be chained while walking through the fields.
    Raises IndexError if there is no such field, and ValueError if the
    field is not an integer.
    """
    return int(parts[parsed].strip()), parsed + 1

def imrs_parse_one_vector(parts, parsed, v):
    """Fill every slot of the list `v` with consecutive integers from `parts`.

    Reading starts at index `parsed`; one field is consumed per slot of
    `v`, which is modified in place. Returns the index of the first
    field that was not consumed.
    """
    cursor = parsed
    for slot in range(len(v)):
        v[slot], cursor = imrs_parse_one_number(parts, cursor)
    return cursor

class imrs_hyperloglog:
    """Holds one float value E followed by a vector of 16 numbers.

    NOTE(review): presumably E is the HyperLogLog estimate and hllv the
    registers it was computed from; confirm against the file format.
    """

    def __init__(self):
        self.E = 0.0
        self.hllv = [0.0] * 16

    def parse(self, parts, parsed):
        """Read E, then each entry of hllv, starting at index `parsed`.

        E is parsed as a float and the vector entries as integers.
        Returns the index of the first field that was not consumed.
        """
        self.E = float(parts[parsed].strip())
        cursor = parsed + 1
        for slot in range(len(self.hllv)):
            self.hllv[slot], cursor = imrs_parse_one_number(parts, cursor)
        return cursor

class imrs_record:
    """The fields of one line of an IMRS file, in the order they appear."""

    def __init__(self):
        self.ip = ""
        self.query_volume = 0
        self.hourly_volume = [0] * 24   # 24 entries
        self.daily_volume = [0] * 31    # 31 entries
        self.arpa_count = 0
        self.no_such_domain_queries = 0
        self.no_such_domain_reserved = 0
        self.no_such_domain_frequent = 0
        self.no_such_domain_chromioids = 0
        self.tld_counts = [0] * 8
        self.tld_hyperlog = imrs_hyperloglog()
        self.sld_counts = [0] * 8
        self.sld_hyperlog = imrs_hyperloglog()
        self.name_parts = [0] * 8
        self.rr_types = [0] * 8
        self.locales = [0] * 8

    def parse_imrs(self, line):
        """Fill this record from one comma separated IMRS line.

        The first field is the IP address; the following fields are
        consumed in the order of the attributes set in __init__.

        Returns True if every field was parsed. Otherwise the error is
        printed, with the number of fields consumed so far, and False
        is returned; attributes parsed before the error keep their new
        values.
        """
        ok = False
        # Defined before the try block so that the error message below
        # can always report it.
        parsed = 0
        try:
            parts = line.split(",")
            self.ip = parts[0].strip()
            parsed = 1
            self.query_volume, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.hourly_volume)
            parsed = imrs_parse_one_vector(parts, parsed, self.daily_volume)
            self.arpa_count, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_queries, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_reserved, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_frequent, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_chromioids, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.tld_counts)
            # The hyperloglog objects already exist; they parse their
            # own fields and return the updated index. Their
            # constructor takes no arguments.
            parsed = self.tld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.sld_counts)
            parsed = self.sld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.name_parts)
            parsed = imrs_parse_one_vector(parts, parsed, self.rr_types)
            parsed = imrs_parse_one_vector(parts, parsed, self.locales)
            ok = True
        except Exception as e:
            traceback.print_exc()
            print("Cannot parse IMRS Record after " + str(parsed) + " parts:\n" + line.strip() + "\nException: " + str(e))
        return ok
95 changes: 95 additions & 0 deletions src/imrs_instances.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/python
# coding=utf-8
#
# This computes the monthly totals for all the instances found
# in the east and west folders. The raw data is organized as:
# - ipstats / west / one folder per instance / files per date
# / east / one folder per instance / files per date
# The first processing step is to collect the list of file names
# for each instance: one file per date, possibly more if the
# same instance is present in east and west.
#
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join

def prepare_instances_list(ipstats_folder, month):
    """Collect the paths of the files found for each instance.

    Looks in the "east" and then the "west" sub-folder of
    `ipstats_folder`. Each directory found there is treated as an
    instance, and the path of every entry in that directory is added to
    the list of that instance. An instance present under both poles gets
    the entries of both.

    The `month` argument is accepted but not used to filter the entries.

    Returns a dictionary mapping the instance id (the directory name) to
    the list of paths.
    """
    instances = {}
    for pole in ("east", "west"):
        pole_dir = join(ipstats_folder, pole)
        for instance_id in listdir(pole_dir):
            instance_folder = join(pole_dir, instance_id)
            if not isdir(instance_folder):
                # Plain files at the pole level are ignored.
                continue
            paths = instances.setdefault(instance_id, [])
            paths.extend(join(instance_folder, name) for name in listdir(instance_folder))
    return instances

def check_or_create_dir(dir_path):
    """Make sure that `dir_path` is a directory, creating it if needed.

    Only the last path component is created (os.mkdir). Returns True if
    the directory exists when the function returns. If the creation
    fails, the error is printed and False is returned.
    """
    if isdir(dir_path):
        return True
    try:
        os.mkdir(dir_path)
    except Exception as e:
        traceback.print_exc()
        print("Cannot create <" + dir_path + ">\nException: " + str(e))
        return False
    return True

def process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
    """Run `ithitool` on the list of files of one instance.

    The paths in instances[instance_id] are written, one per line, to
    "<instance_id>_<month>-file-list.txt" in `tmp_folder`. The tool is
    then invoked through the shell as:

        <ithitool> -I <result_path> <file list path>

    where the result path is "<instance_id>_<month>-ipstats.csv" in
    `result_folder`.

    Returns True if the command returned 0. Otherwise an error is
    printed and False is returned. If `do_debug` is set, a line is also
    printed on success.
    """
    prefix = instance_id + "_" + month
    result_file = prefix + "-ipstats.csv"
    result_path = join(result_folder, result_file)
    tmp_path = join(tmp_folder, prefix + "-file-list.txt")
    with open(tmp_path, "wt") as F:
        F.writelines(file_name + "\n" for file_name in instances[instance_id])
    # NOTE: the command line is assembled without quoting, so paths
    # containing spaces or shell metacharacters will not work.
    cmd_ret = os.system(ithitool + ' -I ' + result_path + " " + tmp_path)
    if cmd_ret != 0:
        print(result_file + ": computation failed, error:" + str(cmd_ret))
        return False
    if do_debug:
        print(result_file + ": computed.")
    return True

# main
#
# List the files of each instance, make sure that the "instances" and
# "tmp" folders exist under the ipstats folder, then run the tool once
# per instance. The script stops at the first failure and, in every
# failure case, exits with a non-zero status so that callers can
# detect it.
if len(sys.argv) < 4 or len(sys.argv) > 5 or \
        (len(sys.argv) == 5 and sys.argv[4] != "debug"):
    print("Usage: imrs_instances <ipstats_folder> <yyyymm> <ithitool> [debug]")
    print("There are just " + str(len(sys.argv)) + " arguments.")
    sys.exit(1)
ipstats_folder = sys.argv[1]
month = sys.argv[2]
ithitool = sys.argv[3]
do_debug = len(sys.argv) == 5

print("Writing instance monthly files for: " + ipstats_folder)
try:
    instances = prepare_instances_list(ipstats_folder, month)
    result_folder = join(ipstats_folder, "instances")
    tmp_folder = join(ipstats_folder, "tmp")
    if not (check_or_create_dir(result_folder) and
            check_or_create_dir(tmp_folder)):
        # The reason was already printed by check_or_create_dir.
        sys.exit(1)
    for instance_id in instances:
        if len(instances[instance_id]) > 0:
            if not process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
                sys.exit(1)
except Exception as exc:
    # sys.exit raises SystemExit, which is not an Exception, so the
    # exits above are not intercepted here.
    traceback.print_exc()
    print('\nCode generated an exception: %s' % (exc))
    sys.exit(1)





Loading

0 comments on commit 0d81728

Please sign in to comment.