-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #244 from private-octopus/extract-ip-stats
Statistics by Instance and by Cluster
- Loading branch information
Showing
5 changed files
with
472 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# | ||
# This script joins an APNIC file with an IMRS file. For each IP | ||
# address that is present in both files, it writes one line with | ||
# the address, the use count read from the APNIC file and the | ||
# count read from the IMRS file. | ||
# | ||
# Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file> | ||
# | ||
|
||
import sys | ||
import traceback | ||
import random | ||
import time | ||
import concurrent.futures | ||
import math | ||
import os | ||
from os import listdir | ||
from os.path import isfile, isdir, join | ||
|
||
class imrs_apnic_item:
    """One row of the APNIC/IMRS comparison: an IP address and two counts.

    Attributes:
        ip: the IP address, as text.
        apnic_use: the count associated with the address on the APNIC side.
        imrs_use: the count associated with the address on the IMRS side.
    """

    def __init__(self, ip, apnic_use, imrs_use):
        self.ip = ip
        self.apnic_use = apnic_use
        self.imrs_use = imrs_use

    @staticmethod
    def head():
        """Return the CSV header line (without newline) matching text()."""
        # Declared static so that it can be called on the class, as before,
        # and also on an instance.
        return "IP, apnic_use, imrs_use,"

    def text(self):
        """Return this item as a CSV row (without newline, trailing comma)."""
        # The fields must be read from the instance; bare names would
        # raise NameError.
        return self.ip + "," + str(self.apnic_use) + "," + str(self.imrs_use) + ","
|
||
class apnic_record:
    """One record read from the APNIC file, plus its IMRS match state.

    Attributes:
        ip: first field of the APNIC line, stripped.
        use_count: fourth field of the APNIC line, as an integer.
        seen_in_imrs: set by the caller when the same IP is found in IMRS.
        imrs_count: set by the caller to the count found in IMRS.
    """

    def __init__(self):
        self.ip = ""
        self.use_count = 0
        self.seen_in_imrs = False
        self.imrs_count = 0

    def parse(self, line):
        """Fill ip and use_count from one comma-separated APNIC line.

        Returns True when the line has at least four fields and the fourth
        one is an integer, False otherwise. A non-integer fourth field is
        reported on the console; lines that are too short (for example
        blank lines) are rejected silently.
        """
        parts = line.split(",")
        if len(parts) < 4:
            # Reject explicitly, so that the caller never stores a record
            # whose ip is still the empty string.
            return False
        try:
            self.ip = parts[0].strip()
            self.use_count = int(parts[3].strip())
        except ValueError as e:
            traceback.print_exc()
            print("Cannot parse APNIC Record:\n" + line.strip() + "\nException: " + str(e))
            return False
        return True
|
||
def parse_imrs(line):
    """Parse one comma-separated line of an IMRS file.

    The first field is taken as the IP address and the second one as an
    integer count; any further fields are ignored.

    Returns a tuple (ok, ip, count). ok is False when the line has fewer
    than two fields or when the second field is not an integer. In that
    case the problem is reported on the console, count is 0 and ip holds
    whatever was read from the first field.
    """
    ok = False
    ip = ""
    count = 0
    parts = line.split(",")
    try:
        ip = parts[0].strip()
        count = int(parts[1].strip())
        ok = True
    except (IndexError, ValueError) as e:
        # Only these two errors can come from malformed text; anything
        # else is a programming error and should not be hidden here.
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
    return ok, ip, count
|
||
|
||
# main | ||
|
||
if len(sys.argv) != 4:
    print("Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(1)
imrs_file = sys.argv[1]
apnic_file = sys.argv[2]
output_file = sys.argv[3]

# Index every APNIC record that parses correctly by its IP address.
apnic_dict = dict()

# The input files are opened in "with" blocks so that they are closed
# as soon as they have been read.
with open(apnic_file, "r") as apnic_input:
    for line in apnic_input:
        apnic = apnic_record()
        if apnic.parse(line):
            apnic_dict[apnic.ip] = apnic

# Mark the APNIC records whose IP address is also present in the IMRS file.
with open(imrs_file, "r") as imrs_input:
    for line in imrs_input:
        ok, ip, count = parse_imrs(line)
        if ok and ip in apnic_dict:
            apnic_dict[ip].seen_in_imrs = True
            apnic_dict[ip].imrs_count = count

# Write one line per address found in both files.
with open(output_file, "w") as F:
    F.write("IP, apnic_use, imrs_use,\n")
    for apnic_entry in apnic_dict.values():
        if apnic_entry.seen_in_imrs:
            F.write(apnic_entry.ip + "," + str(apnic_entry.use_count) + "," + str(apnic_entry.imrs_count) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# | ||
# This script computes the cumulative distribution of the counts | ||
# found in an IMRS file. The counts are sorted from largest to | ||
# smallest, and each output line holds the number of entries | ||
# considered so far, the sum of their counts, and that sum as a | ||
# fraction of the total. If a load step is specified, a new line | ||
# is only written when the running sum reaches the next step. | ||
# | ||
# Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%] | ||
# | ||
|
||
import sys | ||
import traceback | ||
import random | ||
import time | ||
import concurrent.futures | ||
import math | ||
import os | ||
from os import listdir | ||
from os.path import isfile, isdir, join | ||
|
||
def parse_imrs(line):
    """Parse one comma-separated line of an IMRS file.

    The first field is taken as the IP address and the second one as an
    integer count; any further fields are ignored.

    Returns a tuple (ok, ip, count). ok is False when the line has fewer
    than two fields or when the second field is not an integer. In that
    case the problem is reported on the console, count is 0 and ip holds
    whatever was read from the first field.
    """
    ok = False
    ip = ""
    count = 0
    parts = line.split(",")
    try:
        ip = parts[0].strip()
        count = int(parts[1].strip())
        ok = True
    except (IndexError, ValueError) as e:
        # Only these two errors can come from malformed text; anything
        # else is a programming error and should not be hidden here.
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
    return ok, ip, count
|
||
|
||
# main | ||
|
||
if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]")
    sys.exit(1)
imrs_file = sys.argv[1]
output_file = sys.argv[2]
# With the default step of 0 the threshold never moves, so one line is
# written per input entry.
load_step = 0
if len(sys.argv) == 4:
    s_load_step = sys.argv[3]
    if not s_load_step.endswith("%"):
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)
    try:
        load_step = float(s_load_step[:-1])/100.0
    except ValueError:
        # Something like "abc%": report it the same way as a missing "%".
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)

load_vec = []

total_load = 0
# The "with" block closes the input file as soon as it has been read.
with open(imrs_file, "r") as imrs_input:
    for line in imrs_input:
        ok, ip, use_count = parse_imrs(line)
        if ok:
            load_vec.append(use_count)
            total_load += use_count

# Largest counts first, so that the cumulative sum grows as fast as possible.
load_vec.sort(reverse=True)


def _frequency_line(count, use, total):
    """Format one output line: entry count, cumulative use, fraction of total."""
    # If every count is 0 the total is 0 as well; report a fraction of 0
    # rather than failing on a division by zero.
    frequency = use/total if total != 0 else 0.0
    return str(count) + "," + str(use) + "," + str(frequency) + ",\n"


with open(output_file, "w") as F:
    cumulative_use = 0
    cumulative_count = 0
    delta_threshold = int(total_load*load_step)
    threshold = 0
    last_written = 0
    F.write("Count, Queries, frequency,\n")
    for use_count in load_vec:
        cumulative_count += 1
        cumulative_use += use_count
        if cumulative_use >= threshold:
            F.write(_frequency_line(cumulative_count, cumulative_use, total_load))
            threshold += delta_threshold
            last_written = cumulative_count
    # Always finish with the line for the complete data set.
    if last_written < cumulative_count:
        F.write(_frequency_line(cumulative_count, cumulative_use, total_load))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# | ||
# This script will try to build a sample of the input file. | ||
# The purpose of the sample is, get a realistic test file | ||
# that is small enough for iterative development, measures, | ||
# etc., yet big enough to obtain statistically significant | ||
# results. | ||
# | ||
# Usage: imrs_sample.py <input_file> <sampling rate in %> <output_file> | ||
# | ||
|
||
import sys | ||
import traceback | ||
import random | ||
import time | ||
import concurrent.futures | ||
import math | ||
import os | ||
from os import listdir | ||
from os.path import isfile, isdir, join | ||
|
||
|
||
def imrs_parse_one_number(parts, parsed):
    """Read the integer stored in parts[parsed].

    Args:
        parts: list of text fields obtained by splitting a record.
        parsed: index of the field to read.

    Returns:
        A tuple (value, next_index), where next_index is parsed + 1.

    Raises:
        IndexError: if parsed is beyond the end of parts.
        ValueError: if the field is not an integer.
    """
    # The stripped text is what gets converted; the original computed it
    # and then ignored it.
    v = int(parts[parsed].strip())
    return v, parsed + 1
|
||
def imrs_parse_one_vector(parts, parsed, v):
    """Fill every slot of v, in place, from consecutive fields of parts.

    Reading starts at index parsed and uses one field per slot of v.
    Returns the index of the first field that was not consumed.
    """
    for slot, _ in enumerate(v):
        v[slot], parsed = imrs_parse_one_number(parts, parsed)
    return parsed
|
||
class imrs_hyperloglog:
    """A float E and a 16-entry vector hllv read from a split record.

    NOTE(review): judging by the names, E is presumably a HyperLogLog
    estimate and hllv its registers -- confirm against the file producer.
    """

    def __init__(self):
        self.E = 0.0
        self.hllv = [0.0] * 16

    def parse(self, parts, parsed):
        """Read E, then the 16 hllv entries, starting at parts[parsed].

        Returns the index of the first field that was not consumed.
        """
        self.E = float(parts[parsed].strip())
        return imrs_parse_one_vector(parts, parsed + 1, self.hllv)
|
||
class imrs_record:
    """One IMRS record, parsed from a single comma-separated line.

    The fields are read in the order in which the attributes are declared
    in __init__: the IP address, query_volume, the 24 entries of
    hourly_volume, the 31 entries of daily_volume, five counters, and then
    the 8-entry vectors, with one imrs_hyperloglog after tld_counts and
    another one after sld_counts.
    """

    def __init__(self):
        self.ip = ""
        self.query_volume = 0
        self.hourly_volume = [0] * 24
        self.daily_volume = [0] * 31
        self.arpa_count = 0
        self.no_such_domain_queries = 0
        self.no_such_domain_reserved = 0
        self.no_such_domain_frequent = 0
        self.no_such_domain_chromioids = 0
        self.tld_counts = [0] * 8
        self.tld_hyperlog = imrs_hyperloglog()
        self.sld_counts = [0] * 8
        self.sld_hyperlog = imrs_hyperloglog()
        self.name_parts = [0] * 8
        self.rr_types = [0] * 8
        self.locales = [0] * 8

    def parse_imrs(self, line):
        """Fill this record from one line of text.

        Returns True when every field was read. On failure the problem is
        reported on the console and False is returned; the attributes read
        before the failure keep their new values.
        """
        ok = False
        # Defined before the try block so that the error message can always
        # report how far the parsing went.
        parsed = 0
        try:
            parts = line.split(",")
            self.ip = parts[0].strip()
            parsed = 1
            self.query_volume, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.hourly_volume)
            parsed = imrs_parse_one_vector(parts, parsed, self.daily_volume)
            self.arpa_count, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_queries, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_reserved, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_frequent, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_chromioids, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.tld_counts)
            # The hyperlog objects already exist; they are filled with their
            # parse() method. Calling the constructor with arguments raised
            # TypeError, so that no record could ever be parsed.
            parsed = self.tld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.sld_counts)
            parsed = self.sld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.name_parts)
            parsed = imrs_parse_one_vector(parts, parsed, self.rr_types)
            parsed = imrs_parse_one_vector(parts, parsed, self.locales)
            ok = True
        except Exception as e:
            traceback.print_exc()
            print("Cannot parse IMRS Record after " + str(parsed) + " parts:\n" + line.strip() + "\nException: " + str(e))
        return ok
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#!/usr/bin/python | ||
# coding=utf-8 | ||
# | ||
# This computes the monthly totals for all the instances found | ||
# in the east and west folders. The raw data is organized as: | ||
# - ipstats / west / one folder per instance / files per date | ||
# / east / one folder per instance / files per date | ||
# The first processing step is to collect the list of file names | ||
# for each instance: one file per date, possibly more if the | ||
# same instance is present in east and west. | ||
# | ||
# | ||
|
||
import sys | ||
import traceback | ||
import random | ||
import time | ||
import concurrent.futures | ||
import os | ||
from os import listdir | ||
from os.path import isfile, isdir, join | ||
|
||
def prepare_instances_list(ipstats_folder, month):
    """Collect, per instance, the paths of all files found for it.

    Looks at the "east" and "west" sub-folders of ipstats_folder. Each
    directory found there is treated as an instance, and every entry of
    that directory is added to the list of the instance. An instance that
    is present under both poles gets the files of both.

    The month argument is accepted but is not used here: the files are
    not filtered by date.

    Returns a dictionary mapping the instance id to a list of paths.
    """
    instances = {}
    for pole in ("east", "west"):
        pole_dir = join(ipstats_folder, pole)
        for instance_id in listdir(pole_dir):
            instance_folder = join(pole_dir, instance_id)
            if not isdir(instance_folder):
                # Plain files directly under the pole folder are ignored.
                continue
            paths = instances.setdefault(instance_id, [])
            paths.extend(join(instance_folder, file_name)
                         for file_name in listdir(instance_folder))
    return instances
|
||
def check_or_create_dir(dir_path):
    """Make sure that dir_path is a directory, creating it if needed.

    Only the last path component is created. Returns True if the directory
    exists on return, and False (after reporting the error on the console)
    if it could not be created.
    """
    if isdir(dir_path):
        return True
    try:
        os.mkdir(dir_path)
    except Exception as e:
        traceback.print_exc()
        print("Cannot create <" + dir_path + ">\nException: " + str(e))
        return False
    return True
|
||
def process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
    """Run the ithitool program to merge all the files of one instance.

    The list of input files, instances[instance_id], is written one path
    per line to "<instance_id>_<month>-file-list.txt" in tmp_folder. The
    tool is then run as:
        <ithitool> -I <result_path> <list_path>
    where result_path is "<instance_id>_<month>-ipstats.csv" in
    result_folder.

    Returns True if the tool exits with status 0. Otherwise the failure is
    reported on the console and False is returned. When do_debug is set,
    a success is reported as well.
    """
    # Imported here because the module does not import subprocess.
    import subprocess

    result_file = instance_id + "_" + month + "-ipstats.csv"
    result_path = join(result_folder, result_file)
    tmp_file = instance_id + "_" + month + "-file-list.txt"
    tmp_path = join(tmp_folder, tmp_file)
    # The list must be completely written and closed before the tool runs.
    with open(tmp_path, "wt") as F:
        F.writelines(file_name + "\n" for file_name in instances[instance_id])
    # The arguments are passed as a list, without going through a shell, so
    # that paths containing spaces or shell special characters are handed
    # to the tool unchanged.
    merge_cmd = [ithitool, "-I", result_path, tmp_path]
    try:
        cmd_ret = subprocess.run(merge_cmd).returncode
    except OSError as e:
        # The tool could not be started at all (missing, not executable...).
        print(result_file + ": computation failed, error:" + str(e))
        return False
    if cmd_ret != 0:
        print(result_file + ": computation failed, error:" + str(cmd_ret))
        return False
    if do_debug:
        print(result_file + ": computed.")
    return True
|
||
# main | ||
if len(sys.argv) < 4 or len(sys.argv) > 5 or \
   (len(sys.argv) == 5 and sys.argv[4] != "debug"):
    print("Usage: imrs_instances <ipstats_folder> <yyyymm> <ithitool> [debug]")
    print("There are just " + str(len(sys.argv)) + " arguments.")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(1)
ipstats_folder = sys.argv[1]
month = sys.argv[2]
ithitool = sys.argv[3]
do_debug = len(sys.argv) == 5

print("Writing instance monthly files for: " + ipstats_folder)
try:
    instances = prepare_instances_list(ipstats_folder, month)
    result_folder = join(ipstats_folder, "instances")
    tmp_folder = join(ipstats_folder, "tmp")
    if not (check_or_create_dir(result_folder) and
            check_or_create_dir(tmp_folder)):
        # The error was already reported; make the failure visible to the
        # caller instead of ending with a success status.
        sys.exit(1)
    for instance_id in instances:
        if len(instances[instance_id]) > 0:
            if not process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
                sys.exit(1)
except Exception as exc:
    # sys.exit raises SystemExit, which is not an Exception and thus is
    # not intercepted here.
    traceback.print_exc()
    print('\nCode generated an exception: %s' % (exc))
    sys.exit(1)
|
||
|
||
|
||
|
||
|
Oops, something went wrong.