Skip to content

Commit

Permalink
Save sampler script
Browse files Browse the repository at this point in the history
  • Loading branch information
huitema committed Apr 15, 2024
1 parent 6787cdb commit adf1490
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 0 deletions.
56 changes: 56 additions & 0 deletions imrs/imrs_sampler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# This script builds a random sample of the input file.
# The purpose of the sample is to get a realistic test file
# that is small enough for iterative development, measurements,
# etc., yet big enough to obtain statistically significant
# results.
#
# Usage: imrs_sampler.py <input_file> <sampling rate in %> <output_file>
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join


# main

_USAGE = "Usage: imrs_sampler.py <input_file> <sampling rate in %> <output_file>"


def _sample_lines(file_name_in, file_name_out, sampling_rate):
    """Copy a random subset of the lines of one file into another.

    Each input line is written to the output independently, when a draw of
    random.random() is below `sampling_rate`.

    Returns a tuple (number of lines read, number of lines written).
    """
    nb_lines_in = 0
    nb_lines_out = 0
    # Open the input first: if it cannot be read, an existing output file
    # is not truncated. Both files are closed when the block exits.
    with open(file_name_in, "rt") as f_in, open(file_name_out, "wt") as f_out:
        for line in f_in:
            nb_lines_in += 1
            if random.random() < sampling_rate:
                f_out.write(line)
                nb_lines_out += 1
    return nb_lines_in, nb_lines_out


def main(argv):
    """Run the sampler.

    argv holds the script name followed by the input file name, the
    sampling rate written as a percentage (e.g. "5%" or "0.1%") and the
    output file name.

    Returns the process exit code: 0 on success, 1 if the arguments are
    missing or the sampling rate cannot be parsed. Errors raised while
    opening or reading the files are not caught here.
    """
    if len(argv) != 4:
        print(_USAGE)
        return 1
    file_name_in = argv[1]
    rate_text = argv[2]
    file_name_out = argv[3]

    if not rate_text.endswith("%"):
        print("sampling rate should be e.g. 5%, 0.1%, not " + rate_text)
        return 1
    try:
        sampling_rate = float(rate_text[:-1]) / 100.0
    except ValueError as e:
        traceback.print_exc()
        # Report the text that was typed: no parsed value exists here.
        print("Cannot parse <" + rate_text + ">\nException: " + str(e))
        return 1

    nb_lines_in, nb_lines_out = _sample_lines(
        file_name_in, file_name_out, sampling_rate)

    if nb_lines_in == 0:
        # Also avoids dividing by zero when computing the actual rate.
        print("Input file " + file_name_in + " is empty.")
    else:
        print("Input file " + file_name_in + ": " + str(nb_lines_in) + " lines")
        print("Output file " + file_name_out + ": " + str(nb_lines_out) + " lines")
        print("Sampling rate requested: " + str(sampling_rate))
        print("Sampling rate actual: " + str(nb_lines_out/nb_lines_in))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
86 changes: 86 additions & 0 deletions src/imrs_total.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/python
# coding=utf-8
#
# This script computes the total of the IMRS resolver data
# across all clusters for the month. The monthly per-cluster
# files are expected in the folder <ipstats_folder>/monthly,
# with names like us-lax.202403-ipstats.csv. The script lists
# the matching files in <ipstats_folder>/tmp/<yyyymm>.txt, then
# runs ithitool with the -I option on that list to produce a
# single file such as <ipstats_folder>/total-202403-ipstats.csv.
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join

def check_or_create_dir(dir_path):
    """Make sure that the directory `dir_path` exists.

    The directory is created with os.mkdir if it is missing; parent
    directories are not created.

    Returns True if the directory exists when the function returns,
    False if it could not be created. In that case the traceback and
    an error message are printed.
    """
    if isdir(dir_path):
        return True
    try:
        os.mkdir(dir_path)
    except OSError as e:
        # Another process may have created the directory between the
        # isdir() check and the mkdir() call: that is not a failure.
        if isdir(dir_path):
            return True
        traceback.print_exc()
        print("Cannot create <" + dir_path + ">\nException: " + str(e))
        return False
    return True

# main
#
# Arguments:
#   <ipstats_folder>  folder holding the "monthly" and "tmp" sub folders
#   <yyyymm>          month to process, as used in the file names
#   <ithitool>        command used to run ithitool
#   "debug"           optional, enables progress traces
if len(sys.argv) < 4 or len(sys.argv) > 5 or \
        (len(sys.argv) == 5 and sys.argv[4] != "debug"):
    print("Usage: imrs_total.py <ipstats_folder> <yyyymm> <ithitool> [\"debug\"]")
    print("There are just " + str(len(sys.argv)) + " arguments.")
    sys.exit(1)
ipstats_folder = sys.argv[1]
month = sys.argv[2]
ithitool = sys.argv[3]
do_debug = len(sys.argv) == 5

print("Writing monthly per custom clusters aggregates for: " + ipstats_folder)
try:
    # Look at every file under the "monthly" folder
    monthly_folder = join(ipstats_folder, "monthly")
    tmp_folder = join(ipstats_folder, "tmp")
    if check_or_create_dir(monthly_folder) and \
            check_or_create_dir(tmp_folder):
        # List the folder only after it is known to exist.
        monthly_list = listdir(monthly_folder)
        tmp_file_name = join(tmp_folder, month + ".txt")
        # Check that this is a cluster file, and not some other file.
        # Watch for: cluster_id + "." + month + "-" + "ipstats.csv",
        # where cluster_id looks like "us-lax".
        monthly_file_end = month + "-" + "ipstats.csv"
        with open(tmp_file_name, "wt") as F:
            for monthly_file in monthly_list:
                monthly_path = join(monthly_folder, monthly_file)
                if len(monthly_file) > 7 and \
                        monthly_file[2] == "-" and \
                        monthly_file[6] == "." and \
                        monthly_file.endswith(monthly_file_end):
                    F.write(monthly_path + "\n")
                    if do_debug:
                        print("Adding: " + monthly_file)
                elif do_debug:
                    print("Not a monthly file: " + monthly_path)
        # The list file is closed at this point, so ithitool reads
        # its complete content.
        total_file = "total-" + month + "-" + "ipstats.csv"
        total_path = join(ipstats_folder, total_file)
        # NOTE(review): the command is passed to the shell as a plain
        # string, so paths containing spaces or shell characters will
        # not be handled correctly.
        merge_cmd = ithitool + ' -I ' + total_path + " " + tmp_file_name
        if do_debug:
            print("Running: " + merge_cmd)
        # Flush our traces before the child process writes its own.
        sys.stdout.flush()
        cmd_ret = os.system(merge_cmd)
        if cmd_ret == 0:
            if do_debug:
                print(total_file + ": computed.")
        else:
            print(total_file + ": computation failed, error:" + str(cmd_ret))
except Exception as exc:
    traceback.print_exc()
    print('\nCode generated an exception: %s' % (exc))





0 comments on commit adf1490

Please sign in to comment.