#!/usr/bin/env python3

"""
Batch compare UHDM DB outputs. Useful to compare outputs generated by two CI builds.
"""

import argparse
import multiprocessing
import os
import platform
import pprint
import re
import shutil
import subprocess
import sys
import time
import traceback

from collections import OrderedDict
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime
from enum import Enum, unique
from pathlib import Path
from threading import Lock


_this_filepath = os.path.realpath(__file__)
_default_dbname = 'surelog.uhdm'


_log_mutex = Lock()
def log(text, end='\n'):
    """Thread-safe print: serializes writes so concurrent lines don't interleave."""
    with _log_mutex:  # Lock is a context manager; replaces manual acquire/finally/release
        print(text, end=end, flush=True)


@unique
class Status(Enum):
    """Outcome of one database comparison."""
    PASS = 0
    FAIL = -1
    MISSING = -2  # one (or both) of the two DB files did not exist

    def __str__(self):
        return str(self.name)


def _mkdir(dirpath, retries=10):
    """Create dirpath (and parents), retrying to ride out transient FS races.

    Returns True if the directory exists when done.
    """
    count = 0
    while count < retries:
        os.makedirs(dirpath, exist_ok=True)

        if os.path.exists(dirpath):
            return True

        count += 1
        time.sleep(0.1)

    return os.path.exists(dirpath)


def _rmdir(dirpath, retries=10):
    """Recursively delete dirpath, retrying; the final attempt may raise.

    Returns True if the directory is gone when done.
    """
    count = 0
    while count < retries:
        shutil.rmtree(dirpath, ignore_errors=True)

        if not os.path.exists(dirpath):
            return True

        count += 1
        time.sleep(0.1)

    shutil.rmtree(dirpath)  # last attempt: let the exception propagate for diagnostics
    return not os.path.exists(dirpath)


def _find_files(dirpath, pattern):
    """Recursive glob under dirpath; returns a list of pathlib.Path objects."""
    return list(Path(dirpath).rglob(pattern))


def _is_filtered(name, filters):
    """Return True if name passes the filter set (an empty filter set admits all).

    Filters may be plain strings (case-insensitive exact match) or compiled
    regular expressions (searched anywhere in the name).
    """
    if not filters:
        return True

    for flt in filters:  # renamed: `filter` shadowed the builtin
        if isinstance(flt, str):
            if flt.lower() == name.lower():
                return True
        elif flt.search(name):  # Note: match() reports success only if the match is at index 0
            return True

    return False


def _compare_one(params):
    """Compare one test's LHS vs RHS DB with uhdm-cmp.

    params is (name, lhs_dirpath, rhs_dirpath, uhdm_cmp_filepath, output_dirpath);
    a tuple so it can be shipped through multiprocessing.Pool.map.
    Returns (name, Status). All diagnostic output goes to <output>/<name>.log.
    """
    start_dt = datetime.now()
    name, lhs_dirpath, rhs_dirpath, uhdm_cmp_filepath, output_dirpath = params

    log_filepath = os.path.join(output_dirpath, f'{name}.log')
    lhs_filepath = os.path.join(lhs_dirpath, name, _default_dbname)
    rhs_filepath = os.path.join(rhs_dirpath, name, _default_dbname)

    log(f'Comparing {name}')

    status = Status.FAIL
    with open(log_filepath, 'wt') as log_strm, redirect_stdout(log_strm), redirect_stderr(log_strm):
        lhs_exists = os.path.isfile(lhs_filepath)
        rhs_exists = os.path.isfile(rhs_filepath)

        print(f'start-time: {start_dt}')
        print( '')
        print( 'Environment:')
        print(f'             name: {name}')
        print(f'      lhs-dirpath: {lhs_dirpath}')
        print(f'      rhs-dirpath: {rhs_dirpath}')  # fixed label: was "rhs-filepath" for a dirpath
        print(f'uhdm_cmp-filepath: {uhdm_cmp_filepath}')
        print(f'   output-dirpath: {output_dirpath}')
        print(f'     lhs-filepath: {lhs_filepath}')
        print(f'     rhs-filepath: {rhs_filepath}')
        print(f'     log-filepath: {log_filepath}')
        print(f'       lhs-exists: {lhs_exists}')
        print(f'       rhs-exists: {rhs_exists}')
        print( '\n')

        if lhs_exists and rhs_exists:
            args = [uhdm_cmp_filepath, lhs_filepath, rhs_filepath]

            print('Launching uhdm-cmp with arguments:')
            pprint.pprint(args)
            print('\n', flush=True)

            try:
                result = subprocess.run(
                    args,
                    stdout=log_strm,
                    stderr=subprocess.STDOUT,
                    check=False,
                    cwd=os.path.dirname(uhdm_cmp_filepath))
                print(f'uhdm-cmp terminated with exit code: {result.returncode}')
                status = Status.PASS if result.returncode == 0 else Status.FAIL
            except Exception:
                # If subprocess.run() itself raised, `result` was never bound, so do
                # NOT reference result.returncode here (the original did -> NameError
                # inside the handler). Bare `except:` also narrowed to Exception so
                # KeyboardInterrupt/SystemExit still propagate.
                status = Status.FAIL
                print('uhdm-cmp threw an exception')
                traceback.print_exc()
        else:
            print(f'Existence mismatch: lhs={lhs_exists}, rhs={rhs_exists}')
            status = Status.MISSING

    end_dt = datetime.now()
    delta = end_dt - start_dt
    print(f'end-time: {str(end_dt)} {str(delta)}', flush=True)

    return (name, status)


def _compare(args):
    """Fan out _compare_one over every test found on either side.

    Returns {test_name: Status}. Tests present on only one side are still
    compared (and come back MISSING) because the name set is the union.
    """
    lhs_filepaths = _find_files(args.lhs_dirpath, _default_dbname)
    rhs_filepaths = _find_files(args.rhs_dirpath, _default_dbname)

    # Test name == parent directory name of the DB file.
    test_names = set(Path(Path(filepath).parent).stem for filepath in lhs_filepaths + rhs_filepaths)

    params = [(
        name,
        args.lhs_dirpath,
        args.rhs_dirpath,
        args.uhdm_cmp_filepath,
        args.output_dirpath
    ) for name in test_names if _is_filtered(name, args.filters)]

    jobs = min(args.jobs, len(params))

    if jobs <= 1:  # includes --jobs 0: run sequentially
        results = [_compare_one(param) for param in params]
    else:
        with multiprocessing.Pool(processes=jobs) as pool:
            results = pool.map(_compare_one, params)

    return dict(results)


def _print_report(results):
    """Print a per-test results table and a per-status summary table."""
    columns = ['TESTNAME', 'STATUS']

    rows = []
    summary = OrderedDict([(status.name, 0) for status in Status])
    # Sort failures first (most negative Status first), then by name.
    for name, status in sorted(results.items(), key=lambda r: (-r[1].value, r[0])):
        summary[status.name] += 1
        rows.append([name, status.name])

    widths = [max([len(row[index]) for row in [columns] + rows]) for index in range(len(columns))]
    row_format = '| ' + ' | '.join([f'{{:{widths[i]}}}' for i in range(len(widths))]) + ' |'
    separator = '+-' + '-+-'.join(['-' * width for width in widths]) + '-+'

    print('')
    print('Results: ')
    print(separator)
    print(row_format.format(*columns))
    print(separator)
    for row in rows:
        print(row_format.format(*row))
    print(separator, flush=True)

    rows = [[k, str(v)] for k, v in summary.items()]
    widths = [max([len(str(row[index])) for row in rows]) for index in range(2)]
    row_format = '| ' + ' | '.join([f'{{:{width}}}' for width in widths]) + ' |'
    separator = '+-' + '-+-'.join(['-' * width for width in widths]) + '-+'

    print('')
    print('Summary: ')
    print(separator)
    for row in rows:
        print(row_format.format(*row))
    print(separator, flush=True)


def _main():
    """Entry point: parse args, compare all DBs, report, return failure sum."""
    start_dt = datetime.now()
    print(f'Starting UHDM Database Compare Regression Tests @ {str(start_dt)}')

    parser = argparse.ArgumentParser()
    parser.add_argument('lhs_dirpath', type=str, help='LHS directory containing uhdm DBs.')
    parser.add_argument('rhs_dirpath', type=str, help='RHS directory containing uhdm DBs.')

    parser.add_argument(
        '--uhdm-cmp-filepath', dest='uhdm_cmp_filepath', required=True, type=str, help='Location of uhdm-cmp executable')
    parser.add_argument(
        '--output-dirpath', dest='output_dirpath', required=True, type=str, help='Output directory path')

    parser.add_argument(
        '--filters', nargs='+', required=False, default=[], type=str,
        help='Filter comparing only matching these regex inputs')
    parser.add_argument(
        '--jobs', nargs='?', required=False, default=multiprocessing.cpu_count(), type=int,
        help='Run tests in parallel, optionally providing max number of concurrent processes. Set 0 to run sequentially.')
    args = parser.parse_args()

    args.uhdm_cmp_filepath = os.path.abspath(args.uhdm_cmp_filepath)
    args.output_dirpath = os.path.abspath(args.output_dirpath)

    if args.filters:
        # A filter starting with '@' names a file containing one filter per line.
        filters = set()
        for entry in args.filters:
            if entry.startswith('@'):
                with open(entry[1:], 'rt') as strm:
                    for line in strm:
                        line = line.strip()
                        if line:
                            filters.add(line)
            else:
                filters.add(entry)
        args.filters = filters

    # Purely alphanumeric filters are exact-match strings; anything else is a regex.
    args.filters = [text if text.isalnum() else re.compile(text, re.IGNORECASE) for text in args.filters]

    if (args.jobs is None) or (args.jobs > multiprocessing.cpu_count()):
        args.jobs = multiprocessing.cpu_count()

    print( 'Environment:')
    print(f'      command-line: {" ".join(sys.argv)}')
    print(f'   current-dirpath: {os.getcwd()}')
    print(f'       lhs_dirpath: {args.lhs_dirpath}')
    print(f'       rhs_dirpath: {args.rhs_dirpath}')
    print(f' uhdm-cmp-filepath: {args.uhdm_cmp_filepath}')
    print(f'    output-dirpath: {args.output_dirpath}')
    print(f'           filters: {args.filters}')
    print(f'              jobs: {args.jobs}')
    print( '\n\n')

    _mkdir(args.output_dirpath)

    print('Comparing UHDM Databases ...')
    results = _compare(args)

    failed_count = len([_ for _ in results.values() if _ != Status.PASS])
    # Sum of negative Status values: 0 means everything passed.
    compare_result = sum(_.value for _ in results.values() if _ != Status.PASS)
    print(f'Successfully compared {len(results)} databases with {failed_count} failures.')

    _print_report(results)

    end_dt = datetime.now()
    delta = round((end_dt - start_dt).total_seconds())

    print('')
    print(f'Surelog UHDM Database Compare Regression Test Completed @ {str(end_dt)} in {str(delta)} seconds')

    return compare_result


if __name__ == '__main__':
    sys.exit(_main())
builds. +NOTE: CI artifacts are hierarhical tarballs i.e. tarball within tarball within tarball. +""" + +import argparse +import multiprocessing +import os +import pprint +import re +import shutil +import sys +import tarfile +import time +import zipfile + +from datetime import datetime +from enum import Enum, unique +from pathlib import Path +from threading import Lock + + +_default_dbname = 'surelog.uhdm' +_platform_ids = ['', '.linux', '.osx', '.msys', '.win'] + + +_log_mutex = Lock() +def log(text, end='\n'): + _log_mutex.acquire() + try: + print(text, end=end, flush=True) + finally: + _log_mutex.release() + + +@unique +class Status(Enum): + PASS = 0 + FAIL = -1 + NULL = -2 + + def __str__(self): + return str(self.name) + + +def _mkdir(dirpath, retries=10): + count = 0 + while count < retries: + os.makedirs(dirpath, exist_ok=True) + + if os.path.exists(dirpath): + return True + + count += 1 + time.sleep(0.1) + + return os.path.exists(dirpath) + + +def _rmdir(dirpath, retries=10): + count = 0 + while count < retries: + shutil.rmtree(dirpath, ignore_errors=True) + + if not os.path.exists(dirpath): + return True + + count += 1 + time.sleep(0.1) + + shutil.rmtree(dirpath) + return not os.path.exists(dirpath) + + +def _is_filtered(name, filters): + if not filters: + return True + + for filter in filters: + if isinstance(filter, str): + if filter.lower() == name.lower(): + return True + else: + if filter.search(name): # Note: match() reports success only if the match is at index 0 + return True + + return False + + +def _extract_worker(params): + zip_filepath, output_dirpath, modes, filters = params + archive_name = Path(zip_filepath).stem + + results = {} + with zipfile.ZipFile(zip_filepath, 'r') as zipfile_strm: + with zipfile_strm.open(f'{archive_name}.tar.gz') as tarfile_strm: + with tarfile.open(fileobj=tarfile_strm) as archive_strm: + for test_archive_path in archive_strm.getnames(): + test_archive_name = Path(Path(test_archive_path).stem).stem + + if not 
_is_filtered(test_archive_name, filters): + continue + + dst_dirpath = os.path.join(output_dirpath, test_archive_name) + _mkdir(dst_dirpath) + + results[test_archive_name] = {mode: Status.NULL for mode in modes} + + with tarfile.open(fileobj=archive_strm.extractfile(test_archive_path)) as test_archive_strm: + if 'db' in modes: + for slpp in ['slpp_all', 'slpp_unit']: + src_filepath = f'{test_archive_name}/{slpp}/{_default_dbname}' + + if src_filepath in test_archive_strm.getnames(): + dst_filepath = os.path.join(dst_dirpath, _default_dbname) + + try: + src_strm = test_archive_strm.extractfile(src_filepath) + + with open(dst_filepath, 'wb') as dst_strm: + dst_strm.write(src_strm.read()) + dst_strm.flush() + + if results[test_archive_name]['db'] == Status.NULL: + results[test_archive_name]['db'] = Status.PASS + except Exception: + results[test_archive_name]['db'] = Status.FAIL + + break + + if 'log' in modes: + for platform_id in _platform_ids: + src_filepath = f'{test_archive_name}/{test_archive_name}{platform_id}.log' + + if src_filepath in test_archive_strm.getnames(): + dst_filepath = os.path.join(dst_dirpath, f'{test_archive_name}{platform_id}.log') + + try: + src_strm = test_archive_strm.extractfile(src_filepath) + + with open(dst_filepath, 'wb') as dst_strm: + dst_strm.write(src_strm.read()) + dst_strm.flush() + + if results[test_archive_name]['log'] == Status.NULL: + results[test_archive_name]['log'] = Status.PASS + except Exception: + results[test_archive_name]['log'] = Status.FAIL + + return results + + +def _main(): + start_dt = datetime.now() + print(f'Starting CI artifact extraction @ {str(start_dt)}') + + parser = argparse.ArgumentParser() + parser.add_argument('modes', nargs='+', choices=['db', 'log'], type=str, help='Pick what to extract from available choices') + parser.add_argument('--build-no', dest='build_no', required=True, type=int, help='CI build no.') + parser.add_argument('--archive-filename-pattern', dest='archive_filename_pattern', 
required=False, type=str, + default='sl-{}-linux-gcc-release-regression-{}.zip', help='Archive filepath pattern (with extension).') + parser.add_argument('--shards', dest='shards', nargs='+', type=int, + default=[0, 1, 2, 3, 4, 5], help='List of shards to extract') + parser.add_argument( + '--input-dirpath', dest='input_dirpath', required=False, type=str, + help='Input directory path to find the artifacts.') + parser.add_argument( + '--output-dirpath', dest='output_dirpath', required=False, type=str, + help='Output directory path to extract to.') + parser.add_argument( + '--filters', nargs='+', required=False, default=[], type=str, help='Filter tests matching these regex inputs') + parser.add_argument( + '--jobs', nargs='?', required=False, default=multiprocessing.cpu_count(), type=int, + help='Run tests in parallel, optionally providing max number of concurrent processes. Set 0 to run sequentially.') + args = parser.parse_args() + + args.modes = sorted(set(args.modes)) + args.shards = sorted(set(args.shards)) + + if not os.path.isabs(args.input_dirpath): + args.input_dirpath = os.getcwd() + args.input_dirpath = os.path.abspath(args.input_dirpath) + + if not os.path.isabs(args.output_dirpath): + args.output_dirpath = os.getcwd() + args.output_dirpath = os.path.abspath(args.output_dirpath) + + if args.filters: + filters = set() + for filter in args.filters: + if filter.startswith('@'): + with open(filter[1:], 'rt') as strm: + for line in strm: + line = line.strip() + if line: + filters.add(line) + else: + filters.add(filter) + args.filters = filters + + args.filters = [text if text.isalnum() else re.compile(text, re.IGNORECASE) for text in args.filters] + + if (args.jobs == None) or (args.jobs > multiprocessing.cpu_count()): + args.jobs = multiprocessing.cpu_count() + + + print( 'Environment:') + print(f' command-line: {" ".join(sys.argv)}') + print(f' current-dirpath: {os.getcwd()}') + print(f' build-no: {args.build_no}') + print(f'archive-filename-pattern: 
{args.archive_filename_pattern}') + print(f' shards: {args.shards}') + print(f' input-dirpath: {args.input_dirpath}') + print(f' output-dirpath: {args.output_dirpath}') + print(f' filters: {args.filters}') + print(f' jobs: {args.jobs}') + print( '\n\n') + + _mkdir(args.output_dirpath) + + params = [( + os.path.join(args.input_dirpath, args.archive_filename_pattern.format(args.build_no, shard)), + args.output_dirpath, + args.modes, args.filters + ) for shard in args.shards] + + jobs = min(args.jobs, len(params)) + + if jobs <= 1: + results = [_extract_worker(param) for param in params] + else: + with multiprocessing.Pool(processes=jobs) as pool: + results = pool.map(_extract_worker, params) + + def _test_failed(v: dict): + return sum(0 if s == Status.PASS else 1 for s in v.values()) != 0 + + results = {k: v for result in results for k, v in result.items()} + failures = {k: v for k, v in results.items() if _test_failed(v)} + + end_dt = datetime.now() + delta = round((end_dt - start_dt).total_seconds()) + + if failures: + print(f'Following {len(failures)} failed:') + pprint.pprint(sorted(failures.keys())) + + print('') + print(f'Completed CI artifact extraction @ {str(end_dt)} in {str(delta)} seconds') + + return -1 if failures else 0 + + +if __name__ == '__main__': + sys.exit(_main())