From 1638b171fb34089f9f657f4848228cf8786b34e9 Mon Sep 17 00:00:00 2001 From: Fede Tux Date: Tue, 6 Feb 2024 16:40:01 -0300 Subject: [PATCH] Add comments on handling files and streams with pyarrow for future reference --- integrations/stdin_to_securitylake.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/integrations/stdin_to_securitylake.py b/integrations/stdin_to_securitylake.py index 034b729c1208d..1604bc2ed9ebc 100755 --- a/integrations/stdin_to_securitylake.py +++ b/integrations/stdin_to_securitylake.py @@ -15,13 +15,14 @@ def map_to_ocsf(): ## Code that translates fields to OCSF def encode_parquet(list): + ### We can write directly to S3 from pyarrow: + ### https://arrow.apache.org/docs/python/filesystems.html#s3 + ### + ### Credentials can be stored in /root/.aws/credentials + ### https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html table = Table.from_pylist(list) parquet.write_table(table, '/tmp/{}.parquet'.format(clockstr)) -def push_to_s3(parquet): - ## Fill with AWS S3 code - pass - def read_block(fileobject,length): output=[] for i in range(0,length): @@ -62,12 +63,18 @@ def parse_arguments(): try: while True: + ### We can possibly replace all the custom code here + ### and just use Arrow's built-in input and output facilities: + ### * https://arrow.apache.org/docs/python/memory.html#input-and-output + ### * https://arrow.apache.org/docs/python/ipc.html#reading-from-stream-and-file-format-for-pandas + ### * https://stackoverflow.com/questions/52945609/pandas-dataframe-to-parquet-buffer-in-memory + current_block = read_block(stdin,args.linebuffer) if current_block[-1] == block_ending : output_buffer += current_block[0:current_block.index(block_ending)] time.sleep(args.sleeptime) if len(output_buffer) > args.maxlength or get_elapsedseconds(starttimestamp) > args.pushinterval: - push_to_s3(encode_parquet(output_buffer)) + encode_parquet(output_buffer) logging.debug(json.dumps(output_buffer)) starttimestamp = datetime.now(tz='UTC') output_buffer = []