updated documentation for S3 Extracting Tool
This commit is contained in:
parent 203908152c
commit 57b7a71984
@@ -77,14 +77,18 @@ The script dynamically generates headers for the output CSV file based on the ke
 extracted data, providing a clear and understandable format for subsequent analysis. The headers correspond to the keys used for data extraction, making
 it easy to identify and analyze the extracted data.
 
 4)Advanced Data Processing Capabilities:
 
-Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
+i) Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
 is particularly useful for analytical tasks that require numerical data processing.
 
+ii) Sampling Stepsize: The --sampling_stepsize parameter enables users to define the granularity of the time range for data extraction. By specifying the number
+of 1 minute intervals, users can adjust the sampling interval, allowing for flexible data retrieval based on time.
+
 Example Command:
 
-python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax
+python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax --sampling_stepsize 2 --booleans_as_numbers
 
 This command extracts data for AcDc/SystemControl/ResetAlarmsAndWarnings and AcDc/Devices/1/Status/Ac/L1/Voltage keys from bucket number 12, between the specified timestamps, with boolean values converted to numbers.
+The script will fetch data in 2-minute intervals.
 
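The --booleans_as_numbers behaviour described above can be pictured with a minimal sketch, assuming values arrive as Python bools after JSON parsing; the helper name to_csv_value is illustrative and not a function of extractS3data.py:

def to_csv_value(value, booleans_as_numbers=False):
    # With the flag set, map True -> 1 and False -> 0; pass everything else through.
    if booleans_as_numbers and isinstance(value, bool):
        return int(value)
    return value

row = [to_csv_value(v, booleans_as_numbers=True) for v in [True, 230.1, False]]
print(row)  # [1, 230.1, 0]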
@@ -18,14 +18,15 @@ def extract_timestamp(filename):
     except ValueError:
         return 0
 
+import subprocess
 
-def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number):
-    if product_type == "Salimax" or product_type=="SodistoreMax":
+def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize, product_type, bucket_number):
+    if product_type in ["Salimax", "SodistoreMax"]:
         hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
     elif product_type == "Salidomo":
         hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
     else:
-        raise ValueError("Invalid product type option. Use Salimax or Salidomo or SodistoreMax")
+        raise ValueError("Invalid product type option.")
 
     # Find common prefix
     common_prefix = ""
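The if/elif chain above maps a product name to its S3 bucket hash. A dict lookup is a common alternative; this sketch reuses the hashes from the diff and is an illustration, not the commit's code:

PRODUCT_HASHES = {
    "Salimax": "3e5b3069-214a-43ee-8d85-57d72000c19d",
    "SodistoreMax": "3e5b3069-214a-43ee-8d85-57d72000c19d",
    "Salidomo": "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e",
}

def bucket_hash(product_type):
    # Unknown product names raise a ValueError, like the original chain.
    try:
        return PRODUCT_HASHES[product_type]
    except KeyError:
        raise ValueError("Invalid product type option. Use Salimax, Salidomo or SodistoreMax")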
@@ -43,20 +44,31 @@ def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,produc
         output = subprocess.check_output(s3cmd_command, shell=True, text=True)
         files = [line.split()[-1] for line in output.strip().split("\n") if line.strip()]
         filenames = []
+        count=0
 
         for f in files:
-            name = f.split("/")[-1] # e.g., 1748802020.json
-            timestamp_str = name.split(".")[0] # extract '1748802020'
-            if timestamp_str.isdigit() and int(timestamp_str) <= int(end_timestamp):
-                filenames.append(name)
-            else:
-                break
+            name = f.split("/")[-1]
+            timestamp_str = name.split(".")[0]
+            if timestamp_str.isdigit():
+                timestamp = int(timestamp_str)
+
+                if start_timestamp <= timestamp <= end_timestamp :
+                    if count % sampling_stepsize == 0:
+                        filenames.append(name)
+                    count += 1
 
         print(filenames)
         return filenames
 
     except subprocess.CalledProcessError:
         print(f"No files found for prefix {common_prefix}")
         return []
 
 
 def get_nested_value(data, key_path):
     try:
         for key in key_path:
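The new loop keeps every sampling_stepsize-th file whose epoch timestamp falls inside [start_timestamp, end_timestamp]. Isolated as a standalone sketch (the filenames below are made up for illustration):

def sample_filenames(files, start_timestamp, end_timestamp, sampling_stepsize):
    filenames, count = [], 0
    for f in files:
        name = f.split("/")[-1]             # ".../1748802020.json" -> "1748802020.json"
        timestamp_str = name.split(".")[0]  # "1748802020"
        if timestamp_str.isdigit():
            timestamp = int(timestamp_str)
            if start_timestamp <= timestamp <= end_timestamp:
                if count % sampling_stepsize == 0:
                    filenames.append(name)  # keep every Nth in-range file
                count += 1
    return filenames

files = [f"bucket/{t}.json" for t in range(1749062721, 1749063021, 60)]
print(sample_filenames(files, 1749062721, 1749063000, 2))  # every 2nd one-minute file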
@@ -151,7 +163,7 @@ def download_files(bucket_number, filenames_to_download, product_type):
                 print(f"Files with prefix '{filename}' downloaded successfully.")
                 decompress_file(os.path.join(output_directory, filename), output_directory)
             except subprocess.CalledProcessError as e:
-                # print(f"Error downloading files: {e}")
+                print(f"Error downloading files: {e}")
                 continue
         else:
             print(f"File '{filename}.json' already exists locally. Skipping download.")
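Un-commenting the print makes download failures visible while the loop still moves on to the next file. The pattern, reduced to a hedged sketch (the s3cmd invocation is illustrative, not the script's exact command):

import subprocess

for filename in ["1749062721", "1749062781"]:
    try:
        subprocess.run(["s3cmd", "get", f"s3://bucket/{filename}.json"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading files: {e}")  # report the failure...
        continue                                # ...but keep processing remaining files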
@@ -187,7 +199,7 @@ def get_last_component(path):
     path_without_slashes = path.replace('/', '')
     return path_without_slashes
 
-def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type):
+def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type):
     output_directory = f"S3cmdData_{bucket_number}"
 
     #if os.path.exists(output_directory):
@@ -200,7 +212,7 @@ def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sa
     filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number)
     existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.json"))]
     files_to_download = set(filenames_to_check) - set(existing_files)
-    print(files_to_download)
+    #print(files_to_download)
 
     #if os.listdir(output_directory):
     #    print("Files already exist in the local folder. Skipping download.")
@@ -231,9 +243,8 @@ def main():
     parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
     parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys')
     parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
-    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval')
+    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 1 minute intervals, which define the length of the sampling interval in S3 file retrieval')
     parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]')
-    parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text')
     parser.add_argument('--product_name', required=True, help='Use Salimax, Salidomo or SodistoreMax')
 
     args = parser.parse_args()
@@ -243,14 +254,13 @@ def main():
     bucket_number = args.bucket_number
     sampling_stepsize = args.sampling_stepsize
     booleans_as_numbers = args.booleans_as_numbers
-    exact_match = args.exact_match
     # new arg for product type
     product_type = args.product_name
 
     if start_timestamp >= end_timestamp:
         print("Error: start_timestamp must be smaller than end_timestamp.")
         return
-    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)
+    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type)
 
 if __name__ == "__main__":
     main()