diff --git a/S3ExtractingTool/ExtractS3README.txt b/S3ExtractingTool/ExtractS3README.txt
index c4cd1cb04..18e0e7119 100644
--- a/S3ExtractingTool/ExtractS3README.txt
+++ b/S3ExtractingTool/ExtractS3README.txt
@@ -77,14 +77,18 @@ The script dynamically generates headers for the output CSV file based on the ke
 extracted data, providing a clear and understandable format for subsequent analysis. The headers
 correspond to the keys used for data extraction, making it easy to identify and analyze the
 extracted data.
+
 4)Advanced Data Processing Capabilities:
-Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
+i) Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
 is particularly useful for analytical tasks that require numerical data processing.
+ii) Sampling Stepsize: The --sampling_stepsize parameter enables users to define the granularity of the time range for data extraction. By specifying the number
+of one-minute intervals, users can adjust the sampling interval, allowing for flexible, time-based data retrieval.
 
 Example Command:
-python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax
+python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax --sampling_stepsize 2 --booleans_as_numbers
 
 This command extracts data for the AcDc/SystemControl/ResetAlarmsAndWarnings and AcDc/Devices/1/Status/Ac/L1/Voltage keys from bucket number 12, between the specified timestamps, with boolean values converted to numbers.
+The script will fetch data at two-minute intervals.
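Note for reviewers: below is a minimal sketch, not part of the patch, of the sampling semantics the README describes. It assumes one <epoch>.json file per minute; select_samples is a hypothetical stand-in for the filtering this patch adds to list_files_in_range in the diff that follows. (--booleans_as_numbers uses the standard Python mapping: int(True) == 1, int(False) == 0.)

    # Sketch only: keep every `stepsize`-th file inside [start, end],
    # mirroring the modulo-based filtering introduced by this patch.
    def select_samples(timestamps, start, end, stepsize):
        selected, count = [], 0
        for ts in sorted(timestamps):
            if start <= ts <= end:
                if count % stepsize == 0:
                    selected.append(ts)
                count += 1   # advances only for in-range files
        return selected

    # Six one-minute files with stepsize 2 -> every second file is kept.
    files = [1749062721 + 60 * i for i in range(6)]
    print(select_samples(files, 1749062721, 1749063081, 2))
    # [1749062721, 1749062841, 1749062961]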
diff --git a/S3ExtractingTool/extractS3data.py b/S3ExtractingTool/extractS3data.py
index a40765351..167540951 100644
--- a/S3ExtractingTool/extractS3data.py
+++ b/S3ExtractingTool/extractS3data.py
@@ -18,14 +18,15 @@ def extract_timestamp(filename):
     except ValueError:
         return 0
 
+import subprocess
 
-def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number):
-    if product_type == "Salimax" or product_type=="SodistoreMax":
+def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize, product_type, bucket_number):
+    if product_type in ["Salimax", "SodistoreMax"]:
         hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
     elif product_type == "Salidomo":
         hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
     else:
-        raise ValueError("Invalid product type option. \nUse Salimax or Salidomo or SodistoreMax")
+        raise ValueError("Invalid product type option. Use Salimax, Salidomo or SodistoreMax.")
 
     # Find common prefix
     common_prefix = ""
@@ -43,20 +44,31 @@ def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,produc
         output = subprocess.check_output(s3cmd_command, shell=True, text=True)
         files = [line.split()[-1] for line in output.strip().split("\n") if line.strip()]
         filenames = []
+        count = 0
+
         for f in files:
-            name = f.split("/")[-1]  # e.g., 1748802020.json
-            timestamp_str = name.split(".")[0]  # extract '1748802020'
-            if timestamp_str.isdigit() and int(timestamp_str) <= int(end_timestamp):
-                filenames.append(name)
-            else:
-                break
+            name = f.split("/")[-1]             # e.g., 1748802020.json
+            timestamp_str = name.split(".")[0]  # extract '1748802020'
+
+            if timestamp_str.isdigit():
+                timestamp = int(timestamp_str)
+
+                # Keep every sampling_stepsize-th file inside the requested
+                # range; count only advances for in-range files.
+                if start_timestamp <= timestamp <= end_timestamp:
+                    if count % sampling_stepsize == 0:
+                        filenames.append(name)
+                    count += 1
+
+        print(filenames)
         return filenames
+
     except subprocess.CalledProcessError:
         print(f"No files found for prefix {common_prefix}")
         return []
 
+
 def get_nested_value(data, key_path):
     try:
         for key in key_path:
@@ -151,7 +163,7 @@ def download_files(bucket_number, filenames_to_download, product_type):
             print(f"Files with prefix '{filename}' downloaded successfully.")
             decompress_file(os.path.join(output_directory, filename), output_directory)
         except subprocess.CalledProcessError as e:
-            # print(f"Error downloading files: {e}")
+            print(f"Error downloading files: {e}")
             continue
     else:
         print(f"File '{filename}.json' already exists locally. Skipping download.")
@@ -187,7 +199,7 @@ def get_last_component(path):
     path_without_slashes = path.replace('/', '')
     return path_without_slashes
 
-def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type):
+def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type):
     output_directory = f"S3cmdData_{bucket_number}"
 
     #if os.path.exists(output_directory):
@@ -200,7 +212,7 @@ def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sa
     filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number)
     existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.json"))]
     files_to_download = set(filenames_to_check) - set(existing_files)
-    print(files_to_download)
+    #print(files_to_download)
 
     #if os.listdir(output_directory):
     #    print("Files already exist in the local folder. Skipping download.")
@@ -231,9 +243,8 @@ def main():
     parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
     parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys')
     parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
-    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval')
+    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of one-minute intervals that defines the length of the sampling interval in S3 file retrieval')
     parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]')
-    parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text')
     parser.add_argument('--product_name', required=True, help='Use Salimax, Salidomo or SodistoreMax')
 
     args = parser.parse_args()
@@ -243,14 +254,13 @@ def main():
     bucket_number = args.bucket_number
     sampling_stepsize = args.sampling_stepsize
     booleans_as_numbers = args.booleans_as_numbers
-    exact_match = args.exact_match
 
     # new arg for product type
     product_type = args.product_name
 
     if start_timestamp >= end_timestamp:
         print("Error: start_timestamp must be smaller than end_timestamp.")
         return
 
-    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)
+    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type)
 
 if __name__ == "__main__":
     main()
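A quick way to exercise the new sampling end to end is a call like the one below. This is a sketch, not part of the patch; it assumes extractS3data.py is importable from the working directory and that s3cmd is installed, configured, and able to reach the bucket. The parameter names come from the signature introduced above; the patched function also prints the selected filenames itself.

    from extractS3data import list_files_in_range

    # Timestamps, bucket, and product taken from the README example command.
    names = list_files_in_range(
        start_timestamp=1749062721,
        end_timestamp=1749106001,
        sampling_stepsize=2,          # keep every second one-minute file
        product_type="SodistoreMax",
        bucket_number=12,
    )
    print(len(names), "files selected")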