updated documentation for S3 Extracting Tool

This commit is contained in:
Noe 2025-09-01 13:07:52 +02:00
parent 203908152c
commit 57b7a71984
2 changed files with 32 additions and 18 deletions

View File

@ -77,14 +77,18 @@ The script dynamically generates headers for the output CSV file based on the ke
extracted data, providing a clear and understandable format for subsequent analysis. The headers correspond to the keys used for data extraction, making
it easy to identify and analyze the extracted data.
4) Advanced Data Processing Capabilities:
Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
i) Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
is particularly useful for analytical tasks that require numerical data processing.
ii) Sampling Stepsize: The --sampling_stepsize parameter lets users define the granularity of data extraction within the requested time range. By specifying the number
of 1-minute intervals, users can adjust the sampling interval, allowing for flexible data retrieval based on time.
Example Command:
python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax
python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax --sampling_stepsize 2 --booleans_as_numbers
This command extracts data for AcDc/SystemControl/ResetAlarmsAndWarnings and AcDc/Devices/1/Status/Ac/L1/Voltage keys from bucket number 12, between the specified timestamps, with boolean values converted to numbers.
Because --sampling_stepsize is set to 2, the script will fetch data at 2-minute intervals.

View File

@ -18,14 +18,15 @@ def extract_timestamp(filename):
except ValueError:
return 0
import subprocess
def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number):
if product_type == "Salimax" or product_type=="SodistoreMax":
def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize, product_type, bucket_number):
if product_type in ["Salimax", "SodistoreMax"]:
hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
elif product_type == "Salidomo":
hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
else:
raise ValueError("Invalid product type option. Use Salimax or Salidomo or SodistoreMax")
raise ValueError("Invalid product type option.")
# Find common prefix
common_prefix = ""
@ -43,20 +44,31 @@ def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,produc
output = subprocess.check_output(s3cmd_command, shell=True, text=True)
files = [line.split()[-1] for line in output.strip().split("\n") if line.strip()]
filenames = []
count=0
for f in files:
name = f.split("/")[-1] # e.g., 1748802020.json
timestamp_str = name.split(".")[0] # extract '1748802020'
if timestamp_str.isdigit() and int(timestamp_str) <= int(end_timestamp):
name = f.split("/")[-1]
timestamp_str = name.split(".")[0]
if timestamp_str.isdigit():
timestamp = int(timestamp_str)
if start_timestamp <= timestamp <= end_timestamp :
if count % sampling_stepsize == 0:
filenames.append(name)
else:
break
count += 1
print(filenames)
return filenames
except subprocess.CalledProcessError:
print(f"No files found for prefix {common_prefix}")
return []
def get_nested_value(data, key_path):
try:
for key in key_path:
@ -151,7 +163,7 @@ def download_files(bucket_number, filenames_to_download, product_type):
print(f"Files with prefix '{filename}' downloaded successfully.")
decompress_file(os.path.join(output_directory, filename), output_directory)
except subprocess.CalledProcessError as e:
# print(f"Error downloading files: {e}")
print(f"Error downloading files: {e}")
continue
else:
print(f"File '{filename}.json' already exists locally. Skipping download.")
@ -187,7 +199,7 @@ def get_last_component(path):
path_without_slashes = path.replace('/', '')
return path_without_slashes
def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type):
def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type):
output_directory = f"S3cmdData_{bucket_number}"
#if os.path.exists(output_directory):
@ -200,7 +212,7 @@ def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sa
filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number)
existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.json"))]
files_to_download = set(filenames_to_check) - set(existing_files)
print(files_to_download)
#print(files_to_download)
#if os.listdir(output_directory):
# print("Files already exist in the local folder. Skipping download.")
@ -231,9 +243,8 @@ def main():
parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys')
parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval')
parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 1 minute intervals, which define the length of the sampling interval in S3 file retrieval')
parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]')
parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text')
parser.add_argument('--product_name', required=True, help='Use Salimax, Salidomo or SodistoreMax')
args = parser.parse_args()
@ -243,14 +254,13 @@ def main():
bucket_number = args.bucket_number
sampling_stepsize = args.sampling_stepsize
booleans_as_numbers = args.booleans_as_numbers
exact_match = args.exact_match
# new arg for product type
product_type = args.product_name
if start_timestamp >= end_timestamp:
print("Error: start_timestamp must be smaller than end_timestamp.")
return
download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)
download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type)
if __name__ == "__main__":
main()