updated documentation for S3 Extracting Tool
This commit is contained in:
parent 203908152c
commit 57b7a71984
@@ -77,14 +77,18 @@ The script dynamically generates headers for the output CSV file based on the ke
 extracted data, providing a clear and understandable format for subsequent analysis. The headers correspond to the keys used for data extraction, making
 it easy to identify and analyze the extracted data.
 
 4)Advanced Data Processing Capabilities:
 
-Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
+i) Booleans as Numbers: The --booleans_as_numbers flag allows users to convert boolean values (True/False) into numeric representations (1/0). This feature
 is particularly useful for analytical tasks that require numerical data processing.
 
+ii) Sampling Stepsize: The --sampling_stepsize parameter enables users to define the granularity of the time range for data extraction. By specifying the number
+of 1 minute intervals, users can adjust the sampling interval, allowing for flexible data retrieval based on time.
+
 Example Command:
 
-python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax
+python3 extractS3data.py 1749062721 1749106001 --keys AcDc/SystemControl/ResetAlarmsAndWarnings,AcDc/Devices/1/Status/Ac/L1/Voltage --bucket-number 12 --product_name=SodistoreMax --sampling_stepsize 2 --booleans_as_numbers
 
 This command extracts data for AcDc/SystemControl/ResetAlarmsAndWarnings and AcDc/Devices/1/Status/Ac/L1/Voltage keys from bucket number 12, between the specified timestamps, with boolean values converted to numbers.
+The script will fetch data in 2-minute intervals.
 
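The --booleans_as_numbers behaviour described above can be pictured with a minimal sketch, assuming values arrive as Python bools after JSON parsing; the helper name to_csv_value is illustrative and not a function of extractS3data.py:

def to_csv_value(value, booleans_as_numbers=False):
    # With the flag set, map True -> 1 and False -> 0; pass everything else through.
    if booleans_as_numbers and isinstance(value, bool):
        return int(value)
    return value

row = [to_csv_value(v, booleans_as_numbers=True) for v in [True, 230.1, False]]
print(row)  # [1, 230.1, 0]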
@@ -18,14 +18,15 @@ def extract_timestamp(filename):
     except ValueError:
         return 0
 
+import subprocess
 
-def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number):
-    if product_type == "Salimax" or product_type=="SodistoreMax":
+def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize, product_type, bucket_number):
+    if product_type in ["Salimax", "SodistoreMax"]:
         hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
     elif product_type == "Salidomo":
         hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
     else:
-        raise ValueError("Invalid product type option. Use Salimax or Salidomo or SodistoreMax")
+        raise ValueError("Invalid product type option.")
 
     # Find common prefix
     common_prefix = ""
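The if/elif chain above maps a product name to its S3 bucket hash. A dict lookup is a common alternative; this sketch reuses the hashes from the diff and is an illustration, not the commit's code:

PRODUCT_HASHES = {
    "Salimax": "3e5b3069-214a-43ee-8d85-57d72000c19d",
    "SodistoreMax": "3e5b3069-214a-43ee-8d85-57d72000c19d",
    "Salidomo": "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e",
}

def bucket_hash(product_type):
    # Unknown product names raise a ValueError, like the original chain.
    try:
        return PRODUCT_HASHES[product_type]
    except KeyError:
        raise ValueError("Invalid product type option. Use Salimax, Salidomo or SodistoreMax")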
@@ -43,20 +44,31 @@ def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,produc
         output = subprocess.check_output(s3cmd_command, shell=True, text=True)
         files = [line.split()[-1] for line in output.strip().split("\n") if line.strip()]
         filenames = []
+        count=0
 
         for f in files:
-            name = f.split("/")[-1] # e.g., 1748802020.json
-            timestamp_str = name.split(".")[0] # extract '1748802020'
-            if timestamp_str.isdigit() and int(timestamp_str) <= int(end_timestamp):
-                filenames.append(name)
-            else:
-                break
+            name = f.split("/")[-1]
+            timestamp_str = name.split(".")[0]
+            if timestamp_str.isdigit():
+                timestamp = int(timestamp_str)
+
+                if start_timestamp <= timestamp <= end_timestamp :
+                    if count % sampling_stepsize == 0:
+                        filenames.append(name)
+                    count += 1
 
         print(filenames)
         return filenames
 
     except subprocess.CalledProcessError:
         print(f"No files found for prefix {common_prefix}")
         return []
 
 
 def get_nested_value(data, key_path):
     try:
         for key in key_path:
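The new loop keeps every sampling_stepsize-th file whose epoch timestamp falls inside [start_timestamp, end_timestamp]. Isolated as a standalone sketch (the filenames below are made up for illustration):

def sample_filenames(files, start_timestamp, end_timestamp, sampling_stepsize):
    filenames, count = [], 0
    for f in files:
        name = f.split("/")[-1]             # ".../1748802020.json" -> "1748802020.json"
        timestamp_str = name.split(".")[0]  # "1748802020"
        if timestamp_str.isdigit():
            timestamp = int(timestamp_str)
            if start_timestamp <= timestamp <= end_timestamp:
                if count % sampling_stepsize == 0:
                    filenames.append(name)  # keep every Nth in-range file
                count += 1
    return filenames

files = [f"bucket/{t}.json" for t in range(1749062721, 1749063021, 60)]
print(sample_filenames(files, 1749062721, 1749063000, 2))  # every 2nd one-minute file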
@@ -151,7 +163,7 @@ def download_files(bucket_number, filenames_to_download, product_type):
                 print(f"Files with prefix '{filename}' downloaded successfully.")
                 decompress_file(os.path.join(output_directory, filename), output_directory)
             except subprocess.CalledProcessError as e:
-                # print(f"Error downloading files: {e}")
+                print(f"Error downloading files: {e}")
                 continue
         else:
             print(f"File '{filename}.json' already exists locally. Skipping download.")
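Un-commenting the print makes download failures visible while the loop still moves on to the next file. The pattern, reduced to a hedged sketch (the s3cmd invocation is illustrative, not the script's exact command):

import subprocess

for filename in ["1749062721", "1749062781"]:
    try:
        subprocess.run(["s3cmd", "get", f"s3://bucket/{filename}.json"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading files: {e}")  # report the failure...
        continue                                # ...but keep processing remaining files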
@@ -187,7 +199,7 @@ def get_last_component(path):
     path_without_slashes = path.replace('/', '')
     return path_without_slashes
 
-def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type):
+def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type):
     output_directory = f"S3cmdData_{bucket_number}"
 
     #if os.path.exists(output_directory):
@@ -200,7 +212,7 @@ def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sa
     filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize,product_type,bucket_number)
     existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.json"))]
     files_to_download = set(filenames_to_check) - set(existing_files)
-    print(files_to_download)
+    #print(files_to_download)
 
     #if os.listdir(output_directory):
     #    print("Files already exist in the local folder. Skipping download.")
@@ -231,9 +243,8 @@ def main():
     parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
     parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys')
     parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
-    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval')
+    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 1 minute intervals, which define the length of the sampling interval in S3 file retrieval')
     parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]')
-    parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text')
     parser.add_argument('--product_name', required=True, help='Use Salimax, Salidomo or SodistoreMax')
 
     args = parser.parse_args()
@@ -243,14 +254,13 @@ def main():
     bucket_number = args.bucket_number
     sampling_stepsize = args.sampling_stepsize
     booleans_as_numbers = args.booleans_as_numbers
-    exact_match = args.exact_match
     # new arg for product type
     product_type = args.product_name
 
     if start_timestamp >= end_timestamp:
         print("Error: start_timestamp must be smaller than end_timestamp.")
         return
-    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)
+    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, product_type)
 
 if __name__ == "__main__":
     main()