# Innovenergy_trunk/csharp/App/Backend/generate_alarm_translations.py

#!/usr/bin/env python3
"""
generate_alarm_translations.py

One-time script: reads AlarmKnowledgeBase.cs, calls Mistral API to translate
all alarm entries into German (de), French (fr), and Italian (it), and writes:

  Resources/AlarmTranslations.de.json   ← backend uses these at startup
  Resources/AlarmTranslations.fr.json
  Resources/AlarmTranslations.it.json
  Resources/AlarmNames.de.json          ← frontend lang file additions
  Resources/AlarmNames.fr.json
  Resources/AlarmNames.it.json

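Usage (run from the directory that contains Services/ and Resources/, because
both paths are resolved relative to the current working directory):
  export MISTRAL_API_KEY=your_key_here   # or put MISTRAL_API_KEY=... in a .env file next to this script
  python3 generate_alarm_translations.py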

Output files can be reviewed/edited before committing.
"""

import re
import json
import os
import sys
import time
from typing import Optional
import requests

# ── Config ─────────────────────────────────────────────────────────────────

# Input C# source and output directory. Both are relative paths, so they are
# resolved against the current working directory, not the script location.
KNOWLEDGE_BASE_FILE = "Services/AlarmKnowledgeBase.cs"
RESOURCES_DIR       = "Resources"
# Mistral chat-completions endpoint and the model used for every request.
MISTRAL_URL         = "https://api.mistral.ai/v1/chat/completions"
MISTRAL_MODEL       = "mistral-small-latest"
BATCH_SIZE          = 3    # alarms per API call — smaller = less chance of token truncation
RETRY_DELAY         = 5    # base delay in seconds: multiplied by the attempt number after HTTP 429, used as-is after other request errors
MAX_RETRIES         = 3    # total attempts per API call before giving up
REQUEST_TIMEOUT     = (10, 90)  # (connect_timeout, read_timeout) in seconds

# Target languages: code used in output file names → language name used in the prompt.
LANGUAGES = {
    "de": "German",
    "fr": "French",
    "it": "Italian",
}

# ── Parsing ─────────────────────────────────────────────────────────────────

def split_camel_case(name: str) -> str:
    """Insert a space at every CamelCase word boundary.

    Example: 'AbnormalGridVoltage' → 'Abnormal Grid Voltage'.

    A boundary is either a lowercase letter followed by an uppercase one, or
    an uppercase letter followed by an uppercase+lowercase pair (so a leading
    acronym stays together: 'PVOverVoltage' → 'PV Over Voltage').
    Leading and trailing whitespace is removed from the result.
    """
    word_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])')
    spaced = word_boundary.sub(' ', name)
    return spaced.strip()


# A regular (non-verbatim) C# string literal; group 1 is the raw, still-escaped text.
_CS_STRING_LITERAL = r'"((?:[^"\\]|\\.)*)"'

# `= new[] { ... }` initializer. String literals are consumed as whole units so
# that a '}' inside a string does not terminate the array body early.
_CS_ARRAY_INITIALIZER = r'\s*=\s*new\[\]\s*\{((?:[^"}]|"(?:[^"\\]|\\.)*")*)\}'

# Simple C# escape sequences that are decoded; anything else is left untouched.
_CS_SIMPLE_ESCAPES = {'"': '"', "\\": "\\", "'": "'", "n": "\n", "t": "\t", "r": "\r"}


def _unescape_csharp(raw: str) -> str:
    """Decode the simple backslash escapes of a C# string literal."""
    return re.sub(
        r'\\(.)',
        lambda m: _CS_SIMPLE_ESCAPES.get(m.group(1), m.group(0)),
        raw,
    )


def _extract_string_array(field: str, block: str) -> list:
    """Return the decoded string items of `<field> = new[] { ... }` in *block*, or []."""
    section = re.search(field + _CS_ARRAY_INITIALIZER, block)
    if section is None:
        return []
    return [_unescape_csharp(item) for item in re.findall(_CS_STRING_LITERAL, section.group(1))]


def parse_knowledge_base(filepath: str) -> dict:
    """
    Parses AlarmKnowledgeBase.cs and returns a dict:
      { "AlarmKey": { "Explanation": "...", "Causes": [...], "NextSteps": [...] } }

    Each `["Key"] = new()` declaration starts an entry that runs until the next
    declaration (or the end of the file). Within that span the first
    `Explanation = "..."` string and the `Causes` / `NextSteps` `new[] { ... }`
    string arrays are extracted. Entries where all three are empty are skipped.

    Simple C# escapes (\\", \\\\, \\', \\n, \\t, \\r) are decoded so the returned
    text is what the C# program would actually display. Verbatim strings
    (@"...") and concatenated strings ("a" + "b") are not handled.

    Raises OSError if the file cannot be opened.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    alarms = {}

    # Find positions of all alarm key declarations: ["Key"] = new()
    key_matches = list(re.finditer(r'\["(\w+)"\]\s*=\s*new\(\)', content))

    for i, key_match in enumerate(key_matches):
        key = key_match.group(1)
        # The entry's text runs up to the next key declaration so that a field
        # missing here can never be picked up from a following entry.
        end = key_matches[i + 1].start() if i + 1 < len(key_matches) else len(content)
        block = content[key_match.start():end]

        # Explanation (single string)
        exp_match = re.search(r'Explanation\s*=\s*' + _CS_STRING_LITERAL, block)
        explanation = _unescape_csharp(exp_match.group(1)) if exp_match else ""

        # Causes / NextSteps (string arrays)
        causes = _extract_string_array("Causes", block)
        next_steps = _extract_string_array("NextSteps", block)

        if explanation or causes or next_steps:
            alarms[key] = {
                "Explanation": explanation,
                "Causes": causes,
                "NextSteps": next_steps,
            }

    return alarms


# ── Mistral API ─────────────────────────────────────────────────────────────

def _strip_code_fences(text: str) -> str:
    """Remove a surrounding Markdown ``` fence (with optional language tag) from *text*."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    if "\n" in text:
        # Drop the whole opening fence line (e.g. "```json").
        text = text.split("\n", 1)[1]
    else:
        # Single-line reply such as "```json{...}```": drop the fence and tag only.
        text = re.sub(r'^```[\w-]*', '', text)
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def call_mistral(api_key: str, prompt: str) -> Optional[str]:
    """
    Send *prompt* as a single user message to the Mistral chat API.

    Makes up to MAX_RETRIES attempts. An HTTP 429 waits RETRY_DELAY * attempt
    seconds before the next attempt; any other request failure, an undecodable
    body, or a response without a text `choices[0].message.content` waits
    RETRY_DELAY seconds.

    Returns the reply text with any surrounding Markdown code fence removed,
    or None if every attempt failed.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    body = {
        "model": MISTRAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1400,  # ~3 alarms × ~450 tokens each (German is verbose)
        "temperature": 0.1,  # low for consistent translations
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = requests.post(MISTRAL_URL, headers=headers, json=body, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 429:
                # Linear backoff; report the delay that is actually slept.
                delay = RETRY_DELAY * attempt
                print(f"  Rate limited, waiting {delay}s (attempt {attempt}/{MAX_RETRIES})...")
                time.sleep(delay)
                continue
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"  HTTP error: {e} (attempt {attempt}/{MAX_RETRIES})")
            time.sleep(RETRY_DELAY)
            continue

        try:
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            content = None
        if not isinstance(content, str):
            # Treat a malformed payload like any other failed attempt instead
            # of letting a KeyError/AttributeError abort the whole run.
            print(f"  Unexpected response format (attempt {attempt}/{MAX_RETRIES})")
            time.sleep(RETRY_DELAY)
            continue

        return _strip_code_fences(content)

    return None


def translate_batch(api_key: str, batch: dict, language_name: str) -> Optional[dict]:
    """
    Translates a batch of alarms into the target language.

    *batch* maps alarm key → {"Explanation", "Causes", "NextSteps"} in English.
    The prompt asks the model for a JSON object with the same alarm keys, each
    holding a localized "Name" plus translated "Explanation", "Causes" and
    "NextSteps".

    Returns the parsed JSON object, or None if the API call failed, the reply
    was not valid JSON, or the reply was valid JSON but not an object.
    An empty batch returns {} without calling the API.
    """
    if not batch:
        return {}

    # Build the input JSON keyed by alarm key; the English display name is
    # included to help the model produce a good localized "Name".
    input_data = {
        key: {
            "EnglishName": split_camel_case(key),
            "Explanation": entry["Explanation"],
            "Causes": entry["Causes"],
            "NextSteps": entry["NextSteps"],
        }
        for key, entry in batch.items()
    }

    prompt = f"""You are translating battery energy storage system alarm descriptions into {language_name}.
Translate each alarm entry. The "Name" should be a short (2-5 word) localized display title for the alarm.
Keep technical terms accurate but use plain language a homeowner would understand.

Input JSON:
{json.dumps(input_data, ensure_ascii=False, indent=2)}

Return ONLY a valid JSON object with the same alarm keys. Each value must have exactly these fields:
{{
  "Name": "short {language_name} title",
  "Explanation": "translated explanation sentence",
  "Causes": ["translated cause 1", "translated cause 2"],
  "NextSteps": ["translated step 1", "translated step 2"]
}}

Reply with ONLY the JSON object, no markdown, no extra text."""

    raw = call_mistral(api_key, prompt)
    if raw is None:
        return None

    try:
        result = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"  JSON parse error: {e}")
        print(f"  Raw response (first 300 chars): {raw[:300]}")
        return None

    # Callers look alarm keys up in the result, so anything other than a JSON
    # object (e.g. a list or a bare string) is a failed translation.
    if not isinstance(result, dict):
        print(f"  JSON parse error: expected an object, got {type(result).__name__}")
        print(f"  Raw response (first 300 chars): {raw[:300]}")
        return None

    return result


# ── Main ────────────────────────────────────────────────────────────────────

def load_env_file(env_path: str) -> dict:
    """Parse a simple KEY=VALUE .env file.

    Blank lines, lines starting with '#', and lines without '=' are ignored.
    A leading `export ` is dropped, whitespace around key and value is
    stripped, and one pair of matching single or double quotes around the
    value is removed. Inline comments are not recognised, so a '#' inside a
    value is kept.

    Returns a dict of the parsed variables; a missing file yields {}.
    """
    env = {}
    try:
        with open(env_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                # Allow shell-style `export KEY=VALUE` lines.
                if line.startswith("export "):
                    line = line[len("export "):]
                key, _, value = line.partition("=")
                value = value.strip()
                # KEY="value" / KEY='value': the quotes are syntax, not part of the value.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                    value = value[1:-1]
                env[key.strip()] = value
    except FileNotFoundError:
        pass
    return env


def main():
    """
    Generate the per-language alarm translation files.

    Steps:
      1. Load MISTRAL_API_KEY from the environment, falling back to a .env
         file located next to this script; exit with status 1 if absent.
      2. Parse KNOWLEDGE_BASE_FILE; exit with status 1 if it cannot be read or
         yields no alarms.
      3. For each language in LANGUAGES, translate the alarms in batches of
         BATCH_SIZE and write AlarmTranslations.<code>.json and
         AlarmNames.<code>.json into RESOURCES_DIR.

    Keys whose batch failed, that are missing from the model reply, or whose
    reply entry is not a JSON object are left out of the output files and
    listed as failed. The files are written regardless and the script still
    ends normally in that case.
    """
    # Try environment variable first, then .env file in the same directory
    api_key = os.environ.get("MISTRAL_API_KEY", "").strip()
    if not api_key:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        env_vars = load_env_file(os.path.join(script_dir, ".env"))
        api_key = env_vars.get("MISTRAL_API_KEY", "").strip()

    if not api_key:
        print("ERROR: MISTRAL_API_KEY not found in environment or .env file.")
        sys.exit(1)

    print("MISTRAL_API_KEY loaded.")

    # Parse knowledge base
    print(f"Parsing {KNOWLEDGE_BASE_FILE}...")
    try:
        alarms = parse_knowledge_base(KNOWLEDGE_BASE_FILE)
    except OSError as e:
        # Typically a wrong working directory: the path is relative.
        print(f"ERROR: Cannot read {KNOWLEDGE_BASE_FILE}: {e}")
        sys.exit(1)
    print(f"  Found {len(alarms)} alarm entries.")

    if not alarms:
        print("ERROR: No alarms parsed. Check the file path and format.")
        sys.exit(1)

    alarm_keys = list(alarms.keys())
    os.makedirs(RESOURCES_DIR, exist_ok=True)

    # Split into batches once; the split is identical for every language.
    batches = [
        {k: alarms[k] for k in alarm_keys[i:i + BATCH_SIZE]}
        for i in range(0, len(alarm_keys), BATCH_SIZE)
    ]

    # Process each language
    for lang_code, lang_name in LANGUAGES.items():
        print(f"\n── Translating to {lang_name} ({lang_code}) ──")

        translations     = {}  # key → {Explanation, Causes, NextSteps}
        alarm_name_keys  = {}  # "alarm_Key" → translated name (for lang JSON files)
        failed_keys      = []

        for batch_num, batch in enumerate(batches, 1):
            keys_in_batch = list(batch.keys())
            print(f"  Batch {batch_num}/{len(batches)}: {', '.join(keys_in_batch)}")

            result = translate_batch(api_key, batch, lang_name)

            if result is None:
                print(f"  FAILED batch {batch_num} — will mark keys as failed")
                failed_keys.extend(keys_in_batch)
            else:
                for key in keys_in_batch:
                    entry = result.get(key)
                    if key not in result:
                        print(f"  WARNING: key '{key}' missing from batch result")
                        failed_keys.append(key)
                        continue
                    if not isinstance(entry, dict):
                        # The model returned something other than an object
                        # for this key; it cannot be used as a translation.
                        print(f"  WARNING: key '{key}' has a malformed entry in batch result")
                        failed_keys.append(key)
                        continue

                    translations[key] = {
                        "Explanation": entry.get("Explanation", ""),
                        "Causes":      entry.get("Causes", []),
                        "NextSteps":   entry.get("NextSteps", []),
                    }
                    # Fall back to the English name when "Name" is missing,
                    # empty, or not a string, so the UI never shows a blank title.
                    name = entry.get("Name")
                    if not isinstance(name, str) or not name.strip():
                        name = split_camel_case(key)
                    alarm_name_keys[f"alarm_{key}"] = name

            # Small pause between batches to avoid rate limits
            if batch_num < len(batches):
                time.sleep(1)

        # Write backend translation file
        backend_file = os.path.join(RESOURCES_DIR, f"AlarmTranslations.{lang_code}.json")
        with open(backend_file, "w", encoding="utf-8") as f:
            json.dump(translations, f, ensure_ascii=False, indent=2)
        print(f"  Wrote {len(translations)} entries → {backend_file}")

        # Write frontend alarm name file (to be merged into lang JSON)
        names_file = os.path.join(RESOURCES_DIR, f"AlarmNames.{lang_code}.json")
        with open(names_file, "w", encoding="utf-8") as f:
            json.dump(alarm_name_keys, f, ensure_ascii=False, indent=2)
        print(f"  Wrote {len(alarm_name_keys)} name keys → {names_file}")

        if failed_keys:
            print(f"  FAILED keys ({len(failed_keys)}): {failed_keys}")

    print("\n✓ Done. Review the output files in Resources/ before committing.")
    print("  Next: merge AlarmNames.*.json entries into src/lang/de.json, fr.json, it.json")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()