from flask import Flask, request, jsonify
import os
import streamlit as st
from langchain.chains import create_sql_query_chain
from langchain_google_genai import GoogleGenerativeAI
from sqlalchemy import create_engine
from sqlalchemy.exc import ProgrammingError
from langchain_community.utilities import SQLDatabase
import google.generativeai as genai
import pymysql
import pandas as pd
import numpy as np
import random
from io import StringIO
import json
import re
from pathlib import Path
import mimetypes
from flask import request

# Pull settings from a local .env file into the process environment
from dotenv import load_dotenv
load_dotenv()


# Read the Gemini API key from the environment once, then hand it to the SDK
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Generation settings handed to the Gemini model as its generation_config
MODEL_CONFIG = {
    "temperature": 0.2,
    "top_p": 1,
    "top_k": 32,
    "max_output_tokens": 4096,
}

## Safety settings of the model: every harm category listed here gets the same threshold
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Gemini model used by gemini_output(), built from MODEL_CONFIG and safety_settings
model = genai.GenerativeModel(
    model_name="gemini-2.5-flash-lite",
    generation_config=MODEL_CONFIG,
    safety_settings=safety_settings,
)


# Flask application object that the route decorators in this file register against
app = Flask(__name__)

# POST is accepted in addition to the original GET because the handler reads a
# JSON request body, which many HTTP clients cannot attach to a GET request.
@app.route('/process_data', methods=['GET', 'POST'])
def process_data():
    """Double the 'input_value' field of the JSON request body.

    Expects a JSON object such as ``{"input_value": 21}``. The value is
    multiplied by 2 with Python's ``*`` operator, so numbers are doubled and
    strings/lists are repeated.

    Returns:
        A JSON response ``{"status": "success", "result": <message>}`` on
        success, or ``{"status": "error", "message": <reason>}`` with HTTP 400
        when the body is missing, is not a JSON object, lacks 'input_value',
        or holds a value that cannot be doubled.
    """
    # silent=True yields None instead of aborting the request when the body is
    # absent, malformed, or not sent with a JSON content type.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or data.get('input_value') is None:
        return jsonify({
            "status": "error",
            "message": "Request body must be a JSON object with an 'input_value' field",
        }), 400

    input_value = data['input_value']
    try:
        doubled = input_value * 2
    except TypeError:
        # e.g. a nested JSON object, which does not support multiplication
        return jsonify({
            "status": "error",
            "message": "'input_value' must be a number, string, or list",
        }), 400

    result = f"Python processed: {input_value} and doubled it to {doubled}"

    return jsonify({"status": "success", "result": result})

@app.route('/home', methods=['GET'])
def home():
    """Reply to GET /home with a fixed JSON success payload."""
    payload = {"status": "success"}
    return jsonify(payload)

def _json_response(payload, status=200):
    """Serialize *payload* with json.dumps and wrap it in an application/json response."""
    return app.response_class(
        response=json.dumps(payload),
        status=status,
        mimetype="application/json",
    )


def _parse_model_json(raw_output):
    """Convert the model's raw text into the payload that read_pdf returns.

    The content of a markdown code fence is parsed when one is found;
    otherwise the whole text is parsed. On a parse failure an error dictionary
    describing the failure (and echoing the raw output) is returned instead.
    """
    fenced = extract_json_from_markdown(raw_output)
    if fenced:
        try:
            return json.loads(fenced)
        except json.JSONDecodeError as e:
            return {"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": fenced, "raw_output": raw_output}

    # No fenced block: the model may have answered with bare JSON.
    try:
        return json.loads(raw_output)
    except json.JSONDecodeError as e:
        return {"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_output}


@app.route('/read-pdf', methods=['GET'])
def read_pdf():
    """Extract structured invoice data from a PDF via Gemini.

    Query parameters:
        file_name: Path of the invoice PDF to read on the server.

    Returns:
        An application/json response. On success the body is the JSON object
        produced by the model. On failure the body is an object with an
        'error' key; a missing 'file_name' parameter answers with HTTP 400,
        while the remaining error payloads keep HTTP 200 so existing clients
        that inspect the 'error' key are unaffected.
    """
    #EXTRACTING WHOLE DATA IN JSON FROM INVOICE
    system_prompt = """
                You are an expert in converting shipping invoices into a structured JSON format. 
                Your task is to extract information from the provided PDF and organize it into a single JSON object. 
                This JSON object must have a single parent tag named 'invoice_details'. 
                Inside 'invoice_details', there should be exactly four nested objects with the following names: 'supplier_data', 'customer_data', 'charges_data' and 'invoice_metadata'.
                
                The 'supplier_data' section must be a JSON object with the following keys: 'name', 'address', 'gstin', 'cin', and 'pan'.

                The 'customer_data' section must be a JSON object with the following keys: 'client_no', 'invoice_to', 'address', 'state', 'gstin', and 'pan'.

                The 'charges_data' section must be a list of objects. 
                Each object in this list must have the following keys: 'charge', 'hsn_code', 'qty_x_rate_curr', 'currency', 'total_curr', 'roe', 'total_inr', 'sgst_ugst_percent', 'sgst_ugst_amount', 'cgst_percent', 'cgst_amount', 'igst_percent', and 'igst_amount'.

                This 'invoice_metadata' tag must contain the following specific keys: 'invoice_no', 'invoice_date', 'place_of_supply', 'vessel', 'voyage', 'pol', 'pod', 'bl_no', 'taxable_value_inr', 'total_gst_amount_inr', and 'total_invoice_amount_inr'. 
                Within 'invoice_metadata', you must also include a nested JSON object called 'currency_total' with the keys 'amount_in_eur', 'amount_in_usd', and 'amount_in_inr'. 
                
                Extract the corresponding data from the invoice and populate these tags accordingly.
                """
    user_prompt = """Please extract the data from the attached invoice PDF and convert it to the specified JSON format."""

    # NOTE(security): 'file_name' comes straight from the query string and is
    # opened as given, so any file readable by the server process can be sent
    # to the model and echoed back. Restrict it to a dedicated upload
    # directory before exposing this endpoint to untrusted callers.
    image_path = request.args.get('file_name')
    if not image_path:
        # Without this guard a missing parameter surfaced as a confusing
        # "API call failed" error raised while building the file path.
        return _json_response(
            {"error": "Missing required query parameter", "message": "Pass the invoice PDF path as ?file_name=<path>"},
            status=400,
        )

    try:
        response = gemini_output(image_path, system_prompt, user_prompt)

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return _json_response({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return _json_response({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")

        # Parsing and re-dumping guarantees the body is valid, single-line JSON.
        return _json_response(_parse_model_json(raw_gemini_output))

    except Exception as e:
        # Request boundary: file errors and model/SDK failures are all reported
        # to the client as an error payload rather than an unhandled exception.
        return _json_response({"error": "API call failed", "message": str(e)})


#USER METHODS
#DEFINE PDF FORMAT TO INPUT IN GEMINI
def read_pdf_bytes(pdf_path: str) -> list[dict]:
    """
    Reads a file's raw bytes and wraps them in a dictionary tagged with the
    'application/pdf' MIME type.

    Args:
        pdf_path (str): The path to the input PDF file.

    Returns:
        list: A single-element list containing a dictionary with the keys
              'mime_type' (always "application/pdf") and 'data' (the file's
              raw bytes).

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
        IOError: If the path exists but its bytes cannot be read; the
                 underlying error is attached as ``__cause__``.
    """
    pdf_file = Path(pdf_path)

    if not pdf_file.exists():
        raise FileNotFoundError(f"Could not find PDF file: {pdf_file}")

    # Only the extension is checked, and only to warn: the bytes are read and
    # labelled as PDF regardless of the file's actual content.
    if pdf_file.suffix.lower() != ".pdf":
        print(f"Warning: The file '{pdf_file.name}' does not have a .pdf extension. "
              f"Proceeding assuming it's a PDF, but this might indicate an issue.")

    try:
        # Read the entire content of the PDF file as bytes
        pdf_data = pdf_file.read_bytes()
    except OSError as e:
        # Chain explicitly so the original failure (permissions, directory, ...)
        # stays visible in tracebacks.
        raise IOError(f"Error reading bytes from PDF file '{pdf_path}': {e}") from e

    return [
        {
            "mime_type": "application/pdf",
            "data": pdf_data,
        }
    ]

#GEMINI MODEL OUTPUT
def gemini_output(image_path, system_prompt, user_prompt):
    """Ask the Gemini model to process the PDF at *image_path*.

    The file is loaded with read_pdf_bytes and its single content part is
    placed between the system prompt and the user prompt. The response object
    from model.generate_content is returned as-is (not just its text).
    """
    pdf_part = read_pdf_bytes(image_path)[0]
    return model.generate_content([system_prompt, pdf_part, user_prompt])

#EXTRACT JSON FROM MARKDOWN FENCES (```json ... ```).
def extract_json_from_markdown(text):
    """
    Extracts the text enclosed in a markdown code fence.

    A fence tagged exactly ```json is preferred. If none is present, the first
    fence with a differently-cased tag (```JSON) or no tag at all (```) is
    used instead.

    The extracted text is NOT validated as JSON; callers must parse it.

    Returns the stripped fence content, or None if the text is empty or
    contains no complete fence.
    """
    if not text:
        return None

    # First pass: the exact lowercase ```json fence.
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if match is None:
        # Fallback: tolerate an upper/mixed-case tag or an untagged fence.
        match = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL | re.IGNORECASE)

    if match:
        return match.group(1).strip()
    return None

#END USER METHODS

# Script entry point: start Flask's built-in server when this file is run directly
if __name__ == "__main__":
    app.run(port=5000)  # Run on port 5000

