import cv2
import numpy as np
from PIL import Image
import pytesseract
import requests
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress urllib3's InsecureRequestWarning (optional). This module issues a
# request with verify=False in extract_text_from_image_url, which is the kind
# of call that warning is about.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def image_to_text_json(image_bytes):
    """Run OCR on raw image bytes and return the outcome as a JSON string.

    Delegates to ``extract_text_from_file`` and prints the extracted text
    (via ``repr``) to stdout for debugging.

    Args:
        image_bytes: Encoded image data, passed through unchanged to
            ``extract_text_from_file``.

    Returns:
        A JSON document: ``{"error": <message>}`` when the extraction
        reported an error, otherwise ``{"text": <extracted text>}``.
    """
    ocr_text, failure = extract_text_from_file(image_bytes)
    # Debug trace is emitted regardless of whether the extraction failed.
    print("DEBUG: OCR Extracted Text:", repr(ocr_text))
    payload = {"error": failure} if failure else {"text": ocr_text}
    return json.dumps(payload)

def preprocess_image_opencv(image):
    """Prepare an OpenCV image for OCR: grayscale, inverse threshold, blur.

    Args:
        image: Image array converted with ``cv2.COLOR_BGR2GRAY``, so it is
            expected to be in BGR channel order.

    Returns:
        The single-channel result of the three steps below.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Inverse binary threshold at 150 with a max value of 255; only the
    # thresholded image (second element of the returned pair) is needed.
    binarised = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)[1]

    # 5x5 Gaussian blur applied after thresholding to soften noise.
    return cv2.GaussianBlur(binarised, (5, 5), 0)

def extract_text_from_file(file_bytes):
    """Run OCR on an encoded image held in memory.

    The bytes are decoded with OpenCV, passed through
    ``preprocess_image_opencv`` and then given to Tesseract once for each
    page-segmentation mode (PSM) in ``[3, 4, 6, 11, 13]``.

    Args:
        file_bytes: Encoded image data (bytes-like).

    Returns:
        A ``(text, error)`` tuple. On success ``text`` holds one
        ``"PSM <n>: <stripped text>"`` entry per PSM value, joined with
        newlines, and ``error`` is ``None``. On any failure ``text`` is
        ``""`` and ``error`` is a message starting with
        ``"OCR failed for uploaded image: "``. Exceptions are never
        propagated to the caller.
    """
    psm_values = [3, 4, 6, 11, 13]  # You can customize this list based on your needs

    try:
        # Reject missing/empty input up front so the caller gets a clear
        # message rather than a low-level decoder error.
        if file_bytes is None or len(file_bytes) == 0:
            raise ValueError("no image data provided")

        # Convert byte data to a NumPy array and then read it with OpenCV
        np_array = np.frombuffer(file_bytes, np.uint8)
        image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)

        # imdecode reports undecodable data by returning None instead of
        # raising; without this check the failure would only surface later
        # as an opaque OpenCV assertion inside the preprocessing step.
        if image is None:
            raise ValueError("data could not be decoded as an image")

        # Preprocess with OpenCV, then wrap the result as a PIL image
        pil_image = Image.fromarray(preprocess_image_opencv(image))

        # Extract text once per PSM value
        extracted_texts = {
            psm: pytesseract.image_to_string(
                pil_image, lang='eng', config=f'--psm {psm}'
            ).strip()
            for psm in psm_values
        }

        # Return extracted text for all PSM values as a single string
        combined = "\n".join(
            f"PSM {psm}: {text}" for psm, text in extracted_texts.items()
        )
        return combined, None

    except Exception as e:
        # Deliberate catch-all: callers rely on the (text, error) contract.
        return "", f"OCR failed for uploaded image: {str(e)}"

def extract_text_from_image_url(image_url):
    """Download an image over HTTP(S) and run OCR on it.

    The image is fetched with ``requests``, decoded with OpenCV, passed
    through ``preprocess_image_opencv`` and then given to Tesseract once
    for each page-segmentation mode (PSM) in ``[3, 4, 6, 11, 13]``.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` holds one
        ``"PSM <n>: <stripped text>"`` entry per PSM value, joined with
        newlines, and ``error`` is ``None``. If the server answers with a
        status other than 200, ``text`` is ``""`` and ``error`` is
        ``"Failed to fetch image: <status>"``. On any other failure
        ``text`` is ``""`` and ``error`` starts with
        ``"OCR failed from URL: "``. Exceptions are never propagated.
    """
    psm_values = [3, 4, 6, 11, 13]  # You can customize this list based on your needs

    try:
        if not image_url:
            raise ValueError("no image URL provided")

        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': 'image/*',
            'Connection': 'keep-alive'
        }

        # NOTE(security): verify=False disables TLS certificate validation,
        # so the downloaded bytes cannot be trusted to come from the named
        # host. Kept for backward compatibility; enable verification if the
        # image sources allow it.
        #
        # The response is used as a context manager because stream=True
        # otherwise leaves the connection checked out of the pool when we
        # return early without reading the body (e.g. on a non-200 status).
        with requests.get(image_url, headers=headers, timeout=15,
                          verify=False, stream=True) as response:
            if response.status_code != 200:
                return "", f"Failed to fetch image: {response.status_code}"
            image_data = response.content

        if not image_data:
            raise ValueError("response body is empty")

        # Convert byte data to a NumPy array and then read it with OpenCV
        np_array = np.frombuffer(image_data, np.uint8)
        image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)

        # imdecode reports undecodable data by returning None instead of
        # raising; fail here with a clear message rather than letting the
        # preprocessing step hit an opaque OpenCV assertion.
        if image is None:
            raise ValueError("response body could not be decoded as an image")

        # Preprocess with OpenCV, then wrap the result as a PIL image
        pil_image = Image.fromarray(preprocess_image_opencv(image))

        # Extract text once per PSM value
        extracted_texts = {
            psm: pytesseract.image_to_string(
                pil_image, lang='eng', config=f'--psm {psm}'
            ).strip()
            for psm in psm_values
        }

        # Return extracted text for all PSM values as a single string
        combined = "\n".join(
            f"PSM {psm}: {text}" for psm, text in extracted_texts.items()
        )
        return combined, None

    except Exception as e:
        # Deliberate catch-all: callers rely on the (text, error) contract.
        return "", f"OCR failed from URL: {str(e)}"