import gradio as gr
import torch
from transformers import (
    AutoImageProcessor,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BlipForConditionalGeneration,
    VisionEncoderDecoderModel,
)

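# Load processor/model pairs for all five captioning checkpoints:
# GIT base/large (fine-tuned on COCO), BLIP base/large, and ViT+GPT-2.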
git_processor_base = AutoProcessor.from_pretrained("microsoft/git-base-coco")
git_model_base = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

git_processor_large = AutoProcessor.from_pretrained("microsoft/git-large-coco")
git_model_large = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

blip_processor_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

vitgpt_processor = AutoImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run inference on the GPU when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

git_model_base.to(device)
blip_model_base.to(device)
git_model_large.to(device)
blip_model_large.to(device)
vitgpt_model.to(device)
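# Note: from_pretrained() already returns models in eval mode, so no extra
# model.eval() call is needed before inference.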

def generate_caption(processor, model, image, tokenizer=None):
    """Caption a single PIL image with the given processor/model pair.

    A separate tokenizer is only passed for the ViT+GPT-2 pipeline, whose
    image processor cannot decode the generated token ids itself.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Decode up to 50 tokens; no gradients are needed at inference time.
    with torch.no_grad():
        generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)

    if tokenizer is not None:
        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    else:
        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_caption
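
# Hedged smoke test (assumes a local "cat.jpg", the same file used in the
# examples below); kept commented out so the script only starts the web app:
# from PIL import Image
# print(generate_caption(blip_processor_base, blip_model_base, Image.open("cat.jpg")))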


def generate_captions(image):
    """Run the same image through all five models and return their captions."""
    caption_git_base = generate_caption(git_processor_base, git_model_base, image)
    caption_git_large = generate_caption(git_processor_large, git_model_large, image)
    caption_blip_base = generate_caption(blip_processor_base, blip_model_base, image)
    caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
    caption_vitgpt = generate_caption(vitgpt_processor, vitgpt_model, image, vitgpt_tokenizer)

    return caption_git_base, caption_git_large, caption_blip_base, caption_blip_large, caption_vitgpt


examples = [["cat.jpg"], ["dog.jpg"], ["horse.jpg"]]
outputs = [
    gr.Textbox(label="Caption generated by GIT-base"),
    gr.Textbox(label="Caption generated by GIT-large"),
    gr.Textbox(label="Caption generated by BLIP-base"),
    gr.Textbox(label="Caption generated by BLIP-large"),
    gr.Textbox(label="Caption generated by ViT+GPT-2"),
]

title = "Image to Text: Multiple Models"
description = "A Gradio demo for comparing five image-captioning checkpoints from three vision+language model families: GIT, BLIP, and ViT+GPT-2. Upload an image and click 'Submit', or pick one of the provided examples."
article = "<p style='text-align: center'><a href='https://huggingface.co/docs/transformers/main/model_doc/blip' target='_blank'>BLIP docs</a> | <a href='https://huggingface.co/docs/transformers/main/model_doc/git' target='_blank'>GIT docs</a></p>"

iface = gr.Interface(
    fn=generate_captions,
    inputs=gr.Image(type="pil"),
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
    article=article,
)
iface.queue()
iface.launch(server_name="0.0.0.0", server_port=7860)
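
# The two standalone demos below (an ASCII-art generator and a shell-command
# viewer) are reference code only: the triple-quoted string keeps them from
# executing when this module runs.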

'''

import gradio as gr
import numpy as np
from PIL import Image

def generate_ascii_art(image):
    try:
        # Convert the numpy array to a PIL Image
        img = Image.fromarray(np.uint8(image))

        # Resize the image to a smaller size for faster processing
        img = img.resize((80, 60))

        # Convert the image to grayscale
        img = img.convert("L")

        # ASCII ramp ordered dense-to-sparse; index by scaled intensity so the
        # whole ramp is used no matter how many characters it holds.
        #ascii_chars = "@%#*+=-:. "
        ascii_chars = "$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/|()1{}[]?-_+~<>i!lI;:,\\^`'. "

        # Map each grayscale pixel (0-255) onto the ramp
        ascii_image = ""
        for pixel_value in img.getdata():
            ascii_image += ascii_chars[pixel_value * len(ascii_chars) // 256]

        # Reshape the ASCII string to match the resized image dimensions
        ascii_image = "\n".join([ascii_image[i:i + img.width] for i in range(0, len(ascii_image), img.width)])

        return ascii_image
    except Exception as e:
        return f"Error: {e}"

iface = gr.Interface(
    fn=generate_ascii_art,
    inputs="image",
    outputs="text",
    title="ASCII Art Generator",
    description="Upload an image, and this app will turn it into ASCII art!  - Simple  Gradio App from  Docker",
    live=True
)

iface.launch(server_name="0.0.0.0", server_port=7860)



import gradio as gr
import subprocess

def run_command(command):
    # Runs an arbitrary shell command and returns its stdout. shell=True makes
    # this equivalent to handing out a shell, so only expose it on a trusted,
    # local machine.
    try:
        result = subprocess.check_output(command, shell=True, text=True)
        return result
    except subprocess.CalledProcessError as e:
        return f"Error: {e}"

iface = gr.Interface(
    fn=run_command,
    inputs="text",
    outputs="text",
    #live=True,
    title="Command Output Viewer",
    description="Enter a command and view its output.",
    examples=[
    ["ls"],
    ["pwd"],
    ["echo 'Hello, Gradio!'"]]
)

iface.launch(server_name="0.0.0.0", server_port=7860)
'''