"""Streamlit app that retrieves flood-zone records with FAISS and summarizes them with LLaMA-2.

Pipeline:
    1. Load the flood-hazard shapefile, compute acreage / usable acreage.
    2. Embed a one-line text description of every polygon and index it in FAISS.
    3. For a user query, retrieve the nearest descriptions and ask LLaMA-2
       to summarize the aggregated figures.
"""

import os
import sqlite3

import faiss
import geopandas as gpd
import numpy as np
import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from shapely.geometry import shape
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set the environment variables for GPU usage in Hugging Face
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Expose only GPU 0 to this process
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")

SHAPEFILE_PATH = 'S_FLD_HAZ_AR.shp'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
SQ_METERS_TO_ACRES = 0.000247105  # 1 square meter = 0.000247105 acres
UTM_18N_EPSG = 32618  # UTM Zone 18N (New York); metric CRS so .area is in m^2
_MIN_NEW_TOKENS = 200  # Floor for the generation budget in llama_summarize

# Step 1: Load and Process Floodland Data
# NOTE(review): neither `conn` nor `cursor` is used anywhere in this file;
# they are kept unchanged in case other code relies on them.
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()

# Zones whose acreage is counted as unusable
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']


@st.cache_resource
def load_embedder(model_name=EMBEDDING_MODEL_NAME):
    """Load the sentence-embedding model once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the model from being re-instantiated on each rerun.
    """
    return SentenceTransformer(model_name)


@st.cache_resource
def load_floodland_index(shapefile_path=SHAPEFILE_PATH):
    """Read the shapefile, derive acreage columns and build the FAISS index.

    Args:
        shapefile_path: Path of the flood-hazard shapefile to load.

    Returns:
        A ``(gdf, index)`` pair. ``gdf`` carries the added columns
        ``acreage``, ``usable_area``, ``wkt_geometry``, ``text`` and
        ``embedding``; row *i* of ``gdf`` (positional) corresponds to
        vector *i* in ``index``.

    Raises:
        ValueError: If no valid geometry remains after validation.
    """
    frame = gpd.read_file(shapefile_path)

    # Validate geometries; a missing geometry is treated as invalid instead of
    # raising AttributeError on `None.is_valid`.
    keep = frame['geometry'].apply(lambda geom: geom is not None and geom.is_valid)
    frame = frame[keep.astype(bool)].copy()
    if frame.empty:
        raise ValueError(f"No valid geometries found in {shapefile_path!r}")

    # Convert CRS to UTM Zone 18N (New York) so areas come out in square meters
    frame = frame.to_crs(epsg=UTM_18N_EPSG)

    # Calculate acreage and usable area (zero inside flood-prone zones)
    frame['acreage'] = frame.geometry.area * SQ_METERS_TO_ACRES
    frame['usable_area'] = np.where(
        frame['FLD_ZONE'].isin(flood_prone_zones), 0.0, frame['acreage']
    )

    # Convert geometry to WKT format
    frame['wkt_geometry'] = frame['geometry'].apply(lambda geom: geom.wkt)

    # Convert floodland descriptions into text
    frame['text'] = [
        f"Flood Zone: {zone}, Subtype: {subtype}, Acreage: {acres:.2f} acres, "
        f"Usable Area: {usable:.2f} acres"
        for zone, subtype, acres, usable in zip(
            frame['FLD_ZONE'], frame['ZONE_SUBTY'], frame['acreage'], frame['usable_area']
        )
    ]

    # Generate text embeddings (FAISS expects contiguous float32)
    vectors = load_embedder().encode(frame['text'].tolist(), show_progress_bar=True)
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)

    # Create FAISS index
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)

    # Store embeddings in DataFrame
    frame['embedding'] = list(vectors)
    return frame, flat_index


@st.cache_resource
def load_llama(model_name):
    """Load the LLaMA tokenizer and model once per server process.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
    llama_model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    return llama_tokenizer, llama_model


# Step 2: Load Embedding Model (Sentence-Transformer) and build the index
embedder = load_embedder()
gdf, index = load_floodland_index()

# Step 3: Load LLaMA Model for Summarization
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer, model = load_llama(llama_model_name)


# Function to Generate Summary using LLaMA
def llama_summarize(text, total_acreage, usable_acreage, location_data, max_length=250):
    """Generate a summary of the aggregated flood-zone figures with LLaMA.

    Args:
        text: The user query. NOTE(review): accepted for interface
            compatibility but not included in the prompt.
        total_acreage: Sum of the acreage of the retrieved records.
        usable_acreage: Sum of the usable acreage of the retrieved records.
        location_data: Pre-formatted per-record lines to embed in the prompt.
        max_length: Target total length (prompt + generation) in tokens. The
            generation budget never drops below ``_MIN_NEW_TOKENS``.

    Returns:
        The generated text only (the prompt is not echoed back).
    """
    # NOTE(review): the original whitespace of this prompt was lost; the
    # field-per-line layout below is a reconstruction.
    input_text = (
        f"**Total Land Area**: {total_acreage:.2f} acres\n"
        f"**Usable Area**: {usable_acreage:.2f} acres\n"
        f"**Flood-prone Zones**:\n{location_data}\n"
        "\n"
        "Summarization in sentence\n"
    )

    # With device_map="auto" the model decides its own placement, so follow
    # the model's device rather than the module-level `device` guess.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Budget = what is left of max_length, but always at least the floor so a
    # nearly exhausted budget does not truncate the answer to a few tokens.
    input_length = inputs['input_ids'].shape[1]
    max_new_tokens = max(max_length - input_length, _MIN_NEW_TOKENS)

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # temperature / top_k / top_p only apply when sampling
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    generated_tokens = output_tokens[0][input_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# Step 4: RAG Summarization Function
def rag_summarize(query, gdf, index, k=5):
    """Retrieve the ``k`` records nearest to ``query`` and summarize them.

    Args:
        query: Free-text user query.
        gdf: Frame whose positional rows match the vectors in ``index`` and
            which has ``FLD_ZONE``, ``ZONE_SUBTY``, ``acreage`` and
            ``usable_area`` columns.
        index: FAISS index built over the row descriptions.
        k: Maximum number of records to retrieve.

    Returns:
        The LLaMA summary, or a short notice when nothing can be retrieved.
    """
    query = query.lower().strip()

    # FAISS pads results with -1 when asked for more neighbours than it
    # holds; -1 would silently select the last row via iloc, so clamp k and
    # drop any padding.
    k = min(k, index.ntotal)
    if k <= 0:
        return "No floodland records are available to summarize."

    query_vector = np.ascontiguousarray(embedder.encode([query]), dtype=np.float32)

    # Retrieve top-k relevant documents
    _, indices = index.search(query_vector, k)
    hits = indices[0]
    retrieved_docs = gdf.iloc[hits[hits >= 0]]
    if retrieved_docs.empty:
        return "No floodland records are available to summarize."

    # Aggregate data
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    location_data = "\n".join(
        f"- **Flood Zone**: {zone}, **Subtype**: {subtype}, "
        f"**Acreage**: {acres:.2f}, **Usable Area**: {usable:.2f} acres"
        for zone, subtype, acres, usable in zip(
            retrieved_docs['FLD_ZONE'],
            retrieved_docs['ZONE_SUBTY'],
            retrieved_docs['acreage'],
            retrieved_docs['usable_area'],
        )
    )

    # Use LLaMA for summarization
    return llama_summarize(query, total_acreage, usable_acreage, location_data)


# Streamlit Interface
st.title("🌊 Floodland Summary Bot (Powered by LLaMA-2)")

# Input for location
user_input = st.text_input("Enter a location (e.g., New York)", "")

# When the user inputs a non-blank query, display the summary
query = user_input.lower().strip()
if query:
    summary = rag_summarize(query, gdf, index)
    st.write(summary)