import streamlit as st
import geopandas as gpd
import sqlite3
import pandas as pd
import os
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Set the SHAPE_RESTORE_SHX configuration option to YES so GDAL can rebuild a missing .shx index
os.environ['SHAPE_RESTORE_SHX'] = 'YES'

# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")

# Connect to SQLite database
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()

# Create table for floodland data
cursor.execute('''
    CREATE TABLE IF NOT EXISTS floodlands (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        DFIRM_ID TEXT,
        VERSION_ID TEXT,
        FLD_AR_ID TEXT,
        STUDY_TYP TEXT,
        FLD_ZONE TEXT,
        ZONE_SUBTY TEXT,
        SFHA_TF TEXT,
        STATIC_BFE FLOAT,
        V_DATUM TEXT,
        DEPTH FLOAT,
        LEN_UNIT TEXT,
        VELOCITY FLOAT,
        VEL_UNIT TEXT,
        AR_REVERT TEXT,
        AR_SUBTRV TEXT,
        BFE_REVERT FLOAT,
        DEP_REVERT FLOAT,
        DUAL_ZONE TEXT,
        SOURCE_CIT TEXT,
        geometry TEXT,
        acreage FLOAT,
        usable_area FLOAT
    )
''')
conn.commit()

# Load and process the shapefile
shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
gdf = gpd.read_file(shapefile_path)

# Check the initial CRS
st.write("Initial CRS:", gdf.crs)

# If the CRS is missing, set it to WGS84 (EPSG:4326), which is common for FEMA shapefiles
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)
    st.write("CRS was missing; set to EPSG:4326 (WGS84).")

# Validate geometries and drop invalid or null ones
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom.is_valid else None)
if gdf['geometry'].isnull().any():
    st.write(f"Found {gdf['geometry'].isnull().sum()} invalid or null geometries. Dropping these rows.")
    gdf = gdf.dropna(subset=['geometry'])

# Reproject to UTM Zone 18N (EPSG:32618) for accurate area calculations
gdf = gdf.to_crs(epsg=32618)
st.write("CRS after reprojection:", gdf.crs)

# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105

# Calculate usable area (excluding flood-prone zones)
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(
    lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0,
    axis=1
)

# Convert geometry to WKT for storage in SQLite
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)

# Create a new DataFrame without the geometry column for SQLite storage,
# renaming the WKT column to match the table schema defined above
gdf_for_sql = gdf.drop(columns=['geometry']).rename(columns={'wkt_geometry': 'geometry'})

# Store in SQLite (if_exists='replace' overwrites the table created above)
gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
st.write(f"Inserted {len(gdf_for_sql)} rows into the database.")

# Close the database connection
conn.close()

# Load sentence transformer for embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Load summarization model (google/pegasus-xsum is publicly accessible)
summarizer = pipeline(
    "summarization",
    model="google/pegasus-xsum",
    device=0 if torch.cuda.is_available() else -1
)

# Create a text representation of each floodland record for embedding
gdf['text'] = gdf.apply(
    lambda row: f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
                f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres",
    axis=1
)

# Embed the text representations
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)
embeddings = np.asarray(embeddings, dtype='float32')  # FAISS expects float32 vectors

# Create a FAISS index for retrieval
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)

# Store the embeddings alongside the records
gdf['embedding'] = list(embeddings)
# RAG-based summarization function (no instruction prompt is passed to the summarizer)
def rag_summarize(query, gdf, index, k=5):
    # Embed the query
    query_embedding = embedder.encode([query])[0]

    # Retrieve the top-k most relevant records
    distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
    retrieved_docs = gdf.iloc[indices[0]]

    # Aggregate acreage and usable area from the retrieved records
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()

    # Build a simplified narrative context from the aggregated figures
    context = (
        f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
        f"A significant portion, spanning {retrieved_docs.iloc[0]['acreage']:.2f} acres, is classified as {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone, "
        f"indicating potential flood risks that could impact development projects."
    )

    # Debug: display the narrative context
    st.write("Narrative context for summarization:")
    st.write(context)

    # Generate the summary without a prompt
    try:
        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
        st.write("Raw summarizer output:", summary_output)
        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
            summary = summary_output[0]['summary_text']
        else:
            raise ValueError("Unexpected output format from summarizer.")
    except Exception as e:
        st.write(f"Error in summarization: {e}")
        # Fallback: generate a basic summary manually
        summary = (
            f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
            f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
            f"A significant portion, including {retrieved_docs.iloc[0]['acreage']:.2f} acres, falls under the {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone. "
            f"This indicates a high risk of flooding, which could impact power plant projects. "
            f"Overall, the limited usable land poses challenges for development in this region."
        )

    return summary


# Streamlit interface
st.title("Floodland Summary Bot")

# Input field for the user to enter a location
user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")

# Submit button
if st.button("Submit"):
    if user_input:
        st.write("Hi, how can I help you today?")
        st.write(f"User input: {user_input}")

        # Generate and display the summary
        summary = rag_summarize(user_input, gdf, index, k=5)
        st.write(summary)
    else:
        st.write("Please enter a location to proceed.")