# CTFin / app.py
import streamlit as st
import geopandas as gpd
import sqlite3
import pandas as pd
import os
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Tell GDAL/OGR to rebuild a missing .shx index file instead of failing when the shapefile is read
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")
# Connect to SQLite database
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()
# Create table for floodland data
cursor.execute('''
    CREATE TABLE IF NOT EXISTS floodlands (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        DFIRM_ID TEXT,
        VERSION_ID TEXT,
        FLD_AR_ID TEXT,
        STUDY_TYP TEXT,
        FLD_ZONE TEXT,
        ZONE_SUBTY TEXT,
        SFHA_TF TEXT,
        STATIC_BFE FLOAT,
        V_DATUM TEXT,
        DEPTH FLOAT,
        LEN_UNIT TEXT,
        VELOCITY FLOAT,
        VEL_UNIT TEXT,
        AR_REVERT TEXT,
        AR_SUBTRV TEXT,
        BFE_REVERT FLOAT,
        DEP_REVERT FLOAT,
        DUAL_ZONE TEXT,
        SOURCE_CIT TEXT,
        geometry TEXT,
        acreage FLOAT,
        usable_area FLOAT
    )
''')
conn.commit()
# Load and process the shapefile
shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
gdf = gpd.read_file(shapefile_path)
# Check the initial CRS
st.write("Initial CRS:", gdf.crs)
# If the CRS is missing, assume WGS84 (EPSG:4326) as a fallback so the reprojection below works.
# (FEMA NFHL data is typically distributed in NAD83, EPSG:4269, so verify against the source metadata if results look off.)
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)
    st.write("CRS was missing; set to EPSG:4326 (WGS84).")
# Validate geometries
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom is not None and geom.is_valid else None)
if gdf['geometry'].isnull().any():
    st.write(f"Found {gdf['geometry'].isnull().sum()} invalid or null geometries. Dropping these rows.")
    gdf = gdf.dropna(subset=['geometry'])
# Reproject to UTM Zone 18N (EPSG:32618) for accurate area calculations
gdf = gdf.to_crs(epsg=32618)
st.write("CRS after reprojection:", gdf.crs)
# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105
# Calculate usable area (excluding flood-prone zones)
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(
    lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0, axis=1
)
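# Any record whose FLD_ZONE is not in the list above (e.g. zone X, minimal hazard) is counted
# as fully usable; flood-prone records contribute 0 usable acres.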
# Convert geometry to WKT for storage in SQLite
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
# Create a new DataFrame without the geometry column for SQLite storage
gdf_for_sql = gdf.drop(columns=['geometry'])
# Store in SQLite (if_exists='replace' drops and recreates the table, superseding the schema created above)
rows_inserted = gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
st.write(f"Inserted {rows_inserted if rows_inserted is not None else len(gdf_for_sql)} rows into the database.")
# Close the database connection
conn.close()
# Load sentence transformer for embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')
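# all-MiniLM-L6-v2 returns 384-dimensional float32 sentence embeddings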
# Load summarization model (using google/pegasus-xsum, which is publicly accessible)
summarizer = pipeline("summarization", model="google/pegasus-xsum", device=0 if torch.cuda.is_available() else -1)
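# Note: google/pegasus-xsum is fine-tuned on XSum for single-sentence abstractive summaries;
# forcing a long min_length (as in rag_summarize below) can produce repetitive or padded output.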
# Create a text representation of each floodland record for embedding
gdf['text'] = gdf.apply(
    lambda row: f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
                f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres",
    axis=1
)
# Embed the text representations
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)
# Create a FAISS index for retrieval
d = embeddings.shape[1] # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)
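# IndexFlatL2 performs exact (brute-force) nearest-neighbour search over L2 distance and expects
# float32 vectors, which SentenceTransformer.encode returns by default.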
# Store the embeddings
gdf['embedding'] = list(embeddings)
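# Optional sanity check (a minimal sketch, not part of the app flow): with exact search, the
# nearest neighbour of the first record's own text should normally be that record itself (ties aside).
# test_vec = embedder.encode([gdf['text'].iloc[0]]).astype('float32')
# _, test_idx = index.search(test_vec, 1)
# assert test_idx[0][0] == 0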
# RAG-based summarization function (without prompt)
def rag_summarize(query, gdf, index, k=5):
    # Embed the query
    query_embedding = embedder.encode([query])[0]
    # Retrieve top-k relevant documents
    distances, indices = index.search(np.array([query_embedding]), k)
    retrieved_docs = gdf.iloc[indices[0]]
    # Aggregate acreage and usable area from retrieved documents
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    # Create a simplified narrative context
    context = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
               f"A significant portion, spanning {retrieved_docs.iloc[0]['acreage']:.2f} acres, is classified as {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone, "
               f"indicating potential flood risks that could impact development projects.")
    # Debug: Display the narrative context
    st.write("Narrative context for summarization:")
    st.write(context)
    # Generate summary without a prompt
    try:
        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
        st.write("Raw summarizer output:", summary_output)
        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
            summary = summary_output[0]['summary_text']
        else:
            raise ValueError("Unexpected output format from summarizer.")
    except Exception as e:
        st.write(f"Error in summarization: {e}")
        # Fallback: Generate a basic summary manually
        summary = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
                   f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
                   f"A significant portion, including {retrieved_docs.iloc[0]['acreage']:.2f} acres, falls under the {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone. "
                   f"This indicates a high risk of flooding, which could impact power plant projects. "
                   f"Overall, the limited usable land poses challenges for development in this region.")
    return summary
# Streamlit interface
st.title("Floodland Summary Bot")
# Input field for the user to enter a location
user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")
# Submit button
if st.button("Submit"):
if user_input:
st.write("Hi, How can I help you today?")
st.write(f"User input: {user_input}")
# Generate summary
summary = rag_summarize(user_input, gdf, index, k=5)
st.write(summary)
else:
st.write("Please enter a location to proceed.")