import streamlit as st
import geopandas as gpd
import sqlite3
import pandas as pd
import os
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Set the SHAPE_RESTORE_SHX configuration option to YES so GDAL can rebuild a missing .shx index
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")
# Connect to SQLite database
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()
# Create table for floodland data
cursor.execute('''
    CREATE TABLE IF NOT EXISTS floodlands (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        DFIRM_ID TEXT,
        VERSION_ID TEXT,
        FLD_AR_ID TEXT,
        STUDY_TYP TEXT,
        FLD_ZONE TEXT,
        ZONE_SUBTY TEXT,
        SFHA_TF TEXT,
        STATIC_BFE FLOAT,
        V_DATUM TEXT,
        DEPTH FLOAT,
        LEN_UNIT TEXT,
        VELOCITY FLOAT,
        VEL_UNIT TEXT,
        AR_REVERT TEXT,
        AR_SUBTRV TEXT,
        BFE_REVERT FLOAT,
        DEP_REVERT FLOAT,
        DUAL_ZONE TEXT,
        SOURCE_CIT TEXT,
        geometry TEXT,
        acreage FLOAT,
        usable_area FLOAT
    )
''')
conn.commit()
# Load and process the shapefile
shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
gdf = gpd.read_file(shapefile_path)
# Check the initial CRS
st.write("Initial CRS:", gdf.crs)
# If the CRS is missing, fall back to WGS84 (EPSG:4326) so the reprojection below has a defined source CRS
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)
    st.write("CRS was missing; set to EPSG:4326 (WGS84).")
# Validate geometries (guard against null geometries before checking validity)
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom is not None and geom.is_valid else None)
if gdf['geometry'].isnull().any():
    st.write(f"Found {gdf['geometry'].isnull().sum()} invalid or null geometries. Dropping these rows.")
    gdf = gdf.dropna(subset=['geometry'])
# Reproject to UTM Zone 18N (EPSG:32618) for accurate area calculations
gdf = gdf.to_crs(epsg=32618)
st.write("CRS after reprojection:", gdf.crs)
# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105
# Calculate usable area (excluding flood-prone zones)
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(
    lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0, axis=1
)
# Convert geometry to WKT for storage in SQLite
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
# Create a new DataFrame without the geometry column for SQLite storage
gdf_for_sql = gdf.drop(columns=['geometry'])
# Store in SQLite (if_exists='replace' drops and recreates the table from the DataFrame's columns)
gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
st.write(f"Inserted {len(gdf_for_sql)} rows into the database.")
# Close the database connection
conn.close()
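# Optional helper (illustrative sketch, not called anywhere in this app): one way the WKT
# geometries stored above could be read back out of SQLite into a GeoDataFrame. Assumes
# shapely is available (it is installed alongside geopandas) and that the rows were written
# in the projected CRS used above (EPSG:32618). The function name and db_path default are
# arbitrary choices for illustration.
def load_floodlands_from_db(db_path='NY.db'):
    from shapely import wkt
    read_conn = sqlite3.connect(db_path)
    df = pd.read_sql_query("SELECT * FROM floodlands", read_conn)
    read_conn.close()
    # Rebuild shapely geometries from the stored WKT strings
    df['geometry'] = df['wkt_geometry'].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:32618')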
# Load sentence transformer for embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Load summarization model (using google/pegasus-xsum, which is publicly accessible)
summarizer = pipeline("summarization", model="google/pegasus-xsum", device=0 if torch.cuda.is_available() else -1)
# Create a text representation of each floodland record for embedding
gdf['text'] = gdf.apply(
    lambda row: f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
                f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres",
    axis=1
)
# Embed the text representations
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)
# Create a FAISS index for retrieval
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)
# Store the embeddings
gdf['embedding'] = list(embeddings)
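# Optional sketch (not used below): the FAISS index can be persisted to disk so the
# embeddings do not have to be recomputed on every Streamlit rerun. The file name
# "floodlands.index" is an arbitrary choice for illustration.
def save_and_reload_index(faiss_index, path="floodlands.index"):
    faiss.write_index(faiss_index, path)  # serialize the index to disk
    return faiss.read_index(path)         # load it back later, e.g. at app startup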
# RAG-based summarization function (without prompt)
def rag_summarize(query, gdf, index, k=5):
    # Embed the query
    query_embedding = embedder.encode([query])[0]
    # Retrieve top-k relevant documents
    distances, indices = index.search(np.array([query_embedding]), k)
    retrieved_docs = gdf.iloc[indices[0]]
    # Aggregate acreage and usable area from retrieved documents
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    # Create a simplified narrative context
    context = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
               f"A significant portion, spanning {retrieved_docs.iloc[0]['acreage']:.2f} acres, is classified as {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone, "
               f"indicating potential flood risks that could impact development projects.")
    # Debug: Display the narrative context
    st.write("Narrative context for summarization:")
    st.write(context)
    # Generate summary without a prompt
    try:
        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
        st.write("Raw summarizer output:", summary_output)
        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
            summary = summary_output[0]['summary_text']
        else:
            raise ValueError("Unexpected output format from summarizer.")
    except Exception as e:
        st.write(f"Error in summarization: {e}")
        # Fallback: Generate a basic summary manually
        summary = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
                   f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
                   f"A significant portion, including {retrieved_docs.iloc[0]['acreage']:.2f} acres, falls under the {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone. "
                   f"This indicates a high risk of flooding, which could impact power plant projects. "
                   f"Overall, the limited usable land poses challenges for development in this region.")
    return summary
# Streamlit interface
st.title("Floodland Summary Bot")
# Input field for the user to enter a location
user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")
# Submit button
if st.button("Submit"):
    if user_input:
        st.write("Hi, how can I help you today?")
        st.write(f"User input: {user_input}")
        # Generate summary
        summary = rag_summarize(user_input, gdf, index, k=5)
        st.write(summary)
    else:
        st.write("Please enter a location to proceed.")