import streamlit as st
import geopandas as gpd
import sqlite3
import pandas as pd
import os
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Set the SHAPE_RESTORE_SHX configuration option to YES
os.environ['SHAPE_RESTORE_SHX'] = 'YES'

# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")

# Connect to SQLite database
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()

# Create table for floodland data
cursor.execute('''
    CREATE TABLE IF NOT EXISTS floodlands (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        DFIRM_ID TEXT,
        VERSION_ID TEXT,
        FLD_AR_ID TEXT,
        STUDY_TYP TEXT,
        FLD_ZONE TEXT,
        ZONE_SUBTY TEXT,
        SFHA_TF TEXT,
        STATIC_BFE FLOAT,
        V_DATUM TEXT,
        DEPTH FLOAT,
        LEN_UNIT TEXT,
        VELOCITY FLOAT,
        VEL_UNIT TEXT,
        AR_REVERT TEXT,
        AR_SUBTRV TEXT,
        BFE_REVERT FLOAT,
        DEP_REVERT FLOAT,
        DUAL_ZONE TEXT,
        SOURCE_CIT TEXT,
        geometry TEXT,
        acreage FLOAT,
        usable_area FLOAT
    )
''')
conn.commit()

# Load and process the shapefile
shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
gdf = gpd.read_file(shapefile_path)

# Check the initial CRS
st.write("Initial CRS:", gdf.crs)

# If the CRS is missing, assume WGS84 (EPSG:4326). Note that FEMA NFHL shapefiles
# are often distributed in NAD83 (EPSG:4269), so verify against the source metadata.
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)
    st.write("CRS was missing; set to EPSG:4326 (WGS84).")

# Validate geometries (invalid ones are dropped here; shapely's make_valid could repair them instead)
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom.is_valid else None)
if gdf['geometry'].isnull().any():
    st.write(f"Found {gdf['geometry'].isnull().sum()} invalid or null geometries. Dropping these rows.")
    gdf = gdf.dropna(subset=['geometry'])

# Reproject to UTM Zone 18N (EPSG:32618) for accurate area calculations
gdf = gdf.to_crs(epsg=32618)
st.write("CRS after reprojection:", gdf.crs)

# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105

# Calculate usable area (excluding flood-prone zones)
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(
    lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0, axis=1
)

# Convert geometry to WKT for storage in SQLite
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
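
# Optional round-trip check (a minimal sketch; shapely ships as a geopandas
# dependency): the stored WKT text parses back into an equivalent geometry.
from shapely import wkt as shapely_wkt
assert shapely_wkt.loads(gdf['wkt_geometry'].iloc[0]).equals(gdf['geometry'].iloc[0])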

# Create a new DataFrame without the geometry column for SQLite storage
gdf_for_sql = gdf.drop(columns=['geometry'])

# Store in SQLite. Note: if_exists='replace' drops and recreates the table, so the
# schema created above is superseded by the DataFrame's columns (with 'wkt_geometry'
# in place of 'geometry').
gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
st.write(f"Inserted {len(gdf_for_sql)} rows into the database.")

# Close the database connection
conn.close()
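
# Reading the table back later (a sketch, not executed here): the stored WKT
# column can be rebuilt into a GeoDataFrame without re-reading the shapefile.
# reloaded = pd.read_sql('SELECT * FROM floodlands', sqlite3.connect('NY.db'))
# reloaded_gdf = gpd.GeoDataFrame(
#     reloaded,
#     geometry=reloaded['wkt_geometry'].apply(shapely_wkt.loads),
#     crs='EPSG:32618',
# )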

# Load sentence transformer for embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Load summarization model (using google/pegasus-xsum, which is publicly accessible)
summarizer = pipeline("summarization", model="google/pegasus-xsum", device=0 if torch.cuda.is_available() else -1)

# Create a text representation of each floodland record for embedding
gdf['text'] = gdf.apply(
    lambda row: f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
                f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres",
    axis=1
)
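
# For illustration (hypothetical values), one entry of gdf['text'] might read:
# "Floodland area with FLD_ZONE: AE, ZONE_SUBTY: FLOODWAY, acreage: 12.34 acres, usable area: 0.00 acres"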

# Embed the text representations
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)

# Create a FAISS index for retrieval
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)
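
# Quick sanity check (a minimal sketch): querying the index with its own first
# vector should return a nearest-neighbour distance of ~0.
_dist, _idx = index.search(embeddings[:1], 1)
assert _dist[0][0] < 1e-4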

# Store the embeddings
gdf['embedding'] = list(embeddings)

# RAG-based summarization function (without prompt)
def rag_summarize(query, gdf, index, k=5):
    # Embed the query
    query_embedding = embedder.encode([query])[0]
    
    # Retrieve top-k relevant documents
    distances, indices = index.search(np.array([query_embedding]), k)
    retrieved_docs = gdf.iloc[indices[0]]
    
    # Aggregate acreage and usable area from retrieved documents
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    
    # Create a simplified narrative context
    context = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
               f"A significant portion, spanning {retrieved_docs.iloc[0]['acreage']:.2f} acres, is classified as {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone, "
               f"indicating potential flood risks that could impact development projects.")
    
    # Debug: Display the narrative context
    st.write("Narrative context for summarization:")
    st.write(context)
    
    # Generate summary without a prompt
    try:
        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
        st.write("Raw summarizer output:", summary_output)
        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
            summary = summary_output[0]['summary_text']
        else:
            raise ValueError("Unexpected output format from summarizer.")
    except Exception as e:
        st.write(f"Error in summarization: {e}")
        # Fallback: Generate a basic summary manually
        summary = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
                   f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
                   f"A significant portion, including {retrieved_docs.iloc[0]['acreage']:.2f} acres, falls under the {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone. "
                   f"This indicates a high risk of flooding, which could impact power plant projects. "
                   f"Overall, the limited usable land poses challenges for development in this region.")
    
    return summary
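
# Example usage outside the Streamlit flow (a sketch; "New York" is an
# illustrative query, not a value taken from the dataset):
# print(rag_summarize("New York", gdf, index, k=3))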

# Streamlit interface
st.title("Floodland Summary Bot")

# Input field for the user to enter a location
user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")

# Submit button
if st.button("Submit"):
    if user_input:
        st.write("Hi, How can I help you today?")
        st.write(f"User input: {user_input}")
        
        # Generate summary
        summary = rag_summarize(user_input, gdf, index, k=5)
        st.write(summary)
    else:
        st.write("Please enter a location to proceed.")