import streamlit as st
import geopandas as gpd
import sqlite3
import pandas as pd
import os
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Set the SHAPE_RESTORE_SHX configuration option to YES
os.environ['SHAPE_RESTORE_SHX'] = 'YES'

# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")

# Connect to SQLite database
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()

# Create table for floodland data
cursor.execute('''
    CREATE TABLE IF NOT EXISTS floodlands (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        DFIRM_ID TEXT,
        VERSION_ID TEXT,
        FLD_AR_ID TEXT,
        STUDY_TYP TEXT,
        FLD_ZONE TEXT,
        ZONE_SUBTY TEXT,
        SFHA_TF TEXT,
        STATIC_BFE FLOAT,
        V_DATUM TEXT,
        DEPTH FLOAT,
        LEN_UNIT TEXT,
        VELOCITY FLOAT,
        VEL_UNIT TEXT,
        AR_REVERT TEXT,
        AR_SUBTRV TEXT,
        BFE_REVERT FLOAT,
        DEP_REVERT FLOAT,
        DUAL_ZONE TEXT,
        SOURCE_CIT TEXT,
        geometry TEXT,
        acreage FLOAT,
        usable_area FLOAT
    )
''')
conn.commit()

# Load and process the shapefile
shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
gdf = gpd.read_file(shapefile_path)

# Check the initial CRS
st.write("Initial CRS:", gdf.crs)

# If the CRS is missing, assume WGS84 (EPSG:4326). Note that FEMA NFHL shapefiles
# are often distributed in NAD83 (EPSG:4269), so verify against the source metadata.
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)
    st.write("CRS was missing; set to EPSG:4326 (WGS84).")

# Validate geometries (invalid ones are dropped here; shapely's make_valid could repair them instead)
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom.is_valid else None)
if gdf['geometry'].isnull().any():
    st.write(f"Found {gdf['geometry'].isnull().sum()} invalid or null geometries. Dropping these rows.")
    gdf = gdf.dropna(subset=['geometry'])

# Reproject to UTM Zone 18N (EPSG:32618) for accurate area calculations
gdf = gdf.to_crs(epsg=32618)
st.write("CRS after reprojection:", gdf.crs)

# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105

# Calculate usable area (excluding flood-prone zones)
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(
    lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0, axis=1
)

# Convert geometry to WKT for storage in SQLite
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
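
# Optional round-trip check (a minimal sketch; shapely ships as a geopandas
# dependency): the stored WKT text parses back into an equivalent geometry.
from shapely import wkt as shapely_wkt
assert shapely_wkt.loads(gdf['wkt_geometry'].iloc[0]).equals(gdf['geometry'].iloc[0])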

# Create a new DataFrame without the geometry column for SQLite storage
gdf_for_sql = gdf.drop(columns=['geometry'])

# Store in SQLite. Note: if_exists='replace' drops and recreates the table, so the
# schema created above is superseded by the DataFrame's columns (with 'wkt_geometry'
# in place of 'geometry').
gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
st.write(f"Inserted {len(gdf_for_sql)} rows into the database.")

# Close the database connection
conn.close()
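
# Reading the table back later (a sketch, not executed here): the stored WKT
# column can be rebuilt into a GeoDataFrame without re-reading the shapefile.
# reloaded = pd.read_sql('SELECT * FROM floodlands', sqlite3.connect('NY.db'))
# reloaded_gdf = gpd.GeoDataFrame(
#     reloaded,
#     geometry=reloaded['wkt_geometry'].apply(shapely_wkt.loads),
#     crs='EPSG:32618',
# )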

# Load sentence transformer for embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Load summarization model (using google/pegasus-xsum, which is publicly accessible)
summarizer = pipeline("summarization", model="google/pegasus-xsum", device=0 if torch.cuda.is_available() else -1)

# Create a text representation of each floodland record for embedding
gdf['text'] = gdf.apply(
    lambda row: f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
                f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres",
    axis=1
)
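
# For illustration (hypothetical values), one entry of gdf['text'] might read:
# "Floodland area with FLD_ZONE: AE, ZONE_SUBTY: FLOODWAY, acreage: 12.34 acres, usable area: 0.00 acres"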

# Embed the text representations
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)

# Create a FAISS index for retrieval
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)
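
# Quick sanity check (a minimal sketch): querying the index with its own first
# vector should return a nearest-neighbour distance of ~0.
_dist, _idx = index.search(embeddings[:1], 1)
assert _dist[0][0] < 1e-4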

# Store the embeddings
gdf['embedding'] = list(embeddings)

# RAG-based summarization function (without prompt)
def rag_summarize(query, gdf, index, k=5):
    # Embed the query
    query_embedding = embedder.encode([query])[0]
    
    # Retrieve top-k relevant documents
    distances, indices = index.search(np.array([query_embedding]), k)
    retrieved_docs = gdf.iloc[indices[0]]
    
    # Aggregate acreage and usable area from retrieved documents
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    
    # Create a simplified narrative context
    context = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
               f"A significant portion, spanning {retrieved_docs.iloc[0]['acreage']:.2f} acres, is classified as {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone, "
               f"indicating potential flood risks that could impact development projects.")
    
    # Debug: Display the narrative context
    st.write("Narrative context for summarization:")
    st.write(context)
    
    # Generate summary without a prompt
    try:
        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
        st.write("Raw summarizer output:", summary_output)
        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
            summary = summary_output[0]['summary_text']
        else:
            raise ValueError("Unexpected output format from summarizer.")
    except Exception as e:
        st.write(f"Error in summarization: {e}")
        # Fallback: Generate a basic summary manually
        summary = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
                   f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
                   f"A significant portion, including {retrieved_docs.iloc[0]['acreage']:.2f} acres, falls under the {retrieved_docs.iloc[0]['FLD_ZONE']} flood zone. "
                   f"This indicates a high risk of flooding, which could impact power plant projects. "
                   f"Overall, the limited usable land poses challenges for development in this region.")
    
    return summary
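
# Example usage outside the Streamlit flow (a sketch; "New York" is an
# illustrative query, not a value taken from the dataset):
# print(rag_summarize("New York", gdf, index, k=3))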

# Streamlit interface
st.title("Floodland Summary Bot")

# Input field for the user to enter a location
user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")

# Submit button
if st.button("Submit"):
    if user_input:
        st.write("Hi, How can I help you today?")
        st.write(f"User input: {user_input}")
        
        # Generate summary
        summary = rag_summarize(user_input, gdf, index, k=5)
        st.write(summary)
    else:
        st.write("Please enter a location to proceed.")