michaellupo74 commited on
Commit
b53e303
Β·
1 Parent(s): cd54f1c

feat(ingest): JS card/grid + scroll container + skip_filters

Browse files

feat(ui): raise top_k & results paging
chore: add ingestors/ranking/utils modules
fix: YAML selectors & SPA wait tuning

.gitignore CHANGED
@@ -30,3 +30,27 @@ runtime/
30
  data/exports/
31
  *.env*
32
  .env
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  data/exports/
31
  *.env*
32
  .env
33
+ # Python
34
+ __pycache__/
35
+ *.pyc
36
+ .venv/
37
+ venv/
38
+ *.egg-info/
39
+
40
+ # App data/output
41
+ data/
42
+ snapshots/
43
+ app/static/
44
+ *.log
45
+
46
+ # Local/playwright artifacts
47
+ playwright/.cache/
48
+ *.png
49
+
50
+ # Editor
51
+ .vscode/
52
+ .DS_Store
53
+
54
+ # Backups
55
+ *.bak
56
+ *.bak.py
app/ingest.py CHANGED
@@ -17,6 +17,9 @@ import hashlib
17
  import requests
18
  from bs4 import BeautifulSoup
19
  from datetime import datetime, timezone
 
 
 
20
 
21
  # -------------------- Config --------------------
22
 
@@ -183,7 +186,7 @@ def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
183
  hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
184
  return [h for h in hits if isinstance(h, dict)]
185
 
186
- # -------------------- NEW: Generic HTML / PDF collectors --------------------
187
 
188
  _HTTP_HEADERS = {
189
  "User-Agent": "grants-rag/1.0 (+https://example.local) requests",
@@ -280,6 +283,8 @@ def _normalize_web_record(
280
  rec["is_active"] = _compute_is_active(rec["deadline"])
281
  return rec
282
 
 
 
283
  def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
284
  """
285
  Supports types: 'web_page' and 'http_html'
@@ -350,6 +355,219 @@ def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any
350
 
351
  return rows
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
354
  """
355
  type: 'http_pdf'
@@ -384,9 +602,119 @@ def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]
384
  rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
385
  return rows
386
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  # -------------------- Write docstore & build index --------------------
388
 
389
- def _save_docstore(recs: List[Dict[str, Any]]) -> str:
390
  DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
391
  path = DOCSTORE_DIR / "docstore.jsonl"
392
  with path.open("w", encoding="utf-8") as f:
@@ -425,6 +753,7 @@ def _build_index_from_docstore() -> int:
425
  "is_active": rec.get("is_active"),
426
  "program_number": rec.get("program_number"),
427
  "posted_date": rec.get("posted_date"),
 
428
  })
429
 
430
  print(f"[index] Rows loaded from docstore: {len(texts)}")
@@ -467,7 +796,7 @@ __all__ = ["ingest"]
467
  def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
468
  """
469
  Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
470
- applies filters (capacity / PA-MD), dedupes, writes docstore, and builds the FAISS index.
471
  Returns (docstore_path, n_indexed).
472
  """
473
  cfg = load_config(cfg_path)
@@ -492,6 +821,7 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
492
  typ = entry.get("type")
493
  rows: List[Dict[str, Any]] = []
494
 
 
495
  if typ == "grantsgov_api":
496
  raw_hits = _collect_from_grantsgov_api(entry)
497
  rows = [normalize("grants_gov", h, static) for h in raw_hits]
@@ -499,6 +829,9 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
499
  elif typ in ("web_page", "http_html"):
500
  rows = _collect_from_http_html(entry, name, static)
501
 
 
 
 
502
  elif typ == "http_pdf":
503
  rows = _collect_from_http_pdf(entry, name, static)
504
 
@@ -508,31 +841,37 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
508
  items = blob.get("opportunities") or []
509
  rows = [normalize("local_sample", op, static) for op in items]
510
 
511
- # ---- Apply capacity / geo filters BEFORE collecting ----
512
- if rows and (capacity_only or pa_md_only):
513
- filtered = []
514
- for r in rows:
515
- t = _doc_text_from_row(r)
516
- if capacity_only and not _is_capacity_building_text(t):
517
- continue
518
- if pa_md_only and not _is_pa_md_text(t):
519
- continue
520
- filtered.append(r)
521
- print(f"[filter] {name}: kept {len(filtered)}/{len(rows)} after filters")
522
- rows = filtered
523
 
524
- print(f"[collect] {name} β†’ {len(rows)} rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  all_rows.extend(rows)
526
 
527
- # ---- DEDUPE (by id β†’ url β†’ title) ----
528
- seen, unique = set(), []
529
- for r in all_rows:
530
- key = r.get("id") or r.get("url") or r.get("title")
531
- if not key or key in seen:
532
- continue
533
- seen.add(key)
534
- unique.append(r)
535
-
536
  print(f"[ingest] Unique records to index: {len(unique)}")
537
 
538
  path = _save_docstore(unique)
 
17
  import requests
18
  from bs4 import BeautifulSoup
19
  from datetime import datetime, timezone
20
+ from difflib import SequenceMatcher
21
+ from urllib.parse import urljoin
22
+
23
 
24
  # -------------------- Config --------------------
25
 
 
186
  hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
187
  return [h for h in hits if isinstance(h, dict)]
188
 
189
+ # -------------------- HTTP helpers --------------------
190
 
191
  _HTTP_HEADERS = {
192
  "User-Agent": "grants-rag/1.0 (+https://example.local) requests",
 
283
  rec["is_active"] = _compute_is_active(rec["deadline"])
284
  return rec
285
 
286
+ # -------------------- Collectors: http_html / web_page --------------------
287
+
288
  def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
289
  """
290
  Supports types: 'web_page' and 'http_html'
 
355
 
356
  return rows
357
 
358
+ # -------------------- Collector: http_html_js (Playwright) --------------------
359
+
360
def _collect_from_http_html_js(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect opportunities from JS-rendered pages using Playwright, with
    per-card extraction and robust scrolling.

    entry.options:
      - wait_for (css or ms int)
      - scroll (bool)
      - scroll_selector (css)   # scroll a container div, not the window
      - scroll_times (int)      # default 20
      - scroll_wait_ms (int)    # default 400
      - min_cards (int)         # wait until at least N cards exist
      - click_selector (css)
      - max_pages (int)
      - timeout_ms (int)
      - network_idle (bool)
      - debug (bool)
    entry.selectors: card, title, link, description, meta

    Returns normalized records ([] when Playwright is unavailable, the URL
    is missing, or nothing could be scraped after one retry).
    """
    try:
        from playwright.sync_api import sync_playwright  # type: ignore
    except Exception:
        print(f"[collect][skip] {source_name}: Playwright not installed.")
        return []

    url = entry.get("url")
    if not url:
        return []

    options = entry.get("options", {}) or {}
    parse = entry.get("parse", {}) or entry.get("extract", {}) or {}
    selectors = entry.get("selectors", {}) or {}
    content_selectors = parse.get("content_selectors") or []

    timeout_ms = int(options.get("timeout_ms", 6000))
    network_idle = bool(options.get("network_idle", True))
    debug = bool(options.get("debug", False))
    max_pages = int(options.get("max_pages", 1))
    click_sel = options.get("click_selector") or entry.get("next_selector")
    wait_for = options.get("wait_for")

    rows: List[Dict[str, Any]] = []

    def _text_first(soup: BeautifulSoup, css_list: str) -> str:
        # First non-empty text among comma-separated CSS alternatives.
        if not css_list:
            return ""
        for css in [c.strip() for c in css_list.split(",")]:
            el = soup.select_one(css)
            if el:
                txt = el.get_text(separator=" ", strip=True)
                if txt:
                    return txt
        return ""

    def _attr_first(soup: BeautifulSoup, css_list: str, attr: str) -> Optional[str]:
        # First non-empty attribute among comma-separated CSS alternatives.
        if not css_list:
            return None
        for css in [c.strip() for c in css_list.split(",")]:
            el = soup.select_one(css)
            if el:
                val = el.get(attr)
                if val:
                    return val
        return None

    def _parse_cards(page_html: str, base_url: str) -> List[Dict[str, Any]]:
        # With a 'card' selector: one record per card; without: whole page.
        s = _soup(page_html)
        card_css = selectors.get("card", "")
        if not card_css:
            title, body = _text_from_soup(s, content_selectors)
            return [_normalize_web_record(source_name, base_url, title, body, static, extra={"posted_date": None})]

        out: List[Dict[str, Any]] = []
        for card in s.select(card_css) or []:
            csoup = BeautifulSoup(str(card), "lxml")
            title = _text_first(csoup, selectors.get("title", "h1, h2, h3"))
            href = _attr_first(csoup, selectors.get("link", "a"), "href")
            link = urljoin(base_url, href) if href else base_url
            desc = _text_first(csoup, selectors.get("description", "p, .summary, .excerpt, .card-text"))
            meta = _text_first(csoup, selectors.get("meta", ".meta, .tags, .badge, .location"))
            body = "\n".join([p for p in (desc, meta) if p]).strip()
            if not (title or body):
                continue
            out.append(_normalize_web_record(source_name, link, title or link, body, static, extra={"posted_date": None}))
        return out

    def _wait_page_ready(page, *, wait_for, timeout_ms, options, selectors):
        # wait_for can be CSS or milliseconds
        if isinstance(wait_for, int):
            page.wait_for_timeout(wait_for)
        elif isinstance(wait_for, str) and wait_for:
            page.wait_for_selector(wait_for, timeout=min(timeout_ms, 15000))

        # Scroll window or a container div
        if options.get("scroll"):
            scroll_sel = options.get("scroll_selector")
            scroll_times = int(options.get("scroll_times", 20))
            scroll_wait = int(options.get("scroll_wait_ms", 400))
            if scroll_sel:
                # BUG FIX: Playwright's evaluate() accepts a single `arg`, so
                # values are packed in a list and destructured in the JS.
                # (The old code passed three positional args -> TypeError.)
                page.evaluate(
                    """([sel, times, wait]) => new Promise(res => {
                        const el = document.querySelector(sel);
                        if (!el) { res(); return; }
                        let i = 0;
                        const t = setInterval(() => {
                            const y = el.scrollTop;
                            el.scrollTop = el.scrollHeight;
                            i++;
                            if (el.scrollTop === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    [scroll_sel, scroll_times, scroll_wait]
                )
            else:
                page.evaluate(
                    """([times, wait]) => new Promise(res => {
                        let i = 0;
                        const t = setInterval(() => {
                            const y = window.scrollY;
                            window.scrollBy(0, document.body.scrollHeight);
                            i++;
                            if (window.scrollY === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    [scroll_times, scroll_wait]
                )

        # Optionally wait for a minimum number of cards (virtualized lists)
        min_cards = int(options.get("min_cards", 0))
        card_css = (selectors or {}).get("card", "")
        if min_cards and card_css:
            try:
                page.wait_for_function(
                    """([sel, target]) => {
                        const els = document.querySelectorAll(sel);
                        return els && els.length >= target;
                    }""",
                    arg=[card_css, min_cards],
                    timeout=min(timeout_ms, 15000),
                )
            except Exception:
                pass  # best-effort

    def _try_once(p) -> bool:
        # Abort heavyweight asset requests; only the DOM is needed.
        def _route(route):
            r = route.request
            if r.resource_type in {"image", "media", "font"}:
                return route.abort()
            return route.continue_()

        browser = None
        page = None
        try:
            # BUG FIX: launch/context setup now inside the try, so a launch
            # failure is logged and retried instead of crashing ingest.
            browser = p.chromium.launch(headless=not debug)
            context = browser.new_context(user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
            ))
            context.route("**/*", _route)
            page = context.new_page()
            page.set_default_timeout(timeout_ms)
            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            if network_idle:
                page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))

            _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)

            htmls = [page.content()]

            # pagination
            for _ in range(max_pages - 1):
                sel = click_sel
                if not sel or page.locator(sel).count() == 0:
                    break
                page.click(sel)
                page.wait_for_load_state("domcontentloaded")
                if network_idle:
                    page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
                _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
                htmls.append(page.content())

            for html in htmls:
                found = _parse_cards(html, url)
                rows.extend(found)
                print(f"[collect][cards] {source_name}: found {len(found)} cards on this page")
            browser.close()
            return True
        except Exception as e:
            if debug and page is not None:
                try:
                    snap = DOCSTORE_DIR / f"playwright_error_{hashlib.sha1(url.encode()).hexdigest()}.png"
                    page.screenshot(path=str(snap))
                    print(f"[collect][debug] Saved screenshot: {snap}")
                except Exception:
                    pass
            print(f"[collect][warn] {source_name}: {e.__class__.__name__}: {e}")
            try:
                if browser is not None:
                    browser.close()
            except Exception:
                pass
            return False

    # One attempt plus one retry after a short pause. (The redundant second
    # `from playwright.sync_api import sync_playwright` was removed.)
    with sync_playwright() as p:
        if not _try_once(p):
            time.sleep(1.5)
            _try_once(p)

    if not rows:
        print(f"[collect][skip] {source_name}: no content after retries.")
    return rows
568
+
569
+ # -------------------- PDF collector --------------------
570
+
571
  def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
572
  """
573
  type: 'http_pdf'
 
602
  rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
603
  return rows
604
 
605
+ # -------------------- De-dup helpers (5.2) --------------------
606
+
607
+ def _norm(text: str) -> str:
608
+ t = (text or "").lower()
609
+ t = re.sub(r'[^a-z0-9 ]+', ' ', t)
610
+ return re.sub(r'\s+', ' ', t).strip()
611
+
612
def _hash_fingerprint(title: str, agency: str, deadline: str) -> str:
    """Stable SHA-1 fingerprint over normalized title/agency plus raw deadline."""
    parts = (_norm(title), _norm(agency), deadline or '')
    return hashlib.sha1('|'.join(parts).encode()).hexdigest()
615
+
616
def _near_duplicate(a: Dict[str, Any], b: Dict[str, Any]) -> bool:
    """True when two records look like the same opportunity (fuzzy match)."""
    # Deadlines must be equal, or both records must lack one.
    dl_a, dl_b = a.get("deadline"), b.get("deadline")
    if not (dl_a == dl_b or (not dl_a and not dl_b)):
        return False
    title_sim = SequenceMatcher(None, _norm(a.get("title", "")), _norm(b.get("title", ""))).ratio()
    agency_sim = SequenceMatcher(None, _norm(a.get("agency", "")), _norm(b.get("agency", ""))).ratio()
    return title_sim > 0.88 and agency_sim > 0.75
622
+
623
def _merge_records(primary: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
    """Merge *other* into a copy of *primary*, preserving best data and provenance.

    Non-empty fields from *primary* win; categories are unioned; the earlier
    deadline and posted_date are kept; sources and urls are accumulated; the
    record id is recomputed from the merged title/agency/deadline fingerprint.
    """
    merged = dict(primary)

    def _prefer(a, b):
        # First non-empty value wins (None/""/[]/{} count as empty).
        return a if a not in (None, "", [], {}) else b

    for field in ("url", "title", "synopsis", "summary", "agency",
                  "eligibility", "program_number", "geo"):
        merged[field] = _prefer(primary.get(field), other.get(field))

    # categories -> sorted union, tolerating scalar (non-list) values
    combined_cats = []
    for cats in (primary.get("categories") or [], other.get("categories") or []):
        combined_cats.extend(cats if isinstance(cats, list) else [cats])
    merged["categories"] = sorted({c for c in combined_cats if c})

    # deadline: surface the earlier known date (safer to show the sooner one)
    dl_a, dl_b = primary.get("deadline"), other.get("deadline")
    merged["deadline"] = min(dl_a, dl_b) if (dl_a and dl_b) else (dl_a or dl_b)
    # carry a deadline_text if any
    merged["deadline_text"] = _prefer(primary.get("deadline_text"), other.get("deadline_text"))
    merged["is_active"] = _compute_is_active(merged.get("deadline"))

    # posted_date: earliest wins when both are present
    pd_a, pd_b = primary.get("posted_date"), other.get("posted_date")
    merged["posted_date"] = min(pd_a, pd_b) if (pd_a and pd_b) else (pd_a or pd_b)

    # provenance: union of sources (scalar or list) ...
    sources = set()
    for src in (primary.get("source"), other.get("source")):
        if src:
            sources.update(src if isinstance(src, list) else [src])
    merged["source"] = sorted(sources) if sources else None

    # ... and of every url ever seen for this record
    urls = {u for u in (primary.get("url"), other.get("url")) if u}
    urls.update(primary.get("all_urls") or [])
    urls.update(other.get("all_urls") or [])
    merged["all_urls"] = sorted(urls)

    # identity follows the merged fingerprint (title/agency/deadline)
    merged["id"] = _hash_fingerprint(merged.get("title", ""),
                                     merged.get("agency", ""),
                                     merged.get("deadline", ""))
    return merged
679
+
680
def _dedupe_and_merge(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Exact fingerprint + fuzzy near-dup consolidation across sources."""
    uniques: List[Dict[str, Any]] = []
    fp_index: Dict[str, int] = {}

    for row in rows:
        fingerprint = _hash_fingerprint(row.get("title", ""),
                                        row.get("agency", ""),
                                        row.get("deadline", ""))
        hit = fp_index.get(fingerprint)
        if hit is not None:
            # Exact duplicate: fold into the record already kept.
            uniques[hit] = _merge_records(uniques[hit], row)
            continue

        # Fuzzy scan against everything kept so far.
        match_idx = next(
            (i for i, kept in enumerate(uniques) if _near_duplicate(row, kept)),
            None,
        )
        if match_idx is None:
            fp_index[fingerprint] = len(uniques)
            # initialize provenance
            row.setdefault("all_urls", [row.get("url")] if row.get("url") else [])
            uniques.append(row)
        else:
            uniques[match_idx] = _merge_records(uniques[match_idx], row)
            # Re-index the merged record's fingerprint so later exact
            # matches land on it.
            merged = uniques[match_idx]
            fp_index[_hash_fingerprint(merged.get("title", ""),
                                       merged.get("agency", ""),
                                       merged.get("deadline", ""))] = match_idx

    return uniques
714
+
715
  # -------------------- Write docstore & build index --------------------
716
 
717
+ def _save_docstore(recs: List[Dict[str, Any]]) -> str:
718
  DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
719
  path = DOCSTORE_DIR / "docstore.jsonl"
720
  with path.open("w", encoding="utf-8") as f:
 
753
  "is_active": rec.get("is_active"),
754
  "program_number": rec.get("program_number"),
755
  "posted_date": rec.get("posted_date"),
756
+ "all_urls": rec.get("all_urls"),
757
  })
758
 
759
  print(f"[index] Rows loaded from docstore: {len(texts)}")
 
796
  def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
797
  """
798
  Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
799
+ applies filters (capacity / PA-MD), de-dupes, writes docstore, and builds the FAISS index.
800
  Returns (docstore_path, n_indexed).
801
  """
802
  cfg = load_config(cfg_path)
 
821
  typ = entry.get("type")
822
  rows: List[Dict[str, Any]] = []
823
 
824
+ # -------- Collect from each adapter --------
825
  if typ == "grantsgov_api":
826
  raw_hits = _collect_from_grantsgov_api(entry)
827
  rows = [normalize("grants_gov", h, static) for h in raw_hits]
 
829
  elif typ in ("web_page", "http_html"):
830
  rows = _collect_from_http_html(entry, name, static)
831
 
832
+ elif typ == "http_html_js":
833
+ rows = _collect_from_http_html_js(entry, name, static)
834
+
835
  elif typ == "http_pdf":
836
  rows = _collect_from_http_pdf(entry, name, static)
837
 
 
841
  items = blob.get("opportunities") or []
842
  rows = [normalize("local_sample", op, static) for op in items]
843
 
844
+ else:
845
+ print(f"[collect] {name}: unknown type '{typ}', skipping.")
846
+ continue
 
 
 
 
 
 
 
 
 
847
 
848
+ print(f"[collect] {name}: fetched_rows={len(rows)}")
849
+
850
+ # ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
851
+ if rows:
852
+ if entry.get("skip_filters"):
853
+ print(f"[filter] {name}: skip_filters=true β†’ keeping all {len(rows)}")
854
+ else:
855
+ pre = len(rows)
856
+ filtered = []
857
+ for r in rows:
858
+ t = _doc_text_from_row(r)
859
+ if capacity_only and not _is_capacity_building_text(t):
860
+ continue
861
+ if pa_md_only and not _is_pa_md_text(t):
862
+ continue
863
+ filtered.append(r)
864
+ print(
865
+ f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
866
+ f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
867
+ )
868
+ rows = filtered
869
+
870
+ print(f"[collect] {name} β†’ rows_after_filters={len(rows)}")
871
  all_rows.extend(rows)
872
 
873
+ # ---- Cross-source DEDUPE + MERGE ----
874
+ unique = _dedupe_and_merge(all_rows)
 
 
 
 
 
 
 
875
  print(f"[ingest] Unique records to index: {len(unique)}")
876
 
877
  path = _save_docstore(unique)
app/ingestors/http_html_js.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/ingestors/http_html_js.py
2
+ import asyncio, time
3
+ from typing import List, Dict, Any, Optional
4
+ from urllib.parse import urljoin
5
+ from playwright.async_api import async_playwright
6
+
7
+ DEFAULT_WAIT_MS = 3000
8
+
9
async def _scrape_page(page, url: str, wait_ms: int, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
    """Load *url* in *page* and extract one dict per matching card.

    selectors keys (all optional CSS): card, title, link, description, meta.
    Returns [] when no card selector is configured or nothing matches.
    """
    await page.goto(url, wait_until="domcontentloaded", timeout=45000)
    if wait_ms:
        # settle time for client-side rendering
        await page.wait_for_timeout(wait_ms)

    async def _inner_text(el, sel: str) -> str:
        # BUG FIX: query each selector once instead of twice; the old
        # check-then-requery pattern could AttributeError if the node
        # disappeared between the two queries.
        if not sel:
            return ""
        node = await el.query_selector(sel)
        return (await node.inner_text()).strip() if node else ""

    cards: List[Dict[str, Any]] = []
    card_sel = selectors.get("card", "")
    link_sel = selectors.get("link", "")

    elements = await page.query_selector_all(card_sel) if card_sel else []
    for el in elements:
        title = await _inner_text(el, selectors.get("title", ""))
        link_el = await el.query_selector(link_sel) if link_sel else None
        href = await link_el.get_attribute("href") if link_el else None
        link = urljoin(url, href) if href else url
        desc = await _inner_text(el, selectors.get("description", ""))
        meta = await _inner_text(el, selectors.get("meta", ""))  # e.g., location/focus

        if title or desc:
            cards.append({
                "title": title,
                "url": link,
                "summary": desc,
                "meta": meta
            })
    return cards
38
+
39
async def scrape_js_site(
    start_url: str,
    max_pages: int = 1,
    wait_ms: int = DEFAULT_WAIT_MS,
    selectors: Optional[Dict[str, str]] = None,
    next_selector: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Scrape a JS-rendered listing, optionally paging via *next_selector*.

    Args:
        start_url: first page to load.
        max_pages: upper bound on pages visited.
        wait_ms: settle time after each load for client-side rendering.
        selectors: CSS selectors passed through to _scrape_page.
        next_selector: CSS for the "next page" control; None -> single page.

    Returns a flat list of card dicts from all visited pages.
    """
    selectors = selectors or {}
    results: List[Dict[str, Any]] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        current_url = start_url
        for _ in range(max_pages):
            page_results = await _scrape_page(page, current_url, wait_ms, selectors)
            results.extend(page_results)
            if not next_selector:
                break
            next_btn = await page.query_selector(next_selector)
            if not next_btn:
                break
            await next_btn.click()
            await page.wait_for_timeout(800)  # polite delay
            # BUG FIX: advance to the pager's new URL. Previously current_url
            # was never updated, so every iteration re-loaded start_url and
            # re-scraped page 1 max_pages times.
            current_url = page.url
        await browser.close()
    return results
64
+
65
def ingest_http_html_js(cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    cfg example:
      {
        "url": "...",
        "options": {"wait_for": 3000, "max_pages": 3},
        "selectors": {
          "card": ".result-card",
          "title": ".card-title",
          "link": "a.card-link",
          "description": ".card-body",
          "meta": ".card-meta"
        },
        "next_selector": "a[rel=next]"
      }
    """
    opts = cfg.get("options", {})

    # Run event loop
    results = asyncio.run(scrape_js_site(
        cfg["url"],
        int(opts.get("max_pages", 1)),
        int(opts.get("wait_for", DEFAULT_WAIT_MS)),
        cfg.get("selectors", {}),
        cfg.get("next_selector"),
    ))

    # Normalize to your index schema
    return [
        {
            "title": r["title"] or "Untitled foundation",
            "url": r["url"],
            "body": f"{r.get('summary','')}\n{r.get('meta','')}",
            "source_type": "foundation_private",
            "geo": "US-MidAtlantic",
            "tags": ["faith-based", "foundation"],
        }
        for r in results
    ]
app/ranking/rerank.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/ranking/rerank.py
from sentence_transformers import SentenceTransformer, util

INTENT_TEXT = """
nonprofit 501(c)(3) capacity building, community outreach, reentry, youth,
food security, housing stabilization, violence prevention, mental health,
addiction recovery, faith-based programs, workforce, mentorship
"""

# BUG FIX: the model was loaded (and the intent text encoded) at import
# time, making `import app.ranking.rerank` slow and network-dependent.
# Both are now initialized lazily on first use.
_model = None
_intent_vec = None


def _ensure_model() -> None:
    """Load the embedding model and intent vector on first call only."""
    global _model, _intent_vec
    if _model is None:
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        _intent_vec = _model.encode(INTENT_TEXT, normalize_embeddings=True)


def embed_score(title: str, body: str) -> float:
    """Cosine similarity between (title + body) and the ministry intent text."""
    _ensure_model()
    text = f"{title}\n{body or ''}"
    v = _model.encode(text, normalize_embeddings=True)
    return float(util.cos_sim(_intent_vec, v).item())
app/ranking/rules.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/ranking/rules.py
# Keyword heuristics: +2 per ministry-relevant hint, -2 per research/defense hint.
INCLUDE_HINTS = [
    "community", "capacity building", "re-entry", "workforce", "housing",
    "human services", "addiction", "youth", "violence prevention",
    "nonprofit", "faith", "church", "outreach", "mentorship"
]
EXCLUDE_HINTS = [
    "r01", "r21", "sbir", "sttr", "lab solicitation", "postdoctoral",
    "basic research", "scoping study", "hypothesis", "principal investigator"
]


def rule_score(text: str) -> int:
    """Signed keyword score of *text*; higher means more ministry-relevant."""
    haystack = (text or "").lower()
    gains = 2 * sum(1 for hint in INCLUDE_HINTS if hint in haystack)
    losses = 2 * sum(1 for hint in EXCLUDE_HINTS if hint in haystack)
    return gains - losses
app/ranking/score.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# app/ranking/score.py
# BUG FIX: rule_score and embed_score were referenced but never imported,
# so calling confidence() raised NameError.
from app.ranking.rerank import embed_score
from app.ranking.rules import rule_score


def confidence(item) -> float:
    """Blend rule and embedding scores (plus small boosts) into [0, 1].

    Boosts: +0.05 when a deadline is present, +0.10 when eligibility
    includes "NONPROFIT_501C3".
    """
    rs = rule_score(item.get("body", "") + " " + item.get("title", ""))
    es = embed_score(item.get("title", ""), item.get("body", ""))
    dl_boost = 0.0
    if item.get("deadline"):
        dl_boost += 0.05
    if "NONPROFIT_501C3" in (item.get("eligibility") or []):
        dl_boost += 0.10
    # rule score is soft-normalized by 8 before blending with the embedding
    return max(0.0, min(1.0, 0.4 * es + 0.4 * (rs / 8.0) + dl_boost))
app/ui_streamlit.py CHANGED
@@ -8,99 +8,149 @@ if str(ROOT) not in sys.path:
8
 
9
  import os, json
10
  from pathlib import Path
 
 
 
11
 
12
  from app.main import get_env, ensure_index_exists
13
  from app.search import search
14
 
15
- import streamlit as st
 
16
 
 
17
  st.markdown("""
18
  <style>
19
- /* --- Global safety net: make default text dark --- */
20
- html, body, [class^="css"], [class*=" css"] {
21
- color: #0f172a !important; /* slate-900 */
 
 
 
 
 
22
  }
23
- /* --- Streamlit selectbox/multiselect (BaseWeb rendering) --- */
24
- div[data-baseweb="select"] * { color: #0f172a !important; }
25
- div[data-baseweb="select"] { background: #ffffff !important; border-color: #cbd5e1 !important; }
26
- /* placeholder inside the closed select */
27
- div[data-baseweb="select"] div[aria-hidden="true"] { color: #64748b !important; }
28
- /* open dropdown menu (BaseWeb popover) */
29
- div[data-baseweb="popover"] [role="listbox"], div[data-baseweb="menu"] { background: #ffffff !important; }
30
- div[data-baseweb="popover"] [role="option"], div[data-baseweb="menu"] li { color: #0f172a !important; background: #ffffff !important; }
31
- /* --- Alternative rendering (ARIA hooks) in newer Streamlit builds --- */
32
- div[role="button"][aria-haspopup="listbox"] * { color: #0f172a !important; }
33
- ul[role="listbox"] li, div[role="option"] { color: #0f172a !important; background: #ffffff !important; }
34
- /* --- Streamlit component wrappers --- */
35
- .stSelectbox, .stMultiSelect { color: #0f172a !important; }
36
- .stSelectbox div, .stMultiSelect div { color: #0f172a !important; }
37
- /* --- Hard reset in case a global rule set all <span> to white --- */
38
- span, li { color: inherit !important; }
39
- </style>
40
- """, unsafe_allow_html=True)
41
 
42
- # ── Streamlit config ──────────────────────────────────────────────────────────
43
- st.set_page_config(page_title="Grants Discovery App By Lupo", page_icon="🧭", layout="wide")
 
44
 
45
- # ── Theme & CSS (BLACK + ORANGE, dark selects) ────────────────────────────────
46
- st.markdown("""
47
- <style>
48
- /* App base */
49
- .stApp { background-color: #000000; color: #f8fafc; }
50
- /* Text defaults */
51
- html, body, [class*="css"], h1, h2, h3, h4, h5, h6, p, span, div { color: #f8fafc !important; }
52
- /* Accents */
53
- a, .stRadio > label, .stSlider label { color: #f97316 !important; }
54
  /* Buttons */
55
- .stButton>button { background:#f97316; color:#fff; border:none; border-radius:8px; padding:0.5rem 0.9rem; font-weight:600; }
 
 
 
56
  .stButton>button:hover { filter:brightness(1.1); }
57
- /* Text input */
58
- .stTextInput input { background:#111827 !important; color:#f8fafc !important; border:1px solid #334155 !important; }
59
- /* Closed control (select/multiselect) */
60
- .stSelectbox div[data-baseweb="select"], .stMultiSelect div[data-baseweb="select"],
61
- .stSelectbox div[role="combobox"], .stMultiSelect div[role="combobox"] {
62
- background-color:#1e293b !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:8px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
- /* Text & icons inside control */
65
- .stSelectbox div[data-baseweb="select"] div, .stMultiSelect div[data-baseweb="select"] div,
66
- .stSelectbox div[data-baseweb="select"] input, .stMultiSelect div[data-baseweb="select"] input,
67
- .stSelectbox svg, .stMultiSelect svg { color:#f8fafc !important; fill:#f8fafc !important; }
68
- /* Placeholder */
69
- .stSelectbox div[data-baseweb="select"] input::placeholder, .stMultiSelect div[data-baseweb="select"] input::placeholder { color:#94a3b8 !important; }
70
- /* Selected chips (multiselect) */
71
- .stMultiSelect [data-baseweb="tag"] { background-color:#334155 !important; color:#e2e8f0 !important; border-radius:999px !important; }
72
- /* Open dropdown menu */
73
- div[data-baseweb="menu"] { background-color:#0b1220 !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:10px !important; }
74
- div[data-baseweb="menu"] [role="option"] { background:transparent !important; color:#f8fafc !important; }
75
- div[data-baseweb="menu"] [role="option"]:hover { background:#1f2937 !important; }
76
- div[data-baseweb="menu"] [role="option"][aria-selected="true"] { background:#334155 !important; color:#f8fafc !important; }
77
- /* Result cards */
78
- .result-card { border:1px solid #1e293b; background:#1e293b; border-radius:14px; padding:16px; margin:10px 0; box-shadow:0 1px 2px rgba(0,0,0,0.2); }
79
- .result-meta { font-size:13px; color:#94a3b8; margin-top:6px; }
80
- span.chip { display:inline-block; padding:3px 8px; border-radius:999px; background:#334155; margin-right:6px; font-size:12px; color:#e2e8f0; }
81
- /* Compact hero (single, 240px) */
82
- .hero { height: 240px; border-radius: 16px; margin: 6px 0 16px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
84
  url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
85
  .hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
86
- .hero-text h1 { margin:0; font-size:28px; font-weight:700; color:#f97316; }
87
  .hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
88
- /* ===== FORCE DARK SELECT / MULTISELECT ===== */
89
- [data-testid="stSelectbox"] div[role="combobox"], [data-testid="stMultiSelect"] div[role="combobox"],
90
- div[role="combobox"][aria-haspopup="listbox"] { background-color:#1e293b !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:8px !important; }
91
- [data-testid="stSelectbox"] div[role="combobox"] input, [data-testid="stMultiSelect"] div[role="combobox"] input,
92
- div[role="combobox"] input { color:#f8fafc !important; }
93
- div[role="combobox"] input::placeholder { color:#94a3b8 !important; }
94
- div[role="combobox"] svg { color:#f8fafc !important; fill:#f8fafc !important; }
95
- [data-testid="stMultiSelect"] [data-baseweb="tag"], [data-testid="stMultiSelect"] [aria-label="remove"] { background-color:#334155 !important; color:#e2e8f0 !important; border-radius:999px !important; }
96
- div[role="listbox"], ul[role="listbox"], div[data-baseweb="menu"] { background-color:#0b1220 !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:10px !important; }
97
- [role="listbox"] [role="option"], div[data-baseweb="menu"] [role="option"] { background:transparent !important; color:#f8fafc !important; }
98
- [role="listbox"] [role="option"]:hover, div[data-baseweb="menu"] [role="option"]:hover { background:#1f2937 !important; }
99
- [role="listbox"] [role="option"][aria-selected="true"], div[data-baseweb="menu"] [role="option"][aria-selected="true"] { background:#334155 !important; color:#f8fafc !important; }
100
  </style>
101
  """, unsafe_allow_html=True)
102
 
103
- # ── Hero block (single) ───────────────────────────────────────────────────────
104
  st.markdown("""
105
  <div class="hero">
106
  <div class="hero-text">
@@ -110,27 +160,13 @@ st.markdown("""
110
  </div>
111
  """, unsafe_allow_html=True)
112
 
113
- # ── Hide developer diagnostics by default ─────────────────────────────────────
114
- SHOW_DEV = os.environ.get("SHOW_DEV") == "1"
115
-
116
- # ── Environment + index ───────────────────────────────────────────────────────
117
  _env = get_env()
118
  ensure_index_exists(_env)
119
 
120
- # ---------- helpers ----------
121
- def _dedup_records(rows):
122
- seen, out = set(), []
123
- for r in rows or []:
124
- k = r.get("id") or r.get("url") or r.get("title")
125
- if not k or k in seen:
126
- continue
127
- seen.add(k)
128
- out.append(r)
129
- return out
130
-
131
  def _norm_list(v):
132
- if v is None:
133
- return []
134
  if isinstance(v, str):
135
  parts = [p.strip() for p in v.replace(";", ",").split(",")]
136
  return [p.lower() for p in parts if p]
@@ -146,8 +182,7 @@ def _matches_filters(rec, geo_sel, cat_sel):
146
  return g_ok and c_ok
147
 
148
  def _ministry_filter(rows):
149
- if not rows:
150
- return rows
151
  banned_terms = [
152
  "broad agency announcement", "baa", "research", "r&d", "prototype",
153
  "laboratory", "university", "sbir", "sttr",
@@ -155,42 +190,40 @@ def _ministry_filter(rows):
155
  "w911", "n00014", "fa-", "afrl", "arpa"
156
  ]
157
  preferred_agencies = {
158
- "FTA", "HHS", "ACL", "USDA", "USDA-FNS", "USDA-RD", "DOL", "DOJ", "OJP", "OVW",
159
- "EDA", "HRSA", "SAMHSA", "CFPB", "HUD"
160
  }
161
- required_any_terms = [
162
- "vehicle", "van", "bus", "paratransit", "mobility",
163
- "congregate meals", "home-delivered meals", "senior nutrition",
164
- "food pantry", "food bank", "hunger relief", "refrigeration", "freezer",
165
- "community", "faith", "church", "ministry", "nonprofit",
166
- "reentry", "workforce", "case management", "technical assistance"
167
  ]
168
  def txt(r):
169
- return " ".join([
170
- str(r.get("title","")),
171
- str(r.get("synopsis") or r.get("summary") or ""),
172
- str(r.get("agency") or ""),
173
- ]).lower()
174
-
175
- kept = []
176
  for r in rows:
177
  t = txt(r)
178
- if any(b in t for b in banned_terms):
179
- continue
180
  agency = (r.get("agency") or "").upper()
181
- cats = [c.lower() for c in (r.get("categories") or [])]
182
- is_preferred_agency = any(agency.startswith(a) for a in preferred_agencies)
183
- has_ministry_cue = any(term in t for term in required_any_terms) or any(
184
  c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
185
  )
186
- if is_preferred_agency or has_ministry_cue:
187
  kept.append(r)
188
  return kept
189
 
 
 
 
 
 
190
  def _days_until(iso):
191
- from datetime import date, datetime
192
- if not iso:
193
- return None
194
  try:
195
  d = datetime.fromisoformat(str(iso)).date()
196
  return (d - date.today()).days
@@ -198,39 +231,13 @@ def _days_until(iso):
198
  return None
199
 
200
  def _deadline_badge(days_left):
201
- if days_left is None:
202
- return "🟦 TBD"
203
- if days_left < 0:
204
- return "⬛ Closed"
205
- if days_left <= 14:
206
- return f"πŸŸ₯ Due in {days_left}d"
207
- if days_left <= 30:
208
- return f"🟨 {days_left}d"
209
  return f"🟩 {days_left}d"
210
- # ---------- end helpers ----------
211
-
212
- # ---------- optional diagnostics ----------
213
- with st.expander("Diagnostics (optional)", expanded=False):
214
- idx = Path(_env["INDEX_DIR"])
215
- st.write("INDEX_DIR:", str(idx))
216
- st.write("faiss.index exists:", (idx / "faiss.index").exists())
217
- st.write("meta.json exists:", (idx / "meta.json").exists())
218
- if (idx / "meta.json").exists():
219
- try:
220
- meta = json.loads((idx / "meta.json").read_text())
221
- st.write("meta.json count:", len(meta))
222
- st.write("meta head:", [{"id": m.get("id"), "title": m.get("title")} for m in meta[:2]])
223
- except Exception as e:
224
- st.error(f"Failed to read meta.json: {e!r}")
225
- try:
226
- demo = search("transportation", _env, top_k=3, filters={})
227
- st.write("sample search('transportation') results:", len(demo))
228
- if demo:
229
- st.write(demo[:3])
230
- except Exception as e:
231
- st.error(f"search() raised: {e!r}")
232
- # ---------- end diagnostics ----------
233
 
 
234
  st.title("Grants Discovery RAG (Capacity Building)")
235
 
236
  preset = st.radio(
@@ -238,7 +245,6 @@ preset = st.radio(
238
  ["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
239
  horizontal=True
240
  )
241
-
242
  default_q = {
243
  "General": "capacity building",
244
  "Elderly": "capacity building for seniors and aging services",
@@ -248,7 +254,6 @@ default_q = {
248
  "FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
249
  }.get(preset, "capacity building")
250
 
251
- # --- controls ---
252
  q = st.text_input("Search query", value=default_q)
253
 
254
  geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
@@ -256,40 +261,38 @@ categories = st.multiselect(
256
  "Category filter (optional)",
257
  options=[
258
  "capacity_building","elderly","prison_ministry","evangelism",
259
- "transportation","vehicle",
260
- "justice","reentry","victim_services","youth","women","food","workforce"
261
  ],
262
  default=[]
263
  )
264
 
265
- top_k = st.slider("Results", 5, 50, 15)
 
 
266
  sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
267
  only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
268
  ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)
269
 
270
- # NEW: Sprint 2 view + agency facet
271
  view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)
272
- # pre-load agencies list (from meta.json when present)
 
273
  try:
274
- meta_for_agencies = json.loads(Path(_env["INDEX_DIR"], "meta.json").read_text())
275
  agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
276
  except Exception:
277
  agency_options = []
278
  sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])
279
 
280
- # Build backend filters (if the search() supports them)
281
  backend_filters = {}
282
  if geo: backend_filters["geo"] = geo
283
  if categories: backend_filters["categories"] = categories
284
  if sel_agencies: backend_filters["agency"] = sel_agencies
285
 
286
- # --- Sprint 2 session state for Save/Hide ---
287
- if "saved_ids" not in st.session_state:
288
- st.session_state.saved_ids = set()
289
- if "hidden_ids" not in st.session_state:
290
- st.session_state.hidden_ids = set()
291
 
292
- # action helpers
293
  def _save_item(item_id: str):
294
  st.session_state.saved_ids.add(item_id)
295
  st.session_state.hidden_ids.discard(item_id)
@@ -300,101 +303,77 @@ def _hide_item(item_id: str):
300
  st.session_state.saved_ids.discard(item_id)
301
  st.experimental_rerun()
302
 
303
- col1, col2 = st.columns([1, 1])
 
304
 
305
- with col1:
306
  if st.button("Search"):
307
  try:
308
- raw = search(q, _env, top_k=top_k, filters=backend_filters)
309
- dedup = _dedup_records(raw)
310
-
311
- # 1) Geo/Category client-side filter (fallback if backend ignores)
312
  if geo or categories:
313
- base_filtered = [r for r in dedup if _matches_filters(r, geo, categories)]
314
  else:
315
- base_filtered = dedup
316
 
317
- # 2) Only-open filter
318
- from datetime import date, datetime
319
  def _to_date_safe(val):
320
  if not val: return None
321
  try: return datetime.fromisoformat(str(val)).date()
322
  except Exception: return None
323
-
324
  open_filtered = base_filtered
325
  if only_open:
326
  open_filtered = [r for r in base_filtered
327
  if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]
328
 
329
- # 3) Agency filter (client-side, in case backend didn't apply)
330
  if sel_agencies:
331
  af = set(sel_agencies)
332
  open_filtered = [r for r in open_filtered if (r.get("agency") in af)]
333
 
334
- # 4) Ministry filter
335
  final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered
336
 
337
- # Clear/show hidden toggle mgmt
338
- if not ministry_focus and st.session_state.get("show_hidden"):
339
- st.session_state.pop("show_hidden", None)
340
-
341
- hidden_due_to_ministry = 0
342
- if ministry_focus:
343
- hidden_due_to_ministry = len(open_filtered) - len(final_results)
344
- st.session_state.pop("show_hidden", None)
345
-
346
  st.session_state["results"] = final_results
347
  st.session_state["last_query"] = q
348
  st.session_state["last_filters"] = {
349
- "geo": geo, "categories": categories,
350
- "only_open": only_open, "ministry_focus": ministry_focus,
351
- "agencies": sel_agencies,
352
  }
353
 
354
- st.success(
355
- f"Found {len(dedup)} total β€’ After geo/cat: {len(base_filtered)} β€’ "
356
- f"Open-only: {len(open_filtered)} β€’ Displaying: {len(final_results)}"
357
- + (f" β€’ Hidden by ministry filter: {hidden_due_to_ministry}" if ministry_focus else "")
358
- )
359
 
360
- if ministry_focus and hidden_due_to_ministry > 0:
361
- if st.checkbox(f"Show hidden items ({hidden_due_to_ministry})", value=False, key="show_hidden"):
362
- st.session_state["results"] = open_filtered
363
  except Exception as e:
364
  st.error(str(e))
365
 
366
-
367
- with col2:
368
  if st.button("Export Results to CSV"):
369
  results_for_export = st.session_state.get("results", [])
370
  if not results_for_export:
371
  st.warning("No results to export. Run a search first.")
372
  else:
373
- os.makedirs(_env["EXPORT_DIR"], exist_ok=True)
374
- out_path = os.path.join(_env["EXPORT_DIR"], "results.csv")
 
375
  import pandas as pd
376
  pd.DataFrame(results_for_export).to_csv(out_path, index=False)
377
  st.success(f"Exported to {out_path}")
378
 
379
  st.markdown("---")
380
 
381
- # ---- Sorting/filter helpers ----
382
- from datetime import date, datetime
383
- def _to_date(d):
384
- if not d: return None
385
- try: return datetime.fromisoformat(str(d)).date()
386
- except Exception: return None
387
-
388
- # ---- Render results ----
389
  results = st.session_state.get("results", [])
 
390
 
391
- # Apply "View" (All/Saved/Hidden)
392
  if view == "Saved":
393
  results = [r for r in results if r.get("id") in st.session_state.saved_ids]
394
  elif view == "Hidden":
395
  results = [r for r in results if r.get("id") in st.session_state.hidden_ids]
396
 
397
- # Apply sort if selected
398
  if sort_by.startswith("Deadline") and results:
399
  results.sort(
400
  key=lambda r: (
@@ -403,42 +382,175 @@ if sort_by.startswith("Deadline") and results:
403
  )
404
  )
405
 
406
- # Did the user run a search?
407
- ran_search = bool(st.session_state.get("last_query"))
408
-
409
- if results:
410
- st.caption(f"Results: {len(results)}")
411
- for r in results:
412
- title = r.get("title", "(no title)")
413
- url = r.get("url", "")
414
- cats = r.get("categories") or r.get("cats") or []
415
- geo_tags = r.get("geo") or []
416
- _id = r.get("id") or r.get("url") or title
417
-
418
- st.markdown(f"### {title}")
419
- st.write(f"**Source:** {r.get('source','')} | **Geo:** {', '.join(geo_tags) if isinstance(geo_tags, list) else geo_tags} | **Categories:** {', '.join(cats) if isinstance(cats, list) else cats}")
420
-
421
- # Link / score
422
- if url and not url.startswith("http"):
423
- st.caption("Note: This item may display an ID or number instead of a full link. Open on Grants.gov if needed.")
424
- st.write(f"[Open Link]({url}) \nScore: {r.get('score', 0):.3f}")
425
-
426
- # Deadline + badge
427
- posted = r.get("posted_date") or ""
428
- deadline = r.get("deadline") or ""
429
- days_left = _days_until(deadline)
430
- st.caption(f"Posted: {posted} β€’ Deadline: {deadline} β€’ {_deadline_badge(days_left)}")
431
-
432
- # Save / Hide buttons
433
- c1, c2, _ = st.columns([1,1,6])
434
- if c1.button(("βœ… Saved" if _id in st.session_state.saved_ids else "πŸ’Ύ Save"), key=f"save-{_id}"):
435
- _save_item(_id)
436
- if c2.button(("πŸ™ˆ Hidden" if _id in st.session_state.hidden_ids else "πŸ™ˆ Hide"), key=f"hide-{_id}"):
437
- _hide_item(_id)
438
-
439
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  else:
441
  if ran_search:
442
- st.info("No active grants match these filters right now. We’ll notify you when the next cycle opens.")
443
  else:
444
  st.info("Enter a query and click Search.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  import os, json
10
  from pathlib import Path
11
+ from datetime import date, datetime
12
+
13
+ import streamlit as st
14
 
15
  from app.main import get_env, ensure_index_exists
16
  from app.search import search
17
 
18
+ # ── Page config ───────────────────────────────────────────────────────────────
19
+ st.set_page_config(page_title="Grants Discovery App By Lupo", page_icon="🧭", layout="wide")
20
 
21
+ # ── THEME / CSS β€” single, unified block (dark app; readable controls) ─────────
22
  st.markdown("""
23
  <style>
24
+ /* App base */
25
+ :root {
26
+ --bg: #0a0f1a;
27
+ --panel: #121827;
28
+ --text: #e5eefb;
29
+ --muted: #95a3b8;
30
+ --accent: #f97316;
31
+ --border: #2b3a55;
32
  }
33
+ .stApp { background-color: var(--bg); color: var(--text); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ /* Typo & links */
36
+ html, body, [class*="st-"], h1,h2,h3,h4,h5,h6, p, span, div { color: var(--text) !important; }
37
+ a, .stRadio > label, .stSlider label { color: var(--accent) !important; }
38
 
 
 
 
 
 
 
 
 
 
39
  /* Buttons */
40
+ .stButton>button {
41
+ background: var(--accent); color:#fff; border:none; border-radius:10px;
42
+ padding: 0.5rem 0.9rem; font-weight:600;
43
+ }
44
  .stButton>button:hover { filter:brightness(1.1); }
45
+
46
+ /* Text inputs */
47
+ .stTextInput input, .stTextArea textarea {
48
+ background: var(--panel) !important; color: var(--text) !important;
49
+ border: 1px solid var(--border) !important; border-radius: 10px !important;
50
+ }
51
+
52
+ /* ===== FIXED: Select/Multiselect controls - HIGH CONTRAST ===== */
53
+
54
+ /* Labels above the controls */
55
+ [data-testid="stSelectbox"] label div,
56
+ [data-testid="stMultiSelect"] label div {
57
+ color: #e5eefb !important; /* Light text for dark background */
58
+ font-weight: 600;
59
+ }
60
+
61
+ /* Closed control (the combobox) */
62
+ [data-testid="stSelectbox"] div[role="combobox"],
63
+ [data-testid="stMultiSelect"] div[role="combobox"] {
64
+ background: #1e293b !important; /* Dark field */
65
+ color: #f8fafc !important; /* Light text - HIGH CONTRAST */
66
+ border: 1px solid #475569 !important;
67
+ border-radius: 10px !important;
68
+ font-weight: 500;
69
+ }
70
+
71
+ /* Text & icons inside the closed control */
72
+ [data-testid="stSelectbox"] div[role="combobox"] *,
73
+ [data-testid="stMultiSelect"] div[role="combobox"] * {
74
+ color: #f8fafc !important; /* Force light text */
75
+ fill: #f8fafc !important; /* Force light icons */
76
  }
77
+
78
+ /* Placeholder text */
79
+ [data-testid="stMultiSelect"] input::placeholder {
80
+ color: #94a3b8 !important; /* Muted but visible placeholder */
81
+ }
82
+
83
+ /* Multiselect chips */
84
+ [data-baseweb="tag"] {
85
+ background: #334155 !important;
86
+ color: #e2e8f0 !important; /* Light text on chips */
87
+ border-radius: 999px !important;
88
+ font-weight: 500;
89
+ }
90
+
91
+ /* Dropdown menu (popover) - DARK THEME */
92
+ div[data-baseweb="popover"] {
93
+ z-index: 999999 !important; /* Ensure it appears above everything */
94
+ }
95
+
96
+ div[data-baseweb="popover"] [role="listbox"],
97
+ div[data-baseweb="menu"],
98
+ ul[role="listbox"] {
99
+ background: #1e293b !important; /* Dark menu background */
100
+ color: #f8fafc !important; /* Light text - HIGH CONTRAST */
101
+ border: 1px solid #475569 !important;
102
+ border-radius: 10px !important;
103
+ }
104
+
105
+ /* Options in dropdown */
106
+ [role="listbox"] [role="option"],
107
+ div[data-baseweb="menu"] [role="option"] {
108
+ background: transparent !important;
109
+ color: #f8fafc !important; /* Light text */
110
+ font-weight: 500;
111
+ }
112
+
113
+ /* Hover state */
114
+ [role="listbox"] [role="option"]:hover,
115
+ div[data-baseweb="menu"] [role="option"]:hover {
116
+ background: #334155 !important; /* Slightly lighter on hover */
117
+ color: #ffffff !important;
118
+ }
119
+
120
+ /* Selected state */
121
+ [role="listbox"] [role="option"][aria-selected="true"],
122
+ div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
123
+ background: #475569 !important; /* Highlight selected */
124
+ color: #ffffff !important;
125
+ font-weight: 600;
126
+ }
127
+
128
+ /* Search input inside multiselect dropdown */
129
+ div[data-baseweb="popover"] input {
130
+ background: #0f172a !important;
131
+ color: #f8fafc !important;
132
+ border: 1px solid #475569 !important;
133
+ border-radius: 6px !important;
134
+ }
135
+
136
+ /* Cards */
137
+ .result-card { border:1px solid var(--border); background: var(--panel);
138
+ border-radius:14px; padding:16px; margin:10px 0; box-shadow:0 1px 2px rgba(0,0,0,0.2); }
139
+ .result-meta { font-size:13px; color: var(--muted); margin-top:6px; }
140
+ span.chip { display:inline-block; padding:3px 8px; border-radius:999px; background:#2a354a;
141
+ margin-right:6px; font-size:12px; color:var(--text); }
142
+
143
+ /* Hero */
144
+ .hero { height: 220px; border-radius: 16px; margin: 6px 0 16px;
145
  background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
146
  url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
147
  .hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
148
+ .hero-text h1 { margin:0; font-size:28px; font-weight:700; color: var(--accent); }
149
  .hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
 
 
 
 
 
 
 
 
 
 
 
 
150
  </style>
151
  """, unsafe_allow_html=True)
152
 
153
+ # ── Hero ──────────────────────────────────────────────────────────────────────
154
  st.markdown("""
155
  <div class="hero">
156
  <div class="hero-text">
 
160
  </div>
161
  """, unsafe_allow_html=True)
162
 
163
+ # ── Environment & index ───────────────────────────────────────────────────────
 
 
 
164
  _env = get_env()
165
  ensure_index_exists(_env)
166
 
167
+ # ── Helpers ───────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
168
  def _norm_list(v):
169
+ if v is None: return []
 
170
  if isinstance(v, str):
171
  parts = [p.strip() for p in v.replace(";", ",").split(",")]
172
  return [p.lower() for p in parts if p]
 
182
  return g_ok and c_ok
183
 
184
  def _ministry_filter(rows):
185
+ if not rows: return rows
 
186
  banned_terms = [
187
  "broad agency announcement", "baa", "research", "r&d", "prototype",
188
  "laboratory", "university", "sbir", "sttr",
 
190
  "w911", "n00014", "fa-", "afrl", "arpa"
191
  ]
192
  preferred_agencies = {
193
+ "FTA","HHS","ACL","USDA","USDA-FNS","USDA-RD","DOL","DOJ","OJP","OVW","EDA","HRSA","SAMHSA","CFPB","HUD"
 
194
  }
195
+ terms = [
196
+ "vehicle","van","bus","paratransit","mobility",
197
+ "congregate meals","home-delivered meals","senior nutrition",
198
+ "food pantry","food bank","hunger relief","refrigeration","freezer",
199
+ "community","faith","church","ministry","nonprofit",
200
+ "reentry","workforce","case management","technical assistance","capacity"
201
  ]
202
  def txt(r):
203
+ return " ".join([str(r.get("title","")),
204
+ str(r.get("synopsis") or r.get("summary") or ""),
205
+ str(r.get("agency") or "")]).lower()
206
+ kept=[]
 
 
 
207
  for r in rows:
208
  t = txt(r)
209
+ if any(b in t for b in banned_terms): continue
 
210
  agency = (r.get("agency") or "").upper()
211
+ cats = [c.lower() for c in (r.get("categories") or [])] if isinstance(r.get("categories"), list) else []
212
+ prefer = any(agency.startswith(a) for a in preferred_agencies)
213
+ has_cue = any(term in t for term in terms) or any(
214
  c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
215
  )
216
+ if prefer or has_cue:
217
  kept.append(r)
218
  return kept
219
 
220
def _to_date(d):
    """Parse an ISO-format date-ish value into a datetime.date.

    Returns None for falsy input or anything datetime.fromisoformat rejects.
    """
    if not d:
        return None
    try:
        return datetime.fromisoformat(str(d)).date()
    except Exception:
        # Unparseable value — treat as "no date" rather than crashing the UI.
        return None
224
+
225
  def _days_until(iso):
226
+ if not iso: return None
 
 
227
  try:
228
  d = datetime.fromisoformat(str(iso)).date()
229
  return (d - date.today()).days
 
231
  return None
232
 
233
def _deadline_badge(days_left):
    """Return a colored urgency badge string for a days-until-deadline count.

    None -> unknown deadline; negative -> closed; otherwise urgency tiers
    at 14 and 30 days.
    """
    if days_left is None:
        return "🟦 TBD"
    if days_left < 0:
        return "⬛ Closed"
    if days_left <= 14:
        return f"πŸŸ₯ Due in {days_left}d"
    if days_left <= 30:
        return f"🟨 {days_left}d"
    return f"🟩 {days_left}d"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
+ # ── UI: Presets & inputs ──────────────────────────────────────────────────────
241
  st.title("Grants Discovery RAG (Capacity Building)")
242
 
243
  preset = st.radio(
 
245
  ["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
246
  horizontal=True
247
  )
 
248
  default_q = {
249
  "General": "capacity building",
250
  "Elderly": "capacity building for seniors and aging services",
 
254
  "FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
255
  }.get(preset, "capacity building")
256
 
 
257
  q = st.text_input("Search query", value=default_q)
258
 
259
  geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
 
261
  "Category filter (optional)",
262
  options=[
263
  "capacity_building","elderly","prison_ministry","evangelism",
264
+ "transportation","vehicle","justice","reentry",
265
+ "victim_services","youth","women","food","workforce"
266
  ],
267
  default=[]
268
  )
269
 
270
+ # Fetch more so pagination is meaningful
271
+ top_k = st.slider("Fetch up to (results)", 50, 500, 200, step=50)
272
+
273
  sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
274
  only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
275
  ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)
276
 
 
277
  view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)
278
+
279
+ # Agencies facet from meta
280
  try:
281
+ meta_for_agencies = json.loads(Path(get_env()["INDEX_DIR"], "meta.json").read_text())
282
  agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
283
  except Exception:
284
  agency_options = []
285
  sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])
286
 
 
287
  backend_filters = {}
288
  if geo: backend_filters["geo"] = geo
289
  if categories: backend_filters["categories"] = categories
290
  if sel_agencies: backend_filters["agency"] = sel_agencies
291
 
292
+ # Sprint 2: Save/Hide state
293
+ if "saved_ids" not in st.session_state: st.session_state.saved_ids = set()
294
+ if "hidden_ids" not in st.session_state: st.session_state.hidden_ids = set()
 
 
295
 
 
296
  def _save_item(item_id: str):
297
  st.session_state.saved_ids.add(item_id)
298
  st.session_state.hidden_ids.discard(item_id)
 
303
  st.session_state.saved_ids.discard(item_id)
304
  st.experimental_rerun()
305
 
306
+ # ── Search & filter pipeline (stores full result set) ─────────────────────────
307
+ c1, c2 = st.columns([1,1])
308
 
309
+ with c1:
310
  if st.button("Search"):
311
  try:
312
+ raw = search(q, get_env(), top_k=top_k, filters=backend_filters) # fetch many
313
+ # Geo/Category client-side fallback
 
 
314
  if geo or categories:
315
+ base_filtered = [r for r in raw if _matches_filters(r, geo, categories)]
316
  else:
317
+ base_filtered = raw
318
 
319
+ # Only open
 
320
  def _to_date_safe(val):
321
  if not val: return None
322
  try: return datetime.fromisoformat(str(val)).date()
323
  except Exception: return None
 
324
  open_filtered = base_filtered
325
  if only_open:
326
  open_filtered = [r for r in base_filtered
327
  if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]
328
 
329
+ # Agency
330
  if sel_agencies:
331
  af = set(sel_agencies)
332
  open_filtered = [r for r in open_filtered if (r.get("agency") in af)]
333
 
334
+ # Ministry
335
  final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered
336
 
 
 
 
 
 
 
 
 
 
337
  st.session_state["results"] = final_results
338
  st.session_state["last_query"] = q
339
  st.session_state["last_filters"] = {
340
+ "geo": geo, "categories": categories, "only_open": only_open,
341
+ "ministry_focus": ministry_focus, "agencies": sel_agencies,
 
342
  }
343
 
344
+ # RESET PAGINATION on new run
345
+ st.session_state.page = 1
 
 
 
346
 
347
+ st.success(f"Fetched {len(raw)} β€’ After filters: {len(final_results)}")
 
 
348
  except Exception as e:
349
  st.error(str(e))
350
 
351
+ with c2:
 
352
  if st.button("Export Results to CSV"):
353
  results_for_export = st.session_state.get("results", [])
354
  if not results_for_export:
355
  st.warning("No results to export. Run a search first.")
356
  else:
357
+ out_dir = get_env()["EXPORT_DIR"]
358
+ os.makedirs(out_dir, exist_ok=True)
359
+ out_path = os.path.join(out_dir, "results.csv")
360
  import pandas as pd
361
  pd.DataFrame(results_for_export).to_csv(out_path, index=False)
362
  st.success(f"Exported to {out_path}")
363
 
364
  st.markdown("---")
365
 
366
+ # ── Post-search view/sort/pagination (5.4) ────────────────────────────────────
 
 
 
 
 
 
 
367
  results = st.session_state.get("results", [])
368
+ ran_search = bool(st.session_state.get("last_query"))
369
 
370
+ # View filter
371
  if view == "Saved":
372
  results = [r for r in results if r.get("id") in st.session_state.saved_ids]
373
  elif view == "Hidden":
374
  results = [r for r in results if r.get("id") in st.session_state.hidden_ids]
375
 
376
+ # Sort
377
  if sort_by.startswith("Deadline") and results:
378
  results.sort(
379
  key=lambda r: (
 
382
  )
383
  )
384
 
385
# Pagination state — stored in st.session_state so it survives Streamlit reruns.
if "page_size" not in st.session_state:
    st.session_state.page_size = 25
if "page" not in st.session_state:
    st.session_state.page = 1

total = len(results)
st.caption(f"Results: {total}")

# Controls: page size selector + page number input.
cols = st.columns([1,1,2,2,2])
with cols[0]:
    # NOTE(review): index=1 re-applies the 25 default on every rerun, so a
    # previously chosen page size is not restored from session_state — confirm intended.
    page_size = st.selectbox("Page size", [10, 25, 50, 100], index=1)
    st.session_state.page_size = page_size
# compute pages (ceiling division; always at least 1 page, even with 0 results)
total_pages = max(1, (total + page_size - 1) // page_size)
with cols[1]:
    # Clamp the stored page to the new total so shrinking results can't
    # leave us on a page past the end.
    page = st.number_input("Page", min_value=1, max_value=total_pages,
                           value=min(st.session_state.page, total_pages), step=1)
    st.session_state.page = page

# Slice AFTER filters & sort, so the page reflects the final ordering.
start = (st.session_state.page - 1) * st.session_state.page_size
end = min(start + st.session_state.page_size, total)
page_items = results[start:end]
st.caption(f"Showing {start+1 if total else 0}–{end} of {total} β€’ Page {st.session_state.page}/{total_pages}")

# Nav buttons — experimental_rerun() forces an immediate redraw with the new page.
prev_col, _, next_col = st.columns([1,6,1])
with prev_col:
    if st.button("β—€ Prev", disabled=(st.session_state.page <= 1)):
        st.session_state.page = max(1, st.session_state.page - 1)
        st.experimental_rerun()
with next_col:
    if st.button("Next β–Ά", disabled=(st.session_state.page >= total_pages)):
        st.session_state.page = min(total_pages, st.session_state.page + 1)
        st.experimental_rerun()
422
+
423
+ # ── Render page items ─────────────────────────────────────────────────────────
424
def _render_card(r):
    """Render one opportunity record as a styled result card with Save/Hide actions."""
    card_title = r.get("title", "(no title)")
    link = r.get("url", "")
    cat_values = r.get("categories") or r.get("cats") or []
    geo_values = r.get("geo") or []
    item_id = r.get("id") or r.get("url") or card_title
    posted_on = r.get("posted_date") or ""
    due_on = r.get("deadline") or ""

    geo_txt = ", ".join(geo_values) if isinstance(geo_values, list) else geo_values
    cat_txt = ", ".join(cat_values) if isinstance(cat_values, list) else cat_values

    st.markdown("<div class='result-card'>", unsafe_allow_html=True)
    st.markdown(f"### {card_title}")
    meta = (
        f"**Source:** {r.get('source','')} β€’ **Geo:** {geo_txt}"
        f" β€’ **Categories:** {cat_txt}"
    )
    st.markdown(f"<div class='result-meta'>{meta}</div>", unsafe_allow_html=True)

    # Link / score
    if link and not link.startswith('http'):
        st.caption("Note: This item may display an ID instead of a full link. Open on Grants.gov if needed.")
    if link:
        st.write(f"[Open Link]({link})")
    if r.get("score") is not None:
        st.caption(f"Score: {r.get('score', 0):.3f}")

    # Deadline badge
    st.caption(f"Posted: {posted_on} β€’ Deadline: {due_on} β€’ {_deadline_badge(_days_until(due_on))}")

    # Save / Hide actions (labels reflect current session state)
    save_col, hide_col, _ = st.columns([1,1,6])
    save_label = "βœ… Saved" if item_id in st.session_state.saved_ids else "πŸ’Ύ Save"
    if save_col.button(save_label, key=f"save-{item_id}"):
        _save_item(item_id)
    hide_label = "πŸ™ˆ Hidden" if item_id in st.session_state.hidden_ids else "πŸ™ˆ Hide"
    if hide_col.button(hide_label, key=f"hide-{item_id}"):
        _hide_item(item_id)

    st.markdown("</div>", unsafe_allow_html=True)
458
+
459
# Render the current page of results; otherwise show the appropriate
# empty-state message depending on whether a search was ever run.
if page_items:
    for r in page_items:
        _render_card(r)
else:
    if ran_search:
        st.info("No active grants match these filters right now.")
    else:
        st.info("Enter a query and click Search.")
467
+
468
+ st.markdown("""
469
+ <style>
470
+ /* ================== SELECT/MULTISELECT HARD OVERRIDE ================== */
471
+ /* Goal: kill white-on-white by styling the BaseWeb select root + portal. */
472
+ /* Works across Chrome/Safari/Firefox; includes -webkit-text-fill-color fix. */
473
+
474
+ /* 1) CLOSED CONTROL (the visible field) β€” target the BaseWeb root */
475
+ body div[data-baseweb="select"] {
476
+ background: #1e293b !important; /* dark field */
477
+ color: #f8fafc !important; /* light text */
478
+ border: 1px solid #475569 !important;
479
+ border-radius: 10px !important;
480
+ }
481
+
482
+ /* Make absolutely everything inside readable (some builds render text in spans) */
483
+ body div[data-baseweb="select"] * {
484
+ color: #f8fafc !important;
485
+ -webkit-text-fill-color: #f8fafc !important; /* Safari/Chromium quirk */
486
+ fill: #f8fafc !important;
487
+ }
488
+
489
+ /* Placeholder node BaseWeb renders (aria-hidden) */
490
+ body div[data-baseweb="select"] div[aria-hidden="true"] {
491
+ color: #94a3b8 !important;
492
+ -webkit-text-fill-color: #94a3b8 !important;
493
+ }
494
+
495
+ /* Ensure the actual input inherits readable color */
496
+ body div[data-baseweb="select"] input {
497
+ color: #f8fafc !important;
498
+ -webkit-text-fill-color: #f8fafc !important;
499
+ caret-color: #f8fafc !important;
500
+ background: transparent !important;
501
+ }
502
+
503
+ /* 2) OPEN DROPDOWN MENU (lives in a portal under <body>) */
504
+ body div[data-baseweb="popover"] [role="listbox"],
505
+ body div[data-baseweb="menu"],
506
+ body ul[role="listbox"] {
507
+ background: #1e293b !important;
508
+ color: #f8fafc !important;
509
+ border: 1px solid #475569 !important;
510
+ border-radius: 10px !important;
511
+ z-index: 2147483647 !important;
512
+ }
513
+
514
+ /* Options inside the menu */
515
+ body [role="listbox"] [role="option"],
516
+ body div[data-baseweb="menu"] [role="option"] {
517
+ background: transparent !important;
518
+ color: #f8fafc !important;
519
+ }
520
+ body [role="listbox"] [role="option"]:hover,
521
+ body div[data-baseweb="menu"] [role="option"]:hover {
522
+ background: #334155 !important;
523
+ }
524
+ body [role="listbox"] [role="option"][aria-selected="true"],
525
+ body div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
526
+ background: #475569 !important;
527
+ color: #ffffff !important;
528
+ }
529
+
530
+ /* 3) EMERGENCY FALLBACK β€” if a theme forces a white menu inline, flip text dark */
531
+ body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"],
532
+ body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] {
533
+ background: #ffffff !important;
534
+ color: #0f172a !important;
535
+ border: 1px solid #cbd5e1 !important;
536
+ }
537
+ body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"] * ,
538
+ body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] * {
539
+ color: #0f172a !important;
540
+ -webkit-text-fill-color: #0f172a !important;
541
+ }
542
+
543
+ /* 4) MULTISELECT CHIPS */
544
+ body [data-baseweb="tag"] {
545
+ background: #334155 !important;
546
+ color: #e2e8f0 !important;
547
+ border-radius: 999px !important;
548
+ }
549
+
550
+ /* 5) OPTIONAL: turn on outlines once to verify the selector match (debug)
551
+ body div[data-baseweb="select"] { outline: 1px dashed #22d3ee !important; }
552
+ body div[data-baseweb="popover"] [role="listbox"] { outline: 1px dashed #22d3ee !important; }
553
+ */
554
+ </style>
555
+ """, unsafe_allow_html=True)
556
+
app/utils/dedupe.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, hashlib
2
+ from difflib import SequenceMatcher
3
+
4
+ def _norm(text: str) -> str:
5
+ """Lowercase, strip punctuation, collapse whitespace."""
6
+ t = (text or "").lower()
7
+ t = re.sub(r'[^a-z0-9 ]+', ' ', t)
8
+ return re.sub(r'\s+', ' ', t).strip()
9
+
10
+ def hash_fingerprint(title: str, agency: str, deadline: str) -> str:
11
+ """
12
+ A strong key: normalized title + agency + deadline.
13
+ Use this as a primary key in your datastore.
14
+ """
15
+ base = f"{_norm(title)}|{_norm(agency)}|{deadline or ''}"
16
+ return hashlib.sha1(base.encode()).hexdigest()
17
+
18
+ def near_duplicate(a: dict, b: dict) -> bool:
19
+ """
20
+ Fuzzy fallback: similar title & agency,
21
+ and deadlines match or are both blank.
22
+ """
23
+ dates_close = (a.get("deadline") == b.get("deadline")) \
24
+ or (not a.get("deadline") and not b.get("deadline"))
25
+ t_sim = SequenceMatcher(None, _norm(a.get("title","")),
26
+ _norm(b.get("title",""))).ratio()
27
+ ag_sim = SequenceMatcher(None, _norm(a.get("agency","")),
28
+ _norm(b.get("agency",""))).ratio()
29
+ return dates_close and (t_sim > 0.88) and (ag_sim > 0.75)
app/utils/normalize.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/utils/normalize.py
2
+ ELIGIBILITY_MAP = {
3
+ "nonprofit": "NONPROFIT_501C3",
4
+ "501(c)(3)": "NONPROFIT_501C3",
5
+ "local government": "LOCAL_GOV",
6
+ "state government": "STATE_GOV",
7
+ "higher education": "HIGHER_ED",
8
+ }
9
+ def normalize_eligibility(raw: str) -> list[str]:
10
+ vals = []
11
+ txt = (raw or "").lower()
12
+ for k,v in ELIGIBILITY_MAP.items():
13
+ if k in txt:
14
+ vals.append(v)
15
+ return sorted(set(vals)) or ["UNKNOWN"]
config/sources.yaml CHANGED
@@ -1,6 +1,6 @@
1
  # Minimal, valid config β€” v6.3
2
  filters:
3
- capacity_only: true # keep only capacity-building items
4
  pa_md_only: false # set to true to restrict index to PA/MD
5
 
6
  sources:
@@ -46,7 +46,6 @@ sources:
46
  page_size: 100
47
  max_pages: 3
48
  payload:
49
- # Target 5310 by ALN and keywords
50
  aln: "20.513"
51
  keyword: "\"Enhanced Mobility\" OR \"Section 5310\" OR seniors OR elderly OR disabilities OR paratransit OR wheelchair OR shuttle OR van OR bus"
52
  oppStatuses: "posted"
@@ -86,9 +85,27 @@ sources:
86
  fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
87
  sortBy: "openDate|desc"
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
90
- # NOTE: These require adapters (http_html/web_page/http_pdf) you haven't implemented yet.
91
- # They are kept here (enabled) in case your runtime supports them; otherwise set enabled: false.
92
 
93
  - name: "Maryland MTA β€” Grants (incl. 5310)"
94
  type: web_page
@@ -162,7 +179,7 @@ sources:
162
  mode: "article"
163
  keep_links: true
164
 
165
- # --- Pennsylvania: PCA (state arts) ---
166
  - name: "PA Creative Industries – Capacity Building (landing)"
167
  type: http_html
168
  enabled: true
@@ -226,22 +243,30 @@ sources:
226
  geo: "PA"
227
  categories: ["capacity_building"]
228
 
229
- # --- Maryland: OneStop (statewide grant listings with 'capacity' search) ---
230
- - name: "Maryland OneStop – Capacity search"
231
- type: http_html
232
  enabled: true
233
  url: "https://onestop.md.gov/search?query=capacity"
234
  geo: "MD"
235
  categories: ["capacity_building"]
 
 
 
 
 
 
 
 
236
  parse:
237
  follow_links: true
238
  link_selectors:
239
  - "a[href*='/forms/']"
240
  - "a[href*='/search/']"
241
  content_selectors:
 
242
  - "main"
243
  - "article"
244
- - "[role='main']"
245
 
246
  # --- Maryland: DHCD (housing/community programs & press) ---
247
  - name: "MD DHCD – Programs (grants & loans index)"
@@ -304,6 +329,29 @@ sources:
304
  geo: "MD"
305
  categories: ["capacity_building"]
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
308
  - name: "State 5310 Listings (curated JSON)"
309
  type: json_static
@@ -311,3 +359,27 @@ sources:
311
  file: "data/state_5310_listings.json"
312
  geo: "PA|MD|VA|DC"
313
  categories: ["transportation","elderly","disabilities","5310","deadlines"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Minimal, valid config β€” v6.3
2
  filters:
3
+ capacity_only: false # keep only capacity-building items
4
  pa_md_only: false # set to true to restrict index to PA/MD
5
 
6
  sources:
 
46
  page_size: 100
47
  max_pages: 3
48
  payload:
 
49
  aln: "20.513"
50
  keyword: "\"Enhanced Mobility\" OR \"Section 5310\" OR seniors OR elderly OR disabilities OR paratransit OR wheelchair OR shuttle OR van OR bus"
51
  oppStatuses: "posted"
 
85
  fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
86
  sortBy: "openDate|desc"
87
 
88
+ # ---------- FEDERAL: Federal Register (broad NOFO scanning) ----------
89
+
90
+ - name: "Federal Register β€” Funding/NOFO keywords (API)"
91
+ type: http_json
92
+ enabled: true
93
+ url: "https://www.federalregister.gov/api/v1/documents.json"
94
+ geo: "US"
95
+ categories: ["capacity_building", "notices"]
96
+ api:
97
+ payload:
98
+ conditions[term]: "funding opportunity OR cooperative agreement OR NOFO"
99
+ per_page: 50
100
+ order: "newest"
101
+ parse:
102
+ item_path: "results[]"
103
+ title: "title"
104
+ link: "html_url"
105
+ published_at: "publication_date"
106
+ body: "abstract"
107
+
108
  # ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
 
 
109
 
110
  - name: "Maryland MTA β€” Grants (incl. 5310)"
111
  type: web_page
 
179
  mode: "article"
180
  keep_links: true
181
 
182
+ # --- Pennsylvania: PA Creative Industries (PCA) ---
183
  - name: "PA Creative Industries – Capacity Building (landing)"
184
  type: http_html
185
  enabled: true
 
243
  geo: "PA"
244
  categories: ["capacity_building"]
245
 
246
+ # --- Maryland: OneStop (JS-rendered search) ---
247
+ - name: "Maryland OneStop – Capacity search (JS)"
248
+ type: http_html_js # Playwright adapter
249
  enabled: true
250
  url: "https://onestop.md.gov/search?query=capacity"
251
  geo: "MD"
252
  categories: ["capacity_building"]
253
+ options:
254
+ wait_for: "[role='main']"
255
+ scroll: true
256
+ max_pages: 3
257
+ timeout_ms: 180000 # NEW: longer timeout for SPA
258
+ network_idle: true # NEW: wait for background XHR/fetch to settle
259
+ # debug: true # optional: screenshot on failure
260
+ # click_selector: "a[aria-label='Next']" # uncomment if pagination controls appear
261
  parse:
262
  follow_links: true
263
  link_selectors:
264
  - "a[href*='/forms/']"
265
  - "a[href*='/search/']"
266
  content_selectors:
267
+ - "[role='main']"
268
  - "main"
269
  - "article"
 
270
 
271
  # --- Maryland: DHCD (housing/community programs & press) ---
272
  - name: "MD DHCD – Programs (grants & loans index)"
 
329
  geo: "MD"
330
  categories: ["capacity_building"]
331
 
332
+ # --- Pennsylvania: DCED (Programs index; JS-rendered) ---
333
+ - name: "PA DCED β€” Programs (JS)"
334
+ type: http_html_js
335
+ enabled: true
336
+ url: "https://dced.pa.gov/programs/"
337
+ geo: "PA"
338
+ categories: ["capacity_building","community_development","economic_development"]
339
+ options:
340
+ wait_for: "main"
341
+ scroll: true
342
+ max_pages: 5
343
+ timeout_ms: 180000 # NEW
344
+ network_idle: true # NEW
345
+ # click_selector: ".pagination a.next"
346
+ # debug: true
347
+ parse:
348
+ item_selector: ".program-listing .program, .content" # fallback
349
+ title: ".program-title, h1, h2"
350
+ link: ".program-title a@href, a@href"
351
+ body: ".program-summary, .entry-content, main"
352
+ deadline_selector: ".deadline, .key-dates"
353
+ eligibility_selector: ".eligibility, .who-eligible"
354
+
355
  # ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
356
  - name: "State 5310 Listings (curated JSON)"
357
  type: json_static
 
359
  file: "data/state_5310_listings.json"
360
  geo: "PA|MD|VA|DC"
361
  categories: ["transportation","elderly","disabilities","5310","deadlines"]
362
+
363
+ - name: "Faith-based Foundations β€” Card/Grid (JS)"
364
+ type: http_html_js
365
+ enabled: true
366
+ skip_filters: true
367
+ url: "https://example.org/foundations/maryland/religion-related"
368
+ geo: "MD|PA|DE|NJ|VA"
369
+ categories: ["foundation_private","faith_based","capacity_building"]
370
+ options:
371
+ wait_for: "[role='main']" # or the results container CSS
372
+ scroll: true
373
+ scroll_selector: ".results-pane" # ← replace with the REAL scrolling DIV
374
+ scroll_times: 40
375
+ scroll_wait_ms: 250
376
+ min_cards: 20
377
+ timeout_ms: 30000
378
+ network_idle: false
379
+ # click_selector: ".pagination a.next" # only if the page has a Next button
380
+ selectors:
381
+ card: ".result-card, .card, article, .search-result"
382
+ title: "h2 a, h3 a, .card-title a, .result-title a, h2, h3, .card-title"
383
+ link: "h2 a, h3 a, .card-title a, .result-title a, a"
384
+ description: ".summary, .card-text, .excerpt, p"
385
+ meta: ".meta, .tags, .badge, .location"