michaellupo74 commited on
Commit
b53e303
Β·
1 Parent(s): cd54f1c

feat(ingest): JS card/grid + scroll container + skip_filters

Browse files

feat(ui): raise top_k & results paging
chore: add ingestors/ranking/utils modules
fix: YAML selectors & SPA wait tuning

.gitignore CHANGED
@@ -30,3 +30,27 @@ runtime/
30
  data/exports/
31
  *.env*
32
  .env
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  data/exports/
31
  *.env*
32
  .env
33
+ # Python
34
+ __pycache__/
35
+ *.pyc
36
+ .venv/
37
+ venv/
38
+ *.egg-info/
39
+
40
+ # App data/output
41
+ data/
42
+ snapshots/
43
+ app/static/
44
+ *.log
45
+
46
+ # Local/playwright artifacts
47
+ playwright/.cache/
48
+ *.png
49
+
50
+ # Editor
51
+ .vscode/
52
+ .DS_Store
53
+
54
+ # Backups
55
+ *.bak
56
+ *.bak.py
app/ingest.py CHANGED
@@ -17,6 +17,9 @@ import hashlib
17
  import requests
18
  from bs4 import BeautifulSoup
19
  from datetime import datetime, timezone
 
 
 
20
 
21
  # -------------------- Config --------------------
22
 
@@ -183,7 +186,7 @@ def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
183
  hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
184
  return [h for h in hits if isinstance(h, dict)]
185
 
186
- # -------------------- NEW: Generic HTML / PDF collectors --------------------
187
 
188
  _HTTP_HEADERS = {
189
  "User-Agent": "grants-rag/1.0 (+https://example.local) requests",
@@ -280,6 +283,8 @@ def _normalize_web_record(
280
  rec["is_active"] = _compute_is_active(rec["deadline"])
281
  return rec
282
 
 
 
283
  def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
284
  """
285
  Supports types: 'web_page' and 'http_html'
@@ -350,6 +355,219 @@ def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any
350
 
351
  return rows
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
354
  """
355
  type: 'http_pdf'
@@ -384,9 +602,119 @@ def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]
384
  rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
385
  return rows
386
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  # -------------------- Write docstore & build index --------------------
388
 
389
- def _save_docstore(recs: List[Dict[str, Any]]) -> str:
390
  DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
391
  path = DOCSTORE_DIR / "docstore.jsonl"
392
  with path.open("w", encoding="utf-8") as f:
@@ -425,6 +753,7 @@ def _build_index_from_docstore() -> int:
425
  "is_active": rec.get("is_active"),
426
  "program_number": rec.get("program_number"),
427
  "posted_date": rec.get("posted_date"),
 
428
  })
429
 
430
  print(f"[index] Rows loaded from docstore: {len(texts)}")
@@ -467,7 +796,7 @@ __all__ = ["ingest"]
467
  def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
468
  """
469
  Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
470
- applies filters (capacity / PA-MD), dedupes, writes docstore, and builds the FAISS index.
471
  Returns (docstore_path, n_indexed).
472
  """
473
  cfg = load_config(cfg_path)
@@ -492,6 +821,7 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
492
  typ = entry.get("type")
493
  rows: List[Dict[str, Any]] = []
494
 
 
495
  if typ == "grantsgov_api":
496
  raw_hits = _collect_from_grantsgov_api(entry)
497
  rows = [normalize("grants_gov", h, static) for h in raw_hits]
@@ -499,6 +829,9 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
499
  elif typ in ("web_page", "http_html"):
500
  rows = _collect_from_http_html(entry, name, static)
501
 
 
 
 
502
  elif typ == "http_pdf":
503
  rows = _collect_from_http_pdf(entry, name, static)
504
 
@@ -508,31 +841,37 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
508
  items = blob.get("opportunities") or []
509
  rows = [normalize("local_sample", op, static) for op in items]
510
 
511
- # ---- Apply capacity / geo filters BEFORE collecting ----
512
- if rows and (capacity_only or pa_md_only):
513
- filtered = []
514
- for r in rows:
515
- t = _doc_text_from_row(r)
516
- if capacity_only and not _is_capacity_building_text(t):
517
- continue
518
- if pa_md_only and not _is_pa_md_text(t):
519
- continue
520
- filtered.append(r)
521
- print(f"[filter] {name}: kept {len(filtered)}/{len(rows)} after filters")
522
- rows = filtered
523
 
524
- print(f"[collect] {name} β†’ {len(rows)} rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  all_rows.extend(rows)
526
 
527
- # ---- DEDUPE (by id β†’ url β†’ title) ----
528
- seen, unique = set(), []
529
- for r in all_rows:
530
- key = r.get("id") or r.get("url") or r.get("title")
531
- if not key or key in seen:
532
- continue
533
- seen.add(key)
534
- unique.append(r)
535
-
536
  print(f"[ingest] Unique records to index: {len(unique)}")
537
 
538
  path = _save_docstore(unique)
 
17
  import requests
18
  from bs4 import BeautifulSoup
19
  from datetime import datetime, timezone
20
+ from difflib import SequenceMatcher
21
+ from urllib.parse import urljoin
22
+
23
 
24
  # -------------------- Config --------------------
25
 
 
186
  hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
187
  return [h for h in hits if isinstance(h, dict)]
188
 
189
+ # -------------------- HTTP helpers --------------------
190
 
191
  _HTTP_HEADERS = {
192
  "User-Agent": "grants-rag/1.0 (+https://example.local) requests",
 
283
  rec["is_active"] = _compute_is_active(rec["deadline"])
284
  return rec
285
 
286
+ # -------------------- Collectors: http_html / web_page --------------------
287
+
288
  def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
289
  """
290
  Supports types: 'web_page' and 'http_html'
 
355
 
356
  return rows
357
 
358
+ # -------------------- Collector: http_html_js (Playwright) --------------------
359
+
360
def _collect_from_http_html_js(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect opportunities from JS-rendered pages using Playwright, with
    per-card extraction and robust scrolling.

    entry.options:
      - wait_for (css or ms int)
      - scroll (bool)
      - scroll_selector (css)   # scroll a container div, not the window
      - scroll_times (int)      # default 20
      - scroll_wait_ms (int)    # default 400
      - min_cards (int)         # wait until at least N cards exist
      - click_selector (css)
      - max_pages (int)
      - timeout_ms (int)
      - network_idle (bool)
      - debug (bool)
    entry.selectors: card, title, link, description, meta

    Returns normalized records ([] when Playwright is unavailable, the URL
    is missing, or nothing could be scraped after one retry).
    """
    try:
        from playwright.sync_api import sync_playwright  # type: ignore
    except Exception:
        print(f"[collect][skip] {source_name}: Playwright not installed.")
        return []

    url = entry.get("url")
    if not url:
        return []

    options = entry.get("options", {}) or {}
    parse = entry.get("parse", {}) or entry.get("extract", {}) or {}
    selectors = entry.get("selectors", {}) or {}
    content_selectors = parse.get("content_selectors") or []

    timeout_ms = int(options.get("timeout_ms", 6000))
    network_idle = bool(options.get("network_idle", True))
    debug = bool(options.get("debug", False))
    max_pages = int(options.get("max_pages", 1))
    click_sel = options.get("click_selector") or entry.get("next_selector")
    wait_for = options.get("wait_for")

    rows: List[Dict[str, Any]] = []

    def _text_first(soup: BeautifulSoup, css_list: str) -> str:
        # First non-empty text among comma-separated CSS alternatives.
        if not css_list:
            return ""
        for css in [c.strip() for c in css_list.split(",")]:
            el = soup.select_one(css)
            if el:
                txt = el.get_text(separator=" ", strip=True)
                if txt:
                    return txt
        return ""

    def _attr_first(soup: BeautifulSoup, css_list: str, attr: str) -> Optional[str]:
        # First non-empty attribute among comma-separated CSS alternatives.
        if not css_list:
            return None
        for css in [c.strip() for c in css_list.split(",")]:
            el = soup.select_one(css)
            if el:
                val = el.get(attr)
                if val:
                    return val
        return None

    def _parse_cards(page_html: str, base_url: str) -> List[Dict[str, Any]]:
        # With a 'card' selector: one record per card; without: whole page.
        s = _soup(page_html)
        card_css = selectors.get("card", "")
        if not card_css:
            title, body = _text_from_soup(s, content_selectors)
            return [_normalize_web_record(source_name, base_url, title, body, static, extra={"posted_date": None})]

        out: List[Dict[str, Any]] = []
        for card in s.select(card_css) or []:
            csoup = BeautifulSoup(str(card), "lxml")
            title = _text_first(csoup, selectors.get("title", "h1, h2, h3"))
            href = _attr_first(csoup, selectors.get("link", "a"), "href")
            link = urljoin(base_url, href) if href else base_url
            desc = _text_first(csoup, selectors.get("description", "p, .summary, .excerpt, .card-text"))
            meta = _text_first(csoup, selectors.get("meta", ".meta, .tags, .badge, .location"))
            body = "\n".join([p for p in (desc, meta) if p]).strip()
            if not (title or body):
                continue
            out.append(_normalize_web_record(source_name, link, title or link, body, static, extra={"posted_date": None}))
        return out

    def _wait_page_ready(page, *, wait_for, timeout_ms, options, selectors):
        # wait_for can be CSS or milliseconds
        if isinstance(wait_for, int):
            page.wait_for_timeout(wait_for)
        elif isinstance(wait_for, str) and wait_for:
            page.wait_for_selector(wait_for, timeout=min(timeout_ms, 15000))

        # Scroll window or a container div
        if options.get("scroll"):
            scroll_sel = options.get("scroll_selector")
            scroll_times = int(options.get("scroll_times", 20))
            scroll_wait = int(options.get("scroll_wait_ms", 400))
            if scroll_sel:
                # BUG FIX: Playwright's evaluate() accepts a single `arg`, so
                # values are packed in a list and destructured in the JS.
                # (The old code passed three positional args -> TypeError.)
                page.evaluate(
                    """([sel, times, wait]) => new Promise(res => {
                        const el = document.querySelector(sel);
                        if (!el) { res(); return; }
                        let i = 0;
                        const t = setInterval(() => {
                            const y = el.scrollTop;
                            el.scrollTop = el.scrollHeight;
                            i++;
                            if (el.scrollTop === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    [scroll_sel, scroll_times, scroll_wait]
                )
            else:
                page.evaluate(
                    """([times, wait]) => new Promise(res => {
                        let i = 0;
                        const t = setInterval(() => {
                            const y = window.scrollY;
                            window.scrollBy(0, document.body.scrollHeight);
                            i++;
                            if (window.scrollY === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    [scroll_times, scroll_wait]
                )

        # Optionally wait for a minimum number of cards (virtualized lists)
        min_cards = int(options.get("min_cards", 0))
        card_css = (selectors or {}).get("card", "")
        if min_cards and card_css:
            try:
                page.wait_for_function(
                    """([sel, target]) => {
                        const els = document.querySelectorAll(sel);
                        return els && els.length >= target;
                    }""",
                    arg=[card_css, min_cards],
                    timeout=min(timeout_ms, 15000),
                )
            except Exception:
                pass  # best-effort

    def _try_once(p) -> bool:
        # Abort heavyweight asset requests; only the DOM is needed.
        def _route(route):
            r = route.request
            if r.resource_type in {"image", "media", "font"}:
                return route.abort()
            return route.continue_()

        browser = None
        page = None
        try:
            # BUG FIX: launch/context setup now inside the try, so a launch
            # failure is logged and retried instead of crashing ingest.
            browser = p.chromium.launch(headless=not debug)
            context = browser.new_context(user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
            ))
            context.route("**/*", _route)
            page = context.new_page()
            page.set_default_timeout(timeout_ms)
            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            if network_idle:
                page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))

            _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)

            htmls = [page.content()]

            # pagination
            for _ in range(max_pages - 1):
                sel = click_sel
                if not sel or page.locator(sel).count() == 0:
                    break
                page.click(sel)
                page.wait_for_load_state("domcontentloaded")
                if network_idle:
                    page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
                _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
                htmls.append(page.content())

            for html in htmls:
                found = _parse_cards(html, url)
                rows.extend(found)
                print(f"[collect][cards] {source_name}: found {len(found)} cards on this page")
            browser.close()
            return True
        except Exception as e:
            if debug and page is not None:
                try:
                    snap = DOCSTORE_DIR / f"playwright_error_{hashlib.sha1(url.encode()).hexdigest()}.png"
                    page.screenshot(path=str(snap))
                    print(f"[collect][debug] Saved screenshot: {snap}")
                except Exception:
                    pass
            print(f"[collect][warn] {source_name}: {e.__class__.__name__}: {e}")
            try:
                if browser is not None:
                    browser.close()
            except Exception:
                pass
            return False

    # One attempt plus one retry after a short pause. (The redundant second
    # `from playwright.sync_api import sync_playwright` was removed.)
    with sync_playwright() as p:
        if not _try_once(p):
            time.sleep(1.5)
            _try_once(p)

    if not rows:
        print(f"[collect][skip] {source_name}: no content after retries.")
    return rows
568
+
569
+ # -------------------- PDF collector --------------------
570
+
571
  def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
572
  """
573
  type: 'http_pdf'
 
602
  rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
603
  return rows
604
 
605
+ # -------------------- De-dup helpers (5.2) --------------------
606
+
607
+ def _norm(text: str) -> str:
608
+ t = (text or "").lower()
609
+ t = re.sub(r'[^a-z0-9 ]+', ' ', t)
610
+ return re.sub(r'\s+', ' ', t).strip()
611
+
612
def _hash_fingerprint(title: str, agency: str, deadline: str) -> str:
    """Stable SHA-1 fingerprint over normalized title/agency plus raw deadline."""
    parts = (_norm(title), _norm(agency), deadline or '')
    return hashlib.sha1('|'.join(parts).encode()).hexdigest()
615
+
616
def _near_duplicate(a: Dict[str, Any], b: Dict[str, Any]) -> bool:
    """True when two records look like the same opportunity (fuzzy match)."""
    # Deadlines must be equal, or both records must lack one.
    dl_a, dl_b = a.get("deadline"), b.get("deadline")
    if not (dl_a == dl_b or (not dl_a and not dl_b)):
        return False
    title_sim = SequenceMatcher(None, _norm(a.get("title", "")), _norm(b.get("title", ""))).ratio()
    agency_sim = SequenceMatcher(None, _norm(a.get("agency", "")), _norm(b.get("agency", ""))).ratio()
    return title_sim > 0.88 and agency_sim > 0.75
622
+
623
def _merge_records(primary: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
    """Merge *other* into a copy of *primary*, preserving best data and provenance.

    Non-empty fields from *primary* win; categories are unioned; the earlier
    deadline and posted_date are kept; sources and urls are accumulated; the
    record id is recomputed from the merged title/agency/deadline fingerprint.
    """
    merged = dict(primary)

    def _prefer(a, b):
        # First non-empty value wins (None/""/[]/{} count as empty).
        return a if a not in (None, "", [], {}) else b

    for field in ("url", "title", "synopsis", "summary", "agency",
                  "eligibility", "program_number", "geo"):
        merged[field] = _prefer(primary.get(field), other.get(field))

    # categories -> sorted union, tolerating scalar (non-list) values
    combined_cats = []
    for cats in (primary.get("categories") or [], other.get("categories") or []):
        combined_cats.extend(cats if isinstance(cats, list) else [cats])
    merged["categories"] = sorted({c for c in combined_cats if c})

    # deadline: surface the earlier known date (safer to show the sooner one)
    dl_a, dl_b = primary.get("deadline"), other.get("deadline")
    merged["deadline"] = min(dl_a, dl_b) if (dl_a and dl_b) else (dl_a or dl_b)
    # carry a deadline_text if any
    merged["deadline_text"] = _prefer(primary.get("deadline_text"), other.get("deadline_text"))
    merged["is_active"] = _compute_is_active(merged.get("deadline"))

    # posted_date: earliest wins when both are present
    pd_a, pd_b = primary.get("posted_date"), other.get("posted_date")
    merged["posted_date"] = min(pd_a, pd_b) if (pd_a and pd_b) else (pd_a or pd_b)

    # provenance: union of sources (scalar or list) ...
    sources = set()
    for src in (primary.get("source"), other.get("source")):
        if src:
            sources.update(src if isinstance(src, list) else [src])
    merged["source"] = sorted(sources) if sources else None

    # ... and of every url ever seen for this record
    urls = {u for u in (primary.get("url"), other.get("url")) if u}
    urls.update(primary.get("all_urls") or [])
    urls.update(other.get("all_urls") or [])
    merged["all_urls"] = sorted(urls)

    # identity follows the merged fingerprint (title/agency/deadline)
    merged["id"] = _hash_fingerprint(merged.get("title", ""),
                                     merged.get("agency", ""),
                                     merged.get("deadline", ""))
    return merged
679
+
680
def _dedupe_and_merge(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Exact fingerprint + fuzzy near-dup consolidation across sources."""
    uniques: List[Dict[str, Any]] = []
    fp_index: Dict[str, int] = {}

    for row in rows:
        fingerprint = _hash_fingerprint(row.get("title", ""),
                                        row.get("agency", ""),
                                        row.get("deadline", ""))
        hit = fp_index.get(fingerprint)
        if hit is not None:
            # Exact duplicate: fold into the record already kept.
            uniques[hit] = _merge_records(uniques[hit], row)
            continue

        # Fuzzy scan against everything kept so far.
        match_idx = next(
            (i for i, kept in enumerate(uniques) if _near_duplicate(row, kept)),
            None,
        )
        if match_idx is None:
            fp_index[fingerprint] = len(uniques)
            # initialize provenance
            row.setdefault("all_urls", [row.get("url")] if row.get("url") else [])
            uniques.append(row)
        else:
            uniques[match_idx] = _merge_records(uniques[match_idx], row)
            # Re-index the merged record's fingerprint so later exact
            # matches land on it.
            merged = uniques[match_idx]
            fp_index[_hash_fingerprint(merged.get("title", ""),
                                       merged.get("agency", ""),
                                       merged.get("deadline", ""))] = match_idx

    return uniques
714
+
715
  # -------------------- Write docstore & build index --------------------
716
 
717
+ def _save_docstore(recs: List[Dict[str, Any]]) -> str:
718
  DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
719
  path = DOCSTORE_DIR / "docstore.jsonl"
720
  with path.open("w", encoding="utf-8") as f:
 
753
  "is_active": rec.get("is_active"),
754
  "program_number": rec.get("program_number"),
755
  "posted_date": rec.get("posted_date"),
756
+ "all_urls": rec.get("all_urls"),
757
  })
758
 
759
  print(f"[index] Rows loaded from docstore: {len(texts)}")
 
796
  def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
797
  """
798
  Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
799
+ applies filters (capacity / PA-MD), de-dupes, writes docstore, and builds the FAISS index.
800
  Returns (docstore_path, n_indexed).
801
  """
802
  cfg = load_config(cfg_path)
 
821
  typ = entry.get("type")
822
  rows: List[Dict[str, Any]] = []
823
 
824
+ # -------- Collect from each adapter --------
825
  if typ == "grantsgov_api":
826
  raw_hits = _collect_from_grantsgov_api(entry)
827
  rows = [normalize("grants_gov", h, static) for h in raw_hits]
 
829
  elif typ in ("web_page", "http_html"):
830
  rows = _collect_from_http_html(entry, name, static)
831
 
832
+ elif typ == "http_html_js":
833
+ rows = _collect_from_http_html_js(entry, name, static)
834
+
835
  elif typ == "http_pdf":
836
  rows = _collect_from_http_pdf(entry, name, static)
837
 
 
841
  items = blob.get("opportunities") or []
842
  rows = [normalize("local_sample", op, static) for op in items]
843
 
844
+ else:
845
+ print(f"[collect] {name}: unknown type '{typ}', skipping.")
846
+ continue
 
 
 
 
 
 
 
 
 
847
 
848
+ print(f"[collect] {name}: fetched_rows={len(rows)}")
849
+
850
+ # ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
851
+ if rows:
852
+ if entry.get("skip_filters"):
853
+ print(f"[filter] {name}: skip_filters=true β†’ keeping all {len(rows)}")
854
+ else:
855
+ pre = len(rows)
856
+ filtered = []
857
+ for r in rows:
858
+ t = _doc_text_from_row(r)
859
+ if capacity_only and not _is_capacity_building_text(t):
860
+ continue
861
+ if pa_md_only and not _is_pa_md_text(t):
862
+ continue
863
+ filtered.append(r)
864
+ print(
865
+ f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
866
+ f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
867
+ )
868
+ rows = filtered
869
+
870
+ print(f"[collect] {name} β†’ rows_after_filters={len(rows)}")
871
  all_rows.extend(rows)
872
 
873
+ # ---- Cross-source DEDUPE + MERGE ----
874
+ unique = _dedupe_and_merge(all_rows)
 
 
 
 
 
 
 
875
  print(f"[ingest] Unique records to index: {len(unique)}")
876
 
877
  path = _save_docstore(unique)
app/ingestors/http_html_js.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/ingestors/http_html_js.py
2
+ import asyncio, time
3
+ from typing import List, Dict, Any, Optional
4
+ from urllib.parse import urljoin
5
+ from playwright.async_api import async_playwright
6
+
7
+ DEFAULT_WAIT_MS = 3000
8
+
9
async def _scrape_page(page, url: str, wait_ms: int, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
    """Load *url* in *page* and extract one dict per matching card.

    selectors keys (all optional CSS): card, title, link, description, meta.
    Returns [] when no card selector is configured or nothing matches.
    """
    await page.goto(url, wait_until="domcontentloaded", timeout=45000)
    if wait_ms:
        # settle time for client-side rendering
        await page.wait_for_timeout(wait_ms)

    async def _inner_text(el, sel: str) -> str:
        # BUG FIX: query each selector once instead of twice; the old
        # check-then-requery pattern could AttributeError if the node
        # disappeared between the two queries.
        if not sel:
            return ""
        node = await el.query_selector(sel)
        return (await node.inner_text()).strip() if node else ""

    cards: List[Dict[str, Any]] = []
    card_sel = selectors.get("card", "")
    link_sel = selectors.get("link", "")

    elements = await page.query_selector_all(card_sel) if card_sel else []
    for el in elements:
        title = await _inner_text(el, selectors.get("title", ""))
        link_el = await el.query_selector(link_sel) if link_sel else None
        href = await link_el.get_attribute("href") if link_el else None
        link = urljoin(url, href) if href else url
        desc = await _inner_text(el, selectors.get("description", ""))
        meta = await _inner_text(el, selectors.get("meta", ""))  # e.g., location/focus

        if title or desc:
            cards.append({
                "title": title,
                "url": link,
                "summary": desc,
                "meta": meta
            })
    return cards
38
+
39
async def scrape_js_site(
    start_url: str,
    max_pages: int = 1,
    wait_ms: int = DEFAULT_WAIT_MS,
    selectors: Optional[Dict[str, str]] = None,
    next_selector: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Scrape a JS-rendered listing, optionally paging via *next_selector*.

    Args:
        start_url: first page to load.
        max_pages: upper bound on pages visited.
        wait_ms: settle time after each load for client-side rendering.
        selectors: CSS selectors passed through to _scrape_page.
        next_selector: CSS for the "next page" control; None -> single page.

    Returns a flat list of card dicts from all visited pages.
    """
    selectors = selectors or {}
    results: List[Dict[str, Any]] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        current_url = start_url
        for _ in range(max_pages):
            page_results = await _scrape_page(page, current_url, wait_ms, selectors)
            results.extend(page_results)
            if not next_selector:
                break
            next_btn = await page.query_selector(next_selector)
            if not next_btn:
                break
            await next_btn.click()
            await page.wait_for_timeout(800)  # polite delay
            # BUG FIX: advance to the pager's new URL. Previously current_url
            # was never updated, so every iteration re-loaded start_url and
            # re-scraped page 1 max_pages times.
            current_url = page.url
        await browser.close()
    return results
64
+
65
def ingest_http_html_js(cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    cfg example:
      {
        "url": "...",
        "options": {"wait_for": 3000, "max_pages": 3},
        "selectors": {
          "card": ".result-card",
          "title": ".card-title",
          "link": "a.card-link",
          "description": ".card-body",
          "meta": ".card-meta"
        },
        "next_selector": "a[rel=next]"
      }
    """
    opts = cfg.get("options", {})

    # Run event loop
    results = asyncio.run(scrape_js_site(
        cfg["url"],
        int(opts.get("max_pages", 1)),
        int(opts.get("wait_for", DEFAULT_WAIT_MS)),
        cfg.get("selectors", {}),
        cfg.get("next_selector"),
    ))

    # Normalize to your index schema
    return [
        {
            "title": r["title"] or "Untitled foundation",
            "url": r["url"],
            "body": f"{r.get('summary','')}\n{r.get('meta','')}",
            "source_type": "foundation_private",
            "geo": "US-MidAtlantic",
            "tags": ["faith-based", "foundation"],
        }
        for r in results
    ]
app/ranking/rerank.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/ranking/rerank.py
from sentence_transformers import SentenceTransformer, util

INTENT_TEXT = """
nonprofit 501(c)(3) capacity building, community outreach, reentry, youth,
food security, housing stabilization, violence prevention, mental health,
addiction recovery, faith-based programs, workforce, mentorship
"""

# BUG FIX: the model was loaded (and the intent text encoded) at import
# time, making `import app.ranking.rerank` slow and network-dependent.
# Both are now initialized lazily on first use.
_model = None
_intent_vec = None


def _ensure_model() -> None:
    """Load the embedding model and intent vector on first call only."""
    global _model, _intent_vec
    if _model is None:
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        _intent_vec = _model.encode(INTENT_TEXT, normalize_embeddings=True)


def embed_score(title: str, body: str) -> float:
    """Cosine similarity between (title + body) and the ministry intent text."""
    _ensure_model()
    text = f"{title}\n{body or ''}"
    v = _model.encode(text, normalize_embeddings=True)
    return float(util.cos_sim(_intent_vec, v).item())
app/ranking/rules.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/ranking/rules.py
# Keyword heuristics: +2 per ministry-relevant hint, -2 per research/defense hint.
INCLUDE_HINTS = [
    "community", "capacity building", "re-entry", "workforce", "housing",
    "human services", "addiction", "youth", "violence prevention",
    "nonprofit", "faith", "church", "outreach", "mentorship"
]
EXCLUDE_HINTS = [
    "r01", "r21", "sbir", "sttr", "lab solicitation", "postdoctoral",
    "basic research", "scoping study", "hypothesis", "principal investigator"
]


def rule_score(text: str) -> int:
    """Signed keyword score of *text*; higher means more ministry-relevant."""
    haystack = (text or "").lower()
    gains = 2 * sum(1 for hint in INCLUDE_HINTS if hint in haystack)
    losses = 2 * sum(1 for hint in EXCLUDE_HINTS if hint in haystack)
    return gains - losses
app/ranking/score.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# app/ranking/score.py
# BUG FIX: rule_score and embed_score were referenced but never imported,
# so calling confidence() raised NameError.
from app.ranking.rerank import embed_score
from app.ranking.rules import rule_score


def confidence(item) -> float:
    """Blend rule and embedding scores (plus small boosts) into [0, 1].

    Boosts: +0.05 when a deadline is present, +0.10 when eligibility
    includes "NONPROFIT_501C3".
    """
    rs = rule_score(item.get("body", "") + " " + item.get("title", ""))
    es = embed_score(item.get("title", ""), item.get("body", ""))
    dl_boost = 0.0
    if item.get("deadline"):
        dl_boost += 0.05
    if "NONPROFIT_501C3" in (item.get("eligibility") or []):
        dl_boost += 0.10
    # rule score is soft-normalized by 8 before blending with the embedding
    return max(0.0, min(1.0, 0.4 * es + 0.4 * (rs / 8.0) + dl_boost))
app/ui_streamlit.py CHANGED
@@ -8,99 +8,149 @@ if str(ROOT) not in sys.path:
8
 
9
  import os, json
10
  from pathlib import Path
 
 
 
11
 
12
  from app.main import get_env, ensure_index_exists
13
  from app.search import search
14
 
15
- import streamlit as st
 
16
 
 
17
  st.markdown("""
18
  <style>
19
- /* --- Global safety net: make default text dark --- */
20
- html, body, [class^="css"], [class*=" css"] {
21
- color: #0f172a !important; /* slate-900 */
 
 
 
 
 
22
  }
23
- /* --- Streamlit selectbox/multiselect (BaseWeb rendering) --- */
24
- div[data-baseweb="select"] * { color: #0f172a !important; }
25
- div[data-baseweb="select"] { background: #ffffff !important; border-color: #cbd5e1 !important; }
26
- /* placeholder inside the closed select */
27
- div[data-baseweb="select"] div[aria-hidden="true"] { color: #64748b !important; }
28
- /* open dropdown menu (BaseWeb popover) */
29
- div[data-baseweb="popover"] [role="listbox"], div[data-baseweb="menu"] { background: #ffffff !important; }
30
- div[data-baseweb="popover"] [role="option"], div[data-baseweb="menu"] li { color: #0f172a !important; background: #ffffff !important; }
31
- /* --- Alternative rendering (ARIA hooks) in newer Streamlit builds --- */
32
- div[role="button"][aria-haspopup="listbox"] * { color: #0f172a !important; }
33
- ul[role="listbox"] li, div[role="option"] { color: #0f172a !important; background: #ffffff !important; }
34
- /* --- Streamlit component wrappers --- */
35
- .stSelectbox, .stMultiSelect { color: #0f172a !important; }
36
- .stSelectbox div, .stMultiSelect div { color: #0f172a !important; }
37
- /* --- Hard reset in case a global rule set all <span> to white --- */
38
- span, li { color: inherit !important; }
39
- </style>
40
- """, unsafe_allow_html=True)
41
 
42
- # ── Streamlit config ──────────────────────────────────────────────────────────
43
- st.set_page_config(page_title="Grants Discovery App By Lupo", page_icon="🧭", layout="wide")
 
44
 
45
- # ── Theme & CSS (BLACK + ORANGE, dark selects) ────────────────────────────────
46
- st.markdown("""
47
- <style>
48
- /* App base */
49
- .stApp { background-color: #000000; color: #f8fafc; }
50
- /* Text defaults */
51
- html, body, [class*="css"], h1, h2, h3, h4, h5, h6, p, span, div { color: #f8fafc !important; }
52
- /* Accents */
53
- a, .stRadio > label, .stSlider label { color: #f97316 !important; }
54
  /* Buttons */
55
- .stButton>button { background:#f97316; color:#fff; border:none; border-radius:8px; padding:0.5rem 0.9rem; font-weight:600; }
 
 
 
56
  .stButton>button:hover { filter:brightness(1.1); }
57
- /* Text input */
58
- .stTextInput input { background:#111827 !important; color:#f8fafc !important; border:1px solid #334155 !important; }
59
- /* Closed control (select/multiselect) */
60
- .stSelectbox div[data-baseweb="select"], .stMultiSelect div[data-baseweb="select"],
61
- .stSelectbox div[role="combobox"], .stMultiSelect div[role="combobox"] {
62
- background-color:#1e293b !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:8px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
- /* Text & icons inside control */
65
- .stSelectbox div[data-baseweb="select"] div, .stMultiSelect div[data-baseweb="select"] div,
66
- .stSelectbox div[data-baseweb="select"] input, .stMultiSelect div[data-baseweb="select"] input,
67
- .stSelectbox svg, .stMultiSelect svg { color:#f8fafc !important; fill:#f8fafc !important; }
68
- /* Placeholder */
69
- .stSelectbox div[data-baseweb="select"] input::placeholder, .stMultiSelect div[data-baseweb="select"] input::placeholder { color:#94a3b8 !important; }
70
- /* Selected chips (multiselect) */
71
- .stMultiSelect [data-baseweb="tag"] { background-color:#334155 !important; color:#e2e8f0 !important; border-radius:999px !important; }
72
- /* Open dropdown menu */
73
- div[data-baseweb="menu"] { background-color:#0b1220 !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:10px !important; }
74
- div[data-baseweb="menu"] [role="option"] { background:transparent !important; color:#f8fafc !important; }
75
- div[data-baseweb="menu"] [role="option"]:hover { background:#1f2937 !important; }
76
- div[data-baseweb="menu"] [role="option"][aria-selected="true"] { background:#334155 !important; color:#f8fafc !important; }
77
- /* Result cards */
78
- .result-card { border:1px solid #1e293b; background:#1e293b; border-radius:14px; padding:16px; margin:10px 0; box-shadow:0 1px 2px rgba(0,0,0,0.2); }
79
- .result-meta { font-size:13px; color:#94a3b8; margin-top:6px; }
80
- span.chip { display:inline-block; padding:3px 8px; border-radius:999px; background:#334155; margin-right:6px; font-size:12px; color:#e2e8f0; }
81
- /* Compact hero (single, 240px) */
82
- .hero { height: 240px; border-radius: 16px; margin: 6px 0 16px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
84
  url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
85
  .hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
86
- .hero-text h1 { margin:0; font-size:28px; font-weight:700; color:#f97316; }
87
  .hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
88
- /* ===== FORCE DARK SELECT / MULTISELECT ===== */
89
- [data-testid="stSelectbox"] div[role="combobox"], [data-testid="stMultiSelect"] div[role="combobox"],
90
- div[role="combobox"][aria-haspopup="listbox"] { background-color:#1e293b !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:8px !important; }
91
- [data-testid="stSelectbox"] div[role="combobox"] input, [data-testid="stMultiSelect"] div[role="combobox"] input,
92
- div[role="combobox"] input { color:#f8fafc !important; }
93
- div[role="combobox"] input::placeholder { color:#94a3b8 !important; }
94
- div[role="combobox"] svg { color:#f8fafc !important; fill:#f8fafc !important; }
95
- [data-testid="stMultiSelect"] [data-baseweb="tag"], [data-testid="stMultiSelect"] [aria-label="remove"] { background-color:#334155 !important; color:#e2e8f0 !important; border-radius:999px !important; }
96
- div[role="listbox"], ul[role="listbox"], div[data-baseweb="menu"] { background-color:#0b1220 !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:10px !important; }
97
- [role="listbox"] [role="option"], div[data-baseweb="menu"] [role="option"] { background:transparent !important; color:#f8fafc !important; }
98
- [role="listbox"] [role="option"]:hover, div[data-baseweb="menu"] [role="option"]:hover { background:#1f2937 !important; }
99
- [role="listbox"] [role="option"][aria-selected="true"], div[data-baseweb="menu"] [role="option"][aria-selected="true"] { background:#334155 !important; color:#f8fafc !important; }
100
  </style>
101
  """, unsafe_allow_html=True)
102
 
103
- # ── Hero block (single) ───────────────────────────────────────────────────────
104
  st.markdown("""
105
  <div class="hero">
106
  <div class="hero-text">
@@ -110,27 +160,13 @@ st.markdown("""
110
  </div>
111
  """, unsafe_allow_html=True)
112
 
113
- # ── Hide developer diagnostics by default ─────────────────────────────────────
114
- SHOW_DEV = os.environ.get("SHOW_DEV") == "1"
115
-
116
- # ── Environment + index ───────────────────────────────────────────────────────
117
  _env = get_env()
118
  ensure_index_exists(_env)
119
 
120
- # ---------- helpers ----------
121
- def _dedup_records(rows):
122
- seen, out = set(), []
123
- for r in rows or []:
124
- k = r.get("id") or r.get("url") or r.get("title")
125
- if not k or k in seen:
126
- continue
127
- seen.add(k)
128
- out.append(r)
129
- return out
130
-
131
  def _norm_list(v):
132
- if v is None:
133
- return []
134
  if isinstance(v, str):
135
  parts = [p.strip() for p in v.replace(";", ",").split(",")]
136
  return [p.lower() for p in parts if p]
@@ -146,8 +182,7 @@ def _matches_filters(rec, geo_sel, cat_sel):
146
  return g_ok and c_ok
147
 
148
  def _ministry_filter(rows):
149
- if not rows:
150
- return rows
151
  banned_terms = [
152
  "broad agency announcement", "baa", "research", "r&d", "prototype",
153
  "laboratory", "university", "sbir", "sttr",
@@ -155,42 +190,40 @@ def _ministry_filter(rows):
155
  "w911", "n00014", "fa-", "afrl", "arpa"
156
  ]
157
  preferred_agencies = {
158
- "FTA", "HHS", "ACL", "USDA", "USDA-FNS", "USDA-RD", "DOL", "DOJ", "OJP", "OVW",
159
- "EDA", "HRSA", "SAMHSA", "CFPB", "HUD"
160
  }
161
- required_any_terms = [
162
- "vehicle", "van", "bus", "paratransit", "mobility",
163
- "congregate meals", "home-delivered meals", "senior nutrition",
164
- "food pantry", "food bank", "hunger relief", "refrigeration", "freezer",
165
- "community", "faith", "church", "ministry", "nonprofit",
166
- "reentry", "workforce", "case management", "technical assistance"
167
  ]
168
  def txt(r):
169
- return " ".join([
170
- str(r.get("title","")),
171
- str(r.get("synopsis") or r.get("summary") or ""),
172
- str(r.get("agency") or ""),
173
- ]).lower()
174
-
175
- kept = []
176
  for r in rows:
177
  t = txt(r)
178
- if any(b in t for b in banned_terms):
179
- continue
180
  agency = (r.get("agency") or "").upper()
181
- cats = [c.lower() for c in (r.get("categories") or [])]
182
- is_preferred_agency = any(agency.startswith(a) for a in preferred_agencies)
183
- has_ministry_cue = any(term in t for term in required_any_terms) or any(
184
  c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
185
  )
186
- if is_preferred_agency or has_ministry_cue:
187
  kept.append(r)
188
  return kept
189
 
 
 
 
 
 
190
  def _days_until(iso):
191
- from datetime import date, datetime
192
- if not iso:
193
- return None
194
  try:
195
  d = datetime.fromisoformat(str(iso)).date()
196
  return (d - date.today()).days
@@ -198,39 +231,13 @@ def _days_until(iso):
198
  return None
199
 
200
  def _deadline_badge(days_left):
201
- if days_left is None:
202
- return "🟦 TBD"
203
- if days_left < 0:
204
- return "⬛ Closed"
205
- if days_left <= 14:
206
- return f"πŸŸ₯ Due in {days_left}d"
207
- if days_left <= 30:
208
- return f"🟨 {days_left}d"
209
  return f"🟩 {days_left}d"
210
- # ---------- end helpers ----------
211
-
212
- # ---------- optional diagnostics ----------
213
- with st.expander("Diagnostics (optional)", expanded=False):
214
- idx = Path(_env["INDEX_DIR"])
215
- st.write("INDEX_DIR:", str(idx))
216
- st.write("faiss.index exists:", (idx / "faiss.index").exists())
217
- st.write("meta.json exists:", (idx / "meta.json").exists())
218
- if (idx / "meta.json").exists():
219
- try:
220
- meta = json.loads((idx / "meta.json").read_text())
221
- st.write("meta.json count:", len(meta))
222
- st.write("meta head:", [{"id": m.get("id"), "title": m.get("title")} for m in meta[:2]])
223
- except Exception as e:
224
- st.error(f"Failed to read meta.json: {e!r}")
225
- try:
226
- demo = search("transportation", _env, top_k=3, filters={})
227
- st.write("sample search('transportation') results:", len(demo))
228
- if demo:
229
- st.write(demo[:3])
230
- except Exception as e:
231
- st.error(f"search() raised: {e!r}")
232
- # ---------- end diagnostics ----------
233
 
 
234
  st.title("Grants Discovery RAG (Capacity Building)")
235
 
236
  preset = st.radio(
@@ -238,7 +245,6 @@ preset = st.radio(
238
  ["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
239
  horizontal=True
240
  )
241
-
242
  default_q = {
243
  "General": "capacity building",
244
  "Elderly": "capacity building for seniors and aging services",
@@ -248,7 +254,6 @@ default_q = {
248
  "FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
249
  }.get(preset, "capacity building")
250
 
251
- # --- controls ---
252
  q = st.text_input("Search query", value=default_q)
253
 
254
  geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
@@ -256,40 +261,38 @@ categories = st.multiselect(
256
  "Category filter (optional)",
257
  options=[
258
  "capacity_building","elderly","prison_ministry","evangelism",
259
- "transportation","vehicle",
260
- "justice","reentry","victim_services","youth","women","food","workforce"
261
  ],
262
  default=[]
263
  )
264
 
265
- top_k = st.slider("Results", 5, 50, 15)
 
 
266
  sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
267
  only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
268
  ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)
269
 
270
- # NEW: Sprint 2 view + agency facet
271
  view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)
272
- # pre-load agencies list (from meta.json when present)
 
273
  try:
274
- meta_for_agencies = json.loads(Path(_env["INDEX_DIR"], "meta.json").read_text())
275
  agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
276
  except Exception:
277
  agency_options = []
278
  sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])
279
 
280
- # Build backend filters (if the search() supports them)
281
  backend_filters = {}
282
  if geo: backend_filters["geo"] = geo
283
  if categories: backend_filters["categories"] = categories
284
  if sel_agencies: backend_filters["agency"] = sel_agencies
285
 
286
- # --- Sprint 2 session state for Save/Hide ---
287
- if "saved_ids" not in st.session_state:
288
- st.session_state.saved_ids = set()
289
- if "hidden_ids" not in st.session_state:
290
- st.session_state.hidden_ids = set()
291
 
292
- # action helpers
293
  def _save_item(item_id: str):
294
  st.session_state.saved_ids.add(item_id)
295
  st.session_state.hidden_ids.discard(item_id)
@@ -300,101 +303,77 @@ def _hide_item(item_id: str):
300
  st.session_state.saved_ids.discard(item_id)
301
  st.experimental_rerun()
302
 
303
- col1, col2 = st.columns([1, 1])
 
304
 
305
- with col1:
306
  if st.button("Search"):
307
  try:
308
- raw = search(q, _env, top_k=top_k, filters=backend_filters)
309
- dedup = _dedup_records(raw)
310
-
311
- # 1) Geo/Category client-side filter (fallback if backend ignores)
312
  if geo or categories:
313
- base_filtered = [r for r in dedup if _matches_filters(r, geo, categories)]
314
  else:
315
- base_filtered = dedup
316
 
317
- # 2) Only-open filter
318
- from datetime import date, datetime
319
  def _to_date_safe(val):
320
  if not val: return None
321
  try: return datetime.fromisoformat(str(val)).date()
322
  except Exception: return None
323
-
324
  open_filtered = base_filtered
325
  if only_open:
326
  open_filtered = [r for r in base_filtered
327
  if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]
328
 
329
- # 3) Agency filter (client-side, in case backend didn't apply)
330
  if sel_agencies:
331
  af = set(sel_agencies)
332
  open_filtered = [r for r in open_filtered if (r.get("agency") in af)]
333
 
334
- # 4) Ministry filter
335
  final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered
336
 
337
- # Clear/show hidden toggle mgmt
338
- if not ministry_focus and st.session_state.get("show_hidden"):
339
- st.session_state.pop("show_hidden", None)
340
-
341
- hidden_due_to_ministry = 0
342
- if ministry_focus:
343
- hidden_due_to_ministry = len(open_filtered) - len(final_results)
344
- st.session_state.pop("show_hidden", None)
345
-
346
  st.session_state["results"] = final_results
347
  st.session_state["last_query"] = q
348
  st.session_state["last_filters"] = {
349
- "geo": geo, "categories": categories,
350
- "only_open": only_open, "ministry_focus": ministry_focus,
351
- "agencies": sel_agencies,
352
  }
353
 
354
- st.success(
355
- f"Found {len(dedup)} total β€’ After geo/cat: {len(base_filtered)} β€’ "
356
- f"Open-only: {len(open_filtered)} β€’ Displaying: {len(final_results)}"
357
- + (f" β€’ Hidden by ministry filter: {hidden_due_to_ministry}" if ministry_focus else "")
358
- )
359
 
360
- if ministry_focus and hidden_due_to_ministry > 0:
361
- if st.checkbox(f"Show hidden items ({hidden_due_to_ministry})", value=False, key="show_hidden"):
362
- st.session_state["results"] = open_filtered
363
  except Exception as e:
364
  st.error(str(e))
365
 
366
-
367
- with col2:
368
  if st.button("Export Results to CSV"):
369
  results_for_export = st.session_state.get("results", [])
370
  if not results_for_export:
371
  st.warning("No results to export. Run a search first.")
372
  else:
373
- os.makedirs(_env["EXPORT_DIR"], exist_ok=True)
374
- out_path = os.path.join(_env["EXPORT_DIR"], "results.csv")
 
375
  import pandas as pd
376
  pd.DataFrame(results_for_export).to_csv(out_path, index=False)
377
  st.success(f"Exported to {out_path}")
378
 
379
  st.markdown("---")
380
 
381
- # ---- Sorting/filter helpers ----
382
- from datetime import date, datetime
383
- def _to_date(d):
384
- if not d: return None
385
- try: return datetime.fromisoformat(str(d)).date()
386
- except Exception: return None
387
-
388
- # ---- Render results ----
389
  results = st.session_state.get("results", [])
 
390
 
391
- # Apply "View" (All/Saved/Hidden)
392
  if view == "Saved":
393
  results = [r for r in results if r.get("id") in st.session_state.saved_ids]
394
  elif view == "Hidden":
395
  results = [r for r in results if r.get("id") in st.session_state.hidden_ids]
396
 
397
- # Apply sort if selected
398
  if sort_by.startswith("Deadline") and results:
399
  results.sort(
400
  key=lambda r: (
@@ -403,42 +382,175 @@ if sort_by.startswith("Deadline") and results:
403
  )
404
  )
405
 
406
- # Did the user run a search?
407
- ran_search = bool(st.session_state.get("last_query"))
408
-
409
- if results:
410
- st.caption(f"Results: {len(results)}")
411
- for r in results:
412
- title = r.get("title", "(no title)")
413
- url = r.get("url", "")
414
- cats = r.get("categories") or r.get("cats") or []
415
- geo_tags = r.get("geo") or []
416
- _id = r.get("id") or r.get("url") or title
417
-
418
- st.markdown(f"### {title}")
419
- st.write(f"**Source:** {r.get('source','')} | **Geo:** {', '.join(geo_tags) if isinstance(geo_tags, list) else geo_tags} | **Categories:** {', '.join(cats) if isinstance(cats, list) else cats}")
420
-
421
- # Link / score
422
- if url and not url.startswith("http"):
423
- st.caption("Note: This item may display an ID or number instead of a full link. Open on Grants.gov if needed.")
424
- st.write(f"[Open Link]({url}) \nScore: {r.get('score', 0):.3f}")
425
-
426
- # Deadline + badge
427
- posted = r.get("posted_date") or ""
428
- deadline = r.get("deadline") or ""
429
- days_left = _days_until(deadline)
430
- st.caption(f"Posted: {posted} β€’ Deadline: {deadline} β€’ {_deadline_badge(days_left)}")
431
-
432
- # Save / Hide buttons
433
- c1, c2, _ = st.columns([1,1,6])
434
- if c1.button(("βœ… Saved" if _id in st.session_state.saved_ids else "πŸ’Ύ Save"), key=f"save-{_id}"):
435
- _save_item(_id)
436
- if c2.button(("πŸ™ˆ Hidden" if _id in st.session_state.hidden_ids else "πŸ™ˆ Hide"), key=f"hide-{_id}"):
437
- _hide_item(_id)
438
-
439
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  else:
441
  if ran_search:
442
- st.info("No active grants match these filters right now. We’ll notify you when the next cycle opens.")
443
  else:
444
  st.info("Enter a query and click Search.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  import os, json
10
  from pathlib import Path
11
+ from datetime import date, datetime
12
+
13
+ import streamlit as st
14
 
15
  from app.main import get_env, ensure_index_exists
16
  from app.search import search
17
 
18
+ # ── Page config ───────────────────────────────────────────────────────────────
19
+ st.set_page_config(page_title="Grants Discovery App By Lupo", page_icon="🧭", layout="wide")
20
 
21
+ # ── THEME / CSS β€” single, unified block (dark app; readable controls) ─────────
22
  st.markdown("""
23
  <style>
24
+ /* App base */
25
+ :root {
26
+ --bg: #0a0f1a;
27
+ --panel: #121827;
28
+ --text: #e5eefb;
29
+ --muted: #95a3b8;
30
+ --accent: #f97316;
31
+ --border: #2b3a55;
32
  }
33
+ .stApp { background-color: var(--bg); color: var(--text); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ /* Typo & links */
36
+ html, body, [class*="st-"], h1,h2,h3,h4,h5,h6, p, span, div { color: var(--text) !important; }
37
+ a, .stRadio > label, .stSlider label { color: var(--accent) !important; }
38
 
 
 
 
 
 
 
 
 
 
39
  /* Buttons */
40
+ .stButton>button {
41
+ background: var(--accent); color:#fff; border:none; border-radius:10px;
42
+ padding: 0.5rem 0.9rem; font-weight:600;
43
+ }
44
  .stButton>button:hover { filter:brightness(1.1); }
45
+
46
+ /* Text inputs */
47
+ .stTextInput input, .stTextArea textarea {
48
+ background: var(--panel) !important; color: var(--text) !important;
49
+ border: 1px solid var(--border) !important; border-radius: 10px !important;
50
+ }
51
+
52
+ /* ===== FIXED: Select/Multiselect controls - HIGH CONTRAST ===== */
53
+
54
+ /* Labels above the controls */
55
+ [data-testid="stSelectbox"] label div,
56
+ [data-testid="stMultiSelect"] label div {
57
+ color: #e5eefb !important; /* Light text for dark background */
58
+ font-weight: 600;
59
+ }
60
+
61
+ /* Closed control (the combobox) */
62
+ [data-testid="stSelectbox"] div[role="combobox"],
63
+ [data-testid="stMultiSelect"] div[role="combobox"] {
64
+ background: #1e293b !important; /* Dark field */
65
+ color: #f8fafc !important; /* Light text - HIGH CONTRAST */
66
+ border: 1px solid #475569 !important;
67
+ border-radius: 10px !important;
68
+ font-weight: 500;
69
+ }
70
+
71
+ /* Text & icons inside the closed control */
72
+ [data-testid="stSelectbox"] div[role="combobox"] *,
73
+ [data-testid="stMultiSelect"] div[role="combobox"] * {
74
+ color: #f8fafc !important; /* Force light text */
75
+ fill: #f8fafc !important; /* Force light icons */
76
  }
77
+
78
+ /* Placeholder text */
79
+ [data-testid="stMultiSelect"] input::placeholder {
80
+ color: #94a3b8 !important; /* Muted but visible placeholder */
81
+ }
82
+
83
+ /* Multiselect chips */
84
+ [data-baseweb="tag"] {
85
+ background: #334155 !important;
86
+ color: #e2e8f0 !important; /* Light text on chips */
87
+ border-radius: 999px !important;
88
+ font-weight: 500;
89
+ }
90
+
91
+ /* Dropdown menu (popover) - DARK THEME */
92
+ div[data-baseweb="popover"] {
93
+ z-index: 999999 !important; /* Ensure it appears above everything */
94
+ }
95
+
96
+ div[data-baseweb="popover"] [role="listbox"],
97
+ div[data-baseweb="menu"],
98
+ ul[role="listbox"] {
99
+ background: #1e293b !important; /* Dark menu background */
100
+ color: #f8fafc !important; /* Light text - HIGH CONTRAST */
101
+ border: 1px solid #475569 !important;
102
+ border-radius: 10px !important;
103
+ }
104
+
105
+ /* Options in dropdown */
106
+ [role="listbox"] [role="option"],
107
+ div[data-baseweb="menu"] [role="option"] {
108
+ background: transparent !important;
109
+ color: #f8fafc !important; /* Light text */
110
+ font-weight: 500;
111
+ }
112
+
113
+ /* Hover state */
114
+ [role="listbox"] [role="option"]:hover,
115
+ div[data-baseweb="menu"] [role="option"]:hover {
116
+ background: #334155 !important; /* Slightly lighter on hover */
117
+ color: #ffffff !important;
118
+ }
119
+
120
+ /* Selected state */
121
+ [role="listbox"] [role="option"][aria-selected="true"],
122
+ div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
123
+ background: #475569 !important; /* Highlight selected */
124
+ color: #ffffff !important;
125
+ font-weight: 600;
126
+ }
127
+
128
+ /* Search input inside multiselect dropdown */
129
+ div[data-baseweb="popover"] input {
130
+ background: #0f172a !important;
131
+ color: #f8fafc !important;
132
+ border: 1px solid #475569 !important;
133
+ border-radius: 6px !important;
134
+ }
135
+
136
+ /* Cards */
137
+ .result-card { border:1px solid var(--border); background: var(--panel);
138
+ border-radius:14px; padding:16px; margin:10px 0; box-shadow:0 1px 2px rgba(0,0,0,0.2); }
139
+ .result-meta { font-size:13px; color: var(--muted); margin-top:6px; }
140
+ span.chip { display:inline-block; padding:3px 8px; border-radius:999px; background:#2a354a;
141
+ margin-right:6px; font-size:12px; color:var(--text); }
142
+
143
+ /* Hero */
144
+ .hero { height: 220px; border-radius: 16px; margin: 6px 0 16px;
145
  background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
146
  url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
147
  .hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
148
+ .hero-text h1 { margin:0; font-size:28px; font-weight:700; color: var(--accent); }
149
  .hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
 
 
 
 
 
 
 
 
 
 
 
 
150
  </style>
151
  """, unsafe_allow_html=True)
152
 
153
+ # ── Hero ──────────────────────────────────────────────────────────────────────
154
  st.markdown("""
155
  <div class="hero">
156
  <div class="hero-text">
 
160
  </div>
161
  """, unsafe_allow_html=True)
162
 
163
+ # ── Environment & index ───────────────────────────────────────────────────────
 
 
 
164
  _env = get_env()
165
  ensure_index_exists(_env)
166
 
167
+ # ── Helpers ───────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
168
  def _norm_list(v):
169
+ if v is None: return []
 
170
  if isinstance(v, str):
171
  parts = [p.strip() for p in v.replace(";", ",").split(",")]
172
  return [p.lower() for p in parts if p]
 
182
  return g_ok and c_ok
183
 
184
  def _ministry_filter(rows):
185
+ if not rows: return rows
 
186
  banned_terms = [
187
  "broad agency announcement", "baa", "research", "r&d", "prototype",
188
  "laboratory", "university", "sbir", "sttr",
 
190
  "w911", "n00014", "fa-", "afrl", "arpa"
191
  ]
192
  preferred_agencies = {
193
+ "FTA","HHS","ACL","USDA","USDA-FNS","USDA-RD","DOL","DOJ","OJP","OVW","EDA","HRSA","SAMHSA","CFPB","HUD"
 
194
  }
195
+ terms = [
196
+ "vehicle","van","bus","paratransit","mobility",
197
+ "congregate meals","home-delivered meals","senior nutrition",
198
+ "food pantry","food bank","hunger relief","refrigeration","freezer",
199
+ "community","faith","church","ministry","nonprofit",
200
+ "reentry","workforce","case management","technical assistance","capacity"
201
  ]
202
  def txt(r):
203
+ return " ".join([str(r.get("title","")),
204
+ str(r.get("synopsis") or r.get("summary") or ""),
205
+ str(r.get("agency") or "")]).lower()
206
+ kept=[]
 
 
 
207
  for r in rows:
208
  t = txt(r)
209
+ if any(b in t for b in banned_terms): continue
 
210
  agency = (r.get("agency") or "").upper()
211
+ cats = [c.lower() for c in (r.get("categories") or [])] if isinstance(r.get("categories"), list) else []
212
+ prefer = any(agency.startswith(a) for a in preferred_agencies)
213
+ has_cue = any(term in t for term in terms) or any(
214
  c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
215
  )
216
+ if prefer or has_cue:
217
  kept.append(r)
218
  return kept
219
 
220
def _to_date(d):
    """Parse an ISO-format date-ish value into a datetime.date.

    Returns None for falsy input or anything datetime.fromisoformat rejects.
    """
    if not d:
        return None
    try:
        return datetime.fromisoformat(str(d)).date()
    except Exception:
        # Unparseable value — treat as "no date" rather than crashing the UI.
        return None
224
+
225
  def _days_until(iso):
226
+ if not iso: return None
 
 
227
  try:
228
  d = datetime.fromisoformat(str(iso)).date()
229
  return (d - date.today()).days
 
231
  return None
232
 
233
def _deadline_badge(days_left):
    """Return a colored urgency badge string for a days-until-deadline count.

    None -> unknown deadline; negative -> closed; otherwise urgency tiers
    at 14 and 30 days.
    """
    if days_left is None:
        return "🟦 TBD"
    if days_left < 0:
        return "⬛ Closed"
    if days_left <= 14:
        return f"πŸŸ₯ Due in {days_left}d"
    if days_left <= 30:
        return f"🟨 {days_left}d"
    return f"🟩 {days_left}d"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
+ # ── UI: Presets & inputs ──────────────────────────────────────────────────────
241
  st.title("Grants Discovery RAG (Capacity Building)")
242
 
243
  preset = st.radio(
 
245
  ["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
246
  horizontal=True
247
  )
 
248
  default_q = {
249
  "General": "capacity building",
250
  "Elderly": "capacity building for seniors and aging services",
 
254
  "FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
255
  }.get(preset, "capacity building")
256
 
 
257
  q = st.text_input("Search query", value=default_q)
258
 
259
  geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
 
261
  "Category filter (optional)",
262
  options=[
263
  "capacity_building","elderly","prison_ministry","evangelism",
264
+ "transportation","vehicle","justice","reentry",
265
+ "victim_services","youth","women","food","workforce"
266
  ],
267
  default=[]
268
  )
269
 
270
+ # Fetch more so pagination is meaningful
271
+ top_k = st.slider("Fetch up to (results)", 50, 500, 200, step=50)
272
+
273
  sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
274
  only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
275
  ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)
276
 
 
277
  view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)
278
+
279
+ # Agencies facet from meta
280
  try:
281
+ meta_for_agencies = json.loads(Path(get_env()["INDEX_DIR"], "meta.json").read_text())
282
  agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
283
  except Exception:
284
  agency_options = []
285
  sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])
286
 
 
287
  backend_filters = {}
288
  if geo: backend_filters["geo"] = geo
289
  if categories: backend_filters["categories"] = categories
290
  if sel_agencies: backend_filters["agency"] = sel_agencies
291
 
292
+ # Sprint 2: Save/Hide state
293
+ if "saved_ids" not in st.session_state: st.session_state.saved_ids = set()
294
+ if "hidden_ids" not in st.session_state: st.session_state.hidden_ids = set()
 
 
295
 
 
296
  def _save_item(item_id: str):
297
  st.session_state.saved_ids.add(item_id)
298
  st.session_state.hidden_ids.discard(item_id)
 
303
  st.session_state.saved_ids.discard(item_id)
304
  st.experimental_rerun()
305
 
306
+ # ── Search & filter pipeline (stores full result set) ─────────────────────────
307
+ c1, c2 = st.columns([1,1])
308
 
309
+ with c1:
310
  if st.button("Search"):
311
  try:
312
+ raw = search(q, get_env(), top_k=top_k, filters=backend_filters) # fetch many
313
+ # Geo/Category client-side fallback
 
 
314
  if geo or categories:
315
+ base_filtered = [r for r in raw if _matches_filters(r, geo, categories)]
316
  else:
317
+ base_filtered = raw
318
 
319
+ # Only open
 
320
  def _to_date_safe(val):
321
  if not val: return None
322
  try: return datetime.fromisoformat(str(val)).date()
323
  except Exception: return None
 
324
  open_filtered = base_filtered
325
  if only_open:
326
  open_filtered = [r for r in base_filtered
327
  if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]
328
 
329
+ # Agency
330
  if sel_agencies:
331
  af = set(sel_agencies)
332
  open_filtered = [r for r in open_filtered if (r.get("agency") in af)]
333
 
334
+ # Ministry
335
  final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered
336
 
 
 
 
 
 
 
 
 
 
337
  st.session_state["results"] = final_results
338
  st.session_state["last_query"] = q
339
  st.session_state["last_filters"] = {
340
+ "geo": geo, "categories": categories, "only_open": only_open,
341
+ "ministry_focus": ministry_focus, "agencies": sel_agencies,
 
342
  }
343
 
344
+ # RESET PAGINATION on new run
345
+ st.session_state.page = 1
 
 
 
346
 
347
+ st.success(f"Fetched {len(raw)} β€’ After filters: {len(final_results)}")
 
 
348
  except Exception as e:
349
  st.error(str(e))
350
 
351
+ with c2:
 
352
  if st.button("Export Results to CSV"):
353
  results_for_export = st.session_state.get("results", [])
354
  if not results_for_export:
355
  st.warning("No results to export. Run a search first.")
356
  else:
357
+ out_dir = get_env()["EXPORT_DIR"]
358
+ os.makedirs(out_dir, exist_ok=True)
359
+ out_path = os.path.join(out_dir, "results.csv")
360
  import pandas as pd
361
  pd.DataFrame(results_for_export).to_csv(out_path, index=False)
362
  st.success(f"Exported to {out_path}")
363
 
364
  st.markdown("---")
365
 
366
+ # ── Post-search view/sort/pagination (5.4) ────────────────────────────────────
 
 
 
 
 
 
 
367
  results = st.session_state.get("results", [])
368
+ ran_search = bool(st.session_state.get("last_query"))
369
 
370
+ # View filter
371
  if view == "Saved":
372
  results = [r for r in results if r.get("id") in st.session_state.saved_ids]
373
  elif view == "Hidden":
374
  results = [r for r in results if r.get("id") in st.session_state.hidden_ids]
375
 
376
+ # Sort
377
  if sort_by.startswith("Deadline") and results:
378
  results.sort(
379
  key=lambda r: (
 
382
  )
383
  )
384
 
385
# Pagination state — stored in st.session_state so it survives Streamlit reruns.
if "page_size" not in st.session_state:
    st.session_state.page_size = 25
if "page" not in st.session_state:
    st.session_state.page = 1

total = len(results)
st.caption(f"Results: {total}")

# Controls: page size selector + page number input.
cols = st.columns([1,1,2,2,2])
with cols[0]:
    # NOTE(review): index=1 re-applies the 25 default on every rerun, so a
    # previously chosen page size is not restored from session_state — confirm intended.
    page_size = st.selectbox("Page size", [10, 25, 50, 100], index=1)
    st.session_state.page_size = page_size
# compute pages (ceiling division; always at least 1 page, even with 0 results)
total_pages = max(1, (total + page_size - 1) // page_size)
with cols[1]:
    # Clamp the stored page to the new total so shrinking results can't
    # leave us on a page past the end.
    page = st.number_input("Page", min_value=1, max_value=total_pages,
                           value=min(st.session_state.page, total_pages), step=1)
    st.session_state.page = page

# Slice AFTER filters & sort, so the page reflects the final ordering.
start = (st.session_state.page - 1) * st.session_state.page_size
end = min(start + st.session_state.page_size, total)
page_items = results[start:end]
st.caption(f"Showing {start+1 if total else 0}–{end} of {total} β€’ Page {st.session_state.page}/{total_pages}")

# Nav buttons — experimental_rerun() forces an immediate redraw with the new page.
prev_col, _, next_col = st.columns([1,6,1])
with prev_col:
    if st.button("β—€ Prev", disabled=(st.session_state.page <= 1)):
        st.session_state.page = max(1, st.session_state.page - 1)
        st.experimental_rerun()
with next_col:
    if st.button("Next β–Ά", disabled=(st.session_state.page >= total_pages)):
        st.session_state.page = min(total_pages, st.session_state.page + 1)
        st.experimental_rerun()
422
+
423
+ # ── Render page items ─────────────────────────────────────────────────────────
424
def _render_card(r):
    """Render one opportunity record as a styled result card with Save/Hide actions."""
    card_title = r.get("title", "(no title)")
    link = r.get("url", "")
    cat_values = r.get("categories") or r.get("cats") or []
    geo_values = r.get("geo") or []
    item_id = r.get("id") or r.get("url") or card_title
    posted_on = r.get("posted_date") or ""
    due_on = r.get("deadline") or ""

    geo_txt = ", ".join(geo_values) if isinstance(geo_values, list) else geo_values
    cat_txt = ", ".join(cat_values) if isinstance(cat_values, list) else cat_values

    st.markdown("<div class='result-card'>", unsafe_allow_html=True)
    st.markdown(f"### {card_title}")
    meta = (
        f"**Source:** {r.get('source','')} β€’ **Geo:** {geo_txt}"
        f" β€’ **Categories:** {cat_txt}"
    )
    st.markdown(f"<div class='result-meta'>{meta}</div>", unsafe_allow_html=True)

    # Link / score
    if link and not link.startswith('http'):
        st.caption("Note: This item may display an ID instead of a full link. Open on Grants.gov if needed.")
    if link:
        st.write(f"[Open Link]({link})")
    if r.get("score") is not None:
        st.caption(f"Score: {r.get('score', 0):.3f}")

    # Deadline badge
    st.caption(f"Posted: {posted_on} β€’ Deadline: {due_on} β€’ {_deadline_badge(_days_until(due_on))}")

    # Save / Hide actions (labels reflect current session state)
    save_col, hide_col, _ = st.columns([1,1,6])
    save_label = "βœ… Saved" if item_id in st.session_state.saved_ids else "πŸ’Ύ Save"
    if save_col.button(save_label, key=f"save-{item_id}"):
        _save_item(item_id)
    hide_label = "πŸ™ˆ Hidden" if item_id in st.session_state.hidden_ids else "πŸ™ˆ Hide"
    if hide_col.button(hide_label, key=f"hide-{item_id}"):
        _hide_item(item_id)

    st.markdown("</div>", unsafe_allow_html=True)
458
+
459
# Render the current page of results; otherwise show the appropriate
# empty-state message depending on whether a search was ever run.
if page_items:
    for r in page_items:
        _render_card(r)
else:
    if ran_search:
        st.info("No active grants match these filters right now.")
    else:
        st.info("Enter a query and click Search.")
467
+
468
+ st.markdown("""
469
+ <style>
470
+ /* ================== SELECT/MULTISELECT HARD OVERRIDE ================== */
471
+ /* Goal: kill white-on-white by styling the BaseWeb select root + portal. */
472
+ /* Works across Chrome/Safari/Firefox; includes -webkit-text-fill-color fix. */
473
+
474
+ /* 1) CLOSED CONTROL (the visible field) β€” target the BaseWeb root */
475
+ body div[data-baseweb="select"] {
476
+ background: #1e293b !important; /* dark field */
477
+ color: #f8fafc !important; /* light text */
478
+ border: 1px solid #475569 !important;
479
+ border-radius: 10px !important;
480
+ }
481
+
482
+ /* Make absolutely everything inside readable (some builds render text in spans) */
483
+ body div[data-baseweb="select"] * {
484
+ color: #f8fafc !important;
485
+ -webkit-text-fill-color: #f8fafc !important; /* Safari/Chromium quirk */
486
+ fill: #f8fafc !important;
487
+ }
488
+
489
+ /* Placeholder node BaseWeb renders (aria-hidden) */
490
+ body div[data-baseweb="select"] div[aria-hidden="true"] {
491
+ color: #94a3b8 !important;
492
+ -webkit-text-fill-color: #94a3b8 !important;
493
+ }
494
+
495
+ /* Ensure the actual input inherits readable color */
496
+ body div[data-baseweb="select"] input {
497
+ color: #f8fafc !important;
498
+ -webkit-text-fill-color: #f8fafc !important;
499
+ caret-color: #f8fafc !important;
500
+ background: transparent !important;
501
+ }
502
+
503
+ /* 2) OPEN DROPDOWN MENU (lives in a portal under <body>) */
504
+ body div[data-baseweb="popover"] [role="listbox"],
505
+ body div[data-baseweb="menu"],
506
+ body ul[role="listbox"] {
507
+ background: #1e293b !important;
508
+ color: #f8fafc !important;
509
+ border: 1px solid #475569 !important;
510
+ border-radius: 10px !important;
511
+ z-index: 2147483647 !important;
512
+ }
513
+
514
+ /* Options inside the menu */
515
+ body [role="listbox"] [role="option"],
516
+ body div[data-baseweb="menu"] [role="option"] {
517
+ background: transparent !important;
518
+ color: #f8fafc !important;
519
+ }
520
+ body [role="listbox"] [role="option"]:hover,
521
+ body div[data-baseweb="menu"] [role="option"]:hover {
522
+ background: #334155 !important;
523
+ }
524
+ body [role="listbox"] [role="option"][aria-selected="true"],
525
+ body div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
526
+ background: #475569 !important;
527
+ color: #ffffff !important;
528
+ }
529
+
530
+ /* 3) EMERGENCY FALLBACK β€” if a theme forces a white menu inline, flip text dark */
531
+ body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"],
532
+ body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] {
533
+ background: #ffffff !important;
534
+ color: #0f172a !important;
535
+ border: 1px solid #cbd5e1 !important;
536
+ }
537
+ body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"] * ,
538
+ body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] * {
539
+ color: #0f172a !important;
540
+ -webkit-text-fill-color: #0f172a !important;
541
+ }
542
+
543
+ /* 4) MULTISELECT CHIPS */
544
+ body [data-baseweb="tag"] {
545
+ background: #334155 !important;
546
+ color: #e2e8f0 !important;
547
+ border-radius: 999px !important;
548
+ }
549
+
550
+ /* 5) OPTIONAL: turn on outlines once to verify the selector match (debug)
551
+ body div[data-baseweb="select"] { outline: 1px dashed #22d3ee !important; }
552
+ body div[data-baseweb="popover"] [role="listbox"] { outline: 1px dashed #22d3ee !important; }
553
+ */
554
+ </style>
555
+ """, unsafe_allow_html=True)
556
+
app/utils/dedupe.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, hashlib
2
+ from difflib import SequenceMatcher
3
+
4
+ def _norm(text: str) -> str:
5
+ """Lowercase, strip punctuation, collapse whitespace."""
6
+ t = (text or "").lower()
7
+ t = re.sub(r'[^a-z0-9 ]+', ' ', t)
8
+ return re.sub(r'\s+', ' ', t).strip()
9
+
10
+ def hash_fingerprint(title: str, agency: str, deadline: str) -> str:
11
+ """
12
+ A strong key: normalized title + agency + deadline.
13
+ Use this as a primary key in your datastore.
14
+ """
15
+ base = f"{_norm(title)}|{_norm(agency)}|{deadline or ''}"
16
+ return hashlib.sha1(base.encode()).hexdigest()
17
+
18
+ def near_duplicate(a: dict, b: dict) -> bool:
19
+ """
20
+ Fuzzy fallback: similar title & agency,
21
+ and deadlines match or are both blank.
22
+ """
23
+ dates_close = (a.get("deadline") == b.get("deadline")) \
24
+ or (not a.get("deadline") and not b.get("deadline"))
25
+ t_sim = SequenceMatcher(None, _norm(a.get("title","")),
26
+ _norm(b.get("title",""))).ratio()
27
+ ag_sim = SequenceMatcher(None, _norm(a.get("agency","")),
28
+ _norm(b.get("agency",""))).ratio()
29
+ return dates_close and (t_sim > 0.88) and (ag_sim > 0.75)
app/utils/normalize.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/utils/normalize.py
2
+ ELIGIBILITY_MAP = {
3
+ "nonprofit": "NONPROFIT_501C3",
4
+ "501(c)(3)": "NONPROFIT_501C3",
5
+ "local government": "LOCAL_GOV",
6
+ "state government": "STATE_GOV",
7
+ "higher education": "HIGHER_ED",
8
+ }
9
+ def normalize_eligibility(raw: str) -> list[str]:
10
+ vals = []
11
+ txt = (raw or "").lower()
12
+ for k,v in ELIGIBILITY_MAP.items():
13
+ if k in txt:
14
+ vals.append(v)
15
+ return sorted(set(vals)) or ["UNKNOWN"]
config/sources.yaml CHANGED
@@ -1,6 +1,6 @@
1
  # Minimal, valid config β€” v6.3
2
  filters:
3
- capacity_only: true # keep only capacity-building items
4
  pa_md_only: false # set to true to restrict index to PA/MD
5
 
6
  sources:
@@ -46,7 +46,6 @@ sources:
46
  page_size: 100
47
  max_pages: 3
48
  payload:
49
- # Target 5310 by ALN and keywords
50
  aln: "20.513"
51
  keyword: "\"Enhanced Mobility\" OR \"Section 5310\" OR seniors OR elderly OR disabilities OR paratransit OR wheelchair OR shuttle OR van OR bus"
52
  oppStatuses: "posted"
@@ -86,9 +85,27 @@ sources:
86
  fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
87
  sortBy: "openDate|desc"
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
90
- # NOTE: These require adapters (http_html/web_page/http_pdf) you haven't implemented yet.
91
- # They are kept here (enabled) in case your runtime supports them; otherwise set enabled: false.
92
 
93
  - name: "Maryland MTA β€” Grants (incl. 5310)"
94
  type: web_page
@@ -162,7 +179,7 @@ sources:
162
  mode: "article"
163
  keep_links: true
164
 
165
- # --- Pennsylvania: PCA (state arts) ---
166
  - name: "PA Creative Industries – Capacity Building (landing)"
167
  type: http_html
168
  enabled: true
@@ -226,22 +243,30 @@ sources:
226
  geo: "PA"
227
  categories: ["capacity_building"]
228
 
229
- # --- Maryland: OneStop (statewide grant listings with 'capacity' search) ---
230
- - name: "Maryland OneStop – Capacity search"
231
- type: http_html
232
  enabled: true
233
  url: "https://onestop.md.gov/search?query=capacity"
234
  geo: "MD"
235
  categories: ["capacity_building"]
 
 
 
 
 
 
 
 
236
  parse:
237
  follow_links: true
238
  link_selectors:
239
  - "a[href*='/forms/']"
240
  - "a[href*='/search/']"
241
  content_selectors:
 
242
  - "main"
243
  - "article"
244
- - "[role='main']"
245
 
246
  # --- Maryland: DHCD (housing/community programs & press) ---
247
  - name: "MD DHCD – Programs (grants & loans index)"
@@ -304,6 +329,29 @@ sources:
304
  geo: "MD"
305
  categories: ["capacity_building"]
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
308
  - name: "State 5310 Listings (curated JSON)"
309
  type: json_static
@@ -311,3 +359,27 @@ sources:
311
  file: "data/state_5310_listings.json"
312
  geo: "PA|MD|VA|DC"
313
  categories: ["transportation","elderly","disabilities","5310","deadlines"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Minimal, valid config β€” v6.3
2
  filters:
3
+ capacity_only: false # keep only capacity-building items
4
  pa_md_only: false # set to true to restrict index to PA/MD
5
 
6
  sources:
 
46
  page_size: 100
47
  max_pages: 3
48
  payload:
 
49
  aln: "20.513"
50
  keyword: "\"Enhanced Mobility\" OR \"Section 5310\" OR seniors OR elderly OR disabilities OR paratransit OR wheelchair OR shuttle OR van OR bus"
51
  oppStatuses: "posted"
 
85
  fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
86
  sortBy: "openDate|desc"
87
 
88
+ # ---------- FEDERAL: Federal Register (broad NOFO scanning) ----------
89
+
90
+ - name: "Federal Register β€” Funding/NOFO keywords (API)"
91
+ type: http_json
92
+ enabled: true
93
+ url: "https://www.federalregister.gov/api/v1/documents.json"
94
+ geo: "US"
95
+ categories: ["capacity_building", "notices"]
96
+ api:
97
+ payload:
98
+ conditions[term]: "funding opportunity OR cooperative agreement OR NOFO"
99
+ per_page: 50
100
+ order: "newest"
101
+ parse:
102
+ item_path: "results[]"
103
+ title: "title"
104
+ link: "html_url"
105
+ published_at: "publication_date"
106
+ body: "abstract"
107
+
108
  # ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
 
 
109
 
110
  - name: "Maryland MTA β€” Grants (incl. 5310)"
111
  type: web_page
 
179
  mode: "article"
180
  keep_links: true
181
 
182
+ # --- Pennsylvania: PA Creative Industries (PCA) ---
183
  - name: "PA Creative Industries – Capacity Building (landing)"
184
  type: http_html
185
  enabled: true
 
243
  geo: "PA"
244
  categories: ["capacity_building"]
245
 
246
+ # --- Maryland: OneStop (JS-rendered search) ---
247
+ - name: "Maryland OneStop – Capacity search (JS)"
248
+ type: http_html_js # Playwright adapter
249
  enabled: true
250
  url: "https://onestop.md.gov/search?query=capacity"
251
  geo: "MD"
252
  categories: ["capacity_building"]
253
+ options:
254
+ wait_for: "[role='main']"
255
+ scroll: true
256
+ max_pages: 3
257
+ timeout_ms: 180000 # NEW: longer timeout for SPA
258
+ network_idle: true # NEW: wait for background XHR/fetch to settle
259
+ # debug: true # optional: screenshot on failure
260
+ # click_selector: "a[aria-label='Next']" # uncomment if pagination controls appear
261
  parse:
262
  follow_links: true
263
  link_selectors:
264
  - "a[href*='/forms/']"
265
  - "a[href*='/search/']"
266
  content_selectors:
267
+ - "[role='main']"
268
  - "main"
269
  - "article"
 
270
 
271
  # --- Maryland: DHCD (housing/community programs & press) ---
272
  - name: "MD DHCD – Programs (grants & loans index)"
 
329
  geo: "MD"
330
  categories: ["capacity_building"]
331
 
332
+ # --- Pennsylvania: DCED (Programs index; JS-rendered) ---
333
+ - name: "PA DCED β€” Programs (JS)"
334
+ type: http_html_js
335
+ enabled: true
336
+ url: "https://dced.pa.gov/programs/"
337
+ geo: "PA"
338
+ categories: ["capacity_building","community_development","economic_development"]
339
+ options:
340
+ wait_for: "main"
341
+ scroll: true
342
+ max_pages: 5
343
+ timeout_ms: 180000 # NEW
344
+ network_idle: true # NEW
345
+ # click_selector: ".pagination a.next"
346
+ # debug: true
347
+ parse:
348
+ item_selector: ".program-listing .program, .content" # fallback
349
+ title: ".program-title, h1, h2"
350
+ link: ".program-title a@href, a@href"
351
+ body: ".program-summary, .entry-content, main"
352
+ deadline_selector: ".deadline, .key-dates"
353
+ eligibility_selector: ".eligibility, .who-eligible"
354
+
355
  # ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
356
  - name: "State 5310 Listings (curated JSON)"
357
  type: json_static
 
359
  file: "data/state_5310_listings.json"
360
  geo: "PA|MD|VA|DC"
361
  categories: ["transportation","elderly","disabilities","5310","deadlines"]
362
+
363
+ - name: "Faith-based Foundations β€” Card/Grid (JS)"
364
+ type: http_html_js
365
+ enabled: true
366
+ skip_filters: true
367
+ url: "https://example.org/foundations/maryland/religion-related"
368
+ geo: "MD|PA|DE|NJ|VA"
369
+ categories: ["foundation_private","faith_based","capacity_building"]
370
+ options:
371
+ wait_for: "[role='main']" # or the results container CSS
372
+ scroll: true
373
+ scroll_selector: ".results-pane" # ← replace with the REAL scrolling DIV
374
+ scroll_times: 40
375
+ scroll_wait_ms: 250
376
+ min_cards: 20
377
+ timeout_ms: 30000
378
+ network_idle: false
379
+ # click_selector: ".pagination a.next" # only if the page has a Next button
380
+ selectors:
381
+ card: ".result-card, .card, article, .search-result"
382
+ title: "h2 a, h3 a, .card-title a, .result-title a, h2, h3, .card-title"
383
+ link: "h2 a, h3 a, .card-title a, .result-title a, a"
384
+ description: ".summary, .card-text, .excerpt, p"
385
+ meta: ".meta, .tags, .badge, .location"