Commit b53e303
Parent: cd54f1c
feat(ingest): JS card/grid + scroll container + skip_filters
feat(ui): raise top_k & results paging
chore: add ingestors/ranking/utils modules
fix: YAML selectors & SPA wait tuning
- .gitignore +24 -0
- app/ingest.py +364 -25
- app/ingestors/http_html_js.py +102 -0
- app/ranking/rerank.py +17 -0
- app/ranking/rules.py +17 -0
- app/ranking/score.py +8 -0
- app/ui_streamlit.py +356 -244
- app/utils/dedupe.py +29 -0
- app/utils/normalize.py +15 -0
- config/sources.yaml +81 -9
.gitignore
CHANGED
@@ -30,3 +30,27 @@ runtime/
 data/exports/
 *.env*
 .env
+# Python
+__pycache__/
+*.pyc
+.venv/
+venv/
+*.egg-info/
+
+# App data/output
+data/
+snapshots/
+app/static/
+*.log
+
+# Local/playwright artifacts
+playwright/.cache/
+*.png
+
+# Editor
+.vscode/
+.DS_Store
+
+# Backups
+*.bak
+*.bak.py
app/ingest.py
CHANGED
@@ -17,6 +17,9 @@ import hashlib
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone
+from difflib import SequenceMatcher
+from urllib.parse import urljoin
+
 
 # -------------------- Config --------------------
 
@@ -183,7 +186,7 @@ def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
     hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
     return [h for h in hits if isinstance(h, dict)]
 
-# --------------------
+# -------------------- HTTP helpers --------------------
 
 _HTTP_HEADERS = {
     "User-Agent": "grants-rag/1.0 (+https://example.local) requests",
@@ -280,6 +283,8 @@ def _normalize_web_record(
     rec["is_active"] = _compute_is_active(rec["deadline"])
     return rec
 
+# -------------------- Collectors: http_html / web_page --------------------
+
 def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
     """
     Supports types: 'web_page' and 'http_html'
@@ -350,6 +355,219 @@ def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
 
     return rows
 
+# -------------------- Collector: http_html_js (Playwright) --------------------
+
+def _collect_from_http_html_js(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    JS-rendered pages using Playwright, with per-card extraction and robust scrolling.
+    entry.options:
+      - wait_for (css or ms int)
+      - scroll (bool)
+      - scroll_selector (css)   # NEW: scroll a container div, not the window
+      - scroll_times (int)      # NEW: default 20
+      - scroll_wait_ms (int)    # NEW: default 400
+      - min_cards (int)         # NEW: wait until at least N cards exist
+      - click_selector (css)
+      - max_pages (int)
+      - timeout_ms (int)
+      - network_idle (bool)
+      - debug (bool)
+    entry.selectors: card, title, link, description, meta
+    """
+    try:
+        from playwright.sync_api import sync_playwright  # type: ignore
+    except Exception:
+        print(f"[collect][skip] {source_name}: Playwright not installed.")
+        return []
+
+    url = entry.get("url")
+    if not url:
+        return []
+
+    options = entry.get("options", {}) or {}
+    parse = entry.get("parse", {}) or entry.get("extract", {}) or {}
+    selectors = entry.get("selectors", {}) or {}
+    content_selectors = parse.get("content_selectors") or []
+
+    timeout_ms = int(options.get("timeout_ms", 6000))
+    network_idle = bool(options.get("network_idle", True))
+    debug = bool(options.get("debug", False))
+    max_pages = int(options.get("max_pages", 1))
+    click_sel = options.get("click_selector") or entry.get("next_selector")
+    wait_for = options.get("wait_for")
+
+    rows: List[Dict[str, Any]] = []
+
+    def _text_first(soup: BeautifulSoup, css_list: str) -> str:
+        if not css_list:
+            return ""
+        for css in [c.strip() for c in css_list.split(",")]:
+            el = soup.select_one(css)
+            if el:
+                txt = el.get_text(separator=" ", strip=True)
+                if txt:
+                    return txt
+        return ""
+
+    def _attr_first(soup: BeautifulSoup, css_list: str, attr: str) -> Optional[str]:
+        if not css_list:
+            return None
+        for css in [c.strip() for c in css_list.split(",")]:
+            el = soup.select_one(css)
+            if el:
+                val = el.get(attr)
+                if val:
+                    return val
+        return None
+
+    def _parse_cards(page_html: str, base_url: str) -> List[Dict[str, Any]]:
+        s = _soup(page_html)
+        card_css = selectors.get("card", "")
+        if not card_css:
+            title, body = _text_from_soup(s, content_selectors)
+            return [_normalize_web_record(source_name, base_url, title, body, static, extra={"posted_date": None})]
+
+        out: List[Dict[str, Any]] = []
+        for card in s.select(card_css) or []:
+            csoup = BeautifulSoup(str(card), "lxml")
+            title = _text_first(csoup, selectors.get("title", "h1, h2, h3"))
+            href = _attr_first(csoup, selectors.get("link", "a"), "href")
+            link = urljoin(base_url, href) if href else base_url
+            desc = _text_first(csoup, selectors.get("description", "p, .summary, .excerpt, .card-text"))
+            meta = _text_first(csoup, selectors.get("meta", ".meta, .tags, .badge, .location"))
+            body = "\n".join([p for p in (desc, meta) if p]).strip()
+            if not (title or body):
+                continue
+            out.append(_normalize_web_record(source_name, link, title or link, body, static, extra={"posted_date": None}))
+        return out
+
+    def _wait_page_ready(page, *, wait_for, timeout_ms, options, selectors):
+        # wait_for can be CSS or milliseconds
+        if isinstance(wait_for, int):
+            page.wait_for_timeout(wait_for)
+        elif isinstance(wait_for, str) and wait_for:
+            page.wait_for_selector(wait_for, timeout=min(timeout_ms, 15000))
+
+        # Scroll window or a container div
+        if options.get("scroll"):
+            scroll_sel = options.get("scroll_selector")
+            scroll_times = int(options.get("scroll_times", 20))
+            scroll_wait = int(options.get("scroll_wait_ms", 400))
+            if scroll_sel:
+                page.evaluate(
+                    """(sel, times, wait) => new Promise(res => {
+                        const el = document.querySelector(sel);
+                        if (!el) { res(); return; }
+                        let i = 0;
+                        const t = setInterval(() => {
+                            const y = el.scrollTop;
+                            el.scrollTop = el.scrollHeight;
+                            i++;
+                            if (el.scrollTop === y || i >= times) { clearInterval(t); res(); }
+                        }, wait);
+                    })""",
+                    scroll_sel, scroll_times, scroll_wait
+                )
+            else:
+                page.evaluate(
+                    """(times, wait) => new Promise(res => {
+                        let i = 0;
+                        const t = setInterval(() => {
+                            const y = window.scrollY;
+                            window.scrollBy(0, document.body.scrollHeight);
+                            i++;
+                            if (window.scrollY === y || i >= times) { clearInterval(t); res(); }
+                        }, wait);
+                    })""",
+                    scroll_times, scroll_wait
+                )
+
+        # Optionally wait for a minimum number of cards (virtualized lists)
+        min_cards = int(options.get("min_cards", 0))
+        card_css = (selectors or {}).get("card", "")
+        if min_cards and card_css:
+            try:
+                page.wait_for_function(
+                    """([sel, target]) => {
+                        const els = document.querySelectorAll(sel);
+                        return els && els.length >= target;
+                    }""",
+                    arg=[card_css, min_cards],
+                    timeout=min(timeout_ms, 15000),
+                )
+            except Exception:
+                pass  # best-effort
+
+    def _try_once(p):
+        def _route(route):
+            r = route.request
+            if r.resource_type in {"image", "media", "font"}:
+                return route.abort()
+            return route.continue_()
+
+        browser = p.chromium.launch(headless=not debug)
+        context = browser.new_context(user_agent=(
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
+        ))
+        context.route("**/*", _route)
+        page = context.new_page()
+        try:
+            page.set_default_timeout(timeout_ms)
+            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
+            if network_idle:
+                page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
+
+            _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
+
+            htmls = [page.content()]
+
+            # pagination
+            for _ in range(max_pages - 1):
+                sel = click_sel
+                if not sel or page.locator(sel).count() == 0:
+                    break
+                page.click(sel)
+                page.wait_for_load_state("domcontentloaded")
+                if network_idle:
+                    page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
+                _wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
+                htmls.append(page.content())
+
+            for html in htmls:
+                found = _parse_cards(html, url)
+                rows.extend(found)
+                print(f"[collect][cards] {source_name}: found {len(found)} cards on this page")
+            browser.close()
+            return True
+        except Exception as e:
+            if debug:
+                try:
+                    snap = DOCSTORE_DIR / f"playwright_error_{hashlib.sha1(url.encode()).hexdigest()}.png"
+                    page.screenshot(path=str(snap))
+                    print(f"[collect][debug] Saved screenshot: {snap}")
+                except Exception:
+                    pass
+            print(f"[collect][warn] {source_name}: {e.__class__.__name__}: {e}")
+            try:
+                browser.close()
+            except Exception:
+                pass
+            return False
+
+    from playwright.sync_api import sync_playwright  # late import for clarity
+    with sync_playwright() as p:
+        ok = _try_once(p)
+        if not ok:
+            time.sleep(1.5)
+            _try_once(p)
+
+    if not rows:
+        print(f"[collect][skip] {source_name}: no content after retries.")
+    return rows
+
+# -------------------- PDF collector --------------------
+
 def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
     """
     type: 'http_pdf'
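For orientation, a minimal sketch of a source entry this collector understands, written as the Python dict that _collect_from_http_html_js receives after the YAML config is loaded. The URL and CSS selectors are placeholders, not values from config/sources.yaml (that file's diff is not shown in this view); only the option and selector names come from the code above.

# Hypothetical entry for a JS-rendered grants listing (values are illustrative only).
example_entry = {
    "name": "example_foundation",
    "type": "http_html_js",                     # routed to _collect_from_http_html_js in ingest()
    "url": "https://example.org/grants",        # placeholder URL
    "skip_filters": True,                       # per-source bypass of capacity / PA-MD filters
    "options": {
        "wait_for": ".results",                 # CSS to wait for (an int would be treated as ms)
        "scroll": True,
        "scroll_selector": ".results-scroll",   # scroll this container, not the window
        "scroll_times": 30,
        "scroll_wait_ms": 400,
        "min_cards": 12,                        # wait until at least 12 cards exist
        "max_pages": 3,
        "timeout_ms": 8000,
        "network_idle": True,
    },
    "selectors": {
        "card": ".result-card",
        "title": ".card-title",
        "link": "a.card-link",
        "description": ".card-text",
        "meta": ".card-meta",
    },
}
# rows = _collect_from_http_html_js(example_entry, example_entry["name"], static={})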
@@ -384,9 +602,119 @@ def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
     rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
     return rows
 
+# -------------------- De-dup helpers (5.2) --------------------
+
+def _norm(text: str) -> str:
+    t = (text or "").lower()
+    t = re.sub(r'[^a-z0-9 ]+', ' ', t)
+    return re.sub(r'\s+', ' ', t).strip()
+
+def _hash_fingerprint(title: str, agency: str, deadline: str) -> str:
+    base = f"{_norm(title)}|{_norm(agency)}|{deadline or ''}"
+    return hashlib.sha1(base.encode()).hexdigest()
+
+def _near_duplicate(a: Dict[str, Any], b: Dict[str, Any]) -> bool:
+    # Deadlines equal or both missing
+    dates_close = (a.get("deadline") == b.get("deadline")) or (not a.get("deadline") and not b.get("deadline"))
+    t_sim = SequenceMatcher(None, _norm(a.get("title","")), _norm(b.get("title",""))).ratio()
+    ag_sim = SequenceMatcher(None, _norm(a.get("agency","")), _norm(b.get("agency",""))).ratio()
+    return dates_close and (t_sim > 0.88) and (ag_sim > 0.75)
+
+def _merge_records(primary: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
+    """Merge fields while preserving best data and provenance."""
+    merged = dict(primary)
+
+    # Prefer non-empty fields; combine categories; keep earliest posted_date; keep earliest deadline if different.
+    def choose(a, b):
+        return a if (a not in (None, "", [], {})) else b
+
+    merged["url"] = choose(primary.get("url"), other.get("url"))
+    merged["title"] = choose(primary.get("title"), other.get("title"))
+    merged["synopsis"] = choose(primary.get("synopsis"), other.get("synopsis"))
+    merged["summary"] = choose(primary.get("summary"), other.get("summary"))
+    merged["agency"] = choose(primary.get("agency"), other.get("agency"))
+    merged["eligibility"] = choose(primary.get("eligibility"), other.get("eligibility"))
+    merged["program_number"] = choose(primary.get("program_number"), other.get("program_number"))
+    merged["geo"] = choose(primary.get("geo"), other.get("geo"))
+
+    # categories → union (list)
+    cats_a = primary.get("categories") or []
+    cats_b = other.get("categories") or []
+    if not isinstance(cats_a, list): cats_a = [cats_a]
+    if not isinstance(cats_b, list): cats_b = [cats_b]
+    merged["categories"] = sorted(set([c for c in cats_a + cats_b if c]))
+
+    # deadline: choose earlier known date (safer to surface the sooner one)
+    da, db = primary.get("deadline"), other.get("deadline")
+    if da and db:
+        merged["deadline"] = min(da, db)
+    else:
+        merged["deadline"] = da or db
+    # carry a deadline_text if any
+    merged["deadline_text"] = choose(primary.get("deadline_text"), other.get("deadline_text"))
+    merged["is_active"] = _compute_is_active(merged.get("deadline"))
+
+    # posted_date: keep earliest if both
+    pa, pb = primary.get("posted_date"), other.get("posted_date")
+    merged["posted_date"] = min(pa, pb) if (pa and pb) else (pa or pb)
+
+    # provenance: combine sources + urls
+    prov_sources = set()
+    for s in (primary.get("source"), other.get("source")):
+        if not s: continue
+        if isinstance(s, list): prov_sources.update(s)
+        else: prov_sources.add(s)
+    merged["source"] = sorted(prov_sources) if prov_sources else None
+
+    prov_urls = set()
+    for u in (primary.get("url"), other.get("url")):
+        if u: prov_urls.add(u)
+    # keep a list of all discovered urls
+    merged["all_urls"] = sorted(prov_urls.union(set(primary.get("all_urls") or []), set(other.get("all_urls") or [])))
+
+    # recompute ID based on merged fingerprint (title/agency/deadline)
+    merged["id"] = _hash_fingerprint(merged.get("title",""), merged.get("agency",""), merged.get("deadline",""))
+
+    return merged
+
+def _dedupe_and_merge(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Exact fingerprint + fuzzy near-dup consolidation across sources."""
+    uniques: List[Dict[str, Any]] = []
+    by_fp: Dict[str, int] = {}
+
+    for r in rows:
+        fp = _hash_fingerprint(r.get("title",""), r.get("agency",""), r.get("deadline",""))
+        if fp in by_fp:
+            # exact dup: merge into existing
+            idx = by_fp[fp]
+            uniques[idx] = _merge_records(uniques[idx], r)
+            continue
+
+        # fuzzy check against current uniques
+        found_idx = None
+        for i, u in enumerate(uniques):
+            if _near_duplicate(r, u):
+                found_idx = i
+                break
+
+        if found_idx is not None:
+            uniques[found_idx] = _merge_records(uniques[found_idx], r)
+            # also index its fingerprint (so later exact matches land here)
+            new_fp = _hash_fingerprint(uniques[found_idx].get("title",""),
+                                       uniques[found_idx].get("agency",""),
+                                       uniques[found_idx].get("deadline",""))
+            by_fp[new_fp] = found_idx
+        else:
+            by_fp[fp] = len(uniques)
+            # initialize provenance
+            r.setdefault("all_urls", [r.get("url")] if r.get("url") else [])
+            uniques.append(r)
+
+    return uniques
+
 # -------------------- Write docstore & build index --------------------
 
-def _save_docstore(recs: List[Dict
+def _save_docstore(recs: List[Dict[str, Any]]) -> str:
     DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
     path = DOCSTORE_DIR / "docstore.jsonl"
     with path.open("w", encoding="utf-8") as f:
@@ -425,6 +753,7 @@ def _build_index_from_docstore() -> int:
             "is_active": rec.get("is_active"),
             "program_number": rec.get("program_number"),
            "posted_date": rec.get("posted_date"),
+            "all_urls": rec.get("all_urls"),
        })
 
    print(f"[index] Rows loaded from docstore: {len(texts)}")
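A quick sketch of how the exact-fingerprint and fuzzy paths above interact, assuming the helpers are imported from app.ingest. The two records are made up; the titles differ only in punctuation and a plural, so their SHA-1 fingerprints differ but the SequenceMatcher similarity is well above the 0.88 threshold.

# Two hypothetical rows scraped from different sources for the same opportunity.
a = {"title": "Capacity Building for Senior Meal Programs", "agency": "ACL",
     "deadline": "2025-03-01", "url": "https://example.gov/a"}
b = {"title": "Capacity-Building for Senior Meal Program", "agency": "ACL",
     "deadline": "2025-03-01", "url": "https://example.org/b"}

# Equal deadlines + high title/agency similarity -> _near_duplicate(a, b) is True,
# so the pair collapses into one record whose all_urls lists both URLs.
merged = _dedupe_and_merge([a, b])
print(len(merged))  # expected: 1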
@@ -467,7 +796,7 @@ __all__ = ["ingest"]
 def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
     """
     Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
-    applies filters (capacity / PA-MD),
+    applies filters (capacity / PA-MD), de-dupes, writes docstore, and builds the FAISS index.
     Returns (docstore_path, n_indexed).
     """
     cfg = load_config(cfg_path)
@@ -492,6 +821,7 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
         typ = entry.get("type")
         rows: List[Dict[str, Any]] = []
 
+        # -------- Collect from each adapter --------
         if typ == "grantsgov_api":
             raw_hits = _collect_from_grantsgov_api(entry)
             rows = [normalize("grants_gov", h, static) for h in raw_hits]
@@ -499,6 +829,9 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
         elif typ in ("web_page", "http_html"):
             rows = _collect_from_http_html(entry, name, static)
 
+        elif typ == "http_html_js":
+            rows = _collect_from_http_html_js(entry, name, static)
+
         elif typ == "http_pdf":
             rows = _collect_from_http_pdf(entry, name, static)
 
@@ -508,31 +841,37 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
             items = blob.get("opportunities") or []
             rows = [normalize("local_sample", op, static) for op in items]
 
-        for r in rows:
-            t = _doc_text_from_row(r)
-            if capacity_only and not _is_capacity_building_text(t):
-                continue
-            if pa_md_only and not _is_pa_md_text(t):
-                continue
-            filtered.append(r)
-        print(f"[filter] {name}: kept {len(filtered)}/{len(rows)} after filters")
-        rows = filtered
+        else:
+            print(f"[collect] {name}: unknown type '{typ}', skipping.")
+            continue
 
+        print(f"[collect] {name}: fetched_rows={len(rows)}")
+
+        # ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
+        if rows:
+            if entry.get("skip_filters"):
+                print(f"[filter] {name}: skip_filters=true → keeping all {len(rows)}")
+            else:
+                pre = len(rows)
+                filtered = []
+                for r in rows:
+                    t = _doc_text_from_row(r)
+                    if capacity_only and not _is_capacity_building_text(t):
+                        continue
+                    if pa_md_only and not _is_pa_md_text(t):
+                        continue
+                    filtered.append(r)
+                print(
+                    f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
+                    f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
+                )
+                rows = filtered
+
+        print(f"[collect] {name} → rows_after_filters={len(rows)}")
         all_rows.extend(rows)
 
-    # ---- DEDUPE
-    for r in all_rows:
-        key = r.get("id") or r.get("url") or r.get("title")
-        if not key or key in seen:
-            continue
-        seen.add(key)
-        unique.append(r)
+    # ---- Cross-source DEDUPE + MERGE ----
+    unique = _dedupe_and_merge(all_rows)
     print(f"[ingest] Unique records to index: {len(unique)}")
 
     path = _save_docstore(unique)
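The entry point keeps its documented shape; a minimal sketch of driving a run end-to-end, using the default config path named in the signature above:

from app.ingest import ingest

# Reads config/sources.yaml, collects via the adapters above (including the new
# http_html_js collector), filters unless skip_filters is set, de-dupes/merges,
# writes the JSONL docstore, and builds the FAISS index.
docstore_path, n_indexed = ingest("config/sources.yaml")
print(docstore_path, n_indexed)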
app/ingestors/http_html_js.py
ADDED
@@ -0,0 +1,102 @@
# app/ingestors/http_html_js.py
import asyncio, time
from typing import List, Dict, Any, Optional
from urllib.parse import urljoin
from playwright.async_api import async_playwright

DEFAULT_WAIT_MS = 3000

async def _scrape_page(page, url: str, wait_ms: int, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
    await page.goto(url, wait_until="domcontentloaded", timeout=45000)
    if wait_ms:
        await page.wait_for_timeout(wait_ms)

    cards = []
    card_sel = selectors.get("card", "")
    title_sel = selectors.get("title", "")
    link_sel = selectors.get("link", "")
    desc_sel = selectors.get("description", "")
    meta_sel = selectors.get("meta", "")  # e.g., location/focus

    elements = await page.query_selector_all(card_sel) if card_sel else []
    for el in elements:
        title = (await (await el.query_selector(title_sel)).inner_text()).strip() if title_sel and await el.query_selector(title_sel) else ""
        link_el = await el.query_selector(link_sel) if link_sel else None
        href = await link_el.get_attribute("href") if link_el else None
        link = urljoin(url, href) if href else url
        desc = (await (await el.query_selector(desc_sel)).inner_text()).strip() if desc_sel and await el.query_selector(desc_sel) else ""
        meta = (await (await el.query_selector(meta_sel)).inner_text()).strip() if meta_sel and await el.query_selector(meta_sel) else ""

        if title or desc:
            cards.append({
                "title": title,
                "url": link,
                "summary": desc,
                "meta": meta
            })
    return cards

async def scrape_js_site(
    start_url: str,
    max_pages: int = 1,
    wait_ms: int = DEFAULT_WAIT_MS,
    selectors: Optional[Dict[str, str]] = None,
    next_selector: Optional[str] = None
) -> List[Dict[str, Any]]:
    selectors = selectors or {}
    results: List[Dict[str, Any]] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        current_url = start_url
        for _ in range(max_pages):
            page_results = await _scrape_page(page, current_url, wait_ms, selectors)
            results.extend(page_results)
            if not next_selector:
                break
            next_btn = await page.query_selector(next_selector)
            if not next_btn:
                break
            await next_btn.click()
            await page.wait_for_timeout(800)  # polite delay
        await browser.close()
    return results

def ingest_http_html_js(cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    cfg example:
    {
      "url": "...",
      "options": {"wait_for": 3000, "max_pages": 3},
      "selectors": {
        "card": ".result-card",
        "title": ".card-title",
        "link": "a.card-link",
        "description": ".card-body",
        "meta": ".card-meta"
      },
      "next_selector": "a[rel=next]"
    }
    """
    url = cfg["url"]
    opts = cfg.get("options", {})
    wait_ms = int(opts.get("wait_for", DEFAULT_WAIT_MS))
    max_pages = int(opts.get("max_pages", 1))
    selectors = cfg.get("selectors", {})
    next_selector = cfg.get("next_selector")

    # Run event loop
    results = asyncio.run(scrape_js_site(url, max_pages, wait_ms, selectors, next_selector))

    # Normalize to your index schema
    docs = []
    for r in results:
        docs.append({
            "title": r["title"] or "Untitled foundation",
            "url": r["url"],
            "body": f"{r.get('summary','')}\n{r.get('meta','')}",
            "source_type": "foundation_private",
            "geo": "US-MidAtlantic",
            "tags": ["faith-based", "foundation"],
        })
    return docs
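A small usage sketch of this async adapter; the URL and selectors are placeholders, not values from config/sources.yaml. Note that in the ingest() dispatch shown earlier, the inline synchronous _collect_from_http_html_js handles type 'http_html_js'; this module provides an async equivalent.

# Hypothetical call; ingest_http_html_js drives the async scraper via asyncio.run.
cfg = {
    "url": "https://example.org/grants",            # placeholder
    "options": {"wait_for": 2000, "max_pages": 2},
    "selectors": {"card": ".result-card", "title": ".card-title",
                  "link": "a.card-link", "description": ".card-body"},
    "next_selector": "a[rel=next]",
}
docs = ingest_http_html_js(cfg)  # -> list of {"title", "url", "body", "source_type", ...}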
app/ranking/rerank.py
ADDED
@@ -0,0 +1,17 @@
# app/ranking/rerank.py
import numpy as np
from sentence_transformers import SentenceTransformer, util

INTENT_TEXT = """
nonprofit 501(c)(3) capacity building, community outreach, reentry, youth,
food security, housing stabilization, violence prevention, mental health,
addiction recovery, faith-based programs, workforce, mentorship
"""

_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
_intent_vec = _model.encode(INTENT_TEXT, normalize_embeddings=True)

def embed_score(title: str, body: str) -> float:
    text = f"{title}\n{body or ''}"
    v = _model.encode(text, normalize_embeddings=True)
    return float(util.cos_sim(_intent_vec, v).item())
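In short, embed_score is the cosine similarity between a grant's text and the fixed intent paragraph above. A rough sketch of a call (the strings are made up):

# Scores near 1.0 mean the text reads like the mission profile in INTENT_TEXT;
# unrelated text (e.g., defense R&D solicitations) should score noticeably lower.
s = embed_score(
    "Capacity Building for Reentry Mentorship Programs",                 # hypothetical title
    "Supports faith-based nonprofits providing workforce mentorship.",   # hypothetical body
)
print(round(s, 3))  # a float in [-1, 1]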
app/ranking/rules.py
ADDED
@@ -0,0 +1,17 @@
# app/ranking/rules.py
INCLUDE_HINTS = [
    "community", "capacity building", "re-entry", "workforce", "housing",
    "human services", "addiction", "youth", "violence prevention",
    "nonprofit", "faith", "church", "outreach", "mentorship"
]
EXCLUDE_HINTS = [
    "r01", "r21", "sbir", "sttr", "lab solicitation", "postdoctoral",
    "basic research", "scoping study", "hypothesis", "principal investigator"
]

def rule_score(text: str) -> int:
    t = (text or "").lower()
    s = 0
    s += sum(2 for k in INCLUDE_HINTS if k in t)
    s -= sum(2 for k in EXCLUDE_HINTS if k in t)
    return s
app/ranking/score.py
ADDED
@@ -0,0 +1,8 @@
# app/ranking/score.py
def confidence(item) -> float:
    rs = rule_score(item.get("body","") + " " + item.get("title",""))
    es = embed_score(item.get("title",""), item.get("body",""))
    dl_boost = 0.0
    if item.get("deadline"): dl_boost += 0.05
    if "NONPROFIT_501C3" in (item.get("eligibility") or []): dl_boost += 0.10
    return max(0.0, min(1.0, 0.4*es + 0.4*(rs/8.0) + dl_boost))
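As committed, score.py calls rule_score and embed_score without importing them; presumably they come from the sibling modules added above. A minimal sketch of the presumed imports and of scoring one record (the record values are illustrative):

# Presumed fix: score.py needs these imports from the sibling modules added here.
from app.ranking.rules import rule_score
from app.ranking.rerank import embed_score

item = {  # hypothetical record
    "title": "Community Food Pantry Capacity Building",
    "body": "Nonprofit outreach and workforce mentorship for reentry.",
    "deadline": "2025-06-30",
    "eligibility": ["NONPROFIT_501C3"],
}
# With those imports in place, confidence(item) blends the rule score (scaled by 8),
# the embedding similarity, and small boosts for a known deadline and 501(c)(3)
# eligibility, clamping the result to [0, 1].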
app/ui_streamlit.py
CHANGED
@@ -8,99 +8,149 @@ if str(ROOT) not in sys.path:
|
|
8 |
|
9 |
import os, json
|
10 |
from pathlib import Path
|
|
|
|
|
|
|
11 |
|
12 |
from app.main import get_env, ensure_index_exists
|
13 |
from app.search import search
|
14 |
|
15 |
-
|
|
|
16 |
|
|
|
17 |
st.markdown("""
|
18 |
<style>
|
19 |
-
/*
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
}
|
23 |
-
|
24 |
-
div[data-baseweb="select"] * { color: #0f172a !important; }
|
25 |
-
div[data-baseweb="select"] { background: #ffffff !important; border-color: #cbd5e1 !important; }
|
26 |
-
/* placeholder inside the closed select */
|
27 |
-
div[data-baseweb="select"] div[aria-hidden="true"] { color: #64748b !important; }
|
28 |
-
/* open dropdown menu (BaseWeb popover) */
|
29 |
-
div[data-baseweb="popover"] [role="listbox"], div[data-baseweb="menu"] { background: #ffffff !important; }
|
30 |
-
div[data-baseweb="popover"] [role="option"], div[data-baseweb="menu"] li { color: #0f172a !important; background: #ffffff !important; }
|
31 |
-
/* --- Alternative rendering (ARIA hooks) in newer Streamlit builds --- */
|
32 |
-
div[role="button"][aria-haspopup="listbox"] * { color: #0f172a !important; }
|
33 |
-
ul[role="listbox"] li, div[role="option"] { color: #0f172a !important; background: #ffffff !important; }
|
34 |
-
/* --- Streamlit component wrappers --- */
|
35 |
-
.stSelectbox, .stMultiSelect { color: #0f172a !important; }
|
36 |
-
.stSelectbox div, .stMultiSelect div { color: #0f172a !important; }
|
37 |
-
/* --- Hard reset in case a global rule set all <span> to white --- */
|
38 |
-
span, li { color: inherit !important; }
|
39 |
-
</style>
|
40 |
-
""", unsafe_allow_html=True)
|
41 |
|
42 |
-
|
43 |
-
st
|
|
|
44 |
|
45 |
-
# ββ Theme & CSS (BLACK + ORANGE, dark selects) ββββββββββββββββββββββββββββββββ
|
46 |
-
st.markdown("""
|
47 |
-
<style>
|
48 |
-
/* App base */
|
49 |
-
.stApp { background-color: #000000; color: #f8fafc; }
|
50 |
-
/* Text defaults */
|
51 |
-
html, body, [class*="css"], h1, h2, h3, h4, h5, h6, p, span, div { color: #f8fafc !important; }
|
52 |
-
/* Accents */
|
53 |
-
a, .stRadio > label, .stSlider label { color: #f97316 !important; }
|
54 |
/* Buttons */
|
55 |
-
.stButton>button {
|
|
|
|
|
|
|
56 |
.stButton>button:hover { filter:brightness(1.1); }
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
}
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
/*
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
|
84 |
url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
|
85 |
.hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
|
86 |
-
.hero-text h1 { margin:0; font-size:28px; font-weight:700; color
|
87 |
.hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
|
88 |
-
/* ===== FORCE DARK SELECT / MULTISELECT ===== */
|
89 |
-
[data-testid="stSelectbox"] div[role="combobox"], [data-testid="stMultiSelect"] div[role="combobox"],
|
90 |
-
div[role="combobox"][aria-haspopup="listbox"] { background-color:#1e293b !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:8px !important; }
|
91 |
-
[data-testid="stSelectbox"] div[role="combobox"] input, [data-testid="stMultiSelect"] div[role="combobox"] input,
|
92 |
-
div[role="combobox"] input { color:#f8fafc !important; }
|
93 |
-
div[role="combobox"] input::placeholder { color:#94a3b8 !important; }
|
94 |
-
div[role="combobox"] svg { color:#f8fafc !important; fill:#f8fafc !important; }
|
95 |
-
[data-testid="stMultiSelect"] [data-baseweb="tag"], [data-testid="stMultiSelect"] [aria-label="remove"] { background-color:#334155 !important; color:#e2e8f0 !important; border-radius:999px !important; }
|
96 |
-
div[role="listbox"], ul[role="listbox"], div[data-baseweb="menu"] { background-color:#0b1220 !important; color:#f8fafc !important; border:1px solid #334155 !important; border-radius:10px !important; }
|
97 |
-
[role="listbox"] [role="option"], div[data-baseweb="menu"] [role="option"] { background:transparent !important; color:#f8fafc !important; }
|
98 |
-
[role="listbox"] [role="option"]:hover, div[data-baseweb="menu"] [role="option"]:hover { background:#1f2937 !important; }
|
99 |
-
[role="listbox"] [role="option"][aria-selected="true"], div[data-baseweb="menu"] [role="option"][aria-selected="true"] { background:#334155 !important; color:#f8fafc !important; }
|
100 |
</style>
|
101 |
""", unsafe_allow_html=True)
|
102 |
|
103 |
-
# ββ Hero
|
104 |
st.markdown("""
|
105 |
<div class="hero">
|
106 |
<div class="hero-text">
|
@@ -110,27 +160,13 @@ st.markdown("""
|
|
110 |
</div>
|
111 |
""", unsafe_allow_html=True)
|
112 |
|
113 |
-
# ββ
|
114 |
-
SHOW_DEV = os.environ.get("SHOW_DEV") == "1"
|
115 |
-
|
116 |
-
# ββ Environment + index βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
117 |
_env = get_env()
|
118 |
ensure_index_exists(_env)
|
119 |
|
120 |
-
#
|
121 |
-
def _dedup_records(rows):
|
122 |
-
seen, out = set(), []
|
123 |
-
for r in rows or []:
|
124 |
-
k = r.get("id") or r.get("url") or r.get("title")
|
125 |
-
if not k or k in seen:
|
126 |
-
continue
|
127 |
-
seen.add(k)
|
128 |
-
out.append(r)
|
129 |
-
return out
|
130 |
-
|
131 |
def _norm_list(v):
|
132 |
-
if v is None:
|
133 |
-
return []
|
134 |
if isinstance(v, str):
|
135 |
parts = [p.strip() for p in v.replace(";", ",").split(",")]
|
136 |
return [p.lower() for p in parts if p]
|
@@ -146,8 +182,7 @@ def _matches_filters(rec, geo_sel, cat_sel):
|
|
146 |
return g_ok and c_ok
|
147 |
|
148 |
def _ministry_filter(rows):
|
149 |
-
if not rows:
|
150 |
-
return rows
|
151 |
banned_terms = [
|
152 |
"broad agency announcement", "baa", "research", "r&d", "prototype",
|
153 |
"laboratory", "university", "sbir", "sttr",
|
@@ -155,42 +190,40 @@ def _ministry_filter(rows):
|
|
155 |
"w911", "n00014", "fa-", "afrl", "arpa"
|
156 |
]
|
157 |
preferred_agencies = {
|
158 |
-
"FTA",
|
159 |
-
"EDA", "HRSA", "SAMHSA", "CFPB", "HUD"
|
160 |
}
|
161 |
-
|
162 |
-
"vehicle",
|
163 |
-
"congregate meals",
|
164 |
-
"food pantry",
|
165 |
-
"community",
|
166 |
-
"reentry",
|
167 |
]
|
168 |
def txt(r):
|
169 |
-
return " ".join([
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
]).lower()
|
174 |
-
|
175 |
-
kept = []
|
176 |
for r in rows:
|
177 |
t = txt(r)
|
178 |
-
if any(b in t for b in banned_terms):
|
179 |
-
continue
|
180 |
agency = (r.get("agency") or "").upper()
|
181 |
-
cats = [c.lower() for c in (r.get("categories") or [])]
|
182 |
-
|
183 |
-
|
184 |
c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
|
185 |
)
|
186 |
-
if
|
187 |
kept.append(r)
|
188 |
return kept
|
189 |
|
|
|
|
|
|
|
|
|
|
|
190 |
def _days_until(iso):
|
191 |
-
|
192 |
-
if not iso:
|
193 |
-
return None
|
194 |
try:
|
195 |
d = datetime.fromisoformat(str(iso)).date()
|
196 |
return (d - date.today()).days
|
@@ -198,39 +231,13 @@ def _days_until(iso):
|
|
198 |
return None
|
199 |
|
200 |
def _deadline_badge(days_left):
|
201 |
-
if days_left is None:
|
202 |
-
|
203 |
-
if days_left
|
204 |
-
|
205 |
-
if days_left <= 14:
|
206 |
-
return f"π₯ Due in {days_left}d"
|
207 |
-
if days_left <= 30:
|
208 |
-
return f"π¨ {days_left}d"
|
209 |
return f"π© {days_left}d"
|
210 |
-
# ---------- end helpers ----------
|
211 |
-
|
212 |
-
# ---------- optional diagnostics ----------
|
213 |
-
with st.expander("Diagnostics (optional)", expanded=False):
|
214 |
-
idx = Path(_env["INDEX_DIR"])
|
215 |
-
st.write("INDEX_DIR:", str(idx))
|
216 |
-
st.write("faiss.index exists:", (idx / "faiss.index").exists())
|
217 |
-
st.write("meta.json exists:", (idx / "meta.json").exists())
|
218 |
-
if (idx / "meta.json").exists():
|
219 |
-
try:
|
220 |
-
meta = json.loads((idx / "meta.json").read_text())
|
221 |
-
st.write("meta.json count:", len(meta))
|
222 |
-
st.write("meta head:", [{"id": m.get("id"), "title": m.get("title")} for m in meta[:2]])
|
223 |
-
except Exception as e:
|
224 |
-
st.error(f"Failed to read meta.json: {e!r}")
|
225 |
-
try:
|
226 |
-
demo = search("transportation", _env, top_k=3, filters={})
|
227 |
-
st.write("sample search('transportation') results:", len(demo))
|
228 |
-
if demo:
|
229 |
-
st.write(demo[:3])
|
230 |
-
except Exception as e:
|
231 |
-
st.error(f"search() raised: {e!r}")
|
232 |
-
# ---------- end diagnostics ----------
|
233 |
|
|
|
234 |
st.title("Grants Discovery RAG (Capacity Building)")
|
235 |
|
236 |
preset = st.radio(
|
@@ -238,7 +245,6 @@ preset = st.radio(
|
|
238 |
["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
|
239 |
horizontal=True
|
240 |
)
|
241 |
-
|
242 |
default_q = {
|
243 |
"General": "capacity building",
|
244 |
"Elderly": "capacity building for seniors and aging services",
|
@@ -248,7 +254,6 @@ default_q = {
|
|
248 |
"FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
|
249 |
}.get(preset, "capacity building")
|
250 |
|
251 |
-
# --- controls ---
|
252 |
q = st.text_input("Search query", value=default_q)
|
253 |
|
254 |
geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
|
@@ -256,40 +261,38 @@ categories = st.multiselect(
|
|
256 |
"Category filter (optional)",
|
257 |
options=[
|
258 |
"capacity_building","elderly","prison_ministry","evangelism",
|
259 |
-
"transportation","vehicle",
|
260 |
-
"
|
261 |
],
|
262 |
default=[]
|
263 |
)
|
264 |
|
265 |
-
|
|
|
|
|
266 |
sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
|
267 |
only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
|
268 |
ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)
|
269 |
|
270 |
-
# NEW: Sprint 2 view + agency facet
|
271 |
view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)
|
272 |
-
|
|
|
273 |
try:
|
274 |
-
meta_for_agencies = json.loads(Path(
|
275 |
agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
|
276 |
except Exception:
|
277 |
agency_options = []
|
278 |
sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])
|
279 |
|
280 |
-
# Build backend filters (if the search() supports them)
|
281 |
backend_filters = {}
|
282 |
if geo: backend_filters["geo"] = geo
|
283 |
if categories: backend_filters["categories"] = categories
|
284 |
if sel_agencies: backend_filters["agency"] = sel_agencies
|
285 |
|
286 |
-
#
|
287 |
-
if "saved_ids" not in st.session_state:
|
288 |
-
|
289 |
-
if "hidden_ids" not in st.session_state:
|
290 |
-
st.session_state.hidden_ids = set()
|
291 |
|
292 |
-
# action helpers
|
293 |
def _save_item(item_id: str):
|
294 |
st.session_state.saved_ids.add(item_id)
|
295 |
st.session_state.hidden_ids.discard(item_id)
|
@@ -300,101 +303,77 @@ def _hide_item(item_id: str):
|
|
300 |
st.session_state.saved_ids.discard(item_id)
|
301 |
st.experimental_rerun()
|
302 |
|
303 |
-
|
|
|
304 |
|
305 |
-
with
|
306 |
if st.button("Search"):
|
307 |
try:
|
308 |
-
raw = search(q,
|
309 |
-
|
310 |
-
|
311 |
-
# 1) Geo/Category client-side filter (fallback if backend ignores)
|
312 |
if geo or categories:
|
313 |
-
base_filtered = [r for r in
|
314 |
else:
|
315 |
-
base_filtered =
|
316 |
|
317 |
-
#
|
318 |
-
from datetime import date, datetime
|
319 |
def _to_date_safe(val):
|
320 |
if not val: return None
|
321 |
try: return datetime.fromisoformat(str(val)).date()
|
322 |
except Exception: return None
|
323 |
-
|
324 |
open_filtered = base_filtered
|
325 |
if only_open:
|
326 |
open_filtered = [r for r in base_filtered
|
327 |
if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]
|
328 |
|
329 |
-
#
|
330 |
if sel_agencies:
|
331 |
af = set(sel_agencies)
|
332 |
open_filtered = [r for r in open_filtered if (r.get("agency") in af)]
|
333 |
|
334 |
-
#
|
335 |
final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered
|
336 |
|
337 |
-
# Clear/show hidden toggle mgmt
|
338 |
-
if not ministry_focus and st.session_state.get("show_hidden"):
|
339 |
-
st.session_state.pop("show_hidden", None)
|
340 |
-
|
341 |
-
hidden_due_to_ministry = 0
|
342 |
-
if ministry_focus:
|
343 |
-
hidden_due_to_ministry = len(open_filtered) - len(final_results)
|
344 |
-
st.session_state.pop("show_hidden", None)
|
345 |
-
|
346 |
st.session_state["results"] = final_results
|
347 |
st.session_state["last_query"] = q
|
348 |
st.session_state["last_filters"] = {
|
349 |
-
"geo": geo, "categories": categories,
|
350 |
-
"
|
351 |
-
"agencies": sel_agencies,
|
352 |
}
|
353 |
|
354 |
-
|
355 |
-
|
356 |
-
f"Open-only: {len(open_filtered)} β’ Displaying: {len(final_results)}"
|
357 |
-
+ (f" β’ Hidden by ministry filter: {hidden_due_to_ministry}" if ministry_focus else "")
|
358 |
-
)
|
359 |
|
360 |
-
|
361 |
-
if st.checkbox(f"Show hidden items ({hidden_due_to_ministry})", value=False, key="show_hidden"):
|
362 |
-
st.session_state["results"] = open_filtered
|
363 |
except Exception as e:
|
364 |
st.error(str(e))
|
365 |
|
366 |
-
|
367 |
-
with col2:
|
368 |
if st.button("Export Results to CSV"):
|
369 |
results_for_export = st.session_state.get("results", [])
|
370 |
if not results_for_export:
|
371 |
st.warning("No results to export. Run a search first.")
|
372 |
else:
|
373 |
-
|
374 |
-
|
|
|
375 |
import pandas as pd
|
376 |
pd.DataFrame(results_for_export).to_csv(out_path, index=False)
|
377 |
st.success(f"Exported to {out_path}")
|
378 |
|
379 |
st.markdown("---")
|
380 |
|
381 |
-
#
|
382 |
-
from datetime import date, datetime
|
383 |
-
def _to_date(d):
|
384 |
-
if not d: return None
|
385 |
-
try: return datetime.fromisoformat(str(d)).date()
|
386 |
-
except Exception: return None
|
387 |
-
|
388 |
-
# ---- Render results ----
|
389 |
results = st.session_state.get("results", [])
|
|
|
390 |
|
391 |
-
#
|
392 |
if view == "Saved":
|
393 |
results = [r for r in results if r.get("id") in st.session_state.saved_ids]
|
394 |
elif view == "Hidden":
|
395 |
results = [r for r in results if r.get("id") in st.session_state.hidden_ids]
|
396 |
|
397 |
-
#
|
398 |
if sort_by.startswith("Deadline") and results:
|
399 |
results.sort(
|
400 |
key=lambda r: (
|
@@ -403,42 +382,175 @@ if sort_by.startswith("Deadline") and results:
|
|
403 |
)
|
404 |
)
|
405 |
|
406 |
-
#
|
407 |
-
|
408 |
-
|
409 |
-
if
|
410 |
-
st.
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
else:
|
441 |
if ran_search:
|
442 |
-
st.info("No active grants match these filters right now.
|
443 |
else:
|
444 |
st.info("Enter a query and click Search.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
import os, json
|
10 |
from pathlib import Path
|
11 |
+
from datetime import date, datetime
|
12 |
+
|
13 |
+
import streamlit as st
|
14 |
|
15 |
from app.main import get_env, ensure_index_exists
|
16 |
from app.search import search
|
17 |
|
18 |
+
# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
19 |
+
st.set_page_config(page_title="Grants Discovery App By Lupo", page_icon="π§", layout="wide")
|
20 |
|
21 |
+
# ββ THEME / CSS β single, unified block (dark app; readable controls) βββββββββ
|
22 |
st.markdown("""
|
23 |
<style>
|
24 |
+
/* App base */
|
25 |
+
:root {
|
26 |
+
--bg: #0a0f1a;
|
27 |
+
--panel: #121827;
|
28 |
+
--text: #e5eefb;
|
29 |
+
--muted: #95a3b8;
|
30 |
+
--accent: #f97316;
|
31 |
+
--border: #2b3a55;
|
32 |
}
|
33 |
+
.stApp { background-color: var(--bg); color: var(--text); }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
/* Typo & links */
|
36 |
+
html, body, [class*="st-"], h1,h2,h3,h4,h5,h6, p, span, div { color: var(--text) !important; }
|
37 |
+
a, .stRadio > label, .stSlider label { color: var(--accent) !important; }
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
/* Buttons */
|
40 |
+
.stButton>button {
|
41 |
+
background: var(--accent); color:#fff; border:none; border-radius:10px;
|
42 |
+
padding: 0.5rem 0.9rem; font-weight:600;
|
43 |
+
}
|
44 |
.stButton>button:hover { filter:brightness(1.1); }
|
45 |
+
|
46 |
+
/* Text inputs */
|
47 |
+
.stTextInput input, .stTextArea textarea {
|
48 |
+
background: var(--panel) !important; color: var(--text) !important;
|
49 |
+
border: 1px solid var(--border) !important; border-radius: 10px !important;
|
50 |
+
}
|
51 |
+
|
52 |
+
/* ===== FIXED: Select/Multiselect controls - HIGH CONTRAST ===== */
|
53 |
+
|
54 |
+
/* Labels above the controls */
|
55 |
+
[data-testid="stSelectbox"] label div,
|
56 |
+
[data-testid="stMultiSelect"] label div {
|
57 |
+
color: #e5eefb !important; /* Light text for dark background */
|
58 |
+
font-weight: 600;
|
59 |
+
}
|
60 |
+
|
61 |
+
/* Closed control (the combobox) */
|
62 |
+
[data-testid="stSelectbox"] div[role="combobox"],
|
63 |
+
[data-testid="stMultiSelect"] div[role="combobox"] {
|
64 |
+
background: #1e293b !important; /* Dark field */
|
65 |
+
color: #f8fafc !important; /* Light text - HIGH CONTRAST */
|
66 |
+
border: 1px solid #475569 !important;
|
67 |
+
border-radius: 10px !important;
|
68 |
+
font-weight: 500;
|
69 |
+
}
|
70 |
+
|
71 |
+
/* Text & icons inside the closed control */
|
72 |
+
[data-testid="stSelectbox"] div[role="combobox"] *,
|
73 |
+
[data-testid="stMultiSelect"] div[role="combobox"] * {
|
74 |
+
color: #f8fafc !important; /* Force light text */
|
75 |
+
fill: #f8fafc !important; /* Force light icons */
|
76 |
}
|
77 |
+
|
78 |
+
/* Placeholder text */
|
79 |
+
[data-testid="stMultiSelect"] input::placeholder {
|
80 |
+
color: #94a3b8 !important; /* Muted but visible placeholder */
|
81 |
+
}
|
82 |
+
|
83 |
+
/* Multiselect chips */
|
84 |
+
[data-baseweb="tag"] {
|
85 |
+
background: #334155 !important;
|
86 |
+
color: #e2e8f0 !important; /* Light text on chips */
|
87 |
+
border-radius: 999px !important;
|
88 |
+
font-weight: 500;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Dropdown menu (popover) - DARK THEME */
|
92 |
+
div[data-baseweb="popover"] {
|
93 |
+
z-index: 999999 !important; /* Ensure it appears above everything */
|
94 |
+
}
|
95 |
+
|
96 |
+
div[data-baseweb="popover"] [role="listbox"],
|
97 |
+
div[data-baseweb="menu"],
|
98 |
+
ul[role="listbox"] {
|
99 |
+
background: #1e293b !important; /* Dark menu background */
|
100 |
+
color: #f8fafc !important; /* Light text - HIGH CONTRAST */
|
101 |
+
border: 1px solid #475569 !important;
|
102 |
+
border-radius: 10px !important;
|
103 |
+
}
|
104 |
+
|
105 |
+
/* Options in dropdown */
|
106 |
+
[role="listbox"] [role="option"],
|
107 |
+
div[data-baseweb="menu"] [role="option"] {
|
108 |
+
background: transparent !important;
|
109 |
+
color: #f8fafc !important; /* Light text */
|
110 |
+
font-weight: 500;
|
111 |
+
}
|
112 |
+
|
113 |
+
/* Hover state */
|
114 |
+
[role="listbox"] [role="option"]:hover,
|
115 |
+
div[data-baseweb="menu"] [role="option"]:hover {
|
116 |
+
background: #334155 !important; /* Slightly lighter on hover */
|
117 |
+
color: #ffffff !important;
|
118 |
+
}
|
119 |
+
|
120 |
+
/* Selected state */
|
121 |
+
[role="listbox"] [role="option"][aria-selected="true"],
|
122 |
+
div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
|
123 |
+
background: #475569 !important; /* Highlight selected */
|
124 |
+
color: #ffffff !important;
|
125 |
+
font-weight: 600;
|
126 |
+
}
|
127 |
+
|
128 |
+
/* Search input inside multiselect dropdown */
|
129 |
+
div[data-baseweb="popover"] input {
|
130 |
+
background: #0f172a !important;
|
131 |
+
color: #f8fafc !important;
|
132 |
+
border: 1px solid #475569 !important;
|
133 |
+
border-radius: 6px !important;
|
134 |
+
}
|
135 |
+
|
136 |
+
/* Cards */
|
137 |
+
.result-card { border:1px solid var(--border); background: var(--panel);
|
138 |
+
border-radius:14px; padding:16px; margin:10px 0; box-shadow:0 1px 2px rgba(0,0,0,0.2); }
|
139 |
+
.result-meta { font-size:13px; color: var(--muted); margin-top:6px; }
|
140 |
+
span.chip { display:inline-block; padding:3px 8px; border-radius:999px; background:#2a354a;
|
141 |
+
margin-right:6px; font-size:12px; color:var(--text); }
|
142 |
+
|
143 |
+
/* Hero */
|
144 |
+
.hero { height: 220px; border-radius: 16px; margin: 6px 0 16px;
|
145 |
background: linear-gradient(rgba(0,0,0,.45), rgba(0,0,0,.45)),
|
146 |
url('https://images.unsplash.com/photo-1469474968028-56623f02e42e?auto=format&fit=crop&w=1280&q=80') center/cover no-repeat; }
|
147 |
.hero-text { height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center; text-align:center; color:#fff; }
|
148 |
+
.hero-text h1 { margin:0; font-size:28px; font-weight:700; color: var(--accent); }
|
149 |
.hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
</style>
|
151 |
""", unsafe_allow_html=True)
|
152 |
|
153 |
+
# ββ Hero ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
154 |
st.markdown("""
|
155 |
<div class="hero">
|
156 |
<div class="hero-text">
|
|
|
160 |
</div>
|
161 |
""", unsafe_allow_html=True)
|
162 |
|
163 |
+
# ββ Environment & index βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
164 |
_env = get_env()
|
165 |
ensure_index_exists(_env)
|
166 |
|
167 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
def _norm_list(v):
    if v is None: return []
    if isinstance(v, str):
        parts = [p.strip() for p in v.replace(";", ",").split(",")]
        return [p.lower() for p in parts if p]

    return g_ok and c_ok

def _ministry_filter(rows):
    if not rows: return rows
    banned_terms = [
        "broad agency announcement", "baa", "research", "r&d", "prototype",
        "laboratory", "university", "sbir", "sttr",
        "w911", "n00014", "fa-", "afrl", "arpa"
    ]
    preferred_agencies = {
        "FTA","HHS","ACL","USDA","USDA-FNS","USDA-RD","DOL","DOJ","OJP","OVW","EDA","HRSA","SAMHSA","CFPB","HUD"
    }
    terms = [
        "vehicle","van","bus","paratransit","mobility",
        "congregate meals","home-delivered meals","senior nutrition",
        "food pantry","food bank","hunger relief","refrigeration","freezer",
        "community","faith","church","ministry","nonprofit",
        "reentry","workforce","case management","technical assistance","capacity"
    ]
    def txt(r):
        return " ".join([str(r.get("title","")),
                         str(r.get("synopsis") or r.get("summary") or ""),
                         str(r.get("agency") or "")]).lower()
    kept=[]
    for r in rows:
        t = txt(r)
        if any(b in t for b in banned_terms): continue
        agency = (r.get("agency") or "").upper()
        cats = [c.lower() for c in (r.get("categories") or [])] if isinstance(r.get("categories"), list) else []
        prefer = any(agency.startswith(a) for a in preferred_agencies)
        has_cue = any(term in t for term in terms) or any(
            c in {"transportation","vehicle","elderly","disabled","food","community","justice","reentry","workforce"} for c in cats
        )
        if prefer or has_cue:
            kept.append(r)
    return kept

def _to_date(d):
    if not d: return None
    try: return datetime.fromisoformat(str(d)).date()
    except Exception: return None

def _days_until(iso):
    if not iso: return None
    try:
        d = datetime.fromisoformat(str(iso)).date()
        return (d - date.today()).days
    except Exception:
        return None

def _deadline_badge(days_left):
    if days_left is None: return "🟦 TBD"
    if days_left < 0: return "⬛ Closed"
    if days_left <= 14: return f"🟥 Due in {days_left}d"
    if days_left <= 30: return f"🟨 {days_left}d"
    return f"🟩 {days_left}d"
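A quick, self-contained illustration of how `_days_until` and `_deadline_badge` combine (illustration only, not part of the commit; dates are relative to today):

from datetime import date, timedelta

soon = (date.today() + timedelta(days=7)).isoformat()
later = (date.today() + timedelta(days=60)).isoformat()
print(_deadline_badge(_days_until(soon)))    # 🟥 Due in 7d
print(_deadline_badge(_days_until(later)))   # 🟩 60d
print(_deadline_badge(_days_until(None)))    # 🟦 TBD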
# ── UI: Presets & inputs ───────────────────────────────────────────────────────
st.title("Grants Discovery RAG (Capacity Building)")

preset = st.radio(
    ["General", "Elderly", "Prison Ministry", "Evangelism", "Vehicles/Transport", "FTA 5310"],
    horizontal=True
)
default_q = {
    "General": "capacity building",
    "Elderly": "capacity building for seniors and aging services",
    "FTA 5310": "5310 Enhanced Mobility Seniors Individuals with Disabilities",
}.get(preset, "capacity building")

q = st.text_input("Search query", value=default_q)

geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "PA"], default=[])
categories = st.multiselect(
    "Category filter (optional)",
    options=[
        "capacity_building","elderly","prison_ministry","evangelism",
        "transportation","vehicle","justice","reentry",
        "victim_services","youth","women","food","workforce"
    ],
    default=[]
)

# Fetch more so pagination is meaningful
top_k = st.slider("Fetch up to (results)", 50, 500, 200, step=50)

sort_by = st.selectbox("Sort by", ["Relevance", "Deadline (soonest first)"], index=0)
only_open = st.checkbox("Only show opportunities with a future deadline", value=True)
ministry_focus = st.checkbox("Ministry Focus (hide research/defense/academic BAAs)", value=True)

view = st.selectbox("View", ["All", "Saved", "Hidden"], index=0)

# Agencies facet from meta
try:
    meta_for_agencies = json.loads(Path(get_env()["INDEX_DIR"], "meta.json").read_text())
    agency_options = sorted({m.get("agency") for m in meta_for_agencies if m.get("agency")})
except Exception:
    agency_options = []
sel_agencies = st.multiselect("Agency filter (optional)", options=agency_options, default=[])

backend_filters = {}
if geo: backend_filters["geo"] = geo
if categories: backend_filters["categories"] = categories
if sel_agencies: backend_filters["agency"] = sel_agencies

# Sprint 2: Save/Hide state
if "saved_ids" not in st.session_state: st.session_state.saved_ids = set()
if "hidden_ids" not in st.session_state: st.session_state.hidden_ids = set()

def _save_item(item_id: str):
    st.session_state.saved_ids.add(item_id)
    st.session_state.hidden_ids.discard(item_id)
    st.experimental_rerun()

def _hide_item(item_id: str):
    st.session_state.hidden_ids.add(item_id)
    st.session_state.saved_ids.discard(item_id)
    st.experimental_rerun()
# ── Search & filter pipeline (stores full result set) ──────────────────────────
c1, c2 = st.columns([1,1])

with c1:
    if st.button("Search"):
        try:
            raw = search(q, get_env(), top_k=top_k, filters=backend_filters)  # fetch many

            # Geo/Category client-side fallback
            if geo or categories:
                base_filtered = [r for r in raw if _matches_filters(r, geo, categories)]
            else:
                base_filtered = raw

            # Only open
            def _to_date_safe(val):
                if not val: return None
                try: return datetime.fromisoformat(str(val)).date()
                except Exception: return None
            open_filtered = base_filtered
            if only_open:
                open_filtered = [r for r in base_filtered
                                 if (_to_date_safe(r.get("deadline")) or date.max) >= date.today()]

            # Agency
            if sel_agencies:
                af = set(sel_agencies)
                open_filtered = [r for r in open_filtered if (r.get("agency") in af)]

            # Ministry
            final_results = _ministry_filter(open_filtered) if ministry_focus else open_filtered

            st.session_state["results"] = final_results
            st.session_state["last_query"] = q
            st.session_state["last_filters"] = {
                "geo": geo, "categories": categories, "only_open": only_open,
                "ministry_focus": ministry_focus, "agencies": sel_agencies,
            }

            # RESET PAGINATION on new run
            st.session_state.page = 1

            st.success(f"Fetched {len(raw)} • After filters: {len(final_results)}")
        except Exception as e:
            st.error(str(e))

with c2:
    if st.button("Export Results to CSV"):
        results_for_export = st.session_state.get("results", [])
        if not results_for_export:
            st.warning("No results to export. Run a search first.")
        else:
            out_dir = get_env()["EXPORT_DIR"]
            os.makedirs(out_dir, exist_ok=True)
            out_path = os.path.join(out_dir, "results.csv")
            import pandas as pd
            pd.DataFrame(results_for_export).to_csv(out_path, index=False)
            st.success(f"Exported to {out_path}")

st.markdown("---")
# ── Post-search view/sort/pagination (5.4) ─────────────────────────────────────
results = st.session_state.get("results", [])
ran_search = bool(st.session_state.get("last_query"))

# View filter
if view == "Saved":
    results = [r for r in results if r.get("id") in st.session_state.saved_ids]
elif view == "Hidden":
    results = [r for r in results if r.get("id") in st.session_state.hidden_ids]

# Sort
if sort_by.startswith("Deadline") and results:
    results.sort(
        key=lambda r: (
            _to_date(r.get("deadline")) or date.max
        )
    )

# Pagination state
if "page_size" not in st.session_state:
    st.session_state.page_size = 25
if "page" not in st.session_state:
    st.session_state.page = 1

total = len(results)
st.caption(f"Results: {total}")

# Controls
cols = st.columns([1,1,2,2,2])
with cols[0]:
    page_size = st.selectbox("Page size", [10, 25, 50, 100], index=1)
    st.session_state.page_size = page_size
# compute pages
total_pages = max(1, (total + page_size - 1) // page_size)
with cols[1]:
    page = st.number_input("Page", min_value=1, max_value=total_pages,
                           value=min(st.session_state.page, total_pages), step=1)
    st.session_state.page = page

# Slice AFTER filters & sort
start = (st.session_state.page - 1) * st.session_state.page_size
end = min(start + st.session_state.page_size, total)
page_items = results[start:end]
st.caption(f"Showing {start+1 if total else 0}–{end} of {total} • Page {st.session_state.page}/{total_pages}")

# Nav buttons
prev_col, _, next_col = st.columns([1,6,1])
with prev_col:
    if st.button("◀ Prev", disabled=(st.session_state.page <= 1)):
        st.session_state.page = max(1, st.session_state.page - 1)
        st.experimental_rerun()
with next_col:
    if st.button("Next ▶", disabled=(st.session_state.page >= total_pages)):
        st.session_state.page = min(total_pages, st.session_state.page + 1)
        st.experimental_rerun()
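The ceiling division behind `total_pages` and the slice bounds are easiest to verify with concrete numbers (worked example only, not part of the commit):

total, page_size, page = 103, 25, 3
total_pages = max(1, (total + page_size - 1) // page_size)  # (103 + 24) // 25 = 5
start = (page - 1) * page_size                              # 50
end = min(start + page_size, total)                         # 75
# Caption would read: "Showing 51–75 of 103 • Page 3/5"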
# ── Render page items ──────────────────────────────────────────────────────────
def _render_card(r):
    title = r.get("title", "(no title)")
    url = r.get("url", "")
    cats = r.get("categories") or r.get("cats") or []
    geo_tags = r.get("geo") or []
    _id = r.get("id") or r.get("url") or title
    posted = r.get("posted_date") or ""
    deadline = r.get("deadline") or ""
    days_left = _days_until(deadline)

    st.markdown(f"<div class='result-card'>", unsafe_allow_html=True)
    st.markdown(f"### {title}")
    meta = f"**Source:** {r.get('source','')} • **Geo:** {', '.join(geo_tags) if isinstance(geo_tags,list) else geo_tags} • **Categories:** {', '.join(cats) if isinstance(cats,list) else cats}"
    st.markdown(f"<div class='result-meta'>{meta}</div>", unsafe_allow_html=True)

    # Link / score
    if url and not url.startswith('http'):
        st.caption("Note: This item may display an ID instead of a full link. Open on Grants.gov if needed.")
    if url:
        st.write(f"[Open Link]({url})")
    if r.get("score") is not None:
        st.caption(f"Score: {r.get('score', 0):.3f}")

    # Deadline
    st.caption(f"Posted: {posted} • Deadline: {deadline} • {_deadline_badge(days_left)}")

    # Save/Hide
    c1, c2, _ = st.columns([1,1,6])
    if c1.button(("✅ Saved" if _id in st.session_state.saved_ids else "💾 Save"), key=f"save-{_id}"):
        _save_item(_id)
    if c2.button(("🙈 Hidden" if _id in st.session_state.hidden_ids else "🙈 Hide"), key=f"hide-{_id}"):
        _hide_item(_id)

    st.markdown("</div>", unsafe_allow_html=True)

if page_items:
    for r in page_items:
        _render_card(r)
else:
    if ran_search:
        st.info("No active grants match these filters right now.")
    else:
        st.info("Enter a query and click Search.")
st.markdown("""
<style>
/* ================== SELECT/MULTISELECT HARD OVERRIDE ================== */
/* Goal: kill white-on-white by styling the BaseWeb select root + portal. */
/* Works across Chrome/Safari/Firefox; includes -webkit-text-fill-color fix. */

/* 1) CLOSED CONTROL (the visible field) – target the BaseWeb root */
body div[data-baseweb="select"] {
  background: #1e293b !important;   /* dark field */
  color: #f8fafc !important;        /* light text */
  border: 1px solid #475569 !important;
  border-radius: 10px !important;
}

/* Make absolutely everything inside readable (some builds render text in spans) */
body div[data-baseweb="select"] * {
  color: #f8fafc !important;
  -webkit-text-fill-color: #f8fafc !important;  /* Safari/Chromium quirk */
  fill: #f8fafc !important;
}

/* Placeholder node BaseWeb renders (aria-hidden) */
body div[data-baseweb="select"] div[aria-hidden="true"] {
  color: #94a3b8 !important;
  -webkit-text-fill-color: #94a3b8 !important;
}

/* Ensure the actual input inherits readable color */
body div[data-baseweb="select"] input {
  color: #f8fafc !important;
  -webkit-text-fill-color: #f8fafc !important;
  caret-color: #f8fafc !important;
  background: transparent !important;
}

/* 2) OPEN DROPDOWN MENU (lives in a portal under <body>) */
body div[data-baseweb="popover"] [role="listbox"],
body div[data-baseweb="menu"],
body ul[role="listbox"] {
  background: #1e293b !important;
  color: #f8fafc !important;
  border: 1px solid #475569 !important;
  border-radius: 10px !important;
  z-index: 2147483647 !important;
}

/* Options inside the menu */
body [role="listbox"] [role="option"],
body div[data-baseweb="menu"] [role="option"] {
  background: transparent !important;
  color: #f8fafc !important;
}
body [role="listbox"] [role="option"]:hover,
body div[data-baseweb="menu"] [role="option"]:hover {
  background: #334155 !important;
}
body [role="listbox"] [role="option"][aria-selected="true"],
body div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
  background: #475569 !important;
  color: #ffffff !important;
}

/* 3) EMERGENCY FALLBACK – if a theme forces a white menu inline, flip text dark */
body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"],
body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] {
  background: #ffffff !important;
  color: #0f172a !important;
  border: 1px solid #cbd5e1 !important;
}
body div[data-baseweb="popover"][style*="rgb(255"] [role="listbox"] * ,
body div[data-baseweb="popover"][style*="255, 255, 255"] [role="listbox"] * {
  color: #0f172a !important;
  -webkit-text-fill-color: #0f172a !important;
}

/* 4) MULTISELECT CHIPS */
body [data-baseweb="tag"] {
  background: #334155 !important;
  color: #e2e8f0 !important;
  border-radius: 999px !important;
}

/* 5) OPTIONAL: turn on outlines once to verify the selector match (debug)
body div[data-baseweb="select"] { outline: 1px dashed #22d3ee !important; }
body div[data-baseweb="popover"] [role="listbox"] { outline: 1px dashed #22d3ee !important; }
*/
</style>
""", unsafe_allow_html=True)
app/utils/dedupe.py
ADDED
@@ -0,0 +1,29 @@
import re, hashlib
from difflib import SequenceMatcher

def _norm(text: str) -> str:
    """Lowercase, strip punctuation, collapse whitespace."""
    t = (text or "").lower()
    t = re.sub(r'[^a-z0-9 ]+', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()

def hash_fingerprint(title: str, agency: str, deadline: str) -> str:
    """
    A strong key: normalized title + agency + deadline.
    Use this as a primary key in your datastore.
    """
    base = f"{_norm(title)}|{_norm(agency)}|{deadline or ''}"
    return hashlib.sha1(base.encode()).hexdigest()

def near_duplicate(a: dict, b: dict) -> bool:
    """
    Fuzzy fallback: similar title & agency,
    and deadlines match or are both blank.
    """
    dates_close = (a.get("deadline") == b.get("deadline")) \
                  or (not a.get("deadline") and not b.get("deadline"))
    t_sim = SequenceMatcher(None, _norm(a.get("title","")),
                            _norm(b.get("title",""))).ratio()
    ag_sim = SequenceMatcher(None, _norm(a.get("agency","")),
                             _norm(b.get("agency",""))).ratio()
    return dates_close and (t_sim > 0.88) and (ag_sim > 0.75)
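The two functions are meant to be layered: the SHA-1 fingerprint catches exact re-posts, while near_duplicate catches reworded titles. A minimal sketch of a caller that applies both (the dedupe helper below is hypothetical and not part of this commit):

from app.utils.dedupe import hash_fingerprint, near_duplicate

def dedupe(records: list[dict]) -> list[dict]:
    """Hypothetical caller: exact fingerprint first, fuzzy comparison as a fallback."""
    seen, kept = set(), []
    for rec in records:
        fp = hash_fingerprint(rec.get("title", ""), rec.get("agency", ""), rec.get("deadline", ""))
        if fp in seen:
            continue
        if any(near_duplicate(rec, k) for k in kept):  # O(n^2), acceptable for small batches
            continue
        seen.add(fp)
        kept.append(rec)
    return kept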
app/utils/normalize.py
ADDED
@@ -0,0 +1,15 @@
# app/utils/normalize.py
ELIGIBILITY_MAP = {
    "nonprofit": "NONPROFIT_501C3",
    "501(c)(3)": "NONPROFIT_501C3",
    "local government": "LOCAL_GOV",
    "state government": "STATE_GOV",
    "higher education": "HIGHER_ED",
}
def normalize_eligibility(raw: str) -> list[str]:
    vals = []
    txt = (raw or "").lower()
    for k,v in ELIGIBILITY_MAP.items():
        if k in txt:
            vals.append(v)
    return sorted(set(vals)) or ["UNKNOWN"]
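Example behaviour (illustrative calls, not in the commit):

normalize_eligibility("Nonprofit organizations and units of local government")
# -> ['LOCAL_GOV', 'NONPROFIT_501C3']
normalize_eligibility(None)
# -> ['UNKNOWN']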
config/sources.yaml
CHANGED
@@ -1,6 +1,6 @@
 # Minimal, valid config – v6.3
 filters:
-  capacity_only:
+  capacity_only: false   # keep only capacity-building items
   pa_md_only: false      # set to true to restrict index to PA/MD

 sources:
@@ -46,7 +46,6 @@ sources:
       page_size: 100
       max_pages: 3
       payload:
-        # Target 5310 by ALN and keywords
         aln: "20.513"
         keyword: "\"Enhanced Mobility\" OR \"Section 5310\" OR seniors OR elderly OR disabilities OR paratransit OR wheelchair OR shuttle OR van OR bus"
         oppStatuses: "posted"
@@ -86,9 +85,27 @@ sources:
         fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
         sortBy: "openDate|desc"

+  # ---------- FEDERAL: Federal Register (broad NOFO scanning) ----------
+  - name: "Federal Register – Funding/NOFO keywords (API)"
+    type: http_json
+    enabled: true
+    url: "https://www.federalregister.gov/api/v1/documents.json"
+    geo: "US"
+    categories: ["capacity_building", "notices"]
+    api:
+      payload:
+        conditions[term]: "funding opportunity OR cooperative agreement OR NOFO"
+        per_page: 50
+        order: "newest"
+    parse:
+      item_path: "results[]"
+      title: "title"
+      link: "html_url"
+      published_at: "publication_date"
+      body: "abstract"
+
   # ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
-  # NOTE: These require adapters (http_html/web_page/http_pdf) you haven't implemented yet.
-  # They are kept here (enabled) in case your runtime supports them; otherwise set enabled: false.

   - name: "Maryland MTA – Grants (incl. 5310)"
     type: web_page
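The new http_json source is purely declarative: a query payload plus a field mapping under parse. A rough sketch of how a generic ingestor might apply that mapping to the Federal Register response (hypothetical helper, shown only to make the YAML concrete; the app's real adapter may differ):

import requests

def fetch_http_json(url: str, payload: dict, parse: dict) -> list[dict]:
    # Hypothetical sketch of an http_json adapter.
    resp = requests.get(url, params=payload, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    items = data.get(parse["item_path"].rstrip("[]"), [])  # "results[]" -> data["results"]
    return [{
        "title": it.get(parse["title"]),
        "url": it.get(parse["link"]),
        "posted_date": it.get(parse["published_at"]),
        "synopsis": it.get(parse["body"]),
    } for it in items]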
@@ -162,7 +179,7 @@ sources:
       mode: "article"
       keep_links: true

-  # --- Pennsylvania:
+  # --- Pennsylvania: PA Creative Industries (PCA) ---
   - name: "PA Creative Industries – Capacity Building (landing)"
     type: http_html
     enabled: true
@@ -226,22 +243,30 @@ sources:
     geo: "PA"
     categories: ["capacity_building"]

-  # --- Maryland: OneStop (
-  - name: "Maryland OneStop – Capacity search"
-    type:
+  # --- Maryland: OneStop (JS-rendered search) ---
+  - name: "Maryland OneStop – Capacity search (JS)"
+    type: http_html_js   # Playwright adapter
     enabled: true
     url: "https://onestop.md.gov/search?query=capacity"
     geo: "MD"
     categories: ["capacity_building"]
+    options:
+      wait_for: "[role='main']"
+      scroll: true
+      max_pages: 3
+      timeout_ms: 180000     # NEW: longer timeout for SPA
+      network_idle: true     # NEW: wait for background XHR/fetch to settle
+      # debug: true          # optional: screenshot on failure
+      # click_selector: "a[aria-label='Next']"  # uncomment if pagination controls appear
     parse:
       follow_links: true
       link_selectors:
         - "a[href*='/forms/']"
         - "a[href*='/search/']"
       content_selectors:
+        - "[role='main']"
         - "main"
         - "article"
-        - "[role='main']"

   # --- Maryland: DHCD (housing/community programs & press) ---
   - name: "MD DHCD – Programs (grants & loans index)"
@@ -304,6 +329,29 @@ sources:
     geo: "MD"
     categories: ["capacity_building"]

+  # --- Pennsylvania: DCED (Programs index; JS-rendered) ---
+  - name: "PA DCED – Programs (JS)"
+    type: http_html_js
+    enabled: true
+    url: "https://dced.pa.gov/programs/"
+    geo: "PA"
+    categories: ["capacity_building","community_development","economic_development"]
+    options:
+      wait_for: "main"
+      scroll: true
+      max_pages: 5
+      timeout_ms: 180000     # NEW
+      network_idle: true     # NEW
+      # click_selector: ".pagination a.next"
+      # debug: true
+    parse:
+      item_selector: ".program-listing .program, .content"   # fallback
+      title: ".program-title, h1, h2"
+      link: ".program-title a@href, a@href"
+      body: ".program-summary, .entry-content, main"
+      deadline_selector: ".deadline, .key-dates"
+      eligibility_selector: ".eligibility, .who-eligible"
+
   # ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
   - name: "State 5310 Listings (curated JSON)"
     type: json_static
@@ -311,3 +359,27 @@ sources:
     file: "data/state_5310_listings.json"
     geo: "PA|MD|VA|DC"
     categories: ["transportation","elderly","disabilities","5310","deadlines"]
+
+  - name: "Faith-based Foundations – Card/Grid (JS)"
+    type: http_html_js
+    enabled: true
+    skip_filters: true
+    url: "https://example.org/foundations/maryland/religion-related"
+    geo: "MD|PA|DE|NJ|VA"
+    categories: ["foundation_private","faith_based","capacity_building"]
+    options:
+      wait_for: "[role='main']"        # or the results container CSS
+      scroll: true
+      scroll_selector: ".results-pane" # <- replace with the REAL scrolling DIV
+      scroll_times: 40
+      scroll_wait_ms: 250
+      min_cards: 20
+      timeout_ms: 30000
+      network_idle: false
+      # click_selector: ".pagination a.next"  # only if the page has a Next button
+    selectors:
+      card: ".result-card, .card, article, .search-result"
+      title: "h2 a, h3 a, .card-title a, .result-title a, h2, h3, .card-title"
+      link: "h2 a, h3 a, .card-title a, .result-title a, a"
+      description: ".summary, .card-text, .excerpt, p"
+      meta: ".meta, .tags, .badge, .location"