Spaces: Running

Gen. Overseer Lupo committed · Commit 99589b3 · 1 Parent(s): 67b91ae

Add local extra JSON source and update config
Browse files
- "#" +0 -0
- "a" +0 -0
- app/ingest.py +134 -175
- app/main.py +31 -23
- app/normalize.py +102 -0
- app/paths.py +16 -0
- app/sources/grantsgov_api.py +104 -45
- app/ui_streamlit.py +55 -11
- config/v6.yaml +57 -0
- "ensure" +0 -0
- "is" +0 -0
- "package" +0 -0
"#" ADDED | File without changes

"a" ADDED | File without changes
app/ingest.py CHANGED
@@ -1,206 +1,165 @@

Old version:

…
from pathlib import Path
from typing import Dict, List
import yaml
from sentence_transformers import SentenceTransformer
import numpy as np

import …
from …

# --- helpers kept local to avoid circular/broken imports ---

import json
from pathlib import Path
from typing import List, Dict, Any

def load_fallback_json(path: str) -> List[Dict[str, Any]]:
    p = Path(path)
    with open(p, "r") as f:
        data = json.load(f)
    # Normalize to a list of dicts
    if isinstance(data, dict):
        data = data.get("items") or data.get("records") or [data]
    return data

# replaces the previous map_to_records
import hashlib
from typing import List, Dict, Any

def map_to_records(opp_hits: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Normalize Grants.gov hits to our internal schema with a guaranteed 'id'.
    """
    def mk_id(h: Dict[str, Any]) -> str:
        # Prefer official ids; fall back to number; else hash a few fields
        rid = h.get("id") or h.get("number") or h.get("oppNumber") or ""
        if isinstance(rid, str) and rid.strip():
            return rid.strip()
        basis = f"{h.get('title','')}|{h.get('agencyName','')}|{h.get('openDate','')}"
        return hashlib.md5(basis.encode("utf-8")).hexdigest()

    recs: List[Dict[str, Any]] = []
    for h in opp_hits:
        title = h.get("title") or h.get("number") or "Untitled opportunity"
        grants_id = h.get("id")
        url = h.get("url") or (f"https://www.grants.gov/search-results-detail/{grants_id}" if grants_id else None)

        desc_bits = [
            h.get("agencyName") or "",
            f"Status: {h.get('oppStatus','')}".strip(),
            f"Opens: {h.get('openDate','')}".strip(),
            f"Closes: {h.get('closeDate','')}".strip(),
        ]
        description = " | ".join([b for b in desc_bits if b])

        recs.append({
            "id": mk_id(h),
            "title": title,
            "url": url,
            "description": description,
            "geo": h.get("geo") or [],
            "categories": h.get("aln") or h.get("fundingCategories") or [],
            "source": "grants.gov",
        })
    return recs


def load_fallback_json(path: str):
    p = Path(path)
    with open(p, "r") as f:
        data = json.load(f)
    # Normalize to a list of dicts
    if isinstance(data, dict):
        data = data.get("items") or data.get("records") or [data]
    return data


def load_config(cfg_path: str) -> Dict:
    with open(cfg_path, "r") as f:
        return yaml.safe_load(f)


def hash_id(text: str) -> str:
    import hashlib
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]

def normalize_record(src_name: str, rec: Dict) -> Dict:
    return {
        "id": rec.get("id") or hash_id(rec.get("title","") + rec.get("url","")),
        "source": src_name,
        "title": rec.get("title","").strip(),
        "summary": rec.get("summary","").strip(),
        "url": rec.get("url","").strip(),
        "deadline": rec.get("deadline","").strip(),
        "eligibility": rec.get("eligibility","").strip(),
        "agency": rec.get("agency","").strip(),
        "geo": rec.get("geo","").strip(),
        "categories": rec.get("categories", []),
        "raw": rec
    }

def collect_from_grantsgov_api(src):
    """
    …
    """
    # …
    …
    # Normalize: ensure we have list[dict] before mapping
    items = result.get("hits") if isinstance(result, dict) else result
    if isinstance(items, dict) and "oppHits" in items:
        items = items["oppHits"]
    if items is None:
        items = []

    norm_items = []
    for x in items:
        if isinstance(x, str):
            try:
                x = json.loads(x)
            except Exception:
                continue
        if isinstance(x, dict):
            norm_items.append(x)

    # Map to our internal record schema
    basic = map_to_records(norm_items)
    return basic


def save_docstore(recs: List[Dict], env: Dict):
    ds_path = Path(env["DOCSTORE_DIR"]) / "docstore.jsonl"
    with open(ds_path, "w") as f:
        for r in recs:
            f.write(json.dumps(r) + "\n")
    return str(…

…
    if not ds_path.exists():
        raise RuntimeError("Docstore not found. Run ingest first.")
    …
        for line in f:
            rec = json.loads(line)
    …
    import faiss
    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    faiss.write_index(index, str(…
    …
    return len(texts)

…
    cfg = load_config(cfg_path)
    …
    for …
        if not …
            continue
    …
        else:
    …

if __name__ == "__main__":
    …
    print(f"Ingested {n} records. Docstore at {p}")

New version:

# app/ingest.py
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, List, Any

import yaml
import numpy as np
from sentence_transformers import SentenceTransformer

from app.paths import DOCSTORE_DIR, INDEX_DIR
from .normalize import normalize  # ← central normalizer


# -------------------- Config --------------------

def load_config(cfg_path: str) -> Dict:
    with open(cfg_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


# -------------------- Grants.gov collector --------------------

def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
    """
    Calls the Grants.gov Search2 client and returns a list of RAW dicts
    (adapter may already be close to unified; we'll still run normalize()).
    """
    from app.sources.grantsgov_api import search_grants  # local import to avoid cycles

    api = src.get("api", {})
    page_size = int(api.get("page_size", src.get("page_size", 100)))
    max_pages = int(api.get("max_pages", src.get("max_pages", 5)))
    payload = api.get("payload", src.get("payload", {}))
    url = src.get("url", "")

    out = search_grants(url, payload, page_size=page_size, max_pages=max_pages)
    hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
    return [h for h in hits if isinstance(h, dict)]


# -------------------- Write docstore & build index --------------------

def _save_docstore(recs: List[Dict[str, Any]]) -> str:
    DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
    path = DOCSTORE_DIR / "docstore.jsonl"
    with path.open("w", encoding="utf-8") as f:
        for r in recs:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    return str(path)


def _build_index_from_docstore() -> int:
    ds_path = DOCSTORE_DIR / "docstore.jsonl"
    if not ds_path.exists():
        raise RuntimeError("Docstore not found. Run ingest first.")

    # Load records
    texts: List[str] = []
    metas: List[Dict[str, Any]] = []
    with ds_path.open("r", encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            title = rec.get("title") or ""
            synopsis = rec.get("synopsis") or rec.get("summary") or ""
            agency = rec.get("agency") or ""
            eligibility = rec.get("eligibility") or ""
            txt = "\n".join([title, synopsis, agency, eligibility]).strip()
            texts.append(txt)

            metas.append({
                "id": rec.get("id"),
                "title": title,
                "url": rec.get("url"),
                "source": rec.get("source"),
                "geo": rec.get("geo"),
                "categories": rec.get("categories"),
                "agency": agency,
                "deadline": rec.get("deadline"),
                "program_number": rec.get("program_number"),
                "posted_date": rec.get("posted_date"),
            })

    # Embed
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
    model.max_seq_length = 256
    emb = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=32,
    ).astype(np.float32, copy=False)

    # FAISS index
    import faiss
    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
    (INDEX_DIR / "meta.json").write_text(json.dumps(metas, ensure_ascii=False))

    return len(texts)


# -------------------- Ingest main --------------------

def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
    """
    Reads config, fetches from enabled sources, normalizes with a single map,
    attaches categories/geo consistently, DEDUPEs, and builds the index.
    """
    cfg = load_config(cfg_path)

    all_rows: List[Dict[str, Any]] = []
    for entry in cfg.get("sources", []):
        if not entry.get("enabled"):
            continue

        geo = entry.get("geo") or "US"
        cats = entry.get("categories") or []
        static = {"geo": geo, "categories": cats}

        typ = entry.get("type")
        rows: List[Dict[str, Any]] = []

        if typ == "grantsgov_api":
            raw_hits = _collect_from_grantsgov_api(entry)
            rows = [normalize("grants_gov", h, static) for h in raw_hits]

        elif typ == "local_sample":
            p = Path(entry["path"]).expanduser()
            blob = json.loads(p.read_text(encoding="utf-8"))
            items = blob.get("opportunities") or []
            rows = [normalize("local_sample", op, static) for op in items]

        else:
            # Future adapters (doj_ojp, state_md, …) would plug in here using normalize("<key>", raw, static)
            rows = []

        all_rows.extend(rows)

    # ---- DEDUPE (id → url → title) ----
    seen, unique = set(), []
    for r in all_rows:
        key = r.get("id") or r.get("url") or r.get("title")
        if not key or key in seen:
            continue
        seen.add(key)
        unique.append(r)

    path = _save_docstore(unique)
    n = _build_index_from_docstore()
    return path, n


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--config", default="config/sources.yaml")
    args = ap.parse_args()
    p, n = ingest(args.config)
    print(f"Ingested {n} records. Docstore at {p}")
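The dedupe pass above keys each row on id, then url, then title, keeping the first occurrence. A minimal sketch with invented rows shows the order of precedence:

# Sketch of the id -> url -> title dedupe in ingest(); rows are invented.
rows = [
    {"id": "gg:ABC-1", "url": None, "title": "Reentry Capacity"},
    {"id": "gg:ABC-1", "url": None, "title": "Reentry Capacity (dup)"},  # same id: dropped
    {"id": None, "url": "https://example.org/x", "title": "X"},          # keyed by url
    {"id": None, "url": None, "title": "X"},                             # keyed by title: kept
]
seen, unique = set(), []
for r in rows:
    key = r.get("id") or r.get("url") or r.get("title")
    if not key or key in seen:
        continue
    seen.add(key)
    unique.append(r)
assert len(unique) == 3  # only the duplicate id is removed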
app/main.py CHANGED

Old version:

@@ -1,3 +1,4 @@
import argparse, os, json
from pathlib import Path
from dotenv import dotenv_values

@@ -5,38 +6,43 @@ import pandas as pd

from app.ingest import ingest
from app.search import search


def get_env():
    """
    Load environment
    …
    """
    …
    # …
    …
    env.setdefault("…
    env.setdefault("INDEX_DIR", default_path("data/index"))
    env.setdefault("EXPORT_DIR", default_path("data/exports"))
    …
    return env


def ensure_index_exists(env: dict):
    """
    Ensure a FAISS index exists in env['INDEX_DIR'].
    If missing, run a minimal ingest using config/sources.yaml.
    This lets the Hugging Face Space self-heal on first boot.
    """
    index_dir = Path(env["INDEX_DIR"])
    faiss_idx = index_dir / "faiss.index"

@@ -46,14 +52,13 @@ def ensure_index_exists(env):
        return  # already built

    print("Index not found. Building now via ingest() …")
    # …
    # If your ingest …
    # Settings → Variables and secrets, then Restart the Space.
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_ingest(args):
    env = get_env()
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")

@@ -61,7 +66,7 @@ def cmd_ingest(args):

def cmd_search(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")

@@ -69,13 +74,16 @@ def cmd_search(args):
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    for r in res:
        …


def cmd_export(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")

New version:

# app/main.py
import argparse, os, json
from pathlib import Path
from dotenv import dotenv_values
…
from app.ingest import ingest
from app.search import search
from app.paths import DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR  # ← canonical paths


def get_env():
    """
    Load environment with safe, repo-relative defaults from app.paths.
    - Honors .env (dotenv) and real env vars if set.
    - Falls back to ./data, ./data/docstore, ./data/index, ./data/exports
      which work on macOS AND Hugging Face Spaces.
    """
    # 1) Start with .env (if present)
    env = dict(dotenv_values(".env") or {})

    # 2) Merge in process env (so Space secrets / shell vars override .env)
    for k, v in os.environ.items():
        env[k] = v

    # 3) Provide safe defaults from app.paths if not specified
    env.setdefault("DATA_DIR", str(DATA_DIR))
    env.setdefault("DOCSTORE_DIR", str(DOCSTORE_DIR))
    env.setdefault("INDEX_DIR", str(INDEX_DIR))
    env.setdefault("EXPORT_DIR", str(EXPORT_DIR))

    # Optional UI/debug flags
    env.setdefault("SHOW_DEV", "0")

    # 4) Ensure directories exist
    for k in ("DATA_DIR", "DOCSTORE_DIR", "INDEX_DIR", "EXPORT_DIR"):
        Path(env[k]).mkdir(parents=True, exist_ok=True)

    return env


def ensure_index_exists(env: dict):
    """
    Ensure a FAISS index exists in env['INDEX_DIR'].
    If missing, run a minimal ingest using config/sources.yaml.
    """
    index_dir = Path(env["INDEX_DIR"])
    faiss_idx = index_dir / "faiss.index"
    …
        return  # already built

    print("Index not found. Building now via ingest() …")
    # Ingest reads config and writes index/meta/docstore
    # If your ingest needs API keys, set them in Space Settings → Variables
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")


def cmd_ingest(_args):
    env = get_env()
    path, n = ingest("config/sources.yaml", env)
    print(f"Ingest complete. {n} records. Docstore: {path}")

…
def cmd_search(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
    …
        filters["categories"] = args.categories.split(",")
    res = search(args.q, env, top_k=args.k, filters=filters)
    for r in res:
        geo = r.get("geo")
        if isinstance(geo, list):
            geo = ",".join(geo)
        print(f"- {r.get('title','(no title)')} [{r.get('source','')}] ({geo}) score={r.get('score',0):.3f}")
        print(f"  {r.get('url','')}")


def cmd_export(args):
    env = get_env()
    ensure_index_exists(env)
    filters = {}
    if args.geo:
        filters["geo"] = args.geo.split(",")
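The numbered steps in get_env() give a strict precedence: process environment over .env over app.paths defaults. A tiny sketch of that merge order on plain dicts (the paths are invented):

env = {"INDEX_DIR": "/tmp/from-dotenv"}        # 1) .env values
env.update({"INDEX_DIR": "/tmp/from-shell"})   # 2) os.environ wins on conflict
env.setdefault("INDEX_DIR", "data/index")      # 3) default only fills a gap
assert env["INDEX_DIR"] == "/tmp/from-shell"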
app/normalize.py ADDED
@@ -0,0 +1,102 @@

from typing import Any, Dict, Callable, Optional
from datetime import datetime

def _iso(d: Any) -> Optional[str]:
    if not d:
        return None
    s = str(d)
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(s, fmt).date().isoformat()
        except Exception:
            pass
    try:
        return datetime.fromisoformat(s).date().isoformat()
    except Exception:
        return None

def _first(x: Any) -> Any:
    return (x[0] if isinstance(x, (list, tuple)) and x else x)

def _list(x: Any) -> list:
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, (set, tuple)):
        return list(x)
    return [x]

# Registry of source mappers: raw -> unified schema
MAPPERS: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]] = {}

def mapper(name: str):
    def _wrap(fn: Callable[[Dict[str, Any]], Dict[str, Any]]):
        MAPPERS[name] = fn
        return fn
    return _wrap

@mapper("grants_gov")
def _map_grants_gov(h: Dict[str, Any]) -> Dict[str, Any]:
    gg_id = h.get("id")
    num = h.get("number")
    aln_list = h.get("alnist") or h.get("aln") or []

    out: Dict[str, Any] = {
        "id": f"gg:{num or gg_id}",
        "source": "grants.gov",
        "title": h.get("title"),
        "agency": h.get("agencyName") or h.get("agencyCode") or h.get("agency"),
        "program_number": _first(aln_list) or h.get("program_number"),
        "posted_date": _iso(h.get("openDate") or h.get("posted_date")),
        "deadline": _iso(h.get("closeDate") or h.get("deadline")),
        "synopsis": h.get("synopsis") or h.get("summary"),
        "location_scope": h.get("location_scope") or ["US"],
        "tags": h.get("tags") or [],
        "url": h.get("url") or (f"https://www.grants.gov/search-results-detail/{gg_id}" if gg_id else None),
        "raw": h,
    }
    # Optionals if present on the raw record
    for k_src, k_dst in [
        ("awardFloor", "award_floor"),
        ("awardCeiling", "award_ceiling"),
        ("expectedNumberOfAwards", "expected_awards"),
        ("eligibility", "eligibility"),
    ]:
        if h.get(k_src) is not None or h.get(k_dst) is not None:
            out[k_dst] = h.get(k_dst) if h.get(k_dst) is not None else h.get(k_src)
    return out

@mapper("local_sample")
def _map_local_sample(op: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "id": f"sample:{op.get('opportunityNumber')}",
        "source": "sample_local",
        "title": op.get("opportunityTitle"),
        "agency": op.get("agency"),
        "program_number": None,
        "posted_date": _iso(op.get("postedDate")),
        "deadline": _iso(op.get("closeDate")),
        "synopsis": op.get("synopsis"),
        "location_scope": ["US"],
        "tags": [],
        "url": None,
        "raw": op,
    }

def normalize(source_key: str, raw: Dict[str, Any], static: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    if source_key not in MAPPERS:
        raise KeyError("No mapper registered for %r" % source_key)
    rec = MAPPERS[source_key](raw)
    static = static or {}
    # attach geo
    if static.get("geo"):
        rec["geo"] = static["geo"]
    # attach categories and mirror into tags
    cats = _list(static.get("categories"))
    rec.setdefault("categories", [])
    for c in cats:
        if c not in rec["categories"]:
            rec["categories"].append(c)
    rec["tags"] = list(set(_list(rec.get("tags")) + cats))
    return rec
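A short usage sketch of the registry; the raw hit below is invented, but the field mapping follows _map_grants_gov and normalize() as written:

from app.normalize import normalize

raw_hit = {  # invented Grants.gov-style hit
    "id": "358642",
    "number": "O-DOL-2025-01",
    "title": "Reentry Employment Opportunities",
    "agencyName": "Department of Labor",
    "openDate": "2025-01-15",
    "closeDate": "2025-03-01",
    "alnist": ["17.270"],
}
rec = normalize("grants_gov", raw_hit, static={"geo": "US", "categories": ["reentry"]})
assert rec["id"] == "gg:O-DOL-2025-01"    # opportunity number preferred over numeric id
assert rec["program_number"] == "17.270"  # first ALN
assert rec["geo"] == "US" and "reentry" in rec["tags"]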
app/paths.py ADDED
@@ -0,0 +1,16 @@

# app/paths.py
import os
import pathlib

# Repo root (…/grants_rag_app)
ROOT = pathlib.Path(__file__).resolve().parents[1]

# Defaults: repo-relative folders that work on Mac AND on Hugging Face
DATA_DIR = pathlib.Path(os.getenv("DATA_DIR", ROOT / "data"))
DOCSTORE_DIR = pathlib.Path(os.getenv("DOCSTORE_DIR", DATA_DIR / "docstore"))
INDEX_DIR = pathlib.Path(os.getenv("INDEX_DIR", DATA_DIR / "index"))
EXPORT_DIR = pathlib.Path(os.getenv("EXPORT_DIR", DATA_DIR / "exports"))

# Make sure they exist (no-ops if they already do)
for p in (DATA_DIR, DOCSTORE_DIR, INDEX_DIR, EXPORT_DIR):
    p.mkdir(parents=True, exist_ok=True)
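Because the constants resolve once at import time, an override has to be in the environment before app.paths is first imported; a sketch (the override path is hypothetical):

import os
os.environ["DATA_DIR"] = "/tmp/demo-data"  # hypothetical override, set before import
from app import paths
# DATA_DIR honors the env var; DOCSTORE_DIR derives from it when not set itself
assert str(paths.DATA_DIR) == "/tmp/demo-data"
assert str(paths.DOCSTORE_DIR) == "/tmp/demo-data/docstore"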
app/sources/grantsgov_api.py CHANGED

Old version:

@@ -1,46 +1,85 @@
# app/sources/grantsgov_api.py
from __future__ import annotations
import requests
from typing import Dict, List, Any

…

def _coerce_pipe(v: Any) -> str:
    """
    Accepts list/tuple/str/None and returns Grants.gov pipe-delimited string.
    """
    if v is None:
        return ""
    if isinstance(v, (list, tuple, set)):
        return "|".join([str(x) for x in v if x])
    return str(v)

def …
    """
    Calls Grants.gov Search2 API with pagination
    …
    {
      "hits": [ ...normalized opportunities... ],
      "hitCount": int
    }
    """
    all_hits: List[Dict[str, Any]] = []
    start = 0
    pages = 0
    hit_count = None

    # …
    keyword = payload.get("keyword", "") or payload.get("keywords", "")
    oppNum = payload.get("oppNum", "")
    eligibilities = _coerce_pipe(payload.get("eligibilities", ""))
    agencies …
    oppStatuses …
    aln …
    fundingCategories = _coerce_pipe(payload.get("fundingCategories", ""))

    session = requests.Session()

@@ -49,6 +88,7 @@ def search_grants(_unused_url: str, payload: Dict[str, Any], page_size: int = 10…
    while pages < max_pages:
        req_body = {
            "rows": page_size,
            "keyword": keyword,
            "oppNum": oppNum,
            "eligibilities": eligibilities,

@@ -56,42 +96,61 @@
            "oppStatuses": oppStatuses,
            "aln": aln,
            "fundingCategories": fundingCategories,
            "startRecordNum": start,  # paginate
        }

        resp = session.post(API_URL, json=req_body, headers=headers, timeout=timeout)
        resp.raise_for_status()
        j = resp.json()

        data = j.get("data"…
        if hit_count is None:
            …
        opp_hits = data.get("oppHits"…
        …
        for h in opp_hits:
            …
            "…
            "closeDate": h.get("closeDate"),
            "oppStatus": h.get("oppStatus"),
            "docType": h.get("docType"),
            "aln": h.get("alnist", []),  # list of ALNs
            "source": "grants.gov",
            "…
            …

        # advance
        got = len(opp_hits)
        if got == 0:
            break
        start += got
        pages += 1
        if start >= hit_count:
            break

    return {"hits": all_hits, "hitCount": hit_count or 0}

New version:

# app/sources/grantsgov_api.py
from __future__ import annotations
from typing import Dict, List, Any, Optional
from datetime import datetime
import requests

# Official Grants.gov Search2 endpoint (JSON POST)
API_URL = "https://api.grants.gov/v1/api/search2"

def _coerce_pipe(v: Any) -> str:
    """Accept list/tuple/set/str/None and return pipe-delimited string."""
    if v is None:
        return ""
    if isinstance(v, (list, tuple, set)):
        return "|".join([str(x) for x in v if x])
    return str(v)

def _first(x: Any) -> Optional[str]:
    if isinstance(x, (list, tuple)) and x:
        return str(x[0])
    return str(x) if x is not None else None

def _parse_date(d: Any) -> Optional[str]:
    """Return YYYY-MM-DD or None (be tolerant to formats)."""
    if not d:
        return None
    s = str(d)
    # common formats seen in the API
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(s, fmt).date().isoformat()
        except Exception:
            pass
    try:
        return datetime.fromisoformat(s).date().isoformat()
    except Exception:
        return None

# Map common config keys → API keys so older configs still work
_KEY_MAP = {
    "opportunityStatuses": "oppStatuses",
    "agencyCodes": "agencies",
    "agencies": "agencies",
    "alns": "aln",
}

def _remap_payload_keys(payload: Dict[str, Any]) -> Dict[str, Any]:
    out = dict(payload or {})
    for k, v in list(out.items()):
        if k in _KEY_MAP:
            out[_KEY_MAP[k]] = v
    return out

def search_grants(
    _unused_url: str,
    payload: Dict[str, Any],
    page_size: int = 100,
    max_pages: int = 10,
    timeout: int = 30,
) -> Dict[str, Any]:
    """
    Calls Grants.gov Search2 API with pagination and returns normalized results:

    Returns:
        {
          "hits": [ { unified schema per record }, ... ],
          "hitCount": int
        }
    """
    all_hits: List[Dict[str, Any]] = []
    start = 0
    pages = 0
    hit_count: Optional[int] = None

    # Bridge payload keys and coerce to API expectations
    payload = _remap_payload_keys(payload or {})
    keyword = payload.get("keyword", "") or payload.get("keywords", "")
    oppNum = payload.get("oppNum", "")
    eligibilities = _coerce_pipe(payload.get("eligibilities", ""))
    agencies = _coerce_pipe(payload.get("agencies", ""))
    oppStatuses = _coerce_pipe(payload.get("oppStatuses", "")) or "forecasted|posted"
    aln = _coerce_pipe(payload.get("aln", ""))
    fundingCategories = _coerce_pipe(payload.get("fundingCategories", ""))

    session = requests.Session()
    …
    while pages < max_pages:
        req_body = {
            "rows": page_size,
            "startRecordNum": start,  # pagination
            "keyword": keyword,
            "oppNum": oppNum,
            "eligibilities": eligibilities,
            …
            "oppStatuses": oppStatuses,
            "aln": aln,
            "fundingCategories": fundingCategories,
        }

        resp = session.post(API_URL, json=req_body, headers=headers, timeout=timeout)
        resp.raise_for_status()
        j = resp.json() or {}

        data = j.get("data") or {}
        if hit_count is None:
            try:
                hit_count = int(data.get("hitCount", 0))
            except Exception:
                hit_count = 0

        opp_hits = data.get("oppHits") or []
        if not opp_hits:
            break

        # ---- Normalize each record to unified schema ----
        for h in opp_hits:
            gg_id = h.get("id")
            num = h.get("number")
            aln_list = h.get("alnist", []) or []

            norm = {
                # unified schema (stable id avoids duplicates across configs)
                "id": f"gg:{num or gg_id}",
                "source": "grants.gov",
                "title": h.get("title"),
                "agency": h.get("agencyName") or h.get("agencyCode"),
                "program_number": _first(aln_list),  # Assistance Listing (ALN/CFDA)
                "posted_date": _parse_date(h.get("openDate")),
                "deadline": _parse_date(h.get("closeDate")),
                "synopsis": h.get("synopsis") or h.get("summary"),
                "location_scope": ["US"],  # Grants.gov is US-wide by default
                "tags": [],  # to be extended by ingest with config categories
                "url": f"https://www.grants.gov/search-results-detail/{gg_id}" if gg_id else None,
                "raw": h,  # keep full source blob for traceability
            }

            # Optional award fields if present (keep None if absent)
            if "awardFloor" in h:
                norm["award_floor"] = h.get("awardFloor")
            if "awardCeiling" in h:
                norm["award_ceiling"] = h.get("awardCeiling")
            if "expectedNumberOfAwards" in h:
                norm["expected_awards"] = h.get("expectedNumberOfAwards")
            if "eligibility" in h:
                norm["eligibility"] = h.get("eligibility")

            all_hits.append(norm)

        got = len(opp_hits)
        …
        start += got
        pages += 1
        if hit_count is not None and start >= hit_count:
            break

    return {"hits": all_hits, "hitCount": hit_count or 0}
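A usage sketch of the client; the payload values here are invented, the call performs a live POST to the Search2 endpoint, and the first positional argument is ignored by design (kept so older config entries that pass a url keep working):

from app.sources.grantsgov_api import search_grants

out = search_grants(
    "",  # _unused_url
    {"keyword": "reentry", "agencyCodes": ["DOL"], "opportunityStatuses": "posted"},
    page_size=50,
    max_pages=2,
)
# _remap_payload_keys bridges agencyCodes -> agencies and
# opportunityStatuses -> oppStatuses before the request is built
print(out["hitCount"], len(out["hits"]))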
app/ui_streamlit.py CHANGED

Old version:

@@ -1,4 +1,11 @@
# app/ui_streamlit.py
import os, json
from pathlib import Path

@@ -148,8 +155,6 @@ span.chip { display:inline-block; padding:3px 8px; border-radius:999px; backgrou…
.hero-text h1 { margin:0; font-size:28px; font-weight:700; color:#f97316; }
.hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; } /* gold */


/* ===== FORCE DARK SELECT / MULTISELECT (works across Streamlit versions) ===== */

/* Closed control (the visible box) */

@@ -204,8 +209,6 @@ div[data-baseweb="menu"] [role="option"][aria-selected="true"] {
    background: #334155 !important;
    color: #f8fafc !important;
}


</style>
""", unsafe_allow_html=True)

@@ -222,7 +225,6 @@ st.markdown("""
# ── Hide developer diagnostics by default ─────────────────────────────────────
SHOW_DEV = os.environ.get("SHOW_DEV") == "1"


# ── Environment + index ───────────────────────────────────────────────────────
_env = get_env()
ensure_index_exists(_env)

@@ -285,11 +287,22 @@ q = st.text_input("Search query", value=default_q)
geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "MA"], default=[])
categories = st.multiselect(
    "Category filter (optional)",
    options=[…
    default=[]
)

top_k = st.slider("Results", 5, 50, 15)

@@ -347,20 +360,46 @@ with col1:

with col2:
    if st.button("Export Results to CSV"):
        …
        if not …
            st.warning("No results to export. Run a search first.")
        else:
            os.makedirs(_env["EXPORT_DIR"], exist_ok=True)
            out_path = os.path.join(_env["EXPORT_DIR"], "results.csv")
            import pandas as pd
            pd.DataFrame(…
            st.success(f"Exported to {out_path}")


st.markdown("---")

results = st.session_state.get("results", [])
if results:
    st.caption(f"Results: {len(results)}")
    for r in results:

@@ -375,6 +414,11 @@ if results:
        if url and not url.startswith("http"):
            st.caption("Note: This item may display an ID or number instead of a full link. Open on Grants.gov if needed.")
        st.write(f"[Open Link]({url}) \nScore: {r.get('score', 0):.3f}")
        st.markdown("---")
else:
    st.info("Enter a query and click Search.")

New version:

# app/ui_streamlit.py

# Ensure project root is on sys.path when Streamlit runs this as a script
import sys, pathlib
ROOT = pathlib.Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import os, json
from pathlib import Path
…
.hero-text h1 { margin:0; font-size:28px; font-weight:700; color:#f97316; }
.hero-text p { margin:6px 0 0; font-size:15px; color:#fcd34d; } /* gold */

/* ===== FORCE DARK SELECT / MULTISELECT (works across Streamlit versions) ===== */

/* Closed control (the visible box) */
…
    background: #334155 !important;
    color: #f8fafc !important;
}
</style>
""", unsafe_allow_html=True)
…
# ── Hide developer diagnostics by default ─────────────────────────────────────
SHOW_DEV = os.environ.get("SHOW_DEV") == "1"

# ── Environment + index ───────────────────────────────────────────────────────
_env = get_env()
ensure_index_exists(_env)
…
geo = st.multiselect("Geo filter (optional)", options=["US", "MD", "MA"], default=[])
categories = st.multiselect(
    "Category filter (optional)",
    options=[
        "capacity_building","elderly","prison_ministry","evangelism",
        "transportation","vehicle",
        "justice","reentry","victim_services","youth","women","workforce"
    ],
    default=[]
)


top_k = st.slider("Results", 5, 50, 15)
sort_by = st.selectbox(
    "Sort by",
    ["Relevance", "Deadline (soonest first)"],
    index=0,
)
only_open = st.checkbox("Only show opportunities with a future deadline", value=True)

# Build filters only when selected
filters = {}
…
with col2:
    if st.button("Export Results to CSV"):
        results_for_export = st.session_state.get("results", [])
        if not results_for_export:
            st.warning("No results to export. Run a search first.")
        else:
            os.makedirs(_env["EXPORT_DIR"], exist_ok=True)
            out_path = os.path.join(_env["EXPORT_DIR"], "results.csv")
            import pandas as pd
            pd.DataFrame(results_for_export).to_csv(out_path, index=False)
            st.success(f"Exported to {out_path}")

st.markdown("---")

# ---- Sorting/filter helpers ----
from datetime import date, datetime

def _to_date(d):
    if not d:
        return None
    try:
        return datetime.fromisoformat(str(d)).date()
    except Exception:
        return None

# Pull results from session state and then apply UI-level filters/sorts
results = st.session_state.get("results", [])

# Optionally filter to only-open
if only_open and results:
    results = [r for r in results if (_to_date(r.get("deadline")) or date.max) >= date.today()]

# Apply sort if selected
if sort_by.startswith("Deadline") and results:
    results.sort(
        key=lambda r: (
            _to_date(r.get("deadline")) is None,
            _to_date(r.get("deadline")) or date.max,
        )
    )

# ---- Render results ----
if results:
    st.caption(f"Results: {len(results)}")
    for r in results:
        …
        if url and not url.startswith("http"):
            st.caption("Note: This item may display an ID or number instead of a full link. Open on Grants.gov if needed.")
        st.write(f"[Open Link]({url}) \nScore: {r.get('score', 0):.3f}")

        posted = r.get("posted_date") or ""
        deadline = r.get("deadline") or ""
        st.caption(f"Posted: {posted} • Deadline: {deadline}")

        st.markdown("---")
else:
    st.info("Enter a query and click Search.")
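The two-part sort key above deliberately pushes records with no parseable deadline to the end; a quick check with invented deadlines:

from datetime import date, datetime

def _to_date(d):  # same helper as in the UI code above
    if not d:
        return None
    try:
        return datetime.fromisoformat(str(d)).date()
    except Exception:
        return None

rows = [{"deadline": None}, {"deadline": "2025-02-01"}, {"deadline": "2025-01-15"}]
rows.sort(key=lambda r: (_to_date(r.get("deadline")) is None,
                         _to_date(r.get("deadline")) or date.max))
assert [r["deadline"] for r in rows] == ["2025-01-15", "2025-02-01", None]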
config/v6.yaml CHANGED

@@ -15,6 +15,20 @@ sources:
        keyword: "capacity building"
        oppStatuses: "posted"

  - name: "DOL – Reentry & Workforce Capacity"
    type: grantsgov_api
    enabled: true
    url: "https://www.grants.gov/grantsws/rest/opportunities/search/"
    geo: "US"
    categories: ["justice","reentry","workforce"]  # simple, uncluttered tags
    api:
      page_size: 100
      max_pages: 5
      payload:
        keyword: "reentry workforce employment training apprenticeships transitional jobs capacity building technical assistance"
        oppStatuses: ["posted"]
        agencies: ["DOL"]  # broad DOL umbrella

  - name: "Grants.gov (API: capacity building - vehicles/transportation)"
    type: grantsgov_api
    enabled: true

@@ -50,3 +64,46 @@
        keyword: "5310 Enhanced Mobility Seniors Individuals with Disabilities"
        oppStatuses: "posted"
        agencyCodes: ["FTA"]

  # --- DOJ / OJP family via Grants.gov ---
  - name: "DOJ (All) – Capacity/Reentry"
    type: grantsgov_api
    enabled: true
    url: "https://www.grants.gov/grantsws/rest/opportunities/search/"
    geo: "US"
    categories: ["justice", "reentry"]
    api:
      page_size: 100
      max_pages: 5
      payload:
        keyword: "capacity building reentry community outreach victim services youth violence prevention"
        oppStatuses: ["posted"]
        agencies: ["DOJ"]  # parent Dept. of Justice

  - name: "OJP (BJA/OJJDP/OVC/BJS) – Capacity"
    type: grantsgov_api
    enabled: true
    url: "https://www.grants.gov/grantsws/rest/opportunities/search/"
    geo: "US"
    categories: ["justice", "victim_services", "youth", "reentry"]
    api:
      page_size: 100
      max_pages: 5
      payload:
        keyword: "capacity building community-based nonprofit technical assistance case management"
        oppStatuses: ["posted"]
        agencies: ["OJP","BJA","OJJDP","OVC","BJS"]

  - name: "OVW – Violence Against Women"
    type: grantsgov_api
    enabled: true
    url: "https://www.grants.gov/grantsws/rest/opportunities/search/"
    geo: "US"
    categories: ["justice", "victim_services", "women"]
    api:
      page_size: 100
      max_pages: 5
      payload:
        keyword: "capacity building advocacy shelter prevention prosecution"
        oppStatuses: ["posted"]
        agencies: ["OVW"]
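Each new entry flows through ingest() the same way: the api block supplies paging and the payload for search_grants, while geo/categories become the static dict handed to normalize(). A sketch with a trimmed copy of the DOL entry (assuming PyYAML):

import yaml

entry = yaml.safe_load("""
name: "DOL – Reentry & Workforce Capacity"
type: grantsgov_api
enabled: true
geo: "US"
categories: ["justice", "reentry", "workforce"]
api:
  page_size: 100
  max_pages: 5
  payload:
    oppStatuses: ["posted"]
    agencies: ["DOL"]
""")
api = entry.get("api", {})
static = {"geo": entry.get("geo") or "US", "categories": entry.get("categories") or []}
# ingest() then calls search_grants(entry.get("url", ""), api["payload"],
#                                   page_size=api["page_size"], max_pages=api["max_pages"])
# and wraps every returned hit with normalize("grants_gov", hit, static)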
"ensure" ADDED | File without changes

"is" ADDED | File without changes

"package" ADDED | File without changes