Yeetek committed on
Commit
6a06d7b
·
verified ·
1 Parent(s): 87831b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -5
app.py CHANGED
@@ -38,11 +38,13 @@ os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
38
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
39
 
40
  # 2) Config from ENV
41
- MODEL_NAME = os.getenv("EMBED_MODEL", "seznam/simcse-small-e-czech")
 
 
42
  MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
43
  MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
44
 
45
- # 3) Set HF cache envs to a writeable folder (once at startup)
46
  cache_dir = "/tmp/hfcache"
47
  os.makedirs(cache_dir, exist_ok=True)
48
  import stat
@@ -52,11 +54,17 @@ os.environ["TRANSFORMERS_CACHE"] = cache_dir
52
  os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
53
 
54
  # 4) Initialise embeddings once
55
- embeddings = SentenceTransformer(MODEL_NAME, cache_folder=cache_dir)
 
 
 
 
 
 
 
56
 
57
  # Pre-initialize fallback global models for small-batch debugging
58
- # Global UMAP: 2-neighbors, cosine space, random init
59
- global_umap = UMAP(
60
  n_neighbors=2,
61
  metric="cosine",
62
  init="random",
 
38
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
39
 
40
  # 2) Config from ENV
41
+ # Read model name from env; it is used unchanged as the HF repo ID
42
+ env_model = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
43
+ MODEL_NAME = env_model
44
  MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
45
  MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
46
 
47
+ # 3) Set HF cache envs to a writeable folder (once at startup)
48
  cache_dir = "/tmp/hfcache"
49
  os.makedirs(cache_dir, exist_ok=True)
50
  import stat
 
54
  os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
55
 
56
  # 4) Initialise embeddings once
57
+ # Use huggingface_hub to snapshot-download the model locally
58
+ from huggingface_hub import snapshot_download
59
+ print(f"Downloading model {MODEL_NAME} to {cache_dir}...")
60
+ sys.stdout.flush()
61
+ local_model_path = snapshot_download(repo_id=MODEL_NAME, cache_dir=cache_dir)
62
+
63
+ # Load SentenceTransformer from local path
64
+ embeddings = SentenceTransformer(local_model_path, cache_folder=cache_dir)
65
 
66
  # Pre-initialize fallback global models for small-batch debugging
67
+ # Global UMAP: 2-neighbors, cosine space, random init
+ global_umap = UMAP(
 
68
  n_neighbors=2,
69
  metric="cosine",
70
  init="random",