psyrishi committed on
Commit 3efb860 · 1 parent: b630781

🚀 Initial commit: narrative summarizer with BART model

Files changed (7)
  1. .gitattributes +2 -35
  2. .gitignore +34 -0
  3. LICENSE +21 -0
  4. README.md +156 -6
  5. app.py +67 -0
  6. requirements.txt +4 -0
  7. summarizer.py +75 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.py text eol=lf
+ *.txt text eol=lf
.gitignore ADDED
@@ -0,0 +1,34 @@
+ # Ignore PyCharm / IntelliJ project files
+ .idea/
+ *.iml
+
+ # Byte-compiled / cache
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+ *.so
+ *.log
+
+ # System files
+ .DS_Store
+ Thumbs.db
+
+ # Checkpointing / runtime folders (auto-created at runtime)
+ inputs/
+ outputs/
+ checkpoints/
+
+ # Hugging Face / Gradio cache (optional)
+ .gradio/
+ hf_cache/
+
+ # Environment files
+ .env
+ *.env
+ .venv/
+ venv/
+
+ # OS-specific
+ ehthumbs.db
+ Icon?
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 psyrishi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ IN THE SOFTWARE.
README.md CHANGED
@@ -1,14 +1,164 @@
  ---
  title: Narrative Summarizer
- emoji: ⚡
- colorFrom: blue
- colorTo: purple
  sdk: gradio
- sdk_version: 5.45.0
  app_file: app.py
  pinned: false
  license: mit
- short_description: Summarizer for .txt files using BART model and custom prompt
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
  title: Narrative Summarizer
+ emoji: 📚
+ colorFrom: indigo
+ colorTo: blue
  sdk: gradio
+ sdk_version: 4.15.0
  app_file: app.py
  pinned: false
  license: mit
+ tags:
+ - summarization
+ - text
+ - transformer
+ - bart
+ - compression
+ - gradio
  ---

+ # 📚 Narrative Summarizer
+
+ Summarize long `.txt` narrative files into compressed, LLM-optimized summaries using BART. Choose the `Bread` prompt style, the `Butter` style, or both for custom compression behavior. Upload a `.txt` file, select your preferences, and receive a clean, compressed summary in seconds.
+
+ ---
+
+ ## 📚 Narrative Summarizer — Hugging Face Space
+
+ **`psyrishi/narrative-summarizer`**
+
+ A user-friendly summarization tool for `.txt` files, powered by Hugging Face Transformers and built with Gradio.
+
+ This app transforms long-form narratives into compressed, LLM-friendly summaries using the **"Bread"** prompt style, the **"Butter"** style, or a **combination of both**. It supports checkpointing to avoid data loss on interruptions and ensures large text files are processed reliably.
+
+ ---
+
+ ### ✨ Features
+
+ * ✅ Supports `.txt` file uploads up to ~3 MB (larger files may also work)
+ * 📌 Prompt options: `Bread`, `Butter`, or `Bread and Butter`
+ * 🔁 Multi-iteration summarization support
+ * 🧠 Model: `facebook/bart-large-cnn`
+ * 💾 Auto checkpointing: progress won't be lost on timeout
+ * 🧰 Output is saved to disk for download and post-processing
+ * 🌐 Clean Gradio UI – easy to run in the browser
+
+ ---
+
+ ### 📥 How to Use
+
+ 1. **Upload** a `.txt` file (max ~3 MB recommended)
+ 2. **Select** a summarization style from the dropdown:
+
+    * `Bread only`
+    * `Butter only`
+    * `Bread and Butter`
+ 3. Choose:
+
+    * `Iterations`: how many times the prompts are applied
+    * `Max Length`: maximum summary tokens per chunk
+    * `Min Length`: minimum summary tokens per chunk
+ 4. Click **Summarize**
+ 5. Get your **condensed output** in the results box
+
+ ---
+
+ ### ⚙️ Tech Stack
+
+ | Component         | Details                          |
+ | ----------------- | -------------------------------- |
+ | **Frontend**      | [Gradio](https://www.gradio.app) |
+ | **Backend**       | Hugging Face `transformers`      |
+ | **Model**         | `facebook/bart-large-cnn`        |
+ | **Checkpointing** | JSON-based resume system         |
+ | **Language**      | Python 3.10+                     |
+
+ ---
+
+ ### 📂 Folder Structure
+
+ ```
+ .
+ ├── app.py           # Gradio frontend app
+ ├── summarizer.py    # Backend summarization logic
+ ├── requirements.txt # Dependencies
+ ├── inputs/          # Uploaded input files
+ ├── outputs/         # Final summarized outputs
+ └── checkpoints/     # Intermediate checkpointing
+ ```
+
+ ---
+
+ ### 🛠️ Setup (Local)
+
+ Clone this repo and run it locally:
+
+ ```bash
+ git clone https://huggingface.co/spaces/psyrishi/narrative-summarizer
+ cd narrative-summarizer
+
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ---
+
+ ## 🚀 Space Configuration
+
+ Here’s how to fill out the **Hugging Face Space creation form**:
+
+ | Field                 | Value                                       |
+ | --------------------- | ------------------------------------------- |
+ | **Owner**             | `psyrishi`                                  |
+ | **Space Name**        | `narrative-summarizer`                      |
+ | **Short Description** | Summarizer for `.txt` files                 |
+ | **License**           | Choose: `MIT`, `Apache 2.0`, or `Other`     |
+ | **Space SDK**         | ✅ Gradio                                    |
+ | **Gradio Template**   | Start from Scratch or Blank                 |
+ | **Hardware**          | ✅ Free (sufficient for this use case)       |
+ | **Visibility**        | Choose: `Public` (recommended) or `Private` |
+ | **Dev Mode**          | (Optional) Available to PRO subscribers     |
+
+ ---
+
+ ### 🧪 Prompt Styles Explained
+
+ * 🥖 **Bread**: Focuses on compression for efficient LLM parsing
+ * 🧈 **Butter**: Enhances nuance and detail while summarizing
+ * 🥪 **Bread + Butter**: Applies both sequentially for balance
+
+ ---
+
+ ### 📌 Example Input
+
+ ```txt
+ Once upon a time, in a quiet village nestled between two mountains...
+ ```
+
+ ### 📤 Example Output (Bread only)
+
+ ```txt
+ A peaceful mountain village faces hidden turmoil, gradually unveiling conflicts beneath its quiet surface.
+ ```
+
144
+ ---
145
+
146
+ ### πŸ” License
147
+
148
+ Recommend using:
149
+
150
+ ```
151
+ MIT License
152
+
153
+ Copyright (c) 2025 psyrishi
154
+ Permission is hereby granted, free of charge, to any person obtaining a copy...
155
+ ```
156
+
157
+ Or [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0).
158
+
159
+ ---
160
+
161
+ ### πŸ‘‹ Feedback & Contributions
162
+
163
+ Feel free to fork the repo, create pull requests, or open issues if you'd like to contribute or improve the tool.
164
+
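The README's How to Use section drives everything through the Gradio UI; the same workflow can also be scripted against the `Summarizer` backend added below in `summarizer.py`. A minimal sketch (not one of the committed files; the paths are placeholders):

```python
from summarizer import Summarizer

summarizer = Summarizer()  # defaults: facebook/bart-large-cnn, 1000-character chunks

# Placeholder paths: summarize_file reads the input, writes the summary to
# output_path, and checkpoints per-chunk progress under checkpoints/.
summary = summarizer.summarize_file(
    input_path="inputs/story.txt",
    output_path="outputs/story_summary.txt",
    prompt_types=["Bread", "Butter"],   # the README's "Bread and Butter" style
    iterations=1,
    max_length=150,
    min_length=50,
)
print(summary[:300])
```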
app.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import shutil
+
+ import gradio as gr
+
+ from summarizer import Summarizer
+
+ summarizer = Summarizer()
+
+ PROMPT_CHOICES = {
+     "Bread only": ["Bread"],
+     "Butter only": ["Butter"],
+     "Bread and Butter": ["Bread", "Butter"]
+ }
+
+
+ def summarize_file(file, prompt_type, iterations, max_length, min_length):
+     if not file:
+         return "No file uploaded."
+
+     # Depending on the Gradio version, gr.File passes either a filepath string
+     # or a temp-file object exposing .name; handle both.
+     src_path = file if isinstance(file, str) else file.name
+     filename = os.path.basename(src_path)
+
+     # Copy the upload into inputs/ so the backend works from a stable path.
+     os.makedirs("inputs", exist_ok=True)
+     input_path = os.path.join("inputs", filename)
+     shutil.copyfile(src_path, input_path)
+
+     os.makedirs("outputs", exist_ok=True)
+     output_path = os.path.join("outputs", f"{os.path.splitext(filename)[0]}_summary.txt")
+
+     def progress_callback(done, total, eta):
+         # Progress is logged to the Space console; the UI shows only the final summary.
+         print(f"Progress: {done}/{total} | ETA: {int(eta)} sec")
+
+     try:
+         summary = summarizer.summarize_file(
+             input_path=input_path,
+             output_path=output_path,
+             prompt_types=PROMPT_CHOICES[prompt_type],
+             iterations=int(iterations),      # sliders may deliver floats
+             max_length=int(max_length),
+             min_length=int(min_length),
+             progress_callback=progress_callback
+         )
+         return summary
+     except Exception as e:
+         return f"Error occurred during summarization: {e}"
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## 📚 Narrative Compression Tool")
+
+     with gr.Row():
+         file_input = gr.File(label="Upload .txt File", file_types=[".txt"])
+         prompt_type = gr.Dropdown(
+             choices=list(PROMPT_CHOICES.keys()),
+             label="Select Prompt",
+             value="Bread only"
+         )
+
+     iterations = gr.Slider(1, 5, value=1, step=1, label="Iterations")
+     max_length = gr.Slider(50, 300, value=150, step=10, label="Max Summary Length")
+     min_length = gr.Slider(20, 100, value=50, step=10, label="Min Summary Length")
+
+     submit = gr.Button("Summarize")
+
+     output = gr.Textbox(label="Condensed Summary", lines=15)
+
+     submit.click(
+         summarize_file,
+         inputs=[file_input, prompt_type, iterations, max_length, min_length],
+         outputs=output
+     )
+
+ demo.launch()
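The `progress_callback` in `app.py` only prints to the Space logs. A hypothetical variant (not part of this commit) could surface chunk-level progress in the UI through Gradio's `gr.Progress` tracker when registered as an event handler:

```python
import gradio as gr

from summarizer import Summarizer

summarizer = Summarizer()

# Hypothetical variant: report chunk-level progress through gr.Progress so the
# user sees a progress bar instead of console-only logging.
def summarize_with_progress(input_path, output_path, progress=gr.Progress()):
    def on_chunk(done, total, eta):
        progress(done / total, desc=f"Chunk {done}/{total}, ~{int(eta)}s remaining")

    return summarizer.summarize_file(
        input_path=input_path,
        output_path=output_path,
        prompt_types=["Bread"],
        progress_callback=on_chunk,
    )
```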
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ torch
+ gradio
+ hf-xet
summarizer.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ import json
+ import time
+
+ import torch
+ from transformers import pipeline
+
+
+ class Summarizer:
+     def __init__(self, model_name="facebook/bart-large-cnn", chunk_size=1000, batch_size=4):
+         self.model_name = model_name
+         self.chunk_size = chunk_size          # characters per chunk
+         self.batch_size = batch_size
+         self.device = 0 if torch.cuda.is_available() else -1
+         self.summarizer = pipeline("summarization", model=model_name, device=self.device)
+
+         os.makedirs("checkpoints", exist_ok=True)
+
+     def _chunk_text(self, text):
+         # Fixed-size character slices; chunks may end mid-sentence.
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size)]
+
+     def _apply_prompt(self, chunk, prompt_type):
+         # Prepend the selected prompt style to the chunk before summarization.
+         if prompt_type == "Bread":
+             return f"Transform the provided fictional narrative into a maximally compressed yet losslessly decompressible format optimized for LLM reconstruction. {chunk}"
+         elif prompt_type == "Butter":
+             return f"Solid foundation, but let's refine the granularity. Your 4-subpoint structure creates artificial symmetry where organic complexity should flourish. {chunk}"
+         else:
+             return chunk
+
+     def summarize_file(self, input_path, output_path, prompt_types, iterations=1,
+                        max_length=150, min_length=50, progress_callback=None):
+         with open(input_path, 'r', encoding='utf-8') as f:
+             text = f.read()
+
+         chunks = self._chunk_text(text)
+         total_chunks = len(chunks)
+         processed_chunks = 0
+         summaries = []
+         start_time = time.time()
+
+         # Checkpoint recovery: resume from the last processed chunk if a
+         # checkpoint for this input file already exists.
+         checkpoint_path = os.path.join("checkpoints", os.path.basename(input_path) + ".json")
+         if os.path.exists(checkpoint_path):
+             with open(checkpoint_path, 'r', encoding='utf-8') as cp:
+                 checkpoint_data = json.load(cp)
+             summaries = checkpoint_data.get("summaries", [])
+             processed_chunks = checkpoint_data.get("processed_chunks", 0)
+         resumed_from = processed_chunks  # ETA should count only chunks done in this run
+
+         for i in range(processed_chunks, total_chunks):
+             chunk = chunks[i]
+             # Apply the selected prompt prefixes; each iteration prepends them again.
+             for _ in range(iterations):
+                 for p in prompt_types:
+                     chunk = self._apply_prompt(chunk, p)
+
+             summary = self.summarizer(
+                 chunk,
+                 max_length=max_length,
+                 min_length=min_length,
+                 do_sample=False,
+                 truncation=True  # avoid errors when a chunk exceeds the model's input limit
+             )[0]['summary_text']
+             summaries.append(summary)
+             processed_chunks += 1
+
+             # Save a checkpoint after every chunk so an interruption loses little work.
+             with open(checkpoint_path, 'w', encoding='utf-8') as cp:
+                 json.dump({
+                     "processed_chunks": processed_chunks,
+                     "summaries": summaries
+                 }, cp)
+
+             if progress_callback:
+                 elapsed = time.time() - start_time
+                 done_this_run = processed_chunks - resumed_from
+                 eta = (elapsed / done_this_run) * (total_chunks - processed_chunks)
+                 progress_callback(processed_chunks, total_chunks, eta)
+
+         # Save the final result and drop the checkpoint so a rerun starts fresh.
+         final_summary = "\n".join(summaries)
+         with open(output_path, 'w', encoding='utf-8') as f:
+             f.write(final_summary)
+         if os.path.exists(checkpoint_path):
+             os.remove(checkpoint_path)
+
+         return final_summary
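`_chunk_text` slices the input every `chunk_size` characters, so chunks can end mid-sentence. A sentence-aware chunker with roughly the same character budget is one possible refinement (a sketch, not part of this commit):

```python
import re

# Sketch of a sentence-aware alternative to Summarizer._chunk_text: greedily
# pack whole sentences into chunks of roughly chunk_size characters.
def chunk_by_sentence(text, chunk_size=1000):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > chunk_size:
            chunks.append(current)
            current = sentence
        else:
            # A single very long sentence still becomes its own (oversized) chunk.
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```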