psyrishi committed on
Commit 3efb860 · 1 parent: b630781

🚀 Initial commit: narrative summarizer with BART model

Files changed (7)
  1. .gitattributes +2 -35
  2. .gitignore +34 -0
  3. LICENSE +21 -0
  4. README.md +156 -6
  5. app.py +67 -0
  6. requirements.txt +4 -0
  7. summarizer.py +75 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.py text eol=lf
+ *.txt text eol=lf
.gitignore ADDED
@@ -0,0 +1,34 @@
+ # Ignore PyCharm / IntelliJ project files
+ .idea/
+ *.iml
+
+ # Byte-compiled / cache
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+ *.so
+ *.log
+
+ # System files
+ .DS_Store
+ Thumbs.db
+
+ # Checkpointing / runtime folders (auto-created at runtime)
+ inputs/
+ outputs/
+ checkpoints/
+
+ # Hugging Face / Gradio cache (optional)
+ .gradio/
+ hf_cache/
+
+ # Environment files
+ .env
+ *.env
+ .venv/
+ venv/
+
+ # OS-specific
+ ehthumbs.db
+ Icon?
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 psyrishi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ IN THE SOFTWARE.
README.md CHANGED
@@ -1,14 +1,164 @@
  ---
  title: Narrative Summarizer
- emoji: ⚡
- colorFrom: blue
- colorTo: purple
  sdk: gradio
- sdk_version: 5.45.0
  app_file: app.py
  pinned: false
  license: mit
- short_description: Summarizer for .txt files using BART model and custom prompt
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
  title: Narrative Summarizer
+ emoji: 📚
+ colorFrom: indigo
+ colorTo: blue
  sdk: gradio
+ sdk_version: 4.15.0
  app_file: app.py
  pinned: false
  license: mit
+ tags:
+ - summarization
+ - text
+ - transformer
+ - bart
+ - compression
+ - gradio
  ---

+ # 📚 Narrative Summarizer
+
+ Summarize long `.txt` narrative files into compressed, LLM-optimized summaries using BART. Choose the `Bread` prompt style, the `Butter` style, or both for custom compression behavior. Upload a `.txt` file, select your preferences, and receive a clean, compressed summary in seconds.
+
+ ---
+
+ ## 📚 Narrative Summarizer — Hugging Face Space
+
+ **`psyrishi/narrative-summarizer`**
+
+ A user-friendly summarization tool for `.txt` files, powered by Hugging Face Transformers and built with Gradio.
+
+ This app transforms long-form narratives into compressed, LLM-friendly summaries using the **"Bread"** prompt style, the **"Butter"** style, or a **combination of both**. It supports checkpointing to avoid data loss on interruptions and ensures large text files are processed reliably.
+
+ ---
+
+ ### ✨ Features
+
+ * ✅ Supports `.txt` file uploads up to ~3 MB (larger files may also work)
+ * 📌 Prompt options: `Bread`, `Butter`, or `Bread and Butter`
+ * 🔁 Multi-iteration summarization support
+ * 🧠 Model: `facebook/bart-large-cnn`
+ * 💾 Auto checkpointing: progress won't be lost on timeout
+ * 🧰 Output is saved to disk for download and post-processing
+ * 🌐 Clean Gradio UI – easy to run in the browser
+
+ ---
+
+ ### 📥 How to Use
+
+ 1. **Upload** a `.txt` file (max ~3 MB recommended)
+ 2. **Select** a summarization style from the dropdown:
+
+    * `Bread only`
+    * `Butter only`
+    * `Bread and Butter`
+ 3. Choose:
+
+    * `Iterations`: how many times the prompts are applied
+    * `Max Length`: maximum summary tokens per chunk
+    * `Min Length`: minimum summary tokens per chunk
+ 4. Click **Summarize**
+ 5. Get your **condensed output** in the results box
+
+ ---
+
+ ### ⚙️ Tech Stack
+
+ | Component         | Details                          |
+ | ----------------- | -------------------------------- |
+ | **Frontend**      | [Gradio](https://www.gradio.app) |
+ | **Backend**       | Hugging Face `transformers`      |
+ | **Model**         | `facebook/bart-large-cnn`        |
+ | **Checkpointing** | JSON-based resume system         |
+ | **Language**      | Python 3.10+                     |
+
+ ---
+
+ ### 📂 Folder Structure
+
+ ```
+ .
+ ├── app.py           # Gradio frontend app
+ ├── summarizer.py    # Backend summarization logic
+ ├── requirements.txt # Dependencies
+ ├── inputs/          # Uploaded input files
+ ├── outputs/         # Final summarized outputs
+ └── checkpoints/     # Intermediate checkpointing
+ ```
+
+ ---
+
+ ### 🛠️ Setup (Local)
+
+ Clone this repo and run it locally:
+
+ ```bash
+ git clone https://huggingface.co/spaces/psyrishi/narrative-summarizer
+ cd narrative-summarizer
+
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ---
+
+ ## 🚀 Space Configuration
+
+ Here’s how to fill out the **Hugging Face Space creation form**:
+
+ | Field                 | Value                                       |
+ | --------------------- | ------------------------------------------- |
+ | **Owner**             | `psyrishi`                                  |
+ | **Space Name**        | `narrative-summarizer`                      |
+ | **Short Description** | Summarizer for `.txt` files                 |
+ | **License**           | Choose: `MIT`, `Apache 2.0`, or `Other`     |
+ | **Space SDK**         | ✅ Gradio                                    |
+ | **Gradio Template**   | Start from Scratch or Blank                 |
+ | **Hardware**          | ✅ Free (sufficient for this use case)       |
+ | **Visibility**        | Choose: `Public` (recommended) or `Private` |
+ | **Dev Mode**          | (Optional) Available to PRO subscribers     |
+
+ ---
+
+ ### 🧪 Prompt Styles Explained
+
+ * 🥖 **Bread**: Focuses on compression for efficient LLM parsing
+ * 🧈 **Butter**: Enhances nuance and detail while summarizing
+ * 🥪 **Bread + Butter**: Applies both sequentially for balance
+
+ ---
+
+ ### 📌 Example Input
+
+ ```txt
+ Once upon a time, in a quiet village nestled between two mountains...
+ ```
+
+ ### 📤 Example Output (Bread only)
+
+ ```txt
+ A peaceful mountain village faces hidden turmoil, gradually unveiling conflicts beneath its quiet surface.
+ ```
+
144
+ ---
145
+
146
+ ### πŸ” License
147
+
148
+ Recommend using:
149
+
150
+ ```
151
+ MIT License
152
+
153
+ Copyright (c) 2025 psyrishi
154
+ Permission is hereby granted, free of charge, to any person obtaining a copy...
155
+ ```
156
+
157
+ Or [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0).
158
+
159
+ ---
160
+
161
+ ### πŸ‘‹ Feedback & Contributions
162
+
163
+ Feel free to fork the repo, create pull requests, or open issues if you'd like to contribute or improve the tool.
164
+
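The README's How to Use section drives everything through the Gradio UI; the same workflow can also be scripted against the `Summarizer` backend added below in `summarizer.py`. A minimal sketch (not one of the committed files; the paths are placeholders):

```python
from summarizer import Summarizer

summarizer = Summarizer()  # defaults: facebook/bart-large-cnn, 1000-character chunks

# Placeholder paths: summarize_file reads the input, writes the summary to
# output_path, and checkpoints per-chunk progress under checkpoints/.
summary = summarizer.summarize_file(
    input_path="inputs/story.txt",
    output_path="outputs/story_summary.txt",
    prompt_types=["Bread", "Butter"],   # the README's "Bread and Butter" style
    iterations=1,
    max_length=150,
    min_length=50,
)
print(summary[:300])
```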
app.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import shutil
+
+ import gradio as gr
+
+ from summarizer import Summarizer
+
+ summarizer = Summarizer()
+
+ PROMPT_CHOICES = {
+     "Bread only": ["Bread"],
+     "Butter only": ["Butter"],
+     "Bread and Butter": ["Bread", "Butter"]
+ }
+
+
+ def summarize_file(file, prompt_type, iterations, max_length, min_length):
+     if not file:
+         return "No file uploaded."
+
+     # Depending on the Gradio version, gr.File passes either a filepath string
+     # or a temp-file object exposing .name; handle both.
+     src_path = file if isinstance(file, str) else file.name
+     filename = os.path.basename(src_path)
+
+     # Copy the upload into inputs/ so the backend works from a stable path.
+     os.makedirs("inputs", exist_ok=True)
+     input_path = os.path.join("inputs", filename)
+     shutil.copyfile(src_path, input_path)
+
+     os.makedirs("outputs", exist_ok=True)
+     output_path = os.path.join("outputs", f"{os.path.splitext(filename)[0]}_summary.txt")
+
+     def progress_callback(done, total, eta):
+         # Progress is logged to the Space console; the UI shows only the final summary.
+         print(f"Progress: {done}/{total} | ETA: {int(eta)} sec")
+
+     try:
+         summary = summarizer.summarize_file(
+             input_path=input_path,
+             output_path=output_path,
+             prompt_types=PROMPT_CHOICES[prompt_type],
+             iterations=int(iterations),      # sliders may deliver floats
+             max_length=int(max_length),
+             min_length=int(min_length),
+             progress_callback=progress_callback
+         )
+         return summary
+     except Exception as e:
+         return f"Error occurred during summarization: {e}"
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## 📚 Narrative Compression Tool")
+
+     with gr.Row():
+         file_input = gr.File(label="Upload .txt File", file_types=[".txt"])
+         prompt_type = gr.Dropdown(
+             choices=list(PROMPT_CHOICES.keys()),
+             label="Select Prompt",
+             value="Bread only"
+         )
+
+     iterations = gr.Slider(1, 5, value=1, step=1, label="Iterations")
+     max_length = gr.Slider(50, 300, value=150, step=10, label="Max Summary Length")
+     min_length = gr.Slider(20, 100, value=50, step=10, label="Min Summary Length")
+
+     submit = gr.Button("Summarize")
+
+     output = gr.Textbox(label="Condensed Summary", lines=15)
+
+     submit.click(
+         summarize_file,
+         inputs=[file_input, prompt_type, iterations, max_length, min_length],
+         outputs=output
+     )
+
+ demo.launch()
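The `progress_callback` in `app.py` only prints to the Space logs. A hypothetical variant (not part of this commit) could surface chunk-level progress in the UI through Gradio's `gr.Progress` tracker when registered as an event handler:

```python
import gradio as gr

from summarizer import Summarizer

summarizer = Summarizer()

# Hypothetical variant: report chunk-level progress through gr.Progress so the
# user sees a progress bar instead of console-only logging.
def summarize_with_progress(input_path, output_path, progress=gr.Progress()):
    def on_chunk(done, total, eta):
        progress(done / total, desc=f"Chunk {done}/{total}, ~{int(eta)}s remaining")

    return summarizer.summarize_file(
        input_path=input_path,
        output_path=output_path,
        prompt_types=["Bread"],
        progress_callback=on_chunk,
    )
```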
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ torch
+ gradio
+ hf-xet
summarizer.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ import json
+ import time
+
+ import torch
+ from transformers import pipeline
+
+
+ class Summarizer:
+     def __init__(self, model_name="facebook/bart-large-cnn", chunk_size=1000, batch_size=4):
+         self.model_name = model_name
+         self.chunk_size = chunk_size          # characters per chunk
+         self.batch_size = batch_size
+         self.device = 0 if torch.cuda.is_available() else -1
+         self.summarizer = pipeline("summarization", model=model_name, device=self.device)
+
+         os.makedirs("checkpoints", exist_ok=True)
+
+     def _chunk_text(self, text):
+         # Fixed-size character slices; chunks may end mid-sentence.
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size)]
+
+     def _apply_prompt(self, chunk, prompt_type):
+         # Prepend the selected prompt style to the chunk before summarization.
+         if prompt_type == "Bread":
+             return f"Transform the provided fictional narrative into a maximally compressed yet losslessly decompressible format optimized for LLM reconstruction. {chunk}"
+         elif prompt_type == "Butter":
+             return f"Solid foundation, but let's refine the granularity. Your 4-subpoint structure creates artificial symmetry where organic complexity should flourish. {chunk}"
+         else:
+             return chunk
+
+     def summarize_file(self, input_path, output_path, prompt_types, iterations=1,
+                        max_length=150, min_length=50, progress_callback=None):
+         with open(input_path, 'r', encoding='utf-8') as f:
+             text = f.read()
+
+         chunks = self._chunk_text(text)
+         total_chunks = len(chunks)
+         processed_chunks = 0
+         summaries = []
+         start_time = time.time()
+
+         # Checkpoint recovery: resume from the last processed chunk if a
+         # checkpoint for this input file already exists.
+         checkpoint_path = os.path.join("checkpoints", os.path.basename(input_path) + ".json")
+         if os.path.exists(checkpoint_path):
+             with open(checkpoint_path, 'r', encoding='utf-8') as cp:
+                 checkpoint_data = json.load(cp)
+             summaries = checkpoint_data.get("summaries", [])
+             processed_chunks = checkpoint_data.get("processed_chunks", 0)
+         resumed_from = processed_chunks  # ETA should count only chunks done in this run
+
+         for i in range(processed_chunks, total_chunks):
+             chunk = chunks[i]
+             # Apply the selected prompt prefixes; each iteration prepends them again.
+             for _ in range(iterations):
+                 for p in prompt_types:
+                     chunk = self._apply_prompt(chunk, p)
+
+             summary = self.summarizer(
+                 chunk,
+                 max_length=max_length,
+                 min_length=min_length,
+                 do_sample=False,
+                 truncation=True  # avoid errors when a chunk exceeds the model's input limit
+             )[0]['summary_text']
+             summaries.append(summary)
+             processed_chunks += 1
+
+             # Save a checkpoint after every chunk so an interruption loses little work.
+             with open(checkpoint_path, 'w', encoding='utf-8') as cp:
+                 json.dump({
+                     "processed_chunks": processed_chunks,
+                     "summaries": summaries
+                 }, cp)
+
+             if progress_callback:
+                 elapsed = time.time() - start_time
+                 done_this_run = processed_chunks - resumed_from
+                 eta = (elapsed / done_this_run) * (total_chunks - processed_chunks)
+                 progress_callback(processed_chunks, total_chunks, eta)
+
+         # Save the final result and drop the checkpoint so a rerun starts fresh.
+         final_summary = "\n".join(summaries)
+         with open(output_path, 'w', encoding='utf-8') as f:
+             f.write(final_summary)
+         if os.path.exists(checkpoint_path):
+             os.remove(checkpoint_path)
+
+         return final_summary
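`_chunk_text` slices the input every `chunk_size` characters, so chunks can end mid-sentence. A sentence-aware chunker with roughly the same character budget is one possible refinement (a sketch, not part of this commit):

```python
import re

# Sketch of a sentence-aware alternative to Summarizer._chunk_text: greedily
# pack whole sentences into chunks of roughly chunk_size characters.
def chunk_by_sentence(text, chunk_size=1000):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > chunk_size:
            chunks.append(current)
            current = sentence
        else:
            # A single very long sentence still becomes its own (oversized) chunk.
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```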