CUDA Out of Memory
Hi,
I have 2 x 24 GB Tesla P40 GPUs and I get CUDA out of memory while only one of the GPUs is fully utilized, so it appears that device_map="auto" isn't taking effect. Any thoughts?
Here's the code I'm using (pulled from the examples; the only change is attn_implementation="eager", since this GPU model doesn't support flash attention):
import sys
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
model_path = "nanonets/Nanonets-OCR-s"
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager"
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
def ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=4096):
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    image = Image.open(image_path)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]
image_path = sys.argv[1]
result = ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=15000)
print(result)
With the above code the following error is generated:
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.36 GiB. GPU 0 has a total capacity of 23.87 GiB of which 3.25 GiB is free. Including non-PyTorch memory, this process has 20.61 GiB memory in use. Of the allocated memory 20.31 GiB is allocated by PyTorch, and 132.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
What are your image dimensions? Maybe the model is creating too many tokens from your image. Try resizing the image to a fixed size like 2048x2048 and see if it works. You should be able to fit this on a single 24 GB GPU.
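For reference, a minimal sketch of capping the longest side before handing the page to the processor (the 2048-pixel limit just mirrors the suggestion above; the helper name is illustrative):

from PIL import Image

def load_resized(image_path, max_side=2048):
    # thumbnail() keeps the aspect ratio and only ever downscales,
    # so smaller pages pass through unchanged.
    image = Image.open(image_path)
    image.thumbnail((max_side, max_side), Image.LANCZOS)
    return image

Fewer image pixels means fewer vision tokens, which should directly reduce the activation memory generate() needs.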
Running the command from the docext example uses 40 GB of VRAM.
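On the two-GPU setup, one thing worth checking is whether device_map="auto" is actually sharding the model, and whether GPU 0 is being filled first. A rough sketch: pass explicit max_memory caps and print the resulting placement (the per-GPU limits here are illustrative, not tuned values):

from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained(
    "nanonets/Nanonets-OCR-s",
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager",
    # Leave headroom on each 24 GB P40 for activations during generate().
    max_memory={0: "18GiB", 1: "18GiB"},
)

# Shows which module landed on which device. If everything says 0, the
# weights fit on a single card, and the OOM is more likely coming from
# activations at generation time than from the model itself.
print(model.hf_device_map)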
I had this problem as well, but after some fine-tuning and resizing the image it worked out. I have a single GeForce RTX 5090.
import os
from pdf2image import convert_from_bytes
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
import torch
from pathlib import Path
import gc
print("Loading Nanonets-OCR-s model...")
model_path = "nanonets/Nanonets-OCR-s"
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",  # Use fp16 for speed!
    device_map="auto",
    attn_implementation="sdpa"
)
model.eval()
# Try to compile for extra speed (if available)
if hasattr(torch, 'compile'):
    print("Compiling model for faster inference...")
    model = torch.compile(model, mode="reduce-overhead")
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
def process_three_pages(images, model, processor, max_new_tokens=12288):  # Increased tokens for 3 pages
    """Process 3 images in single message"""
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    # Build content with all images
    content = []
    for image in images:
        content.append({"type": "image"})
    content.append({"type": "text", "text": prompt})
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": content},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Process all images together
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        inputs = processor(text=[text], images=images, return_tensors="pt")
        inputs = inputs.to(model.device)
        # Generate for all images
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # Decode result
    generated_ids = output_ids[0][len(inputs.input_ids[0]):]
    output_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text
def process_pdf_fast(pdf_path):
    """Process PDF three pages at a time"""
    print(f"\nProcessing: {pdf_path}")
    # Read PDF
    with open(pdf_path, 'rb') as f:
        pdf_data = f.read()
    # Convert to images with lower DPI for speed (still good quality)
    print("Converting PDF to images...")
    images = convert_from_bytes(pdf_data, dpi=200, thread_count=8)  # Use more threads
    print(f"Found {len(images)} pages")
    # Output file
    output_path = pdf_path.with_suffix('.txt')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"OCR Output for: {pdf_path}\n")
        f.write(f"Total Pages: {len(images)}\n")
        f.write("="*80 + "\n")
        f.flush()
        # Process 3 pages at a time
        for i in range(0, len(images), 3):
            batch_images = images[i:min(i+3, len(images))]
            page_range = f"{i+1}-{min(i+3, len(images))}"
            print(f"Processing pages {page_range}...")
            # Process batch of up to 3 pages
            result = process_three_pages(batch_images, model, processor)
            # Write result
            f.write(f"\n\n{'='*40} PAGES {page_range} {'='*40}\n\n")
            f.write(result)
            f.write("\n")
            f.flush()
            print(f"  ✓ Pages {page_range} done!")
            # Clear memory periodically
            if (i // 3) % 2 == 0:
                torch.cuda.empty_cache()
                gc.collect()
    print(f"✓ Completed: {output_path}")
# Find all PDFs
documents_folder = "downloaded-documents"
pdf_files = list(Path(documents_folder).rglob("*.pdf"))
if not pdf_files:
    print(f"No PDFs found in {documents_folder}")
else:
    print(f"Found {len(pdf_files)} PDFs")
    # Process each PDF
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}]")
        process_pdf_fast(pdf_path)
print("\nDONE!")