CUDA Out of Memory
Hi,
I have 2 x 24 GB Tesla P40 GPUs and I get CUDA out of memory while only one of the GPUs is fully utilized, so it appears that device_map="auto" isn't taking effect. Any thoughts?
Here's the code I'm using (pulled from the examples; the only change is attn_implementation="eager", since this GPU model doesn't support flash attention):
import sys
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
model_path = "nanonets/Nanonets-OCR-s"
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager"
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
def ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=4096):
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    image = Image.open(image_path)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]
image_path = sys.argv[1]
result = ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=15000)
print(result)
With the above code the following error is generated:
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.36 GiB. GPU 0 has a total capacity of 23.87 GiB of which 3.25 GiB is free. Including non-PyTorch memory, this process has 20.61 GiB memory in use. Of the allocated memory 20.31 GiB is allocated by PyTorch, and 132.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
What are your image dimensions? Maybe the model is creating too many tokens from your image. Try resizing the image to a fixed size like 2048x2048 and see if it works. You should be able to fit this on a single 24 GB GPU.
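For reference, a minimal sketch of capping the longest side before handing the page to the processor (the 2048-pixel limit just mirrors the suggestion above; the helper name is illustrative):

from PIL import Image

def load_resized(image_path, max_side=2048):
    # thumbnail() keeps the aspect ratio and only ever downscales,
    # so smaller pages pass through unchanged.
    image = Image.open(image_path)
    image.thumbnail((max_side, max_side), Image.LANCZOS)
    return image

Fewer image pixels means fewer vision tokens, which should directly reduce the activation memory generate() needs.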
Running the command from the docext example uses 40 GB of VRAM.
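On the two-GPU setup, one thing worth checking is whether device_map="auto" is actually sharding the model, and whether GPU 0 is being filled first. A rough sketch: pass explicit max_memory caps and print the resulting placement (the per-GPU limits here are illustrative, not tuned values):

from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained(
    "nanonets/Nanonets-OCR-s",
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager",
    # Leave headroom on each 24 GB P40 for activations during generate().
    max_memory={0: "18GiB", 1: "18GiB"},
)

# Shows which module landed on which device. If everything says 0, the
# weights fit on a single card, and the OOM is more likely coming from
# activations at generation time than from the model itself.
print(model.hf_device_map)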
I had this problem as well, but after some fine-tuning and resizing the image it worked out. I have a single GeForce RTX 5090.
import os
from pdf2image import convert_from_bytes
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
import torch
from pathlib import Path
import gc
print("Loading Nanonets-OCR-s model...")
model_path = "nanonets/Nanonets-OCR-s"
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",  # Use fp16 for speed!
    device_map="auto",
    attn_implementation="sdpa"
)
model.eval()
# Try to compile for extra speed (if available)
if hasattr(torch, 'compile'):
    print("Compiling model for faster inference...")
    model = torch.compile(model, mode="reduce-overhead")
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
def process_three_pages(images, model, processor, max_new_tokens=12288):  # Increased tokens for 3 pages
    """Process 3 images in single message"""
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    # Build content with all images
    content = []
    for image in images:
        content.append({"type": "image"})
    content.append({"type": "text", "text": prompt})
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": content},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Process all images together
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        inputs = processor(text=[text], images=images, return_tensors="pt")
        inputs = inputs.to(model.device)
        # Generate for all images
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # Decode result
    generated_ids = output_ids[0][len(inputs.input_ids[0]):]
    output_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text
def process_pdf_fast(pdf_path):
    """Process PDF three pages at a time"""
    print(f"\nProcessing: {pdf_path}")
    # Read PDF
    with open(pdf_path, 'rb') as f:
        pdf_data = f.read()
    # Convert to images with lower DPI for speed (still good quality)
    print("Converting PDF to images...")
    images = convert_from_bytes(pdf_data, dpi=200, thread_count=8)  # Use more threads
    print(f"Found {len(images)} pages")
    # Output file
    output_path = pdf_path.with_suffix('.txt')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"OCR Output for: {pdf_path}\n")
        f.write(f"Total Pages: {len(images)}\n")
        f.write("="*80 + "\n")
        f.flush()
        # Process 3 pages at a time
        for i in range(0, len(images), 3):
            batch_images = images[i:min(i+3, len(images))]
            page_range = f"{i+1}-{min(i+3, len(images))}"
            print(f"Processing pages {page_range}...")
            # Process batch of up to 3 pages
            result = process_three_pages(batch_images, model, processor)
            # Write result
            f.write(f"\n\n{'='*40} PAGES {page_range} {'='*40}\n\n")
            f.write(result)
            f.write("\n")
            f.flush()
            print(f"  ✓ Pages {page_range} done!")
            # Clear memory periodically
            if (i // 3) % 2 == 0:
                torch.cuda.empty_cache()
                gc.collect()
    print(f"✓ Completed: {output_path}")
# Find all PDFs
documents_folder = "downloaded-documents"
pdf_files = list(Path(documents_folder).rglob("*.pdf"))
if not pdf_files:
    print(f"No PDFs found in {documents_folder}")
else:
    print(f"Found {len(pdf_files)} PDFs")
    # Process each PDF
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}]")
        process_pdf_fast(pdf_path)
print("\nDONE!")