Upload folder using huggingface_hub

Files changed:
- .gitattributes +2 -0
- app/agents/__pycache__/presentation_generator.cpython-310.pyc +0 -0
- app/services/__pycache__/blog_image_service.cpython-310.pyc +0 -0
- app/services/__pycache__/image_service.cpython-310.pyc +0 -0
- app/services/blog_image_service.py +1 -1
- app/services/image_service.py +6 -4
- outputs/analysis_summary.md +85 -0
- outputs/blogs/blog_content.md +137 -0
- outputs/posters/Attention Is All You Need_poster.pdf +3 -0
- outputs/posters/Attention Is All You Need_poster_image.png +3 -0
- outputs/posters/poster_latex.tex +179 -0
- outputs/presentations/presentation_beamer.tex +536 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+outputs/posters/Attention[[:space:]]Is[[:space:]]All[[:space:]]You[[:space:]]Need_poster.pdf filter=lfs diff=lfs merge=lfs -text
+outputs/posters/Attention[[:space:]]Is[[:space:]]All[[:space:]]You[[:space:]]Need_poster_image.png filter=lfs diff=lfs merge=lfs -text
app/agents/__pycache__/presentation_generator.cpython-310.pyc
CHANGED
Binary files a/app/agents/__pycache__/presentation_generator.cpython-310.pyc and b/app/agents/__pycache__/presentation_generator.cpython-310.pyc differ
app/services/__pycache__/blog_image_service.cpython-310.pyc
CHANGED
Binary files a/app/services/__pycache__/blog_image_service.cpython-310.pyc and b/app/services/__pycache__/blog_image_service.cpython-310.pyc differ
app/services/__pycache__/image_service.cpython-310.pyc
CHANGED
Binary files a/app/services/__pycache__/image_service.cpython-310.pyc and b/app/services/__pycache__/image_service.cpython-310.pyc differ
app/services/blog_image_service.py
CHANGED
@@ -15,7 +15,7 @@ class BlogImageService:
     """Service for generating and managing images for blog posts"""
 
     def __init__(self):
-        self.deepinfra_model = "black-forest-labs/FLUX-1-
+        self.deepinfra_model = "black-forest-labs/FLUX-1.1-pro"
         self.output_dir = Path("outputs/images/blog")
         self.output_dir.mkdir(parents=True, exist_ok=True)
         self.upload_api_key = (
app/services/image_service.py
CHANGED
@@ -13,7 +13,9 @@ from app.models.schemas import PaperAnalysis
 
 class ImageGenerationService:
     def __init__(self):
-        self.client = AsyncOpenAI(
+        self.client = AsyncOpenAI(
+            api_key=settings.IMAGE_GEN_API_KEY, base_url=settings.IMAGE_GEN_BASE_URL
+        )
         self.output_dir = Path("outputs/images")
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -59,7 +61,7 @@ class ImageGenerationService:
     - Facebook: Engaging, accessible, community-friendly
     - Instagram: Vibrant, aesthetic, visual-first
 
-    Generate a detailed
+    Generate a detailed image generation prompt for stable diffusion (max 800 characters) that will create an engaging image for this research:
     """
 
     from app.services.llm_service import LLMService
@@ -82,8 +84,8 @@ class ImageGenerationService:
 
         # Clean and truncate the prompt if needed
         image_prompt = image_prompt.strip().replace("\n", " ")
-        if len(image_prompt) >
-        image_prompt = image_prompt[:
+        if len(image_prompt) > 800:
+            image_prompt = image_prompt[:797] + "..."
 
         return image_prompt
 
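For readers skimming the diff, here is a minimal sketch of the pattern these two hunks implement: an OpenAI-compatible client pointed at a custom endpoint via `base_url`, plus the 800-character prompt guard. This is an illustration only, not the repository's actual module; the `IMAGE_GEN_API_KEY` / `IMAGE_GEN_BASE_URL` names and the 800-character budget come from the diff above, everything else is assumed.

```python
# Illustrative sketch only -- not the repository's actual ImageGenerationService.
from pathlib import Path

from openai import AsyncOpenAI  # OpenAI-compatible async client

MAX_PROMPT_CHARS = 800  # budget referenced by the updated prompt and the truncation guard


class ImageClientSketch:
    def __init__(self, api_key: str, base_url: str) -> None:
        # Same idea as the diff: point the OpenAI client at a custom
        # image-generation endpoint via base_url.
        self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
        self.output_dir = Path("outputs/images")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def clean_prompt(image_prompt: str) -> str:
        # Collapse newlines, then truncate to the 800-character budget,
        # keeping three characters for the trailing "...".
        image_prompt = image_prompt.strip().replace("\n", " ")
        if len(image_prompt) > MAX_PROMPT_CHARS:
            image_prompt = image_prompt[: MAX_PROMPT_CHARS - 3] + "..."
        return image_prompt
```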
outputs/analysis_summary.md
ADDED
@@ -0,0 +1,85 @@
# Paper Analysis Summary

## Title
Attention Is All You Need

## Authors
Ashish Vaswani, Llion Jones, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin

## Abstract
The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.

## Methodology
The Transformer is an encoder-decoder architecture. The encoder and decoder are composed of stacks of N=6 identical layers. Each layer incorporates residual connections and layer normalization.

**Encoder Layer:** Consists of two sub-layers: a Multi-Head Self-Attention mechanism and a position-wise fully connected feed-forward network.

**Decoder Layer:** Consists of three sub-layers: a Masked Multi-Head Self-Attention mechanism (to prevent attending to future positions), an Encoder-Decoder Multi-Head Attention mechanism (where queries come from the previous decoder layer and keys/values from the encoder output), and a position-wise fully connected feed-forward network.

**Attention Mechanisms:**
* **Scaled Dot-Product Attention:** Computes attention as $\text{softmax}(\frac{QK^T}{\sqrt{d_k}})V$. The scaling factor $\frac{1}{\sqrt{d_k}}$ is crucial for stability with large $d_k$.
* **Multi-Head Attention:** Instead of a single attention function, it linearly projects queries, keys, and values $h$ times with different learned projections, performs attention in parallel for each 'head', concatenates the results, and projects them again. This allows attending to different representation subspaces. The paper used $h=8$ heads with $d_k=d_v=d_{\text{model}}/h=64$.
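To make the two mechanisms above concrete, here is a small NumPy sketch (an illustration only, not code from the paper or from this repository); the $h=8$ heads and $d_{\text{model}}=512$ follow the dimensions quoted above, and the random inputs are purely for demonstration:

```python
# Minimal NumPy sketch of scaled dot-product and multi-head attention (illustrative only).
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    # Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, batched over heads
    d_k = Q.shape[-1]
    scores = Q @ K.transpose(0, 2, 1) / np.sqrt(d_k)
    return softmax(scores) @ V

def multi_head_attention(X, W_q, W_k, W_v, W_o, h=8):
    # Project once, split into h heads, attend per head, concatenate, project back.
    n, d_model = X.shape
    d_k = d_model // h
    Q = (X @ W_q).reshape(n, h, d_k).transpose(1, 0, 2)   # (h, n, d_k)
    K = (X @ W_k).reshape(n, h, d_k).transpose(1, 0, 2)
    V = (X @ W_v).reshape(n, h, d_k).transpose(1, 0, 2)
    heads = scaled_dot_product_attention(Q, K, V)          # (h, n, d_k)
    concat = heads.transpose(1, 0, 2).reshape(n, d_model)  # (n, h * d_k)
    return concat @ W_o

rng = np.random.default_rng(0)
n, d_model = 10, 512                      # 10 tokens, d_model = 512
X = rng.normal(size=(n, d_model))
W = [0.02 * rng.normal(size=(d_model, d_model)) for _ in range(4)]
out = multi_head_attention(X, *W)
print(out.shape)                          # (10, 512)
```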

**Position-wise Feed-Forward Networks:** A simple two-linear-transformation network with a ReLU activation, applied identically and independently to each position. Input/output dimension $d_{\text{model}}=512$, inner layer $d_{ff}=2048$.

**Embeddings and Positional Encoding:** Learned embeddings convert tokens to vectors. To account for sequence order (since there's no recurrence/convolution), fixed sinusoidal positional encodings are added to the input embeddings. The same weight matrix is shared between embedding layers and the pre-softmax linear transformation.

**Training Details:**
* **Data:** WMT 2014 English-German (4.5M sentence pairs) and English-French (36M sentence pairs), using byte-pair or word-piece encodings.
* **Hardware:** Trained on 8 NVIDIA P100 GPUs.
* **Schedule:** Base models trained for 100,000 steps (12 hours); big models for 300,000 steps (3.5 days).
* **Optimizer:** Adam with a custom learning rate schedule involving a warmup phase (4000 steps) followed by a decay.
* **Regularization:** Residual Dropout (0.1 for base, 0.3 for big EN-FR) and Label Smoothing (0.1).
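For reference, the warmup-then-decay schedule mentioned above is, in the paper's notation:

$$\text{lrate} = d_{\text{model}}^{-0.5} \cdot \min\left(\text{step}^{-0.5},\ \text{step} \cdot \text{warmup}^{-1.5}\right)$$

where warmup = 4000 steps in the reported configuration.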

## Key Findings
- Introduction of the Transformer, a novel neural network architecture based *solely* on attention mechanisms, completely dispensing with recurrence and convolutions.
- Achieved new state-of-the-art BLEU scores on WMT 2014 English-to-German (28.4 BLEU) and WMT 2014 English-to-French (41.8 BLEU), significantly outperforming previous models and ensembles.
- Demonstrated superior parallelizability and significantly reduced training time compared to recurrent or convolutional models.
- The Transformer generalizes well to other tasks, successfully applied to English constituency parsing.
- Self-attention layers require only a constant number of sequential operations (O(1)), facilitating parallelization and learning long-range dependencies more effectively than recurrent layers (O(n)).
- Multi-Head Attention allows the model to jointly attend to information from different representation subspaces, enhancing its ability to capture diverse dependencies.
- Fixed sinusoidal positional encodings are effective for injecting sequence order information and allow extrapolation to sequence lengths longer than those seen during training.

## Results
The Transformer achieved significant performance improvements across multiple tasks:

**Machine Translation (WMT 2014 English-to-German):**
* **Transformer (big):** 28.4 BLEU, setting a new state-of-the-art, outperforming previous best models (including ensembles) by over 2.0 BLEU.
* **Transformer (base):** 27.3 BLEU, surpassing all previously published models and ensembles at a fraction of the training cost.

**Machine Translation (WMT 2014 English-to-French):**
* **Transformer (big):** 41.8 BLEU, establishing a new single-model state-of-the-art, achieved with significantly lower training costs (3.5 days on 8 GPUs) compared to previous top models.

**Model Variations (Ablation Studies on English-to-German dev set):**
* Multi-head attention is crucial for performance; single-head attention resulted in a 0.9 BLEU drop.
* Reducing the attention key size ($d_k$) negatively impacted model quality.
* Larger models generally yielded better results.
* Dropout was found to be highly effective in preventing overfitting.
* Learned positional embeddings produced nearly identical results to the fixed sinusoidal positional encodings, validating the choice of sinusoidal functions.

**English Constituency Parsing:**
* **Transformer (4 layers, WSJ only):** Achieved 91.3 F1, outperforming the BerkeleyParser even with limited training data.
* **Transformer (4 layers, semi-supervised):** Achieved 92.7 F1, demonstrating strong generalization and competitive performance with state-of-the-art models despite minimal task-specific tuning.

## Conclusion
The paper successfully introduced the Transformer, the first sequence transduction model relying entirely on attention mechanisms, completely replacing traditional recurrent or convolutional layers. This novel architecture demonstrated significant advantages in training speed and achieved new state-of-the-art results on major machine translation benchmarks (WMT 2014 English-to-German and English-to-French). Furthermore, the Transformer's ability to generalize was showcased by its strong performance in English constituency parsing. The authors express excitement for the future of attention-based models and plan to extend their application to other modalities (e.g., images, audio, video) and explore local/restricted attention mechanisms for handling very large inputs.

## Complexity Level
Advanced

## Technical Terms
Sequence transduction, Recurrent Neural Networks (RNNs), Convolutional Neural Networks (CNNs), Encoder-Decoder architecture, Attention mechanism, Self-attention (Intra-attention), Transformer, Multi-Head Attention, Scaled Dot-Product Attention, Positional Encoding, Feed-Forward Networks, Residual connections, Layer Normalization, BLEU score, Beam search, Dropout, Label Smoothing, Adam optimizer, Constituency parsing, Byte-pair encoding (BPE), Word-piece vocabulary, Parallelization, Computational complexity, Long-range dependencies, Auto-regressive

## Figures and Tables
- The Transformer - model architecture. Illustrates the overall encoder-decoder structure with stacked self-attention and feed-forward layers.
- (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel. Visualizes the core attention mechanisms.
- An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Visualizes attention weights for the word 'making'.
- Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Visualizes attention weights for the word 'its'.
- Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. Shows two examples of attention heads focusing on different syntactic/semantic structures.
- Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types. Compares Self-Attention, Recurrent, and Convolutional layers, highlighting the O(1) sequential operations for Self-Attention.
- The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost. Provides a detailed comparison of BLEU scores and training costs (FLOPs) for Transformer vs. other models.
- Variations on the Transformer architecture. Shows ablation studies on hyperparameters like number of heads, $d_k$, $N$, $d_{\text{model}}$, $d_{ff}$, dropout, label smoothing, and positional encoding type, with their impact on perplexity and BLEU.
- The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ). Compares Transformer's F1 score on parsing with other models, both WSJ-only and semi-supervised.

---
*Generated by ScholarShare - AI Research Dissemination Platform*
outputs/blogs/blog_content.md
ADDED
@@ -0,0 +1,137 @@
# The Revolution Will Be Attended: How the Transformer Blew Up AI (and Made Your Chatbot Smarter)

Ever wondered how Google Translate instantly converts languages, or how AI models like ChatGPT seem to understand and generate human-like text with uncanny fluency? For years, the reigning champions in these tasks were complex neural networks known as Recurrent Neural Networks (RNNs) and Convolutional Neural Networks (CNNs). They were powerful, but they had their limitations.

Then, in 2017, a paper dropped that changed everything: **"Attention Is All You Need."**

This wasn't just another incremental improvement; it was a paradigm shift. Penned by a team of brilliant minds at Google Brain, this paper introduced the **Transformer** – an entirely new neural network architecture that threw out recurrence and convolutions in favor of a mechanism called "attention." The results? Mind-blowing. Faster training, better performance, and a blueprint for nearly every major breakthrough in Natural Language Processing (NLP) since.

If you've heard whispers of "Transformers" or "Attention" but felt lost in the technical jargon, you're in the right place. As a computer scientist specializing in machine learning, I'm here to demystify this groundbreaking work, making it accessible even if you're new to the field, while diving deep into its ingenious mechanics.

## The Problem with the Old Guard: Why We Needed a Revolution

Before the Transformer, NLP models faced significant hurdles:

1. **Sequential Processing (RNNs):** RNNs like LSTMs and GRUs process text word by word, step by step. Imagine reading a really long sentence out loud, one word at a time, and trying to remember the very first word by the time you get to the end. This sequential nature made it hard for RNNs to capture **long-range dependencies** (how words far apart in a sentence relate to each other). It also meant they were inherently **slow to train**, as you couldn't process parts of the sentence in parallel.
2. **Limited Context (CNNs):** While CNNs are great for images, their application in NLP often involved looking at fixed-size "windows" of words. This limited their ability to understand global context across an entire sentence or document.
3. **The "Bottleneck" Problem:** Even the best RNN/CNN models often used an "encoder-decoder" structure where the entire input sentence had to be compressed into a single fixed-size vector. This vector then had to contain *all* the information needed to generate the output, often leading to information loss, especially for long sentences.

**Attention** mechanisms were already a partial solution to the bottleneck. They allowed the decoder to "look back" at relevant parts of the input sequence while generating each output word, rather than relying solely on the single bottleneck vector. But the "Attention Is All You Need" paper took this concept to its logical extreme: **What if attention wasn't just an add-on, but the *entire* foundation?**

## The Big Idea: Attention Is All You Need (Seriously!)

The core innovation of the Transformer is its audacious claim: you don't need recurrence or convolutions to process sequences. **Attention, and specifically a mechanism called "Self-Attention," is enough.**

Imagine you're reading a sentence: "The animal didn't cross the street because it was too tired."
As a human, you instantly know "it" refers to "the animal," not "the street." How? You implicitly "attend" to "animal" when you read "it." This is the essence of self-attention.

The Transformer uses this principle to allow every word in a sentence to "look at" and "weigh" the importance of every other word in the same sentence, identifying crucial relationships regardless of how far apart they are.

## Deconstructing the Transformer: The Building Blocks

The Transformer is an **encoder-decoder model**, just like many previous sequence-to-sequence models. The **encoder** processes the input sequence (e.g., English sentence) and transforms it into a rich, contextualized representation. The **decoder** then takes this representation and generates the output sequence (e.g., French translation).

Both the encoder and decoder are stacks of identical layers. Let's break down the key components:

### 1. The Star of the Show: Scaled Dot-Product Attention

At the heart of the Transformer lies the **Scaled Dot-Product Attention** mechanism. It's a remarkably simple yet powerful idea. For each word in a sequence, it calculates how much attention it should pay to every other word.

Think of it like this:
* **Query (Q):** What information am I looking for? (e.g., the current word's meaning)
* **Key (K):** What information do other words have? (e.g., other words' potential relevance)
* **Value (V):** What *is* that information? (e.g., the actual content of those other words)

The attention score is calculated by taking the dot product of the Query with all Keys. A high dot product means the Query and Key are very similar, indicating high relevance. These scores are then scaled and passed through a softmax function to get probabilities, which are then used to weight the Values.

The formula looks like this:

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

* $Q$, $K$, $V$ are matrices representing the Query, Key, and Value vectors for all words in the sequence.
* $d_k$ is the dimension of the Key vectors. Dividing by $\sqrt{d_k}$ is crucial! It prevents the dot products from becoming too large (especially with high dimensions), which could push the softmax into regions with extremely small gradients, hindering training. This scaling ensures stable training.
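If you'd rather see the scaling effect than take it on faith, here is a tiny NumPy experiment (illustrative only, with random vectors standing in for real queries and keys):

```python
# Quick numerical illustration (not from the paper): why divide by sqrt(d_k)?
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

rng = np.random.default_rng(0)
d_k = 64
q = rng.normal(size=d_k)             # one query
keys = rng.normal(size=(10, d_k))    # ten keys

scores = keys @ q                    # raw dot products; their spread grows like sqrt(d_k), here about 8

print(np.round(softmax(scores), 3))                 # unscaled: tends to dump nearly all weight on one key
print(np.round(softmax(scores / np.sqrt(d_k)), 3))  # scaled: a noticeably smoother distribution
```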

### 2. Multi-Head Attention: Multiple Perspectives are Better

While single attention is good, **Multi-Head Attention** is even better. Instead of performing attention once, it performs it multiple times in parallel, each time with different learned linear projections of the Queries, Keys, and Values.

* **Analogy:** Imagine you're a detective investigating a complex case. One detective might focus on alibis, another on motives, a third on forensics, and so on. Each detective (attention head) looks at the same evidence (input sequence) but extracts different kinds of relationships or information.
* **How it works:** The input $Q, K, V$ are projected $h$ times (e.g., 8 times) into different, lower-dimensional subspaces. Attention is then performed independently for each of these "heads." The outputs from all heads are then concatenated and linearly transformed back to the original dimension. This allows the model to jointly attend to information from different representation subspaces at different positions.

Multi-Head Attention is used in three ways within the Transformer:
* **Encoder Self-Attention:** Allows each word in the input sequence to attend to all other words in the *same* input sequence.
* **Decoder Masked Self-Attention:** Allows each word in the output sequence to attend to all *previous* words in the *same* output sequence. The "masked" part is vital: it prevents the decoder from "cheating" by looking at future words it hasn't generated yet, maintaining the auto-regressive property. (A short code sketch of this masking follows the list.)
* **Encoder-Decoder Attention:** Allows each word in the decoder to attend to *all* words in the final encoded representation of the input sequence. This helps the decoder focus on relevant parts of the source sentence when generating the target word.
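Here is what the "masked" part looks like in practice, as a minimal NumPy sketch (illustrative, not the paper's implementation): positions a word is not allowed to see are pushed to negative infinity before the softmax, so their attention weights come out exactly zero.

```python
# Minimal sketch of the decoder's causal ("look-back only") mask -- illustrative only.
import numpy as np

def causal_mask(n):
    # True above the diagonal = "future" positions that must stay hidden
    return np.triu(np.ones((n, n), dtype=bool), k=1)

def masked_softmax(scores):
    scores = np.where(causal_mask(scores.shape[-1]), -np.inf, scores)
    e = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

print(np.round(masked_softmax(np.zeros((4, 4))), 2))
# Each row i spreads its weight only over positions 0..i:
# [[1.   0.   0.   0.  ]
#  [0.5  0.5  0.   0.  ]
#  [0.33 0.33 0.33 0.  ]
#  [0.25 0.25 0.25 0.25]]
```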

### 3. Positional Encoding: Giving Words a Sense of Place

Since the Transformer completely abandons recurrence and convolutions, it has no inherent sense of word order. If you just shuffled the words, the model would produce the same output! This is where **Positional Encoding** comes in.

* **Analogy:** Imagine each word is given a unique "address" or "timestamp" based on its position in the sentence.
* **How it works:** Fixed sinusoidal (sine and cosine wave) functions are used to generate unique positional encodings for each position in the sequence. These encodings are then simply *added* to the word embeddings (the numerical representations of words). This way, the model learns to implicitly understand the relative and absolute positions of words, crucial for tasks like machine translation where word order matters.
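A compact sketch of those sine/cosine "addresses" (illustrative only; the 10000 constant and the even/odd split follow the paper):

```python
# Sketch of the fixed sinusoidal positional encodings described above.
import numpy as np

def positional_encoding(max_len, d_model):
    pos = np.arange(max_len)[:, None]              # (max_len, 1)
    i = np.arange(0, d_model, 2)[None, :]          # even dimension indices (2i)
    angles = pos / np.power(10000.0, i / d_model)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)                   # even dims: sine
    pe[:, 1::2] = np.cos(angles)                   # odd dims:  cosine
    return pe

pe = positional_encoding(max_len=50, d_model=512)
# Added (not concatenated) to the token embeddings:
#   x = token_embedding + pe[:seq_len]
print(pe.shape)   # (50, 512)
```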

### 4. The Encoder and Decoder Stacks

**The Encoder:**
A stack of 6 identical layers. Each layer has two sub-layers:
1. A **Multi-Head Self-Attention** mechanism.
2. A **Position-wise Feed-Forward Network**. This is a simple two-layer neural network applied independently and identically to each position (word) in the sequence. It's like a small "thinking module" for each word's representation.

Crucially, **residual connections** (adding the input of a sub-layer to its output) and **layer normalization** are applied around each sub-layer. Residual connections help with training very deep networks, preventing vanishing gradients, while layer normalization stabilizes training by normalizing the activations within each layer.
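In code, the "Add & Norm" wrapper and the little feed-forward "thinking module" are only a few lines each. A rough NumPy sketch (illustrative only; layer normalization's learned gain and bias are omitted for brevity):

```python
# Rough sketch of "Add & Norm" plus the position-wise feed-forward block.
import numpy as np

def layer_norm(x, eps=1e-6):
    mean = x.mean(axis=-1, keepdims=True)
    std = x.std(axis=-1, keepdims=True)
    return (x - mean) / (std + eps)

def add_and_norm(x, sublayer):
    # Residual connection followed by layer normalization: LayerNorm(x + Sublayer(x))
    return layer_norm(x + sublayer(x))

def feed_forward(x, W1, b1, W2, b2):
    # Two linear maps with a ReLU in between, applied to each position independently
    return np.maximum(0, x @ W1 + b1) @ W2 + b2

rng = np.random.default_rng(0)
x = rng.normal(size=(10, 512))                                  # 10 positions, d_model = 512
W1, b1 = 0.02 * rng.normal(size=(512, 2048)), np.zeros(2048)    # inner layer d_ff = 2048
W2, b2 = 0.02 * rng.normal(size=(2048, 512)), np.zeros(512)

y = add_and_norm(x, lambda t: feed_forward(t, W1, b1, W2, b2))
print(y.shape)  # (10, 512)
```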

**The Decoder:**
Also a stack of 6 identical layers. Each layer has three sub-layers:
1. A **Masked Multi-Head Self-Attention** mechanism (to prevent looking ahead).
2. A **Multi-Head Encoder-Decoder Attention** mechanism (to attend to the encoder's output).
3. A **Position-wise Feed-Forward Network**.

Again, residual connections and layer normalization are used throughout.

## The Results: A New Era for NLP

The impact of the Transformer was immediate and profound:

1. **State-of-the-Art Performance:**
    * On the WMT 2014 English-to-German translation task, the 'big' Transformer model achieved a BLEU score of **28.4**, outperforming previous best models (including ensembles) by over 2 BLEU points.
    * For WMT 2014 English-to-French, it set a new single-model state-of-the-art with **41.8 BLEU**.

2. **Unprecedented Training Efficiency:**
    * The Transformer's complete reliance on attention, which is highly parallelizable (you can calculate attention for all words simultaneously), meant it could be trained **significantly faster** than RNNs. The English-to-French model trained in just 3.5 days on 8 GPUs, a fraction of the time required by previous SOTA models. This speed-up was a game-changer, enabling researchers to experiment more rapidly.

3. **Remarkable Generalization:**
    * Beyond machine translation, the Transformer showed its versatility by achieving competitive results on **English constituency parsing**, even with limited training data. This demonstrated its ability to learn complex linguistic structures across different tasks.

4. **Component Validation (Ablation Studies):**
    * The paper also systematically tested the importance of each component. For instance, using only single-head attention led to a significant drop in quality, confirming the value of Multi-Head Attention. They also found that dropout was critical for preventing overfitting and that the fixed sinusoidal positional encodings worked just as well as learned ones.

## Practical Implications: Beyond Translation

The "Attention Is All You Need" paper wasn't just about machine translation; it laid the foundation for the entire modern landscape of large language models (LLMs). Almost every major advancement in NLP since 2017 – from BERT to GPT-3, GPT-4, and beyond – is built upon the Transformer architecture (or variations of it).

Its ability to:
* Process long sequences efficiently.
* Capture complex, long-range dependencies.
* Scale to massive datasets and model sizes.

... made it the perfect backbone for pre-training models on vast amounts of text data, leading to the powerful generative AI we see today. The Transformer's influence has even extended beyond text, inspiring architectures for image and audio processing.

## Conclusion: A Simple Idea, A Profound Impact

The Transformer stands as a testament to the power of simplicity and the elegance of a well-placed idea. By stripping away the complexities of recurrence and convolutions and focusing solely on the intuitive concept of "attention," its authors unlocked a new era of performance and efficiency in AI.

This paper didn't just introduce a new model; it provided a fundamental building block that continues to drive innovation at an incredible pace. So, the next time you marvel at an AI's ability to understand, translate, or generate text, remember the Transformer – the revolutionary architecture that proved, indeed, **attention is all you need.**

---
**Suggested Tags for DEV.to:**
`machinelearning` `deeplearning` `nlp` `ai` `transformer` `attention` `researchpaper` `artificialintelligence` `neuralnetworks` `datascience`

---

**Tags:** machinelearning, research, technical, science, advanced, academic, ai
**Reading Time:** 9 minutes
**Meta Description:** Discover how Introduction of the Transformer, a novel neural network architecture based *solely* on attention mec... Latest research insights explained in simple terms.

---
*Generated by ScholarShare - AI Research Dissemination Platform*
outputs/posters/Attention Is All You Need_poster.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:22aeb988ef78331f7bd8a11ff741551b10b29ab6f8bf46902a17fed07ceeb892
size 101853
outputs/posters/Attention Is All You Need_poster_image.png
ADDED
Git LFS Details (binary image)
outputs/posters/poster_latex.tex
ADDED
@@ -0,0 +1,179 @@
\documentclass[25pt, a0paper, landscape, margin=0mm, innermargin=15mm, blockverticalspace=15mm, colspace=15mm, subcolspace=8mm]{tikzposter}

\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{graphicx}

% Title, Author, Institute
\title{Attention Is All You Need}
\author{Ashish Vaswani, Llion Jones, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin}
\institute{Google Brain, Google Research, University of Toronto}

% Choose Layout
\usetheme{Default}
\usecolorstyle{Germany}

\begin{document}

% Title block
\maketitle

\begin{columns}

% FIRST column - Abstract and Key Findings
\column{0.3}

\block{Abstract}{
The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature.
}

\block{Key Findings}{
\begin{itemize}
    \item \textbf{Novel Architecture:} Introduction of the Transformer, a neural network architecture based solely on attention mechanisms, completely dispensing with recurrence and convolutions.

    \item \textbf{State-of-the-art Results:} Achieved SOTA on WMT 2014 English-to-German (28.4 BLEU) and English-to-French (41.8 BLEU) machine translation tasks.

    \item \textbf{Efficiency:} Demonstrated significantly faster training times and higher parallelizability compared to recurrent or convolutional models.

    \item \textbf{Generalization:} Strong generalization capabilities by successfully applying the Transformer to English constituency parsing, achieving competitive results.

    \item \textbf{Multi-head Attention:} Multi-head attention is crucial for performance, allowing the model to jointly attend to information from different representation subspaces.
\end{itemize}
}

% SECOND column - Methodology
\column{0.4}

\block{Methodology: The Transformer Architecture}{
The Transformer is an encoder-decoder model where both components are composed of stacks of identical layers.

\innerblock{Encoder Stack}{
Consists of N=6 identical layers. Each layer has two sub-layers:
\begin{itemize}
    \item Multi-head self-attention mechanism
    \item Position-wise fully connected feed-forward network
    \item Residual connections and layer normalization around each sub-layer
\end{itemize}
}

\innerblock{Decoder Stack}{
Also N=6 identical layers with three sub-layers:
\begin{itemize}
    \item Masked multi-head self-attention (preserves auto-regressive property)
    \item Multi-head attention over encoder output
    \item Position-wise feed-forward network
\end{itemize}
}

\innerblock{Attention Mechanism}{
\textbf{Scaled Dot-Product Attention:}
$$\text{Attention}(Q,K,V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

\textbf{Multi-Head Attention:}
$$\text{MultiHead}(Q,K,V) = \text{Concat}(\text{head}_1, ..., \text{head}_h)W^O$$
where $\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)$

This allows attending to different representation subspaces simultaneously.
}

\innerblock{Key Components}{
\begin{itemize}
    \item \textbf{Position-wise FFN:} Two linear transformations with ReLU activation
    \item \textbf{Positional Encoding:} Sinusoidal encodings to inject sequence order information
    \item \textbf{Embeddings:} Learned embeddings with shared weights between input/output layers
\end{itemize}
}
}

\block{Training Regime}{
\begin{itemize}
    \item \textbf{Data:} WMT 2014 EN-DE (4.5M pairs, 37k BPE tokens), EN-FR (36M sentences, 32k word-piece tokens)
    \item \textbf{Hardware:} 8 NVIDIA P100 GPUs
    \item \textbf{Optimizer:} Adam ($\beta_1=0.9, \beta_2=0.98, \epsilon=10^{-9}$) with custom learning rate schedule
    \item \textbf{Regularization:} Residual Dropout ($P_{\text{drop}}=0.1$ base, $0.3$ big), Label Smoothing ($\epsilon_{ls}=0.1$)
    \item \textbf{Batching:} $\sim$25k source/target tokens per batch
\end{itemize}
}

% THIRD column - Results and Conclusion
\column{0.3}

\block{Results}{
\innerblock{Machine Translation Performance}{
\textbf{WMT 2014 English-to-German:}
\begin{itemize}
    \item Transformer (big): \textbf{28.4 BLEU}
    \item Outperformed all previous models by $>$2.0 BLEU
    \item Including ensemble methods
\end{itemize}

\textbf{WMT 2014 English-to-French:}
\begin{itemize}
    \item Transformer (big): \textbf{41.8 BLEU}
    \item New single-model state-of-the-art
    \item Achieved with $<$1/4 training cost of previous best
\end{itemize}
}

\innerblock{Training Efficiency}{
\begin{itemize}
    \item \textbf{Transformer (base) EN-DE:} 3.3e18 FLOPs
    \item \textbf{GNMT+RL:} 2.3e19 FLOPs
    \item \textbf{7x reduction} in training cost
    \item Significantly higher parallelizability
\end{itemize}
}

\innerblock{Generalization: English Constituency Parsing}{
\begin{itemize}
    \item 4-layer Transformer achieved:
    \item \textbf{91.3 F1} (WSJ only)
    \item \textbf{92.7 F1} (semi-supervised)
    \item Outperformed many previous models
    \item Minimal task-specific tuning required
\end{itemize}
}

\innerblock{Ablation Studies}{
\begin{itemize}
    \item Multi-head attention crucial (single-head: -0.9 BLEU)
    \item Reducing attention key size hurts quality
    \item Bigger models perform better
    \item Dropout very helpful for regularization
    \item Sinusoidal $\approx$ learned positional encodings
\end{itemize}
}
}

\block{Conclusion}{
The Transformer represents a paradigm shift in sequence modeling:

\begin{itemize}
    \item \textbf{First} sequence transduction model based entirely on attention
    \item \textbf{Eliminates} recurrent and convolutional layers completely
    \item \textbf{Achieves} new state-of-the-art results with faster training
    \item \textbf{Demonstrates} strong generalization across tasks
    \item \textbf{Opens} new research directions for attention-based models
\end{itemize}

\vspace{1em}
\textbf{Future Work:}
\begin{itemize}
    \item Extension to other modalities (images, audio)
    \item Investigation of restricted attention mechanisms
    \item Application to large inputs/outputs
    \item Less sequential generation methods
\end{itemize}

\vspace{1em}
\coloredbox[bgcolor=blocktitlebgcolor, fgcolor=white, roundedcorners=5]{
\textbf{Impact:} This work laid the foundation for modern transformer-based models including BERT, GPT, and countless applications in NLP and beyond.
}
}

\end{columns}

\end{document}
outputs/presentations/presentation_beamer.tex
ADDED
@@ -0,0 +1,536 @@
1 |
+
\documentclass{beamer}
|
2 |
+
|
3 |
+
% Theme and color scheme
|
4 |
+
\usetheme{Madrid}
|
5 |
+
\usecolortheme{default}
|
6 |
+
\definecolor{academicblue}{RGB}{25,25,112}
|
7 |
+
\definecolor{lightblue}{RGB}{173,216,230}
|
8 |
+
\definecolor{darkblue}{RGB}{0,51,102}
|
9 |
+
\setbeamercolor{structure}{fg=academicblue}
|
10 |
+
\setbeamercolor{frametitle}{bg=lightblue,fg=darkblue}
|
11 |
+
\setbeamercolor{title}{fg=academicblue}
|
12 |
+
|
13 |
+
% Packages
|
14 |
+
\usepackage[utf8]{inputenc}
|
15 |
+
\usepackage[T1]{fontenc}
|
16 |
+
\usepackage{tikz}
|
17 |
+
\usepackage{graphicx}
|
18 |
+
\usepackage{amsmath}
|
19 |
+
\usepackage{amssymb}
|
20 |
+
\usepackage{booktabs}
|
21 |
+
\usepackage{array}
|
22 |
+
\usepackage{multirow}
|
23 |
+
\usepackage{xcolor}
|
24 |
+
|
25 |
+
% TikZ libraries
|
26 |
+
\usetikzlibrary{shapes,arrows,positioning,calc,decorations.pathreplacing}
|
27 |
+
|
28 |
+
% Custom commands
|
29 |
+
\newcommand{\highlight}[1]{\textcolor{academicblue}{\textbf{#1}}}
|
30 |
+
|
31 |
+
% Title page information
|
32 |
+
\title[Attention Is All You Need]{Attention Is All You Need: A New Paradigm for Sequence Transduction}
|
33 |
+
\author{Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,\\Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin}
|
34 |
+
\institute{Presented by: [Your Name/Group]}
|
35 |
+
\date{[Date of Presentation]}
|
36 |
+
|
37 |
+
% --- Beamer-specific improvements ---
|
38 |
+
% 1. Remove navigation symbols (tiny and distracting)
|
39 |
+
\setbeamertemplate{navigation symbols}{}
|
40 |
+
|
41 |
+
% 2. Custom footline to increase font size for author, title, date, slide number
|
42 |
+
% The default Madrid theme footline is often too small for readability.
|
43 |
+
\setbeamertemplate{footline}{%
|
44 |
+
\begin{beamercolorbox}[wd=\paperwidth,ht=2.25ex,dp=1ex,leftskip=0.3cm,rightskip=0.3cm]{author in head/foot}%
|
45 |
+
\usebeamerfont{author in head/foot}\normalsize\insertshortauthor%
|
46 |
+
\hfill%
|
47 |
+
\usebeamerfont{title in head/foot}\normalsize\insertshorttitle%
|
48 |
+
\hfill%
|
49 |
+
\usebeamerfont{page number in head/foot}\normalsize\insertframenumber/\inserttotalframenumber%
|
50 |
+
\end{beamercolorbox}%
|
51 |
+
}
|
52 |
+
|
53 |
+
\begin{document}
|
54 |
+
|
55 |
+
% Title slide
|
56 |
+
\begin{frame}
|
57 |
+
\titlepage
|
58 |
+
\end{frame}
|
59 |
+
|
60 |
+
% Slide 2: Presentation Outline
|
61 |
+
\begin{frame}
|
62 |
+
\frametitle{Presentation Outline}
|
63 |
+
\begin{itemize}
|
64 |
+
\item The Challenge of Sequence Modeling
|
65 |
+
\item Limitations of Prior Approaches (RNNs/CNNs)
|
66 |
+
\item The Transformer: A Revolutionary Idea
|
67 |
+
\item \highlight{Methodology:}
|
68 |
+
\begin{itemize}
|
69 |
+
\item Architecture \& Attention Core
|
70 |
+
\item Layer Details \& Positional Information
|
71 |
+
\item Training Setup
|
72 |
+
\end{itemize}
|
73 |
+
\item \highlight{Results:}
|
74 |
+
\begin{itemize}
|
75 |
+
\item Machine Translation Performance
|
76 |
+
\item Training Efficiency \& Parallelizability
|
77 |
+
\item Generalization to Parsing
|
78 |
+
\end{itemize}
|
79 |
+
\item Key Findings \& Impact
|
80 |
+
\item Conclusion \& Future Directions
|
81 |
+
\item Q\&A
|
82 |
+
\end{itemize}
|
83 |
+
\end{frame}
|
84 |
+
|
85 |
+
% Slide 3: The Challenge of Sequence Modeling
|
86 |
+
\begin{frame}
|
87 |
+
\frametitle{The Challenge of Sequence Modeling}
|
88 |
+
\begin{itemize}
|
89 |
+
\item \highlight{Sequence Transduction Tasks:} Transforming an input sequence to an output sequence.
|
90 |
+
\begin{itemize}
|
91 |
+
\item Examples: Machine Translation, Text Summarization, Speech Recognition
|
92 |
+
\end{itemize}
|
93 |
+
\vspace{0.5cm}
|
94 |
+
\item \highlight{Core Difficulty:} Capturing dependencies between elements, often across long distances in the sequence.
|
95 |
+
\vspace{0.5cm}
|
96 |
+
\item \highlight{Ubiquitous in AI:} Critical for understanding and generating human language, and other sequential data.
|
97 |
+
\end{itemize}
|
98 |
+
\end{frame}
|
99 |
+
|
100 |
+
% Slide 4: Prior Approaches & Their Limitations
|
101 |
+
\begin{frame}
|
102 |
+
\frametitle{Prior Approaches \& Their Limitations}
|
103 |
+
\begin{itemize}
|
104 |
+
\item \highlight{Dominant Models:} Recurrent Neural Networks (RNNs, LSTMs, GRUs) \& Convolutional Neural Networks (CNNs) in encoder-decoder setups.
|
105 |
+
\vspace{0.3cm}
|
106 |
+
\item \highlight{RNN Limitations:}
|
107 |
+
\begin{itemize}
|
108 |
+
\item Sequential computation: Inhibits parallelization, slow training
|
109 |
+
\item Difficulty with long-range dependencies (vanishing/exploding gradients)
|
110 |
+
\end{itemize}
|
111 |
+
\vspace{0.3cm}
|
112 |
+
\item \highlight{CNN Limitations:}
|
113 |
+
\begin{itemize}
|
114 |
+
\item Fixed receptive field, requires many layers for long dependencies
|
115 |
+
\end{itemize}
|
116 |
+
\vspace{0.3cm}
|
117 |
+
\item \highlight{Attention Mechanisms:} Improved RNN/CNNs by allowing focus on relevant input parts, but recurrence/convolution remained the backbone.
|
118 |
+
\end{itemize}
|
119 |
+
\end{frame}
|
120 |
+
|
121 |
+
% Slide 5: The Transformer: A Revolutionary Idea
|
122 |
+
\begin{frame}
|
123 |
+
\frametitle{The Transformer: A Revolutionary Idea}
|
124 |
+
\begin{block}{The Core Proposition (Vaswani et al., 2017)}
|
125 |
+
"We propose a new simple network architecture, the Transformer, based \textit{solely} on attention mechanisms, dispensing with recurrence and convolutions entirely."
|
126 |
+
\end{block}
|
127 |
+
|
128 |
+
\vspace{0.5cm}
|
129 |
+
\begin{itemize}
|
130 |
+
\item \highlight{Motivation:}
|
131 |
+
\begin{itemize}
|
132 |
+
\item Overcome limitations of sequential processing
|
133 |
+
\item Achieve greater parallelization for faster training
|
134 |
+
\item Improve capture of long-range dependencies
|
135 |
+
\end{itemize}
|
136 |
+
\vspace{0.3cm}
|
137 |
+
\item \highlight{Hypothesis:} Attention mechanisms alone are powerful enough for state-of-the-art sequence transduction.
|
138 |
+
\end{itemize}
|
139 |
+
\end{frame}
|
140 |
+
|
141 |
+
% Slide 6: Methodology: Architecture & Attention Core
|
142 |
+
\begin{frame}
|
143 |
+
\frametitle{Methodology: Architecture \& Attention Core}
|
144 |
+
\begin{columns}
|
145 |
+
\begin{column}{0.5\textwidth}
|
146 |
+
\begin{itemize}
|
147 |
+
\item \highlight{Overall Architecture:} Encoder-Decoder Structure (N=6 identical layers each)
|
148 |
+
\begin{itemize}
|
149 |
+
\item \textbf{Encoder:} Processes input sequence
|
150 |
+
\item \textbf{Decoder:} Generates output sequence
|
151 |
+
\end{itemize}
|
152 |
+
\vspace{0.3cm}
|
153 |
+
\item \highlight{The Engine: Multi-Head Attention}
|
154 |
+
\begin{itemize}
|
155 |
+
\item \textbf{Scaled Dot-Product:}
|
156 |
+
\small{$\text{Attention}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V$}
|
157 |
+
\item \textbf{Multi-Head:} h=8 heads in parallel
|
158 |
+
\end{itemize}
|
159 |
+
\end{itemize}
|
160 |
+
\end{column}
|
161 |
+
\begin{column}{0.5\textwidth}
|
162 |
+
% High-level Transformer architecture
|
163 |
+
% Increased scale and font size for better visibility
|
164 |
+
\begin{tikzpicture}[scale=0.8] % Increased scale from 0.7 to 0.8
|
165 |
+
\node[rectangle, draw, fill=lightblue, minimum width=2.5cm, minimum height=1.2cm] (encoder) at (0,2) {\Large Encoder\\(N=6)}; % Increased font size and min dimensions
|
166 |
+
\node[rectangle, draw, fill=lightblue, minimum width=2.5cm, minimum height=1.2cm] (decoder) at (0,0) {\Large Decoder\\(N=6)}; % Increased font size and min dimensions
|
167 |
+
\node[above=0.3cm of encoder, font=\large] {Input}; % Increased font size
|
168 |
+
\node[below=0.3cm of decoder, font=\large] {Output}; % Increased font size
|
169 |
+
\draw[->, thick] (encoder) -- (decoder);
|
170 |
+
\draw[->, thick] (0,3.5) -- (encoder);
|
171 |
+
\draw[->, thick] (decoder) -- (0,-1.5);
|
172 |
+
\end{tikzpicture}
|
173 |
+
\end{column}
|
174 |
+
\end{columns}
|
175 |
+
\end{frame}
|
176 |
+
|
177 |
+
% Slide 7: Scaled Dot-Product and Multi-Head Attention
|
178 |
+
\begin{frame}
|
179 |
+
\frametitle{Attention Mechanisms}
|
180 |
+
\begin{columns}
|
181 |
+
\begin{column}{0.48\textwidth} % Adjusted column width slightly to give more space
|
182 |
+
\centering % Center the diagram in the column
|
183 |
+
\highlight{\Large Scaled Dot-Product Attention} % Increased heading font size
|
184 |
+
\vspace{0.5cm} % Add some vertical space
|
185 |
+
\begin{tikzpicture}[scale=1.0, % Increased scale from 0.8 to 1.0 for better readability
|
186 |
+
node distance=0.7cm, % Reduced node distance slightly to make it more compact
|
187 |
+
box/.style={rectangle, draw, minimum width=1.8cm, minimum height=0.8cm, font=\large}, % Custom style for boxes, increased min width/height and font size
|
188 |
+
qk/.style={rectangle, draw, minimum width=0.8cm, minimum height=0.8cm, font=\Large} % Style for Q, K, V
|
189 |
+
]
|
190 |
+
\node[qk] (q) at (0,3) {Q};
|
191 |
+
\node[qk] (k) at (1.5,3) {K}; % Adjusted position for K
|
192 |
+
\node[qk] (v) at (3,3) {V}; % Adjusted position for V
|
193 |
+
|
194 |
+
\node[box] (matmul1) at (0.75,2) {MatMul}; % Centered between Q and K
|
195 |
+
\node[box] (scale) at (0.75,1.2) {Scale}; % Below matmul1
|
196 |
+
\node[box] (softmax) at (0.75,0.4) {Softmax}; % Below scale
|
197 |
+
\node[box] (matmul2) at (1.8, -0.4) {MatMul}; % Adjusted position to align with V and softmax output
|
198 |
+
|
199 |
+
\node[below=0.2cm of matmul2, font=\Large] {Attention}; % Increased font size
|
200 |
+
|
201 |
+
\draw[->, thick] (q) -- (matmul1);
|
202 |
+
\draw[->, thick] (k) -- (matmul1);
|
203 |
+
\draw[->, thick] (matmul1) -- (scale);
|
204 |
+
\draw[->, thick] (scale) -- (softmax);
|
205 |
+
\draw[->, thick] (softmax) -- (matmul2.west |- softmax.south); % Connect softmax to matmul2
|
206 |
+
\draw[->, thick] (v) -- (matmul2);
|
207 |
+
\end{tikzpicture}
|
208 |
+
\end{column}
|
209 |
+
\begin{column}{0.48\textwidth} % Adjusted column width
|
210 |
+
\centering % Center the diagram in the column
|
211 |
+
\highlight{\Large Multi-Head Attention} % Increased heading font size
|
212 |
+
\vspace{0.5cm} % Add some vertical space
|
213 |
+
\begin{tikzpicture}[scale=1.0, % Increased scale from 0.7 to 1.0
|
214 |
+
node distance=0.8cm, % Increased node distance for better spacing
|
215 |
+
headbox/.style={rectangle, draw, fill=lightblue, minimum width=2.5cm, minimum height=0.8cm, font=\large}, % Increased min width to prevent text cut-off, increased font size
|
216 |
+
opbox/.style={rectangle, draw, minimum width=4cm, minimum height=0.8cm, font=\large} % Increased min width and font size
|
217 |
+
]
|
218 |
+
% Use a loop for heads, arrange vertically for better column usage
|
219 |
+
\foreach \i in {1,2,3} {
|
220 |
+
\node[headbox] (head\i) at (0, 3 - (\i-1)*1.2) {Head \i}; % Vertical arrangement for better use of space
|
221 |
+
}
|
222 |
+
\node at (0, -0.5) {\Large ...}; % Increased font size for ellipsis
|
223 |
+
\node[headbox] (headN) at (0, -1.5) {Head N}; % Added a generic Head N for completeness
|
224 |
+
|
225 |
+
\node[opbox] (concat) at (0,-2.8) {Concat}; % Adjusted position
|
226 |
+
\node[opbox] (linear) at (0,-3.8) {Linear}; % Adjusted position
|
227 |
+
|
228 |
+
% Draw arrows from heads to concat
|
229 |
+
\draw[->, thick] (head1) -- (concat);
|
230 |
+
\draw[->, thick] (head2) -- (concat);
|
231 |
+
\draw[->, thick] (head3) -- (concat);
|
232 |
+
\draw[->, thick] (headN) -- (concat); % Arrow from Head N
|
233 |
+
|
234 |
+
\draw[->, thick] (concat) -- (linear);
|
235 |
+
\end{tikzpicture}
|
236 |
+
\end{column}
|
237 |
+
\end{columns}
|
238 |
+
\end{frame}
|
239 |
+
|
240 |
+
% Slide 8: Layer Details & Positional Information
|
241 |
+
\begin{frame}
|
242 |
+
\frametitle{Layer Details \& Positional Information}
|
243 |
+
\begin{columns}
|
244 |
+
\begin{column}{0.6\textwidth}
|
245 |
+
\begin{itemize}
|
246 |
+
\item \highlight{Encoder Layer (2 sub-layers):}
|
247 |
+
\begin{enumerate}
|
248 |
+
\item Multi-Head Self-Attention
|
249 |
+
\item Position-wise FFN
|
250 |
+
\end{enumerate}
|
251 |
+
\vspace{0.3cm}
|
252 |
+
\item \highlight{Decoder Layer (3 sub-layers):}
|
253 |
+
\begin{enumerate}
|
254 |
+
\item Masked Multi-Head Self-Attention
|
255 |
+
\item Multi-Head Encoder-Decoder Attention
|
256 |
+
\item Position-wise FFN
|
257 |
+
\end{enumerate}
|
258 |
+
\vspace{0.3cm}
|
259 |
+
\item \highlight{Key Additions:}
|
260 |
+
\begin{itemize}
|
261 |
+
\item Residual Connections \& Layer Normalization
|
262 |
+
\item Positional Encoding (sine/cosine functions)
|
263 |
+
\end{itemize}
|
264 |
+
\end{itemize}
|
265 |
+
\end{column}
|
266 |
+
\begin{column}{0.4\textwidth}
|
267 |
+
% Encoder Layer diagram
|
268 |
+
% Increased scale and font size for better readability
|
269 |
+
\begin{tikzpicture}[scale=0.8, % Increased scale from 0.6 to 0.8
|
270 |
+
node distance=0.7cm, % Adjusted node distance
|
271 |
+
box/.style={rectangle, draw, minimum width=3cm, minimum height=0.8cm, font=\large} % Custom style for boxes, increased min width/height and font size
|
272 |
+
]
|
273 |
+
\node[box] (mha) at (0,3) {Multi-Head\\Attention};
|
274 |
+
\node[box] (add1) at (0,2.1) {Add \& Norm}; % Adjusted position
|
275 |
+
\node[box] (ffn) at (0,1.2) {Feed Forward}; % Adjusted position
|
276 |
+
\node[box] (add2) at (0,0.3) {Add \& Norm}; % Adjusted position
|
277 |
+
|
278 |
+
\draw[->, thick] (mha) -- (add1);
|
279 |
+
\draw[->, thick] (add1) -- (ffn);
|
280 |
+
\draw[->, thick] (ffn) -- (add2);
|
281 |
+
|
282 |
+
% Residual connections
|
283 |
+
\draw[->, thick, dashed] (mha.east) -- ++(0.5,0) |- (add1.east); % Connect from MHA to Add&Norm
|
284 |
+
\draw[->, thick, dashed] (ffn.east) -- ++(0.5,0) |- (add2.east); % Connect from FFN to Add&Norm
|
285 |
+
\end{tikzpicture}
|
286 |
+
\end{column}
|
287 |
+
\end{columns}
|
288 |
+
\end{frame}

% Slide 9: Positional Encoding
\begin{frame}
\frametitle{Positional Encoding}
\begin{columns}
\begin{column}{0.6\textwidth}
\begin{itemize}
\item \highlight{Problem:} Attention has no inherent notion of sequence order
\vspace{0.3cm}
\item \highlight{Solution:} Add positional encodings to input embeddings
\vspace{0.3cm}
\item \highlight{Sinusoidal Functions:}
\begin{align}
PE_{(pos,2i)} &= \sin(pos/10000^{2i/d_{model}})\\
PE_{(pos,2i+1)} &= \cos(pos/10000^{2i/d_{model}})
\end{align}
\vspace{0.3cm}
\item \highlight{Benefits:} Allows model to learn relative positions, generalizes to longer sequences
\end{itemize}
\end{column}
\begin{column}{0.4\textwidth}
% Positional encoding visualization
% Increased scale and font size for better readability
\begin{tikzpicture}[scale=1.0] % Increased scale from 0.8 to 1.0
\draw[->] (0,0) -- (3.5,0) node[right, font=\large] {Position}; % Increased font size
\draw[->] (0,0) -- (0,2.8) node[above, font=\large] {Value}; % Increased font size

% Sine waves with different frequencies
\draw[blue, thick] plot[domain=0:3.3, samples=100] (\x, {0.8 + 0.6*sin(180*\x)});
\draw[red, thick] plot[domain=0:3.3, samples=100] (\x, {1.5 + 0.4*sin(360*\x)});

\node[blue] at (3, 0.3) {\large Low freq}; % Increased font size
\node[red] at (3, 1.8) {\large High freq}; % Increased font size
\end{tikzpicture}
\end{column}
\end{columns}
\end{frame}
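
% Illustrative calculation for the formulas above (comment only; a toy example with
% d_model = 4 and pos = 1, not a figure from the paper). Angles are in radians:
%   PE(1,0) = sin(1/10000^{0})   = sin(1)    ~ 0.841
%   PE(1,1) = cos(1/10000^{0})   = cos(1)    ~ 0.540
%   PE(1,2) = sin(1/10000^{1/2}) = sin(0.01) ~ 0.010
%   PE(1,3) = cos(1/10000^{1/2}) = cos(0.01) ~ 1.000
% Early dimensions oscillate quickly with position, later dimensions slowly, which is the
% point the low/high frequency curves in the sketch are meant to convey.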

% Slide 10: Training Setup
\begin{frame}
\frametitle{Training Setup}
\begin{itemize}
\item \highlight{Datasets:}
\begin{itemize}
\item WMT 2014 English-German (En-De): $\sim$4.5 million sentence pairs
\item WMT 2014 English-French (En-Fr): $\sim$36 million sentence pairs
\end{itemize}
\vspace{0.4cm}
\item \highlight{Tokenization:} Byte-Pair Encoding (BPE) / WordPiece for vocabulary
\vspace{0.4cm}
\item \highlight{Hardware:} Trained on 8 NVIDIA P100 GPUs
\vspace{0.4cm}
\item \highlight{Optimizer:} Adam optimizer with custom learning rate schedule (linear warmup then inverse square root decay)
\vspace{0.4cm}
\item \highlight{Regularization:} Residual Dropout ($P_{drop}=0.1$), Label Smoothing ($\epsilon_{ls}=0.1$)
\end{itemize}
\end{frame}
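
% The "custom learning rate schedule" bullet above, written out for reference (comment
% only, not rendered). This is the schedule given in the paper, with warmup_steps = 4000:
% \[ lrate = d_{model}^{-0.5} \cdot \min\bigl(step\_num^{-0.5},\; step\_num \cdot warmup\_steps^{-1.5}\bigr) \]
% The rate increases linearly over the first warmup_steps updates, then decays in
% proportion to the inverse square root of the step number.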

% Slide 11: Machine Translation Performance
\begin{frame}
\frametitle{Results: Machine Translation Performance}
\begin{center}
% Increased scale for the tikzpicture containing the axis environment
\begin{tikzpicture}[scale=1.0] % Increased scale from 0.9 to 1.0
% Bar chart for BLEU scores
\begin{axis}[
ybar,
width=10cm,
height=6cm,
ylabel={\Large BLEU Score}, % Increased font size
symbolic x coords={En-De, En-Fr},
xtick=data,
ymin=20,
ymax=45,
legend style={at={(0.02,0.98)}, anchor=north west, font=\small}, % Adjusted font size for legend
bar width=15pt,
tick label style={font=\large}, % Increased font size for tick labels
label style={font=\Large}, % Increased font size for axis labels
]

\addplot[fill=gray!50] coordinates {(En-De,25.2) (En-Fr,38.1)};
\addplot[fill=blue!50] coordinates {(En-De,26.0) (En-Fr,39.0)};
\addplot[fill=green!50] coordinates {(En-De,27.3) (En-Fr,40.5)};
\addplot[fill=red!70] coordinates {(En-De,28.4) (En-Fr,41.8)};

\legend{Previous SOTA Ensemble, Previous SOTA Single, Transformer (base), Transformer (big)}

% Add value labels on bars
% Increased font size for value labels
\node at (axis cs:En-De,28.8) {\textbf{\Large 28.4}};
\node at (axis cs:En-Fr,42.3) {\textbf{\Large 41.8}};

\end{axis}
\end{tikzpicture}
\end{center}

\begin{itemize}
\item \highlight{Significance:} Outperformed all previous single models and ensembles
\item New state-of-the-art on both translation tasks
\end{itemize}
\end{frame}

% Slide 12: Training Efficiency & Parallelizability
\begin{frame}
\frametitle{Results: Training Efficiency \& Parallelizability}
\begin{center}
% Increased font size for table content
\begin{table}[h]
\centering
\begin{tabular}{lccc}
\toprule
\textbf{\large Model} & \textbf{\large Training Time} & \textbf{\large FLOPs} & \textbf{\large BLEU (En-De)} \\
\midrule
\large ByteNet & \large - & \large - & \large 23.75 \\
\large Deep-Att + PosUnk & \large - & \large - & \large 25.16 \\
\large GNMT + RL & \large - & \large $2.3 \times 10^{19}$ & \large 24.61 \\
\large ConvS2S & \large - & \large $9.6 \times 10^{18}$ & \large 25.16 \\
\midrule
\large Transformer (base) & \textbf{\large 12 hours} & \large $3.3 \times 10^{18}$ & \large 27.3 \\
\large Transformer (big) & \textbf{\large 3.5 days} & \large $2.3 \times 10^{19}$ & \textbf{\large 28.4} \\
\bottomrule
\end{tabular}
\end{table}
\end{center}

\vspace{0.5cm}
\begin{itemize}
\item \highlight{Superior Training Speed:} Transformer (big) on En-Fr trained in only \textbf{3.5 days} on 8 P100 GPUs
\item \highlight{Higher Parallelizability:} Attention computations are highly parallelizable
\item \highlight{Lower Computational Cost:} Significantly fewer FLOPs for similar or better quality
\end{itemize}
\end{frame}
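
% Where the headline training times come from (comment only, not rendered): per the paper,
% the base model trained for 100k steps at roughly 0.4 s/step (about 12 hours on 8 P100s)
% and the big model for 300k steps at roughly 1.0 s/step (about 3.5 days).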

% Slide 13: Generalization to Parsing
\begin{frame}
\frametitle{Results: Generalization to Parsing}
\begin{columns}
\begin{column}{0.6\textwidth}
\begin{itemize}
\item \highlight{Task:} English Constituency Parsing (predicting syntactic structure)
\vspace{0.3cm}
\item \highlight{Dataset:} WSJ (Penn Treebank), limited training data setting
\vspace{0.3cm}
\item \highlight{Model:} 4-layer Transformer, minimal task-specific tuning
\vspace{0.3cm}
\item \highlight{Performance (F1 Score):}
\begin{itemize}
\item WSJ only training: \textbf{91.3 F1}
\item Semi-supervised: \textbf{92.7 F1}
\end{itemize}
\vspace{0.3cm}
\item \highlight{Significance:} Strong generalization with minimal adaptation
\end{itemize}
\end{column}
\begin{column}{0.4\textwidth}
% Example parse tree
% Increased scale and font size for better readability
\begin{tikzpicture}[scale=0.9, % Increased scale from 0.7 to 0.9
node distance=0.7cm, % Adjusted node distance
every node/.style={font=\large} % Set default font size for all nodes
]
\node (s) at (2,3) {S};
\node (np1) at (1,2.2) {NP};
\node (vp) at (3,2.2) {VP};
\node (dt) at (0.5,1.4) {DT};
\node (nn) at (1.5,1.4) {NN};
\node (vbz) at (2.5,1.4) {VBZ};
\node (np2) at (3.5,1.4) {NP};
\node (the) at (0.5,0.6) {The};
\node (cat) at (1.5,0.6) {cat};
\node (sits) at (2.5,0.6) {sits};
\node (mat) at (3.5,0.6) {mat};

\draw[thick] (s) -- (np1);
\draw[thick] (s) -- (vp);
\draw[thick] (np1) -- (dt);
\draw[thick] (np1) -- (nn);
\draw[thick] (vp) -- (vbz);
\draw[thick] (vp) -- (np2);
\draw[thick] (dt) -- (the);
\draw[thick] (nn) -- (cat);
\draw[thick] (vbz) -- (sits);
\draw[thick] (np2) -- (mat);
\end{tikzpicture}
\end{column}
\end{columns}
\end{frame}

% Slide 14: Key Findings & Impact
\begin{frame}
\frametitle{Key Findings \& Impact}
\begin{itemize}
\item \highlight{Key Findings:}
\begin{enumerate}
\item \textbf{Attention is sufficient:} Recurrence/convolution not necessary for SOTA sequence modeling
\item \textbf{Superior Quality:} New SOTA on major MT benchmarks
\item \textbf{Efficiency:} More parallelizable, faster training, fewer FLOPs
\item \textbf{Generalizability:} Strong performance on non-translation tasks
\end{enumerate}
\vspace{0.5cm}
\item \highlight{Transformative Impact:}
\begin{itemize}
\item \textbf{Paradigm Shift:} Revolutionized NLP and sequence modeling
\item \textbf{Foundation for Modern LLMs:} Paved the way for BERT, GPT, T5, etc.
\item \textbf{Enabled Scaling:} Facilitated training of much larger models on vast datasets
\end{itemize}
\end{itemize}
\end{frame}

% Slide 15: Conclusion
\begin{frame}
\frametitle{Conclusion}
\begin{itemize}
\item The Transformer introduced a novel architecture based \textit{solely} on attention, successfully dispensing with recurrence and convolutions.
\vspace{0.5cm}
\item It demonstrated significant improvements in:
\begin{itemize}
\item \highlight{Translation Quality} (new SOTA BLEU scores)
\item \highlight{Training Efficiency} (faster, more parallelizable)
\item \highlight{Generalization} (strong parsing results)
\end{itemize}
\vspace{0.5cm}
\item This work marked a pivotal moment, establishing attention as a primary mechanism for sequence modeling and unlocking new possibilities in AI.
\end{itemize}
\end{frame}

% Slide 16: Future Directions & Q&A
\begin{frame}
\frametitle{Future Directions \& Q\&A}
\begin{itemize}
\item \highlight{Future Work (Proposed by Authors):}
\begin{itemize}
\item Apply Transformer to other modalities (images, audio, video)
\item Explore local/restricted attention mechanisms for very long sequences
\item Investigate non-auto-regressive generation for further speed-ups
\end{itemize}
\end{itemize}

\vspace{1cm}
\begin{center}
\Large{\highlight{Thank You!}}
\vspace{0.5cm}

\Large{\highlight{Questions?}}
\end{center}
\end{frame}

\end{document}