kshitijthakkar
committed on
Commit
·
701d26a
1
Parent(s):
36ea433
Docker file fix
Browse files- Dockerfile +12 -9
- app.py +2 -0
- mcp_server.py +3 -4
- outage_odyssey_ui.py +1 -1
- sample_incidents.md +264 -131
Dockerfile
CHANGED
@@ -1,27 +1,30 @@
|
|
1 |
# Dockerfile for a Python application with user permissions
|
2 |
FROM python:3.11-slim
|
3 |
|
4 |
-
# Install system dependencies as root
|
5 |
RUN apt-get update && apt-get install -y build-essential
|
6 |
|
7 |
-
# Create user and
|
8 |
-
RUN useradd -m -u 1000 user
|
9 |
-
|
10 |
-
|
11 |
|
12 |
# Set working directory
|
13 |
WORKDIR /app
|
14 |
|
|
|
|
|
|
|
|
|
15 |
# Copy files with proper ownership
|
16 |
COPY --chown=user:user . /app
|
17 |
|
18 |
# Install Python dependencies
|
19 |
COPY --chown=user:user ./requirements.txt requirements.txt
|
20 |
RUN pip install --no-cache-dir --upgrade pip && \
|
21 |
-
pip install --no-cache-dir --user -r requirements.txt
|
22 |
|
23 |
-
|
24 |
EXPOSE 8000 7860
|
25 |
|
26 |
-
#CMD ["
|
27 |
-
CMD
|
|
|
1 |
# Dockerfile for a Python application with user permissions
|
2 |
FROM python:3.11-slim
|
3 |
|
4 |
+
# Install system dependencies as root
|
5 |
RUN apt-get update && apt-get install -y build-essential
|
6 |
|
7 |
+
# Create user and set up directory structure as root
|
8 |
+
RUN useradd -m -u 1000 user && \
|
9 |
+
mkdir -p /app && \
|
10 |
+
chown -R user:user /app
|
11 |
|
12 |
# Set working directory
|
13 |
WORKDIR /app
|
14 |
|
15 |
+
# Switch to user AFTER setting up permissions
|
16 |
+
USER user
|
17 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
18 |
+
|
19 |
# Copy files with proper ownership
|
20 |
COPY --chown=user:user . /app
|
21 |
|
22 |
# Install Python dependencies
|
23 |
COPY --chown=user:user ./requirements.txt requirements.txt
|
24 |
RUN pip install --no-cache-dir --upgrade pip && \
|
25 |
+
pip install --no-cache-dir --user -r requirements.txt
|
26 |
|
|
|
27 |
EXPOSE 8000 7860
|
28 |
|
29 |
+
#CMD ["sh", "-c", "uvicorn mcp_server:app --host 0.0.0.0 --port 8000 && python app.py"]
|
30 |
+
CMD bash -c "python /app/mcp_server.py & sleep 60 && python /app/app.py"
|
app.py
CHANGED
@@ -114,6 +114,8 @@ try:
|
|
114 |
agent.name = "Outage Odyssey Agent"
|
115 |
GradioUI(agent=agent).launch(share=True,mcp_server=True) ##, file_upload_folder="uploaded_data", mcp_server=True,debug=True
|
116 |
|
|
|
|
|
117 |
finally:
|
118 |
mcp_client.disconnect()
|
119 |
print("MCP client disconnected")
|
|
|
114 |
agent.name = "Outage Odyssey Agent"
|
115 |
GradioUI(agent=agent).launch(share=True,mcp_server=True) ##, file_upload_folder="uploaded_data", mcp_server=True,debug=True
|
116 |
|
117 |
+
except Exception as e:
|
118 |
+
print(f"Error starting Gradio: {str(e)}")
|
119 |
finally:
|
120 |
mcp_client.disconnect()
|
121 |
print("MCP client disconnected")
|
mcp_server.py
CHANGED
@@ -25,7 +25,6 @@ import PyPDF2
|
|
25 |
# Load environment variables
|
26 |
load_dotenv()
|
27 |
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
28 |
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
29 |
# Setup logging
|
30 |
log_handler = RotatingFileHandler("mcp_server.log", maxBytes=10 * 1024 * 1024, backupCount=7)
|
31 |
log_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
@@ -522,7 +521,7 @@ def execute_incident_code(code: str) -> str:
|
|
522 |
# >>> generate_image_with_flux("A cartoon server room with smoke coming out")
|
523 |
# "generated_images/comic_image_123456789.png"
|
524 |
# """
|
525 |
-
#
|
526 |
# try:
|
527 |
# client = openai.OpenAI()
|
528 |
# response = client.images.generate(
|
@@ -664,11 +663,11 @@ async def health_check(request: Request) -> Response:
|
|
664 |
return JSONResponse({"status": "ok"})
|
665 |
|
666 |
|
667 |
-
def start_mcp_server():
|
668 |
try:
|
669 |
print("🚀 Starting server...")
|
670 |
logger.info("Starting Incident Comic Generator server")
|
671 |
-
mcp.run(transport="sse")
|
672 |
# mcp.run(transport="stdio")
|
673 |
except Exception as e:
|
674 |
logger.error(f"Server failed to start: {str(e)}")
|
|
|
25 |
# Load environment variables
|
26 |
load_dotenv()
|
27 |
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
|
|
28 |
# Setup logging
|
29 |
log_handler = RotatingFileHandler("mcp_server.log", maxBytes=10 * 1024 * 1024, backupCount=7)
|
30 |
log_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
|
|
521 |
# >>> generate_image_with_flux("A cartoon server room with smoke coming out")
|
522 |
# "generated_images/comic_image_123456789.png"
|
523 |
# """
|
524 |
+
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
525 |
# try:
|
526 |
# client = openai.OpenAI()
|
527 |
# response = client.images.generate(
|
|
|
663 |
return JSONResponse({"status": "ok"})
|
664 |
|
665 |
|
666 |
+
async def start_mcp_server():
|
667 |
try:
|
668 |
print("🚀 Starting server...")
|
669 |
logger.info("Starting Incident Comic Generator server")
|
670 |
+
await mcp.run(transport="sse")
|
671 |
# mcp.run(transport="stdio")
|
672 |
except Exception as e:
|
673 |
logger.error(f"Server failed to start: {str(e)}")
|
outage_odyssey_ui.py
CHANGED
@@ -718,7 +718,7 @@ Actions taken: Rolled back changes, investigating access controls"""
|
|
718 |
# Add a link to the GitHub repository
|
719 |
gr.Markdown("""
|
720 |
### 📖 **Source Code**
|
721 |
-
Check out the source code on [GitHub](https://github.com)
|
722 |
""", elem_classes="panel")
|
723 |
|
724 |
|
|
|
718 |
# Add a link to the GitHub repository
|
719 |
gr.Markdown("""
|
720 |
### 📖 **Source Code**
|
721 |
+
Check out the source code on [GitHub](https://github.com/Mandark-droid/OutageOdyssey)
|
722 |
""", elem_classes="panel")
|
723 |
|
724 |
|
sample_incidents.md
CHANGED
@@ -1,187 +1,320 @@
|
|
1 |
-
The Case of the Phantom Pepperoni: A NullPointerException Saga
|
2 |
-
Incident Summary
|
3 |
-
🔥 Affected Services
|
4 |
-
PizzaTrackerService (critical path for order processing)
|
5 |
-
OvenScheduler (automated baking system)
|
6 |
-
DroneNavigation (relied on bake-time ETA)
|
7 |
-
📉 Violated KPIs
|
8 |
-
Order Accuracy: Dropped to 58% (target: 99.9%)
|
9 |
-
Delivery Time: Increased to ∞ minutes (target: 30 mins)
|
10 |
-
Customer Satisfaction: Fell to "Why is my pizza literally on fire?"
|
11 |
-
🚨 Critical Alerts
|
12 |
-
text
|
13 |
-
[18:30:00] CRITICAL: NullPointerException at CheesyBytes.PizzaTrackerService.calculateOvenTime(Pizza.java:127) - Failed to invoke 'getToppingConfig()' on null object reference [18:31:23] ALERT: Oven #7 reported "unusual cheese flare-up" [18:35:45] WARNING: Drone fleet stuck in loop singing "Never Gonna Give You Up"
|
14 |
-
🔍 Forensic Data
|
15 |
-
Stack Trace:
|
16 |
-
text
|
17 |
-
java.lang.NullPointerException: Cannot invoke "ToppingConfig.getCookingTime()" because the return value of "Pizza.getToppingConfig()" is null
|
18 |
-
Variables at Fault:
|
19 |
-
currentPizza.getToppingConfig(): null
|
20 |
-
oven.preheat(): Called with null temperature value
|
21 |
-
Log Anomalies:
|
22 |
-
127 instances of Pizza{name='Phantom Pepperoni', config=null}
|
23 |
-
Drone logs showed 694 attempts to calculate route to "NaN,NaN"
|
24 |
-
⏰ Event Timeline
|
25 |
-
18:29:55: Deployment of "Pepperoni++" v2.1.3 (removed null-check in PizzaFactory)
|
26 |
-
18:30:01: First NullPointerException observed
|
27 |
-
18:32:00: OvenScheduler began interpreting null bake time as "MAX_INT minutes"
|
28 |
-
18:40:00: Security cameras captured engineers offering burnt pizzas to a NPE gremlin drawn on the whiteboard
|
29 |
-
🛠️ Resolution
|
30 |
-
The team:
|
31 |
-
Rolled back to v2.1.2 (which had a if (config != null) check)
|
32 |
-
Deployed @NonNull annotations on all getToppingConfig() calls
|
33 |
-
Hired a "NullPointerException Prevention Clown" for code review parties
|
34 |
-
Post-Incident Confession:
|
35 |
-
A developer later admitted: "I thought pepperoni didn’t need configs. It’s just meat confetti!"
|
36 |
-
Clues for Root Cause Hunters:
|
37 |
-
The deployment removed a null-check that previously handled legacy pizza types
|
38 |
-
Forensic data shows Phantom Pepperoni had no corresponding entry in the ToppingConfig database
|
39 |
-
The Pizza object constructor allowed toppingConfig to default to null if unspecified
|
40 |
-
The answer lies in the cheese... or the lack thereof. 🧀👻
|
41 |
|
42 |
-
---
|
43 |
-
The Curious Case of the Exploding Cupcake Counter: An Integer Overflow Incident
|
44 |
-
Incident Summary: On 2025-06-06T14:22:00Z, the CupcakeCounter microservice at SweetStats Analytics experienced a catastrophic integer overflow. This resulted in the system reporting negative cupcake sales, causing the finance dashboard to display “You owe the universe 2,147,483,648 cupcakes.” The marketing team briefly launched a “Cupcake Debt Forgiveness” campaign before the error was traced.
|
45 |
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
|
49 |
-
|
50 |
|
51 |
-
|
|
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
Sales Accuracy: -2,147,483,648% (target: 100%)
|
55 |
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
at SweetStats.SalesService.processOrder(SalesService.java:45)
|
72 |
-
Variable State:
|
73 |
|
74 |
-
|
75 |
|
76 |
-
|
|
|
|
|
|
|
|
|
77 |
|
78 |
-
|
79 |
|
80 |
-
|
|
|
|
|
|
|
|
|
81 |
|
82 |
-
|
|
|
83 |
|
84 |
-
|
85 |
-
14:21:55: “Cupcake Mania” flash sale begins
|
86 |
|
87 |
-
|
|
|
|
|
|
|
88 |
|
89 |
-
|
90 |
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
The team:
|
97 |
|
98 |
-
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
|
102 |
-
|
|
|
|
|
|
|
103 |
|
104 |
-
|
105 |
-
A developer admitted: “Who knew there were that many cupcake lovers? I thought 2 billion was enough for everyone!”
|
106 |
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
-
|
|
|
|
|
110 |
|
111 |
-
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
-
The mystery is baked in the numbers. 🧁💥
|
118 |
---
|
119 |
-
The Great Bread Uprising: A Foreign Key Fiasco
|
120 |
-
Incident Summary: On 2025-06-06T08:15:00Z, DoughyDelights Bakery’s inventory system triggered a SQL error 1451 after attempting to delete the "Sourdough Starter" recipe. This caused 12,000 baguettes to morph into pretzels in delivery apps, stranded 45 delivery drivers at "Bread Narnia" coordinates, and spawned customer complaints like "My ciabatta is judging me."
|
121 |
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
|
127 |
-
|
|
|
|
|
|
|
128 |
|
129 |
-
|
130 |
-
Inventory Accuracy: Dropped to -7% (target: 99.5%)
|
131 |
|
132 |
-
|
|
|
|
|
|
|
133 |
|
134 |
-
|
135 |
|
136 |
-
🚨 Critical Alerts
|
137 |
-
|
138 |
-
[
|
139 |
-
[
|
140 |
-
[
|
141 |
-
|
142 |
-
Error Log:
|
143 |
|
144 |
-
|
145 |
-
ERROR 1451: Cannot delete or update a parent row: a foreign key constraint fails
|
146 |
-
(DoughyDelights.orders, CONSTRAINT fk_recipe_id FOREIGN KEY (recipe_id)
|
147 |
-
REFERENCES RecipeRegistry (id))
|
148 |
-
Offending Query:
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
-
|
155 |
|
156 |
-
|
|
|
|
|
|
|
157 |
|
158 |
-
|
159 |
-
08:14:55: Overzealous intern executed "Legacy Recipe Purge" script
|
160 |
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
-
|
164 |
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
|
170 |
-
|
171 |
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
-
|
175 |
|
176 |
-
|
177 |
-
The
|
|
|
|
|
|
|
|
|
178 |
|
179 |
-
|
|
|
180 |
|
181 |
-
|
182 |
|
183 |
-
|
184 |
|
185 |
-
|
|
|
|
|
|
|
186 |
|
187 |
-
The
|
|
|
1 |
+
**The Case of the Phantom Pepperoni: A NullPointerException Saga**
|
2 |
+
**Incident Summary**: On **2025-06-05T18:30:00Z**, the *PizzaTrackerService* at **CheesyBytes Inc.** (a food delivery app) began reporting **"Phantom Pepperoni" orders** due to a `NullPointerException` in the `calculateOvenTime()` method. The incident caused **42% of pizzas** to bake indefinitely, triggering smoke alarms at 3 pizzerias and confusing delivery drones that circled HQ for hours.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
+
---
|
|
|
|
|
5 |
|
6 |
+
### 🔥 **Affected Services**
|
7 |
+
- **PizzaTrackerService** (critical path for order processing)
|
8 |
+
- **OvenScheduler** (automated baking system)
|
9 |
+
- **DroneNavigation** (relied on bake-time ETA)
|
10 |
|
11 |
+
---
|
12 |
|
13 |
+
### 📉 **Violated KPIs**
|
14 |
+
1. **Order Accuracy**: Dropped to 58% (target: 99.9%)
|
15 |
+
2. **Delivery Time**: Increased to **∞ minutes** (target: 30 mins)
|
16 |
+
3. **Customer Satisfaction**: Fell to "Why is my pizza *literally* on fire?"
|
17 |
|
18 |
+
---
|
|
|
19 |
|
20 |
+
### 🚨 **Critical Alerts**
|
21 |
+
```
|
22 |
+
[18:30:00] CRITICAL: NullPointerException at
|
23 |
+
CheesyBytes.PizzaTrackerService.calculateOvenTime(Pizza.java:127)
|
24 |
+
- Failed to invoke 'getToppingConfig()' on null object reference
|
25 |
+
[18:31:23] ALERT: Oven #7 reported "unusual cheese flare-up"
|
26 |
+
[18:35:45] WARNING: Drone fleet stuck in loop singing "Never Gonna Give You Up"
|
27 |
+
```
|
28 |
|
29 |
+
---
|
30 |
|
31 |
+
### 🔍 **Forensic Data**
|
32 |
+
**Stack Trace**:
|
33 |
+
```
|
34 |
+
java.lang.NullPointerException: Cannot invoke "ToppingConfig.getCookingTime()"
|
35 |
+
because the return value of "Pizza.getToppingConfig()" is null
|
36 |
+
```
|
37 |
+
**Variables at Fault**:
|
38 |
+
- `currentPizza.getToppingConfig()`: **null**
|
39 |
+
- `oven.preheat()`: Called with `null` temperature value
|
40 |
|
41 |
+
**Log Anomalies**:
|
42 |
+
- 127 instances of `Pizza{name='Phantom Pepperoni', config=null}`
|
43 |
+
- Drone logs showed 694 attempts to calculate route to "NaN,NaN"
|
|
|
|
|
44 |
|
45 |
+
---
|
46 |
|
47 |
+
### ⏰ **Event Timeline**
|
48 |
+
1. **18:29:55**: Deployment of "Pepperoni++" v2.1.3 (removed null-check in `PizzaFactory`)
|
49 |
+
2. **18:30:01**: First `NullPointerException` observed
|
50 |
+
3. **18:32:00**: OvenScheduler began interpreting `null` bake time as "MAX_INT minutes"
|
51 |
+
4. **18:40:00**: Security cameras captured engineers offering burnt pizzas to an NPE gremlin drawn on the whiteboard
|
52 |
|
53 |
+
---
|
54 |
|
55 |
+
### 🛠️ **Resolution**
|
56 |
+
The team:
|
57 |
+
1. Rolled back to v2.1.2 (which had an `if (config != null)` check)
|
58 |
+
2. Deployed `@NonNull` annotations on all `getToppingConfig()` calls
|
59 |
+
3. Hired a "NullPointerException Prevention Clown" for code review parties
|
60 |
|
61 |
+
**Post-Incident Confession**:
|
62 |
+
A developer later admitted: "I thought pepperoni didn’t *need* configs. It’s just meat confetti!"
|
63 |
|
64 |
+
---
|
|
|
65 |
|
66 |
+
**Clues for Root Cause Hunters**:
|
67 |
+
- The deployment removed a null-check that previously handled legacy pizza types
|
68 |
+
- Forensic data shows `Phantom Pepperoni` had no corresponding entry in the `ToppingConfig` database
|
69 |
+
- The `Pizza` object constructor allowed `toppingConfig` to default to `null` if unspecified
|
70 |
|
71 |
+
*The answer lies in the cheese... or the lack thereof.* 🧀👻
|
72 |
|
73 |
+
[1] https://sentry.io/answers/what-is-a-nullpointerexception-and-how-do-i-fix-it/
|
74 |
+
[2] http://support.sas.com/kb/47290
|
75 |
+
[3] https://www.weblineindia.com/blog/fix-nullpointerexception-in-java-with-examples/
|
76 |
+
[4] https://stackoverflow.com/questions/218384/what-is-a-nullpointerexception-and-how-do-i-fix-it
|
77 |
+
[5] https://www.youtube.com/watch?v=2b22PiQx8xc
|
78 |
+
[6] https://www.digitalocean.com/community/tutorials/java-lang-nullpointerexception
|
79 |
+
[7] https://www.harness.io/blog/java-nullpointerexception-solving-it
|
80 |
+
[8] https://howtodoinjava.com/java/exception-handling/how-to-effectively-handle-nullpointerexception-in-java/
|
81 |
+
[9] https://forum.step.esa.int/t/java-null-pointer-exception/40829
|
82 |
+
[10] https://blogs.oracle.com/fusionhcmcoe/post/autocomplete-java-null-pointer-exception
|
83 |
|
84 |
+
---
|
85 |
+
**The Great Bread Uprising: A Foreign Key Fiasco**
|
86 |
+
**Incident Summary**: On **2025-06-06T08:15:00Z**, **DoughyDelights Bakery**’s inventory system triggered a **SQL error 1451** after attempting to delete the "Sourdough Starter" recipe. This caused **12,000 baguettes** to morph into pretzels in delivery apps, stranded 45 delivery drivers at "Bread Narnia" coordinates, and spawned customer complaints like "My ciabatta is judging me."
|
87 |
|
88 |
+
---
|
|
|
89 |
|
90 |
+
### 🥖 **Affected Services**
|
91 |
+
- **RecipeRegistry** (core recipe database)
|
92 |
+
- **BreadGenerator** (batch baking scheduler)
|
93 |
+
- **DeliveryPathfinder** (GPS routing tied to recipe IDs)
|
94 |
|
95 |
+
---
|
96 |
|
97 |
+
### 📉 **Violated KPIs**
|
98 |
+
1. **Inventory Accuracy**: Dropped to -7% (target: 99.5%)
|
99 |
+
2. **Waste Metrics**: Increased by 420% (target: ≤5%)
|
100 |
+
3. **Driver Sanity**: Replaced with "Why am I delivering *negative bread*?"
|
101 |
|
102 |
+
---
|
|
|
103 |
|
104 |
+
### 🚨 **Critical Alerts**
|
105 |
+
```
|
106 |
+
[08:15:02] CRITICAL: SQL Error 1451 - "Cannot delete parent row: foreign key constraint fails"
|
107 |
+
[08:16:10] ALERT: Baker #3 reported "rye dough singing *Never Bake Alone*"
|
108 |
+
[08:20:00] WARNING: Delivery maps now route to /dev/null
|
109 |
+
```
|
110 |
+
|
111 |
+
---
|
112 |
+
|
113 |
+
### 🔍 **Forensic Data**
|
114 |
+
**Error Log**:
|
115 |
+
```
|
116 |
+
ERROR 1451: Cannot delete or update a parent row: a foreign key constraint fails
|
117 |
+
(`DoughyDelights`.`orders`, CONSTRAINT `fk_recipe_id` FOREIGN KEY (`recipe_id`)
|
118 |
+
REFERENCES `RecipeRegistry` (`id`))
|
119 |
+
```
|
120 |
+
**Offending Query**:
|
121 |
+
```sql
|
122 |
+
DELETE FROM RecipeRegistry WHERE recipe_name = 'Sourdough Starter';
|
123 |
+
```
|
124 |
|
125 |
+
**Database Snapshot**:
|
126 |
+
- **Orphaned Orders**: 14,892 orders referencing `recipe_id=NULL`
|
127 |
+
- **GPS Anomalies**: 67 drivers attempted deliveries to `POINT(∅,∅)`
|
128 |
|
129 |
+
---
|
130 |
|
131 |
+
### ⏰ **Event Timeline**
|
132 |
+
1. **08:14:55**: Overzealous intern executed "Legacy Recipe Purge" script
|
133 |
+
2. **08:15:01**: First foreign key violation detected
|
134 |
+
3. **08:17:30**: BreadGenerator interpreted missing recipes as "pretzel mode"
|
135 |
+
4. **08:25:00**: Surveillance footage showed CFO trying to bribe a baguette
|
136 |
|
137 |
+
---
|
138 |
+
|
139 |
+
### 🛠️ **Resolution**
|
140 |
+
The team:
|
141 |
+
1. Restored `Sourdough Starter` from backup (with **ON DELETE RESTRICT** added)
|
142 |
+
2. Deployed regex filter blocking `DELETE` commands containing "starter"
|
143 |
+
3. Hosted a "Foreign Key Appreciation Day" with constraint-themed cupcakes
|
144 |
+
|
145 |
+
**Post-Incident Confession**:
|
146 |
+
The intern later tweeted: "#YOLO DELETE statements are the gluten-free option of SQL."
|
147 |
|
|
|
148 |
---
|
|
|
|
|
149 |
|
150 |
+
**Clues for Root Cause Detectives**:
|
151 |
+
- The `orders` table had a foreign key dependency on `RecipeRegistry.id` [3]
|
152 |
+
- No `ON DELETE CASCADE` clause existed in the schema
|
153 |
+
- Audit logs showed 127 pre-incident warnings about "unreferenced recipe deletions"
|
154 |
+
|
155 |
+
*The proof is in the foreign pudding.* 🍮🔑
|
156 |
+
|
157 |
+
[1] https://www.linkedin.com/pulse/10-common-bugs-sql-query-zita-demeter-yumuc
|
158 |
+
[2] https://www.ibm.com/docs/en/idr/11.4.0?topic=sm-sql-error-codes-nnnn
|
159 |
+
[3] https://ai2sql.io/common-sql-error-codes
|
160 |
+
[4] https://learnsql.com/blog/five-common-sql-errors/
|
161 |
+
[5] https://docs.intersystems.com/latest/csp/docbook/DocBook.UI.Page.cls?KEY=RERR_sql
|
162 |
+
[6] http://blog.solvaria.com/top-common-errors-sql-oracle
|
163 |
+
[7] https://www.stratascratch.com/blog/top-most-common-sql-coding-errors-in-data-science/
|
164 |
+
[8] https://www.metabase.com/learn/sql/debugging-sql/sql-syntax
|
165 |
+
---
|
166 |
+
**The Curious Case of the Exploding Cupcake Counter: An Integer Overflow Incident**
|
167 |
+
**Incident Summary:** On **2025-06-06T14:22:00Z**, the CupcakeCounter microservice at **SweetStats Analytics** experienced a catastrophic integer overflow. This resulted in the system reporting negative cupcake sales, causing the finance dashboard to display “You owe the universe 2,147,483,648 cupcakes.” The marketing team briefly launched a “Cupcake Debt Forgiveness” campaign before the error was traced.
|
168 |
|
169 |
+
---
|
170 |
|
171 |
+
### 🧁 **Affected Services**
|
172 |
+
- **CupcakeCounter** (real-time sales tally)
|
173 |
+
- **FinanceDashboard** (exec-level reporting)
|
174 |
+
- **RewardsEngine** (customer loyalty points)
|
175 |
|
176 |
+
---
|
|
|
177 |
|
178 |
+
### 📉 **Violated KPIs**
|
179 |
+
1. **Sales Accuracy:** -2,147,483,648% (target: 100%)
|
180 |
+
2. **Reward Points Issued:** 0 (target: 1 per cupcake)
|
181 |
+
3. **Dashboard Uptime:** 78% (target: 99.9%)
|
182 |
|
183 |
+
---
|
184 |
|
185 |
+
### 🚨 **Critical Alerts**
|
186 |
+
```
|
187 |
+
[14:22:01] CRITICAL: ArithmeticException: integer overflow in CupcakeCounter.updateSales()
|
188 |
+
[14:22:03] ALERT: FinanceDashboard shows negative cupcake revenue
|
189 |
+
[14:23:10] WARNING: RewardsEngine issued 0 points for 2 million cupcake purchases
|
190 |
+
```
|
|
|
191 |
|
192 |
+
---
|
|
|
|
|
|
|
|
|
193 |
|
194 |
+
### 🔍 **Forensic Data**
|
195 |
+
**Error Log:**
|
196 |
+
```
|
197 |
+
java.lang.ArithmeticException: integer overflow
|
198 |
+
at SweetStats.CupcakeCounter.updateSales(CupcakeCounter.java:88)
|
199 |
+
at SweetStats.SalesService.processOrder(SalesService.java:45)
|
200 |
+
```
|
201 |
+
**Variable State:**
|
202 |
+
- `int totalCupcakesSold = 2147483647` (max value for signed 32-bit int)
|
203 |
+
- Next sale increments value to -2,147,483,648
|
204 |
+
|
205 |
+
**Database Snapshot:**
|
206 |
+
- `sales_total` column in `CupcakeSales` table: -2,147,483,648
|
207 |
+
- Loyalty points for user `cupcake_queen`: 0 (expected: 1,000,000)
|
208 |
+
|
209 |
+
---
|
210 |
+
|
211 |
+
### ⏰ **Event Timeline**
|
212 |
+
1. **14:21:55:** “Cupcake Mania” flash sale begins
|
213 |
+
2. **14:22:00:** Record-breaking 2 million orders in 5 seconds
|
214 |
+
3. **14:22:01:** Integer overflow triggers ArithmeticException
|
215 |
+
4. **14:23:30:** FinanceDashboard replaced currency symbols with crying emojis
|
216 |
+
5. **14:25:00:** CEO tweets “We are now in negative cupcakes. Please eat responsibly.”
|
217 |
+
|
218 |
+
---
|
219 |
+
|
220 |
+
### 🛠️ **Resolution**
|
221 |
+
The team:
|
222 |
+
1. Migrated `int` to `long` in all cupcake counters
|
223 |
+
2. Added overflow detection and alerts
|
224 |
+
3. Sent apology cupcakes to all affected customers (with apology sprinkles)
|
225 |
+
|
226 |
+
**Post-Incident Confession:**
|
227 |
+
A developer admitted: “Who knew there were that many cupcake lovers? I thought 2 billion was enough for everyone!”
|
228 |
+
|
229 |
+
---
|
230 |
+
|
231 |
+
**Clues for Root Cause Sleuths:**
|
232 |
+
- CupcakeCounter used a signed 32-bit integer for sales totals
|
233 |
+
- No overflow checks or exception handling
|
234 |
+
- Database column type matched Java `int`
|
235 |
+
- Flash sale volume exceeded maximum representable value
|
236 |
+
|
237 |
+
*The mystery is baked in the numbers.* 🧁💥
|
238 |
+
|
239 |
+
---
|
240 |
+
|
241 |
+
## The Great Database Traffic Jam: SQLTimeoutException Tango
|
242 |
+
|
243 |
+
**Incident Summary:**
|
244 |
+
On **2025-06-06T16:45:00Z**, the **OrderProcessor** microservice at **GadgetGuru Inc.** (an e-commerce platform for quirky gadgets) experienced a severe **SQLTimeoutException** during a flash sale on "Self-Stirring Coffee Mugs." The database connection pool ran dry, causing **15,000 pending orders** to queue indefinitely and triggering a wave of confused support tickets titled "Why is my coffee mug still manual?"
|
245 |
+
|
246 |
+
---
|
247 |
+
|
248 |
+
### 🛒 **Affected Services**
|
249 |
+
- **OrderProcessor** (order placement and payment)
|
250 |
+
- **InventoryTracker** (real-time stock management)
|
251 |
+
- **NotificationService** (order confirmation emails)
|
252 |
|
253 |
+
---
|
254 |
|
255 |
+
### 📉 **Violated KPIs**
|
256 |
+
1. **Order Processing Time:** Spiked to 9999 seconds (target: 2 seconds)
|
257 |
+
2. **Successful Order Rate:** Dropped to 8% (target: 99.9%)
|
258 |
+
3. **Customer Satisfaction:** "My coffee is getting cold waiting for my self-stirring mug!"
|
259 |
|
260 |
+
---
|
|
|
261 |
|
262 |
+
### 🚨 **Critical Alerts**
|
263 |
+
```
|
264 |
+
[16:45:01] CRITICAL: SQLTimeoutException in OrderProcessor.processOrder()
|
265 |
+
[16:45:02] ALERT: Connection pool exhausted (maxConnections=50, active=50, waiters=14,987)
|
266 |
+
[16:46:30] WARNING: NotificationService backlogged with 9,000 unsent emails
|
267 |
+
```
|
268 |
|
269 |
+
---
|
270 |
|
271 |
+
### 🔍 **Forensic Data**
|
272 |
+
**Error Log:**
|
273 |
+
```
|
274 |
+
com.mysql.jdbc.exceptions.jdbc4.MySQLTimeoutException: Connection timed out after 30 seconds
|
275 |
+
at OrderProcessor.processOrder(OrderProcessor.java:73)
|
276 |
+
at OrderService.submitOrder(OrderService.java:45)
|
277 |
+
```
|
278 |
+
**Connection Pool Metrics:**
|
279 |
+
- **Max Connections:** 50
|
280 |
+
- **Active Connections:** 50
|
281 |
+
- **Waiting Requests:** 14,987 at peak
|
282 |
+
- **Average Query Duration:** 32 seconds (normal: <0.5s)
|
283 |
|
284 |
+
**Database Snapshot:**
|
285 |
+
- **Locked Rows:** 23,000 (due to long-running transactions)
|
286 |
+
- **CPU Utilization:** 98%
|
287 |
+
- **Deadlock Events:** 0 (not a deadlock, just a traffic jam)
|
288 |
|
289 |
+
---
|
290 |
|
291 |
+
### ⏰ **Event Timeline**
|
292 |
+
1. **16:44:50:** Flash sale on "Self-Stirring Coffee Mugs" begins
|
293 |
+
2. **16:45:00:** Order volume spikes to 20,000 requests per minute
|
294 |
+
3. **16:45:01:** First SQLTimeoutException observed
|
295 |
+
4. **16:46:00:** Connection pool fully exhausted, orders start queuing
|
296 |
+
5. **16:50:00:** Support team overwhelmed with "Where’s my mug?" tickets
|
297 |
+
6. **16:55:00:** Engineers panic and try to "stir" the database with a spoon
|
298 |
|
299 |
+
---
|
300 |
|
301 |
+
### 🛠️ **Resolution**
|
302 |
+
The team:
|
303 |
+
1. **Increased the database connection pool size** from 50 to 500
|
304 |
+
2. **Optimized long-running queries** and added query timeouts
|
305 |
+
3. **Implemented rate limiting** for flash sales
|
306 |
+
4. **Hosted a "Connection Pool Appreciation Brunch"** with free coffee (manually stirred)
|
307 |
|
308 |
+
**Post-Incident Confession:**
|
309 |
+
A database admin later admitted: "I thought 50 connections was enough for everyone. Turns out, everyone wants self-stirring mugs!"
|
310 |
|
311 |
+
---
|
312 |
|
313 |
+
## **Clues for Root Cause Investigators**
|
314 |
|
315 |
+
- **Connection pool size was static and too small for flash sale traffic**
|
316 |
+
- **Long-running queries blocked connections, causing a backlog**
|
317 |
+
- **No rate limiting or circuit breakers were in place for peak events**
|
318 |
+
- **Monitoring did not alert on pending request queues until after the pool was exhausted**
|
319 |
|
320 |
+
*The culprit? A coffee mug that stirred up more trouble than coffee!* ☕🛑
|