import gradio as gr import matplotlib.pyplot as plt import numpy as np import math from datetime import datetime from matplotlib.ticker import FuncFormatter # Predefined hyperparameter sets PARAM_SETS = { "Stack-V2-Python": {"E": 0.69123678, "A": 0.01130616 * 1e9, "k": 0.393463, "alpha": 0.18937067}, "Pile": {"E": 1.28254036, "A": 0.2035367 * 1e9, "k": 0.33027934, "alpha": 0.19479807} } def pred_loss(E, A, k, alpha, n, p): return E + (A / (n * (1 + np.log(p) * k))) ** alpha def generate_plot(E, A, k, alpha): plt.clf() colors = ['#2B83BA', '#7BB7D6', '#ED7D5F', '#D7191C'] ax = plt.gca() for i, p in enumerate([1, 2, 4, 8]): x_plot = np.linspace(535813376 * 0.9, 4353203200 * 1.1, 100) y_plot = pred_loss(E, A, k, alpha, x_plot, p) ax.plot(x_plot, y_plot, marker=None, markersize=1, linewidth=3, color=colors[int(math.log(p, 2))], label=f"$P={p}$") ax.legend(fontsize=12) # ax.set_xscale("log") # ax.set_yscale("log") def billions(x, pos): if x < 1e9: result = "" else: result = f'{x * 1e-9:.1f}B' return result ax.xaxis.set_major_formatter(FuncFormatter(billions)) ax.xaxis.set_minor_formatter(FuncFormatter(billions)) ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{x:.2f}")) ax.yaxis.set_minor_formatter(FuncFormatter(lambda x, pos: f"{x:.2f}")) ax.set_xlim(535813376 * 0.9, 4353203200 * 1.1) ax.set_ylim(ax.get_ylim()[0] * 1, ax.get_ylim()[1] * 1.01) ax.text(0.03, 0.03, f"$E={E}$\n$A={A}$\n$k={k}$\n$\\alpha={alpha}$", transform=ax.transAxes, fontsize=10, verticalalignment='bottom', multialignment='left') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.set_xlabel('Parameters (Non-Embedding)', fontsize=12) ax.set_ylabel(f'Loss', fontsize=12) return plt OUTPUT_TEMPLATE = """Loss for a {n}B model when P={p} is: **{loss:.5f}**. It is equivalant to: - A **{n1}B** model with **P=1**; - A **{n2}B** model with **P=2**; - A **{n4}B** model with **P=4**; - A **{n8}B** model with **P=8**; Note: The equivalent parameters are for reference only. In some reasoning tasks, scaling the parallel streams will obtain more performance gains than the loss benefits! Enjoy it! 😊""" def process_inputs(E, A, k, alpha, n, p): """Process inputs and return results""" n = n * 1e9 plot = generate_plot(E, A, k, alpha) loss = pred_loss(E, A, k, alpha, n, p) n1 = n * (k * np.log(p) + 1) / (k * np.log(1) + 1) / 1e9 n2 = n * (k * np.log(p) + 1) / (k * np.log(2) + 1) / 1e9 n4 = n * (k * np.log(p) + 1) / (k * np.log(4) + 1) / 1e9 n8 = n * (k * np.log(p) + 1) / (k * np.log(8) + 1) / 1e9 print(f"[{datetime.now()}] {E = }, {A = }, {k = }, {alpha = }, {n = }, {p = }") return plot, OUTPUT_TEMPLATE.format(n=round(n / 1e9, 2), p=p, n1=round(n1, 2), n2=round(n2, 2), n4=round(n4, 2), n8=round(n8, 2), loss=loss) # Create interface HEAD = """