<!doctype html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Repo-Level Dedupe Visualization</title>
  <link rel="stylesheet" href="style.css" />
  <!--
    Vega, Vega-Lite and Vega-Embed are loaded synchronously and in this order:
    the inline script at the end of the body calls vegaEmbed as soon as it is
    parsed, so all three libraries must already be available by then.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
<div class="header">
<h1>Visualizing Repo-Level Dedupe</h1>
<p>
This visualization demonstrates block-level deduplication across all
models in
<a
target="_blank"
href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
>bartowski/gemma-2-9b-it-GGUF</a
>.
</p>
<p>
Each row represents a file in the repository, grouped into blocks of up
to 64MB. The color of each block represents the deduplication ratio
for the block, which is a function of how often the chunks in the
block are shared between files. The darker the color, the more
frequently content is shared and the better the overall upload and
download times for a given file! The deduplication savings here take a
191GB repo and cut it down to 97GB, helping to shave a few hours off
the upload time.
</p>
<p>
You can read more about chunks, blocks, and the nitty-gritty details
of how we make this all work in our accompanying
<a
target="_blank"
href="https://huggingface.co/blog/from-chunks-to-blocks"
>blog post</a
>.
</p>
To explore the visualization:
<ul>
<li>
<strong>Hover</strong> over a block in an individual file to
highlight it and see where else it appears in the repository.
</li>
<li>
<strong>Click</strong> any block in a file to see all other files
that share blocks.
</li>
<li>
<strong>Double-click</strong> anywhere on any file to reset and
continue exploring.
</li>
</ul>
</div>
<div class="heatmap-container">
<div id="vis"></div>
</div>
</div>
<script>
  // Vega-Lite spec for the dedupe heatmap: a "rect" mark per record loaded
  // from xorbs.json, faceted into one row per file ("repo"). Each record
  // carries a block id ("xorb_id") and a "dedupe_factor".
  var vlSpec = {
    $schema: "https://vega.github.io/schema/vega-lite/v5.json",
    // Every facet row resolves its own x scale instead of sharing one.
    resolve: { scale: { x: "independent" } },
    width: 800,
    height: 25,
    params: [
      {
        // Hover selection keyed on the block id; drives the orange highlight
        // in the color encoding below.
        name: "highlight",
        select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
      },
      {
        // Click selection keyed on the file of the clicked block.
        // NOTE(review): toggle is passed as the expression string "false"
        // rather than the boolean false — presumably to disable toggling;
        // confirm against the Vega-Lite selection docs before changing it.
        name: "select",
        select: { type: "point", fields: ["repo"], toggle: "false" },
      },
      {
        // The repo_xorb_selected value of every record (see transforms).
        // NOTE(review): 'source_0' is presumably the dataset name Vega-Lite
        // generates for this spec's data — re-check if the spec is restructured.
        name: "xorbs_selected",
        expr: "pluck(data('source_0'), 'repo_xorb_selected')",
      },
      {
        // Whether xorbs_selected has a non-null minimum, i.e. some record
        // currently belongs to a selected file.
        name: "any_xorbs_selected",
        expr: "extent(xorbs_selected)[0] != null",
      },
    ],
    transform: [
      {
        // 1-based position of this record's file within the "select"
        // selection, or 0 when nothing is selected / the file is not in it.
        // NOTE(review): this runs before "repo" is shortened below, while the
        // selection stores the value seen on the mark — confirm the two still
        // match when repo contains "/".
        calculate:
          "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
        as: "repo_selected",
      },
      {
        // Block id for records of a selected file, null for all others.
        calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
        as: "repo_xorb_selected",
      },
      {
        // Keep only the last "/"-separated segment of repo (the file name).
        calculate:
          "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
        as: "repo",
      },
    ],
    data: {
      url: "xorbs.json",
    },
    mark: "rect",
    encoding: {
      // Blocks are laid out along x, ordered by descending dedupe_factor,
      // with the axis hidden.
      x: {
        field: "xorb_id",
        axis: null,
        sort: { field: "dedupe_factor", order: "descending" },
        stack: "normalize",
      },
      // Orange for the hovered block id wherever it appears; otherwise a
      // blue ramp over dedupe_factor with a fixed [0, 10] domain.
      color: {
        condition: [
          { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
        ],
        field: "dedupe_factor",
        type: "quantitative",
        scale: { scheme: "blues", domain: [0, 10] },
      },
      // Once a file is selected, fade every block whose id is not among the
      // selected file's block ids.
      opacity: {
        condition: [
          {
            test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
            value: 0.2,
          },
        ],
      },
      tooltip: [
        { field: "repo", type: "nominal", title: "File" },
        { field: "xorb_id", type: "nominal", title: "Block Hash" },
        {
          field: "dedupe_factor",
          type: "quantitative",
          title: "Dedupe Factor",
        },
      ],
      // One facet row per file, sorted by name, with left-aligned labels.
      row: {
        field: "repo",
        title: "",
        spacing: 1,
        header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
        sort: { field: "repo", order: "ascending" },
      },
    },
  };
  // vegaEmbed returns a Promise; log embed failures to the console instead
  // of leaving an unhandled rejection.
  vegaEmbed("#vis", vlSpec).catch(console.error);
</script>
</body>
</html>