-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add gpu stats endpoint for internal monitoring (#109)
* Add internal router * Basic implementation based on spec * Add query params for filtering * Fix query * Add documentation * . * Fix typo in filename
- Loading branch information
Showing
3 changed files
with
258 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import { Provider } from "@shared/dbSchemas/akash"; | ||
import { chainDb } from "@src/db/dbConnection"; | ||
import { isValidBech32Address } from "@src/utils/addresses"; | ||
import { round } from "@src/utils/math"; | ||
import { Hono } from "hono"; | ||
import * as semver from "semver"; | ||
import { QueryTypes } from "sequelize"; | ||
|
||
// Router for the internal endpoints (debugging/analytics, not part of the public API). | ||
export const internalRouter = new Hono(); | ||
|
||
/**
 * GET /provider-versions
 *
 * Groups the online providers by the akash version they report and returns an
 * object keyed by version. Each entry contains the version, the number of
 * providers on it, its ratio among all online providers (rounded to 2
 * decimals) and the list of provider host URIs.
 *
 * Ordering of the keys:
 *  1. valid semver versions, newest first
 *  2. version strings that are not valid semver, alphabetically
 *  3. the "<UNKNOWN>" bucket (providers without a version)
 */
internalRouter.get("/provider-versions", async (c) => {
  const providers = await Provider.findAll({
    attributes: ["hostUri", "akashVersion"],
    where: {
      isOnline: true
    },
    group: ["hostUri", "akashVersion"]
  });

  const nullVersionName = "<UNKNOWN>";

  // Group host URIs by version; a Map keeps insertion order and gives O(1) lookups.
  const hostUrisByVersion = new Map<string, string[]>();

  for (const provider of providers) {
    const version: string = provider.akashVersion ?? nullVersionName;
    const hostUris = hostUrisByVersion.get(version);

    if (hostUris) {
      hostUris.push(provider.hostUri);
    } else {
      hostUrisByVersion.set(version, [provider.hostUri]);
    }
  }

  const results = Array.from(hostUrisByVersion, ([version, hostUris]) => ({
    version,
    count: hostUris.length,
    ratio: round(hostUris.length / providers.length, 2),
    providers: Array.from(new Set(hostUris))
  }));

  // semver.compare throws on strings that are not valid semver, so versions
  // reported in an unexpected format are sorted separately instead of failing
  // the whole request.
  const isParsable = (version: string) => version !== nullVersionName && semver.valid(version) !== null;

  const parsable = results.filter((x) => isParsable(x.version)).sort((a, b) => semver.compare(b.version, a.version));
  const unparsable = results
    .filter((x) => x.version !== nullVersionName && !isParsable(x.version))
    .sort((a, b) => a.version.localeCompare(b.version));
  const unknown = results.filter((x) => x.version === nullVersionName);

  // Object.fromEntries defines own properties, so even an odd key such as
  // "__proto__" ends up as a regular entry in the response.
  const sorted = Object.fromEntries([...parsable, ...unparsable, ...unknown].map((x) => [x.version, x]));

  return c.json(sorted);
});
|
||
/**
 * GET /gpu
 *
 * Returns a summary of the GPUs exposed by online providers, based on each
 * provider's last snapshot. Only nodes with at least one allocatable GPU are
 * considered, and a single GPU row is used per node to describe its hardware.
 *
 * Optional query parameters (all can be combined):
 *  - provider: an akash address or a provider host uri
 *  - vendor: gpu vendor
 *  - model: gpu model name
 *  - memory_size: gpu memory size
 *
 * Responds with 400 when `provider` is neither a valid bech32 address nor a
 * parsable URL.
 */
internalRouter.get("/gpu", async (c) => {
  type GpuNodeRow = {
    hostUri: string;
    name: string;
    allocatable: number;
    allocated: number;
    modelId: string;
    vendor: string;
    modelName: string;
    interface: string;
    memorySize: string;
  };
  type GpuDetail = { model: string; ram: string; interface: string; allocatable: number; allocated: number };

  const provider = c.req.query("provider");
  const vendor = c.req.query("vendor");
  const model = c.req.query("model");
  const memorySize = c.req.query("memory_size");

  let providerAddress: string | null = null;
  let providerHostUri: string | null = null;

  // The provider filter accepts either an owner address or a host uri.
  if (provider) {
    if (isValidBech32Address(provider)) {
      providerAddress = provider;
    } else if (URL.canParse(provider)) {
      providerHostUri = provider;
    } else {
      return c.json({ error: "Invalid provider parameter, should be a valid akash address or host uri" }, 400);
    }
  }

  const gpuNodes = (await chainDb.query(
    `
    WITH snapshots AS (
      SELECT DISTINCT ON("hostUri")
        ps.id AS id,
        "hostUri",
        p."owner"
      FROM provider p
      INNER JOIN "providerSnapshot" ps ON ps.id=p."lastSnapshotId"
      WHERE p."isOnline" IS TRUE
    )
    SELECT s."hostUri", n."name", n."gpuAllocatable" AS allocatable, n."gpuAllocated" AS allocated, gpu."modelId", gpu.vendor, gpu.name AS "modelName", gpu.interface, gpu."memorySize"
    FROM snapshots s
    INNER JOIN "providerSnapshotNode" n ON n."snapshotId"=s.id AND n."gpuAllocatable" > 0
    LEFT JOIN (
      SELECT DISTINCT ON (gpu."snapshotNodeId") gpu.*
      FROM "providerSnapshotNodeGPU" gpu
    ) gpu ON gpu."snapshotNodeId" = n.id
    WHERE
      (:vendor IS NULL OR gpu.vendor = :vendor)
      AND (:model IS NULL OR gpu.name = :model)
      AND (:memory_size IS NULL OR gpu."memorySize" = :memory_size)
      AND (:provider_address IS NULL OR s."owner" = :provider_address)
      AND (:provider_hosturi IS NULL OR s."hostUri" = :provider_hosturi)
    `,
    {
      type: QueryTypes.SELECT,
      replacements: {
        vendor: vendor ?? null,
        model: model ?? null,
        memory_size: memorySize ?? null,
        provider_address: providerAddress,
        provider_hosturi: providerHostUri
      }
    }
  )) as GpuNodeRow[];

  let totalAllocatable = 0;
  let totalAllocated = 0;

  // Vendors are grouped in a Map rather than a plain object: an `in` check on a
  // plain object also matches inherited keys such as "constructor" or
  // "__proto__", which would leave the vendor bucket uninitialised.
  const detailsByVendor = new Map<string, GpuDetail[]>();

  for (const gpuNode of gpuNodes) {
    totalAllocatable += gpuNode.allocatable;
    totalAllocated += gpuNode.allocated;

    // The LEFT JOIN can yield nodes without any gpu row, hence no vendor.
    const vendorName = gpuNode.vendor ?? "<UNKNOWN>";
    let vendorDetails = detailsByVendor.get(vendorName);
    if (!vendorDetails) {
      vendorDetails = [];
      detailsByVendor.set(vendorName, vendorDetails);
    }

    // Nodes sharing the same model / interface / memory size are merged into one entry.
    const existing = vendorDetails.find(
      (x) => x.model === gpuNode.modelName && x.interface === gpuNode.interface && x.ram === gpuNode.memorySize
    );

    if (existing) {
      existing.allocatable += gpuNode.allocatable;
      existing.allocated += gpuNode.allocated;
    } else {
      vendorDetails.push({
        model: gpuNode.modelName,
        ram: gpuNode.memorySize,
        interface: gpuNode.interface,
        allocatable: gpuNode.allocatable,
        allocated: gpuNode.allocated
      });
    }
  }

  const response = {
    gpus: {
      total: {
        allocatable: totalAllocatable,
        allocated: totalAllocated
      },
      details: Object.fromEntries(detailsByVendor)
    }
  };

  return c.json(response);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Internal endpoints that are not part of the public api | ||
|
||
These endpoints are used for debugging and analytics purposes. | ||
|
||
- [GPU Stats](#gpu-stats) - Distribution of gpu vendor/model | ||
- [Provider Versions](#provider-versions) - See what akash version providers are running | ||
|
||
## GPU Stats | ||
|
||
Url: https://api.cloudmos.io/internal/gpu | ||
|
||
Returns a summary of the gpus on the network. | ||
|
||
### Example Response | ||
|
||
``` | ||
{ | ||
"gpus": { | ||
"total": { | ||
"allocatable": 2, | ||
"allocated": 0 | ||
}, | ||
"details": { | ||
"nvidia": [ | ||
{ | ||
"model": "t4", | ||
"ram": "16Gi", | ||
"interface": "PCIe", | ||
"allocatable": 1, | ||
"allocated": 0 | ||
}, | ||
{ | ||
"model": "rtx3060ti", | ||
"ram": "8Gi", | ||
"interface": "PCIe", | ||
"allocatable": 1, | ||
"allocated": 0 | ||
} | ||
] | ||
} | ||
} | ||
} | ||
``` | ||
|
||
### Query parameters for filtering | ||
|
||
--- | ||
|
||
| Param | Description | | ||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ | | ||
| provider | Either a provider address (ex: `akash175llqyjvxfle9qwt740vm46772dzaznpzgm576`) or a host uri (ex: `https://provider.akashprovid.com:8443`) | | ||
| vendor | Ex: `nvidia` | | ||
| model | Ex: `t4` | | ||
| memory_size | Ex: `16Gi` | | ||
|
||
All query parameters can be combined, ex: | ||
`https://api.cloudmos.io/internal/gpu?provider=akash175llqyjvxfle9qwt740vm46772dzaznpzgm576&vendor=nvidia&model=rtx3060ti&memory_size=8Gi` | ||
|
||
## Provider Versions | ||
|
||
Url: https://api.cloudmos.io/internal/provider-versions | ||
|
||
Returns a list of versions and the providers that are currently on that version. The `<UNKNOWN>` version corresponds to providers where the version could not be determined. The `/version` endpoint was broken for a long time, but is now fixed in [v0.5.0-rc11](https://github.com/akash-network/provider/releases/tag/v0.5.0-rc11). | ||
|
||
### Example Response | ||
|
||
``` | ||
{ | ||
"0.5.0-rc16": { | ||
"version": "0.5.0-rc16", | ||
"count": 4, | ||
"ratio": 0.05, | ||
"providers": [ | ||
"https://provider.moonbys.cloud:8443", | ||
"https://provider.akashprovid.com:8443", | ||
"https://provider.akashtesting.xyz:8443" | ||
] | ||
}, | ||
"0.5.0-rc15": { | ||
"version": "0.5.0-rc15", | ||
"count": 4, | ||
"ratio": 0.05, | ||
"providers": [ | ||
"https://provider.akash.pro:8443" | ||
] | ||
}, | ||
"<UNKNOWN>": { | ||
"version": "<UNKNOWN>", | ||
"count": 80, | ||
"ratio": 0.95, | ||
"providers": [ | ||
"https://provider.macptrading.com:8443", | ||
"https://provider.digitaler-friedhof.com:8443", | ||
"https://provider.qioi.io:8443", | ||
"https://provider.bluepeer.io:8443", | ||
... | ||
] | ||
} | ||
} | ||
``` |