Skip to content

Commit

Permalink
Add gpu stats endpoint for internal monitoring (#109)
Browse files Browse the repository at this point in the history
* Add internal router

* Basic implementation based on spec

* Add query params for filtering

* Fix query

* Add documentation

* .

* Fix typo in filename
  • Loading branch information
Redm4x authored Feb 24, 2024
1 parent fb8dfca commit db887c0
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 0 deletions.
2 changes: 2 additions & 0 deletions api/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { cors } from "hono/cors";
import { serve } from "@hono/node-server";
import { legacyRouter } from "./routers/legacyRouter";
import { sentry } from "@hono/sentry";
import { internalRouter } from "./routers/internalRouter";

const appHono = new Hono();
appHono.use(
Expand Down Expand Up @@ -70,6 +71,7 @@ appHono.route("/", apiRouter);
// Mount each feature router under its own URL prefix.
appHono.route("/user", userRouter);
appHono.route("/web3-index", web3IndexRouter);
appHono.route("/dashboard", dashboardRouter);
// Internal endpoints are served under /internal.
appHono.route("/internal", internalRouter);

appHono.get("/status", (c) => {
const version = packageJson.version;
Expand Down
156 changes: 156 additions & 0 deletions api/src/routers/internalRouter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import { Provider } from "@shared/dbSchemas/akash";
import { chainDb } from "@src/db/dbConnection";
import { isValidBech32Address } from "@src/utils/addresses";
import { round } from "@src/utils/math";
import { Hono } from "hono";
import * as semver from "semver";
import { QueryTypes } from "sequelize";

// Router that the handlers below (`/provider-versions`, `/gpu`) are registered on.
export const internalRouter = new Hono();

/**
 * GET /provider-versions
 *
 * Groups the online providers by the akash version they report and returns an
 * object keyed by version. Each entry holds the version, the number of
 * providers on it, its ratio among all online providers (rounded to 2
 * decimals) and the list of provider host uris.
 *
 * Key order in the response: valid semver versions from newest to oldest,
 * then version strings that are not valid semver (in the order they were
 * encountered), then `<UNKNOWN>` for providers without a version.
 */
internalRouter.get("/provider-versions", async (c) => {
  const providers = await Provider.findAll({
    attributes: ["hostUri", "akashVersion"],
    where: {
      isOnline: true
    },
    group: ["hostUri", "akashVersion"]
  });

  const nullVersionName = "<UNKNOWN>";

  // Bucket the host uris by version in a single pass; providers without a
  // version all land in the <UNKNOWN> bucket.
  const hostUrisByVersion = new Map<string, string[]>();
  for (const provider of providers) {
    const version = provider.akashVersion ?? nullVersionName;
    const hostUris = hostUrisByVersion.get(version);

    if (hostUris) {
      hostUris.push(provider.hostUri);
    } else {
      hostUrisByVersion.set(version, [provider.hostUri]);
    }
  }

  const results = Array.from(hostUrisByVersion, ([version, hostUris]) => ({
    version,
    count: hostUris.length,
    ratio: round(hostUris.length / providers.length, 2),
    providers: Array.from(new Set(hostUris))
  }));

  // semver.compare throws on strings that are not valid semver, so only the
  // parsable versions take part in the sort. Anything else is appended
  // afterwards instead of failing the whole request.
  const parsable = results.filter((x) => semver.valid(x.version) !== null).sort((a, b) => semver.compare(b.version, a.version));
  const unparsable = results.filter((x) => x.version !== nullVersionName && semver.valid(x.version) === null);
  const unknown = results.filter((x) => x.version === nullVersionName);

  const sorted = [...parsable, ...unparsable, ...unknown].reduce(
    (acc, x) => {
      acc[x.version] = x;
      return acc;
    },
    {} as Record<string, (typeof results)[number]>
  );

  return c.json(sorted);
});

/**
 * GET /gpu
 *
 * Returns a summary of the gpus exposed by the nodes of online providers:
 * the total allocatable/allocated counts and a per-vendor breakdown by
 * model, memory size and interface.
 *
 * Optional query parameters, all of which can be combined:
 * - `provider`: a bech32 address (matched against the provider owner) or a
 *   parsable url (matched against the provider host uri). Any other value
 *   yields a 400 response.
 * - `vendor`, `model`, `memory_size`: exact-match filters on the gpu columns.
 */
internalRouter.get("/gpu", async (c) => {
  const provider = c.req.query("provider");
  const vendor = c.req.query("vendor");
  const model = c.req.query("model");
  const memorySize = c.req.query("memory_size");

  let providerAddress: string | null = null;
  let providerHostUri: string | null = null;

  if (provider) {
    if (isValidBech32Address(provider)) {
      providerAddress = provider;
    } else if (URL.canParse(provider)) {
      providerHostUri = provider;
    } else {
      return c.json({ error: "Invalid provider parameter, should be a valid akash address or host uri" }, 400);
    }
  }

  // Every filter is passed as a named replacement; a null value disables the
  // corresponding condition through the `:param IS NULL OR ...` pattern.
  const gpuNodes = (await chainDb.query(
    `
WITH snapshots AS (
SELECT DISTINCT ON("hostUri")
ps.id AS id,
"hostUri",
p."owner"
FROM provider p
INNER JOIN "providerSnapshot" ps ON ps.id=p."lastSnapshotId"
WHERE p."isOnline" IS TRUE
)
SELECT s."hostUri", n."name", n."gpuAllocatable" AS allocatable, n."gpuAllocated" AS allocated, gpu."modelId", gpu.vendor, gpu.name AS "modelName", gpu.interface, gpu."memorySize"
FROM snapshots s
INNER JOIN "providerSnapshotNode" n ON n."snapshotId"=s.id AND n."gpuAllocatable" > 0
LEFT JOIN (
SELECT DISTINCT ON (gpu."snapshotNodeId") gpu.*
FROM "providerSnapshotNodeGPU" gpu
) gpu ON gpu."snapshotNodeId" = n.id
WHERE
(:vendor IS NULL OR gpu.vendor = :vendor)
AND (:model IS NULL OR gpu.name = :model)
AND (:memory_size IS NULL OR gpu."memorySize" = :memory_size)
AND (:provider_address IS NULL OR s."owner" = :provider_address)
AND (:provider_hosturi IS NULL OR s."hostUri" = :provider_hosturi)
`,
    {
      type: QueryTypes.SELECT,
      replacements: {
        vendor: vendor ?? null,
        model: model ?? null,
        memory_size: memorySize ?? null,
        provider_address: providerAddress,
        provider_hosturi: providerHostUri
      }
    }
  )) as {
    hostUri: string;
    name: string;
    allocatable: number;
    allocated: number;
    // The gpu.* columns come from a LEFT JOIN and are null for nodes that
    // have no matching gpu row.
    modelId: string | null;
    vendor: string | null;
    modelName: string | null;
    interface: string | null;
    memorySize: string | null;
  }[];

  type GpuDetail = { model: string | null; ram: string | null; interface: string | null; allocatable: number; allocated: number };

  const response = {
    gpus: {
      total: {
        allocatable: 0,
        allocated: 0
      },
      // Null-prototype dictionary: vendor names come from the database, and on
      // a regular object a name such as "constructor" or "__proto__" would
      // collide with members inherited from Object.prototype.
      details: Object.create(null) as { [key: string]: GpuDetail[] }
    }
  };

  for (const gpuNode of gpuNodes) {
    response.gpus.total.allocatable += gpuNode.allocatable;
    response.gpus.total.allocated += gpuNode.allocated;

    const vendorName = gpuNode.vendor ?? "<UNKNOWN>";
    let vendorDetails = response.gpus.details[vendorName];
    if (!vendorDetails) {
      vendorDetails = [];
      response.gpus.details[vendorName] = vendorDetails;
    }

    // Nodes sharing the same model, interface and memory size are merged into
    // a single entry with summed counts.
    const existing = vendorDetails.find((x) => x.model === gpuNode.modelName && x.interface === gpuNode.interface && x.ram === gpuNode.memorySize);

    if (existing) {
      existing.allocatable += gpuNode.allocatable;
      existing.allocated += gpuNode.allocated;
    } else {
      vendorDetails.push({
        model: gpuNode.modelName,
        ram: gpuNode.memorySize,
        interface: gpuNode.interface,
        allocatable: gpuNode.allocatable,
        allocated: gpuNode.allocated
      });
    }
  }

  return c.json(response);
});
100 changes: 100 additions & 0 deletions doc/Internal_Endpoints.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Internal endpoints that are not part of the public api

These endpoints are used for debugging and analytics purposes.

- [GPU Stats](#gpu-stats) - Distribution of GPU vendors/models
- [Provider Versions](#provider-versions) - See which Akash version providers are running

## GPU Stats

Url: https://api.cloudmos.io/internal/gpu

Returns a summary of the GPUs on the network.

### Example Response

```
{
"gpus": {
"total": {
"allocatable": 2,
"allocated": 0
},
"details": {
"nvidia": [
{
"model": "t4",
"ram": "16Gi",
"interface": "PCIe",
"allocatable": 1,
"allocated": 0
},
{
"model": "rtx3060ti",
"ram": "8Gi",
"interface": "PCIe",
"allocatable": 1,
"allocated": 0
}
]
}
}
}
```

### Query parameters for filtering

---

| Param | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| provider | Either a provider address (ex: `akash175llqyjvxfle9qwt740vm46772dzaznpzgm576`) or a host uri (ex: `https://provider.akashprovid.com:8443`) |
| vendor | Ex: `nvidia` |
| model | Ex: `t4` |
| memory_size | Ex: `16Gi` |

All query parameters can be combined, ex:
`https://api.cloudmos.io/internal/gpu?provider=akash175llqyjvxfle9qwt740vm46772dzaznpzgm576&vendor=nvidia&model=rtx3060ti&memory_size=8Gi`

## Provider Versions

Url: https://api.cloudmos.io/internal/provider-versions

Returns a list of versions and the providers that are currently on each version. The `<UNKNOWN>` version corresponds to providers whose version could not be determined. The `/version` endpoint was broken for a long time, but is now fixed in [v0.5.0-rc11](https://github.com/akash-network/provider/releases/tag/v0.5.0-rc11)

### Example Response

```
{
"0.5.0-rc16": {
"version": "0.5.0-rc16",
"count": 3,
"ratio": 0.04,
"providers": [
"https://provider.moonbys.cloud:8443",
"https://provider.akashprovid.com:8443",
"https://provider.akashtesting.xyz:8443"
]
},
"0.5.0-rc15": {
"version": "0.5.0-rc15",
"count": 1,
"ratio": 0.01,
"providers": [
"https://provider.akash.pro:8443"
]
},
"<UNKNOWN>": {
"version": "<UNKNOWN>",
"count": 80,
"ratio": 0.95,
"providers": [
"https://provider.macptrading.com:8443",
"https://provider.digitaler-friedhof.com:8443",
"https://provider.qioi.io:8443",
"https://provider.bluepeer.io:8443",
...
]
}
}
```

0 comments on commit db887c0

Please sign in to comment.