diff --git a/.gitignore b/.gitignore
index 56b1a82..dd86b00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,5 @@ dmypy.json
 *.out
 poetry.lock
 .perun.ini
+examples/data
+examples/**/perun_results
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f736dd3..35f083e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,6 +28,8 @@ repos:
     rev: v2.2.0
     hooks:
       - id: seed-isort-config
+        args:
+          - --exclude=examples/
   - repo: https://github.com/pycqa/isort
     rev: 5.12.0
     hooks:
diff --git a/README.md b/README.md
index 9b8bb85..861f491 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ perun is a Python package that calculates the energy consumption of Python scripts by sampling usage statistics from Intel RAPL, Nvidia-NVML, and psutil. It can handle MPI applications, gather data from hundreds of nodes, and accumulate it efficiently. perun can be used as a command-line tool or as a function decorator in Python scripts.
 
-Check out the [docs](https://perun.readthedocs.io/en/latest/)!
+Check out the [docs](https://perun.readthedocs.io/en/latest/) or a working [example](https://github.com/Helmholtz-AI-Energy/perun/blob/main/examples/torch_mnist/README.md)!
 
 ## Key Features
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index a2ca770..5b90231 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -3,6 +3,10 @@
 Quick Start
 ===========
 
+.. hint::
+
+   Check out the `full example <https://github.com/Helmholtz-AI-Energy/perun/blob/main/examples/torch_mnist/README.md>`_ in our GitHub repository.
+
 To start using perun, the first step is to install it using pip.
 
 .. code-block:: console
diff --git a/examples/torch_mnist/README.md b/examples/torch_mnist/README.md
new file mode 100644
index 0000000..5286fcf
--- /dev/null
+++ b/examples/torch_mnist/README.md
@@ -0,0 +1,139 @@
+# Torch MNIST Example
+
+This directory contains everything you need to start using **perun** in your workflows. As an example, we use the [torch](https://pytorch.org/) package to train a neural network to recognize handwritten digits from the MNIST dataset.
+
+## Setup
+
+It is recommended to create a new environment for any new project, either with the *venv* package
+
+```console
+python -m venv venv/perun-example
+source venv/perun-example/bin/activate
+```
+
+or with *conda*
+
+```console
+conda create --name perun-example
+conda activate perun-example
+```
+
+Once your new environment is ready, you can install the dependencies for the example.
+
+```console
+pip install -r requirements.txt
+```
+
+This includes **perun** and the script's dependencies. The root of the project includes a minimal example configuration file, *.perun.ini*, with some basic options. More details on the configuration options can be found [in the docs](https://perun.readthedocs.io/en/latest/configuration.html).
+
+To make sure **perun** was installed properly and that it has access to some hardware sensors, run the command
+
+```console
+perun sensors
+```
+
+## Monitoring
+
+Now everything is ready to start gathering data. To monitor your script a single time, simply run:
+
+```console
+perun monitor torch_mnist.py
+```
+
+After the script finishes running, a folder *perun_results* will be created containing the consumption report of your application as a text file, along with all the raw data saved in an HDF5 file.
+
+To explore the contents of the HDF5 file, we recommend the **h5py** library or the [myHDF5](https://myhdf5.hdfgroup.org) website.
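+
+If you prefer to stay in Python, a short **h5py** snippet is enough to get an overview of what perun stored. This is only a sketch: the exact name of the `.hdf5` file inside *perun_results* depends on your run, so adjust the path to the file that was actually created.
+
+```python
+import h5py
+
+# Adjust this to the HDF5 file perun created inside perun_results/.
+path = "perun_results/torch_mnist.hdf5"
+
+with h5py.File(path, "r") as h5file:
+
+    def describe(name, obj):
+        """Print every group and dataset in the file, with shape and dtype."""
+        if isinstance(obj, h5py.Dataset):
+            print(f"{name}: dataset, shape={obj.shape}, dtype={obj.dtype}")
+        else:
+            print(f"{name}: group")
+
+    h5file.visititems(describe)
+```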
+
+The text report from running the MNIST example should look like this:
+
+```text
+PERUN REPORT
+
+App name: torch_mnist
+First run: 2023-08-22T17:44:34.927402
+Last run: 2023-08-22T17:44:34.927402
+
+
+RUN ID: 2023-08-22T17:44:34.927402
+
+| Round # | Host | RUNTIME | ENERGY | CPU_POWER | CPU_UTIL | GPU_POWER | GPU_MEM | DRAM_POWER | MEM_UTIL |
+|----------:|:--------------------|:----------|:----------|:------------|:-----------|:------------|:----------|:-------------|:-----------|
+| 0 | hkn0402.localdomain | 61.954 s | 28.440 kJ | 203.619 W | 0.867 % | 232.448 W | 4.037 GB | 22.923 W | 0.033 % |
+| 0 | All | 61.954 s | 28.440 kJ | 203.619 W | 0.867 % | 232.448 W | 4.037 GB | 22.923 W | 0.033 % |
+
+Monitored Functions
+
+| Round # | Function | Avg Calls / Rank | Avg Runtime | Avg Power | Avg CPU Util | Avg GPU Mem Util |
+|----------:|:------------|-------------------:|:---------------|:-----------------|:---------------|:-------------------|
+| 0 | train | 1 | 50.390±0.000 s | 456.993±0.000 W | 0.869±0.000 % | 2.731±0.000 % |
+| 0 | train_epoch | 5 | 8.980±1.055 s | 433.082±11.012 W | 0.874±0.007 % | 2.746±0.148 % |
+| 0 | test | 5 | 1.098±0.003 s | 274.947±83.746 W | 0.804±0.030 % | 2.808±0.025 % |
+
+The application has run been run 1 times. Throught its runtime, it has used 0.012 kWh, released a total of 0.005 kgCO2e into the atmosphere, and you paid 0.00 € in electricity for it.
+```
+
+The results display data about the functions *train*, *train_epoch* and *test*, which were explicitly marked with the ```@monitor()``` decorator.
+
+```python
+@monitor()
+def train(args, model, device, train_loader, test_loader, optimizer, scheduler):
+    for epoch in range(1, args.epochs + 1):
+        train_epoch(args, model, device, train_loader, optimizer, epoch)
+        test(model, device, test_loader)
+        scheduler.step()
+```
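+
+If you want to instrument your own code the same way, the pattern used in *torch_mnist.py* is all you need: import `monitor` from perun and decorate the functions you care about. The sketch below is illustrative only; `run_training` and its argument are hypothetical placeholders for your own workload.
+
+```python
+from perun import monitor
+
+
+@monitor()
+def run_training(epochs: int) -> None:
+    # Stand-in for your own training loop. perun reports the runtime,
+    # average power, and utilization of every call to this function.
+    for epoch in range(epochs):
+        ...
+
+
+if __name__ == "__main__":
+    run_training(epochs=5)
+```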
+
+## Benchmarking
+
+If you need to run your code multiple times to gather statistics, perun includes the ```--rounds``` option. The application is run multiple times, and each round is added to the same tables that are generated for a single run.
+
+```console
+perun monitor --rounds 5 torch_mnist.py
+```
+
+```text
+PERUN REPORT
+
+App name: torch_mnist
+First run: 2023-08-22T17:44:34.927402
+Last run: 2023-08-22T17:45:46.992693
+
+
+RUN ID: 2023-08-22T17:45:46.992693
+
+| Round # | Host | RUNTIME | ENERGY | CPU_POWER | CPU_UTIL | GPU_POWER | GPU_MEM | DRAM_POWER | MEM_UTIL |
+|----------:|:--------------------|:----------|:----------|:------------|:-----------|:------------|:----------|:-------------|:-----------|
+| 0 | hkn0402.localdomain | 52.988 s | 24.379 kJ | 202.854 W | 0.865 % | 234.184 W | 4.281 GB | 22.858 W | 0.034 % |
+| 0 | All | 52.988 s | 24.379 kJ | 202.854 W | 0.865 % | 234.184 W | 4.281 GB | 22.858 W | 0.034 % |
+| 1 | hkn0402.localdomain | 48.401 s | 22.319 kJ | 203.366 W | 0.886 % | 234.821 W | 4.513 GB | 22.798 W | 0.034 % |
+| 1 | All | 48.401 s | 22.319 kJ | 203.366 W | 0.886 % | 234.821 W | 4.513 GB | 22.798 W | 0.034 % |
+| 2 | hkn0402.localdomain | 48.258 s | 22.248 kJ | 203.339 W | 0.884 % | 234.720 W | 4.513 GB | 22.850 W | 0.034 % |
+| 2 | All | 48.258 s | 22.248 kJ | 203.339 W | 0.884 % | 234.720 W | 4.513 GB | 22.850 W | 0.034 % |
+| 3 | hkn0402.localdomain | 48.537 s | 22.393 kJ | 203.269 W | 0.884 % | 234.984 W | 4.513 GB | 22.968 W | 0.034 % |
+| 3 | All | 48.537 s | 22.393 kJ | 203.269 W | 0.884 % | 234.984 W | 4.513 GB | 22.968 W | 0.034 % |
+| 4 | hkn0402.localdomain | 48.416 s | 22.323 kJ | 203.408 W | 0.888 % | 234.626 W | 4.513 GB | 22.928 W | 0.034 % |
+| 4 | All | 48.416 s | 22.323 kJ | 203.408 W | 0.888 % | 234.626 W | 4.513 GB | 22.928 W | 0.034 % |
+
+Monitored Functions
+
+| Round # | Function | Avg Calls / Rank | Avg Runtime | Avg Power | Avg CPU Util | Avg GPU Mem Util |
+|----------:|:------------|-------------------:|:---------------|:-----------------|:---------------|:-------------------|
+| 0 | train | 1 | 50.169±0.000 s | 458.380±0.000 W | 0.875±0.000 % | 2.727±0.000 % |
+| 0 | train_epoch | 5 | 8.930±0.903 s | 439.707±12.743 W | 0.875±0.008 % | 2.743±0.154 % |
+| 0 | test | 5 | 1.103±0.004 s | 232.750±1.219 W | 0.805±0.030 % | 2.809±0.023 % |
+| 1 | train | 1 | 48.354±0.000 s | 453.376±0.000 W | 0.886±0.000 % | 2.820±0.000 % |
+| 1 | train_epoch | 5 | 8.556±0.008 s | 428.418±11.199 W | 0.890±0.018 % | 2.820±0.000 % |
+| 1 | test | 5 | 1.115±0.002 s | 272.918±80.330 W | 0.798±0.018 % | 2.820±0.000 % |
+| 2 | train | 1 | 48.210±0.000 s | 453.867±0.000 W | 0.884±0.000 % | 2.820±0.000 % |
+| 2 | train_epoch | 5 | 8.525±0.022 s | 423.647±1.049 W | 0.888±0.013 % | 2.820±0.000 % |
+| 2 | test | 5 | 1.117±0.005 s | 312.983±97.688 W | 0.806±0.012 % | 2.820±0.000 % |
+| 3 | train | 1 | 48.486±0.000 s | 452.940±0.000 W | 0.884±0.000 % | 2.820±0.000 % |
+| 3 | train_epoch | 5 | 8.577±0.012 s | 433.627±13.812 W | 0.888±0.017 % | 2.820±0.000 % |
+| 3 | test | 5 | 1.120±0.003 s | 233.973±3.516 W | 0.789±0.022 % | 2.820±0.000 % |
+| 4 | train | 1 | 48.367±0.000 s | 453.256±0.000 W | 0.888±0.000 % | 2.820±0.000 % |
+| 4 | train_epoch | 5 | 8.555±0.011 s | 433.582±12.606 W | 0.899±0.029 % | 2.820±0.000 % |
+| 4 | test | 5 | 1.118±0.002 s | 233.367±2.238 W | 0.818±0.045 % | 2.820±0.000 % |
+
+The application has run been run 2 times. Throught its runtime, it has used 0.062 kWh, released a total of 0.026 kgCO2e into the atmosphere, and you paid 0.02 € in electricity for it.
+```
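+
+Five full training runs take a while, so it can be worth doing a quick smoke test with a reduced workload before starting a long benchmarking session. The flags below come from the argument parser in *torch_mnist.py*; whether `perun monitor` forwards trailing arguments to the script is an assumption, so double-check against `perun monitor --help` for your installed version.
+
+```console
+# Sanity-check the training script on its own with a single, abbreviated epoch.
+python torch_mnist.py --epochs 1 --dry-run
+
+# Assumed syntax: arguments after the script name are passed through to it.
+perun monitor torch_mnist.py --epochs 1 --dry-run
+```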
diff --git a/examples/torch_mnist/requirements.txt b/examples/torch_mnist/requirements.txt
new file mode 100644
index 0000000..dd95f60
--- /dev/null
+++ b/examples/torch_mnist/requirements.txt
@@ -0,0 +1,41 @@
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.4
+filelock==3.12.2
+h5py==3.9.0
+idna==3.4
+Jinja2==3.1.2
+lit==16.0.6
+MarkupSafe==2.1.3
+mpmath==1.3.0
+networkx==3.1
+numpy==1.24.4
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+pandas==2.0.3
+perun==0.4.0
+Pillow==10.0.0
+psutil==5.9.5
+py-cpuinfo==5.0.0
+pynvml==11.5.0
+python-dateutil==2.8.2
+pytz==2023.3
+requests==2.31.0
+six==1.16.0
+sympy==1.12
+torch==2.0.1
+torchvision==0.15.2
+triton==2.0.0
+typing_extensions==4.7.1
+tzdata==2023.3
+urllib3==2.0.3
diff --git a/examples/torch_mnist/torch_mnist.py b/examples/torch_mnist/torch_mnist.py
new file mode 100644
index 0000000..d383c73
--- /dev/null
+++ b/examples/torch_mnist/torch_mnist.py
@@ -0,0 +1,195 @@
+import argparse
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.optim.lr_scheduler import StepLR
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+from perun import monitor
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 32, 3, 1)
+        self.conv2 = nn.Conv2d(32, 64, 3, 1)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+        self.fc1 = nn.Linear(9216, 128)
+        self.fc2 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, 2)
+        x = self.dropout1(x)
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.dropout2(x)
+        x = self.fc2(x)
+        output = F.log_softmax(x, dim=1)
+        return output
+
+
+@monitor()
+def train(args, model, device, train_loader, test_loader, optimizer, scheduler):
+    for epoch in range(1, args.epochs + 1):
+        train_epoch(args, model, device, train_loader, optimizer, epoch)
+        test(model, device, test_loader)
+        scheduler.step()
+
+
+@monitor()
+def train_epoch(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            print(
+                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                    epoch,
+                    batch_idx * len(data),
+                    len(train_loader.dataset),
+                    100.0 * batch_idx / len(train_loader),
+                    loss.item(),
+                )
+            )
+            if args.dry_run:
+                break
+
+
+@monitor()
+def test(model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(
+                output, target, reduction="sum"
+            ).item()  # sum up batch loss
+            pred = output.argmax(
+                dim=1, keepdim=True
+            )  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    test_loss /= len(test_loader.dataset)
+
+    print(
+        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
+            test_loss,
+            correct,
+            len(test_loader.dataset),
+            100.0 * correct / len(test_loader.dataset),
+        )
+    )
+
+
+def main():
+    # Training settings
+    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=64,
+        metavar="N",
+        help="input batch size for training (default: 64)",
+    )
+    parser.add_argument(
+        "--test-batch-size",
+        type=int,
+        default=1000,
+        metavar="N",
+        help="input batch size for testing (default: 1000)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=5,
+        metavar="N",
+        help="number of epochs to train (default: 5)",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1.0,
+        metavar="LR",
+        help="learning rate (default: 1.0)",
+    )
+    parser.add_argument(
+        "--gamma",
+        type=float,
+        default=0.7,
+        metavar="M",
+        help="Learning rate step gamma (default: 0.7)",
+    )
+    parser.add_argument(
+        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        default=False,
+        help="quickly check a single pass",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
+    )
+    parser.add_argument(
+        "--log-interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument(
+        "--save-model",
+        action="store_true",
+        default=False,
+        help="For Saving the current Model",
+    )
+    args = parser.parse_args()
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+
+    torch.manual_seed(args.seed)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    train_kwargs = {"batch_size": args.batch_size}
+    test_kwargs = {"batch_size": args.test_batch_size}
+    if use_cuda:
+        cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
+        train_kwargs.update(cuda_kwargs)
+        test_kwargs.update(cuda_kwargs)
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+    )
+    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
+    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
+    train_loader = DataLoader(dataset1, **train_kwargs)
+    test_loader = DataLoader(dataset2, **test_kwargs)
+
+    model = Net().to(device)
+    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
+
+    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
+
+    train(args, model, device, train_loader, test_loader, optimizer, scheduler)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/perun/__init__.py b/perun/__init__.py
index 2fc7901..9a730fe 100644
--- a/perun/__init__.py
+++ b/perun/__init__.py
@@ -1,6 +1,6 @@
 """perun module."""
 # flake8: noqa
-__version__ = "0.3.2"
+__version__ = "0.4.0"
 
 from perun.configuration import config
 from perun.logging import init_logging