Horovod in Docker

Horovod官方镜像安装,参考https://horovod.readthedocs.io/en/stable/docker_include.html

但由于官方镜像没有显式标明CUDA等组件的版本,因此很可能会出现版本不兼容的情况,需要根据自己的GPU来build镜像。参考,https://github.com/determined-ai/horovod/blob/master/docs/docker.rst

检查版本兼容

写Dockerfile中,需要注意各种库的版本。

base的版本我选择11.3的cuda,ubuntu平台

1
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04
DOCKERFILE

cuda的版本

汇总参考,https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html

例如以下链接,对应的版本分别为

https://developer.nvidia.com/compute/cudnn/secure/8.4.0/local_installers/11.6/cudnn-local-repo-ubuntu2004-8.4.0.27_1.0-1_amd64.deb

https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.0.53/11.3_04222021/Ubuntu20_04-x64/libcudnn8_8.2.0.53-1+cuda11.3_amd64.deb

1
2
8.4.0.27-1+cuda11.6
8.2.0.53-1+cuda11.3
DOCKERFILE

cudnn的版本

汇总,参考,https://developer.nvidia.com/rdp/cudnn-archive

NCCL版本

汇总,参考https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html

注意查看兼容性:

最终选择cuda11.3,nccl2.9.9

安装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Pin cuDNN and NCCL package builds that match the CUDA 11.3 base image
# (versions chosen from the compatibility tables referenced above).
ENV CUDNN_VERSION=8.2.0.53-1+cuda11.3
ENV NCCL_VERSION=2.9.9-1+cuda11.3
# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]
# Toolchain, CUDA libraries (pinned via the ENVs above), Python 3, and
# RDMA/InfiniBand userspace libraries for multi-node communication.
# The --allow-* flags let apt downgrade/replace packages the base image holds.
# NOTE: apt lists are not cleaned up here; the openssh install further down
# in this Dockerfile relies on them (it runs no `apt-get update` of its own).
RUN apt-get update --allow-insecure-repositories && apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
cmake \
g++-9 \
git \
curl \
vim \
wget \
ca-certificates \
libcudnn8=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
python-is-python3 \
python3-pip \
python3-dev \
python3-distutils \
librdmacm1 \
libibverbs1 \
ibverbs-providers
DOCKERFILE

build过程中,可能会在tzdata的安装上hang(猜测原因是tzdata等待交互输入,而build时无法接收输入),此时可以预先设置时区:

1
2
# Preset the timezone so tzdata's configuration step does not hang the build
# (presumably it prompts interactively and the build cannot supply input).
ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
DOCKERFILE

pip安装时,使用国内镜像,不然很慢,可以直接写到Dockerfile中:

1
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
DOCKERFILE

安装Pytorch

1
2
# Install the CUDA 11.3 build of PyTorch, computing the interpreter/ABI wheel
# tags (e.g. cp38-cp38) from the image's own Python via `packaging`.
# NOTE(review): assumes PYTORCH_VERSION is defined earlier via ENV/ARG — it is
# not set anywhere in this excerpt; confirm before building.
RUN PYTAGS=$(python -c "from packaging import tags; tag = list(tags.sys_tags())[0]; print(f'{tag.interpreter}-{tag.abi}')") && \
pip install https://download.pytorch.org/whl/cu113/torch-${PYTORCH_VERSION}%2Bcu113-${PYTAGS}-linux_x86_64.whl
DOCKERFILE

安装horovod

1
2
3
4
5
6
7
8
9
10
11
# Build Horovod with NCCL GPU collectives and PyTorch support; MPI is
# disabled (horovodrun falls back to Gloo for coordination).
RUN HOROVOD_GPU_OPERATIONS=NCCL \
    HOROVOD_WITH_PYTORCH=1 \
    HOROVOD_WITHOUT_MPI=1 \
    pip install --no-cache-dir horovod
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation.
# BUG FIX: the original read `cat /etc/ssh/ssh_config grep -v ...` — the pipe
# was missing, so "grep" and the pattern were treated as filenames for cat.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
DOCKERFILE

测试

生成数据,train.csv,test.csv,保存到S3存储中

生成数据的python代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""Generate a synthetic regression dataset (c = a + b + a*b + noise),
split it into train/test, and upload both splits to S3 as pickles."""
import numpy as np
from s3_utils import s3_save_pickle
import boto3

num_samples = 1000
train_fraction = 0.8

# Two uniform features plus uniform noise; the target mixes linear and
# interaction terms. Draw order matters for reproducibility under a seed.
feat_a = np.random.uniform(-10, 10, num_samples)
feat_b = np.random.uniform(-10, 10, num_samples)
noise = np.random.uniform(0, 1, num_samples)
target = feat_a + feat_b + feat_a * feat_b + noise
samples = np.stack([feat_a, feat_b, target], axis=-1)

split_idx = int(num_samples * train_fraction)
train_split, test_split = samples[:split_idx], samples[split_idx:]

# Local alternative to the S3 upload below:
# np.savetxt('train.csv', train_split, delimiter=',')
# np.savetxt('test.csv', test_split, delimiter=',')

session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

s3_save_pickle(s3_client, train_split, 'songwei', 'simple_ml/train.pkl')
s3_save_pickle(s3_client, test_split, 'songwei', 'simple_ml/test.pkl')
PYTHON

单个CPU或者GPU测试

测试代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""Single-device (CPU or one GPU) training of a small MLP regressor.

Loads the train/test splits from S3, trains for a fixed number of epochs,
reports MSE, then uploads the trained model and its Triton config back to S3.
(Indentation of the loop bodies was lost in the blog extraction and is
restored here.)
"""
import json

import boto3
import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from s3_utils import s3_load_pickle, s3_save_file, s3_save_model

# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# define model: 2 input features -> 1 regression output
in_dim, hidden_dim, out_dim = 2, 4, 1
# Hyper-parameters (dropout, lr) come from a local JSON file. Use a context
# manager so the handle is closed (the original json.load(open(...)) leaked it).
with open('hp.json', 'r') as f:
    hp = json.load(f)

model = nn.Sequential(
    nn.Linear(in_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, out_dim)
)
model.to(device)

# prepare data
session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

train_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/train.pkl'), dtype=torch.float)
test_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/test.pkl'), dtype=torch.float)

# Last column is the regression target; the rest are features.
train_dataset = TensorDataset(train_tensor[:, :-1], train_tensor[:, -1:])
test_dataset = TensorDataset(test_tensor[:, :-1], test_tensor[:, -1:])

train_dl = DataLoader(train_dataset)
test_dl = DataLoader(test_dataset)

opt = Adam(model.parameters(), lr=hp["lr"])
loss_fn = MSELoss()
epoch = 20

# train and test
model.train()
for e in range(epoch):
    avg_loss = []
    for x, y in train_dl:
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        p = model(x)
        loss = loss_fn(p, y)
        loss.backward()
        opt.step()
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Training. Epoch {e}, MSE loss: {avg_loss}')

model.eval()
with torch.no_grad():
    avg_loss = []
    for x, y in test_dl:
        x = x.to(device)
        y = y.to(device)
        p = model(x)
        loss = loss_fn(p, y)
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Testing. MSE loss: {avg_loss}')

# Upload the trained weights plus the Triton model config.
s3_save_model(s3_client, model, 'songwei', 'simple_ml/model_save/single/model.pt')
s3_save_file(s3_client, 'config.pbtxt', 'songwei', 'simple_ml/model_save/single/config.pbtxt')
PYTHON

测试NCCL

开启Debug模式

1
export NCCL_DEBUG=INFO
BASH

查看目前ld能找到的lib,需要让ldconfig能找到nccl

1
ldconfig -p
BASH

测试参考,nccl-test,https://github.com/NVIDIA/nccl-tests.git

1
2
3
4
# Clone and build NVIDIA's NCCL benchmark suite, then run all_reduce on 4 GPUs
# with message sizes from 8 bytes to 256 MB, doubling each step.
# FIX: removed the stray Markdown autolink brackets (<...>) around the URL,
# which the shell would otherwise parse as redirections.
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
make
./build/all_reduce_perf -b 8 -e 256M -f 2 -g 4
BASH

【注意】nccl-test在docker中运行时,需要添加以下参数shm-size,不然可能有bus error

1
docker run -it --shm-size 8G --rm myhorovod bash
BASH

单机多GPU带horovod的python代码

保证horovod在python中的写法没问题,以及horovod的安装没有问题

测试代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Horovod-distributed training of the small MLP regressor.

Same model and data pipeline as the single-device script; the sections fenced
with ###### are the Horovod-specific additions. (Indentation of the loop
bodies was lost in the blog extraction and is restored here.)
"""
import json

import boto3
import horovod.torch as hvd
import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

from s3_utils import s3_load_pickle, s3_save_model, s3_save_file

# prepare data
session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

train_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/train.pkl'), dtype=torch.float)
test_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/test.pkl'), dtype=torch.float)

# Last column is the regression target; the rest are features.
train_dataset = TensorDataset(train_tensor[:, :-1], train_tensor[:, -1:])
test_dataset = TensorDataset(test_tensor[:, :-1], test_tensor[:, -1:])

######################################################
hvd.init()
# Pin each worker to its own GPU; fall back to CPU when none is present.
# FIX: the original called model.cuda()/x.cuda() unconditionally, which
# crashed on CPU-only hosts even though it checked is_available() here.
use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(hvd.local_rank())
device = torch.device('cuda') if use_cuda else torch.device('cpu')
# Limit CPU threads so parallel workers don't oversubscribe the host.
torch.set_num_threads(1)

# Each worker iterates a disjoint shard of the dataset.
train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_dl = DataLoader(train_dataset, sampler=train_sampler)

test_sampler = DistributedSampler(test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_dl = DataLoader(test_dataset, sampler=test_sampler)
######################################################

# define model: 2 input features -> 1 regression output
in_dim, hidden_dim, out_dim = 2, 4, 1
# Hyper-parameters (dropout, lr) come from a local JSON file. Use a context
# manager so the handle is closed (the original json.load(open(...)) leaked it).
with open('hp.json', 'r') as f:
    hp = json.load(f)

model = nn.Sequential(
    nn.Linear(in_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, out_dim)
)
model.to(device)

######################################################
# Scale the learning rate by world size, wrap the optimizer so gradients are
# averaged across workers, and start everyone from rank 0's initial state.
opt = Adam(model.parameters(), lr=hp["lr"] * hvd.size())
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters(), op=hvd.Average)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(opt, root_rank=0)
######################################################

loss_fn = MSELoss()
epoch = 20

# train and test
model.train()
for e in range(epoch):
    ######################################################
    # Reshuffle each worker's shard every epoch.
    train_sampler.set_epoch(e)
    ######################################################
    avg_loss = []
    for x, y in train_dl:
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        p = model(x)
        loss = loss_fn(p, y)
        loss.backward()
        opt.step()
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Training. Epoch {e}, MSE loss: {avg_loss}, Worker: {hvd.rank()}')

model.eval()
with torch.no_grad():
    avg_loss = []
    for x, y in test_dl:
        x = x.to(device)
        y = y.to(device)
        p = model(x)
        loss = loss_fn(p, y)
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    ######################################################
    # Average each worker's shard loss across all workers.
    avg_loss = hvd.allreduce(torch.tensor(avg_loss)).item()
    ######################################################
    print(f'Testing. MSE loss: {avg_loss}, Worker: {hvd.rank()}')

# NOTE(review): every rank uploads to the same S3 key; consider guarding
# these two calls with `if hvd.rank() == 0:` to avoid redundant writes.
s3_save_model(s3_client, model, 'songwei', 'simple_ml/model_save/1/model.pt')
s3_save_file(s3_client, 'config.pbtxt', 'songwei', 'simple_ml/model_save/1/config.pbtxt')
PYTHON

测试:

1
horovodrun -np 4 -H localhost:4 python main_with_horovod.py
BASH

Horovod in Docker
https://fffffaraway.github.io/2022/06/19/horovod-in-docker/
Author
Song Wei
Posted on
June 19, 2022
Licensed under