Horovod in Docker

Horovod官方镜像安装,参考https://horovod.readthedocs.io/en/stable/docker_include.html

但由于官方镜像没有显示CUDA等版本,因此很可能会出现版本不兼容的情况,需要根据自己的GPU来build镜像。参考,https://github.com/determined-ai/horovod/blob/master/docs/docker.rst

检查版本兼容

写Dockerfile中,需要注意各种库的版本。

base的版本我选择11.3的cuda,ubuntu平台

1
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04

cuda的版本

汇总参考,https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html

例如以下链接,对应的版本分别为

https://developer.nvidia.com/compute/cudnn/secure/8.4.0/local_installers/11.6/cudnn-local-repo-ubuntu2004-8.4.0.27_1.0-1_amd64.deb

https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.0.53/11.3_04222021/Ubuntu20_04-x64/libcudnn8_8.2.0.53-1+cuda11.3_amd64.deb

1
2
8.4.0.27-1+cuda11.6
8.2.0.53-1+cuda11.3

cudnn的版本

汇总,参考,https://developer.nvidia.com/rdp/cudnn-archive

NCCL版本

汇总,参考https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html

注意查看兼容性:

最终选择cuda11.3,nccl2.9.9

安装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Pin cuDNN/NCCL package versions to match the CUDA 11.3 base image
# (see the compatibility links above for how these were chosen).
ENV CUDNN_VERSION=8.2.0.53-1+cuda11.3
ENV NCCL_VERSION=2.9.9-1+cuda11.3
# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]
# Install the build toolchain, Python 3, the pinned cuDNN/NCCL libraries, and
# the RDMA/InfiniBand user-space libraries Horovod/NCCL can use for transport.
# NOTE(review): --allow-insecure-repositories / --allow-unauthenticated skip
# repository signature checks — acceptable for an experimental image, but
# should not be carried into a production Dockerfile.
RUN apt-get update --allow-insecure-repositories && apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
cmake \
g++-9 \
git \
curl \
vim \
wget \
ca-certificates \
libcudnn8=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
python-is-python3 \
python3-pip \
python3-dev \
python3-distutils \
librdmacm1 \
libibverbs1 \
ibverbs-providers

build过程中,可能会在tzdata的安装上hang,(猜测原因是无法接收输入)添加时区:

1
2
# Pre-seed the timezone so tzdata's interactive prompt cannot hang the build.
ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

pip安装时,使用国内镜像,不然很慢,可以直接写到Dockerfile中:

1
# Use the Tsinghua PyPI mirror so pip installs are fast from mainland China.
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

安装Pytorch

1
2
# Resolve the local interpreter/ABI wheel tags, then install the matching
# CUDA 11.3 PyTorch wheel directly from download.pytorch.org.
# NOTE(review): ${PYTORCH_VERSION} is not defined in any snippet shown here —
# it must be supplied via an earlier ARG/ENV; confirm before building.
RUN PYTAGS=$(python -c "from packaging import tags; tag = list(tags.sys_tags())[0]; print(f'{tag.interpreter}-{tag.abi}')") && \
pip install https://download.pytorch.org/whl/cu113/torch-${PYTORCH_VERSION}%2Bcu113-${PYTAGS}-linux_x86_64.whl

安装horovod

1
2
3
4
5
6
7
8
9
10
11
# Build Horovod from source with NCCL GPU collectives and PyTorch support;
# MPI is disabled because workers are launched over SSH/Gloo instead.
RUN HOROVOD_GPU_OPERATIONS=NCCL \
HOROVOD_WITH_PYTORCH=1 \
HOROVOD_WITHOUT_MPI=1 \
pip install --no-cache-dir horovod
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation
# FIX: the original command was missing the pipe between `cat` and `grep`,
# which made grep treat "grep" and "-v" as file names and fail the build.
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

测试

生成数据,train.csv,test.csv,保存到S3存储中

生成数据的python代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import numpy as np
from s3_utils import s3_save_pickle
import boto3

# Synthetic regression dataset: target c = a + b + a*b + uniform noise.
N = 1000
ratio = 0.8  # fraction of samples used for training

a = np.random.uniform(-10, 10, N)
b = np.random.uniform(-10, 10, N)
noise = np.random.uniform(0, 1, N)
c = a + b + a * b + noise
# shape (N, 3): columns are the two features (a, b) and the target c
data = np.column_stack((a, b, c))

split = int(N * ratio)
train_set, test_set = data[:split], data[split:]

# local save
# np.savetxt('train.csv', train_set, delimiter=',')
# np.savetxt('test.csv', test_set, delimiter=',')

# Upload both splits as pickles to the S3-compatible object store.
session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

s3_save_pickle(s3_client, train_set, 'songwei', 'simple_ml/train.pkl')
s3_save_pickle(s3_client, test_set, 'songwei', 'simple_ml/test.pkl')

单个CPU或者GPU测试

测试代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""Single-device (one GPU, or CPU fallback) baseline for the toy regression task.

Loads the pickled train/test splits from S3, trains a small MLP, evaluates it,
and uploads the trained model plus a Triton config back to S3.

FIX: the published snippet had all loop/`with` body indentation stripped by
the blog extraction (SyntaxError as-is); structure restored here.
"""
import json
from s3_utils import s3_load_pickle, s3_save_file, s3_save_model
import boto3
import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# Use the first GPU when available, otherwise run on CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# define model: 2 features (a, b) -> 1 target (c)
in_dim, hidden_dim, out_dim = 2, 4, 1
# hyper-parameters file; expects at least the keys "dropout" and "lr"
with open('hp.json', 'r') as f:
    hp = json.load(f)

model = nn.Sequential(
    nn.Linear(in_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, out_dim)
)
model.to(device)

# prepare data: download the pickled splits from the S3-compatible store
session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

train_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/train.pkl'), dtype=torch.float)
test_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/test.pkl'), dtype=torch.float)

# last column is the regression target, the rest are features
train_dataset = TensorDataset(train_tensor[:, :-1], train_tensor[:, -1:])
test_dataset = TensorDataset(test_tensor[:, :-1], test_tensor[:, -1:])

train_dl = DataLoader(train_dataset)
test_dl = DataLoader(test_dataset)

opt = Adam(model.parameters(), lr=hp["lr"])
loss_fn = MSELoss()
epoch = 20

# train and test
model.train()
for e in range(epoch):
    avg_loss = []
    for x, y in train_dl:
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        p = model(x)
        loss = loss_fn(p, y)
        loss.backward()
        opt.step()
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Training. Epoch {e}, MSE loss: {avg_loss}')

model.eval()
with torch.no_grad():
    avg_loss = []
    for x, y in test_dl:
        x = x.to(device)
        y = y.to(device)
        p = model(x)
        loss = loss_fn(p, y)
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Testing. MSE loss: {avg_loss}')

# upload the trained model and Triton config for serving
s3_save_model(s3_client, model, 'songwei', 'simple_ml/model_save/single/model.pt')
s3_save_file(s3_client, 'config.pbtxt', 'songwei', 'simple_ml/model_save/single/config.pbtxt')

测试NCCL

开启Debug模式

1
export NCCL_DEBUG=INFO

查看目前ld能找到的lib,需要让ldconfig能找到nccl

1
ldconfig -p

测试参考,nccl-test,https://github.com/NVIDIA/nccl-tests.git

1
2
3
4
# Build and run NVIDIA's nccl-tests to validate the NCCL installation.
# FIX: removed the markdown autolink angle brackets around the URL — in a
# shell they would be parsed as redirection operators and break the command.
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
make
# all-reduce benchmark: 8 bytes to 256 MB, doubling sizes, on 4 GPUs
./build/all_reduce_perf -b 8 -e 256M -f 2 -g 4

【注意】nccl-test在docker中运行时,需要添加以下参数shm-size,不然可能有bus error

1
docker run -it --shm-size 8G --rm myhorovod bash

单机多GPU带horovod的python代码

保证horovod在python中的写法没问题,以及horovod的安装没有问题

测试代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Single-machine multi-GPU training of the toy regression MLP with Horovod.

Same task as the single-device script, with the Horovod-specific additions
fenced by `#####` comment bars: init/device pinning, DistributedSampler,
lr scaling, DistributedOptimizer, and initial-state broadcast.

FIX: the published snippet had all loop/`if`/`with` body indentation stripped
by the blog extraction (SyntaxError as-is); structure restored here.
"""
import json
import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
import horovod.torch as hvd
from s3_utils import s3_load_pickle, s3_save_model, s3_save_file
import boto3

# prepare data: download the pickled splits from the S3-compatible store
session = boto3.session.Session()

s3_client = session.client(
    service_name='s3',
    aws_access_key_id='XXXX',
    aws_secret_access_key='XXXX',
    endpoint_url='http://10.105.222.7:24850',
)

train_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/train.pkl'), dtype=torch.float)
test_tensor = torch.tensor(s3_load_pickle(s3_client, 'songwei', 'simple_ml/test.pkl'), dtype=torch.float)

train_dataset = TensorDataset(train_tensor[:, :-1], train_tensor[:, -1:])
test_dataset = TensorDataset(test_tensor[:, :-1], test_tensor[:, -1:])

######################################################
# Horovod: initialise, then pin each worker process to its local GPU.
hvd.init()
if torch.cuda.is_available():
    torch.cuda.set_device(hvd.local_rank())
# one CPU thread per worker, as in the upstream Horovod examples
torch.set_num_threads(1)

# Each rank sees a disjoint shard of the data via DistributedSampler.
train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_dl = DataLoader(train_dataset, sampler=train_sampler)

test_sampler = DistributedSampler(test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_dl = DataLoader(test_dataset, sampler=test_sampler)
######################################################

# define model: 2 features (a, b) -> 1 target (c)
in_dim, hidden_dim, out_dim = 2, 4, 1
# hyper-parameters file; expects at least the keys "dropout" and "lr"
with open('hp.json', 'r') as f:
    hp = json.load(f)

model = nn.Sequential(
    nn.Linear(in_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(hp["dropout"]),
    nn.Linear(hidden_dim, out_dim)
)
model.cuda()  # NOTE(review): assumes a GPU is present; this script is the GPU test

######################################################
# Horovod: scale the learning rate by the worker count, wrap the optimizer
# so gradients are averaged across workers, and broadcast the initial
# model/optimizer state from rank 0 so every worker starts identical.
opt = Adam(model.parameters(), lr=hp["lr"] * hvd.size())
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters(), op=hvd.Average)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(opt, root_rank=0)
######################################################

loss_fn = MSELoss()
epoch = 20

# train and test
model.train()
for e in range(epoch):
    ######################################################
    # re-shuffle the shard assignment each epoch
    train_sampler.set_epoch(e)
    ######################################################
    avg_loss = []
    for x, y in train_dl:
        x = x.cuda()
        y = y.cuda()
        opt.zero_grad()
        p = model(x)
        loss = loss_fn(p, y)
        loss.backward()
        opt.step()
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    print(f'Training. Epoch {e}, MSE loss: {avg_loss}, Worker: {hvd.rank()}')

model.eval()
with torch.no_grad():
    avg_loss = []
    for x, y in test_dl:
        x = x.cuda()
        y = y.cuda()
        p = model(x)
        loss = loss_fn(p, y)
        avg_loss.append(loss.item())
    avg_loss = sum(avg_loss) / len(avg_loss)
    ######################################################
    # average the per-shard test loss across all workers
    avg_loss = hvd.allreduce(torch.tensor(avg_loss)).item()
    ######################################################
    print(f'Testing. MSE loss: {avg_loss}, Worker: {hvd.rank()}')

# NOTE(review): every rank uploads to the same S3 keys; consider guarding
# these calls with `if hvd.rank() == 0:` to avoid redundant concurrent writes.
s3_save_model(s3_client, model, 'songwei', 'simple_ml/model_save/1/model.pt')
s3_save_file(s3_client, 'config.pbtxt', 'songwei', 'simple_ml/model_save/1/config.pbtxt')

测试:

1
horovodrun -np 4 -H localhost:4 python main_with_horovod.py

Horovod in Docker
https://fffffaraway.github.io/2022/06/19/horovod-in-docker/
Author
Song Wei
Posted on
June 19, 2022
Licensed under