1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
| [0]<stderr>:+ POD_NAME=sw-simple-ml-gitlab-worker-0 [0]<stderr>:+ shift [0]<stderr>:+ /opt/kube/kubectl exec sw-simple-ml-gitlab-worker-0 -- /bin/sh -c cd /simple_ml > /dev/null 2>&1 ; HOROVOD_HOSTNAME=sw-simple-ml-gitlab-worker-0 HOROVOD_RANK=0 HOROVOD_SIZE=2 HOROVOD_LOCAL_RANK=0 HOROVOD_LOCAL_SIZE=1 HOROVOD_CROSS_RANK=0 HOROVOD_CROSS_SIZE=2 LIBRARY_PATH=/usr/local/cuda/lib64/stubs KUBERNETES_SERVICE_PORT=443 KUBERNETES_PORT=tcp://10.96.0.1:443 HOSTNAME=sw-simple-ml-gitlab-launcher LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 HOME=/root CUDA_VERSION=11.3.1 NVIDIA_REQUIRE_CUDA='cuda>=11.3 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450' NVIDIA_DRIVER_CAPABILITIES='' KUBERNETES_PORT_443_TCP_ADDR=10.96.0.1 PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin KUBERNETES_PORT_443_TCP_PORT=443 KUBERNETES_PORT_443_TCP_PROTO=tcp CUDNN_VERSION=8.2.0.53-1+cuda11.3 KUBERNETES_SERVICE_PORT_HTTPS=443 KUBERNETES_PORT_443_TCP=tcp://10.96.0.1:443 KUBERNETES_SERVICE_HOST=10.96.0.1 PWD=/simple_ml OMPI_MCA_orte_default_hostfile=/etc/mpi/hostfile OMPI_MCA_plm_rsh_agent=/etc/mpi/kubexec.sh NVIDIA_VISIBLE_DEVICES='' NCCL_VERSION=2.9.9-1+cuda11.3 TZ=Asia/Dubai LC_CTYPE=C.UTF-8 PYTHONUNBUFFERED=1 HOROVOD_GLOO_RENDEZVOUS_ADDR=10.244.3.228 HOROVOD_GLOO_RENDEZVOUS_PORT=56794 HOROVOD_CONTROLLER=gloo HOROVOD_CPU_OPERATIONS=gloo HOROVOD_GLOO_IFACE=eth0 NCCL_SOCKET_IFNAME=eth0 python main_with_horovod.py [1]<stderr>:+ POD_NAME=sw-simple-ml-gitlab-worker-1 [1]<stderr>:+ shift [1]<stderr>:+ /opt/kube/kubectl exec sw-simple-ml-gitlab-worker-1 -- /bin/sh -c cd /simple_ml > /dev/null 2>&1 ; HOROVOD_HOSTNAME=sw-simple-ml-gitlab-worker-1 HOROVOD_RANK=1 HOROVOD_SIZE=2 HOROVOD_LOCAL_RANK=0 HOROVOD_LOCAL_SIZE=1 HOROVOD_CROSS_RANK=1 HOROVOD_CROSS_SIZE=2 LIBRARY_PATH=/usr/local/cuda/lib64/stubs KUBERNETES_SERVICE_PORT=443 KUBERNETES_PORT=tcp://10.96.0.1:443 HOSTNAME=sw-simple-ml-gitlab-launcher LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 HOME=/root CUDA_VERSION=11.3.1 NVIDIA_REQUIRE_CUDA='cuda>=11.3 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450' NVIDIA_DRIVER_CAPABILITIES='' KUBERNETES_PORT_443_TCP_ADDR=10.96.0.1 PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin KUBERNETES_PORT_443_TCP_PORT=443 KUBERNETES_PORT_443_TCP_PROTO=tcp CUDNN_VERSION=8.2.0.53-1+cuda11.3 KUBERNETES_SERVICE_PORT_HTTPS=443 KUBERNETES_PORT_443_TCP=tcp://10.96.0.1:443 KUBERNETES_SERVICE_HOST=10.96.0.1 PWD=/simple_ml OMPI_MCA_orte_default_hostfile=/etc/mpi/hostfile OMPI_MCA_plm_rsh_agent=/etc/mpi/kubexec.sh NVIDIA_VISIBLE_DEVICES='' NCCL_VERSION=2.9.9-1+cuda11.3 TZ=Asia/Dubai LC_CTYPE=C.UTF-8 PYTHONUNBUFFERED=1 HOROVOD_GLOO_RENDEZVOUS_ADDR=10.244.3.228 HOROVOD_GLOO_RENDEZVOUS_PORT=56794 HOROVOD_CONTROLLER=gloo HOROVOD_CPU_OPERATIONS=gloo HOROVOD_GLOO_IFACE=eth0 NCCL_SOCKET_IFNAME=eth0 python main_with_horovod.py [0]<stdout>:Training. Epoch 0, MSE loss: 1338.4868140452961, Worker: 0 [1]<stdout>:Training. Epoch 0, MSE loss: 1148.9670435080386, Worker: 1 [0]<stdout>:Training. Epoch 1, MSE loss: 935.5324933822116, Worker: 0 [1]<stdout>:Training. Epoch 1, MSE loss: 934.2259948853756, Worker: 1 [0]<stdout>:Training. Epoch 2, MSE loss: 654.0407885544738, Worker: 0 [1]<stdout>:Training. Epoch 2, MSE loss: 633.2420742589119, Worker: 1 [1]<stdout>:Training. Epoch 3, MSE loss: 599.6154769317578, Worker: 1 [0]<stdout>:Training. Epoch 3, MSE loss: 593.8755020723866, Worker: 0 [1]<stdout>:Training. Epoch 4, MSE loss: 574.4224909156511, Worker: 1 [0]<stdout>:Training. Epoch 4, MSE loss: 483.1180045366194, Worker: 0 [0]<stdout>:Training. Epoch 5, MSE loss: 501.9576651239583, Worker: 0 [1]<stdout>:Training. Epoch 5, MSE loss: 561.3756192270189, Worker: 1 [1]<stdout>:Training. Epoch 6, MSE loss: 543.3503851517656, Worker: 1 [0]<stdout>:Training. Epoch 6, MSE loss: 461.72367964229545, Worker: 0 [1]<stdout>:Training. Epoch 7, MSE loss: 599.5701360625471, Worker: 1 [0]<stdout>:Training. Epoch 7, MSE loss: 477.2382655836473, Worker: 0 [0]<stdout>:Training. Epoch 8, MSE loss: 467.4026207899954, Worker: 0 [1]<stdout>:Training. Epoch 8, MSE loss: 489.72919646231924, Worker: 1 [0]<stdout>:Training. Epoch 9, MSE loss: 496.021579158862, Worker: 0 [1]<stdout>:Training. Epoch 9, MSE loss: 466.33928261113863, Worker: 1 [0]<stdout>:Training. Epoch 10, MSE loss: 473.9692597730085, Worker: 0 [1]<stdout>:Training. Epoch 10, MSE loss: 484.8857977675169, Worker: 1 [0]<stdout>:Training. Epoch 11, MSE loss: 472.89639633833195, Worker: 0 [1]<stdout>:Training. Epoch 11, MSE loss: 468.5362197326799, Worker: 1 [1]<stdout>:Training. Epoch 12, MSE loss: 460.76999270017814, Worker: 1 [0]<stdout>:Training. Epoch 12, MSE loss: 416.7220215983699, Worker: 0 [0]<stdout>:Training. Epoch 13, MSE loss: 451.9702810886584, Worker: 0 [1]<stdout>:Training. Epoch 13, MSE loss: 439.8771825716403, Worker: 1 [0]<stdout>:Training. Epoch 14, MSE loss: 366.88131853180283, Worker: 0 [1]<stdout>:Training. Epoch 14, MSE loss: 470.20061697393044, Worker: 1 [1]<stdout>:Training. Epoch 15, MSE loss: 431.8841860381735, Worker: 1 [0]<stdout>:Training. Epoch 15, MSE loss: 395.79066620089884, Worker: 0 [0]<stdout>:Training. Epoch 16, MSE loss: 390.8923997512978, Worker: 0 [1]<stdout>:Training. Epoch 16, MSE loss: 425.90262653747106, Worker: 1 [0]<stdout>:Training. Epoch 17, MSE loss: 516.2323905151153, Worker: 0 [1]<stdout>:Training. Epoch 17, MSE loss: 355.790610972445, Worker: 1 [0]<stdout>:Training. Epoch 18, MSE loss: 501.96195440185153, Worker: 0 [1]<stdout>:Training. Epoch 18, MSE loss: 442.8678330586471, Worker: 1 [1]<stdout>:Training. Epoch 19, MSE loss: 445.24695194348794, Worker: 1 [0]<stdout>:Training. Epoch 19, MSE loss: 441.0054695299096, Worker: 0 [0]<stdout>:Testing. MSE loss: 219.68063354492188, Worker: 0 [1]<stdout>:Testing. MSE loss: 219.68063354492188, Worker: 1
|