= hirsute verification = ubuntu@blanka:~/nvidia-dgx-2/tests$ cat /proc/version Linux version 5.11.0-42-generic (buildd@lgw01-amd64-041) (gcc (Ubuntu 10.3.0-1ubuntu1) 10.3.0, GNU ld (GNU Binutils for Ubuntu) 2.36.1) #46-Ubuntu SMP Fri Nov 26 12:04:17 UTC 2021 ubuntu@blanka:~/nvidia-dgx-2/tests$ ./nvidia-peermem-test.sh + export DEBCONF_FRONTEND=noninteractive + DEBCONF_FRONTEND=noninteractive + export DEBIAN_PRIORITY=critical + DEBIAN_PRIORITY=critical + SERVER_IFACE=enp148s0 + SERVER_IP=192.168.5.1/24 + SERVER_IB_BDF=0000:4b:00.0 + CLIENT_IFACE=enp18s0 + CLIENT_IP=192.168.5.2/24 + CLIENT_IB_BDF=0000:ba:00.0 + trap cleanup EXIT + sudo service unattended-upgrades stop + install_cuda_perftest + local release + local components + dpkg-query -W -f '${Version}' perftest + grep -q '+cuda.1$' + return + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_0 ++++ dirname ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband/mlx5_0 +++ dirname ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband ++ basename ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0 + bdf=0000:0c:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_1 ++++ dirname ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband/mlx5_1 +++ dirname ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband ++ basename ../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0 + bdf=0000:12:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_2 ++++ dirname ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband/mlx5_2 +++ dirname ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband ++ basename ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0 + bdf=0000:4b:00.0 + case "$bdf" in ++ basename /sys/class/infiniband/mlx5_2 + server_ib_dev=mlx5_2 + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_3 ++++ dirname ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband/mlx5_3 +++ dirname ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband ++ basename ../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0 + bdf=0000:54:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_4 ++++ dirname ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband/mlx5_4 +++ dirname ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband ++ basename ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0 + bdf=0000:8d:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_5 ++++ dirname ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband/mlx5_5 +++ dirname ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband ++ basename ../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0 + bdf=0000:94:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_6 ++++ dirname ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband/mlx5_6 +++ dirname ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband ++ basename ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0 + bdf=0000:ba:00.0 + case "$bdf" in ++ basename /sys/class/infiniband/mlx5_6 + client_ib_dev=mlx5_6 + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_7 ++++ dirname ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband/mlx5_7 +++ dirname ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband ++ basename ../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0 + bdf=0000:cc:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_8 ++++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband/mlx5_8 +++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband ++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0 + bdf=0000:e1:00.0 + case "$bdf" in + for ibdev in /sys/class/infiniband/* +++++ readlink /sys/class/infiniband/mlx5_9 ++++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband/mlx5_9 +++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband ++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1 + bdf=0000:e1:00.1 + case "$bdf" in + '[' -z mlx5_6 ']' + '[' -z mlx5_2 ']' + sudo rdma system set netns exclusive + sudo ip netns add peermemclient + sudo rdma dev set mlx5_6 netns peermemclient + sudo ip netns exec peermemclient ip link set dev lo up + sudo ip link set netns peermemclient enp18s0 + sudo ip netns exec peermemclient ip addr add dev enp18s0 192.168.5.2/24 + sudo ip netns exec peermemclient ip link set dev enp18s0 up + sudo ip addr add dev enp148s0 192.168.5.1/24 + sudo ip link set dev enp148s0 up + sudo modprobe ib_umad + sudo modprobe nvidia-peermem + sudo_apt install -y opensm + sudo --preserve-env=DEBCONF_FRONTEND,DEBIAN_PRIORITY apt install -y opensm Reading package lists... Done Building dependency tree... Done Reading state information... Done opensm is already the newest version (3.3.23-2). 0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded. + sudo service opensm start + use_cuda_needs_devid + ib_write_bw --help + grep use_cuda= --use_cuda= Use CUDA specific device for GPUDirect RDMA testing + return 0 + server_use_cuda_arg=--use_cuda=0 + client_use_cuda_arg=--use_cuda=1 + srvpid=7209 + sleep 5 + sudo ib_write_bw -a -d mlx5_2 --use_cuda=0 ************************************ * Waiting for client to connect... * ************************************ + sudo ip netns exec peermemclient ib_write_bw -a -d mlx5_6 192.168.5.1 --use_cuda=1 initializing CUDA initializing CUDA Listing all CUDA devices in system: CUDA device 0: PCIe address is 07:00 CUDA device 1: PCIe address is 0F:00 CUDA device 2: PCIe address is 47:00 CUDA device 3: PCIe address is 4E:00 CUDA device 4: PCIe address is 87:00 CUDA device 5: PCIe address is 90:00 CUDA device 6: PCIe address is B7:00 CUDA device 7: PCIe address is BD:00 Picking device No. 1 [pid = 7216, dev = 1] device name = [NVIDIA A100-SXM4-40GB] creating CUDA Ctx Listing all CUDA devices in system: CUDA device 0: PCIe address is 07:00 CUDA device 1: PCIe address is 0F:00 CUDA device 2: PCIe address is 47:00 CUDA device 3: PCIe address is 4E:00 CUDA device 4: PCIe address is 87:00 CUDA device 5: PCIe address is 90:00 CUDA device 6: PCIe address is B7:00 CUDA device 7: PCIe address is BD:00 Picking device No. 0 [pid = 7211, dev = 0] device name = [NVIDIA A100-SXM4-40GB] creating CUDA Ctx making it the current CUDA Ctx cuMemAlloc() of a 16777216 bytes GPU buffer allocated GPU buffer address at 00007f0eba000000 pointer=0x7f0eba000000 --------------------------------------------------------------------------------------- RDMA_Write BW Test Dual-port : OFF Device : mlx5_6 Number of qps : 1 Transport type : IB Connection type : RC Using SRQ : OFF PCIe relax order: ON ibv_wr* API : ON TX depth : 128 CQ Moderation : 100 Mtu : 4096[B] Link type : IB Max inline data : 0[B] rdma_cm QPs : OFF Data ex. method : Ethernet --------------------------------------------------------------------------------------- making it the current CUDA Ctx cuMemAlloc() of a 16777216 bytes GPU buffer allocated GPU buffer address at 00007f682e000000 pointer=0x7f682e000000 --------------------------------------------------------------------------------------- RDMA_Write BW Test Dual-port : OFF Device : mlx5_2 Number of qps : 1 Transport type : IB Connection type : RC Using SRQ : OFF PCIe relax order: ON ibv_wr* API : ON CQ Moderation : 100 Mtu : 4096[B] Link type : IB Max inline data : 0[B] rdma_cm QPs : OFF Data ex. method : Ethernet --------------------------------------------------------------------------------------- local address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 0x007f682e800000 local address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 0x007f0eba800000 remote address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 0x007f0eba800000 remote address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 0x007f682e800000 --------------------------------------------------------------------------------------- #bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps] --------------------------------------------------------------------------------------- #bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps] Conflicting CPU frequency values detected: 1500.000000 != 3391.375000. CPU Frequency is not max. 2 5000 4.11 4.10 2.151153 Conflicting CPU frequency values detected: 1500.000000 != 3345.763000. CPU Frequency is not max. 4 5000 8.07 8.04 2.108648 Conflicting CPU frequency values detected: 1500.000000 != 3362.509000. CPU Frequency is not max. 8 5000 16.13 16.13 2.113996 Conflicting CPU frequency values detected: 1500.000000 != 3335.048000. CPU Frequency is not max. 16 5000 32.30 32.19 2.109436 Conflicting CPU frequency values detected: 1500.000000 != 3339.906000. CPU Frequency is not max. 32 5000 64.41 64.38 2.109663 Conflicting CPU frequency values detected: 1500.000000 != 3333.100000. CPU Frequency is not max. 64 5000 129.43 129.12 2.115557 Conflicting CPU frequency values detected: 1500.000000 != 3349.864000. CPU Frequency is not max. 128 5000 257.89 257.16 2.106668 Conflicting CPU frequency values detected: 1500.000000 != 3350.294000. CPU Frequency is not max. 256 5000 516.27 515.84 2.112864 Conflicting CPU frequency values detected: 1500.000000 != 3340.996000. CPU Frequency is not max. 512 5000 1024.81 1024.72 2.098633 Conflicting CPU frequency values detected: 1500.000000 != 3356.251000. CPU Frequency is not max. 1024 5000 2053.47 2053.08 2.102352 Conflicting CPU frequency values detected: 1500.000000 != 3339.107000. CPU Frequency is not max. 2048 5000 3864.52 3720.22 1.904755 Conflicting CPU frequency values detected: 1500.000000 != 3355.693000. CPU Frequency is not max. 4096 5000 4494.10 4083.37 1.045344 Conflicting CPU frequency values detected: 1500.000000 != 3342.793000. CPU Frequency is not max. 8192 5000 4590.54 4425.60 0.566476 Conflicting CPU frequency values detected: 1500.000000 != 3351.159000. CPU Frequency is not max. 16384 5000 4517.28 4279.27 0.273873 Conflicting CPU frequency values detected: 1500.000000 != 3314.743000. CPU Frequency is not max. 32768 5000 4460.95 4387.03 0.140385 Conflicting CPU frequency values detected: 1500.000000 != 3305.732000. CPU Frequency is not max. 65536 5000 4465.92 4408.98 0.070544 Conflicting CPU frequency values detected: 1500.000000 != 3310.266000. CPU Frequency is not max. 131072 5000 4449.90 4422.93 0.035383 Conflicting CPU frequency values detected: 1500.000000 != 3364.586000. CPU Frequency is not max. 262144 5000 4443.64 4439.50 0.017758 Conflicting CPU frequency values detected: 1500.000000 != 3325.738000. CPU Frequency is not max. 524288 5000 4444.42 4441.08 0.008882 Conflicting CPU frequency values detected: 1500.000000 != 3391.764000. CPU Frequency is not max. 1048576 5000 4453.77 4452.52 0.004453 Conflicting CPU frequency values detected: 1500.000000 != 3391.441000. CPU Frequency is not max. 2097152 5000 4450.29 4449.44 0.002225 Conflicting CPU frequency values detected: 1500.000000 != 1958.593000. CPU Frequency is not max. 4194304 5000 4452.98 4451.38 0.001113 Conflicting CPU frequency values detected: 1500.000000 != 2246.050000. CPU Frequency is not max. 8388608 5000 4453.11 4452.79 0.000557 --------------------------------------------------------------------------------------- 8388608 5000 4453.11 4452.79 0.000557 --------------------------------------------------------------------------------------- deallocating RX GPU buffer 00007f682e000000 deallocating RX GPU buffer 00007f0eba000000 destroying current CUDA Ctx destroying current CUDA Ctx + cleanup + '[' -n 7209 ']' + test -d /proc/7209 + sudo kill 7209 kill: (7209): No such process + /bin/true + '[' -z '' ']' + sudo ip addr del dev enp148s0 192.168.5.1/24 + sudo ip netns exec peermemclient ip addr del dev enp18s0 192.168.5.2/24 + sudo ip netns delete peermemclient ubuntu@blanka:~/nvidia-dgx-2/tests$ echo $? 0