NPU(XDNA)をプログラムしてみる Data移動とベクトル演算編

AMD

前回で環境構築が終わったので、NPU(XDNA)をプログラムしてみるシリーズ第2回です。
NPU(XDNA)を使ったプログラミングの基本的な流れは、Python で XDNA 向けの設計を記述し、Makefile でコンパイルしてから、テストを実行する、というものです。

1. 単一タイル DMA:Passthrough DMA で memcpy を実装

NPUを使用するための仮想環境のセットアップを行う

source ~/mlir-aie/ironenv/bin/activate
source ~/mlir-aie/utils/env_setup.sh
ShellScript

AI Engine コアを一切使わず、DMA だけで 4096 要素の int32 配列(16 KB)をホスト → MemTile → ホストと往復させる、memcpy 相当の最小デザインになる。

1.1. コード

# passthrough_dmas/passthrough_dmas.py  (mlir-aie 公式サンプル)
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
import numpy as np, sys
from aie.iron import ObjectFifo, Program, Runtime
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU2Col1, XCVC1902

# ---- Design parameters ---------------------------------------------------
# N and LINE count int32 *elements*, not bytes: the default N = 4096
# elements corresponds to 16 KiB of data.
N = 4096     # total number of int32 elements to copy
LINE = 1024  # int32 elements per ObjectFifo buffer (one DMA chunk)

# Optional argv[1]: transfer length in elements.
if len(sys.argv) > 1:
    N = int(sys.argv[1])
    # Raise instead of assert so the check is not stripped by `python -O`.
    if N <= 0 or N % LINE != 0:
        raise ValueError(f"N must be a positive multiple of {LINE}, got {N}")

# Optional argv[2]: target device name (defaults to "npu").
_DEVICES = {"npu": NPU1Col1, "npu2": NPU2Col1, "xcvc1902": XCVC1902}
_dev_name = sys.argv[2] if len(sys.argv) > 2 else "npu"
if _dev_name not in _DEVICES:
    # Without this check an unknown name fails later with an opaque
    # "'NoneType' object is not callable" error.
    raise ValueError(
        f"Device name {_dev_name!r} is unknown; "
        f"expected one of {sorted(_DEVICES)}"
    )
dev = _DEVICES[_dev_name]()

# ---- Data types ----------------------------------------------------------
vector_ty = np.ndarray[(N,), np.dtype[np.int32]]    # whole host buffer
line_ty = np.ndarray[(LINE,), np.dtype[np.int32]]   # one FIFO buffer

# ---- Data movement: host -> forwarding tile -> host ----------------------
of_in = ObjectFifo(line_ty, name="in")   # host memory -> device
of_out = of_in.cons().forward()          # forwarded straight back to host

# ---- Runtime sequence ----------------------------------------------------
rt = Runtime()
# The middle buffer is not used by this design.
# NOTE(review): presumably kept so the argument layout matches the host
# test program -- confirm against test.cpp.
with rt.sequence(vector_ty, vector_ty, vector_ty) as (src, _, dst):
    rt.fill(of_in.prod(), src)               # host -> FIFO
    rt.drain(of_out.cons(), dst, wait=True)  # FIFO -> host; wait for completion

# ---- Emit the generated MLIR on stdout -----------------------------------
module = Program(dev, rt).resolve_program(SequentialPlacer())
print(module)
Python

1.2. Python実行結果

module {
  aie.device(npu1_1col) {
    %mem_tile_0_1 = aie.tile(0, 1)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.objectfifo @in_fwd(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo.link [@in] -> [@in_fwd]([] [0])
    aiex.runtime_sequence @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
      %0 = aiex.dma_configure_task_for @in {
        aie.dma_bd(%arg0 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @in_fwd {
        aie.dma_bd(%arg2 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%1)
      aiex.dma_await_task(%1)
      aiex.dma_free_task(%0)
    }
  }
}
ShellScript

1.3. make 実行結果

(ironenv) ~/mlir-aie/programming_examples/basic/passthrough_dmas$ make 
mkdir -p build
python3 /mlir-aie/programming_examples/basic/passthrough_dmas/passthrough_dmas.py 4096 npu  > build/aie.mlir
mkdir -p build
cd build && aiecc.py --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin 
        --no-xchesscc --no-xbridge 
        --aie-generate-npu-insts --npu-insts-name=insts.bin ../build/aie.mlir
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 4 WorkersGenerating: /mlir-aie/programming_examples/basic/passthrough_dmas/build/aie.mlir.prj/aie_cdo_elfs.bin
Generating: /mlir-aie/programming_examples/basic/passthrough_dmas/build/aie.mlir.prj/aie_cdo_init.bin
Generating: /mlir-aie/programming_examples/basic/passthrough_dmas/build/aie.mlir.prj/aie_cdo_enable.bin

****** Bootgen v2024.1
  **** Build date : Apr  3 2025-04:13:15
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.

[INFO]   : Bootimage generated successfully

Info: Embedded Metadata section is missing project.platform.device.core element, adding it.
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 4 Workers
ShellScript

1.4. NPU(XDNA)実行

rm -rf _build
mkdir -p _build
cd _build &&  cmake /mlir-aie/programming_examples/basic/passthrough_dmas -DTARGET_NAME=passthrough_dmas
CMake Deprecation Warning at CMakeLists.txt:14 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

-- The C compiler identification is GNU 13.3.0
-- The CXX compiler identification is GNU 13.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/gcc-13 - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/g++-13 - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
CMake Warning (dev) at CMakeLists.txt:45 (find_package):
  Policy CMP0167 is not set: The FindBoost module is removed.  Run "cmake
  --help-policy CMP0167" for policy details.  Use the cmake_policy command to
  set the policy and suppress this warning.

This warning is for project developers.  Use -Wno-dev to suppress it.

-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.83.0/BoostConfig.cmake (found version "1.83.0")
-- Configuring done (0.3s)
-- Generating done (0.0s)
-- Build files have been written to: mlir-aie/programming_examples/basic/passthrough_dmas/_build
cd _build &&  cmake --build . --config Release
gmake[1]: Entering directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
gmake[2]: Entering directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
gmake[3]: Entering directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
gmake[3]: Leaving directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
gmake[3]: Entering directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
[ 33%] Building CXX object CMakeFiles/passthrough_dmas.dirmlir-aie/runtime_lib/test_lib/test_utils.cpp.o
[ 66%] Building CXX object CMakeFiles/passthrough_dmas.dir/test.cpp.o
[100%] Linking CXX executable passthrough_dmas
gmake[3]: Leaving directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
[100%] Built target passthrough_dmas
gmake[2]: Leaving directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
gmake[1]: Leaving directory 'mlir-aie/programming_examples/basic/passthrough_dmas/_build'
cp _build/passthrough_dmas passthrough_dmas.exe 
./passthrough_dmas.exe -x build/final.xclbin -i build/insts.bin -k MLIR_AIE -l 4096
Name: MLIR_AIE

PASS!
ShellScript

NPU(XDNA)の実行結果を得るまで長い気がする、、、

2. ベクトル+スカラー/ベクトル同士の加算・乗算

2.1. Vector + Scalar Add (+1)

ホストメモリからデータを読み込み、全てのベクトル要素に +1 する例

2.1.1. コード

#!/usr/bin/env python3
# ベクトル要素すべてに +1 する例
import sys, numpy as np
from aie.iron import ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1
from aie.iron.controlflow import range_

# -------------------------------------------------------------
# Parameters
# -------------------------------------------------------------
TOTAL = 1024      # total number of elements in the vector
MEM_CH = 64       # elements per MemTile chunk
TILE_CH = 32      # elements per AIE tile chunk
DEV = NPU1Col1()  # single-column NPU device

# NumPy type definitions (all buffers hold int32 elements)
_INT32 = np.dtype[np.int32]
vec_ty = np.ndarray[(TOTAL,), _INT32]
mem_ty = np.ndarray[(MEM_CH,), _INT32]
tile_ty = np.ndarray[(TILE_CH,), _INT32]

# -------------------------------------------------------------
# FIFO layout: DDR -> MemTile -> AIE tile -> MemTile -> DDR
# -------------------------------------------------------------
# Input path: MEM_CH-sized chunks are forwarded as TILE_CH-sized chunks.
of_ddr2mem = ObjectFifo(mem_ty, name="ddr2mem")
of_mem2aie = of_ddr2mem.cons().forward(obj_type=tile_ty)

# Output path: TILE_CH-sized results are forwarded as MEM_CH-sized chunks.
of_aie2mem = ObjectFifo(tile_ty, name="aie2mem")
of_mem2ddr = of_aie2mem.cons().forward(obj_type=mem_ty)

# -------------------------------------------------------------
# AIE コアの処理本体
# -------------------------------------------------------------
def core(src_fifo, dst_fifo):
    """Add 1 to every element, one TILE_CH-element chunk at a time.

    Args:
        src_fifo: FIFO handle the input chunks are acquired from.
        dst_fifo: FIFO handle the result chunks are written to.
    """
    n_chunks = TOTAL // TILE_CH  # chunks needed to cover the whole vector
    for _ in range_(n_chunks):
        # Take one input chunk and one output chunk.
        chunk_in = src_fifo.acquire(1)
        chunk_out = dst_fifo.acquire(1)

        # Element-wise +1 over the chunk.
        for idx in range_(TILE_CH):
            chunk_out[idx] = chunk_in[idx] + 1

        # Hand both chunks back to their FIFOs.
        src_fifo.release(1)
        dst_fifo.release(1)

# Worker: runs `core` with the tile-side ends of the two FIFOs
worker = Worker(
    core,
    fn_args=[of_mem2aie.cons(), of_aie2mem.prod()],
)

# -------------------------------------------------------------
# Runtime (data transfer)
# -------------------------------------------------------------
rt = Runtime()
with rt.sequence(vec_ty, vec_ty) as (host_in, host_out):
    rt.start(worker)                          # start the core
    rt.fill(of_ddr2mem.prod(), host_in)       # write the input into the FIFO
    rt.drain(of_mem2ddr.cons(), host_out, wait=True)  # read the result back

# -------------------------------------------------------------
# Resolve the design and print the generated MLIR
# -------------------------------------------------------------
design = Program(DEV, rt)
prog = design.resolve_program(SequentialPlacer())
print(prog)
Python

2.1.2. Python実行結果

module {
  aie.device(npu1_1col) {
    %tile_0_2 = aie.tile(0, 2)
    %mem_tile_0_1 = aie.tile(0, 1)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.objectfifo @ddr2mem_fwd(%mem_tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32xi32>> 
    aie.objectfifo @ddr2mem(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64xi32>> 
    aie.objectfifo.link [@ddr2mem] -> [@ddr2mem_fwd]([] [0])
    aie.objectfifo @aie2mem_fwd(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<64xi32>> 
    aie.objectfifo @aie2mem(%tile_0_2, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32xi32>> 
    aie.objectfifo.link [@aie2mem] -> [@aie2mem_fwd]([] [0])
    %core_0_2 = aie.core(%tile_0_2) {
      %c0 = arith.constant 0 : index
      %c9223372036854775807 = arith.constant 9223372036854775807 : index
      %c1 = arith.constant 1 : index
      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
        %c0_0 = arith.constant 0 : index
        %c32 = arith.constant 32 : index
        %c1_1 = arith.constant 1 : index
        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
          %0 = aie.objectfifo.acquire @ddr2mem_fwd(Consume, 1) : !aie.objectfifosubview<memref<32xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
          %2 = aie.objectfifo.acquire @aie2mem(Produce, 1) : !aie.objectfifosubview<memref<32xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
          %c0_2 = arith.constant 0 : index
          %c32_3 = arith.constant 32 : index
          %c1_4 = arith.constant 1 : index
          scf.for %arg2 = %c0_2 to %c32_3 step %c1_4 {
            %4 = memref.load %1[%arg2] : memref<32xi32>
            %c1_i32 = arith.constant 1 : i32
            %5 = arith.addi %4, %c1_i32 : i32
            memref.store %5, %3[%arg2] : memref<32xi32>
          }
          aie.objectfifo.release @ddr2mem_fwd(Consume, 1)
          aie.objectfifo.release @aie2mem(Produce, 1)
        }
      }
      aie.end
    }
    aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) {
      %0 = aiex.dma_configure_task_for @ddr2mem {
        aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @aie2mem_fwd {
        aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%1)
      aiex.dma_await_task(%1)
      aiex.dma_free_task(%0)
    }
  }
}
ShellScript

2.1.3. make 実行結果

mkdir -p build
python3 /mlir-aie/programming_examples/basic/vector_scalar_add/vector_scalar_add.py npu > build/aie.mlir
mkdir -p build
cd build && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host 
        --no-xchesscc --no-xbridge 
        --xclbin-name=final.xclbin --npu-insts-name=insts.bin aie.mlir
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/2 4 WorkersGenerating: /mlir-aie/programming_examples/basic/vector_scalar_add/build/aie.mlir.prj/aie_cdo_elfs.bin
Generating: /mlir-aie/programming_examples/basic/vector_scalar_add/build/aie.mlir.prj/aie_cdo_init.bin
Generating: /mlir-aie/programming_examples/basic/vector_scalar_add/build/aie.mlir.prj/aie_cdo_enable.bin

****** Bootgen v2024.1
  **** Build date : Apr  3 2025-04:13:15
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.

[INFO]   : Bootimage generated successfully

Info: Embedded Metadata section is missing project.platform.device.core element, adding it.
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━  50% -:--:-- 0:00:00 1/2 4 Workers
ShellScript

2.1.4. NPU(XDNA)実行

実行結果を表示 ./vector_scalar_add.exe -x build/final.xclbin -i build/insts.bin -k MLIR_AIE
Correct output 2 == 2
Correct output 3 == 3
Correct output 4 == 4
Correct output 5 == 5
Correct output 6 == 6
Correct output 7 == 7
Correct output 8 == 8
Correct output 9 == 9
Correct output 10 == 10
Correct output 11 == 11
Correct output 12 == 12
Correct output 13 == 13
Correct output 14 == 14
Correct output 15 == 15
Correct output 16 == 16
Correct output 17 == 17
Correct output 18 == 18
Correct output 19 == 19
Correct output 20 == 20
Correct output 21 == 21
Correct output 22 == 22
Correct output 23 == 23
Correct output 24 == 24
Correct output 25 == 25
Correct output 26 == 26
Correct output 27 == 27
Correct output 28 == 28
Correct output 29 == 29
Correct output 30 == 30
Correct output 31 == 31
Correct output 32 == 32
Correct output 33 == 33
Correct output 34 == 34
Correct output 35 == 35
Correct output 36 == 36
Correct output 37 == 37
Correct output 38 == 38
Correct output 39 == 39
Correct output 40 == 40
Correct output 41 == 41
Correct output 42 == 42
Correct output 43 == 43
Correct output 44 == 44
Correct output 45 == 45
Correct output 46 == 46
Correct output 47 == 47
Correct output 48 == 48
Correct output 49 == 49
Correct output 50 == 50
Correct output 51 == 51
Correct output 52 == 52
Correct output 53 == 53
Correct output 54 == 54
Correct output 55 == 55
Correct output 56 == 56
Correct output 57 == 57
Correct output 58 == 58
Correct output 59 == 59
Correct output 60 == 60
Correct output 61 == 61
Correct output 62 == 62
Correct output 63 == 63
Correct output 64 == 64
Correct output 65 == 65
Correct output 66 == 66
Correct output 67 == 67
Correct output 68 == 68
Correct output 69 == 69
Correct output 70 == 70
Correct output 71 == 71
Correct output 72 == 72
Correct output 73 == 73
Correct output 74 == 74
Correct output 75 == 75
Correct output 76 == 76
Correct output 77 == 77
Correct output 78 == 78
Correct output 79 == 79
Correct output 80 == 80
Correct output 81 == 81
Correct output 82 == 82
Correct output 83 == 83
Correct output 84 == 84
Correct output 85 == 85
Correct output 86 == 86
Correct output 87 == 87
Correct output 88 == 88
Correct output 89 == 89
Correct output 90 == 90
Correct output 91 == 91
Correct output 92 == 92
Correct output 93 == 93
Correct output 94 == 94
Correct output 95 == 95
Correct output 96 == 96
Correct output 97 == 97
Correct output 98 == 98
Correct output 99 == 99
Correct output 100 == 100
Correct output 101 == 101
Correct output 102 == 102
Correct output 103 == 103
Correct output 104 == 104
Correct output 105 == 105
Correct output 106 == 106
Correct output 107 == 107
Correct output 108 == 108
Correct output 109 == 109
Correct output 110 == 110
Correct output 111 == 111
Correct output 112 == 112
Correct output 113 == 113
Correct output 114 == 114
Correct output 115 == 115
Correct output 116 == 116
Correct output 117 == 117
Correct output 118 == 118
Correct output 119 == 119
Correct output 120 == 120
Correct output 121 == 121
Correct output 122 == 122
Correct output 123 == 123
Correct output 124 == 124
Correct output 125 == 125
Correct output 126 == 126
Correct output 127 == 127
Correct output 128 == 128
Correct output 129 == 129
Correct output 130 == 130
Correct output 131 == 131
Correct output 132 == 132
Correct output 133 == 133
Correct output 134 == 134
Correct output 135 == 135
Correct output 136 == 136
Correct output 137 == 137
Correct output 138 == 138
Correct output 139 == 139
Correct output 140 == 140
Correct output 141 == 141
Correct output 142 == 142
Correct output 143 == 143
Correct output 144 == 144
Correct output 145 == 145
Correct output 146 == 146
Correct output 147 == 147
Correct output 148 == 148
Correct output 149 == 149
Correct output 150 == 150
Correct output 151 == 151
Correct output 152 == 152
Correct output 153 == 153
Correct output 154 == 154
Correct output 155 == 155
Correct output 156 == 156
Correct output 157 == 157
Correct output 158 == 158
Correct output 159 == 159
Correct output 160 == 160
Correct output 161 == 161
Correct output 162 == 162
Correct output 163 == 163
Correct output 164 == 164
Correct output 165 == 165
Correct output 166 == 166
Correct output 167 == 167
Correct output 168 == 168
Correct output 169 == 169
Correct output 170 == 170
Correct output 171 == 171
Correct output 172 == 172
Correct output 173 == 173
Correct output 174 == 174
Correct output 175 == 175
Correct output 176 == 176
Correct output 177 == 177
Correct output 178 == 178
Correct output 179 == 179
Correct output 180 == 180
Correct output 181 == 181
Correct output 182 == 182
Correct output 183 == 183
Correct output 184 == 184
Correct output 185 == 185
Correct output 186 == 186
Correct output 187 == 187
Correct output 188 == 188
Correct output 189 == 189
Correct output 190 == 190
Correct output 191 == 191
Correct output 192 == 192
Correct output 193 == 193
Correct output 194 == 194
Correct output 195 == 195
Correct output 196 == 196
Correct output 197 == 197
Correct output 198 == 198
Correct output 199 == 199
Correct output 200 == 200
Correct output 201 == 201
Correct output 202 == 202
Correct output 203 == 203
Correct output 204 == 204
Correct output 205 == 205
Correct output 206 == 206
Correct output 207 == 207
Correct output 208 == 208
Correct output 209 == 209
Correct output 210 == 210
Correct output 211 == 211
Correct output 212 == 212
Correct output 213 == 213
Correct output 214 == 214
Correct output 215 == 215
Correct output 216 == 216
Correct output 217 == 217
Correct output 218 == 218
Correct output 219 == 219
Correct output 220 == 220
Correct output 221 == 221
Correct output 222 == 222
Correct output 223 == 223
Correct output 224 == 224
Correct output 225 == 225
Correct output 226 == 226
Correct output 227 == 227
Correct output 228 == 228
Correct output 229 == 229
Correct output 230 == 230
Correct output 231 == 231
Correct output 232 == 232
Correct output 233 == 233
Correct output 234 == 234
Correct output 235 == 235
Correct output 236 == 236
Correct output 237 == 237
Correct output 238 == 238
Correct output 239 == 239
Correct output 240 == 240
Correct output 241 == 241
Correct output 242 == 242
Correct output 243 == 243
Correct output 244 == 244
Correct output 245 == 245
Correct output 246 == 246
Correct output 247 == 247
Correct output 248 == 248
Correct output 249 == 249
Correct output 250 == 250
Correct output 251 == 251
Correct output 252 == 252
Correct output 253 == 253
Correct output 254 == 254
Correct output 255 == 255
Correct output 256 == 256
Correct output 257 == 257
Correct output 258 == 258
Correct output 259 == 259
Correct output 260 == 260
Correct output 261 == 261
Correct output 262 == 262
Correct output 263 == 263
Correct output 264 == 264
Correct output 265 == 265
Correct output 266 == 266
Correct output 267 == 267
Correct output 268 == 268
Correct output 269 == 269
Correct output 270 == 270
Correct output 271 == 271
Correct output 272 == 272
Correct output 273 == 273
Correct output 274 == 274
Correct output 275 == 275
Correct output 276 == 276
Correct output 277 == 277
Correct output 278 == 278
Correct output 279 == 279
Correct output 280 == 280
Correct output 281 == 281
Correct output 282 == 282
Correct output 283 == 283
Correct output 284 == 284
Correct output 285 == 285
Correct output 286 == 286
Correct output 287 == 287
Correct output 288 == 288
Correct output 289 == 289
Correct output 290 == 290
Correct output 291 == 291
Correct output 292 == 292
Correct output 293 == 293
Correct output 294 == 294
Correct output 295 == 295
Correct output 296 == 296
Correct output 297 == 297
Correct output 298 == 298
Correct output 299 == 299
Correct output 300 == 300
Correct output 301 == 301
Correct output 302 == 302
Correct output 303 == 303
Correct output 304 == 304
Correct output 305 == 305
Correct output 306 == 306
Correct output 307 == 307
Correct output 308 == 308
Correct output 309 == 309
Correct output 310 == 310
Correct output 311 == 311
Correct output 312 == 312
Correct output 313 == 313
Correct output 314 == 314
Correct output 315 == 315
Correct output 316 == 316
Correct output 317 == 317
Correct output 318 == 318
Correct output 319 == 319
Correct output 320 == 320
Correct output 321 == 321
Correct output 322 == 322
Correct output 323 == 323
Correct output 324 == 324
Correct output 325 == 325
Correct output 326 == 326
Correct output 327 == 327
Correct output 328 == 328
Correct output 329 == 329
Correct output 330 == 330
Correct output 331 == 331
Correct output 332 == 332
Correct output 333 == 333
Correct output 334 == 334
Correct output 335 == 335
Correct output 336 == 336
Correct output 337 == 337
Correct output 338 == 338
Correct output 339 == 339
Correct output 340 == 340
Correct output 341 == 341
Correct output 342 == 342
Correct output 343 == 343
Correct output 344 == 344
Correct output 345 == 345
Correct output 346 == 346
Correct output 347 == 347
Correct output 348 == 348
Correct output 349 == 349
Correct output 350 == 350
Correct output 351 == 351
Correct output 352 == 352
Correct output 353 == 353
Correct output 354 == 354
Correct output 355 == 355
Correct output 356 == 356
Correct output 357 == 357
Correct output 358 == 358
Correct output 359 == 359
Correct output 360 == 360
Correct output 361 == 361
Correct output 362 == 362
Correct output 363 == 363
Correct output 364 == 364
Correct output 365 == 365
Correct output 366 == 366
Correct output 367 == 367
Correct output 368 == 368
Correct output 369 == 369
Correct output 370 == 370
Correct output 371 == 371
Correct output 372 == 372
Correct output 373 == 373
Correct output 374 == 374
Correct output 375 == 375
Correct output 376 == 376
Correct output 377 == 377
Correct output 378 == 378
Correct output 379 == 379
Correct output 380 == 380
Correct output 381 == 381
Correct output 382 == 382
Correct output 383 == 383
Correct output 384 == 384
Correct output 385 == 385
Correct output 386 == 386
Correct output 387 == 387
Correct output 388 == 388
Correct output 389 == 389
Correct output 390 == 390
Correct output 391 == 391
Correct output 392 == 392
Correct output 393 == 393
Correct output 394 == 394
Correct output 395 == 395
Correct output 396 == 396
Correct output 397 == 397
Correct output 398 == 398
Correct output 399 == 399
Correct output 400 == 400
Correct output 401 == 401
Correct output 402 == 402
Correct output 403 == 403
Correct output 404 == 404
Correct output 405 == 405
Correct output 406 == 406
Correct output 407 == 407
Correct output 408 == 408
Correct output 409 == 409
Correct output 410 == 410
Correct output 411 == 411
Correct output 412 == 412
Correct output 413 == 413
Correct output 414 == 414
Correct output 415 == 415
Correct output 416 == 416
Correct output 417 == 417
Correct output 418 == 418
Correct output 419 == 419
Correct output 420 == 420
Correct output 421 == 421
Correct output 422 == 422
Correct output 423 == 423
Correct output 424 == 424
Correct output 425 == 425
Correct output 426 == 426
Correct output 427 == 427
Correct output 428 == 428
Correct output 429 == 429
Correct output 430 == 430
Correct output 431 == 431
Correct output 432 == 432
Correct output 433 == 433
Correct output 434 == 434
Correct output 435 == 435
Correct output 436 == 436
Correct output 437 == 437
Correct output 438 == 438
Correct output 439 == 439
Correct output 440 == 440
Correct output 441 == 441
Correct output 442 == 442
Correct output 443 == 443
Correct output 444 == 444
Correct output 445 == 445
Correct output 446 == 446
Correct output 447 == 447
Correct output 448 == 448
Correct output 449 == 449
Correct output 450 == 450
Correct output 451 == 451
Correct output 452 == 452
Correct output 453 == 453
Correct output 454 == 454
Correct output 455 == 455
Correct output 456 == 456
Correct output 457 == 457
Correct output 458 == 458
Correct output 459 == 459
Correct output 460 == 460
Correct output 461 == 461
Correct output 462 == 462
Correct output 463 == 463
Correct output 464 == 464
Correct output 465 == 465
Correct output 466 == 466
Correct output 467 == 467
Correct output 468 == 468
Correct output 469 == 469
Correct output 470 == 470
Correct output 471 == 471
Correct output 472 == 472
Correct output 473 == 473
Correct output 474 == 474
Correct output 475 == 475
Correct output 476 == 476
Correct output 477 == 477
Correct output 478 == 478
Correct output 479 == 479
Correct output 480 == 480
Correct output 481 == 481
Correct output 482 == 482
Correct output 483 == 483
Correct output 484 == 484
Correct output 485 == 485
Correct output 486 == 486
Correct output 487 == 487
Correct output 488 == 488
Correct output 489 == 489
Correct output 490 == 490
Correct output 491 == 491
Correct output 492 == 492
Correct output 493 == 493
Correct output 494 == 494
Correct output 495 == 495
Correct output 496 == 496
Correct output 497 == 497
Correct output 498 == 498
Correct output 499 == 499
Correct output 500 == 500
Correct output 501 == 501
Correct output 502 == 502
Correct output 503 == 503
Correct output 504 == 504
Correct output 505 == 505
Correct output 506 == 506
Correct output 507 == 507
Correct output 508 == 508
Correct output 509 == 509
Correct output 510 == 510
Correct output 511 == 511
Correct output 512 == 512
Correct output 513 == 513
Correct output 514 == 514
Correct output 515 == 515
Correct output 516 == 516
Correct output 517 == 517
Correct output 518 == 518
Correct output 519 == 519
Correct output 520 == 520
Correct output 521 == 521
Correct output 522 == 522
Correct output 523 == 523
Correct output 524 == 524
Correct output 525 == 525
Correct output 526 == 526
Correct output 527 == 527
Correct output 528 == 528
Correct output 529 == 529
Correct output 530 == 530
Correct output 531 == 531
Correct output 532 == 532
Correct output 533 == 533
Correct output 534 == 534
Correct output 535 == 535
Correct output 536 == 536
Correct output 537 == 537
Correct output 538 == 538
Correct output 539 == 539
Correct output 540 == 540
Correct output 541 == 541
Correct output 542 == 542
Correct output 543 == 543
Correct output 544 == 544
Correct output 545 == 545
Correct output 546 == 546
Correct output 547 == 547
Correct output 548 == 548
Correct output 549 == 549
Correct output 550 == 550
Correct output 551 == 551
Correct output 552 == 552
Correct output 553 == 553
Correct output 554 == 554
Correct output 555 == 555
Correct output 556 == 556
Correct output 557 == 557
Correct output 558 == 558
Correct output 559 == 559
Correct output 560 == 560
Correct output 561 == 561
Correct output 562 == 562
Correct output 563 == 563
Correct output 564 == 564
Correct output 565 == 565
Correct output 566 == 566
Correct output 567 == 567
Correct output 568 == 568
Correct output 569 == 569
Correct output 570 == 570
Correct output 571 == 571
Correct output 572 == 572
Correct output 573 == 573
Correct output 574 == 574
Correct output 575 == 575
Correct output 576 == 576
Correct output 577 == 577
Correct output 578 == 578
Correct output 579 == 579
Correct output 580 == 580
Correct output 581 == 581
Correct output 582 == 582
Correct output 583 == 583
Correct output 584 == 584
Correct output 585 == 585
Correct output 586 == 586
Correct output 587 == 587
Correct output 588 == 588
Correct output 589 == 589
Correct output 590 == 590
Correct output 591 == 591
Correct output 592 == 592
Correct output 593 == 593
Correct output 594 == 594
Correct output 595 == 595
Correct output 596 == 596
Correct output 597 == 597
Correct output 598 == 598
Correct output 599 == 599
Correct output 600 == 600
Correct output 601 == 601
Correct output 602 == 602
Correct output 603 == 603
Correct output 604 == 604
Correct output 605 == 605
Correct output 606 == 606
Correct output 607 == 607
Correct output 608 == 608
Correct output 609 == 609
Correct output 610 == 610
Correct output 611 == 611
Correct output 612 == 612
Correct output 613 == 613
Correct output 614 == 614
Correct output 615 == 615
Correct output 616 == 616
Correct output 617 == 617
Correct output 618 == 618
Correct output 619 == 619
Correct output 620 == 620
Correct output 621 == 621
Correct output 622 == 622
Correct output 623 == 623
Correct output 624 == 624
Correct output 625 == 625
Correct output 626 == 626
Correct output 627 == 627
Correct output 628 == 628
Correct output 629 == 629
Correct output 630 == 630
Correct output 631 == 631
Correct output 632 == 632
Correct output 633 == 633
Correct output 634 == 634
Correct output 635 == 635
Correct output 636 == 636
Correct output 637 == 637
Correct output 638 == 638
Correct output 639 == 639
Correct output 640 == 640
Correct output 641 == 641
Correct output 642 == 642
Correct output 643 == 643
Correct output 644 == 644
Correct output 645 == 645
Correct output 646 == 646
Correct output 647 == 647
Correct output 648 == 648
Correct output 649 == 649
Correct output 650 == 650
Correct output 651 == 651
Correct output 652 == 652
Correct output 653 == 653
Correct output 654 == 654
Correct output 655 == 655
Correct output 656 == 656
Correct output 657 == 657
Correct output 658 == 658
Correct output 659 == 659
Correct output 660 == 660
Correct output 661 == 661
Correct output 662 == 662
Correct output 663 == 663
Correct output 664 == 664
Correct output 665 == 665
Correct output 666 == 666
Correct output 667 == 667
Correct output 668 == 668
Correct output 669 == 669
Correct output 670 == 670
Correct output 671 == 671
Correct output 672 == 672
Correct output 673 == 673
Correct output 674 == 674
Correct output 675 == 675
Correct output 676 == 676
Correct output 677 == 677
Correct output 678 == 678
Correct output 679 == 679
Correct output 680 == 680
Correct output 681 == 681
Correct output 682 == 682
Correct output 683 == 683
Correct output 684 == 684
Correct output 685 == 685
Correct output 686 == 686
Correct output 687 == 687
Correct output 688 == 688
Correct output 689 == 689
Correct output 690 == 690
Correct output 691 == 691
Correct output 692 == 692
Correct output 693 == 693
Correct output 694 == 694
Correct output 695 == 695
Correct output 696 == 696
Correct output 697 == 697
Correct output 698 == 698
Correct output 699 == 699
Correct output 700 == 700
Correct output 701 == 701
Correct output 702 == 702
Correct output 703 == 703
Correct output 704 == 704
Correct output 705 == 705
Correct output 706 == 706
Correct output 707 == 707
Correct output 708 == 708
Correct output 709 == 709
Correct output 710 == 710
Correct output 711 == 711
Correct output 712 == 712
Correct output 713 == 713
Correct output 714 == 714
Correct output 715 == 715
Correct output 716 == 716
Correct output 717 == 717
Correct output 718 == 718
Correct output 719 == 719
Correct output 720 == 720
Correct output 721 == 721
Correct output 722 == 722
Correct output 723 == 723
Correct output 724 == 724
Correct output 725 == 725
Correct output 726 == 726
Correct output 727 == 727
Correct output 728 == 728
Correct output 729 == 729
Correct output 730 == 730
Correct output 731 == 731
Correct output 732 == 732
Correct output 733 == 733
Correct output 734 == 734
Correct output 735 == 735
Correct output 736 == 736
Correct output 737 == 737
Correct output 738 == 738
Correct output 739 == 739
Correct output 740 == 740
Correct output 741 == 741
Correct output 742 == 742
Correct output 743 == 743
Correct output 744 == 744
Correct output 745 == 745
Correct output 746 == 746
Correct output 747 == 747
Correct output 748 == 748
Correct output 749 == 749
Correct output 750 == 750
Correct output 751 == 751
Correct output 752 == 752
Correct output 753 == 753
Correct output 754 == 754
Correct output 755 == 755
Correct output 756 == 756
Correct output 757 == 757
Correct output 758 == 758
Correct output 759 == 759
Correct output 760 == 760
Correct output 761 == 761
Correct output 762 == 762
Correct output 763 == 763
Correct output 764 == 764
Correct output 765 == 765
Correct output 766 == 766
Correct output 767 == 767
Correct output 768 == 768
Correct output 769 == 769
Correct output 770 == 770
Correct output 771 == 771
Correct output 772 == 772
Correct output 773 == 773
Correct output 774 == 774
Correct output 775 == 775
Correct output 776 == 776
Correct output 777 == 777
Correct output 778 == 778
Correct output 779 == 779
Correct output 780 == 780
Correct output 781 == 781
Correct output 782 == 782
Correct output 783 == 783
Correct output 784 == 784
Correct output 785 == 785
Correct output 786 == 786
Correct output 787 == 787
Correct output 788 == 788
Correct output 789 == 789
Correct output 790 == 790
Correct output 791 == 791
Correct output 792 == 792
Correct output 793 == 793
Correct output 794 == 794
Correct output 795 == 795
Correct output 796 == 796
Correct output 797 == 797
Correct output 798 == 798
Correct output 799 == 799
Correct output 800 == 800
Correct output 801 == 801
Correct output 802 == 802
Correct output 803 == 803
Correct output 804 == 804
Correct output 805 == 805
Correct output 806 == 806
Correct output 807 == 807
Correct output 808 == 808
Correct output 809 == 809
Correct output 810 == 810
Correct output 811 == 811
Correct output 812 == 812
Correct output 813 == 813
Correct output 814 == 814
Correct output 815 == 815
Correct output 816 == 816
Correct output 817 == 817
Correct output 818 == 818
Correct output 819 == 819
Correct output 820 == 820
Correct output 821 == 821
Correct output 822 == 822
Correct output 823 == 823
Correct output 824 == 824
Correct output 825 == 825
Correct output 826 == 826
Correct output 827 == 827
Correct output 828 == 828
Correct output 829 == 829
Correct output 830 == 830
Correct output 831 == 831
Correct output 832 == 832
Correct output 833 == 833
Correct output 834 == 834
Correct output 835 == 835
Correct output 836 == 836
Correct output 837 == 837
Correct output 838 == 838
Correct output 839 == 839
Correct output 840 == 840
Correct output 841 == 841
Correct output 842 == 842
Correct output 843 == 843
Correct output 844 == 844
Correct output 845 == 845
Correct output 846 == 846
Correct output 847 == 847
Correct output 848 == 848
Correct output 849 == 849
Correct output 850 == 850
Correct output 851 == 851
Correct output 852 == 852
Correct output 853 == 853
Correct output 854 == 854
Correct output 855 == 855
Correct output 856 == 856
Correct output 857 == 857
Correct output 858 == 858
Correct output 859 == 859
Correct output 860 == 860
Correct output 861 == 861
Correct output 862 == 862
Correct output 863 == 863
Correct output 864 == 864
Correct output 865 == 865
Correct output 866 == 866
Correct output 867 == 867
Correct output 868 == 868
Correct output 869 == 869
Correct output 870 == 870
Correct output 871 == 871
Correct output 872 == 872
Correct output 873 == 873
Correct output 874 == 874
Correct output 875 == 875
Correct output 876 == 876
Correct output 877 == 877
Correct output 878 == 878
Correct output 879 == 879
Correct output 880 == 880
Correct output 881 == 881
Correct output 882 == 882
Correct output 883 == 883
Correct output 884 == 884
Correct output 885 == 885
Correct output 886 == 886
Correct output 887 == 887
Correct output 888 == 888
Correct output 889 == 889
Correct output 890 == 890
Correct output 891 == 891
Correct output 892 == 892
Correct output 893 == 893
Correct output 894 == 894
Correct output 895 == 895
Correct output 896 == 896
Correct output 897 == 897
Correct output 898 == 898
Correct output 899 == 899
Correct output 900 == 900
Correct output 901 == 901
Correct output 902 == 902
Correct output 903 == 903
Correct output 904 == 904
Correct output 905 == 905
Correct output 906 == 906
Correct output 907 == 907
Correct output 908 == 908
Correct output 909 == 909
Correct output 910 == 910
Correct output 911 == 911
Correct output 912 == 912
Correct output 913 == 913
Correct output 914 == 914
Correct output 915 == 915
Correct output 916 == 916
Correct output 917 == 917
Correct output 918 == 918
Correct output 919 == 919
Correct output 920 == 920
Correct output 921 == 921
Correct output 922 == 922
Correct output 923 == 923
Correct output 924 == 924
Correct output 925 == 925
Correct output 926 == 926
Correct output 927 == 927
Correct output 928 == 928
Correct output 929 == 929
Correct output 930 == 930
Correct output 931 == 931
Correct output 932 == 932
Correct output 933 == 933
Correct output 934 == 934
Correct output 935 == 935
Correct output 936 == 936
Correct output 937 == 937
Correct output 938 == 938
Correct output 939 == 939
Correct output 940 == 940
Correct output 941 == 941
Correct output 942 == 942
Correct output 943 == 943
Correct output 944 == 944
Correct output 945 == 945
Correct output 946 == 946
Correct output 947 == 947
Correct output 948 == 948
Correct output 949 == 949
Correct output 950 == 950
Correct output 951 == 951
Correct output 952 == 952
Correct output 953 == 953
Correct output 954 == 954
Correct output 955 == 955
Correct output 956 == 956
Correct output 957 == 957
Correct output 958 == 958
Correct output 959 == 959
Correct output 960 == 960
Correct output 961 == 961
Correct output 962 == 962
Correct output 963 == 963
Correct output 964 == 964
Correct output 965 == 965
Correct output 966 == 966
Correct output 967 == 967
Correct output 968 == 968
Correct output 969 == 969
Correct output 970 == 970
Correct output 971 == 971
Correct output 972 == 972
Correct output 973 == 973
Correct output 974 == 974
Correct output 975 == 975
Correct output 976 == 976
Correct output 977 == 977
Correct output 978 == 978
Correct output 979 == 979
Correct output 980 == 980
Correct output 981 == 981
Correct output 982 == 982
Correct output 983 == 983
Correct output 984 == 984
Correct output 985 == 985
Correct output 986 == 986
Correct output 987 == 987
Correct output 988 == 988
Correct output 989 == 989
Correct output 990 == 990
Correct output 991 == 991
Correct output 992 == 992
Correct output 993 == 993
Correct output 994 == 994
Correct output 995 == 995
Correct output 996 == 996
Correct output 997 == 997
Correct output 998 == 998
Correct output 999 == 999
Correct output 1000 == 1000
Correct output 1001 == 1001
Correct output 1002 == 1002
Correct output 1003 == 1003
Correct output 1004 == 1004
Correct output 1005 == 1005
Correct output 1006 == 1006
Correct output 1007 == 1007
Correct output 1008 == 1008
Correct output 1009 == 1009
Correct output 1010 == 1010
Correct output 1011 == 1011
Correct output 1012 == 1012
Correct output 1013 == 1013
Correct output 1014 == 1014
Correct output 1015 == 1015
Correct output 1016 == 1016
Correct output 1017 == 1017
Correct output 1018 == 1018
Correct output 1019 == 1019
Correct output 1020 == 1020
Correct output 1021 == 1021
Correct output 1022 == 1022
Correct output 1023 == 1023
Correct output 1024 == 1024
Correct output 1025 == 1025

PASS!

2.2 Vector + Vector Add

2.2.1 Pythonコード

# vector_vector_add/vector_vector_add.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
import sys

from aie.iron import ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU2, XCVC1902
from aie.iron.controlflow import range_

def my_vector_add():
    """Build the element-wise int32 vector-add design and return the resolved program.

    The target device is picked from ``sys.argv[1]`` (``npu``, ``npu2`` or
    ``xcvc1902``). A second command-line argument is demanded by the length
    check, but this function never reads it.

    Returns:
        The value produced by ``Program(...).resolve_program(SequentialPlacer())``.

    Raises:
        ValueError: If the number of command-line arguments is not exactly
            two, or if the device name is not one of the three known names.
    """
    if len(sys.argv) != 3:
        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")

    # Map each accepted device name to the class that models it.
    device_name = sys.argv[1]
    device_classes = {"npu": NPU1Col1, "npu2": NPU2, "xcvc1902": XCVC1902}
    if device_name not in device_classes:
        raise ValueError(f"[ERROR] Device name {device_name} is unknown")
    dev = device_classes[device_name]()

    total_len = 256  # int32 elements in each full vector
    tile_len = 16  # int32 elements carried by one ObjectFifo object
    num_tiles = total_len // tile_len

    # Element/array types for the whole vectors and for one FIFO object.
    int32 = np.dtype[np.int32]
    tensor_ty = np.ndarray[(total_len,), int32]
    tile_ty = np.ndarray[(tile_len,), int32]

    # Two inbound FIFOs (operands) and one outbound FIFO (result).
    of_in1 = ObjectFifo(tile_ty, name="in1")
    of_in2 = ObjectFifo(tile_ty, name="in2")
    of_out = ObjectFifo(tile_ty, name="out")

    def core_body(fifo_a, fifo_b, fifo_sum):
        """Add the two operand tiles element by element, once per tile."""
        for _ in range_(num_tiles):
            lhs = fifo_a.acquire(1)
            rhs = fifo_b.acquire(1)
            total = fifo_sum.acquire(1)
            for idx in range_(tile_len):
                total[idx] = lhs[idx] + rhs[idx]
            fifo_a.release(1)
            fifo_b.release(1)
            fifo_sum.release(1)

    # The worker receives the consumer ends of the inputs and the producer
    # end of the output, in the positional order core_body expects.
    worker = Worker(core_body, fn_args=[of_in1.cons(), of_in2.cons(), of_out.prod()])

    # Host-side sequence: start the worker, feed both operands, drain the sum.
    rt = Runtime()
    with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (vec_a, vec_b, vec_c):
        rt.start(worker)
        rt.fill(of_in1.prod(), vec_a)
        rt.fill(of_in2.prod(), vec_b)
        rt.drain(of_out.cons(), vec_c, wait=True)

    # Placement assigns device resources; the resolved program is returned.
    return Program(dev, rt).resolve_program(SequentialPlacer())

# Build the design as soon as the script runs and write the result to stdout.
module = my_vector_add()
print(module)
Python

2.2.2 実行結果

module {
  aie.device(npu1_1col) {
    %tile_0_2 = aie.tile(0, 2)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.objectfifo @A(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32xi32>> 
    aie.objectfifo @C(%tile_0_2, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32xi32>> 
    aie.objectfifo @B(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32xi32>> 
    %core_0_2 = aie.core(%tile_0_2) {
      %c0 = arith.constant 0 : index
      %c9223372036854775807 = arith.constant 9223372036854775807 : index
      %c1 = arith.constant 1 : index
      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
        %c0_0 = arith.constant 0 : index
        %c32 = arith.constant 32 : index
        %c1_1 = arith.constant 1 : index
        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
          %0 = aie.objectfifo.acquire @A(Consume, 1) : !aie.objectfifosubview<memref<32xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
          %2 = aie.objectfifo.acquire @B(Consume, 1) : !aie.objectfifosubview<memref<32xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
          %4 = aie.objectfifo.acquire @C(Produce, 1) : !aie.objectfifosubview<memref<32xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
          %c0_2 = arith.constant 0 : index
          %c32_3 = arith.constant 32 : index
          %c1_4 = arith.constant 1 : index
          scf.for %arg2 = %c0_2 to %c32_3 step %c1_4 {
            %6 = memref.load %1[%arg2] : memref<32xi32>
            %7 = memref.load %3[%arg2] : memref<32xi32>
            %8 = arith.addi %6, %7 : i32
            memref.store %8, %5[%arg2] : memref<32xi32>
          }
          aie.objectfifo.release @A(Consume, 1)
          aie.objectfifo.release @B(Consume, 1)
          aie.objectfifo.release @C(Produce, 1)
        }
      }
      aie.end
    }
    aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
      %0 = aiex.dma_configure_task_for @A {
        aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @B {
        aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%1)
      %2 = aiex.dma_configure_task_for @C {
        aie.dma_bd(%arg2 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%2)
      aiex.dma_await_task(%2)
      aiex.dma_free_task(%0)
      aiex.dma_free_task(%1)
    }
  }
}
ShellScript

2.2.3 make実行結果

rm -rf _build
mkdir -p _build
cd _build &&  cmake mlir-aie/programming_examples/basic/vector_vector_add -DTARGET_NAME=vector_vector_add
CMake Deprecation Warning at CMakeLists.txt:14 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

-- The C compiler identification is GNU 13.3.0
-- The CXX compiler identification is GNU 13.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/gcc-13 - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/g++-13 - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
CMake Warning (dev) at CMakeLists.txt:45 (find_package):
  Policy CMP0167 is not set: The FindBoost module is removed.  Run "cmake
  --help-policy CMP0167" for policy details.  Use the cmake_policy command to
  set the policy and suppress this warning.

This warning is for project developers.  Use -Wno-dev to suppress it.

-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.83.0/BoostConfig.cmake (found version "1.83.0")
-- Configuring done (0.3s)
-- Generating done (0.0s)
-- Build files have been written to: mlir-aie/programming_examples/basic/vector_vector_add/_build
cd _build &&  cmake --build . --config Release
gmake[1]: Entering directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
gmake[2]: Entering directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
gmake[3]: Entering directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
gmake[3]: Leaving directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
gmake[3]: Entering directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
[ 33%] Building CXX object CMakeFiles/vector_vector_add.dirmlir-aie/runtime_lib/test_lib/test_utils.cpp.o
[ 66%] Building CXX object CMakeFiles/vector_vector_add.dir/test.cpp.o
[100%] Linking CXX executable vector_vector_add
gmake[3]: Leaving directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
[100%] Built target vector_vector_add
gmake[2]: Leaving directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
gmake[1]: Leaving directory 'mlir-aie/programming_examples/basic/vector_vector_add/_build'
cp _build/vector_vector_add vector_vector_add.exe 
mkdir -p build
python3 mlir-aie/programming_examples/basic/vector_vector_add/vector_vector_add.py npu 0 > build/aie.mlir
mkdir -p build
cd build && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host 
        --no-xchesscc --no-xbridge 
        --xclbin-name=final.xclbin --npu-insts-name=insts.bin aie.mlir
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/2 4 WorkersGenerating: mlir-aie/programming_examples/basic/vector_vector_add/build/aie.mlir.prj/aie_cdo_elfs.bin
Generating: mlir-aie/programming_examples/basic/vector_vector_add/build/aie.mlir.prj/aie_cdo_init.bin
Generating: mlir-aie/programming_examples/basic/vector_vector_add/build/aie.mlir.prj/aie_cdo_enable.bin

****** Bootgen v2024.1
  **** Build date : Apr  3 2025-04:13:15
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.

[INFO]   : Bootimage generated successfully

Info: Embedded Metadata section is missing project.platform.device.core element, adding it.
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━  50% -:--:-- 0:00:00 1/2 4 Workers
ShellScript

2.2.4 実行

./vector_vector_add.exe -x build/final.xclbin -i build/insts.bin -k MLIR_AIE

PASS!
ShellScript

3. リダクション演算

3.1 Pythonコード

#!/usr/bin/env python3
# ベクトル総和を 1 つの値に縮約
import numpy as np
from aie.iron import ObjectFifo, Program, Runtime, Worker, Kernel
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1

# Length of the input vector, in int32 elements; also passed to the kernel.
N = 1024
# Target device for this design.
DEV = NPU1Col1()

# Array types: the full input vector and the single-element result.
in_ty = np.ndarray[(N,), np.dtype[np.int32]]
out_ty = np.ndarray[(1,), np.dtype[np.int32]]

# FIFOs: one carries the input vector in, the other carries the sum out.
of_in = ObjectFifo(in_ty, name="vec_in")
of_out = ObjectFifo(out_ty, name="sum_out")

# AIE kernel written in C++ (fast reduction): symbol name, object file to
# link against, and the kernel's argument types (input, output, length).
reduce_add = Kernel("reduce_add_vector", "reduce_add.cc.o", [in_ty, out_ty, np.int32])

def core(f_in, f_out, k):
    """Reduce one input object to one output object by calling kernel ``k``.

    Args:
        f_in: FIFO handle the input object is acquired from and released to.
        f_out: FIFO handle the output object is acquired from and released to.
        k: Callable invoked as ``k(input_object, output_object, N)``.
    """
    # The output slot is claimed before the input slot; keep this order.
    result_slot = f_out.acquire(1)
    vector_slot = f_in.acquire(1)
    # Hand both buffers and the module-level length N to the C++ kernel.
    k(vector_slot, result_slot, N)
    f_in.release(1)
    f_out.release(1)

# Bind `core` to its arguments: consumer end of the input FIFO, producer end
# of the output FIFO, and the external kernel, in that positional order.
worker = Worker(core, fn_args=[of_in.cons(), of_out.prod(), reduce_add])

# Host-side sequence taking the input vector and the one-element result.
rt = Runtime()
with rt.sequence(in_ty, out_ty) as (vec, sum_):
    rt.start(worker)
    rt.fill(of_in.prod(), vec)  # host vector -> input FIFO
    rt.drain(of_out.cons(), sum_, wait=True)  # output FIFO -> host result

# Resolve the program with sequential placement and print the result.
print(Program(DEV, rt).resolve_program(SequentialPlacer()))
Python

3.2 Python実行結果

module {
  aie.device(npu1_1col) {
    %tile_0_2 = aie.tile(0, 2)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.objectfifo @sum_out(%tile_0_2, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1xi32>> 
    aie.objectfifo @vec_in(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    func.func private @reduce_add_vector(memref<1024xi32>, memref<1xi32>, i32)
    %core_0_2 = aie.core(%tile_0_2) {
      %c0 = arith.constant 0 : index
      %c9223372036854775807 = arith.constant 9223372036854775807 : index
      %c1 = arith.constant 1 : index
      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
        %0 = aie.objectfifo.acquire @sum_out(Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
        %2 = aie.objectfifo.acquire @vec_in(Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xi32>> -> memref<1024xi32>
        %c1024_i32 = arith.constant 1024 : i32
        func.call @reduce_add_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> ()
        aie.objectfifo.release @vec_in(Consume, 1)
        aie.objectfifo.release @sum_out(Produce, 1)
      }
      aie.end
    } {link_with = "reduce_add.cc.o"}
    aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1xi32>) {
      %0 = aiex.dma_configure_task_for @vec_in {
        aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @sum_out {
        aie.dma_bd(%arg1 : memref<1xi32>, 0, 1, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%1)
      aiex.dma_await_task(%1)
      aiex.dma_free_task(%0)
    }
  }
}
ShellScript

3.3 make実行

mkdir -p build
python3 mlir-aie/programming_examples/basic/vector_reduce_add/vector_reduce_add.py npu 0 > build/aie.mlir
mkdir -p build
cd build && mlir-aie/ironenv/lib/python3.12/site-packages/llvm-aie/bin/clang++ -O2 -std=c++20 --target=aie2-none-unknown-elf -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body -DNDEBUG -I mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/include   -c mlir-aie/programming_examples/basic/vector_reduce_add/../../../aie_kernels/aie2/reduce_add.cc -o reduce_add.cc.o
mkdir -p build
cd build && aiecc.py --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin 
        --no-xchesscc --no-xbridge --peano mlir-aie/ironenv/lib/python3.12/site-packages/llvm-aie 
                        --aie-generate-npu-insts --npu-insts-name=insts.bin ../build/aie.mlir
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/2 4 WorkersGenerating: mlir-aie/programming_examples/basic/vector_reduce_add/build/aie.mlir.prj/aie_cdo_elfs.bin
Generating: mlir-aie/programming_examples/basic/vector_reduce_add/build/aie.mlir.prj/aie_cdo_init.bin
Generating: mlir-aie/programming_examples/basic/vector_reduce_add/build/aie.mlir.prj/aie_cdo_enable.bin

****** Bootgen v2024.1
  **** Build date : Apr  3 2025-04:13:15
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.

[INFO]   : Bootimage generated successfully

Info: Embedded Metadata section is missing project.platform.device.core element, adding it.
 AIE Compilation: ━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━  50% -:--:-- 0:00:00 1/2 4 Workers
ShellScript

3.4 実行結果

rm -rf _build
mkdir -p _build
cd _build &&  cmake /mlir-aie/programming_examples/basic/vector_reduce_add -DTARGET_NAME=vector_reduce_add
CMake Deprecation Warning at CMakeLists.txt:14 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

-- The C compiler identification is GNU 14.2.0
-- The CXX compiler identification is GNU 14.2.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
CMake Warning (dev) at CMakeLists.txt:40 (find_package):
  Policy CMP0167 is not set: The FindBoost module is removed.  Run "cmake
  --help-policy CMP0167" for policy details.  Use the cmake_policy command to
  set the policy and suppress this warning.

This warning is for project developers.  Use -Wno-dev to suppress it.

-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.83.0/BoostConfig.cmake (found version "1.83.0")
-- Configuring done (0.3s)
-- Generating done (0.0s)
-- Build files have been written to: /mlir-aie/programming_examples/basic/vector_reduce_add/_build
cd _build &&  cmake --build . --config Release
gmake[1]: Entering directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
gmake[2]: Entering directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
gmake[3]: Entering directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
gmake[3]: Leaving directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
gmake[3]: Entering directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
[ 33%] Building CXX object CMakeFiles/vector_reduce_add.dir/mlir-aie/runtime_lib/test_lib/test_utils.cpp.o
[ 66%] Building CXX object CMakeFiles/vector_reduce_add.dir/test.cpp.o
[100%] Linking CXX executable vector_reduce_add
gmake[3]: Leaving directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
[100%] Built target vector_reduce_add
gmake[2]: Leaving directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
gmake[1]: Leaving directory '/mlir-aie/programming_examples/basic/vector_reduce_add/_build'
cp _build/vector_reduce_add vector_reduce_add.exe 
./vector_reduce_add.exe -x build/final.xclbin -i build/insts.bin -k MLIR_AIE

Avg NPU time: 327us.

Min NPU time: 327us.

Max NPU time: 327us.

PASS!
ShellScript

4. シングルバッファ vs ダブルバッファ DMA パターン

Python だけでダブルバッファ DMA を構成する最小例です。
2 つのロック/バッファを交互に使い、転送と演算を完全にオーバーラップします。

4.1 シングルバッファ

4.1.1 Pythonコード

#!/usr/bin/env python3
# -------------------------------------------------------------
# Single-Buffer DMA  : FIFO depth=1(転送と演算を逐次化)
# 機能:DDR → AIE → DDR へ int32 × 4096 要素(16 KB)を memcpy(AI Engine コアは未使用)
# -------------------------------------------------------------
import numpy as np
from aie.iron import ObjectFifo, Runtime, Program
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1

def build():
    """Assemble the single-buffer (depth 1) passthrough design and return the resolved program.

    Data goes from a host buffer into the input FIFO, is forwarded into a
    second FIFO, and is drained back into a host buffer. No Worker is created.

    Returns:
        The value produced by ``Program(...).resolve_program(SequentialPlacer())``.
    """
    target = NPU1Col1()

    # Both sizes are int32 element counts, not byte counts: 4096 elements
    # in total, moved as FIFO objects of 1024 elements each.
    elem_ty = np.dtype[np.int32]
    total_elems, line_elems = 4096, 1024
    whole_ty = np.ndarray[(total_elems,), elem_ty]
    chunk_ty = np.ndarray[(line_elems,), elem_ty]

    # depth=1 asks for a single buffer on the input FIFO.
    # NOTE(review): another listing in this file spells this keyword
    # `default_depth`; confirm which name the installed IRON release accepts.
    inbound = ObjectFifo(chunk_ty, depth=1, name="in")
    # forward() derives the return-path FIFO from the input's consumer end.
    outbound = inbound.cons().forward()

    runtime = Runtime()
    with runtime.sequence(whole_ty, whole_ty) as (host_src, host_dst):
        runtime.fill(inbound.prod(), host_src)
        runtime.drain(outbound.cons(), host_dst, wait=True)

    return Program(target, runtime).resolve_program(SequentialPlacer())

# When run from the command line, print the generated MLIR.
if __name__ == "__main__":
    print(build())
Python

4.1.2 実行結果

module {
  aie.device(npu1_1col) {
    %mem_tile_0_1 = aie.tile(0, 1)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.objectfifo @out(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo.link [@in] -> [@out]([] [0])
    aiex.runtime_sequence @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>) {
      %0 = aiex.dma_configure_task_for @in {
        aie.dma_bd(%arg0 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @out {
        aie.dma_bd(%arg1 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%1)
      aiex.dma_await_task(%1)
      aiex.dma_free_task(%0)
    }
  }
}
ShellScript

4.1.3 make実行

python3 design.py > build.mlir
aiecc.py           build.mlir test.cpp -o host --xclbin build.xclbin
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 MLIR compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 1 Worker/tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link: /mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll:56:28: error: unterminated attribute group
attributes #1 = { nounwind memory(inaccessiblemem: readwrite) "frame-pointer"="all" "no-builtin-memcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
                           ^
/tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link: error:  loading file '/mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll'
Error encountered while running: /tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link /mlir-aie/programming_examples/basic/buffer/single_buffer/build.mlir.prj/input.llchesshack.ll /mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll -S -o /mlir-aie/programming_examples/basic/buffer/single_buffer/build.mlir.prj/input.llchesslinked.ll
 MLIR compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 1 Worker
make: *** [Makefile:18: build.xclbin] Error 1
ShellScript

Vitis周りでエラーが出ているような気がする・・・

4.1.4 実行結果

Skip

4.2 ダブルバッファ

4.2.1 Pythonコード

#!/usr/bin/env python3
# Double-Buffer DMA : FIFO default_depth=2(ping-pong)
import numpy as np
from aie.iron import ObjectFifo, Runtime, Program
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1

def build():
    """Assemble the double-buffer (depth 2) passthrough design and return the resolved program.

    Data goes from a host buffer into the input FIFO, is forwarded into a
    second FIFO named "out", and is drained back into a host buffer. No
    Worker is created.

    Returns:
        The value produced by ``Program(...).resolve_program(SequentialPlacer())``.
    """
    # Sizes are int32 element counts: 4096 in total, 1024 per FIFO object.
    total_elems = 4096
    line_elems = 1024
    target = NPU1Col1()

    elem_ty = np.dtype[np.int32]
    whole_ty = np.ndarray[(total_elems,), elem_ty]
    chunk_ty = np.ndarray[(line_elems,), elem_ty]

    # Two buffers on the input FIFO (ping-pong).
    inbound = ObjectFifo(chunk_ty, default_depth=2, name="in")
    # The original author notes that the forwarded FIFO inherits the depth;
    # that behavior is not visible here -- TODO confirm against the IRON API.
    outbound = inbound.cons().forward(name="out")

    runtime = Runtime()
    with runtime.sequence(whole_ty, whole_ty) as (host_src, host_dst):
        runtime.fill(inbound.prod(), host_src)
        runtime.drain(outbound.cons(), host_dst, wait=True)

    return Program(target, runtime).resolve_program(SequentialPlacer())

# When executed as a script, print whatever build() returns.
if __name__ == "__main__":
    print(build())
Python

4.2.2 実行結果

module {
  aie.device(npu1_1col) {
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    %mem_tile_0_1 = aie.tile(0, 1)
    aie.objectfifo @in(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @out(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo.link [@in] -> [@out]([] [0])
    aiex.runtime_sequence @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>) {
      %0 = aiex.dma_configure_task_for @in {
        aie.dma_bd(%arg0 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @out {
        aie.dma_bd(%arg1 : memref<4096xi32>, 0, 4096, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 4096, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      } {issue_token = true}
      aiex.dma_start_task(%1)
      aiex.dma_await_task(%1)
      aiex.dma_free_task(%0)
    }
  }
}
ShellScript

4.2.3 make実行

python3 design.py > build.mlir
aiecc.py build.mlir test.cpp -o host --xclbin build.xclbin
Found xchesscc at /tools/Xilinx/Vitis/2023.2/aietools
 MLIR compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 1 Worker/tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link: /mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll:56:28: error: unterminated attribute group
attributes #1 = { nounwind memory(inaccessiblemem: readwrite) "frame-pointer"="all" "no-builtin-memcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
                           ^
/tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link: error:  loading file '/mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll'
Error encountered while running: /tools/Xilinx/Vitis/2023.2/aietools/tps/lnx64/target_aie_ml/bin/LNa64bin/chess-llvm-link /mlir-aie/programming_examples/basic/buffer/double_buffer/build.mlir.prj/input.llchesshack.ll /mlir-aie/ironenv/lib/python3.12/site-packages/mlir_aie/aie_runtime_lib/AIE2/chess_intrinsic_wrapper.ll -S -o /mlir-aie/programming_examples/basic/buffer/double_buffer/build.mlir.prj/input.llchesslinked.ll
 MLIR compilation: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   0% -:--:-- 0:00:00 0/1 1 Worker
make: *** [Makefile:16: build.xclbin] Error 1
ShellScript

4.2.4 実行結果

シングルバッファと同じエラーでビルドに失敗したため、実行はスキップ。

5. IRON Python API によるデータフロー記述

上記コード群はすべて IRON API の 3 レイヤで共通化できます。

| レイヤ | 役割 | 主要クラス |
| --- | --- | --- |
| タイル/FIFO 宣言 | 構造 (Topology) | tile(), ObjectFifo() |
| コア本体 | 演算 (Compute) | Worker, Kernel |
| ランタイムシーケンス | データ転送 (Runtime) | Runtime.sequence() |

6. まとめ

IRON API を使った AIE プログラミングの基本を学べた気がするが、使いづらい・・・
わざわざNPU(XDNA)で実行する必要があるのか?という疑問も残る。CUDAや OpenCL で十分な気がする。

Versalに搭載されている AI Engine は、FPGAの拡張機能として使用するのであれば、使いやすいかもしれないが、
XDNA のように、AI用のプロセッサとしての使い方は、あまりメリットがないように思えた。

仮に採用したとしても、AI専用のプロセッサとして使うには開発工数がかかるし、エコシステムもまだ揃っていないので、あまりお勧めできないという印象を受けた。

CUDA強し・・・

タイトルとURLをコピーしました