
IREE::Stream::StreamTransformPassPipeline converts the program into the stream dialect, optimizes how values are encoded, partitions the work into scheduling subgraphs, generates an asynchronous scheduling strategy, and implements the memory planning strategy.

  • buildStreamTensorPassPipeline

    • IREE::Stream::createVerifyInputPass

      Verifies that the program is valid.

    • IREE::Stream::createOutlineConstantsPass

      Converts dense constants inside the module into global dense constants.

      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %cst = arith.constant dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
      %c10 = arith.constant 10 : index
      %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32>
      %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32>
      %2 = flow.tensor.empty : tensor<10xf32>
      %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %cst, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2
      %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32>
      %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view
      return %5 : !hal.buffer_view
      }

      is converted into:

      util.global private @_constant {noinline} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %_constant = util.global.load @_constant : tensor<10xf32>
      %c10 = arith.constant 10 : index
      %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32>
      %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32>
      %2 = flow.tensor.empty : tensor<10xf32>
      %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %_constant, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2
      %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32>
      %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view
      return %5 : !hal.buffer_view
      }
    • addCleanupPatterns

    • IREE::Stream::createConvertToStreamPass

      Converts the IREE::Util, IREE::Flow, and IREE::HAL dialects as well as the std dialect into the IREE::Stream dialect.

      module {
      util.global private @_constant {noinline} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
      flow.executable private @test_dispatch_0 {
      flow.executable.export public @test_dispatch_0_generic_10 workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      flow.return %x, %y, %z : index, index, index
      }
      builtin.module {
      func.func @test_dispatch_0_generic_10(%arg0: !flow.dispatch.tensor<readonly:tensor<10xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<10xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<10xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
      %2 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xf32>> -> tensor<10xf32>
      %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<10xf32>, tensor<10xf32>) outs(%2 : tensor<10xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
      %4 = arith.addf %in, %in_0 : f32
      linalg.yield %4 : f32
      } -> tensor<10xf32>
      flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:tensor<10xf32>>
      return
      }
      }
      }
      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c10 = arith.constant 10 : index
      %_constant = util.global.load @_constant : tensor<10xf32>
      %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32>
      %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32>
      %2 = flow.tensor.empty : tensor<10xf32>
      %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %_constant, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2
      %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32>
      %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view
      return %5 : !hal.buffer_view
      }
      }

      is converted into:

      module {
      util.global private @_constant : !stream.resource<constant>
      util.global private @_constant__size : index
      util.initializer {
      %cst = stream.tensor.constant : tensor<10xf32> in !stream.resource<constant> = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
      %0 = stream.resource.size %cst : !stream.resource<constant>
      util.global.store %cst, @_constant : !stream.resource<constant>
      util.global.store %0, @_constant__size : index
      util.initializer.return
      }
      stream.executable private @test_dispatch_0 {
      stream.executable.export public @test_dispatch_0_generic_10 workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
      }
      builtin.module {
      func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xf32>>
      %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
      %5 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xf32>> -> tensor<10xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<10xf32>, tensor<10xf32>) outs(%5 : tensor<10xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
      %7 = arith.addf %in, %in_0 : f32
      linalg.yield %7 : f32
      } -> tensor<10xf32>
      flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:tensor<10xf32>>
      return
      }
      }
      }
      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c10 = arith.constant 10 : index
      %_constant = util.global.load @_constant : !stream.resource<constant>
      %_constant__size = util.global.load @_constant__size : index
      %0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      %c1 = arith.constant 1 : index
      %c10_0 = arith.constant 10 : index
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10_0]) type(%c553648160_i32) encoding(%c1_i32)
      %1 = stream.tensor.sizeof tensor<1x10xf32> : index
      %2 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%1}
      %3 = stream.async.transfer %2 : !stream.resource<external>{%1} -> !stream.resource<*>{%1}
      %4 = stream.tensor.sizeof tensor<10xf32> : index
      %5 = stream.tensor.clone %3 : tensor<1x10xf32> in !stream.resource<*>{%1} -> tensor<10xf32> in !stream.resource<*>{%4}
      %6 = stream.tensor.sizeof tensor<10xf32> : index
      %empty = stream.tensor.empty : tensor<10xf32> in !stream.resource<*>{%6}
      %c0 = arith.constant 0 : index
      %7 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%5[%c0 to %4 for %4], %0[%c0 to %_constant__size for %_constant__size], %empty[%c0 to %6 for %6]) : (!stream.resource<*>{%4}, !stream.resource<*>{%_constant__size}, !stream.resource<*>{%6}) -> %empty{%6}
      %8 = stream.tensor.sizeof tensor<1x10xf32> : index
      %9 = stream.tensor.clone %7 : tensor<10xf32> in !stream.resource<*>{%6} -> tensor<1x10xf32> in !stream.resource<*>{%8}
      %10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
      %11 = stream.tensor.export %10 : tensor<1x10xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
      return %11 : !hal.buffer_view
      }
      }

      As can be seen, except inside flow.executable, the tensor types in the module are converted into stream.resource plus index values, while the hal.buffer_view type is preserved. A util.global constant whose initial value is a tensor is converted into a stream.resource and an index without initial values, together with a util.initializer that initializes them. util.global.load is converted into util.global.load + stream.async.transfer, hal.tensor.import into stream.tensor.import + stream.async.transfer, hal.tensor.export into stream.async.transfer + stream.tensor.export, and flow.tensor.reshape into stream.tensor.clone. flow.executable becomes stream.executable, the flow.executable.export inside it becomes stream.executable.export, and the arguments of the inner func op change from flow.dispatch.tensor to stream.binding.

    • IREE::Stream::createVerifyLoweringToTensorsPass

      Verifies that the program is valid.

    • addCleanupPatterns

    • IREE::Util::createCombineInitializersPass

      Merges all util.initializer ops into a single one.
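
      For example (a minimal sketch; the globals @_a and @_b and their constant values are illustrative), two separate initializers

      util.initializer {
      %cst = stream.tensor.constant : tensor<4xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<4xf32>
      util.global.store %cst, @_a : !stream.resource<constant>
      util.initializer.return
      }
      util.initializer {
      %cst_0 = stream.tensor.constant : tensor<4xf32> in !stream.resource<constant> = dense<2.000000e+00> : tensor<4xf32>
      util.global.store %cst_0, @_b : !stream.resource<constant>
      util.initializer.return
      }

      are merged into one util.initializer that performs both initializations in order.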

  • buildStreamAsyncPassPipeline

    • IREE::Stream::createEncodeHostTensorsPass

      Its main purpose is to widen tensor element bit widths to the next power of two and align them to byte boundaries: i1~i7 become i8 (1 byte), i9~i15 become i16 (2 bytes), i17~i31 become i32 (4 bytes), and i33~i63 become i64 (8 bytes).

      util.initializer {
      %cst = stream.tensor.constant : tensor<10xi4> in !stream.resource<constant> = dense<[0, 1, 2, 3, 4, 5, 6, 7, -8, -7]> : tensor<10xi4>
      %0 = stream.resource.size %cst : !stream.resource<constant>
      util.global.store %cst, @_constant : !stream.resource<constant>
      util.global.store %0, @_constant__size : index
      util.initializer.return
      }

      is converted into:

      util.initializer {
      %c10 = arith.constant 10 : index
      %cst = stream.async.constant : !stream.resource<constant>{%c10} = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi8>
      util.global.store %cst, @_constant : !stream.resource<constant>
      util.global.store %c10, @_constant__size : index
      util.initializer.return
      }

      The element type of %cst changes from i4 to i8, stream.tensor.constant is converted into stream.async.constant, and %0 = stream.resource.size %cst : !stream.resource<constant> is replaced directly by the constant %c10.

    • IREE::Stream::createEncodeDeviceTensorsPass

      Serves the same purpose as createEncodeHostTensorsPass, except that createEncodeDeviceTensorsPass operates on the ops inside stream.executable.

      builtin.module {
      func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi4>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi4>>
      %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xi4>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi4>> -> tensor<10xi4>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi4>> -> tensor<10xi4>
      %5 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xi4>> -> tensor<10xi4>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<10xi4>, tensor<10xi4>) outs(%5 : tensor<10xi4>) {
      ^bb0(%in: i4, %in_0: i4, %out: i4):
      %7 = arith.addi %in, %in_0 : i4
      linalg.yield %7 : i4
      } -> tensor<10xi4>
      flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xi4> -> !flow.dispatch.tensor<readwrite:tensor<10xi4>>
      return
      }
      }

      is converted into:

      builtin.module {
      func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi8>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi8>>
      %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xi8>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi8>> -> tensor<10xi8>
      %4 = arith.trunci %3 : tensor<10xi8> to tensor<10xi4>
      %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi8>> -> tensor<10xi8>
      %6 = arith.trunci %5 : tensor<10xi8> to tensor<10xi4>
      %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xi8>> -> tensor<10xi8>
      %8 = arith.trunci %7 : tensor<10xi8> to tensor<10xi4>
      %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %6 : tensor<10xi4>, tensor<10xi4>) outs(%8 : tensor<10xi4>) {
      ^bb0(%in: i4, %in_0: i4, %out: i4):
      %11 = arith.addi %in, %in_0 : i4
      linalg.yield %11 : i4
      } -> tensor<10xi4>
      %10 = arith.extui %9 : tensor<10xi4> to tensor<10xi8>
      flow.dispatch.tensor.store %10, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xi8> -> !flow.dispatch.tensor<readwrite:tensor<10xi8>>
      return
      }
      }

      As can be seen, the result type of stream.binding.subspan changes from i4 to i8, and an arith.trunci is inserted after each flow.dispatch.tensor.load to truncate the i8 values back to i4 before they participate in the linalg.generic computation.

    • IREE::Stream::createMaterializeBuiltinsPass

    • addCleanupPatterns

    • IREE::Stream::createMaterializeCopyOnWritePass

      Inserts a copy at write time (copy-on-write) so that in-place updates can be supported efficiently while preserving correct execution semantics.
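
      A minimal sketch of the idea (the op names are from the stream dialect, but the operands, sizes, and exact assembly syntax shown here are simplified and illustrative): when the target of an in-place write still has other uses, a clone is materialized so the write mutates the copy instead.

      // Before: %target is still read later, so updating it in place would
      // clobber a live value.
      %updated = stream.async.update %new, %target[%c0 to %size] : !stream.resource<*>{%size} -> %target as !stream.resource<*>{%size}

      // After: a copy is materialized and the update is retargeted to it.
      %copy = stream.async.clone %target : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
      %updated = stream.async.update %new, %copy[%c0 to %size] : !stream.resource<*>{%size} -> %copy as !stream.resource<*>{%size}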

    • IREE::Stream::createElideAsyncCopiesPass

      Eliminates the unnecessary copies inserted by MaterializeCopyOnWritePass.

    • mlir::createCanonicalizerPass

    • IREE::Stream::createEmplaceAllocationsPass

      Tries to eliminate the stream.async.update op that follows a stream.async.dispatch. When the result of the stream.async.dispatch is not already tied to a value, the target of the stream.async.update can be tied to the dispatch result, so that the stream.async.dispatch writes its result directly into the target.
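
      A minimal sketch of the transformation (the executable name, operands, and sizes are illustrative and the operand syntax is simplified):

      // Before: the dispatch writes a fresh resource whose contents are then
      // copied into %target by stream.async.update.
      %tmp = stream.async.dispatch @ex::@entry[%c1](%input[%c0 to %size for %size]) : (!stream.resource<*>{%size}) -> !stream.resource<*>{%size}
      %result = stream.async.update %tmp, %target[%c0 to %size] : !stream.resource<*>{%size} -> %target as !stream.resource<*>{%size}

      // After: the dispatch result is tied to %target, so the computation is
      // emplaced directly into it and the stream.async.update disappears.
      %result = stream.async.dispatch @ex::@entry[%c1](%input[%c0 to %size for %size], %target[%c0 to %size for %size]) : (!stream.resource<*>{%size}, !stream.resource<*>{%size}) -> %target{%size}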

    • IREE::Stream::createRefineUsagePass

      Determines the lifetime of each stream.resource and infers its resource type. The stream.resource types are:

      • Unknown: stream.resource<*>
      • External: stream.resource<external>, memory owned and managed by the external program
      • Staging: stream.resource<staging>, staging buffers used for uploads/downloads
      • Transient: stream.resource<transient>, transient values that live across streams
      • Variable: stream.resource<variable>, persistent values that live across streams
      • Constant: stream.resource<constant>, immutable values (constants) that persist for the lifetime of the program

      In addition, redundant stream.async.transfer ops are eliminated.

      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c40 = arith.constant 40 : index
      %c0 = arith.constant 0 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10xf32> in !stream.resource<external>{%c40}
      %1 = stream.async.transfer %0 : !stream.resource<external>{%c40} -> !stream.resource<*>{%c40}
      %2 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1[%c0 to %c40 for %c40]) : (!stream.resource<*>{%c40}) -> !stream.resource<*>{%c40}
      %3 = stream.async.transfer %2 : !stream.resource<*>{%c40} -> !stream.resource<external>{%c40}
      %4 = stream.tensor.export %3 : tensor<10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %4 : !hal.buffer_view
      }

      is converted into:

      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c40 = arith.constant 40 : index
      %c0 = arith.constant 0 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10xf32> in !stream.resource<external>{%c40}
      %1 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%0[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}) -> !stream.resource<external>{%c40}
      %2 = stream.tensor.export %1 : tensor<10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %2 : !hal.buffer_view
      }

      As can be seen, !stream.resource<*>{%c40} is refined to !stream.resource<external>{%c40}, and the two stream.async.transfer ops are removed.

    • addCleanupPatterns

    • IREE::Stream::createScheduleExecutionPass

      Uses a heuristic to partition each callable (including util.initializer) into multiple parts for scheduling. Each part forms its own stream.async.execute region, and every stream.async.execute is followed by a stream.timepoint.await op that synchronizes on its result.

      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c40 = arith.constant 40 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      %c1 = arith.constant 1 : index
      %c0 = arith.constant 0 : index
      %_constant = util.global.load @_constant : !stream.resource<constant>
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%c40}
      %1 = stream.async.alloca : !stream.resource<external>{%c40}
      %2 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%0[%c0 to %c40 for %c40], %_constant[%c0 to %c40 for %c40], %1[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}, !stream.resource<constant>{%c40}, !stream.resource<external>{%c40}) -> %1{%c40}
      %3 = stream.tensor.export %2 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %3 : !hal.buffer_view
      }

      is converted into:

      func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c40 = arith.constant 40 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      %c1 = arith.constant 1 : index
      %c0 = arith.constant 0 : index
      %_constant = util.global.load @_constant : !stream.resource<constant>
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%c40}
      %results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} {
      %3 = stream.async.alloca : !stream.resource<external>{%c40}
      %4 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%arg1[%c0 to %c40 for %c40], %arg2[%c0 to %c40 for %c40], %3[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}, !stream.resource<constant>{%c40}, !stream.resource<external>{%c40}) -> %3{%c40}
      stream.yield %4 : !stream.resource<external>{%c40}
      } => !stream.timepoint
      %1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c40}
      %2 = stream.tensor.export %1 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %2 : !hal.buffer_view
      }

      Note: this example contains only one part.

    • IREE::Stream::createScheduleConcurrencyPass

      Further partitions each stream.async.execute into multiple concurrently schedulable regions, each of which forms a stream.async.concurrent op.
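
      A minimal sketch of the resulting nesting (operands, sizes, and dispatch names are elided or illustrative): two dispatches inside one stream.async.execute that do not depend on each other are wrapped in a stream.async.concurrent region and may run concurrently.

      %results, %result_timepoint = stream.async.execute with(...) -> !stream.resource<external>{%c40} {
      %0:2 = stream.async.concurrent with(...) -> (!stream.resource<transient>{%c40}, !stream.resource<transient>{%c40}) {
      %a = stream.async.dispatch @ex::@dispatch_0(...) : (...) -> !stream.resource<transient>{%c40}
      %b = stream.async.dispatch @ex::@dispatch_1(...) : (...) -> !stream.resource<transient>{%c40}
      stream.yield %a, %b : !stream.resource<transient>{%c40}, !stream.resource<transient>{%c40}
      }
      %c = stream.async.dispatch @ex::@dispatch_2(...) : (...) -> !stream.resource<external>{%c40}
      stream.yield %c : !stream.resource<external>{%c40}
      } => !stream.timepoint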

    • IREE::Stream::createPropagateTimepointsPass

      stream.resource 绑定一个 stream.timepoint,在代码中用stream.resource + stream.timepoint 的pair 替换原来的stream.resource,并在需要的地方插入await。

      • util.global

        util.global private @_constant : !stream.resource<constant>

        is converted into:

        util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
        util.global private @_constant : !stream.resource<constant>
      • util.global.load

        %_constant = util.global.load @_constant : !stream.resource<constant>

        is converted into:

        %_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
        %_constant = util.global.load @_constant : !stream.resource<constant>
        %0 = stream.timepoint.await %_constant__timepoint => %_constant : !stream.resource<constant>{%c40}
      • util.global.store

        util.global.store %0, @_constant : !stream.resource<constant>

        is converted into:

        util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
        util.global.store %results, @_constant : !stream.resource<constant>
      • func.func

        func.func @foo(%0: !stream.resource) {
        ...
        }

        is converted into:

        func.func @foo(%t: !stream.timepoint, %0: !stream.resource) {
        %1 = stream.timepoint.await %t, %0
        ...
        }
      • call

        Since an await has already been inserted inside the callee, the redundant await before the call can be removed; after the call, an await on the returned timepoint is inserted instead.

        %1 = stream.timepoint.await %t, %0
        %r = call @foo(%1)

        is converted into:

        %rt, %r = call @foo(%t, %0)
        stream.timepoint.await %rt, %t
      • return

        %1 = stream.timepoint.await %t, %0
        return %1

        is converted into:

        return %t, %0
      • branch

        Moves the await on the arguments into the branch target block.

        %1 = stream.timepoint.await %t, %0
        br ^bb1(%1)

        ^bb1(%b):
        ...

        is converted into:

        br ^bb1(%t, %0)
        ^bb1(%a, %b):
        %1 = stream.timepoint.await %a, %b
      • stream.async.execute

        Associates a stream.timepoint with every input argument that does not yet have one, computes the maximum of the argument timepoints (a join) before the stream.async.execute, and makes the stream.async.execute await that joined timepoint.

        %results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} {
        ...
        }

        is converted into:

        %3 = stream.timepoint.join max(%2, %_constant__timepoint) => !stream.timepoint
        %results, %result_timepoint = stream.async.execute await(%3) => with(%1 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} {
        ...
        }
    • addCleanupPatterns

    • IREE::Stream::createVerifyLoweringToAsyncPass

      Verifies that the program is valid after the lowering-to-async stage.

  • buildStreamCmdPassPipeline

    • IREE::Stream::createScheduleAllocationPass

      • First, all constant ops are aggregated into a single stream.resource.constants op and hoisted out of the region; the results of stream.resource.constants are appended to the region's input arguments (except for constants that were directly yielded).

        %results, %result_timepoint = stream.async.execute with() -> !stream.resource<constant>{%c40} {
        %cst = stream.async.constant : !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
        stream.yield %cst : !stream.resource<constant>{%c40}
        } => !stream.timepoint

        is converted into:

        %results, %result_timepoint = stream.resource.constants :
        !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
        => !stream.timepoint
        %0 = stream.cmd.execute with() {
        } => !stream.timepoint
        %1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint
      • Analyzes the types of the resources in the stream.async.execute region and the aliasing relations between them, and allocates storage uniformly per resource type. Results that are not tied to an input (i.e. not computed in place) are allocated together outside the region as external storage via stream.resource.alloc, and the region then consumes the allocated buffer as a tied result. Intermediate transient resources have their required size computed by stream.resource.pack and are allocated together as transient storage via stream.resource.alloca; a stream.resource.dealloca is inserted after the region to release the temporary allocation.

        func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
        %c8 = arith.constant 8 : index
        %c40 = arith.constant 40 : index
        %c4 = arith.constant 4 : index
        %c0 = arith.constant 0 : index
        %c1 = arith.constant 1 : index
        %c10 = arith.constant 10 : index
        %c553648160_i32 = arith.constant 553648160 : i32
        %c1_i32 = arith.constant 1 : i32
        %c2 = arith.constant 2 : index
        hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32)
        %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8}
        // stream.async.execute
        %results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c8}) -> !stream.resource<external>{%c40} {
        %3 = stream.async.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10](%arg1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<transient>{%c40}
        %4 = stream.async.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1](%3[%c0 to %c40 for %c40]) : (!stream.resource<transient>{%c40}) -> !stream.resource<transient>{%c4}
        %5 = stream.async.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10](%3[%c0 to %c40 for %c40], %4[%c0 to %c4 for %c4]) : (!stream.resource<transient>{%c40}, !stream.resource<transient>{%c4}) -> !stream.resource<transient>{%c40}
        %6 = stream.async.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1](%5[%c0 to %c40 for %c40]) : (!stream.resource<transient>{%c40}) -> !stream.resource<transient>{%c4}
        %7 = stream.async.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10](%5[%c0 to %c40 for %c40], %6[%c0 to %c4 for %c4]) : (!stream.resource<transient>{%c40}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c40}
        stream.yield %7 : !stream.resource<external>{%c40}
        } => !stream.timepoint
        %1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c40}
        %2 = stream.tensor.export %1 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
        return %2 : !hal.buffer_view
        }

        is converted into:

        func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
        %c8 = arith.constant 8 : index
        %c40 = arith.constant 40 : index
        %c4 = arith.constant 4 : index
        %c0 = arith.constant 0 : index
        %c1 = arith.constant 1 : index
        %c10 = arith.constant 10 : index
        %c553648160_i32 = arith.constant 553648160 : i32
        %c1_i32 = arith.constant 1 : i32
        %c2 = arith.constant 2 : index
        hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32)
        %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8}
        %c0_0 = arith.constant 0 : index
        // allocate space for the output resource
        %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40}
        // compute the space needed by the temporary resources
        %2:5 = stream.resource.pack slices({
        [0, 2] = %c40, // [0, 2] is the lifetime interval of one resource, %c40 is its size
        [1, 2] = %c4,
        [2, 4] = %c40,
        [3, 4] = %c4
        }) : index
        // allocate space for the temporary resources
        %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%2#0} => !stream.timepoint
        %3 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%2#0}) {
        stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] {
        ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8},
        wo %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}
        }
        stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] {
        ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0},
        wo %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0}
        }
        stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] {
        ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0},
        ro %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0},
        wo %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}
        }
        stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] {
        ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0},
        wo %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0}
        }
        stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] {
        ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0},
        ro %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0},
        wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40}
        }
        } => !stream.timepoint
        // release the temporary allocation
        %4 = stream.resource.dealloca await(%3) => %result : !stream.resource<transient>{%2#0} => !stream.timepoint
        %5 = stream.timepoint.join max(%4, %3) => !stream.timepoint
        %6 = stream.timepoint.await %5 => %1 : !stream.resource<external>{%c40}
        %7 = stream.tensor.export %6 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
        return %7 : !hal.buffer_view
        }
    • IREE::Stream::createPackConstantsPass

      The results of stream.resource.constants are split by lifetime into Constant and Variable groups, and each group is replaced by a util.buffer.constant.

      util.initializer {
      %c40 = arith.constant 40 : index
      %results, %result_timepoint = stream.resource.constants :
      !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>
      => !stream.timepoint
      %0 = stream.cmd.execute with() {
      } => !stream.timepoint
      %1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint
      util.global.store %results, @_constant : !stream.resource<constant>
      util.global.store %1, @_constant__timepoint : !stream.timepoint
      util.initializer.return
      }

      is converted into:

      util.initializer {
      %c40 = arith.constant 40 : index
      %buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
      dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>,
      dense<0> : vector<24xi8>, // padding (unused data)
      ]>
      %c0 = arith.constant 0 : index
      %c64 = arith.constant 64 : index
      // try to map the buffer directly as the target (!stream.resource<constant>)
      %did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
      %0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
      // if the mapping succeeds, return the mapped result (!stream.resource<constant>) directly
      %4 = stream.timepoint.immediate => !stream.timepoint
      scf.yield %result, %4 : !stream.resource<constant>, !stream.timepoint
      } else {
      // if it cannot be mapped, the buffer is first mapped as a staging buffer (stage), then new space is allocated and the data is copied out of the staging buffer (copy).
      // if the lifetime is Variable, try_map is not needed and this branch (stage + copy) is taken directly.
      %4 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c64}
      %5 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c64}
      %6 = stream.cmd.execute with(%4 as %arg0: !stream.resource<staging>{%c64}, %5 as %arg1: !stream.resource<constant>{%c64}) {
      stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<staging>{%c64} -> !stream.resource<constant>{%c64}
      } => !stream.timepoint
      scf.yield %5, %6 : !stream.resource<constant>, !stream.timepoint
      }
      %1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c40}
      %2 = stream.cmd.execute with() {
      } => !stream.timepoint
      %3 = stream.timepoint.join max(%0#1, %2) => !stream.timepoint
      util.global.store %1, @_constant : !stream.resource<constant>
      util.global.store %3, @_constant__timepoint : !stream.timepoint
      util.initializer.return
      }
    • IREE::Stream::createPackAllocationsPass

      Converts a stream.resource.alloc that covers multiple resources into stream.resource.pack + stream.resource.alloc, and obtains each individual resource through a stream.resource.subview.
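
      A minimal sketch of the rewrite (sizes, offsets, and the lifetime intervals in the slices are illustrative):

      // Before: one alloc requesting storage for two resources at once.
      %a, %b = stream.resource.alloc uninitialized : !stream.resource<transient>{%size_a}, !stream.resource<transient>{%size_b}

      // After: the sizes are packed into a single slab allocation and each
      // resource becomes a subview of that slab at its packed offset.
      %total, %off_a, %off_b = stream.resource.pack slices({
      [0, 0] = %size_a,
      [0, 0] = %size_b
      }) : index
      %slab = stream.resource.alloc uninitialized : !stream.resource<transient>{%total}
      %a = stream.resource.subview %slab[%off_a] : !stream.resource<transient>{%total} -> !stream.resource<transient>{%size_a}
      %b = stream.resource.subview %slab[%off_b] : !stream.resource<transient>{%total} -> !stream.resource<transient>{%size_b}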

    • IREE::Stream::createLayoutSlicesPass

      stream.resource.pack转化为具体的内存复用算法计算过程。

      func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c8 = arith.constant 8 : index
      %c40 = arith.constant 40 : index
      %c4 = arith.constant 4 : index
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      %c2 = arith.constant 2 : index
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8}
      %c0_0 = arith.constant 0 : index
      // allocate space for the output resource
      %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40}
      // compute the space needed by the temporary resources
      %2:5 = stream.resource.pack slices({
      [0, 2] = %c40, // [0, 2] is the lifetime interval of one resource, %c40 is its size
      [1, 2] = %c4,
      [2, 4] = %c40,
      [3, 4] = %c4
      }) : index
      // allocate space for the temporary resources
      %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%2#0} => !stream.timepoint
      %3 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%2#0}) {
      stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] {
      ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8},
      wo %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}
      }
      stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] {
      ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0},
      wo %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0}
      }
      stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] {
      ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0},
      ro %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0},
      wo %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}
      }
      stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] {
      ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0},
      wo %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0}
      }
      stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] {
      ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0},
      ro %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0},
      wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40}
      }
      } => !stream.timepoint
      // release the temporary allocation
      %4 = stream.resource.dealloca await(%3) => %result : !stream.resource<transient>{%2#0} => !stream.timepoint
      %5 = stream.timepoint.join max(%4, %3) => !stream.timepoint
      %6 = stream.timepoint.await %5 => %1 : !stream.resource<external>{%c40}
      %7 = stream.tensor.export %6 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %7 : !hal.buffer_view
      }

      is converted into:

      func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      %c8 = arith.constant 8 : index
      %c40 = arith.constant 40 : index
      %c4 = arith.constant 4 : index
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      %c553648160_i32 = arith.constant 553648160 : i32
      %c1_i32 = arith.constant 1 : i32
      %c2 = arith.constant 2 : index
      hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32)
      %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8}
      %c0_0 = arith.constant 0 : index
      %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40}
      %c0_1 = arith.constant 0 : index
      %c64 = arith.constant 64 : index
      %c64_2 = arith.constant 64 : index
      %c128 = arith.constant 128 : index
      %c128_3 = arith.constant 128 : index
      %c192 = arith.constant 192 : index
      %c192_4 = arith.constant 192 : index
      %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%c192_4} => !stream.timepoint
      %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192_4}) {
      stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] {
      ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8},
      wo %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4}
      }
      stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] {
      ro %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4},
      wo %arg3[%c64_2 for %c4] : !stream.resource<transient>{%c192_4}
      }
      stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] {
      ro %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4},
      ro %arg3[%c64_2 for %c4] : !stream.resource<transient>{%c192_4},
      wo %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4}
      }
      stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] {
      ro %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4},
      wo %arg3[%c0_1 for %c4] : !stream.resource<transient>{%c192_4}
      }
      stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] {
      ro %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4},
      ro %arg3[%c0_1 for %c4] : !stream.resource<transient>{%c192_4},
      wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40}
      }
      } => !stream.timepoint
      %3 = stream.resource.dealloca await(%2) => %result : !stream.resource<transient>{%c192_4} => !stream.timepoint
      %4 = stream.timepoint.join max(%3, %2) => !stream.timepoint
      %5 = stream.timepoint.await %4 => %1 : !stream.resource<external>{%c40}
      %6 = stream.tensor.export %5 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view
      return %6 : !hal.buffer_view
      }
    • IREE::Util::createPropagateSubrangesPass

      Expands each resource into a (resource, size, offset, length) tuple.

      • util.global

        util.global private @_constant : !stream.resource<constant>

        is converted into:

        util.global private @_constant : !stream.resource<constant>
        util.global private @_constant_size : index
        util.global private @_constant_offset : index
        util.global private @_constant_length : index
      • util.global.load

        %0 = util.global.load @foo : !stream.resource

        is converted into:

        %0 = util.global.load @foo : !stream.resource
        %s = util.global.load @foo_size : index
        %o = util.global.load @foo_offset : index
        %l = util.global.load @foo_length : index
        %1 = stream.resource.subview %0[%o] :
        !stream.resource<*>{%s} -> !stream.resource<*>{%l}
      • util.global.store

        %1 = stream.resource.subview %0[%o] :
        !stream.resource<*>{%s} -> !stream.resource<*>{%l}
        util.global.store %1, @foo : !stream.resource

        is converted into:

        util.global.store %0, @foo : !stream.resource // is this semantically correct here?
        util.global.store %s, @foo_size : index
        util.global.store %o, @foo_offset : index
        util.global.store %l, @foo_length : index
      • func.func

        func.func @foo(%0: !stream.resource) {
        ...
        }

        is converted into:

        func.func @foo(%0: !stream.resource, %sz: index, %o: index, %l: index) {
        %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
        ...
        }
      • call

        %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
        %r = call @foo(%1)

        is converted into:

        %r, %rsz, %ro, %rl = call @foo(%0, %sz, %o, %l)
        %2 = stream.resource.subview %r[%ro] : {%rsz} -> {%rl}
      • return

        %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
        return %1

        is converted into:

        return %0, %sz, %o, %l
      • branch

        %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
        br ^bb1(%1)

        ^bb1(%b):
        ...

        is converted into:

        br ^bb1(%0, %sz, %o, %l)

        ^bb1(%a, %b, %c, %d):
        %1 = stream.resource.subview %a[%b] : {%c} -> {%d}
      • cond_branch

    • addCleanupPatterns

    • IREE::Stream::createVerifyLoweringToCmdPass

      Verifies that the program is valid.

  • buildStreamOptimizationPassPipeline

    • addCleanupPatterns

    • mlir::createConvertSCFToCFPass

      Converts structured control flow ops into lower-level, basic-block-based control flow ops.

      func.func @test(%pred: i32, %arg1: tensor<2x10xf32>, %arg2: tensor<2x10xf32>) -> tensor<2x10xf32> {
      %c0 = arith.constant 0 : i32
      %0 = arith.cmpi sgt, %pred, %c0 : i32
      %1 = scf.if %0 -> (tensor<2x10xf32>) {
      %2 = mhlo.add %arg1, %arg2 : tensor<2x10xf32>
      scf.yield %2 : tensor<2x10xf32>
      } else {
      %2 = mhlo.subtract %arg1, %arg2 : tensor<2x10xf32>
      scf.yield %2 : tensor<2x10xf32>
      }
      return %1 : tensor<2x10xf32>
      }

      is converted into:

      func.func @test(%pred: i32, %arg1: tensor<2x10xf32>, %arg2: tensor<2x10xf32>) -> tensor<2x10xf32> {
      %c0 = arith.constant 0 : i32
      %0 = arith.cmpi sgt, %pred, %c0 : i32
      cf.cond_br %0, ^bb1, ^bb2
      ^bb1:
      %2 = mhlo.add %arg1, %arg2 : tensor<2x10xf32>
      cf.br ^bb3(%2 : tensor<2x10xf32>)
      ^bb2:
      %3 = mhlo.subtract %arg1, %arg2 : tensor<2x10xf32>
      cf.br ^bb3(%3 : tensor<2x10xf32>)
      ^bb3(%4: tensor<2x10xf32>):
      return %4 : tensor<2x10xf32>
      }
    • addCleanupPatterns

    • IREE::Stream::createElideTimepointsPass

      Elides waits on timepoints that are already known to have been reached. For example, in

      %timepoint0 = ...
      %timepoint1 = ... await(%timepoint0)
      %timepoint2 = stream.timepoint.join max(%timepoint0, %timepoint1)

      %timepoint0 must already have been reached by the time %timepoint1 is reached, so this can be converted into:

      %timepoint0 = ...
      %timepoint1 = ... await(%timepoint0)
      %timepoint2 = stream.timepoint.join max(%timepoint1)

      After canonicalization this finally becomes:

      %timepoint0 = ...
      %timepoint1 = ... await(%timepoint0)
      %timepoint2 = %timepoint1
    • IREE::Util::createFixedPointIteratorPass

      This pass repeatedly runs a pass pipeline until it reaches a fixed point (no further changes) or hits the maximum iteration count. Here the pipeline consists of the preceding addCleanupPatterns and the createElideTimepointsPass.

    • IREE::Stream::createFuseDispatchBindingsPass

      Fuses the bindings of a dispatch executable based on the resource relations at its stream.cmd.dispatch sites. For example, if two resources of a stream.cmd.dispatch are different ranges of the same base buffer, the offset of each resource from the base address can be computed and the two resources fused into one binding; inside the dispatch executable each fused binding is then sliced out at its offset. By default only read-only resources are fused.

      stream.executable private @predict_dispatch_2 {
      stream.executable.export public @predict_dispatch_2_generic_1x10 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
      }
      builtin.module {
      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
      %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
      %5 = tensor.empty() : tensor<1x10xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<1x10xf32>, tensor<f32>) outs(%5 : tensor<1x10xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
      %7 = arith.subf %in, %in_0 : f32
      %8 = math.exp %7 : f32
      linalg.yield %8 : f32
      } -> tensor<1x10xf32>
      flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>>
      return
      }
      }
      }

      func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      ...
      %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192}) {
      ...
      stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] {
      ro %arg3[%c0 for %c40] : !stream.resource<transient>{%c192},
      ro %arg3[%c64 for %c4] : !stream.resource<transient>{%c192},
      wo %arg3[%c128 for %c40] : !stream.resource<transient>{%c192}
      }
      ...
      }
      }

      is converted into:

      stream.executable private @predict_dispatch_2 {
      stream.executable.export public @predict_dispatch_2_generic_1x10 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
      }
      builtin.module {
      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index) {
      %c0 = arith.constant 0 : index
      %0 = arith.addi %c0, %arg2 : index
      %1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
      %2 = arith.addi %c0, %arg3 : index
      %3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
      %4 = arith.addi %c0, %arg4 : index
      %5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>>
      %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
      %7 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
      %8 = tensor.empty() : tensor<1x10xf32>
      %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6, %7 : tensor<1x10xf32>, tensor<f32>) outs(%8 : tensor<1x10xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
      %10 = arith.subf %in, %in_0 : f32
      %11 = math.exp %10 : f32
      linalg.yield %11 : f32
      } -> tensor<1x10xf32>
      flow.dispatch.tensor.store %9, %5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>>
      return
      }
      }
      }

      func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
      ...
      %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192}) {
      ...
      stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10](%c0, %c64, %c128 : index, index, index) {
      ro %arg3[%c0_0 for %c192] : !stream.resource<transient>{%c192},
      wo %arg3[%c0_0 for %c192] : !stream.resource<transient>{%c192}
      }
      ...
      }
      }

      As can be seen, the resources of stream.cmd.dispatch @predict_dispatch_2 are fused into 2, and the bindings among the arguments of the predict_dispatch_2_generic_1x10 dispatch executable are also reduced to 2, while 3 index arguments carrying the offsets are added; each fused binding is sliced according to its offset.

    • IREE::Stream::createPackDispatchOperandsPass

      Converts scalar/index-typed arguments of dispatch executables into i32 or i64.

      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index) {
      ...
      }

      is converted into:

      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32, %arg4: i32) {
      ...
      }
    • mlir::createCSEPass

    • IREE::Stream::createFoldUniformOperandsPass

      Folds operands that are identical across all call sites of a dispatch executable into the executable itself.

      stream.cmd.dispatch @foo(%c1, %c100 : index, index)
      stream.cmd.dispatch @foo(%c1, %c101 : index, index)
      stream.cmd.dispatch @foo2(%c1, %c101 : index, index)

      is converted into:

      stream.cmd.dispatch @foo(%c100 : index)
      stream.cmd.dispatch @foo(%c101 : index)
      stream.cmd.dispatch @foo2()

      %c1 is inlined into @foo, while %c1 and %c101 are inlined into @foo2.

    • IREE::Stream::createAnnotateDispatchArgumentsPass

      Annotates the arguments of dispatch executables with potential-value and alignment information.

      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding) {
      ...
      }

      is converted into:

      func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
      ...
      }
  • IREE::Stream::createMemoizeChannelsPass

    Finds all stream.channel.default ops. For each one it creates a global, creates the corresponding channel at initialization time and stores it into that global, and finally replaces the stream.channel.default op with a util.global.load of the global.
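
    A minimal sketch of the rewrite (the global name @_channel_default is illustrative):

    // Before: the channel is created at its use site.
    %channel = stream.channel.default : !stream.channel

    // After: the channel is created once at initialization time, stored in a
    // global, and the use site loads it from the global instead.
    util.global private @_channel_default : !stream.channel
    util.initializer {
    %channel = stream.channel.default : !stream.channel
    util.global.store %channel, @_channel_default : !stream.channel
    util.initializer.return
    }
    %channel = util.global.load @_channel_default : !stream.channel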

  • addCleanupPatterns

  • mlir::createSymbolDCEPass
