Introduction to TVM Operator Inventory (TOPI)

Introduction to TOPI
This is an introduction to the TVM Operator Inventory (TOPI). TOPI provides numpy-style generic operations and schedules with higher abstractions than TVM. This tutorial shows how TOPI saves us from writing boilerplate code in TVM.
from __future__ import absolute_import, print_function

import tvm
import tvm.testing
from tvm import te
from tvm import topi
import numpy as np
Basic example
Let us revisit the row-sum operation (equivalent to B = numpy.sum(A, axis=1)). To compute the row sum of a two-dimensional TVM tensor A, we specify the symbolic operation as well as the schedule.
n = te.var("n")
m = te.var("m")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), "k")
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
s = te.create_schedule(B.op)
To examine the IR code in a human-readable format, we can do this:
print(tvm.lower(s, [A], simple_mode=True))
Output:
primfn(A_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type="auto")}
  buffer_map = {A_1: A} {
  allocate(B: Pointer(global float32), float32, [n]), storage_scope = global;
  for (i: int32, 0, n) {
    B[i] = 0f32
    for (k: int32, 0, m) {
      B[i] = ((float32*)B[i] + (float32*)A_2[((i*stride) + (k*stride_1))])
    }
  }
}
However, for such a common operation we had to define the reduce axis ourselves and write the computation explicitly with te.compute. Imagine how much more detail we would need to provide for a more complicated operation. Fortunately, we can replace those two lines with a simple call to topi.sum, much like numpy.sum:
C = topi.sum(A, axis=1)
ts = te.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))
Output:
primfn(A_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type="auto")}
  buffer_map = {A_1: A} {
  allocate(A_red: Pointer(global float32), float32, [n]), storage_scope = global;
  for (ax0: int32, 0, n) {
    A_red[ax0] = 0f32
    for (k1: int32, 0, m) {
      A_red[ax0] = ((float32*)A_red[ax0] + (float32*)A_2[((ax0*stride) + (k1*stride_1))])
    }
  }
}
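As a quick sanity check, the topi.sum version can also be built for a CPU target and compared against numpy.sum; the llvm target and the concrete (32, 16) input shape below are illustrative assumptions, not part of the original walkthrough.
# Hedged sketch: build the topi.sum schedule for CPU and compare with numpy.sum(axis=1).
func_rowsum = tvm.build(ts, [A, C], "llvm")
cpu_dev = tvm.cpu(0)
a_rows_np = np.random.uniform(size=(32, 16)).astype(A.dtype)
c_nd = tvm.nd.array(np.zeros((32,), dtype=C.dtype), cpu_dev)
func_rowsum(tvm.nd.array(a_rows_np, cpu_dev), c_nd)
tvm.testing.assert_allclose(c_nd.numpy(), a_rows_np.sum(axis=1), rtol=1e-5)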
Numpy-style operator overloading
We can add two tensors with topi.broadcast_add as long as they have correct (broadcastable) shapes. Even shorter, TOPI provides operator overloading for such common operations. For example,
x, y = 100, 10
a = te.placeholder((x, y, y), name="a")
b = te.placeholder((y, y), name="b")
c = a + b # same as topi.broadcast_add
d = a * b # same as topi.broadcast_mul
Overloaded with the same syntax, TOPI also handles broadcasting a primitive (int, float) against a tensor, e.g. d - 3.14.
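For instance, a minimal illustrative sketch (the name d_shift is hypothetical and not used later in this tutorial):
# The scalar 3.14 is broadcast over every element of d, just like topi.subtract(d, 3.14).
d_shift = d - 3.14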
Generic schedules and fusing operations
So far we have seen how TOPI saves us from writing explicit computations in the lower-level API. Scheduling still works as before, but TOPI also provides higher-level scheduling recipes depending on the given context. For CUDA, for example, we can schedule the following series of operations ending in topi.sum using only topi.cuda.schedule_reduce:
e = topi.elemwise_sum([c, d])
f = e / 2.0
g = topi.sum(f)
with tvm.target.cuda():
    sg = topi.cuda.schedule_reduce(g)
    print(tvm.lower(sg, [a, b], simple_mode=True))
Output:
primfn(a_1: handle, b_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {b: Buffer(b_2: Pointer(float32), float32, [10, 10], []),
             a: Buffer(a_2: Pointer(float32), float32, [100, 10, 10], [])}
  buffer_map = {a_1: a, b_1: b} {
  allocate(T_divide_red: Pointer(global float32), float32, [1]), storage_scope = global;
  attr [IterVar(threadIdx.x: int32, [0:1024], "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
  allocate(T_divide_red.rf: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local {
    T_divide_red.rf[0] = 0f32
    for (k0.k1.fused.k2.fused.outer: int32, 0, 10) {
      if @tir.likely((((((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000) && (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000)) && (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000)), dtype=bool) {
        T_divide_red.rf[0] = ((float32*)T_divide_red.rf[0] + ((((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)] + (float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]) + ((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)]*(float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]))*0.5f32))
      }
    }
    attr [meta[tir.CommReducer][0]] "reduce_scope" = @tir.reinterpret(0u64, dtype=handle);
    @tir.tvm_thread_allreduce(1u32, (float32*)T_divide_red.rf[0], True, reduce_temp0, threadIdx.x, dtype=handle)
    if (threadIdx.x == 0) {
      T_divide_red[0] = (float32*)reduce_temp0[0]
    }
  }
}
As you can see, the scheduled stages of computation have been accumulated, and we can examine them as follows:
print(sg.stages)
Output:
[stage(a, placeholder(a, 0xd9c0fa00)), stage(b, placeholder(b, 0xe225cf70)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_elemwise_sum, compute(T_elemwise_sum, body=[(T_add[ax0, ax1, ax2] + T_multiply[ax0, ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide, compute(T_divide, body=[(T_elemwise_sum[ax0, ax1, ax2]/2f)], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide_red.rf, compute(T_divide_red.rf, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide[floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10)]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], where=tir.likely((((floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10) < 100) && (floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10) < 1000)) && ((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)) < 10000))), value_index=0)], axis=[iter_var(k0.k1.fused.k2.fused.inner, range(min=0, ext=1024))], reduce_axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], tag=, attrs={})), stage(T_divide_red, compute(T_divide_red.repl, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide_red.rf[k0.k1.fused.k2.fused.inner.v]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], where=(bool)1, value_index=0)], axis=[], reduce_axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], tag=, attrs={}))]
We can test the correctness by comparing with the numpy result as follows:
func = tvm.build(sg, [a, b, g], "cuda")
dev = tvm.cuda(0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, dev)
b_nd = tvm.nd.array(b_np, dev)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
func(a_nd, b_nd, g_nd)
tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
TOPI also provides common neural network operations such as softmax, with optimized schedules:
tarray = te.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.Target("cuda"):
    sst = topi.cuda.schedule_softmax(softmax_topi)
    print(tvm.lower(sst, [tarray], simple_mode=True))
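As with the reduction example above, the scheduled softmax can be checked against a plain numpy reference. The sketch below is illustrative: it assumes the CUDA device dev created earlier and writes the reference softmax inline (topi.nn.softmax reduces over the last axis by default).
# Hedged sketch: build the scheduled softmax for CUDA and compare against a numpy reference.
softmax_func = tvm.build(sst, [tarray, softmax_topi], "cuda")
t_np = np.random.uniform(size=(512, 512)).astype(tarray.dtype)
e_np = np.exp(t_np - t_np.max(axis=1, keepdims=True))  # numerically stable softmax
ref_np = e_np / e_np.sum(axis=1, keepdims=True)
t_nd = tvm.nd.array(t_np, dev)
out_nd = tvm.nd.array(np.zeros((512, 512), dtype=softmax_topi.dtype), dev)
softmax_func(t_nd, out_nd)
tvm.testing.assert_allclose(out_nd.numpy(), ref_np, rtol=1e-5)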
