正如其他人所指出的那样,CUDA设备在硬件上并没有浮点除法指令.相反,它们从分母倒数的一个初始近似值出发,该近似值由单精度特殊功能单元提供.然后对这个近似值与分子的乘积进行迭代求精,直到结果在机器精度范围内与该分数相符.
即使是__ddiv_rn()
内在函数,也会被ptxas编译成同样的指令序列,因此使用它并没有任何区别.
您可以自己检查生成的代码来获得更深入的了解,所用的工具是cuobjdump -sass
,不过除了一份简单的指令列表之外,着色器汇编(shader assembly)并没有公开的官方文档,因此解读起来比较困难.
我将使用下面这个最精简的除法内核作为示例:
// Minimal example kernel: stores the double-precision quotient x / y into *z.
__global__ void div(double x, double y, double *z) { *z = x / y; }
对于计算能力3.5的设备,它会被编译为以下着色器汇编代码:
Function : _Z3divddPd .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" /* 0x08a0109c10801000 */ /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ /*0010*/ MOV R0, c[0x0][0x14c]; /* 0x64c03c00299c0002 */ /*0018*/ MOV32I R2, 0x1; /* 0x74000000009fc00a */ /*0020*/ MOV R8, c[0x0][0x148]; /* 0x64c03c00291c0022 */ /*0028*/ MOV R9, c[0x0][0x14c]; /* 0x64c03c00299c0026 */ /*0030*/ MUFU.RCP64H R3, R0; /* 0x84000000031c000e */ /*0038*/ MOV32I R0, 0x35b7333; /* 0x7401adb9999fc002 */ /* 0x08a080a080a4a4a4 */ /*0048*/ DFMA R4, -R8, R2, c[0x2][0x0]; /* 0x9b880840001c2012 */ /*0050*/ DFMA R4, R4, R4, R4; /* 0xdb801000021c1012 */ /*0058*/ DFMA R4, R4, R2, R2; /* 0xdb800800011c1012 */ /*0060*/ DMUL R6, R4, c[0x0][0x140]; /* 0x64000000281c101a */ /*0068*/ FSETP.GE.AND P0, PT, R0, |c[0x0][0x144]|, PT; /* 0x5db09c00289c001e */ /*0070*/ DFMA R8, -R8, R6, c[0x0][0x140]; /* 0x9b881800281c2022 */ /*0078*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */ /* 0x0880acb0a0ac8010 */ /*0088*/ MOV R3, c[0x0][0x154]; /* 0x64c03c002a9c000e */ /*0090*/ DFMA R4, R8, R4, R6; /* 0xdb801800021c2012 */ /*0098*/ @P0 BRA 0xb8; /* 0x120000000c00003c */ /*00a0*/ FFMA R0, RZ, c[0x0][0x14c], R5; /* 0x4c001400299ffc02 */ /*00a8*/ FSETP.GT.AND P0, PT, |R0|, c[0x2][0x8], PT; /* 0x5da01c40011c021e */ /*00b0*/ @P0 BRA 0xe8; /* 0x120000001800003c */ /*00b8*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */ /* 0x08a1b810b8008010 */ /*00c8*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */ /*00d0*/ MOV R7, c[0x0][0x14c]; /* 0x64c03c00299c001e */ /*00d8*/ MOV R6, c[0x0][0x148]; /* 0x64c03c00291c001a */ /*00e0*/ CAL 0xf8; /* 0x1300000008000100 */ /*00e8*/ ST.E.64 [R2], R4; /* 0xe5800000001c0810 */ /*00f0*/ EXIT; /* 0x18000000001c003c */ /*00f8*/ LOP32I.AND R0, R7, 0x40000000; /* 0x20200000001c1c00 */ /* 0x08a08010a010b010 */ /*0108*/ MOV32I R15, 0x1ff00000; /* 0x740ff800001fc03e */ /*0110*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0xc], PT; /* 0x5b101c40019c001e */ /*0118*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */ 
/*0120*/ SEL R9, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c26 */ /*0128*/ MOV32I R12, 0x1; /* 0x74000000009fc032 */ /*0130*/ DMUL R10, R8, R6; /* 0xe4000000031c202a */ /*0138*/ LOP32I.AND R0, R5, 0x7f800000; /* 0x203fc000001c1400 */ /* 0x08a0108ca01080a0 */ /*0148*/ MUFU.RCP64H R13, R11; /* 0x84000000031c2c36 */ /*0150*/ DFMA R16, -R10, R12, c[0x2][0x0]; /* 0x9b883040001c2842 */ /*0158*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0x14], PT; /* 0x5b101c40029c001e */ /*0160*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */ /*0168*/ DFMA R16, R16, R16, R16; /* 0xdb804000081c4042 */ /*0170*/ SEL R15, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c3e */ /*0178*/ SSY 0x3a0; /* 0x1480000110000000 */ /* 0x08acb4a4a4a4a480 */ /*0188*/ DMUL R14, R14, R4; /* 0xe4000000021c383a */ /*0190*/ DFMA R12, R16, R12, R12; /* 0xdb803000061c4032 */ /*0198*/ DMUL R16, R14, R12; /* 0xe4000000061c3842 */ /*01a0*/ DFMA R10, -R10, R16, R14; /* 0xdb883800081c282a */ /*01a8*/ DFMA R10, R10, R12, R16; /* 0xdb804000061c282a */ /*01b0*/ DSETP.LEU.AND P0, PT, |R10|, RZ, PT; /* 0xdc581c007f9c2a1e */ /*01b8*/ @!P0 BRA 0x1e0; /* 0x120000001020003c */ /* 0x088010b010b8acb4 */ /*01c8*/ DSETP.EQ.AND P0, PT, R10, RZ, PT; /* 0xdc101c007f9c281e */ /*01d0*/ @!P0 BRA 0x358; /* 0x12000000c020003c */ /*01d8*/ DMUL.S R8, R4, R6; /* 0xe4000000035c1022 */ /*01e0*/ ISETP.GT.U32.AND P0, PT, R0, c[0x2][0x18], PT; /* 0x5b401c40031c001e */ /*01e8*/ MOV32I R0, 0x1ff00000; /* 0x740ff800001fc002 */ /*01f0*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */ /*01f8*/ SEL R15, R0, c[0x2][0x10], !P0; /* 0x65002040021c003e */ /* 0x08b4a49c849c849c */ /*0208*/ DMUL R12, R10, R8; /* 0xe4000000041c2832 */ /*0210*/ DMUL R18, R10, R14; /* 0xe4000000071c284a */ /*0218*/ DMUL R10, R12, R14; /* 0xe4000000071c302a */ /*0220*/ DMUL R16, R8, R18; /* 0xe4000000091c2042 */ /*0228*/ DFMA R8, R10, R6, -R4; /* 0xdb901000031c2822 */ /*0230*/ DFMA R12, R16, R6, -R4; /* 0xdb901000031c4032 */ /*0238*/ DSETP.GT.AND P0, PT, |R8|, |R12|, PT; /* 0xdc209c00061c221e */ /* 
0x08b010ac10b010a0 */ /*0248*/ SEL R9, R17, R11, P0; /* 0xe5000000059c4426 */ /*0250*/ FSETP.GTU.AND P1, PT, |R9|, 1.469367938527859385e-39, PT; /* 0xb5e01c00801c263d */ /*0258*/ MOV R11, R9; /* 0xe4c03c00049c002e */ /*0260*/ SEL R8, R16, R10, P0; /* 0xe5000000051c4022 */ /*0268*/ @P1 NOP.S; /* 0x8580000000443c02 */ /*0270*/ FSETP.LT.AND P0, PT, |R5|, 1.5046327690525280102e-36, PT; /* 0xb5881c20001c161d */ /*0278*/ MOV32I R0, 0x3ff00000; /* 0x741ff800001fc002 */ /* 0x0880a48090108c10 */ /*0288*/ MOV R16, RZ; /* 0xe4c03c007f9c0042 */ /*0290*/ SEL R17, R0, c[0x2][0x1c], !P0; /* 0x65002040039c0046 */ /*0298*/ LOP.OR R10, R8, 0x1; /* 0xc2001000009c2029 */ /*02a0*/ LOP.AND R8, R8, -0x2; /* 0xca0003ffff1c2021 */ /*02a8*/ DMUL R4, R16, R4; /* 0xe4000000021c4012 */ /*02b0*/ DMUL R6, R16, R6; /* 0xe4000000031c401a */ /*02b8*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */ /* 0x08b010b010a0b4a4 */ /*02c8*/ DFMA R12, R8, R6, -R4; /* 0xdb901000031c2032 */ /*02d0*/ DSETP.GT.AND P0, PT, |R12|, |R14|, PT; /* 0xdc209c00071c321e */ /*02d8*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */ /*02e0*/ LOP.AND R0, R8, 0x1; /* 0xc2000000009c2001 */ /*02e8*/ IADD R11.CC, R8, -0x1; /* 0xc88403ffff9c202d */ /*02f0*/ ISETP.EQ.U32.AND P0, PT, R0, 0x1, PT; /* 0xb3201c00009c001d */ /*02f8*/ IADD.X R0, R9, -0x1; /* 0xc88043ffff9c2401 */ /* 0x08b4a480a010b010 */ /*0308*/ SEL R10, R11, R8, !P0; /* 0xe5002000041c2c2a */ /*0310*/ @P0 IADD R8.CC, R8, 0x1; /* 0xc084000000802021 */ /*0318*/ SEL R11, R0, R9, !P0; /* 0xe5002000049c002e */ /*0320*/ @P0 IADD.X R9, R9, RZ; /* 0xe08040007f802426 */ /*0328*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */ /*0330*/ DFMA R4, R8, R6, -R4; /* 0xdb901000031c2012 */ /*0338*/ DSETP.GT.AND P0, PT, |R4|, |R14|, PT; /* 0xdc209c00071c121e */ /* 0x08b4acb4a010b810 */ /*0348*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */ /*0350*/ SEL.S R9, R11, R9, P0; /* 0xe500000004dc2c26 */ /*0358*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */ /*0360*/ MUFU.RCP64H R9, R7; /* 
0x84000000031c1c26 */ /*0368*/ DSETP.GT.AND P0, PT, |R8|, RZ, PT; /* 0xdc201c007f9c221e */ /*0370*/ @P0 BRA.U 0x398; /* 0x120000001000023c */ /*0378*/ @!P0 DSETP.NEU.AND P1, PT, |R6|, +INF , PT; /* 0xb4681fff80201a3d */ /* 0x0800b8a010ac0010 */ /*0388*/ @!P0 SEL R9, R7, R9, P1; /* 0xe500040004a01c26 */ /*0390*/ @!P0 SEL R8, R6, RZ, P1; /* 0xe50004007fa01822 */ /*0398*/ DMUL.S R8, R8, R4; /* 0xe4000000025c2022 */ /*03a0*/ MOV R4, R8; /* 0xe4c03c00041c0012 */ /*03a8*/ MOV R5, R9; /* 0xe4c03c00049c0016 */ /*03b0*/ RET; /* 0x19000000001c003c */ /*03b8*/ BRA 0x3b8; /* 0x12007ffffc1c003c */
该MUFU.RCP64H
指令提供了倒数的初始近似值.它对分母(y
)的高32位进行运算,并给出双精度近似值的高32位,因此被性能分析器计入"浮点运算(单精度特殊)"一项.此外还有一条单精度FFMA
指令,它显然被用作某个测试条件的高吞吐量版本,因为该处并不需要全精度.
正如其他人所指出的那样,CUDA设备在硬件上并没有浮点除法指令.相反,它们从分母倒数的一个初始近似值出发,该近似值由单精度特殊功能单元提供.然后对这个近似值与分子的乘积进行迭代求精,直到结果在机器精度范围内与该分数相符.
即使是__ddiv_rn()
内在函数,也会被ptxas编译成同样的指令序列,因此使用它并没有任何区别.
您可以自己检查生成的代码来获得更深入的了解,所用的工具是cuobjdump -sass
,不过除了一份简单的指令列表之外,着色器汇编(shader assembly)并没有公开的官方文档,因此解读起来比较困难.
我将使用下面这个最精简的除法内核作为示例:
// Minimal example kernel: stores the double-precision quotient x / y into *z.
__global__ void div(double x, double y, double *z) { *z = x / y; }
对于计算能力3.5的设备,它会被编译为以下着色器汇编代码:
Function : _Z3divddPd .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" /* 0x08a0109c10801000 */ /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ /*0010*/ MOV R0, c[0x0][0x14c]; /* 0x64c03c00299c0002 */ /*0018*/ MOV32I R2, 0x1; /* 0x74000000009fc00a */ /*0020*/ MOV R8, c[0x0][0x148]; /* 0x64c03c00291c0022 */ /*0028*/ MOV R9, c[0x0][0x14c]; /* 0x64c03c00299c0026 */ /*0030*/ MUFU.RCP64H R3, R0; /* 0x84000000031c000e */ /*0038*/ MOV32I R0, 0x35b7333; /* 0x7401adb9999fc002 */ /* 0x08a080a080a4a4a4 */ /*0048*/ DFMA R4, -R8, R2, c[0x2][0x0]; /* 0x9b880840001c2012 */ /*0050*/ DFMA R4, R4, R4, R4; /* 0xdb801000021c1012 */ /*0058*/ DFMA R4, R4, R2, R2; /* 0xdb800800011c1012 */ /*0060*/ DMUL R6, R4, c[0x0][0x140]; /* 0x64000000281c101a */ /*0068*/ FSETP.GE.AND P0, PT, R0, |c[0x0][0x144]|, PT; /* 0x5db09c00289c001e */ /*0070*/ DFMA R8, -R8, R6, c[0x0][0x140]; /* 0x9b881800281c2022 */ /*0078*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */ /* 0x0880acb0a0ac8010 */ /*0088*/ MOV R3, c[0x0][0x154]; /* 0x64c03c002a9c000e */ /*0090*/ DFMA R4, R8, R4, R6; /* 0xdb801800021c2012 */ /*0098*/ @P0 BRA 0xb8; /* 0x120000000c00003c */ /*00a0*/ FFMA R0, RZ, c[0x0][0x14c], R5; /* 0x4c001400299ffc02 */ /*00a8*/ FSETP.GT.AND P0, PT, |R0|, c[0x2][0x8], PT; /* 0x5da01c40011c021e */ /*00b0*/ @P0 BRA 0xe8; /* 0x120000001800003c */ /*00b8*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */ /* 0x08a1b810b8008010 */ /*00c8*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */ /*00d0*/ MOV R7, c[0x0][0x14c]; /* 0x64c03c00299c001e */ /*00d8*/ MOV R6, c[0x0][0x148]; /* 0x64c03c00291c001a */ /*00e0*/ CAL 0xf8; /* 0x1300000008000100 */ /*00e8*/ ST.E.64 [R2], R4; /* 0xe5800000001c0810 */ /*00f0*/ EXIT; /* 0x18000000001c003c */ /*00f8*/ LOP32I.AND R0, R7, 0x40000000; /* 0x20200000001c1c00 */ /* 0x08a08010a010b010 */ /*0108*/ MOV32I R15, 0x1ff00000; /* 0x740ff800001fc03e */ /*0110*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0xc], PT; /* 0x5b101c40019c001e */ /*0118*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */ 
/*0120*/ SEL R9, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c26 */ /*0128*/ MOV32I R12, 0x1; /* 0x74000000009fc032 */ /*0130*/ DMUL R10, R8, R6; /* 0xe4000000031c202a */ /*0138*/ LOP32I.AND R0, R5, 0x7f800000; /* 0x203fc000001c1400 */ /* 0x08a0108ca01080a0 */ /*0148*/ MUFU.RCP64H R13, R11; /* 0x84000000031c2c36 */ /*0150*/ DFMA R16, -R10, R12, c[0x2][0x0]; /* 0x9b883040001c2842 */ /*0158*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0x14], PT; /* 0x5b101c40029c001e */ /*0160*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */ /*0168*/ DFMA R16, R16, R16, R16; /* 0xdb804000081c4042 */ /*0170*/ SEL R15, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c3e */ /*0178*/ SSY 0x3a0; /* 0x1480000110000000 */ /* 0x08acb4a4a4a4a480 */ /*0188*/ DMUL R14, R14, R4; /* 0xe4000000021c383a */ /*0190*/ DFMA R12, R16, R12, R12; /* 0xdb803000061c4032 */ /*0198*/ DMUL R16, R14, R12; /* 0xe4000000061c3842 */ /*01a0*/ DFMA R10, -R10, R16, R14; /* 0xdb883800081c282a */ /*01a8*/ DFMA R10, R10, R12, R16; /* 0xdb804000061c282a */ /*01b0*/ DSETP.LEU.AND P0, PT, |R10|, RZ, PT; /* 0xdc581c007f9c2a1e */ /*01b8*/ @!P0 BRA 0x1e0; /* 0x120000001020003c */ /* 0x088010b010b8acb4 */ /*01c8*/ DSETP.EQ.AND P0, PT, R10, RZ, PT; /* 0xdc101c007f9c281e */ /*01d0*/ @!P0 BRA 0x358; /* 0x12000000c020003c */ /*01d8*/ DMUL.S R8, R4, R6; /* 0xe4000000035c1022 */ /*01e0*/ ISETP.GT.U32.AND P0, PT, R0, c[0x2][0x18], PT; /* 0x5b401c40031c001e */ /*01e8*/ MOV32I R0, 0x1ff00000; /* 0x740ff800001fc002 */ /*01f0*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */ /*01f8*/ SEL R15, R0, c[0x2][0x10], !P0; /* 0x65002040021c003e */ /* 0x08b4a49c849c849c */ /*0208*/ DMUL R12, R10, R8; /* 0xe4000000041c2832 */ /*0210*/ DMUL R18, R10, R14; /* 0xe4000000071c284a */ /*0218*/ DMUL R10, R12, R14; /* 0xe4000000071c302a */ /*0220*/ DMUL R16, R8, R18; /* 0xe4000000091c2042 */ /*0228*/ DFMA R8, R10, R6, -R4; /* 0xdb901000031c2822 */ /*0230*/ DFMA R12, R16, R6, -R4; /* 0xdb901000031c4032 */ /*0238*/ DSETP.GT.AND P0, PT, |R8|, |R12|, PT; /* 0xdc209c00061c221e */ /* 
0x08b010ac10b010a0 */ /*0248*/ SEL R9, R17, R11, P0; /* 0xe5000000059c4426 */ /*0250*/ FSETP.GTU.AND P1, PT, |R9|, 1.469367938527859385e-39, PT; /* 0xb5e01c00801c263d */ /*0258*/ MOV R11, R9; /* 0xe4c03c00049c002e */ /*0260*/ SEL R8, R16, R10, P0; /* 0xe5000000051c4022 */ /*0268*/ @P1 NOP.S; /* 0x8580000000443c02 */ /*0270*/ FSETP.LT.AND P0, PT, |R5|, 1.5046327690525280102e-36, PT; /* 0xb5881c20001c161d */ /*0278*/ MOV32I R0, 0x3ff00000; /* 0x741ff800001fc002 */ /* 0x0880a48090108c10 */ /*0288*/ MOV R16, RZ; /* 0xe4c03c007f9c0042 */ /*0290*/ SEL R17, R0, c[0x2][0x1c], !P0; /* 0x65002040039c0046 */ /*0298*/ LOP.OR R10, R8, 0x1; /* 0xc2001000009c2029 */ /*02a0*/ LOP.AND R8, R8, -0x2; /* 0xca0003ffff1c2021 */ /*02a8*/ DMUL R4, R16, R4; /* 0xe4000000021c4012 */ /*02b0*/ DMUL R6, R16, R6; /* 0xe4000000031c401a */ /*02b8*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */ /* 0x08b010b010a0b4a4 */ /*02c8*/ DFMA R12, R8, R6, -R4; /* 0xdb901000031c2032 */ /*02d0*/ DSETP.GT.AND P0, PT, |R12|, |R14|, PT; /* 0xdc209c00071c321e */ /*02d8*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */ /*02e0*/ LOP.AND R0, R8, 0x1; /* 0xc2000000009c2001 */ /*02e8*/ IADD R11.CC, R8, -0x1; /* 0xc88403ffff9c202d */ /*02f0*/ ISETP.EQ.U32.AND P0, PT, R0, 0x1, PT; /* 0xb3201c00009c001d */ /*02f8*/ IADD.X R0, R9, -0x1; /* 0xc88043ffff9c2401 */ /* 0x08b4a480a010b010 */ /*0308*/ SEL R10, R11, R8, !P0; /* 0xe5002000041c2c2a */ /*0310*/ @P0 IADD R8.CC, R8, 0x1; /* 0xc084000000802021 */ /*0318*/ SEL R11, R0, R9, !P0; /* 0xe5002000049c002e */ /*0320*/ @P0 IADD.X R9, R9, RZ; /* 0xe08040007f802426 */ /*0328*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */ /*0330*/ DFMA R4, R8, R6, -R4; /* 0xdb901000031c2012 */ /*0338*/ DSETP.GT.AND P0, PT, |R4|, |R14|, PT; /* 0xdc209c00071c121e */ /* 0x08b4acb4a010b810 */ /*0348*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */ /*0350*/ SEL.S R9, R11, R9, P0; /* 0xe500000004dc2c26 */ /*0358*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */ /*0360*/ MUFU.RCP64H R9, R7; /* 
0x84000000031c1c26 */ /*0368*/ DSETP.GT.AND P0, PT, |R8|, RZ, PT; /* 0xdc201c007f9c221e */ /*0370*/ @P0 BRA.U 0x398; /* 0x120000001000023c */ /*0378*/ @!P0 DSETP.NEU.AND P1, PT, |R6|, +INF , PT; /* 0xb4681fff80201a3d */ /* 0x0800b8a010ac0010 */ /*0388*/ @!P0 SEL R9, R7, R9, P1; /* 0xe500040004a01c26 */ /*0390*/ @!P0 SEL R8, R6, RZ, P1; /* 0xe50004007fa01822 */ /*0398*/ DMUL.S R8, R8, R4; /* 0xe4000000025c2022 */ /*03a0*/ MOV R4, R8; /* 0xe4c03c00041c0012 */ /*03a8*/ MOV R5, R9; /* 0xe4c03c00049c0016 */ /*03b0*/ RET; /* 0x19000000001c003c */ /*03b8*/ BRA 0x3b8; /* 0x12007ffffc1c003c */
该MUFU.RCP64H
指令提供了倒数的初始近似值.它对分母(y
)的高32位进行运算,并给出双精度近似值的高32位,因此被性能分析器计入"浮点运算(单精度特殊)"一项.此外还有一条单精度FFMA
指令,它显然被用作某个测试条件的高吞吐量版本,因为该处并不需要全精度.