三个不同AXI IP核的实现的方法_性能的对比及差异的分析

电子说

1.2w人已加入

描述

本文先总结不同AXI IP核的实现的方法,性能的对比,性能差异的分析,可能改进的方面。使用的硬件平台是Zedboard。

不同的AXI总线卷积加速模块的概况

这次实现并逐渐优化了三个版本的卷积加速模块,先简要描述各个版本的主要内容。

版本一

版本一主要是用来测试AXI总线IP核的实现可能。

该模块拥有19个32位寄存器

其中前9个寄存器用来保存需要计算的值

后面9个寄存器用来保存卷积核

在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)

9个寄存器单独赋值,程序中分别向对应地址写入内容,通过总线进行传输。

故乐观的来算,需要10个总线周期可以获取一个输出

可以从驱动的书写简单理解一下:

void Conv_HW(int filter[3][3], int arr[100][100],

int filterW, int filterH, int arrW, int arrH) {

int i, j;

for (i = 2; i 《 filterH + arrH - 3; i++) {

for (j = 2; j 《 filterW + arrW - 3; j++) {

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR, arr[i][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+4, arr[i][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+8, arr[i][j - 2]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+12, arr[i - 1][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+16, arr[i - 1][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+20, arr[i - 1][j - 2]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+24, arr[i - 2][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+28, arr[i - 2][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+32, arr[i - 2][j - 2]);

res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);

}

if (i % 15 == 0)

printf(“=”);

}

}

版本一性能

版本一性能最惨,由于没有时间戳,目测软件计算速度远远快于FPGA核心运算速度。

版本一的改进速度就是引入滑动窗口,能够最大程度减少总线周期。

版本二

版本二引入滑动窗口,和初期设计的概念相同。

IP核

该模块拥有19个32位寄存器

其中前9个寄存器用来保存需要计算的值

后面9个寄存器用来保存卷积核

在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)

三个寄存器滑动赋值,该计算窗口在计算矩阵上滑动 除了冷启动多余两个周期用来预载寄存器,后面的每一个计算只需要四个总线周期

可以通过写的驱动简单理解一下:

void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {

int i, j;

i = 2; j = 2;

for (i = 2; i 《 arrH; i++) {

//pre load

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);

for (j = 2; j 《 arrW; j++) {

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);

res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);

}

}

}

版本二性能

测试样本 500*500的32bit单位的矩阵 计算200次。

软件消耗33.78秒,卷积IP核心40.25秒

这样的结果还是非常不乐观,分析可能有两种限制了IP核的速度。

两个寄存器的乘法LUT太大,无法硬件优化

总线周期太慢太慢

版本三对于这两种可能进行探索。

版本二的FPGA部分核心代码

// Implement memory mapped register select and write logic generation

// The write data is accepted and written to memory mapped registers when

// axi_awready, S_AXI_WVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to

// select byte enables of slave registers while writing.

// These registers are cleared when reset (active low) is applied.

// Slave register write enable is asserted when valid address and data are available

// and the slave is ready to accept the write address and write data.

assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

always @( posedge S_AXI_ACLK )

begin

if ( S_AXI_ARESETN == 1‘b0 )

begin

slv_reg0 《= 0;

slv_reg1 《= 0;

slv_reg2 《= 0;

slv_reg3 《= 0;

slv_reg4 《= 0;

slv_reg5 《= 0;

slv_reg6 《= 0;

slv_reg7 《= 0;

slv_reg8 《= 0;

slv_reg9 《= 0;

slv_reg10 《= 0;

slv_reg11 《= 0;

slv_reg12 《= 0;

slv_reg13 《= 0;

slv_reg14 《= 0;

slv_reg15 《= 0;

slv_reg16 《= 0;

slv_reg17 《= 0;

// slv_reg18 《= 0;

end

else begin

if (slv_reg_wren)

begin

case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )

5’h00:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 0

slv_reg0[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h01:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 1

slv_reg1[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h02:

begin

slv_reg0 《= slv_reg1;

slv_reg1 《= slv_reg2;

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 2

slv_reg2[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

end

5‘h03:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 3

slv_reg3[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h04:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 4

slv_reg4[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h05:

begin

slv_reg3 《= slv_reg4;

slv_reg4 《= slv_reg5;

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 5

slv_reg5[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

end

5’h06:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 6

slv_reg6[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h07:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 7

slv_reg7[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h08:

begin

slv_reg6 《= slv_reg7;

slv_reg7 《= slv_reg8;

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 8

slv_reg8[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

end

5‘h09:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 9

slv_reg9[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h0A:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 10

slv_reg10[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h0B:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 11

slv_reg11[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h0C:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 12

slv_reg12[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h0D:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 13

slv_reg13[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h0E:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 14

slv_reg14[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h0F:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 15

slv_reg15[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5’h10:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 16

slv_reg16[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

5‘h11:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 17

slv_reg17[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

// 5’h12:

// for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

// if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// // Respective byte enables are asserted as per write strobes

// // Slave register 18

// slv_reg18[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

// end

default : begin

slv_reg0 《= slv_reg0;

slv_reg1 《= slv_reg1;

slv_reg2 《= slv_reg2;

slv_reg3 《= slv_reg3;

slv_reg4 《= slv_reg4;

slv_reg5 《= slv_reg5;

slv_reg6 《= slv_reg6;

slv_reg7 《= slv_reg7;

slv_reg8 《= slv_reg8;

slv_reg9 《= slv_reg9;

slv_reg10 《= slv_reg10;

slv_reg11 《= slv_reg11;

slv_reg12 《= slv_reg12;

slv_reg13 《= slv_reg13;

slv_reg14 《= slv_reg14;

slv_reg15 《= slv_reg15;

slv_reg16 《= slv_reg16;

slv_reg17 《= slv_reg17;

end

endcase

end

end

end

// Implement memory mapped register select and read logic generation

// Slave register read enable is asserted when valid address is available

// and the slave is ready to accept the read address.

assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;

always @(*)

begin

// Address decoding for reading registers

case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )

5‘h00 : reg_data_out 《= slv_reg0;

5’h01 : reg_data_out 《= slv_reg1;

5‘h02 : reg_data_out 《= slv_reg2;

5’h03 : reg_data_out 《= slv_reg3;

5‘h04 : reg_data_out 《= slv_reg4;

5’h05 : reg_data_out 《= slv_reg5;

5‘h06 : reg_data_out 《= slv_reg6;

5’h07 : reg_data_out 《= slv_reg7;

5‘h08 : reg_data_out 《= slv_reg8;

5’h09 : reg_data_out 《= slv_reg9;

5‘h0A : reg_data_out 《= slv_reg10;

5’h0B : reg_data_out 《= slv_reg11;

5‘h0C : reg_data_out 《= slv_reg12;

5’h0D : reg_data_out 《= slv_reg13;

5‘h0E : reg_data_out 《= slv_reg14;

5’h0F : reg_data_out 《= slv_reg15;

5‘h10 : reg_data_out 《= slv_reg16;

5’h11 : reg_data_out 《= slv_reg17;

5‘h12 : reg_data_out 《= slv_reg0 * slv_reg9 +

slv_reg1 * slv_reg10 +

slv_reg2 * slv_reg11 +

slv_reg3 * slv_reg12 +

slv_reg4 * slv_reg13 +

slv_reg5 * slv_reg14 +

slv_reg6 * slv_reg15 +

slv_reg7 * slv_reg16 +

slv_reg8 * slv_reg17;

default : reg_data_out 《= 0;

endcase

end

版本三

先尝试生成更小的LUT

该模块拥有19个32位寄存器

其中前9个寄存器用来保存需要计算的值

卷积核固定在Verilog中,用来生成更小的LUT

一个计算只需要四个总线周期

性能测试

仍然软件消耗33秒,卷积IP核心40秒

基本否决是LUT问题。

下面测试AXI总线问题:

假设所有数据均来自于FPGA,无需从总线写入:

void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {

int i, j;

i = 2; j = 2;

for (i = 2; i 《 arrH; i++) {

for (j = 2; j 《 arrW; j++) {

res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);

}

}

}

只需要9.47秒即可完成计算,并传回CPU !!!

总结

至此,基本上可以否决利用AXI传数据的可能,所有需要利用AXI总线传输数据的模块均会被总线周期所连累,在优化了传输后,仍然无法解决该问题。确实需要一个更快的方式来传输数据。

在Altera的NIOS2中,直接利用IO口传输数据,无需总线周期,再因为NIOS II内核没有流水线优化,所以硬件确实比较快。

附1:AXI4 总线的 FPGA 接口部分

先看总线接口:

// Users to add ports here

// User ports ends

// Do not modify the ports beyond this line

// Global Clock Signal

// 全局时钟

input wire S_AXI_ACLK,

// Global Reset Signal. This Signal is Active LOW

// 全局复位信号

input wire S_AXI_ARESETN,

// Write address (issued by master, acceped by Slave)

// 写地址

input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,

// 写地址的保护模式 包括privilege和security level

// Write channel Protection type. This signal indicates the

// privilege and security level of the transaction, and whether

// the transaction is a data access or an instruction access.

input wire [2 : 0] S_AXI_AWPROT,

// 写地址有效信号。为高指示地址有效。

// Write address valid. This signal indicates that the master signaling

// valid write address and control information.

input wire S_AXI_AWVALID,

// 写地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。

// ********** 注意 这里是地址 下面是数据 ********

// Write address ready. This signal indicates that the slave is ready

// to accept an address and associated control signals.

output wire S_AXI_AWREADY,

// 写数据,32位到1024位宽

// 从主设备来的数据 从设备接收

// Write data (issued by master, acceped by Slave)

input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,

// 写字节选通,用于表示更新存储器的字节通道,对于数据总线的每8位数据有一位写选通信号。

// Write strobes. This signal indicates which byte lanes hold

// valid data. There is one write strobe bit for each eight

// bits of the write data bus.

input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,

// 写有效。为高指示数据有效。

// Write valid. This signal indicates that valid write

// data and strobes are available.

input wire S_AXI_WVALID,

// 写准备。为高表示从设备空闲,准备接收数据;为低表示从设备忙。

// Write ready. This signal indicates that the slave

// can accept the write data.

output wire S_AXI_WREADY,

// 写响应。该信号表示写状态,可允许相应的表示为OKAYEXOKAYSLVERRDECERR。

// Write response. This signal indicates the status

// of the write transaction.

output wire [1 : 0] S_AXI_BRESP,

// 写响应有效。为高指示响应数据有效

// Write response valid. This signal indicates that the channel

// is signaling a valid write response.

output wire S_AXI_BVALID,

// 写响应准备。为高表示主设备空闲,准备接收写响应;为低表示主设备忙。

// Response ready. This signal indicates that the master

// can accept a write response.

input wire S_AXI_BREADY,

//

// 读地址。读地址给出突发数据传输的第一个传输地址。

// Read address (issued by master, acceped by Slave)

input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,

// 保护类型,建议值为000。

// Protection type. This signal indicates the privilege

// and security level of the transaction, and whether the

// transaction is a data access or an instruction access.

input wire [2 : 0] S_AXI_ARPROT,

//

// Read address valid. This signal indicates that the channel

// is signaling valid read address and control information.

input wire S_AXI_ARVALID,

// 读地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。

// Read address ready. This signal indicates that the slave is

// ready to accept an address and associated control signals.

output wire S_AXI_ARREADY,

// Read data (issued by slave)

output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,

// Read response. This signal indicates the status of the

// read transfer.

output wire [1 : 0] S_AXI_RRESP,

// Read valid. This signal indicates that the channel is

// signaling the required read data.

output wire S_AXI_RVALID,

// Read ready. This signal indicates that the master can

// accept the read data and response information.

input wire S_AXI_RREADY

);

// AXI4LITE signals

reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr;

reg axi_awready;

reg axi_wready;

reg [1 : 0] axi_bresp;

reg axi_bvalid;

reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr;

reg axi_arready;

reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata;

reg [1 : 0] axi_rresp;

reg axi_rvalid;

其中最为重要的读取总线信号寻址的部分:

assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

always @( posedge S_AXI_ACLK )

begin

if ( S_AXI_ARESETN == 1’b0 )

begin

slv_reg0 《= 0;

slv_reg1 《= 0;

slv_reg2 《= 0;

slv_reg3 《= 0;

slv_reg4 《= 0;

slv_reg5 《= 0;

slv_reg6 《= 0;

slv_reg7 《= 0;

slv_reg8 《= 0;

slv_reg9 《= 0;

end

else begin

if (slv_reg_wren)

begin

// 进行寻址

// 地址寻址 是这么玩的

// 当寄存器是32位的 最后就是 2位 4个Byte ADDR_LSB = 2

// 当寄存器是64位的 最后就是 3位 8个Byte ADDR_LSB = 3

// OPT_MEM_ADDR_BITS 用来寻址寄存器 这里选了十个寄存器 所以这里就是4位

case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )

4‘h0:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

// 只有在对应的Bit位置为1的时候才能开始读取

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 0

slv_reg0[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4’h1:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 1

slv_reg1[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4‘h2:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 2

slv_reg2[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4’h3:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 3

slv_reg3[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4‘h4:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 4

slv_reg4[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4’h5:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 5

slv_reg5[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4‘h6:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 6

slv_reg6[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4’h7:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 7

slv_reg7[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4‘h8:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 8

slv_reg8[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

4’h9:

for ( byte_index = 0; byte_index 《= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )

if ( S_AXI_WSTRB[byte_index] == 1 ) begin

// Respective byte enables are asserted as per write strobes

// Slave register 9

slv_reg9[(byte_index*8) +: 8] 《= S_AXI_WDATA[(byte_index*8) +: 8];

end

default : begin

slv_reg0 《= slv_reg0;

slv_reg1 《= slv_reg1;

slv_reg2 《= slv_reg2;

slv_reg3 《= slv_reg3;

slv_reg4 《= slv_reg4;

slv_reg5 《= slv_reg5;

slv_reg6 《= slv_reg6;

slv_reg7 《= slv_reg7;

slv_reg8 《= slv_reg8;

slv_reg9 《= slv_reg9;

end

endcase

end

end

end

附2:AXI4的测试模块与仿真测试

`timescale 1ns/1ns

module conv_axi_test();

parameter integer C_S00_AXI_DATA_WIDTH = 32;

parameter integer C_S00_AXI_ADDR_WIDTH = 6;

reg s00_axi_aclk;

// 全局复位信号

reg s00_axi_aresetn;

reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_awaddr;

wire [2 : 0] s00_axi_awprot;

reg s00_axi_awvalid;

wire s00_axi_awready;

reg [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_wdata;

reg [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;

reg s00_axi_wvalid;

wire s00_axi_wready;

wire [1 : 0] s00_axi_bresp;

wire s00_axi_bvalid;

wire s00_axi_bready;

reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_araddr;

wire [2 : 0] s00_axi_arprot;

reg s00_axi_arvalid;

wire s00_axi_arready;

wire [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_rdata;

wire [1 : 0] s00_axi_rresp;

wire s00_axi_rvalid;

wire s00_axi_rready;

conv_v1_0_S00_AXI # (

.C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),

.C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)

) conv_v1_0_S00_AXI_inst (

.S_AXI_ACLK(s00_axi_aclk),

.S_AXI_ARESETN(s00_axi_aresetn),

.S_AXI_AWADDR(s00_axi_awaddr),

.S_AXI_AWPROT(s00_axi_awprot),

.S_AXI_AWVALID(s00_axi_awvalid),

.S_AXI_AWREADY(s00_axi_awready),

.S_AXI_WDATA(s00_axi_wdata),

.S_AXI_WSTRB(s00_axi_wstrb),

.S_AXI_WVALID(s00_axi_wvalid),

.S_AXI_WREADY(s00_axi_wready),

.S_AXI_BRESP(s00_axi_bresp),

.S_AXI_BVALID(s00_axi_bvalid),

.S_AXI_BREADY(s00_axi_bready),

.S_AXI_ARADDR(s00_axi_araddr),

.S_AXI_ARPROT(s00_axi_arprot),

.S_AXI_ARVALID(s00_axi_arvalid),

.S_AXI_ARREADY(s00_axi_arready),

.S_AXI_RDATA(s00_axi_rdata),

.S_AXI_RRESP(s00_axi_rresp),

.S_AXI_RVALID(s00_axi_rvalid),

.S_AXI_RREADY(s00_axi_rready)

);

initial

begin:d

integer i;

s00_axi_aclk = 1;

for(i = 0; i《 1000;i++)

begin

#1 s00_axi_aclk = ~ s00_axi_aclk;

end

$finish();

end

initial

begin

s00_axi_aresetn = 0;

s00_axi_arvalid = 0;

#4 s00_axi_aresetn = 1;

s00_axi_awvalid = 1;

s00_axi_wvalid = 1;

s00_axi_awaddr = 0;

s00_axi_wstrb = 4‘b1111;

s00_axi_wdata = 3;

#4 s00_axi_awaddr = 6’b000100;

s00_axi_wdata = 21;

#4 s00_axi_awaddr = 6‘b001000;

s00_axi_wdata = 19;

#4 s00_axi_awaddr = 6’b001100;

s00_axi_wdata = 22;

#4 s00_axi_awaddr = 6‘b010000;

s00_axi_wdata = 20;

#4 s00_axi_awaddr = 6’b010100;

s00_axi_wdata = 13;

#4 s00_axi_awaddr = 6‘b011000;

s00_axi_wdata = 16;

#4 s00_axi_awaddr = 6’b011100;

s00_axi_wdata = 14;

#4 s00_axi_awaddr = 6‘b100000;

s00_axi_wdata = 7;

#4

s00_axi_arvalid = 1;

s00_axi_araddr = 6’b100100;

end

initial

begin

$dumpfile(“test.vcd”);

$dumpvars();

end

endmodule

利用iverilog进行仿真GTKwave显示测试波形如下

IP核

新建IP核如下:

IP核

工程顶层图如下:

IP核

附3:软件驱动

#include

#include “platform.h”

#include “xbasic_types.h”

#include “xparameters.h”

#include “xil_io.h”

#define test_speed

int res[1000][1000];

void delay() {

int i, j, k;

for (i = 0; i 《 1000; i++) {

for (j = 0; j 《 1000; j++) {

for (k = 0; k 《 100; k++)

;

}

}

}

void show_reg() {

int i;

u32 result;

printf(“ ============SHOW REG ================ ”);

for (i = 0; i 《 9; i++) {

result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i);

printf(“Reg %3d : %u ”, i, result);

}

}

void load_kernel(int filter[3][3]) {

UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36;

Xil_Out32(kernel_addr, filter[0][0]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[0][1]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[0][2]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[1][0]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[1][1]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[1][2]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[2][0]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[2][1]);

kernel_addr = kernel_addr + 0x4;

Xil_Out32(kernel_addr, filter[2][2]);

}

void test_set() {

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16);

printf(“1 ”);

show_reg();

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14);

printf(“2 ”);

show_reg();

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7);

printf(“3 ”);

show_reg();

}

void Conv_SW(int filter[3][3], int arr[100][100], int arrW, int arrH) {

int i, j;

i = 2; j = 2;

for (i = 2; i 《 arrH; i++) {

for (j = 2; j 《 arrW;j++){

res[i][j] = 0;

res[i][j] += filter[0][0] * arr[i - 1][j - 1];

res[i][j] += filter[0][1] * arr[i - 1][j];

res[i][j] += filter[0][2] * arr[i - 1][j + 1];

res[i][j] += filter[1][0] * arr[i][j - 1];

res[i][j] += filter[1][1] * arr[i][j];

res[i][j] += filter[1][2] * arr[i][j + 1];

res[i][j] += filter[2][0] * arr[i + 1][j - 1];

res[i][j] += filter[2][1] * arr[i + 1][j];

res[i][j] += filter[2][2] * arr[i + 1][j + 1];

}

}

}

void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {

int i, j;

i = 2; j = 2;

for (i = 2; i 《 arrH; i++) {

//pre load

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);

for (j = 2; j 《 arrW; j++) {

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);

Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);

res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);

}

}

}

int main() {

printf(“HELLO WORLD”);

u32 result;

int filterW = 3;

int filterH = 3;

int arrW = 5;

int arrH = 5;

int resW = filterW + arrW - 1;

int resH = filterH + arrH - 1;

int i, j;

int pFilter[3][3];

int arr[100][100];

UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR;

pFilter[0][0] = 1;

pFilter[0][1] = 3;

pFilter[0][2] = 1;

pFilter[1][0] = 0;

pFilter[1][1] = 5;

pFilter[1][2] = 0;

pFilter[2][0] = 2;

pFilter[2][1] = 1;

pFilter[2][2] = 2;

init_platform();

for (i = 0; i 《 9; i++) {

Xil_Out32(cur_addr, 0);

cur_addr = cur_addr + 4;

}

load_kernel(pFilter);

printf(“Kernel Loaded ”);

#ifdef test_single

test_set();

result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);

printf(“Test Set Result %u”, result);

show_reg();

#endif

#ifdef test_func

srand(10);

arrW = 20;

arrH = 20;

resH = filterH + arrH - 1;

resW = filterW + arrW - 1;

for (i = 0; i 《 arrH; i++) {

for (j = 0; j 《 arrW; j++) {

arr[i][j] = rand() % 20;

}

}

printf(“*********************************************** ”);

printf(“Filter: ”);

for (i = filterH - 1; i 》= 0; i--) {

for (j = filterW - 1; j 》= 0; j--) {

printf(“%d ”, pFilter[i][j]);

}

printf(“ ”);

}

printf(“*********************************************** ”);

printf(“Matrix: ”);

for (i = 0; i 《 arrH; i++) {

for (j = 0; j 《 arrW; j++) {

printf(“%4d ”, arr[i][j]);

}

printf(“ ”);

}

printf(“*********************************************** ”);

printf(“Software Start! ”);

Conv_SW(pFilter, arr, arrW, arrH);

printf(“ Software end! ”);

printf(“*********************************************** ”);

printf(“Result1: ”);

for (i = 0; i 《 resH; i++) {

for (j = 0; j 《 resW; j++) {

printf(“%5d ”, res[i][j]);

}

printf(“ ”);

}

for (i = 0; i 《 resH; i++) {

for (j = 0; j 《 resW; j++) {

res[i][j] = 0;

}

}

printf(“*********************************************** ”);

printf(“HardWare Start! ”);

Conv_HW(pFilter, arr, arrW, arrH);

printf(“ HardWare end!”);

printf(“Result2: ”);

for (i = 0; i 《 resH; i++) {

for (j = 0; j 《 resW; j++) {

printf(“%5d ”, res[i][j]);

}

printf(“ ”);

}

printf(“*********************************************** ”);

#endif

#ifdef test_speed

arrW = 500;

arrH = 500;

resH = filterH + arrH - 1;

resW = filterW + arrW - 1;

printf(“Software Start! ”);

for(i = 0; i《 200;i++) {

Conv_SW(pFilter, arr, arrW, arrH);

}

printf(“ Software end! ”);

printf(“HardWare Start! ”);

for(i = 0; i《 200;i++) {

Conv_HW(pFilter, arr, arrW, arrH);

}

printf(“ HardWare end!”);

cleanup_platform();

#endif

return 0;

}

打开APP阅读更多精彩内容
声明:本文内容及配图由入驻作者撰写或者入驻合作网站授权转载。文章观点仅代表作者本人,不代表电子发烧友网立场。文章及其配图仅供工程师学习之用,如有内容侵权或者其他违规问题,请联系本站处理。 举报投诉

全部0条评论

快来发表一下你的评论吧 !

×
20
完善资料,
赚取积分