// Copyright (c) 2021 Horizon Robotics.All Rights Reserved.
//
// The material in this file is confidential and contains trade secrets
// of Horizon Robotics Inc. This is proprietary information owned by
// Horizon Robotics Inc. No part of this work may be disclosed,
// reproduced, copied, transmitted, or used in any way for any purpose,
// without the express written permission of Horizon Robotics Inc.
#include "averagepool.h"
#include
#include "layer_common.h"
#include "pooling_common.h"
#include "util/common.h"
namespace hobot {
namespace dnn {
// Expands the project's DEFINE_AND_REGISTER_LAYER_CREATOR macro with the
// AveragePool layer name.
// NOTE(review): the macro is defined outside this file; judging by its name it
// defines a creator for the AveragePool layer and registers it -- confirm
// against the macro definition.
DEFINE_AND_REGISTER_LAYER_CREATOR(AveragePool)
// Flattens the five-part index (i0, i1, i2, i3, i4) into a single linear
// element offset for a densely packed buffer in which i4 is the
// fastest-varying index and whose per-axis extents are taken from `shape`.
// Only shape[1]..shape[4] are read; shape[0] does not affect the result.
// The indices are not range-checked.
// NOTE(review): presumably the axes are (batch, channel, depth, height,
// width) -- confirm against callers.
static inline int32_t AveragePool3DOffset(TShape const &shape, int32_t i0,
                                          int32_t i1, int32_t i2, int32_t i3,
                                          int32_t i4) {
  // Horner-style accumulation: scale the running offset by the next axis
  // extent, then add that axis' index.
  auto linear = i0 * shape[1] + i1;
  linear = linear * shape[2] + i2;
  linear = linear * shape[3] + i3;
  linear = linear * shape[4] + i4;
  return linear;
}
template
static void AveragePool3D_float32(DType const *input_data, DType *output_data,
//static inline void AveragePool3D(DType const *input_data, DType *output_data,
TShape const &ishape, TShape const &oshape,
std::vector &kernel_shape,
std::vector &strides,
std::vector &pads,
bool count_include_pad) {
if (ishape[kDim0] != oshape[kDim0]) {
DNN_LOGE(TAG_LAYER, "input[0] is not equal output[0]");
}
if (ishape[kDim1] != oshape[kDim1]) {
DNN_LOGE(TAG_LAYER, "input[1] is not equal output[1]");
}
if (pads[0] != pads[3]) {
DNN_LOGE(TAG_LAYER, "pads[0] is not equal pads[3]");
}
if (pads[1] != pads[4]) {
DNN_LOGE(TAG_LAYER, "pads[1] is not equal pads[4]");
}
if (pads[2] != pads[5]) {
DNN_LOGE(TAG_LAYER, "pads[2] is not equal pads[5]");
}
const int32_t batches = static_cast(ishape[kDim0]);
const int32_t channels = static_cast(ishape[kDim1]);
const int32_t in_spatial_dim_1{static_cast(ishape[kDim2])};
const int32_t in_spatial_dim_2{static_cast(ishape[kDim3])};
const int32_t in_spatial_dim_3{static_cast(ishape[kDim4])};
const int32_t out_spatial_dim_1{static_cast(oshape[kDim2])};
const int32_t out_spatial_dim_2{static_cast(oshape[kDim3])};
const int32_t out_spatial_dim_3{static_cast(oshape[kDim4])};
const int32_t stride_spatial_dim_1{strides[0]};
const int32_t stride_spatial_dim_2{strides[1]};
const int32_t stride_spatial_dim_3{strides[2]};
const int32_t filter_spatial_dim_1{kernel_shape[0]};
const int32_t filter_spatial_dim_2{kernel_shape[1]};
const int32_t filter_spatial_dim_3{kernel_shape[2]};
const int32_t padding_spatial_dim_1{pads[0]};
const int32_t padding_spatial_dim_2{pads[1]};
const int32_t padding_spatial_dim_3{pads[2]};
//RVV
if (((padding_spatial_dim_1 == 0) && (padding_spatial_dim_2 == 0)) && (padding_spatial_dim_3 == 0)) {
if (((filter_spatial_dim_1 == 2) && (filter_spatial_dim_1 == 2)) && (filter_spatial_dim_1 == 2)) {
// in_spatial_dim_1 % 2 == 0 && in_spatial_dim_2 % 2 == 0 && in_spatial_dim_3 % 2 == 0 (also != 0)
if (((stride_spatial_dim_1 == filter_spatial_dim_1) && (stride_spatial_dim_2 == filter_spatial_dim_2)) && (stride_spatial_dim_3 == filter_spatial_dim_3)) {
size_t vl;
float_t mid_data_list[batches][channels][in_spatial_dim_1][out_spatial_dim_2][out_spatial_dim_3];
float_t *mid_data = (float *)mid_data_list;
for (int32_t batch = 0; batch < batches; ++batch) {
for (int32_t channel = 0; channel < channels; ++channel) {
for (int32_t in_dim1{0}; in_dim1 < in_spatial_dim_1; in_dim1++) {
float_t *out_ptr = mid_data + in_dim1 * out_spatial_dim_2 * out_spatial_dim_3;
for (int32_t out_dim2{0}; out_dim2 < out_spatial_dim_2; out_dim2++) {
const float_t *line0 = input_data + in_dim1 * in_spatial_dim_2 * in_spatial_dim_3 + out_dim2 * in_spatial_dim_3 * 2;
const float_t *line1 = line0 + in_spatial_dim_3;
int32_t w = out_spatial_dim_3;
while (w > 0) {
vl = vsetvl_e32m2(w);
vfloat32m2_t vline0_seg1, vline0_seg2;
vfloat32m2_t vline1_seg1, vline1_seg2;
vlseg2e32_v_f32m2(&vline0_seg1, &vline0_seg2, line0, vl);
vlseg2e32_v_f32m2(&vline1_seg1, &vline1_seg2, line1, vl);
vfloat32m2_t vsum0 = vfadd_vv_f32m2(vline0_seg1, vline0_seg2, vl);
vfloat32m2_t vsum1 = vfadd_vv_f32m2(vline1_seg1, vline1_seg2, vl);
vfloat32m2_t vsum = vfadd_vv_f32m2(vsum0, vsum1, vl);
vfloat32m2_t vavg = vfmul_vf_f32m2(vsum, 0.25f, vl);
vse32_v_f32m2(out_ptr, vavg, vl);
w -= vl;
out_ptr += vl;
line0 += 2 * vl;
line1 += 2 * vl;
}
//line0 += in_spatial_dim_3;
//line1 += in_spatial_dim_3;
}
}
input_data += in_spatial_dim_1 * in_spatial_dim_2 * in_spatial_dim_3;
int32_t hw = out_spatial_dim_2 * out_spatial_dim_3;
for (int32_t out_dim1{0}; out_dim1 < out_spatial_dim_1; out_dim1++) {
const float_t *line0 = mid_data + out_dim1 * out_spatial_dim_2 * out_spatial_dim_3 * 2;
const float_t *line1 = line0 + out_spatial_dim_2 * out_spatial_dim_3;
for (int32_t i{0}; i < hw; i += vl) {
vl = vsetvl_e32m2(hw - i);
vfloat32m2_t vdim1_0 = vle32_v_f32m2(line0 + i, vl);
vfloat32m2_t vdim1_1 = vle32_v_f32m2(line1 + i, vl);
vfloat32m2_t vsum = vfadd_vv_f32m2(vdim1_0, vdim1_1, vl);
vfloat32m2_t vavg = vfmul_vf_f32m2(vsum, 0.5f, vl);
vse32_v_f32m2(output_data + i, vavg, vl);
}
//line0 += 2 * out_spatial_dim_2 * out_spatial_dim_3;
//line1 += 2 * out_spatial_dim_2 * out_spatial_dim_3;
output_data += out_spatial_dim_2 * out_spatial_dim_3;
}
mid_data += in_spatial_dim_1 * out_spatial_dim_2 * out_spatial_dim_3;
} // channel loop
// input_data += channels * in_spatial_dim_1 * in_spatial_dim_2 * in_spatial_dim_3;
// mid_data += channels * in_spatial_dim_1 * out_spatial_dim_2 * out_spatial_dim_3;
} // batch loop
} // stride == kernel loop
else {
// kernel = 2 && stride = 1
size_t vl;
float_t mid_data_list[batches][channels][in_spatial_dim_1][out_spatial_dim_2][out_spatial_dim_3];
float_t *mid_data = (float *)mid_data_list;
for (int32_t batch = 0; batch < batches; ++batch) {
for (int32_t channel = 0; channel < channels; ++channel) {
for (int32_t in_dim1{0}; in_dim1 < in_spatial_dim_1; in_dim1++) {
const float_t *line0 = input_data + in_dim1 * in_spatial_dim_2 * in_spatial_dim_3;
const float_t *line1 = line0 + in_spatial_dim_3;
float_t *out_ptr = mid_data + in_dim1 * out_spatial_dim_2 * out_spatial_dim_3;
for (int32_t out_dim2{0}; out_dim2 < out_spatial_dim_2; out_dim2++) {
int32_t w = out_spatial_dim_3;
for (int32_t i{0}; i < w; i += vl) {
vl = vsetvl_e32m2(w - i);
vfloat32m2_t vline0_seg1 = vle32_v_f32m2(line0 + i, vl);
vfloat32m2_t vline0_seg2 = vle32_v_f32m2(line0 + i + 1, vl);
vfloat32m2_t vline1_seg1 = vle32_v_f32m2(line1 + i, vl);
vfloat32m2_t vline1_seg2 = vle32_v_f32m2(line1 + i + 1, vl); ;
vfloat32m2_t vsum0 = vfadd_vv_f32m2(vline0_seg1, vline0_seg2, vl);
vfloat32m2_t vsum1 = vfadd_vv_f32m2(vline1_seg1, vline1_seg2, vl);
vfloat32m2_t vsum = vfadd_vv_f32m2(vsum0, vsum1, vl);
vfloat32m2_t vavg = vfmul_vf_f32m2(vsum, 0.25f, vl);
vse32_v_f32m2(out_ptr + i, vavg, vl);
}
line0 += in_spatial_dim_3;
line1 += in_spatial_dim_3;
out_ptr += out_spatial_dim_3;
}
}
input_data += in_spatial_dim_1 * in_spatial_dim_2 * in_spatial_dim_3;