Program Listing for File SE3Tracker_NEON.cpp

Return to documentation for file (lib/Tracking/SE3Tracker_NEON.cpp)

#if defined(ENABLE_NEON)

namespace lsd_slam {

    float SE3Tracker::calcResidualAndBuffersNEON(
            const Eigen::Vector3f* refPoint,
            const Eigen::Vector2f* refColVar,
            int* idxBuf,
            int refNum,
            const std::shared_ptr<Frame> &frame,
            const Sophus::SE3f& referenceToFrame,
            int level,
            bool plotResidual)
    {
        return calcResidualAndBuffers(refPoint, refColVar, idxBuf, refNum, frame, referenceToFrame, level, plotResidual);
    }

    float SE3Tracker::calcWeightsAndResidualNEON(
            const Sophus::SE3f& referenceToFrame)
    {
        float tx = referenceToFrame.translation()[0];
        float ty = referenceToFrame.translation()[1];
        float tz = referenceToFrame.translation()[2];


        float constants[] = {
            tx, ty, tz, settings.var_weight,
            cameraPixelNoise2, settings.huber_d/2, -1, -1 // last values are currently unused
        };
        // This could also become a constant if one register could be made free for it somehow
        float cutoff_res_ponly4[4] = {10000, 10000, 10000, 10000}; // removed
        float* cur_buf_warped_z = buf_warped_z;
        float* cur_buf_warped_x = buf_warped_x;
        float* cur_buf_warped_y = buf_warped_y;
        float* cur_buf_warped_dx = buf_warped_dx;
        float* cur_buf_warped_dy = buf_warped_dy;
        float* cur_buf_warped_residual = buf_warped_residual;
        float* cur_buf_d = buf_d;
        float* cur_buf_idepthVar = buf_idepthVar;
        float* cur_buf_weight_p = buf_weight_p;
        int loop_count = buf_warped_size / 4;
        int remaining = buf_warped_size - 4 * loop_count;
        float sum_vector[] = {0, 0, 0, 0};

        float sumRes=0;


    #ifdef DEBUG
        loop_count = 0;
        remaining = buf_warped_size;
    #else
        if (loop_count > 0)
        {
            __asm__ __volatile__
            (
                // Extract constants
                "vldmia   %[constants], {q8-q9}              \n\t" // constants(q8-q9)
                "vdup.32  q13, d18[0]                        \n\t" // extract sigma_i2 x 4 to q13
                "vdup.32  q14, d18[1]                        \n\t" // extract huber_res_ponly x 4 to q14
                //"vdup.32  ???, d19[0]                        \n\t" // extract cutoff_res_ponly x 4 to ???
                "vdup.32  q9, d16[0]                         \n\t" // extract tx x 4 to q9, overwrite!
                "vdup.32  q10, d16[1]                        \n\t" // extract ty x 4 to q10
                "vdup.32  q11, d17[0]                        \n\t" // extract tz x 4 to q11
                "vdup.32  q8, d17[1]                         \n\t" // extract depthVarFac x 4 to q8, overwrite!

                "veor     q15, q15, q15                      \n\t" // set sumRes.sumResP(q15) to zero (by xor with itself)
                ".loopcalcWeightsAndResidualNEON:            \n\t"

                    "vldmia   %[buf_idepthVar]!, {q7}           \n\t" // s(q7)
                    "vldmia   %[buf_warped_z]!, {q2}            \n\t" // pz(q2)
                    "vldmia   %[buf_d]!, {q3}                   \n\t" // d(q3)
                    "vldmia   %[buf_warped_x]!, {q0}            \n\t" // px(q0)
                    "vldmia   %[buf_warped_y]!, {q1}            \n\t" // py(q1)
                    "vldmia   %[buf_warped_residual]!, {q4}     \n\t" // rp(q4)
                    "vldmia   %[buf_warped_dx]!, {q5}           \n\t" // gx(q5)
                    "vldmia   %[buf_warped_dy]!, {q6}           \n\t" // gy(q6)

                    "vmul.f32 q7, q7, q8                        \n\t" // s *= depthVarFac
                    "vmul.f32 q12, q2, q2                       \n\t" // pz*pz (q12, temp)
                    "vmul.f32 q3, q12, q3                       \n\t" // pz*pz*d (q3)

                    "vrecpe.f32 q3, q3                          \n\t" // 1/(pz*pz*d) (q3)
                    "vmul.f32 q12, q9, q2                       \n\t" // tx*pz (q12)
                    "vmls.f32 q12, q11, q0                      \n\t" // tx*pz - tz*px (q12) [multiply and subtract] {free: q0}
                    "vmul.f32 q0, q10, q2                       \n\t" // ty*pz (q0) {free: q2}
                    "vmls.f32 q0, q11, q1                       \n\t" // ty*pz - tz*py (q0) {free: q1}
                    "vmul.f32 q12, q12, q3                      \n\t" // g0 (q12)
                    "vmul.f32 q0, q0, q3                        \n\t" // g1 (q0)

                    "vmul.f32 q12, q12, q5                      \n\t" // gx * g0 (q12) {free: q5}
                    "vldmia %[cutoff_res_ponly4], {q5}          \n\t" // cutoff_res_ponly (q5), load for later
                    "vmla.f32 q12, q6, q0                       \n\t" // drpdd = gx * g0 + gy * g1 (q12) {free: q6, q0}

                    "vmov.f32 q1, #1.0                          \n\t" // 1.0 (q1), will be used later

                    "vmul.f32 q12, q12, q12                     \n\t" // drpdd*drpdd (q12)
                    "vmul.f32 q12, q12, q7                      \n\t" // s*drpdd*drpdd (q12)
                    "vadd.f32 q12, q12, q13                     \n\t" // sigma_i2 + s*drpdd*drpdd (q12)
                    "vrecpe.f32 q12, q12                        \n\t" // w_p = 1/(sigma_i2 + s*drpdd*drpdd) (q12) {free: q7}

                    // float weighted_rp = fabs(rp*sqrtf(w_p));
                    "vrsqrte.f32 q7, q12                        \n\t" // 1 / sqrtf(w_p) (q7)
                    "vrecpe.f32 q7, q7                          \n\t" // sqrtf(w_p) (q7)
                    "vmul.f32 q7, q7, q4                        \n\t" // rp*sqrtf(w_p) (q7)
                    "vabs.f32 q7, q7                            \n\t" // weighted_rp (q7)

                    // float wh = fabs(weighted_rp < huber_res_ponly ? 1 : huber_res_ponly / weighted_rp);
                    "vrecpe.f32 q6, q7                          \n\t" // 1 / weighted_rp (q6)
                    "vmul.f32 q6, q6, q14                       \n\t" // huber_res_ponly / weighted_rp (q6)
                    "vclt.f32 q0, q7, q14                       \n\t" // weighted_rp < huber_res_ponly ? all bits 1 : all bits 0 (q0)
                    "vbsl     q0, q1, q6                        \n\t" // sets elements in q0 to 1(q1) where above condition is true, and to q6 where it is false {free: q6}
                    "vabs.f32 q0, q0                            \n\t" // wh (q0)

                    // sumRes.sumResP += wh * w_p * rp*rp
                    "vmul.f32 q4, q4, q4                        \n\t" // rp*rp (q4)
                    "vmul.f32 q4, q4, q12                       \n\t" // w_p*rp*rp (q4)
                    "vmla.f32 q15, q4, q0                       \n\t" // sumRes.sumResP += wh*w_p*rp*rp (q15) {free: q4}

                    // if(weighted_rp > cutoff_res_ponly)
                    //     wh = 0;
                    // *(buf_weight_p+i) = wh * w_p;
                    "vcle.f32 q4, q7, q5                        \n\t" // mask in q4: ! (weighted_rp > cutoff_res_ponly)
                    "vmul.f32 q0, q0, q12                       \n\t" // wh * w_p (q0)
                    "vand     q0, q0, q4                        \n\t" // set q0 to 0 where condition for q4 failed (i.e. weighted_rp > cutoff_res_ponly)
                    "vstmia   %[buf_weight_p]!, {q0}            \n\t"

                "subs     %[loop_count], %[loop_count], #1    \n\t"
                "bne      .loopcalcWeightsAndResidualNEON     \n\t"

                "vstmia   %[sum_vector], {q15}                \n\t"

            : /* outputs */ [buf_warped_z]"+&r"(cur_buf_warped_z),
                            [buf_warped_x]"+&r"(cur_buf_warped_x),
                            [buf_warped_y]"+&r"(cur_buf_warped_y),
                            [buf_warped_dx]"+&r"(cur_buf_warped_dx),
                            [buf_warped_dy]"+&r"(cur_buf_warped_dy),
                            [buf_d]"+&r"(cur_buf_d),
                            [buf_warped_residual]"+&r"(cur_buf_warped_residual),
                            [buf_idepthVar]"+&r"(cur_buf_idepthVar),
                            [buf_weight_p]"+&r"(cur_buf_weight_p),
                            [loop_count]"+&r"(loop_count)
            : /* inputs  */ [constants]"r"(constants),
                            [cutoff_res_ponly4]"r"(cutoff_res_ponly4),
                            [sum_vector]"r"(sum_vector)
            : /* clobber */ "memory", "cc",
                            "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
            );

            sumRes += sum_vector[0] + sum_vector[1] + sum_vector[2] + sum_vector[3];
        }
    #endif

        for(int i=buf_warped_size-remaining; i<buf_warped_size; i++)
        {
            float px = *(buf_warped_x+i);   // x'
            float py = *(buf_warped_y+i);   // y'
            float pz = *(buf_warped_z+i);   // z'
            float d = *(buf_d+i);   // d
            float rp = *(buf_warped_residual+i); // r_p
            float gx = *(buf_warped_dx+i);  // \delta_x I
            float gy = *(buf_warped_dy+i);  // \delta_y I
            float s = settings.var_weight * *(buf_idepthVar+i); // \sigma_d^2


            // calc dw/dd (first 2 components):
            float g0 = (tx * pz - tz * px) / (pz*pz*d);
            float g1 = (ty * pz - tz * py) / (pz*pz*d);


            // calc w_p
            float drpdd = gx * g0 + gy * g1;    // ommitting the minus
            float w_p = 1.0f / (cameraPixelNoise2 + s * drpdd * drpdd);
            float weighted_rp = fabs(rp*sqrtf(w_p));

            float wh = fabs(weighted_rp < (settings.huber_d/2) ? 1 : (settings.huber_d/2) / weighted_rp);

            sumRes += wh * w_p * rp*rp;

            *(buf_weight_p+i) = wh * w_p;
        }

        return sumRes / buf_warped_size;
    }

    void SE3Tracker::calculateWarpUpdateNEON(
            LGS6 &ls)
    {
    //  weightEstimator.reset();
    //  weightEstimator.estimateDistributionNEON(buf_warped_residual, buf_warped_size);
    //  weightEstimator.calcWeightsNEON(buf_warped_residual, buf_warped_weights, buf_warped_size);

        ls.initialize(width*height);

        float* cur_buf_warped_z = buf_warped_z;
        float* cur_buf_warped_x = buf_warped_x;
        float* cur_buf_warped_y = buf_warped_y;
        float* cur_buf_warped_dx = buf_warped_dx;
        float* cur_buf_warped_dy = buf_warped_dy;
        Vector6 v1,v2,v3,v4;
        float* v1_ptr;
        float* v2_ptr;
        float* v3_ptr;
        float* v4_ptr;
        for(int i=0;i<buf_warped_size;i+=4)
        {
            v1_ptr = &v1[0];
            v2_ptr = &v2[0];
            v3_ptr = &v3[0];
            v4_ptr = &v4[0];

            __asm__ __volatile__
            (
                "vldmia   %[buf_warped_z]!, {q10}            \n\t" // pz(q10)
                "vrecpe.f32 q10, q10                         \n\t" // z(q10)

                "vldmia   %[buf_warped_dx]!, {q11}           \n\t" // gx(q11)
                "vmul.f32 q0, q10, q11                       \n\t" // q0 = z*gx // = v[0]

                "vldmia   %[buf_warped_dy]!, {q12}           \n\t" // gy(q12)
                "vmul.f32 q1, q10, q12                       \n\t" // q1 = z*gy // = v[1]

                "vldmia   %[buf_warped_x]!, {q13}            \n\t" // px(q13)
                "vmul.f32 q5, q13, q12                       \n\t" // q5 = px * gy
                "vmul.f32 q5, q5, q10                        \n\t" // q5 = q5 * z = px * gy * z

                "vldmia   %[buf_warped_y]!, {q14}            \n\t" // py(q14)
                "vmul.f32 q3, q14, q11                       \n\t" // q3 = py * gx
                "vmls.f32 q5, q3, q10                        \n\t" // q5 = px * gy * z - py * gx * z // = v[5] (vmls: multiply and subtract from result)

                "vmul.f32 q10, q10, q10                      \n\t" // q10 = 1/(pz*pz)

                "vmul.f32 q6, q13, q11                       \n\t"
                "vmul.f32 q6, q6, q10                        \n\t" // q6 = val1 in SSE version = px * z_sqr * gx

                "vmul.f32 q7, q14, q12                       \n\t"
                "vmul.f32 q7, q7, q10                        \n\t" // q7 = val2 in SSE version = py * z_sqr * gy

                "vadd.f32 q2, q6, q7                         \n\t"
                "vneg.f32 q2, q2                             \n\t" // q2 = -px * z_sqr * gx -py * z_sqr * gy // = v[2]

                "vmul.f32 q8, q6, q14                        \n\t" // val3(q8) = px * z_sqr * gx * py
                "vadd.f32 q9, q12, q8                        \n\t" // val4(q9) = gy + px * z_sqr * gx * py
                "vmul.f32 q8, q7, q14                        \n\t" // val3(q8) = py * py * z_sqr * gy
                "vadd.f32 q9, q8, q9                         \n\t" // val4(q9) = gy + px * z_sqr * gx * py + py * py * z_sqr * gy
                "vneg.f32 q3, q9                             \n\t" // q3 = v[3]

                "vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [%[v1]]! \n\t" // store v[0] .. v[3] for 1st value and inc pointer
                "vst4.32 {d0[1], d2[1], d4[1], d6[1]}, [%[v2]]! \n\t" // store v[0] .. v[3] for 2nd value and inc pointer
                "vst4.32 {d1[0], d3[0], d5[0], d7[0]}, [%[v3]]! \n\t" // store v[0] .. v[3] for 3rd value and inc pointer
                "vst4.32 {d1[1], d3[1], d5[1], d7[1]}, [%[v4]]! \n\t" // store v[0] .. v[3] for 4th value and inc pointer

                "vmul.f32 q8, q6, q13                        \n\t" // val3(q8) = px * px * z_sqr * gx
                "vadd.f32 q9, q11, q8                        \n\t" // val4(q9) = gx + px * px * z_sqr * gx
                "vmul.f32 q8, q7, q13                        \n\t" // val3(q8) = px * py * z_sqr * gy
                "vadd.f32 q4, q9, q8                         \n\t" // q4 = v[4]

                "vst2.32 {d8[0], d10[0]}, [%[v1]]               \n\t" // store v[4], v[5] for 1st value
                "vst2.32 {d8[1], d10[1]}, [%[v2]]               \n\t" // store v[4], v[5] for 2nd value
                "vst2.32 {d9[0], d11[0]}, [%[v3]]               \n\t" // store v[4], v[5] for 3rd value
                "vst2.32 {d9[1], d11[1]}, [%[v4]]               \n\t" // store v[4], v[5] for 4th value

            : /* outputs */ [buf_warped_z]"+r"(cur_buf_warped_z),
                            [buf_warped_x]"+r"(cur_buf_warped_x),
                            [buf_warped_y]"+r"(cur_buf_warped_y),
                            [buf_warped_dx]"+r"(cur_buf_warped_dx),
                            [buf_warped_dy]"+r"(cur_buf_warped_dy),
                            [v1]"+r"(v1_ptr),
                            [v2]"+r"(v2_ptr),
                            [v3]"+r"(v3_ptr),
                            [v4]"+r"(v4_ptr)
            : /* inputs  */
            : /* clobber */ "memory", "cc", // TODO: is cc necessary?
                            "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
            );


            // step 6: integrate into A and b:
            if(!(i+3>=buf_warped_size))
            {
                ls.update(v1, *(buf_warped_residual+i+0), *(buf_weight_p+i+0));
                ls.update(v2, *(buf_warped_residual+i+1), *(buf_weight_p+i+1));
                ls.update(v3, *(buf_warped_residual+i+2), *(buf_weight_p+i+2));
                ls.update(v4, *(buf_warped_residual+i+3), *(buf_weight_p+i+3));
            }
            else
            {
                ls.update(v1, *(buf_warped_residual+i+0), *(buf_weight_p+i+0));

                if(i+1>=buf_warped_size) break;
                ls.update(v2, *(buf_warped_residual+i+1), *(buf_weight_p+i+1));

                if(i+2>=buf_warped_size) break;
                ls.update(v3, *(buf_warped_residual+i+2), *(buf_weight_p+i+2));

                if(i+3>=buf_warped_size) break;
                ls.update(v4, *(buf_warped_residual+i+3), *(buf_weight_p+i+3));
            }
        }

        // solve ls
        ls.finish();
        //ls.solve(result);

    }

}


#endif