Program Listing for File SE3Tracker_SSE.cpp

Return to documentation for file (lib/Tracking/SE3Tracker_SSE.cpp)

#if defined(ENABLE_SSE)

namespace lsd_slam {

    float SE3Tracker::calcResidualAndBuffersSSE(
            const Eigen::Vector3f* refPoint,
            const Eigen::Vector2f* refColVar,
            int* idxBuf,
            int refNum,
                const std::shared_ptr<Frame> &frame,
            const Sophus::SE3f& referenceToFrame,
            int level,
            bool plotResidual)
    {
        return calcResidualAndBuffers(refPoint, refColVar, idxBuf, refNum, frame, referenceToFrame, level, plotResidual);
    }


    float SE3Tracker::calcWeightsAndResidualSSE(
            const Sophus::SE3f& referenceToFrame)
    {
        const __m128 txs = _mm_set1_ps((float)(referenceToFrame.translation()[0]));
        const __m128 tys = _mm_set1_ps((float)(referenceToFrame.translation()[1]));
        const __m128 tzs = _mm_set1_ps((float)(referenceToFrame.translation()[2]));

        const __m128 zeros = _mm_set1_ps(0.0f);
        const __m128 ones = _mm_set1_ps(1.0f);


        const __m128 depthVarFacs = _mm_set1_ps((float)settings.var_weight);// float depthVarFac = var_weight;  // the depth var is over-confident. this is a constant multiplier to remedy that.... HACK
        const __m128 sigma_i2s = _mm_set1_ps((float)cameraPixelNoise2);


        const __m128 huber_res_ponlys = _mm_set1_ps((float)(settings.huber_d/2));

        __m128 sumResP = zeros;


        float sumRes = 0;

        for(int i=0;i<buf_warped_size-3;i+=4)
        {
    //      float px = *(buf_warped_x+i);   // x'
    //      float py = *(buf_warped_y+i);   // y'
    //      float pz = *(buf_warped_z+i);   // z'
    //      float d = *(buf_d+i);   // d
    //      float rp = *(buf_warped_residual+i); // r_p
    //      float gx = *(buf_warped_dx+i);  // \delta_x I
    //      float gy = *(buf_warped_dy+i);  // \delta_y I
    //      float s = depthVarFac * *(buf_idepthVar+i); // \sigma_d^2


            // calc dw/dd (first 2 components):
            __m128 pzs = _mm_load_ps(buf_warped_z+i);   // z'
            __m128 pz2ds = _mm_rcp_ps(_mm_mul_ps(_mm_mul_ps(pzs, pzs), _mm_load_ps(buf_d+i)));  // 1 / (z' * z' * d)
            __m128 g0s = _mm_sub_ps(_mm_mul_ps(pzs, txs), _mm_mul_ps(_mm_load_ps(buf_warped_x+i), tzs));
            g0s = _mm_mul_ps(g0s,pz2ds); //float g0 = (tx * pz - tz * px) / (pz*pz*d);

             //float g1 = (ty * pz - tz * py) / (pz*pz*d);
            __m128 g1s = _mm_sub_ps(_mm_mul_ps(pzs, tys), _mm_mul_ps(_mm_load_ps(buf_warped_y+i), tzs));
            g1s = _mm_mul_ps(g1s,pz2ds);

             // float drpdd = gx * g0 + gy * g1;    // ommitting the minus
            __m128 drpdds = _mm_add_ps(
                    _mm_mul_ps(g0s, _mm_load_ps(buf_warped_dx+i)),
                    _mm_mul_ps(g1s, _mm_load_ps(buf_warped_dy+i)));

             //float w_p = 1.0f / (sigma_i2 + s * drpdd * drpdd);
            __m128 w_ps = _mm_rcp_ps(_mm_add_ps(sigma_i2s,
                    _mm_mul_ps(drpdds,
                            _mm_mul_ps(drpdds,
                                    _mm_mul_ps(depthVarFacs,
                                            _mm_load_ps(buf_idepthVar+i))))));


            //float weighted_rp = fabs(rp*sqrtf(w_p));
            __m128 weighted_rps = _mm_mul_ps(_mm_load_ps(buf_warped_residual+i),
                    _mm_sqrt_ps(w_ps));
            weighted_rps = _mm_max_ps(weighted_rps, _mm_sub_ps(zeros,weighted_rps));


            //float wh = fabs(weighted_rp < huber_res_ponly ? 1 : huber_res_ponly / weighted_rp);
            __m128 whs = _mm_cmplt_ps(weighted_rps, huber_res_ponlys);  // bitmask 0xFFFFFFFF for 1, 0x000000 for huber_res_ponly / weighted_rp
            whs = _mm_or_ps(
                    _mm_and_ps(whs, ones),
                    _mm_andnot_ps(whs, _mm_mul_ps(huber_res_ponlys, _mm_rcp_ps(weighted_rps))));




            // sumRes.sumResP += wh * w_p * rp*rp;
            if(i+3 < buf_warped_size)
                sumResP = _mm_add_ps(sumResP,
                        _mm_mul_ps(whs, _mm_mul_ps(weighted_rps, weighted_rps)));

            // *(buf_weight_p+i) = wh * w_p;
            _mm_store_ps(buf_weight_p+i, _mm_mul_ps(whs, w_ps) );
        }
        sumRes = SSEE(sumResP,0) + SSEE(sumResP,1) + SSEE(sumResP,2) + SSEE(sumResP,3);

        return sumRes / ((buf_warped_size >> 2)<<2);
    }

    void SE3Tracker::calculateWarpUpdateSSE(
            LGS6 &ls)
    {
        ls.initialize( _imgSize.area() );

    //  printf("wupd SSE\n");
        for(int i=0;i<buf_warped_size-3;i+=4)
        {

            __m128 val1, val2, val3, val4;
            __m128 J61, J62, J63, J64, J65, J66;

            // redefine pz
            __m128 pz = _mm_load_ps(buf_warped_z+i);
            pz = _mm_rcp_ps(pz);                        // pz := 1/z


            __m128 gx = _mm_load_ps(buf_warped_dx+i);
            val1 = _mm_mul_ps(pz, gx);          // gx / z => SET [0]
            //v[0] = z*gx;
            J61 = val1;



            __m128 gy = _mm_load_ps(buf_warped_dy+i);
            val1 = _mm_mul_ps(pz, gy);                  // gy / z => SET [1]
            //v[1] = z*gy;
            J62 = val1;


            __m128 px = _mm_load_ps(buf_warped_x+i);
            val1 = _mm_mul_ps(px, gy);
            val1 = _mm_mul_ps(val1, pz);    //  px * gy * z
            __m128 py = _mm_load_ps(buf_warped_y+i);
            val2 = _mm_mul_ps(py, gx);
            val2 = _mm_mul_ps(val2, pz);    //  py * gx * z
            val1 = _mm_sub_ps(val1, val2);  // px * gy * z - py * gx * z => SET [5]
            //v[5] = -py * z * gx +  px * z * gy;
            J66 = val1;


            // redefine pz
            pz = _mm_mul_ps(pz,pz);         // pz := 1/(z*z)

            // will use these for the following calculations a lot.
            val1 = _mm_mul_ps(px, gx);
            val1 = _mm_mul_ps(val1, pz);        // px * z_sqr * gx
            val2 = _mm_mul_ps(py, gy);
            val2 = _mm_mul_ps(val2, pz);        // py * z_sqr * gy


            val3 = _mm_add_ps(val1, val2);
            val3 = _mm_sub_ps(_mm_setr_ps(0,0,0,0),val3);   //-px * z_sqr * gx -py * z_sqr * gy
            //v[2] = -px * z_sqr * gx -py * z_sqr * gy; => SET [2]
            J63 = val3;


            val3 = _mm_mul_ps(val1, py); // px * z_sqr * gx * py
            val4 = _mm_add_ps(gy, val3); // gy + px * z_sqr * gx * py
            val3 = _mm_mul_ps(val2, py); // py * py * z_sqr * gy
            val4 = _mm_add_ps(val3, val4); // gy + px * z_sqr * gx * py + py * py * z_sqr * gy
            val4 = _mm_sub_ps(_mm_setr_ps(0,0,0,0),val4); //val4 = -val4.
            //v[3] = -px * py * z_sqr * gx +
            //       -py * py * z_sqr * gy +
            //       -gy;       => SET [3]
            J64 = val4;


            val3 = _mm_mul_ps(val1, px); // px * px * z_sqr * gx
            val4 = _mm_add_ps(gx, val3); // gx + px * px * z_sqr * gx
            val3 = _mm_mul_ps(val2, px); // px * py * z_sqr * gy
            val4 = _mm_add_ps(val4, val3); // gx + px * px * z_sqr * gx + px * py * z_sqr * gy
            //v[4] = px * px * z_sqr * gx +
            //     px * py * z_sqr * gy +
            //     gx;              => SET [4]
            J65 = val4;

            if(i+3<buf_warped_size)
            {
                ls.updateSSE(J61, J62, J63, J64, J65, J66, _mm_load_ps(buf_warped_residual+i), _mm_load_ps(buf_weight_p+i));
            }
            else
            {
                for(int k=0;i+k<buf_warped_size;k++)
                {
                    Vector6 v6;
                    v6 << SSEE(J61,k),SSEE(J62,k),SSEE(J63,k),SSEE(J64,k),SSEE(J65,k),SSEE(J66,k);
                    ls.update(v6, *(buf_warped_residual+i+k), *(buf_weight_p+i+k));
                }
            }


        }

        // solve ls
        ls.finish();

    }


}

#endif