Program Listing for File LGSX.h

Return to documentation for file (lib/Tracking/LGSX.h)

#pragma once

#include "util/EigenCoreInclude.h"
#include <opencv2/core/core.hpp>
#include "util/settings.h"



namespace lsd_slam
{


typedef Eigen::Matrix<float, 6, 1> Vector6;
typedef Eigen::Matrix<float, 6, 6> Matrix6x6;

typedef Eigen::Matrix<float, 7, 1> Vector7;
typedef Eigen::Matrix<float, 7, 7> Matrix7x7;

typedef Eigen::Matrix<float, 4, 1> Vector4;
typedef Eigen::Matrix<float, 4, 4> Matrix4x4;




class LGS4
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW;

  Matrix4x4 A;
  Vector4 b;

  float error;
  size_t num_constraints;

  inline void initialize(const size_t maxnum_constraints)
  {
    A.setZero();
    b.setZero();
    memset(SSEData,0, sizeof(float)*4*15);
    error = 0;
    this->num_constraints = 0;
  }

  inline void finishNoDivide()
  {
#if defined(ENABLE_SSE)
    __m128 a;

    a = _mm_load_ps(SSEData+4*0);
    A(0,0) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*1);
    A(1,0) = (A(0,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*2);
    A(2,0) = (A(0,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*3);
    A(3,0) = (A(0,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));



    a = _mm_load_ps(SSEData+4*4);
    A(1,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*5);
    A(1,2) = (A(2,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*6);
    A(1,3) = (A(3,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));




    a = _mm_load_ps(SSEData+4*7);
    A(2,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*8);
    A(2,3) = (A(3,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));


    a = _mm_load_ps(SSEData+4*9);
    A(3,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);




    a = _mm_load_ps(SSEData+4*10);
    b[0] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*11);
    b[1] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*12);
    b[2] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*13);
    b[3] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*14);
    error += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

#endif
  }

#if defined(ENABLE_SSE)
  inline void updateSSE(
        const __m128 &J1,const __m128 &J2,const __m128 &J3,const __m128 &J4,
        const __m128& res, const __m128& weight)
  {
      //A.noalias() += J * J.transpose() * weight;
      __m128 J1w = _mm_mul_ps(J1,weight);
      _mm_store_ps(SSEData+4*0, _mm_add_ps(_mm_load_ps(SSEData+4*0),_mm_mul_ps(J1w,J1)));
      _mm_store_ps(SSEData+4*1, _mm_add_ps(_mm_load_ps(SSEData+4*1),_mm_mul_ps(J1w,J2)));
      _mm_store_ps(SSEData+4*2, _mm_add_ps(_mm_load_ps(SSEData+4*2),_mm_mul_ps(J1w,J3)));
      _mm_store_ps(SSEData+4*3, _mm_add_ps(_mm_load_ps(SSEData+4*3),_mm_mul_ps(J1w,J4)));

      __m128 J2w = _mm_mul_ps(J2,weight);
      _mm_store_ps(SSEData+4*4, _mm_add_ps(_mm_load_ps(SSEData+4*4),_mm_mul_ps(J2w,J2)));
      _mm_store_ps(SSEData+4*5, _mm_add_ps(_mm_load_ps(SSEData+4*5),_mm_mul_ps(J2w,J3)));
      _mm_store_ps(SSEData+4*6, _mm_add_ps(_mm_load_ps(SSEData+4*6),_mm_mul_ps(J2w,J4)));


      __m128 J3w = _mm_mul_ps(J3,weight);
      _mm_store_ps(SSEData+4*7, _mm_add_ps(_mm_load_ps(SSEData+4*7),_mm_mul_ps(J3w,J3)));
      _mm_store_ps(SSEData+4*8, _mm_add_ps(_mm_load_ps(SSEData+4*8),_mm_mul_ps(J3w,J4)));

      __m128 J4w = _mm_mul_ps(J4,weight);
      _mm_store_ps(SSEData+4*9, _mm_add_ps(_mm_load_ps(SSEData+4*9),_mm_mul_ps(J4w,J4)));

      //b.noalias() -= J * (res * weight);
      __m128 resw = _mm_mul_ps(res,weight);
      _mm_store_ps(SSEData+4*10, _mm_add_ps(_mm_load_ps(SSEData+4*10),_mm_mul_ps(resw,J1)));
      _mm_store_ps(SSEData+4*11, _mm_add_ps(_mm_load_ps(SSEData+4*11),_mm_mul_ps(resw,J2)));
      _mm_store_ps(SSEData+4*12, _mm_add_ps(_mm_load_ps(SSEData+4*12),_mm_mul_ps(resw,J3)));
      _mm_store_ps(SSEData+4*13, _mm_add_ps(_mm_load_ps(SSEData+4*13),_mm_mul_ps(resw,J4)));

      //error += res * res * weight;
      _mm_store_ps(SSEData+4*14, _mm_add_ps(_mm_load_ps(SSEData+4*14),_mm_mul_ps(resw,res)));

      num_constraints += 4;
  }
#endif

  inline void update(const Vector4& J, const float& res, const float& weight)
  {
    A.noalias() += J * J.transpose() * weight;
    b.noalias() -= J * (res * weight);
    error += res * res * weight;
    num_constraints += 1;
  }

private:
  EIGEN_ALIGN16 float SSEData[4*15];
};



class LGS6
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW;

  Matrix6x6 A;
  Vector6 b;

  float error;
  size_t num_constraints;

  inline void initialize(const size_t maxnum_constraints)
  {
    A.setZero();
    b.setZero();
    memset(SSEData,0, sizeof(float)*4*28);

    error = 0;
    this->num_constraints = 0;
  }

  inline void finishNoDivide()
  {

#if defined(ENABLE_SSE)
    __m128 a;

    a = _mm_load_ps(SSEData+4*0);
    A(0,0) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*1);
    A(1,0) = (A(0,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*2);
    A(2,0) = (A(0,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*3);
    A(3,0) = (A(0,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*4);
    A(4,0) = (A(0,4) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*5);
    A(5,0) = (A(0,5) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));






    a = _mm_load_ps(SSEData+4*6);
    A(1,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*7);
    A(1,2) = (A(2,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*8);
    A(1,3) = (A(3,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*9);
    A(1,4) = (A(4,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*10);
    A(1,5) = (A(5,1) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));




    a = _mm_load_ps(SSEData+4*11);
    A(2,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*12);
    A(2,3) = (A(3,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*13);
    A(2,4) = (A(4,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*14);
    A(2,5) = (A(5,2) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));



    a = _mm_load_ps(SSEData+4*15);
    A(3,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*16);
    A(3,4) = (A(4,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));

    a = _mm_load_ps(SSEData+4*17);
    A(3,5) = (A(5,3) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));



    a = _mm_load_ps(SSEData+4*18);
    A(4,4) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*19);
    A(4,5) = (A(5,4) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3));



    a = _mm_load_ps(SSEData+4*20);
    A(5,5) += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);





    a = _mm_load_ps(SSEData+4*21);
    b[0] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*22);
    b[1] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*23);
    b[2] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*24);
    b[3] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*25);
    b[4] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

    a = _mm_load_ps(SSEData+4*26);
    b[5] -= SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);



    a = _mm_load_ps(SSEData+4*27);
    error += SSEE(a,0) + SSEE(a,1) + SSEE(a,2) + SSEE(a,3);

#endif
  }


  void finish()
  {
    finishNoDivide();
    A /= (float) num_constraints;
    b /= (float) num_constraints;
    error /= (float) num_constraints;
  }

#if defined(ENABLE_SSE)
  inline void updateSSE(
        const __m128 &J1,const __m128 &J2,const __m128 &J3,const __m128 &J4,const __m128 &J5,const __m128 &J6,
        const __m128& res, const __m128& weight)
  {
      //A.noalias() += J * J.transpose() * weight;
      __m128 J1w = _mm_mul_ps(J1,weight);
      _mm_store_ps(SSEData+4*0, _mm_add_ps(_mm_load_ps(SSEData+4*0),_mm_mul_ps(J1w,J1)));
      _mm_store_ps(SSEData+4*1, _mm_add_ps(_mm_load_ps(SSEData+4*1),_mm_mul_ps(J1w,J2)));
      _mm_store_ps(SSEData+4*2, _mm_add_ps(_mm_load_ps(SSEData+4*2),_mm_mul_ps(J1w,J3)));
      _mm_store_ps(SSEData+4*3, _mm_add_ps(_mm_load_ps(SSEData+4*3),_mm_mul_ps(J1w,J4)));
      _mm_store_ps(SSEData+4*4, _mm_add_ps(_mm_load_ps(SSEData+4*4),_mm_mul_ps(J1w,J5)));
      _mm_store_ps(SSEData+4*5, _mm_add_ps(_mm_load_ps(SSEData+4*5),_mm_mul_ps(J1w,J6)));


      __m128 J2w = _mm_mul_ps(J2,weight);
      _mm_store_ps(SSEData+4*6, _mm_add_ps(_mm_load_ps(SSEData+4*6),_mm_mul_ps(J2w,J2)));
      _mm_store_ps(SSEData+4*7, _mm_add_ps(_mm_load_ps(SSEData+4*7),_mm_mul_ps(J2w,J3)));
      _mm_store_ps(SSEData+4*8, _mm_add_ps(_mm_load_ps(SSEData+4*8),_mm_mul_ps(J2w,J4)));
      _mm_store_ps(SSEData+4*9, _mm_add_ps(_mm_load_ps(SSEData+4*9),_mm_mul_ps(J2w,J5)));
      _mm_store_ps(SSEData+4*10, _mm_add_ps(_mm_load_ps(SSEData+4*10),_mm_mul_ps(J2w,J6)));


      __m128 J3w = _mm_mul_ps(J3,weight);
      _mm_store_ps(SSEData+4*11, _mm_add_ps(_mm_load_ps(SSEData+4*11),_mm_mul_ps(J3w,J3)));
      _mm_store_ps(SSEData+4*12, _mm_add_ps(_mm_load_ps(SSEData+4*12),_mm_mul_ps(J3w,J4)));
      _mm_store_ps(SSEData+4*13, _mm_add_ps(_mm_load_ps(SSEData+4*13),_mm_mul_ps(J3w,J5)));
      _mm_store_ps(SSEData+4*14, _mm_add_ps(_mm_load_ps(SSEData+4*14),_mm_mul_ps(J3w,J6)));

      __m128 J4w = _mm_mul_ps(J4,weight);
      _mm_store_ps(SSEData+4*15, _mm_add_ps(_mm_load_ps(SSEData+4*15),_mm_mul_ps(J4w,J4)));
      _mm_store_ps(SSEData+4*16, _mm_add_ps(_mm_load_ps(SSEData+4*16),_mm_mul_ps(J4w,J5)));
      _mm_store_ps(SSEData+4*17, _mm_add_ps(_mm_load_ps(SSEData+4*17),_mm_mul_ps(J4w,J6)));


      __m128 J5w = _mm_mul_ps(J5,weight);
      _mm_store_ps(SSEData+4*18, _mm_add_ps(_mm_load_ps(SSEData+4*18),_mm_mul_ps(J5w,J5)));
      _mm_store_ps(SSEData+4*19, _mm_add_ps(_mm_load_ps(SSEData+4*19),_mm_mul_ps(J5w,J6)));


      __m128 J6w = _mm_mul_ps(J6,weight);
      _mm_store_ps(SSEData+4*20, _mm_add_ps(_mm_load_ps(SSEData+4*20),_mm_mul_ps(J6w,J6)));




      //b.noalias() -= J * (res * weight);
      __m128 resw = _mm_mul_ps(res,weight);
      _mm_store_ps(SSEData+4*21, _mm_add_ps(_mm_load_ps(SSEData+4*21),_mm_mul_ps(resw,J1)));
      _mm_store_ps(SSEData+4*22, _mm_add_ps(_mm_load_ps(SSEData+4*22),_mm_mul_ps(resw,J2)));
      _mm_store_ps(SSEData+4*23, _mm_add_ps(_mm_load_ps(SSEData+4*23),_mm_mul_ps(resw,J3)));
      _mm_store_ps(SSEData+4*24, _mm_add_ps(_mm_load_ps(SSEData+4*24),_mm_mul_ps(resw,J4)));
      _mm_store_ps(SSEData+4*25, _mm_add_ps(_mm_load_ps(SSEData+4*25),_mm_mul_ps(resw,J5)));
      _mm_store_ps(SSEData+4*26, _mm_add_ps(_mm_load_ps(SSEData+4*26),_mm_mul_ps(resw,J6)));

      //error += res * res * weight;
      _mm_store_ps(SSEData+4*27, _mm_add_ps(_mm_load_ps(SSEData+4*27),_mm_mul_ps(resw,res)));

      num_constraints += 6;
  }
#endif


  inline void update(const Vector6& J, const float& res, const float& weight)
  {
    A.noalias() += J * J.transpose() * weight;
    b.noalias() -= J * (res * weight);
    error += res * res * weight;
    num_constraints += 1;
  }



private:
  EIGEN_ALIGN16 float SSEData[4*28];
};








class LGS7
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW;

  Matrix7x7 A;
  Vector7 b;

  float error;
  size_t num_constraints;

  void initializeFrom(const LGS6& ls6, const LGS4& ls4)
  {
    // set zero
    A.setZero();
    b.setZero();

    // add ls6
    A.topLeftCorner<6,6>() = ls6.A;
    b.head<6>() = ls6.b;

    // add ls4
    int remap[4] = {2,3,4,6};
    for(int i=0;i<4;i++)
    {
        b[remap[i]] += ls4.b[i];
        for(int j=0;j<4;j++)
            A(remap[i], remap[j]) += ls4.A(i,j);
    }

    num_constraints = ls6.num_constraints + ls4.num_constraints;
  }
};



}