#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <iostream>

#include <ATen/ATen.h>
#include <ATen/Parallel.h>

// #include "utils.cuh"

#include <Eigen/Sparse>
#include <Eigen/SparseCore>
#include <Eigen/SparseCholesky>

typedef Eigen::SparseMatrix<double> SpMat;
typedef Eigen::Triplet<double> T;
typedef std::vector<std::vector<long>> graph_t;
typedef std::vector<torch::Tensor> tensor_list_t;

#define MIN_DEPTH 0.25

#define THREADS 256
#define NUM_BLOCKS(batch_size) ((batch_size + THREADS - 1) / THREADS)

#define GPU_1D_KERNEL_LOOP(k, n) \
  for (size_t k = threadIdx.x; k<n; k += blockDim.x)
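// ---------------------------------------------------------------------------
// The kernels in this file rely on a small set of SE3 / quaternion device
// helpers and on a block-wide reduction. The definitions below are sketches,
// written against the conventions the kernels themselves use and therefore to
// be read as assumptions: poses are stored as 7-vectors
// (tx, ty, tz, qx, qy, qz, qw) mapping world coordinates into the camera
// frame, homogeneous points carry the inverse depth in X[3], and 6-dof twists
// are ordered (translation, rotation).
// ---------------------------------------------------------------------------

// tree reduction over one block; assumes blockDim.x is a power of two (THREADS = 256)
__device__ void blockReduce(volatile float *sdata) {
  unsigned int tid = threadIdx.x;
  __syncthreads();
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) sdata[tid] += sdata[tid + s];
    __syncthreads();
  }
}

// b <- a x b
__device__ void crossInplace(const float* a, float *b) {
  float x[3] = {
    a[1]*b[2] - a[2]*b[1],
    a[2]*b[0] - a[0]*b[2],
    a[0]*b[1] - a[1]*b[0],
  };
  b[0] = x[0]; b[1] = x[1]; b[2] = x[2];
}

// Y = R(q) * X for a unit quaternion q = (x, y, z, w)
__device__ void actSO3(const float *q, const float *X, float *Y) {
  float uv[3];
  uv[0] = 2.0f * (q[1]*X[2] - q[2]*X[1]);
  uv[1] = 2.0f * (q[2]*X[0] - q[0]*X[2]);
  uv[2] = 2.0f * (q[0]*X[1] - q[1]*X[0]);

  Y[0] = X[0] + q[3]*uv[0] + (q[1]*uv[2] - q[2]*uv[1]);
  Y[1] = X[1] + q[3]*uv[1] + (q[2]*uv[0] - q[0]*uv[2]);
  Y[2] = X[2] + q[3]*uv[2] + (q[0]*uv[1] - q[1]*uv[0]);
}

// act on a homogeneous point (X[3] = inverse depth): Y = R X + X[3] * t
__device__ void actSE3(const float *t, const float *q, const float *X, float *Y) {
  actSO3(q, X, Y);
  Y[3] = X[3];
  Y[0] += X[3] * t[0];
  Y[1] += X[3] * t[1];
  Y[2] += X[3] * t[2];
}

// Y = Adj(G)^T X for G = (t, q); maps Jacobian rows between frames
__device__ void adjSE3(const float *t, const float *q, const float *X, float *Y) {
  float qinv[4] = {-q[0], -q[1], -q[2], q[3]};
  actSO3(qinv, &X[0], &Y[0]);
  actSO3(qinv, &X[3], &Y[3]);

  float u[3], v[3];
  u[0] = t[2]*X[1] - t[1]*X[2];
  u[1] = t[0]*X[2] - t[2]*X[0];
  u[2] = t[1]*X[0] - t[0]*X[1];

  actSO3(qinv, u, v);
  Y[3] += v[0]; Y[4] += v[1]; Y[5] += v[2];
}

// relative transform Gij = Gj * Gi^{-1}: qij = qj * conj(qi), tij = tj - R(qij) ti
__device__ void relSE3(const float *ti, const float *qi,
                       const float *tj, const float *qj, float *tij, float *qij) {
  qij[0] = qj[0]*qi[3] - qj[3]*qi[0] - qj[1]*qi[2] + qj[2]*qi[1];
  qij[1] = qj[1]*qi[3] - qj[3]*qi[1] - qj[2]*qi[0] + qj[0]*qi[2];
  qij[2] = qj[2]*qi[3] - qj[3]*qi[2] - qj[0]*qi[1] + qj[1]*qi[0];
  qij[3] = qj[3]*qi[3] + qj[0]*qi[0] + qj[1]*qi[1] + qj[2]*qi[2];

  actSO3(qij, ti, tij);
  tij[0] = tj[0] - tij[0];
  tij[1] = tj[1] - tij[1];
  tij[2] = tj[2] - tij[2];
}

// SO3 exponential map: axis-angle phi -> unit quaternion (x, y, z, w)
__device__ void expSO3(const float *phi, float* q) {
  float theta_sq = phi[0]*phi[0] + phi[1]*phi[1] + phi[2]*phi[2];
  float theta = sqrtf(theta_sq);
  float imag, real;

  if (theta > 1e-4) {
    imag = sinf(0.5f * theta) / theta;
    real = cosf(0.5f * theta);
  } else {
    // small-angle expansion of sin(theta/2)/theta and cos(theta/2)
    imag = 0.5f - theta_sq / 48.0f;
    real = 1.0f - theta_sq / 8.0f;
  }

  q[0] = imag * phi[0];
  q[1] = imag * phi[1];
  q[2] = imag * phi[2];
  q[3] = real;
}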
__device__ void expSE3(const float *xi, float* t, float* q) {
  // SE3 exponential map for the twist xi = (tau, phi)
  expSO3(&xi[3], q);
  float tau[3] = {xi[0], xi[1], xi[2]};
  float phi[3] = {xi[3], xi[4], xi[5]};

  float theta_sq = phi[0]*phi[0] + phi[1]*phi[1] + phi[2]*phi[2];
  float theta = sqrtf(theta_sq);

  t[0] = tau[0]; t[1] = tau[1]; t[2] = tau[2];

  if (theta > 1e-4) {
    float a = (1 - cosf(theta)) / theta_sq;
    crossInplace(phi, tau);
    t[0] += a * tau[0];
    t[1] += a * tau[1];
    t[2] += a * tau[2];

    float b = (theta - sinf(theta)) / (theta * theta_sq);
    crossInplace(phi, tau);
    t[0] += b * tau[0];
    t[1] += b * tau[1];
    t[2] += b * tau[2];
  }
}

__device__ void retrSE3(const float *xi, const float* t, const float* q, float* t1, float* q1) {
  // retraction: (t1, q1) = exp(xi) * (t, q)
  float dt[3] = {0, 0, 0};
  float dq[4] = {0, 0, 0, 1};
  expSE3(xi, dt, dq);

  q1[0] = dq[3]*q[0] + dq[0]*q[3] + dq[1]*q[2] - dq[2]*q[1];
  q1[1] = dq[3]*q[1] + dq[1]*q[3] + dq[2]*q[0] - dq[0]*q[2];
  q1[2] = dq[3]*q[2] + dq[2]*q[3] + dq[0]*q[1] - dq[1]*q[0];
  q1[3] = dq[3]*q[3] - dq[0]*q[0] - dq[1]*q[1] - dq[2]*q[2];

  actSO3(dq, t, t1);
  t1[0] += dt[0]; t1[1] += dt[1]; t1[2] += dt[2];
}

__global__ void projective_transform_kernel(
    const torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> target,
    const torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> weight,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> intrinsics,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ii,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> jj,
    torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> Hs,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> vs,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> Eii,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> Eij,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> Cii,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> bz)
{
  const int block_id = blockIdx.x;
  const int thread_id = threadIdx.x;

  const int ht = disps.size(1);
  const int wd = disps.size(2);

  int ix = static_cast<int>(ii[block_id]);
  int jx = static_cast<int>(jj[block_id]);

  __shared__ float fx;
  __shared__ float fy;
  __shared__ float cx;
  __shared__ float cy;

  __shared__ float ti[3], tj[3], tij[3];
  __shared__ float qi[4], qj[4], qij[4];

  // load intrinsics from global memory
  if (thread_id == 0) {
    fx = intrinsics[0];
    fy = intrinsics[1];
    cx = intrinsics[2];
    cy = intrinsics[3];
  }

  __syncthreads();

  // stereo frames
  if (ix == jx) {
    if (thread_id == 0) {
      tij[0] = -0.1;
      tij[1] = 0;
      tij[2] = 0;
      qij[0] = 0;
      qij[1] = 0;
      qij[2] = 0;
      qij[3] = 1;
    }
  }
  else {
    // load poses from global memory
    if (thread_id < 3) {
      ti[thread_id] = poses[ix][thread_id];
      tj[thread_id] = poses[jx][thread_id];
    }

    if (thread_id < 4) {
      qi[thread_id] = poses[ix][thread_id+3];
      qj[thread_id] = poses[jx][thread_id+3];
    }

    __syncthreads();

    if (thread_id == 0) {
      relSE3(ti, qi, tj, qj, tij, qij);
    }
  }

  __syncthreads();

  // points
  float Xi[4];
  float Xj[4];

  // jacobians
  float Jx[12];
  float Jz;

  float* Ji = &Jx[0];
  float* Jj = &Jx[6];

  // hessians (lower triangle of the 12x12 block, packed)
  float hij[12*(12+1)/2];

  float vi[6], vj[6];

  int l;
  for (l=0; l<12*(12+1)/2; l++) {
    hij[l] = 0;
  }

  for (int n=0; n<6; n++) {
    vi[n] = 0;
    vj[n] = 0;
  }

  __syncthreads();

  GPU_1D_KERNEL_LOOP(k, ht*wd) {
    const int i = k / wd;
    const int j = k % wd;

    const float u = static_cast<float>(j);
    const float v = static_cast<float>(i);

    // homogeneous coordinates
    Xi[0] = (u - cx) / fx;
    Xi[1] = (v - cy) / fy;
    Xi[2] = 1;
    Xi[3] = disps[ix][i][j];

    // transform homogeneous point
    actSE3(tij, qij, Xi, Xj);

    const float x = Xj[0];
    const float y = Xj[1];
    const float h = Xj[3];

    const float d = (Xj[2] < MIN_DEPTH) ? 0.0 : 1.0 / Xj[2];
    const float d2 = d * d;

    float wu = (Xj[2] < MIN_DEPTH) ? 0.0 : .001 * weight[block_id][0][i][j];
    float wv = (Xj[2] < MIN_DEPTH) ? 0.0 : .001 * weight[block_id][1][i][j];
    const float ru = target[block_id][0][i][j] - (fx * d * x + cx);
    const float rv = target[block_id][1][i][j] - (fy * d * y + cy);

    // x - coordinate
    Jj[0] = fx * (h*d);
    Jj[1] = fx * 0;
    Jj[2] = fx * (-x*h*d2);
    Jj[3] = fx * (-x*y*d2);
    Jj[4] = fx * (1 + x*x*d2);
    Jj[5] = fx * (-y*d);

    Jz = fx * (tij[0] * d - tij[2] * (x * d2));
    Cii[block_id][k] = wu * Jz * Jz;
    bz[block_id][k] = wu * ru * Jz;

    if (ix == jx) wu = 0;

    adjSE3(tij, qij, Jj, Ji);
    for (int n=0; n<6; n++) Ji[n] *= -1;

    l=0;
    for (int n=0; n<12; n++) {
      for (int m=0; m<=n; m++) {
        hij[l] += wu * Jx[n] * Jx[m];
        l++;
      }
    }

    for (int n=0; n<6; n++) {
      vi[n] += wu * ru * Ji[n];
      vj[n] += wu * ru * Jj[n];

      Eii[block_id][n][k] = wu * Jz * Ji[n];
      Eij[block_id][n][k] = wu * Jz * Jj[n];
    }

    // y - coordinate
    Jj[0] = fy * 0;
    Jj[1] = fy * (h*d);
    Jj[2] = fy * (-y*h*d2);
    Jj[3] = fy * (-1 - y*y*d2);
    Jj[4] = fy * (x*y*d2);
    Jj[5] = fy * (x*d);

    Jz = fy * (tij[1] * d - tij[2] * (y * d2));
    Cii[block_id][k] += wv * Jz * Jz;
    bz[block_id][k] += wv * rv * Jz;

    if (ix == jx) wv = 0;

    adjSE3(tij, qij, Jj, Ji);
    for (int n=0; n<6; n++) Ji[n] *= -1;

    l=0;
    for (int n=0; n<12; n++) {
      for (int m=0; m<=n; m++) {
        hij[l] += wv * Jx[n] * Jx[m];
        l++;
      }
    }

    for (int n=0; n<6; n++) {
      vi[n] += wv * rv * Ji[n];
      vj[n] += wv * rv * Jj[n];

      Eii[block_id][n][k] += wv * Jz * Ji[n];
      Eij[block_id][n][k] += wv * Jz * Jj[n];
    }
  }

  __syncthreads();

  __shared__ float sdata[THREADS];
  for (int n=0; n<6; n++) {
    sdata[threadIdx.x] = vi[n];
    blockReduce(sdata);
    if (threadIdx.x == 0) {
      vs[0][block_id][n] = sdata[0];
    }

    __syncthreads();

    sdata[threadIdx.x] = vj[n];
    blockReduce(sdata);
    if (threadIdx.x == 0) {
      vs[1][block_id][n] = sdata[0];
    }
  }

  l=0;
  for (int n=0; n<12; n++) {
    for (int m=0; m<=n; m++) {
      sdata[threadIdx.x] = hij[l];
      blockReduce(sdata);

      if (threadIdx.x == 0) {
        if (n<6 && m<6) {
          Hs[0][block_id][n][m] = sdata[0];
          Hs[0][block_id][m][n] = sdata[0];
        }
        else if (n>=6 && m<6) {
          Hs[1][block_id][m][n-6] = sdata[0];
          Hs[2][block_id][n-6][m] = sdata[0];
        }
        else {
          Hs[3][block_id][n-6][m-6] = sdata[0];
          Hs[3][block_id][m-6][n-6] = sdata[0];
        }
      }

      l++;
    }
  }
}
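// For each edge (ii[e], jj[e]) the kernel above accumulates one 12x12 Gauss-Newton
// block system over all pixels: Hs[0..3] hold the Hii, Hij, Hji, Hjj 6x6 pose blocks,
// vs[0..1] the two 6-vector right-hand sides, Eii/Eij the pose-depth coupling rows,
// Cii the diagonal depth-depth terms, and bz the depth right-hand side.
// A purely illustrative host-side helper (not part of the original file) that
// stitches the four pose blocks of edge e back into one dense 12x12 matrix:
torch::Tensor assemble_edge_hessian(const torch::Tensor& Hs, int64_t e) {
  using torch::indexing::Slice;
  torch::Tensor H = torch::zeros({12, 12}, Hs.options());
  H.index_put_({Slice(0, 6),  Slice(0, 6)},  Hs[0][e]);
  H.index_put_({Slice(0, 6),  Slice(6, 12)}, Hs[1][e]);
  H.index_put_({Slice(6, 12), Slice(0, 6)},  Hs[2][e]);
  H.index_put_({Slice(6, 12), Slice(6, 12)}, Hs[3][e]);
  return H;
}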
__global__ void projmap_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> intrinsics,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ii,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> jj,
    torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> coords,
    torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> valid)
{
  const int block_id = blockIdx.x;
  const int thread_id = threadIdx.x;

  const int ht = disps.size(1);
  const int wd = disps.size(2);

  __shared__ int ix;
  __shared__ int jx;

  __shared__ float fx;
  __shared__ float fy;
  __shared__ float cx;
  __shared__ float cy;

  __shared__ float ti[3], tj[3], tij[3];
  __shared__ float qi[4], qj[4], qij[4];

  // load intrinsics from global memory
  if (thread_id == 0) {
    ix = static_cast<int>(ii[block_id]);
    jx = static_cast<int>(jj[block_id]);
    fx = intrinsics[0];
    fy = intrinsics[1];
    cx = intrinsics[2];
    cy = intrinsics[3];
  }

  __syncthreads();

  // load poses from global memory
  if (thread_id < 3) {
    ti[thread_id] = poses[ix][thread_id];
    tj[thread_id] = poses[jx][thread_id];
  }

  if (thread_id < 4) {
    qi[thread_id] = poses[ix][thread_id+3];
    qj[thread_id] = poses[jx][thread_id+3];
  }

  __syncthreads();

  if (thread_id == 0) {
    relSE3(ti, qi, tj, qj, tij, qij);
  }

  // points
  float Xi[4];
  float Xj[4];

  __syncthreads();

  GPU_1D_KERNEL_LOOP(k, ht*wd) {
    const int i = k / wd;
    const int j = k % wd;

    const float u = static_cast<float>(j);
    const float v = static_cast<float>(i);

    // homogeneous coordinates
    Xi[0] = (u - cx) / fx;
    Xi[1] = (v - cy) / fy;
    Xi[2] = 1;
    Xi[3] = disps[ix][i][j];

    // transform homogeneous point
    actSE3(tij, qij, Xi, Xj);

    coords[block_id][i][j][0] = u;
    coords[block_id][i][j][1] = v;

    if (Xj[2] > 0.01) {
      coords[block_id][i][j][0] = fx * (Xj[0] / Xj[2]) + cx;
      coords[block_id][i][j][1] = fy * (Xj[1] / Xj[2]) + cy;
    }

    valid[block_id][i][j][0] = (Xj[2] > MIN_DEPTH) ? 1.0 : 0.0;
  }
}

__global__ void frame_distance_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> intrinsics,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ii,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> jj,
    torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> dist,
    const float beta)
{
  const int block_id = blockIdx.x;
  const int thread_id = threadIdx.x;

  const int ht = disps.size(1);
  const int wd = disps.size(2);

  __shared__ int ix;
  __shared__ int jx;

  __shared__ float fx;
  __shared__ float fy;
  __shared__ float cx;
  __shared__ float cy;

  __shared__ float ti[3], tj[3], tij[3];
  __shared__ float qi[4], qj[4], qij[4];

  // load intrinsics from global memory
  if (thread_id == 0) {
    ix = static_cast<int>(ii[block_id]);
    jx = static_cast<int>(jj[block_id]);
    fx = intrinsics[0];
    fy = intrinsics[1];
    cx = intrinsics[2];
    cy = intrinsics[3];
  }

  __syncthreads();

  // points
  float Xi[4];
  float Xj[4];

  __shared__ float accum[THREADS]; accum[thread_id] = 0;
  __shared__ float valid[THREADS]; valid[thread_id] = 0;
  __shared__ float total[THREADS]; total[thread_id] = 0;

  __syncthreads();

  for (int n=0; n<1; n++) {

    if (thread_id < 3) {
      ti[thread_id] = poses[ix][thread_id];
      tj[thread_id] = poses[jx][thread_id];
    }

    if (thread_id < 4) {
      qi[thread_id] = poses[ix][thread_id+3];
      qj[thread_id] = poses[jx][thread_id+3];
    }

    __syncthreads();

    relSE3(ti, qi, tj, qj, tij, qij);

    float d, du, dv;

    GPU_1D_KERNEL_LOOP(k, ht*wd) {
      const int i = k / wd;
      const int j = k % wd;

      const float u = static_cast<float>(j);
      const float v = static_cast<float>(i);

      // if (disps[ix][i][j] < 0.01) {
      //   continue;
      // }

      // homogeneous coordinates
      Xi[0] = (u - cx) / fx;
      Xi[1] = (v - cy) / fy;
      Xi[2] = 1;
      Xi[3] = disps[ix][i][j];

      // transform homogeneous point: full reprojection flow
      actSE3(tij, qij, Xi, Xj);

      du = fx * (Xj[0] / Xj[2]) + cx - u;
      dv = fy * (Xj[1] / Xj[2]) + cy - v;
      d = sqrtf(du*du + dv*dv);

      total[threadIdx.x] += beta;

      if (Xj[2] > MIN_DEPTH) {
        accum[threadIdx.x] += beta * d;
        valid[threadIdx.x] += beta;
      }

      // translation-only flow, blended with weight (1 - beta)
      Xi[0] = (u - cx) / fx;
      Xi[1] = (v - cy) / fy;
      Xi[2] = 1;
      Xi[3] = disps[ix][i][j];

      Xj[0] = Xi[0] + Xi[3] * tij[0];
      Xj[1] = Xi[1] + Xi[3] * tij[1];
      Xj[2] = Xi[2] + Xi[3] * tij[2];

      du = fx * (Xj[0] / Xj[2]) + cx - u;
      dv = fy * (Xj[1] / Xj[2]) + cy - v;
      d = sqrtf(du*du + dv*dv);

      total[threadIdx.x] += (1 - beta);

      if (Xj[2] > MIN_DEPTH) {
        accum[threadIdx.x] += (1 - beta) * d;
        valid[threadIdx.x] += (1 - beta);
      }
    }

    if (threadIdx.x == 0) {
      int tmp = ix;
      ix = jx;
      jx = tmp;
    }

    __syncthreads();
  }

  __syncthreads();
  blockReduce(accum);
  __syncthreads();
  blockReduce(total);
  __syncthreads();
  blockReduce(valid);

  __syncthreads();

  if (thread_id == 0) {
    dist[block_id] = (valid[0] / (total[0] + 1e-8) < 0.75) ? 1000.0 : accum[0] / valid[0];
  }
}
__global__ void depth_filter_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> intrinsics,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> inds,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> thresh,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> counter)
{
  const int block_id = blockIdx.x;
  const int neigh_id = blockIdx.y;
  const int index = blockIdx.z * blockDim.x + threadIdx.x;

  // if (threadIdx.x == 0) {
  //   printf("%d %d %d %d\n", blockIdx.x, blockIdx.y, blockDim.x, threadIdx.x);
  // }

  const int num = disps.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  __shared__ int ix;
  __shared__ int jx;

  __shared__ float fx;
  __shared__ float fy;
  __shared__ float cx;
  __shared__ float cy;

  __shared__ float ti[3], tj[3], tij[3];
  __shared__ float qi[4], qj[4], qij[4];

  if (threadIdx.x == 0) {
    ix = static_cast<int>(inds[block_id]);
    jx = (neigh_id < 3) ? ix - neigh_id - 1 : ix + neigh_id;
    fx = intrinsics[0];
    fy = intrinsics[1];
    cx = intrinsics[2];
    cy = intrinsics[3];
  }

  __syncthreads();

  if (jx < 0 || jx >= num) {
    return;
  }

  const float t = thresh[block_id];

  // load poses from global memory
  if (threadIdx.x < 3) {
    ti[threadIdx.x] = poses[ix][threadIdx.x];
    tj[threadIdx.x] = poses[jx][threadIdx.x];
  }

  if (threadIdx.x < 4) {
    qi[threadIdx.x] = poses[ix][threadIdx.x+3];
    qj[threadIdx.x] = poses[jx][threadIdx.x+3];
  }

  __syncthreads();

  if (threadIdx.x == 0) {
    relSE3(ti, qi, tj, qj, tij, qij);
  }

  // points
  float Xi[4];
  float Xj[4];

  __syncthreads();

  if (index < ht*wd) {
    const int i = index / wd;
    const int j = index % wd;

    const float ui = static_cast<float>(j);
    const float vi = static_cast<float>(i);
    const float di = disps[ix][i][j];

    // homogeneous coordinates
    Xi[0] = (ui - cx) / fx;
    Xi[1] = (vi - cy) / fy;
    Xi[2] = 1;
    Xi[3] = di;

    // transform homogeneous point into the neighbouring frame
    actSE3(tij, qij, Xi, Xj);

    const float uj = fx * (Xj[0] / Xj[2]) + cx;
    const float vj = fy * (Xj[1] / Xj[2]) + cy;
    const float dj = Xj[3] / Xj[2];

    const int u0 = static_cast<int>(floor(uj));
    const int v0 = static_cast<int>(floor(vj));

    if (u0 >= 0 && v0 >= 0 && u0 < wd-1 && v0 < ht-1) {
      const float wx = ceil(uj) - uj;
      const float wy = ceil(vj) - vj;

      const float d00 = disps[jx][v0+0][u0+0];
      const float d01 = disps[jx][v0+0][u0+1];
      const float d10 = disps[jx][v0+1][u0+0];
      const float d11 = disps[jx][v0+1][u0+1];

      // bilinearly interpolated disparity (not used by the per-corner checks below)
      const float dj_hat = wy*wx*d00 + wy*(1-wx)*d01 + (1-wy)*wx*d10 + (1-wy)*(1-wx)*d11;

      const float err = abs(1.0/dj - 1.0/dj_hat);
      if      (abs(1.0/dj - 1.0/d00) < t) atomicAdd(&counter[block_id][i][j], 1.0f);
      else if (abs(1.0/dj - 1.0/d01) < t) atomicAdd(&counter[block_id][i][j], 1.0f);
      else if (abs(1.0/dj - 1.0/d10) < t) atomicAdd(&counter[block_id][i][j], 1.0f);
      else if (abs(1.0/dj - 1.0/d11) < t) atomicAdd(&counter[block_id][i][j], 1.0f);
    }
  }
}
__global__ void iproj_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> intrinsics,
    torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> points)
{
  const int block_id = blockIdx.x;
  const int index = blockIdx.y * blockDim.x + threadIdx.x;

  const int num = disps.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  __shared__ float fx;
  __shared__ float fy;
  __shared__ float cx;
  __shared__ float cy;

  __shared__ float t[3];
  __shared__ float q[4];

  if (threadIdx.x == 0) {
    fx = intrinsics[0];
    fy = intrinsics[1];
    cx = intrinsics[2];
    cy = intrinsics[3];
  }

  __syncthreads();

  // load poses from global memory
  if (threadIdx.x < 3) {
    t[threadIdx.x] = poses[block_id][threadIdx.x];
  }

  if (threadIdx.x < 4) {
    q[threadIdx.x] = poses[block_id][threadIdx.x+3];
  }

  __syncthreads();

  // points
  float Xi[4];
  float Xj[4];

  if (index < ht*wd) {
    const int i = index / wd;
    const int j = index % wd;

    const float ui = static_cast<float>(j);
    const float vi = static_cast<float>(i);
    const float di = disps[block_id][i][j];

    // homogeneous coordinates
    Xi[0] = (ui - cx) / fx;
    Xi[1] = (vi - cy) / fy;
    Xi[2] = 1;
    Xi[3] = di;

    // transform homogeneous point
    actSE3(t, q, Xi, Xj);

    points[block_id][i][j][0] = Xj[0] / Xj[3];
    points[block_id][i][j][1] = Xj[1] / Xj[3];
    points[block_id][i][j][2] = Xj[2] / Xj[3];
  }
}

__global__ void accum_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> inps,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ptrs,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> idxs,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> outs)
{
  const int block_id = blockIdx.x;
  const int D = inps.size(1);

  const int start = ptrs[block_id];
  const int end = ptrs[block_id+1];

  // sum the rows of inps listed in idxs[start..end) into outs[block_id]
  for (int k=threadIdx.x; k<D; k+=blockDim.x) {
    float x = 0;
    for (int i=start; i<end; i++) {
      x += inps[idxs[i]][k];
    }
    outs[block_id][k] = x;
  }
}

__global__ void pose_retr_kernel(
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> dx,
    const int t0, const int t1)
{
  for (int k=t0+threadIdx.x; k<t1; k+=blockDim.x) {
    float xi[6], t[3], q[4], t_new[3], q_new[4];

    t[0] = poses[k][0];
    t[1] = poses[k][1];
    t[2] = poses[k][2];

    q[0] = poses[k][3];
    q[1] = poses[k][4];
    q[2] = poses[k][5];
    q[3] = poses[k][6];

    for (int n=0; n<6; n++) {
      xi[n] = dx[k-t0][n];
    }

    // apply the se3 increment on the left of the current pose
    retrSE3(xi, t, q, t_new, q_new);

    poses[k][0] = t_new[0];
    poses[k][1] = t_new[1];
    poses[k][2] = t_new[2];

    poses[k][3] = q_new[0];
    poses[k][4] = q_new[1];
    poses[k][5] = q_new[2];
    poses[k][6] = q_new[3];
  }
}

__global__ void disp_retr_kernel(
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> dz,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> inds)
{
  const int i = inds[blockIdx.x];
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  // add the depth update for keyframe i
  for (int k=threadIdx.x; k<ht*wd; k+=blockDim.x) {
    float d = disps[i][k/wd][k%wd] + dz[blockIdx.x][k];
    disps[i][k/wd][k%wd] = d;
  }
}

torch::Tensor accum_cuda(torch::Tensor data, torch::Tensor ix, torch::Tensor jx) {
  torch::Tensor ix_cpu = ix.to(torch::kCPU);
  torch::Tensor jx_cpu = jx.to(torch::kCPU);
  torch::Tensor inds = torch::argsort(ix_cpu);

  long* ix_data = ix_cpu.data_ptr<long>();
  long* jx_data = jx_cpu.data_ptr<long>();
  long* kx_data = inds.data_ptr<long>();

  int count = jx.size(0);
  std::vector<int> cols;

  torch::Tensor ptrs_cpu = torch::zeros({count+1},
    torch::TensorOptions().dtype(torch::kInt64));

  long* ptrs_data = ptrs_cpu.data_ptr<long>();
  ptrs_data[0] = 0;

  // build CSR-style pointers/indices grouping the rows of data by target index
  int i = 0;
  for (int j=0; j<count; j++) {
    while (i < ix.size(0) && ix_data[kx_data[i]] <= jx_data[j]) {
      if (ix_data[kx_data[i]] == jx_data[j])
        cols.push_back(kx_data[i]);
      i++;
    }
    ptrs_data[j+1] = cols.size();
  }

  torch::Tensor idxs_cpu = torch::zeros({long(cols.size())},
    torch::TensorOptions().dtype(torch::kInt64));

  long* idxs_data = idxs_cpu.data_ptr<long>();

  for (int i=0; i<cols.size(); i++) {
    idxs_data[i] = cols[i];
  }

  torch::Tensor ptrs = ptrs_cpu.to(torch::kCUDA);
  torch::Tensor idxs = idxs_cpu.to(torch::kCUDA);

  torch::Tensor out = torch::zeros({jx.size(0), data.size(1)}, data.options());

  accum_kernel<<<count, THREADS>>>(
    data.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    ptrs.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    idxs.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    out.packed_accessor32<float,2,torch::RestrictPtrTraits>());

  return out;
}

__global__ void EEt6x6_kernel(
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> E,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> Q,
    const torch::PackedTensorAccessor32<long,2,torch::RestrictPtrTraits> idx,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> S)
{
  // indices
  const int ix = idx[blockIdx.x][0];
  const int jx = idx[blockIdx.x][1];
  const int kx = idx[blockIdx.x][2];

  const int D = E.size(2);

  float dS[6][6];
  float ei[6];
  float ej[6];

  for (int i=0; i<6; i++) {
    for (int j=0; j<6; j++) {
      dS[i][j] = 0;
    }
  }

  for (int k=threadIdx.x; k<D; k+=blockDim.x) {
    const float q = Q[kx][k];

    // coalesced memory read
    for (int n=0; n<6; n++) {
      ei[n] = E[ix][n][k] * q;
      ej[n] = E[jx][n][k];
    }

    // 6x6 outer-product block of E_i Q E_j^T
    for (int n=0; n<6; n++) {
      for (int m=0; m<6; m++) {
        dS[n][m] += ei[n] * ej[m];
      }
    }
  }

  __shared__ float sdata[THREADS];
  for (int n=0; n<6; n++) {
    for (int m=0; m<6; m++) {
      sdata[threadIdx.x] = dS[n][m];
      blockReduce(sdata);

      if (threadIdx.x == 0) {
        S[blockIdx.x][n][m] = sdata[0];
      }
    }
  }
}

__global__ void Ev6x1_kernel(
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> E,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> Q,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> w,
    const torch::PackedTensorAccessor32<long,2,torch::RestrictPtrTraits> idx,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> v)
{
  const int D = E.size(2);
  const int kx = idx[blockIdx.x][0];

  float b[6];
  for (int n=0; n<6; n++) {
    b[n] = 0.0;
  }

  // accumulate E Q w over the pixel dimension
  for (int k=threadIdx.x; k<D; k+=blockDim.x) {
    const float qw = Q[kx][k] * w[kx][k];
    for (int n=0; n<6; n++) {
      b[n] += qw * E[blockIdx.x][n][k];
    }
  }

  __shared__ float sdata[THREADS];
  for (int n=0; n<6; n++) {
    sdata[threadIdx.x] = b[n];
    blockReduce(sdata);

    if (threadIdx.x == 0) {
      v[blockIdx.x][n] = sdata[0];
    }
  }
}

__global__ void EvT6x1_kernel(
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> E,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> x,
    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> idx,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> w)
{
  const int D = E.size(2);
  const int ix = idx[blockIdx.x];

  if (idx[blockIdx.x] <= 0 || idx[blockIdx.x] >= x.size(0))
    return;

  // w = E^T dx for this edge
  for (int k=threadIdx.x; k<D; k+=blockDim.x) {
    float dw = 0;
    for (int n=0; n<6; n++) {
      dw += E[blockIdx.x][n][k] * x[ix][n];
    }
    w[blockIdx.x][k] = dw;
  }
}
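// ---------------------------------------------------------------------------
// The three kernels above provide the pieces of the Schur complement that
// eliminates the per-pixel inverse depths from the normal equations. With B
// the pose-pose block, E the pose-depth coupling, C the diagonal depth block
// (Q = C^{-1}), v the pose right-hand side and w the depth right-hand side,
// schur_block() below assembles
//
//   S = E Q E^T    and    v_s = E Q w,
//
// so the reduced pose system is (B - S) dx = v - v_s, after which the depth
// update is recovered as dz = Q (w - E^T dx) in ba_cuda().
// ---------------------------------------------------------------------------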
class SparseBlock {
  public:

    Eigen::SparseMatrix<double> A;
    Eigen::VectorX<double> b;

    SparseBlock(int N, int M) : N(N), M(M) {
      A = Eigen::SparseMatrix<double>(N*M, N*M);
      b = Eigen::VectorXd::Zero(N*M);
    }

    SparseBlock(Eigen::SparseMatrix<double> const& A, Eigen::VectorX<double> const& b,
        int N, int M) : A(A), b(b), N(N), M(M) {}

    SparseBlock operator-(const SparseBlock& S) {
      return SparseBlock(A - S.A, b - S.b, N, M);
    }

    void update_lhs(torch::Tensor As, torch::Tensor ii, torch::Tensor jj) {

      auto As_cpu = As.to(torch::kCPU).to(torch::kFloat64);
      auto ii_cpu = ii.to(torch::kCPU).to(torch::kInt64);
      auto jj_cpu = jj.to(torch::kCPU).to(torch::kInt64);

      auto As_acc = As_cpu.accessor<double,3>();
      auto ii_acc = ii_cpu.accessor<long,1>();
      auto jj_acc = jj_cpu.accessor<long,1>();

      std::vector<T> tripletList;
      for (int n=0; n<ii.size(0); n++) {
        const int i = ii_acc[n];
        const int j = jj_acc[n];

        if (i >= 0 && j >= 0) {
          for (int k=0; k<M; k++) {
            for (int l=0; l<M; l++) {
              double val = As_acc[n][k][l];
              tripletList.push_back(T(M*i + k, M*j + l, val));
            }
          }
        }
      }

      SpMat A1(N*M, N*M);
      A1.setFromTriplets(tripletList.begin(), tripletList.end());

      A += A1;
    }

    void update_rhs(torch::Tensor bs, torch::Tensor ii) {
      auto bs_cpu = bs.to(torch::kCPU).to(torch::kFloat64);
      auto ii_cpu = ii.to(torch::kCPU).to(torch::kInt64);

      auto bs_acc = bs_cpu.accessor<double,2>();
      auto ii_acc = ii_cpu.accessor<long,1>();

      for (int n=0; n<ii.size(0); n++) {
        const int i = ii_acc[n];
        if (i >= 0) {
          for (int j=0; j<M; j++) {
            b(i*M + j) += bs_acc[n][j];
          }
        }
      }
    }

    std::tuple<torch::Tensor, torch::Tensor> get_dense() {
      Eigen::MatrixXd Ad = Eigen::MatrixXd(A);

      torch::Tensor H = torch::from_blob(Ad.data(), {N*M, N*M}, torch::TensorOptions()
        .dtype(torch::kFloat64)).to(torch::kCUDA).to(torch::kFloat32);

      torch::Tensor v = torch::from_blob(b.data(), {N*M, 1}, torch::TensorOptions()
        .dtype(torch::kFloat64)).to(torch::kCUDA).to(torch::kFloat32);

      return std::make_tuple(H, v);
    }

    torch::Tensor solve(const float lm=0.0001, const float ep=0.1) {

      torch::Tensor dx;

      Eigen::SparseMatrix<double> L(A);
      L.diagonal().array() += ep + lm * L.diagonal().array();

      Eigen::SimplicialLLT<Eigen::SparseMatrix<double>> solver;
      solver.compute(L);

      if (solver.info() == Eigen::Success) {
        Eigen::VectorXd x = solver.solve(b);
        dx = torch::from_blob(x.data(), {N, M}, torch::TensorOptions()
          .dtype(torch::kFloat64)).to(torch::kCUDA).to(torch::kFloat32);
      }
      else {
        dx = torch::zeros({N, M}, torch::TensorOptions()
          .device(torch::kCUDA).dtype(torch::kFloat32));
      }

      return dx;
    }

  private:
    const int N;
    const int M;
};

SparseBlock schur_block(torch::Tensor E,
                        torch::Tensor Q,
                        torch::Tensor w,
                        torch::Tensor ii,
                        torch::Tensor jj,
                        torch::Tensor kk,
                        const int t0,
                        const int t1)
{
  torch::Tensor ii_cpu = ii.to(torch::kCPU);
  torch::Tensor jj_cpu = jj.to(torch::kCPU);
  torch::Tensor kk_cpu = kk.to(torch::kCPU);

  const int P = t1 - t0;
  const long* ii_data = ii_cpu.data_ptr<long>();
  const long* jj_data = jj_cpu.data_ptr<long>();
  const long* kk_data = kk_cpu.data_ptr<long>();

  std::vector<std::vector<long>> graph(P);
  std::vector<std::vector<long>> index(P);

  for (int n=0; n<ii_cpu.size(0); n++) {
    const int j = jj_data[n];
    const int k = kk_data[n];

    if (j >= t0 && j <= t1) {
      const int t = j - t0;
      graph[t].push_back(k);
      index[t].push_back(n);
    }
  }

  std::vector<long> ii_list, jj_list, idx, jdx;

  // enumerate pairs of edges that share a depth block
  for (int i=0; i<P; i++) {
    for (int j=0; j<P; j++) {
      for (int k=0; k<graph[i].size(); k++) {
        for (int l=0; l<graph[j].size(); l++) {
          if (graph[i][k] == graph[j][l]) {
            ii_list.push_back(i);
            jj_list.push_back(j);

            idx.push_back(index[i][k]);
            idx.push_back(index[j][l]);
            idx.push_back(graph[i][k]);
          }
        }
      }
    }
  }

  torch::Tensor ix_cuda = torch::from_blob(idx.data(), {long(idx.size())},
    torch::TensorOptions().dtype(torch::kInt64)).to(torch::kCUDA).view({-1, 3});

  torch::Tensor jx_cuda = kk_cpu.to(torch::kCUDA).view({-1, 1});

  torch::Tensor ii2_cpu = torch::from_blob(ii_list.data(), {long(ii_list.size())},
    torch::TensorOptions().dtype(torch::kInt64)).view({-1});

  torch::Tensor jj2_cpu = torch::from_blob(jj_list.data(), {long(jj_list.size())},
    torch::TensorOptions().dtype(torch::kInt64)).view({-1});

  torch::Tensor S = torch::zeros({ix_cuda.size(0), 6, 6}, E.options());
  torch::Tensor v = torch::zeros({jx_cuda.size(0), 6}, E.options());

  EEt6x6_kernel<<<ix_cuda.size(0), THREADS>>>(
    E.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    Q.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    ix_cuda.packed_accessor32<long,2,torch::RestrictPtrTraits>(),
    S.packed_accessor32<float,3,torch::RestrictPtrTraits>());

  Ev6x1_kernel<<<jx_cuda.size(0), THREADS>>>(
    E.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    Q.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    w.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    jx_cuda.packed_accessor32<long,2,torch::RestrictPtrTraits>(),
    v.packed_accessor32<float,2,torch::RestrictPtrTraits>());

  // schur block
  SparseBlock A(P, 6);

  A.update_lhs(S, ii2_cpu, jj2_cpu);
  A.update_rhs(v, jj_cpu - t0);

  return A;
}
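// ---------------------------------------------------------------------------
// Dense bundle adjustment entry point. The shapes below are inferred from how
// the tensors are indexed by the kernels above (N = number of frames,
// M = number of edges in the factor graph, ht x wd = image resolution):
//   poses       [N, 7]          (tx, ty, tz, qx, qy, qz, qw), updated in place
//   disps       [N, ht, wd]     inverse depth, updated in place
//   intrinsics  [4]             fx, fy, cx, cy
//   disps_sens  [N, ht, wd]     sensor depth prior (0 where absent)
//   targets     [M, 2, ht, wd]  predicted reprojection coordinates
//   weights     [M, 2, ht, wd]  per-pixel confidence weights
//   eta         per-pixel damping added to the depth block of optimized keyframes
//   ii, jj      [M]             source / target frame index of each edge
// Poses outside [t0, t1) are held fixed; motion_only skips the depth update.
// ---------------------------------------------------------------------------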
std::vector<torch::Tensor> ba_cuda(
    torch::Tensor poses,
    torch::Tensor disps,
    torch::Tensor intrinsics,
    torch::Tensor disps_sens,
    torch::Tensor targets,
    torch::Tensor weights,
    torch::Tensor eta,
    torch::Tensor ii,
    torch::Tensor jj,
    const int t0,
    const int t1,
    const int iterations,
    const float lm,
    const float ep,
    const bool motion_only)
{
  auto opts = poses.options();
  const int num = ii.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  torch::Tensor ts = torch::arange(t0, t1).to(torch::kCUDA);
  torch::Tensor ii_exp = torch::cat({ts, ii}, 0);
  torch::Tensor jj_exp = torch::cat({ts, jj}, 0);

  std::tuple<torch::Tensor, torch::Tensor> kuniq =
    torch::_unique(ii_exp, true, true);

  torch::Tensor kx = std::get<0>(kuniq);
  torch::Tensor kk_exp = std::get<1>(kuniq);

  torch::Tensor dx;
  torch::Tensor dz;

  // initialize buffers
  torch::Tensor Hs = torch::zeros({4, num, 6, 6}, opts);
  torch::Tensor vs = torch::zeros({2, num, 6}, opts);
  torch::Tensor Eii = torch::zeros({num, 6, ht*wd}, opts);
  torch::Tensor Eij = torch::zeros({num, 6, ht*wd}, opts);
  torch::Tensor Cii = torch::zeros({num, ht*wd}, opts);
  torch::Tensor wi = torch::zeros({num, ht*wd}, opts);

  for (int itr=0; itr<iterations; itr++) {

    projective_transform_kernel<<<num, THREADS>>>(
      targets.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
      weights.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
      poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
      disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
      intrinsics.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
      ii.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
      jj.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
      Hs.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
      vs.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
      Eii.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
      Eij.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
      Cii.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
      wi.packed_accessor32<float,2,torch::RestrictPtrTraits>());

    // pose x pose block
    SparseBlock A(t1 - t0, 6);

    A.update_lhs(Hs.reshape({-1, 6, 6}),
        torch::cat({ii, ii, jj, jj}) - t0,
        torch::cat({ii, jj, ii, jj}) - t0);

    A.update_rhs(vs.reshape({-1, 6}),
        torch::cat({ii, jj}) - t0);

    if (motion_only) {
      dx = A.solve(lm, ep);

      // update poses
      pose_retr_kernel<<<1, THREADS>>>(
        poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
        dx.packed_accessor32<float,2,torch::RestrictPtrTraits>(), t0, t1);
    }

    else {
      // add depth residual if there are depth sensor measurements
      const float alpha = 0.05;
      torch::Tensor m = (disps_sens.index({kx, "..."}) > 0)
        .to(torch::TensorOptions().dtype(torch::kFloat32)).view({-1, ht*wd});

      torch::Tensor C = accum_cuda(Cii, ii, kx) + m * alpha
        + (1 - m) * eta.view({-1, ht*wd});

      torch::Tensor w = accum_cuda(wi, ii, kx)
        - m * alpha * (disps.index({kx, "..."}) - disps_sens.index({kx, "..."})).view({-1, ht*wd});

      torch::Tensor Q = 1.0 / C;

      torch::Tensor Ei = accum_cuda(Eii.view({num, 6*ht*wd}), ii, ts).view({t1-t0, 6, ht*wd});
      torch::Tensor E = torch::cat({Ei, Eij}, 0);

      SparseBlock S = schur_block(E, Q, w, ii_exp, jj_exp, kk_exp, t0, t1);
      dx = (A - S).solve(lm, ep);

      torch::Tensor ix = jj_exp - t0;
      torch::Tensor dw = torch::zeros({ix.size(0), ht*wd}, opts);

      EvT6x1_kernel<<<ix.size(0), THREADS>>>(
        E.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
        dx.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
        ix.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
        dw.packed_accessor32<float,2,torch::RestrictPtrTraits>());

      dz = Q * (w - accum_cuda(dw, ii_exp, kx));

      // update poses
      pose_retr_kernel<<<1, THREADS>>>(
        poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
        dx.packed_accessor32<float,2,torch::RestrictPtrTraits>(), t0, t1);

      // update disparity maps
      disp_retr_kernel<<<kx.size(0), THREADS>>>(
        disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
        dz.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
        kx.packed_accessor32<long,1,torch::RestrictPtrTraits>());
    }
  }

  return {dx, dz};
}

torch::Tensor frame_distance_cuda(
    torch::Tensor poses,
    torch::Tensor disps,
    torch::Tensor intrinsics,
    torch::Tensor ii,
    torch::Tensor jj,
    const float beta)
{
  auto opts = poses.options();
  const int num = ii.size(0);

  torch::Tensor dist = torch::zeros({num}, opts);

  frame_distance_kernel<<<num, THREADS>>>(
    poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    intrinsics.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
    ii.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    jj.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    dist.packed_accessor32<float,1,torch::RestrictPtrTraits>(), beta);

  return dist;
}

std::vector<torch::Tensor> projmap_cuda(
    torch::Tensor poses,
    torch::Tensor disps,
    torch::Tensor intrinsics,
    torch::Tensor ii,
    torch::Tensor jj)
{
  auto opts = poses.options();
  const int num = ii.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  torch::Tensor coords = torch::zeros({num, ht, wd, 3}, opts);
  torch::Tensor valid = torch::zeros({num, ht, wd, 1}, opts);

  projmap_kernel<<<num, THREADS>>>(
    poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    intrinsics.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
    ii.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    jj.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    coords.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
    valid.packed_accessor32<float,4,torch::RestrictPtrTraits>());

  return {coords, valid};
}

torch::Tensor depth_filter_cuda(
    torch::Tensor poses,
    torch::Tensor disps,
    torch::Tensor intrinsics,
    torch::Tensor ix,
    torch::Tensor thresh)
{
  const int num = ix.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  torch::Tensor counter = torch::zeros({num, ht, wd}, disps.options());

  dim3 blocks(num, 6, NUM_BLOCKS(ht * wd));

  depth_filter_kernel<<<blocks, THREADS>>>(
    poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    intrinsics.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
    ix.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
    thresh.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
    counter.packed_accessor32<float,3,torch::RestrictPtrTraits>());

  return counter;
}

torch::Tensor iproj_cuda(
    torch::Tensor poses,
    torch::Tensor disps,
    torch::Tensor intrinsics)
{
  const int nm = disps.size(0);
  const int ht = disps.size(1);
  const int wd = disps.size(2);

  auto opts = disps.options();
  torch::Tensor points = torch::zeros({nm, ht, wd, 3}, opts);

  dim3 blocks(nm, NUM_BLOCKS(ht * wd));

  iproj_kernel<<<blocks, THREADS>>>(
    poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
    disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
    intrinsics.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
    points.packed_accessor32<float,4,torch::RestrictPtrTraits>());

  return points;
}
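// ---------------------------------------------------------------------------
// These entry points are meant to be called from Python through a standard
// PyTorch C++ extension; the actual bindings presumably live in a separate
// source file. A minimal, hypothetical module definition (the Python-facing
// names here are chosen only for illustration) could look like this:
// ---------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ba", &ba_cuda, "dense bundle adjustment");
  m.def("frame_distance", &frame_distance_cuda, "mean flow magnitude between frames");
  m.def("projmap", &projmap_cuda, "project pixels of frame i into frame j");
  m.def("depth_filter", &depth_filter_cuda, "multi-view depth consistency counter");
  m.def("iproj", &iproj_cuda, "back-project disparities to 3D points");
}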