Open3D (C++ API)  0.17.0
FillInLinearSystemImpl.h
Go to the documentation of this file.
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// Copyright (c) 2018-2023 www.open3d.org
5// SPDX-License-Identifier: MIT
6// ----------------------------------------------------------------------------
7
11
12namespace open3d {
13namespace t {
14namespace pipelines {
15namespace kernel {
16#if defined(__CUDACC__)
17void FillInRigidAlignmentTermCUDA
18#else
20#endif
21 (core::Tensor &AtA,
22 core::Tensor &Atb,
23 core::Tensor &residual,
24 const core::Tensor &Ti_ps,
25 const core::Tensor &Tj_qs,
26 const core::Tensor &Ri_normal_ps,
27 int i,
28 int j,
29 float threshold) {
30
31 core::Device device = AtA.GetDevice();
32 int64_t n = Ti_ps.GetLength();
33 if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
35 "Unable to setup linear system: input length mismatch.");
36 }
37
38 // First fill in a small 12 x 12 linear system
39 core::Tensor AtA_local =
40 core::Tensor::Zeros({12, 12}, core::Float32, device);
41 core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device);
42
43 float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
44 float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
45 float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
46
47 const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
48 const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
49 const float *Ri_normal_ps_ptr =
50 static_cast<const float *>(Ri_normal_ps.GetDataPtr());
51
53 AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
54 const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
55 const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
56 const float *normal_p_prime =
57 Ri_normal_ps_ptr + 3 * workload_idx;
58
59 float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
60 (p_prime[1] - q_prime[1]) * normal_p_prime[1] +
61 (p_prime[2] - q_prime[2]) * normal_p_prime[2];
62 if (abs(r) > threshold) return;
63
64 float J_ij[12];
65 J_ij[0] = -q_prime[2] * normal_p_prime[1] +
66 q_prime[1] * normal_p_prime[2];
67 J_ij[1] = q_prime[2] * normal_p_prime[0] -
68 q_prime[0] * normal_p_prime[2];
69 J_ij[2] = -q_prime[1] * normal_p_prime[0] +
70 q_prime[0] * normal_p_prime[1];
71 J_ij[3] = normal_p_prime[0];
72 J_ij[4] = normal_p_prime[1];
73 J_ij[5] = normal_p_prime[2];
74 for (int k = 0; k < 6; ++k) {
75 J_ij[k + 6] = -J_ij[k];
76 }
77
78 // Not optimized; Switch to reduction if necessary.
79#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
80 for (int i_local = 0; i_local < 12; ++i_local) {
81 for (int j_local = 0; j_local < 12; ++j_local) {
82 atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
83 J_ij[i_local] * J_ij[j_local]);
84 }
85 atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
86 }
87 atomicAdd(residual_ptr, r * r);
88#else
89#pragma omp critical(FillInRigidAlignmentTermCPU)
90 {
91 for (int i_local = 0; i_local < 12; ++i_local) {
92 for (int j_local = 0; j_local < 12; ++j_local) {
93 AtA_local_ptr[i_local * 12 + j_local]
94 += J_ij[i_local] * J_ij[j_local];
95 }
96 Atb_local_ptr[i_local] += J_ij[i_local] * r;
97 }
98 *residual_ptr += r * r;
99 }
100#endif
101 });
102
103 // Then fill-in the large linear system
104 std::vector<int64_t> indices_vec(12);
105 for (int k = 0; k < 6; ++k) {
106 indices_vec[k] = i * 6 + k;
107 indices_vec[k + 6] = j * 6 + k;
108 }
109
110 std::vector<int64_t> indices_i_vec;
111 std::vector<int64_t> indices_j_vec;
112 for (int local_i = 0; local_i < 12; ++local_i) {
113 for (int local_j = 0; local_j < 12; ++local_j) {
114 indices_i_vec.push_back(indices_vec[local_i]);
115 indices_j_vec.push_back(indices_vec[local_j]);
116 }
117 }
118
119 core::Tensor indices(indices_vec, {12}, core::Int64, device);
120 core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device);
121 core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device);
122
123 core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
124 AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));
125
126 core::Tensor Atb_sub = Atb.IndexGet({indices});
127 Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
128}
129
130#if defined(__CUDACC__)
131void FillInSLACAlignmentTermCUDA
132#else
134#endif
135 (core::Tensor &AtA,
136 core::Tensor &Atb,
137 core::Tensor &residual,
138 const core::Tensor &Ti_Cps,
139 const core::Tensor &Tj_Cqs,
140 const core::Tensor &Cnormal_ps,
141 const core::Tensor &Ri_Cnormal_ps,
142 const core::Tensor &RjT_Ri_Cnormal_ps,
143 const core::Tensor &cgrid_idx_ps,
144 const core::Tensor &cgrid_idx_qs,
145 const core::Tensor &cgrid_ratio_qs,
146 const core::Tensor &cgrid_ratio_ps,
147 int i,
148 int j,
149 int n_frags,
150 float threshold) {
151 int64_t n = Ti_Cps.GetLength();
152 if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
153 Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
154 cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
155 cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
157 "Unable to setup linear system: input length mismatch.");
158 }
159
160 int n_vars = Atb.GetLength();
161 float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
162 float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
163 float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
164
165 // Geometric properties
166 const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
167 const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
168 const float *Cnormal_ps_ptr =
169 static_cast<const float *>(Cnormal_ps.GetDataPtr());
170 const float *Ri_Cnormal_ps_ptr =
171 static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
172 const float *RjT_Ri_Cnormal_ps_ptr =
173 static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());
174
175 // Association properties
176 const int *cgrid_idx_ps_ptr =
177 static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
178 const int *cgrid_idx_qs_ptr =
179 static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
180 const float *cgrid_ratio_ps_ptr =
181 static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
182 const float *cgrid_ratio_qs_ptr =
183 static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
184
186 AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
187 const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
188 const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
189 const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
190 const float *Ri_Cnormal_p =
191 Ri_Cnormal_ps_ptr + 3 * workload_idx;
192 const float *RjTRi_Cnormal_p =
193 RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;
194
195 const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
196 const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
197 const float *cgrid_ratio_p =
198 cgrid_ratio_ps_ptr + 8 * workload_idx;
199 const float *cgrid_ratio_q =
200 cgrid_ratio_qs_ptr + 8 * workload_idx;
201
202 float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
203 (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
204 (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
205 if (abs(r) > threshold) return;
206
207 // Now we fill in a 60 x 60 sub-matrix: 2 x (6 + 8 x 3)
208 float J[60];
209 int idx[60];
210
211 // Jacobian w.r.t. Ti: 0-6
212 J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
213 J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
214 J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
215 J[3] = Ri_Cnormal_p[0];
216 J[4] = Ri_Cnormal_p[1];
217 J[5] = Ri_Cnormal_p[2];
218
219 // Jacobian w.r.t. Tj: 6-12
220 for (int k = 0; k < 6; ++k) {
221 J[k + 6] = -J[k];
222
223 idx[k + 0] = 6 * i + k;
224 idx[k + 6] = 6 * j + k;
225 }
226
227 // Jacobian w.r.t. C over p: 12-36
228 for (int k = 0; k < 8; ++k) {
229 J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
230 J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
231 J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];
232
233 idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
234 idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
235 idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
236 }
237
238 // Jacobian w.r.t. C over q: 36-60
239 for (int k = 0; k < 8; ++k) {
240 J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
241 J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
242 J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];
243
244 idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
245 idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
246 idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
247 }
248
249 // Not optimized; Switch to reduction if necessary.
250#if defined(__CUDACC__)
251 for (int ki = 0; ki < 60; ++ki) {
252 for (int kj = 0; kj < 60; ++kj) {
253 float AtA_ij = J[ki] * J[kj];
254 int ij = idx[ki] * n_vars + idx[kj];
255 atomicAdd(AtA_ptr + ij, AtA_ij);
256 }
257 float Atb_i = J[ki] * r;
258 atomicAdd(Atb_ptr + idx[ki], Atb_i);
259 }
260 atomicAdd(residual_ptr, r * r);
261#else
262#pragma omp critical(FillInSLACAlignmentTermCPU)
263 {
264 for (int ki = 0; ki < 60; ++ki) {
265 for (int kj = 0; kj < 60; ++kj) {
266 AtA_ptr[idx[ki] * n_vars + idx[kj]]
267 += J[ki] * J[kj];
268 }
269 Atb_ptr[idx[ki]] += J[ki] * r;
270 }
271 *residual_ptr += r * r;
272 }
273#endif
274 });
275}
276
277#if defined(__CUDACC__)
278void FillInSLACRegularizerTermCUDA
279#else
281#endif
282 (core::Tensor &AtA,
283 core::Tensor &Atb,
284 core::Tensor &residual,
285 const core::Tensor &grid_idx,
286 const core::Tensor &grid_nbs_idx,
287 const core::Tensor &grid_nbs_mask,
288 const core::Tensor &positions_init,
289 const core::Tensor &positions_curr,
290 float weight,
291 int n_frags,
292 int anchor_idx) {
293
294 int64_t n = grid_idx.GetLength();
295 int64_t n_vars = Atb.GetLength();
296
297 float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
298 float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
299 float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
300
301 const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
302 const int *grid_nbs_idx_ptr =
303 static_cast<const int *>(grid_nbs_idx.GetDataPtr());
304 const bool *grid_nbs_mask_ptr =
305 static_cast<const bool *>(grid_nbs_mask.GetDataPtr());
306
307 const float *positions_init_ptr =
308 static_cast<const float *>(positions_init.GetDataPtr());
309 const float *positions_curr_ptr =
310 static_cast<const float *>(positions_curr.GetDataPtr());
311
313 AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
314 // Enumerate 6 neighbors
315 int idx_i = grid_idx_ptr[workload_idx];
316
317 const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
318 const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;
319
320 // Build a 3x3 linear system to compute the local R
321 float cov[3][3] = {{0}};
322 float U[3][3], V[3][3], S[3];
323
324 int cnt = 0;
325 for (int k = 0; k < 6; ++k) {
326 bool mask_k = mask_nbs[k];
327 if (!mask_k) continue;
328
329 int idx_k = idx_nbs[k];
330
331 // Now build linear systems
332 float diff_ik_init[3] = {
333 positions_init_ptr[idx_i * 3 + 0] -
334 positions_init_ptr[idx_k * 3 + 0],
335 positions_init_ptr[idx_i * 3 + 1] -
336 positions_init_ptr[idx_k * 3 + 1],
337 positions_init_ptr[idx_i * 3 + 2] -
338 positions_init_ptr[idx_k * 3 + 2]};
339 float diff_ik_curr[3] = {
340 positions_curr_ptr[idx_i * 3 + 0] -
341 positions_curr_ptr[idx_k * 3 + 0],
342 positions_curr_ptr[idx_i * 3 + 1] -
343 positions_curr_ptr[idx_k * 3 + 1],
344 positions_curr_ptr[idx_i * 3 + 2] -
345 positions_curr_ptr[idx_k * 3 + 2]};
346
347 // Build linear system by computing XY^T when formulating Y
348 // = RX Y: curr X: init
349 for (int i = 0; i < 3; ++i) {
350 for (int j = 0; j < 3; ++j) {
351 cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
352 }
353 }
354 ++cnt;
355 }
356
357 if (cnt < 3) {
358 return;
359 }
360
361 core::linalg::kernel::svd3x3(*cov, *U, S, *V);
362
363 float R[3][3];
366
367 float d = core::linalg::kernel::det3x3(*R);
368
369 if (d < 0) {
370 U[2][0] = -U[2][0];
371 U[2][1] = -U[2][1];
372 U[2][2] = -U[2][2];
374 }
375
376 // Now we have R, we build Hessian and residuals
377 // But first, we need to anchor a point
378 if (idx_i == anchor_idx) {
379 R[0][0] = R[1][1] = R[2][2] = 1;
380 R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] =
381 0;
382 }
383 for (int k = 0; k < 6; ++k) {
384 bool mask_k = mask_nbs[k];
385
386 if (mask_k) {
387 int idx_k = idx_nbs[k];
388
389 float diff_ik_init[3] = {
390 positions_init_ptr[idx_i * 3 + 0] -
391 positions_init_ptr[idx_k * 3 + 0],
392 positions_init_ptr[idx_i * 3 + 1] -
393 positions_init_ptr[idx_k * 3 + 1],
394 positions_init_ptr[idx_i * 3 + 2] -
395 positions_init_ptr[idx_k * 3 + 2]};
396 float diff_ik_curr[3] = {
397 positions_curr_ptr[idx_i * 3 + 0] -
398 positions_curr_ptr[idx_k * 3 + 0],
399 positions_curr_ptr[idx_i * 3 + 1] -
400 positions_curr_ptr[idx_k * 3 + 1],
401 positions_curr_ptr[idx_i * 3 + 2] -
402 positions_curr_ptr[idx_k * 3 + 2]};
403 float R_diff_ik_curr[3];
404
405 core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init,
406 R_diff_ik_curr);
407
408 float local_r[3];
409 local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
410 local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
411 local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];
412
413 int offset_idx_i = 3 * idx_i + 6 * n_frags;
414 int offset_idx_k = 3 * idx_k + 6 * n_frags;
415
416#if defined(__CUDACC__)
417 // Update residual
418 atomicAdd(residual_ptr,
419 weight * (local_r[0] * local_r[0] +
420 local_r[1] * local_r[1] +
421 local_r[2] * local_r[2]));
422
423 for (int axis = 0; axis < 3; ++axis) {
424 // Update AtA: 2x2
425 atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
426 offset_idx_i + axis],
427 weight);
428 atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
429 offset_idx_k + axis],
430 weight);
431 atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
432 offset_idx_k + axis],
433 -weight);
434 atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
435 offset_idx_i + axis],
436 -weight);
437
438 // Update Atb: 2x1
439 atomicAdd(&Atb_ptr[offset_idx_i + axis],
440 +weight * local_r[axis]);
441 atomicAdd(&Atb_ptr[offset_idx_k + axis],
442 -weight * local_r[axis]);
443 }
444#else
445#pragma omp critical(FillInSLACRegularizerTermCPU)
446 {
447 // Update residual
448 *residual_ptr += weight * (local_r[0] * local_r[0] +
449 local_r[1] * local_r[1] +
450 local_r[2] * local_r[2]);
451
452 for (int axis = 0; axis < 3; ++axis) {
453 // Update AtA: 2x2
454 AtA_ptr[(offset_idx_i + axis) * n_vars +
455 offset_idx_i + axis] += weight;
456 AtA_ptr[(offset_idx_k + axis) * n_vars +
457 offset_idx_k + axis] += weight;
458
459 AtA_ptr[(offset_idx_i + axis) * n_vars +
460 offset_idx_k + axis] -= weight;
461 AtA_ptr[(offset_idx_k + axis) * n_vars +
462 offset_idx_i + axis] -= weight;
463
464 // Update Atb: 2x1
465 Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
466 Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
467 }
468 }
469#endif
470 }
471 }
472 });
473}
474} // namespace kernel
475} // namespace pipelines
476} // namespace t
477} // namespace open3d
#define OPEN3D_DEVICE
Definition: CUDAUtils.h:45
#define LogError(...)
Definition: Logging.h:48
Definition: Device.h:18
Definition: Tensor.h:32
T * GetDataPtr()
Definition: Tensor.h:1133
Tensor View(const SizeVector &dst_shape) const
Definition: Tensor.cpp:688
static Tensor Zeros(const SizeVector &shape, Dtype dtype, const Device &device=Device("CPU:0"))
Create a tensor fill with zeros.
Definition: Tensor.cpp:373
void IndexSet(const std::vector< Tensor > &index_tensors, const Tensor &src_tensor)
Advanced indexing getter.
Definition: Tensor.cpp:903
Tensor IndexGet(const std::vector< Tensor > &index_tensors) const
Advanced indexing getter. This will always allocate a new Tensor.
Definition: Tensor.cpp:872
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void transpose3x3_(scalar_t *A_3x3)
Definition: Matrix.h:151
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void matmul3x3_3x3(const scalar_t *A_3x3, const scalar_t *B_3x3, scalar_t *C_3x3)
Definition: Matrix.h:48
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3, scalar_t *U_3x3, scalar_t *S_3x1, scalar_t *V_3x3)
OPEN3D_DEVICE OPEN3D_FORCE_INLINE scalar_t det3x3(const scalar_t *A_3x3)
Definition: Matrix.h:89
const Dtype Int64
Definition: Dtype.cpp:47
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:103
const Dtype Float32
Definition: Dtype.cpp:42
void FillInSLACAlignmentTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, const core::Tensor &normal_ps, const core::Tensor &Ri_normal_ps, const core::Tensor &RjT_Ri_normal_ps, const core::Tensor &cgrid_idx_ps, const core::Tensor &cgrid_idx_qs, const core::Tensor &cgrid_ratio_qs, const core::Tensor &cgrid_ratio_ps, int i, int j, int n, float threshold)
Definition: FillInLinearSystemImpl.h:135
void FillInRigidAlignmentTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, const core::Tensor &Ri_normal_ps, int i, int j, float threshold)
Definition: FillInLinearSystemImpl.h:21
void FillInSLACRegularizerTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &grid_idx, const core::Tensor &grid_nbs_idx, const core::Tensor &grid_nbs_mask, const core::Tensor &positions_init, const core::Tensor &positions_curr, float weight, int n, int anchor_idx)
Definition: FillInLinearSystemImpl.h:282
Definition: PinholeCameraIntrinsic.cpp:16