Static Value-Flow Analysis
fastcluster_dm.cpp.inc
Go to the documentation of this file.
1 /*
2  fastcluster: Fast hierarchical clustering routines for R and Python
3 
4  Copyright © 2011 Daniel Müllner
5  <http://danifold.net>
6 
7  This library implements various fast algorithms for hierarchical,
8  agglomerative clustering methods:
9 
10  (1) Algorithms for the "stored matrix approach": the input is the array of
11  pairwise dissimilarities.
12 
13  MST_linkage_core: single linkage clustering with the "minimum spanning
14  tree algorithm (Rohlfs)
15 
16  NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
17  complete, average, weighted and Ward linkage (Murtagh)
18 
19  generic_linkage: generic algorithm, suitable for all distance update
20  formulas (Müllner)
21 
22  (2) Algorithms for the "stored data approach": the input are points in a
23  vector space.
24 
25  MST_linkage_core_vector: single linkage clustering for vector data
26 
27  generic_linkage_vector: generic algorithm for vector data, suitable for
28  the Ward, centroid and median methods.
29 
30  generic_linkage_vector_alternative: alternative scheme for updating the
31  nearest neighbors. This method seems faster than "generic_linkage_vector"
32  for the centroid and median methods but slower for the Ward method.
33 
34  All these implementation treat infinity values correctly. They throw an
35  exception if a NaN distance value occurs.
36 */
37 
38 // Older versions of Microsoft Visual Studio do not have the fenv header.
39 #ifdef _MSC_VER
40 #if (_MSC_VER == 1500 || _MSC_VER == 1600)
41 #define NO_INCLUDE_FENV
42 #endif
43 #endif
44 // NaN detection via fenv might not work on systems with software
45 // floating-point emulation (bug report for Debian armel).
46 #ifdef __SOFTFP__
47 #define NO_INCLUDE_FENV
48 #endif
49 #ifdef NO_INCLUDE_FENV
50 // #pragma message("Do not use fenv header.")
51 #else
52 // #pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
53 // TODO
54 //#pragma STDC FENV_ACCESS ON
55 #include <fenv.h>
56 #endif
57 
58 #include <cmath> // for std::pow, std::sqrt
59 #include <cstddef> // for std::ptrdiff_t
60 #include <limits> // for std::numeric_limits<...>::infinity()
61 #include <algorithm> // for std::fill_n
62 #include <stdexcept> // for std::runtime_error
63 #include <string> // for std::string
64 
65 #include <cfloat> // also for DBL_MAX, DBL_MIN
66 #ifndef DBL_MANT_DIG
67 #error The constant DBL_MANT_DIG could not be defined.
68 #endif
69 #define T_FLOAT_MANT_DIG DBL_MANT_DIG
70 
71 #ifndef LONG_MAX
72 #include <climits>
73 #endif
74 #ifndef LONG_MAX
75 #error The constant LONG_MAX could not be defined.
76 #endif
77 #ifndef INT_MAX
78 #error The constant INT_MAX could not be defined.
79 #endif
80 
81 #ifndef INT32_MAX
82 #ifdef _MSC_VER
83 #if _MSC_VER >= 1600
84 #define __STDC_LIMIT_MACROS
85 #include <stdint.h>
86 #else
87 typedef __int32 int_fast32_t;
88 typedef __int64 int64_t;
89 #endif
90 #else
91 #define __STDC_LIMIT_MACROS
92 #include <stdint.h>
93 #endif
94 #endif
95 
96 #define FILL_N std::fill_n
97 #ifdef _MSC_VER
98 #if _MSC_VER < 1600
99 #undef FILL_N
100 #define FILL_N stdext::unchecked_fill_n
101 #endif
102 #endif
103 
104 // Suppress warnings about (potentially) uninitialized variables.
105 #ifdef _MSC_VER
106  #pragma warning (disable:4700)
107 #endif
108 
109 #ifndef HAVE_DIAGNOSTIC
110 #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
111 #define HAVE_DIAGNOSTIC 1
112 #endif
113 #endif
114 
115 #ifndef HAVE_VISIBILITY
116 #if __GNUC__ >= 4
117 #define HAVE_VISIBILITY 1
118 #endif
119 #endif
120 
121 /* Since the public interface is given by the Python respectively R interface,
122  * we do not want other symbols than the interface initalization routines to be
123  * visible in the shared object file. The "visibility" switch is a GCC concept.
124  * Hiding symbols keeps the relocation table small and decreases startup time.
125  * See http://gcc.gnu.org/wiki/Visibility
126  */
127 #if HAVE_VISIBILITY
128 #pragma GCC visibility push(hidden)
129 #endif
130 
131 typedef int_fast32_t t_index;
132 #ifndef INT32_MAX
133 #define MAX_INDEX 0x7fffffffL
134 #else
135 #define MAX_INDEX INT32_MAX
136 #endif
137 #if (LONG_MAX < MAX_INDEX)
138 #error The integer format "t_index" must not have a greater range than "long int".
139 #endif
140 #if (INT_MAX > MAX_INDEX)
141 #error The integer format "int" must not have a greater range than "t_index".
142 #endif
143 typedef double t_float;
144 
145 /* Method codes.
146 
147  These codes must agree with the METHODS array in fastcluster.R and the
148  dictionary mthidx in fastcluster.py.
149 */
150 enum method_codes {
151  // non-Euclidean methods
152  METHOD_METR_SINGLE = 0,
153  METHOD_METR_COMPLETE = 1,
154  METHOD_METR_AVERAGE = 2,
155  METHOD_METR_WEIGHTED = 3,
156  METHOD_METR_WARD = 4,
157  METHOD_METR_WARD_D = METHOD_METR_WARD,
158  METHOD_METR_CENTROID = 5,
159  METHOD_METR_MEDIAN = 6,
160  METHOD_METR_WARD_D2 = 7,
161 
162  MIN_METHOD_CODE = 0,
163  MAX_METHOD_CODE = 7
164 };
165 
166 enum method_codes_vector {
167  // Euclidean methods
168  METHOD_VECTOR_SINGLE = 0,
169  METHOD_VECTOR_WARD = 1,
170  METHOD_VECTOR_CENTROID = 2,
171  METHOD_VECTOR_MEDIAN = 3,
172 
173  MIN_METHOD_VECTOR_CODE = 0,
174  MAX_METHOD_VECTOR_CODE = 3
175 };
176 
177 // self-destructing array pointer
178 template <typename type>
179 class auto_array_ptr{
180 private:
181  type * ptr;
182  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
183  auto_array_ptr& operator=(auto_array_ptr const &); // non copyable
184 public:
185  auto_array_ptr()
186  : ptr(NULL)
187  { }
188  template <typename index>
189  auto_array_ptr(index const size)
190  : ptr(new type[size])
191  { }
192  template <typename index, typename value>
193  auto_array_ptr(index const size, value const val)
194  : ptr(new type[size])
195  {
196  FILL_N(ptr, size, val);
197  }
198  ~auto_array_ptr() {
199  delete [] ptr; }
200  void free() {
201  delete [] ptr;
202  ptr = NULL;
203  }
204  template <typename index>
205  void init(index const size) {
206  ptr = new type [size];
207  }
208  template <typename index, typename value>
209  void init(index const size, value const val) {
210  init(size);
211  FILL_N(ptr, size, val);
212  }
213  inline operator type *() const { return ptr; }
214 };
215 
216 struct node {
217  t_index node1, node2;
218  t_float dist;
219 };
220 
221 inline bool operator< (const node a, const node b) {
222  return (a.dist < b.dist);
223 }
224 
225 class cluster_result {
226 private:
227  auto_array_ptr<node> Z;
228  t_index pos;
229 
230 public:
231  cluster_result(const t_index size)
232  : Z(size)
233  , pos(0)
234  {}
235 
236  void append(const t_index node1, const t_index node2, const t_float dist) {
237  Z[pos].node1 = node1;
238  Z[pos].node2 = node2;
239  Z[pos].dist = dist;
240  ++pos;
241  }
242 
243  node * operator[] (const t_index idx) const { return Z + idx; }
244 
245  /* Define several methods to postprocess the distances. All these functions
246  are monotone, so they do not change the sorted order of distances. */
247 
248  void sqrt() const {
249  for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
250  ZZ->dist = std::sqrt(ZZ->dist);
251  }
252  }
253 
254  void sqrt(const t_float) const { // ignore the argument
255  sqrt();
256  }
257 
258  void sqrtdouble(const t_float) const { // ignore the argument
259  for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
260  ZZ->dist = std::sqrt(2*ZZ->dist);
261  }
262  }
263 
264  #ifdef R_pow
265  #define my_pow R_pow
266  #else
267  #define my_pow std::pow
268  #endif
269 
270  void power(const t_float p) const {
271  t_float const q = 1/p;
272  for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
273  ZZ->dist = my_pow(ZZ->dist,q);
274  }
275  }
276 
277  void plusone(const t_float) const { // ignore the argument
278  for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
279  ZZ->dist += 1;
280  }
281  }
282 
283  void divide(const t_float denom) const {
284  for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
285  ZZ->dist /= denom;
286  }
287  }
288 };
289 
290 class doubly_linked_list {
291  /*
292  Class for a doubly linked list. Initially, the list is the integer range
293  [0, size]. We provide a forward iterator and a method to delete an index
294  from the list.
295 
296  Typical use: for (i=L.start; L<size; i=L.succ[I])
297  or
298  for (i=somevalue; L<size; i=L.succ[I])
299  */
300 public:
301  t_index start;
302  auto_array_ptr<t_index> succ;
303 
304 private:
305  auto_array_ptr<t_index> pred;
306  // Not necessarily private, we just do not need it in this instance.
307 
308 public:
309  doubly_linked_list(const t_index size)
310  // Initialize to the given size.
311  : start(0)
312  , succ(size+1)
313  , pred(size+1)
314  {
315  for (t_index i=0; i<size; ++i) {
316  pred[i+1] = i;
317  succ[i] = i+1;
318  }
319  // pred[0] is never accessed!
320  //succ[size] is never accessed!
321  }
322 
323  ~doubly_linked_list() {}
324 
325  void remove(const t_index idx) {
326  // Remove an index from the list.
327  if (idx==start) {
328  start = succ[idx];
329  }
330  else {
331  succ[pred[idx]] = succ[idx];
332  pred[succ[idx]] = pred[idx];
333  }
334  succ[idx] = 0; // Mark as inactive
335  }
336 
337  bool is_inactive(t_index idx) const {
338  return (succ[idx]==0);
339  }
340 };
341 
342 // Indexing functions
343 // D is the upper triangular part of a symmetric (NxN)-matrix
344 // We require r_ < c_ !
345 #define D_(r_,c_) ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
346 // Z is an ((N-1)x4)-array
347 #define Z_(_r, _c) (Z[(_r)*4 + (_c)])
348 
349 /*
350  Lookup function for a union-find data structure.
351 
352  The function finds the root of idx by going iteratively through all
353  parent elements until a root is found. An element i is a root if
354  nodes[i] is zero. To make subsequent searches faster, the entry for
355  idx and all its parents is updated with the root element.
356  */
357 class union_find {
358 private:
359  auto_array_ptr<t_index> parent;
360  t_index nextparent;
361 
362 public:
363  union_find(const t_index size)
364  : parent(size>0 ? 2*size-1 : 0, 0)
365  , nextparent(size)
366  { }
367 
368  t_index Find (t_index idx) const {
369  if (parent[idx] != 0 ) { // a → b
370  t_index p = idx;
371  idx = parent[idx];
372  if (parent[idx] != 0 ) { // a → b → c
373  do {
374  idx = parent[idx];
375  } while (parent[idx] != 0);
376  do {
377  t_index tmp = parent[p];
378  parent[p] = idx;
379  p = tmp;
380  } while (parent[p] != idx);
381  }
382  }
383  return idx;
384  }
385 
386  void Union (const t_index node1, const t_index node2) {
387  parent[node1] = parent[node2] = nextparent++;
388  }
389 };
390 
391 class nan_error{};
392 #ifdef FE_INVALID
393 class fenv_error{};
394 #endif
395 
396 static void MST_linkage_core(const t_index N, const t_float * const D,
397  cluster_result & Z2) {
398 /*
399  N: integer, number of data points
400  D: condensed distance matrix N*(N-1)/2
401  Z2: output data structure
402 
403  The basis of this algorithm is an algorithm by Rohlf:
404 
405  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
406  The Computer Journal, vol. 16, 1973, p. 93–95.
407 */
408  t_index i;
409  t_index idx2;
410  doubly_linked_list active_nodes(N);
411  auto_array_ptr<t_float> d(N);
412 
413  t_index prev_node;
414  t_float min;
415 
416  // first iteration
417  idx2 = 1;
418  min = std::numeric_limits<t_float>::infinity();
419  for (i=1; i<N; ++i) {
420  d[i] = D[i-1];
421 #if HAVE_DIAGNOSTIC
422 #pragma GCC diagnostic push
423 #pragma GCC diagnostic ignored "-Wfloat-equal"
424 #endif
425  if (d[i] < min) {
426  min = d[i];
427  idx2 = i;
428  }
429  else if (fc_isnan(d[i]))
430  assert(false && "fastcluster: nan error");
431 #if HAVE_DIAGNOSTIC
432 #pragma GCC diagnostic pop
433 #endif
434  }
435  Z2.append(0, idx2, min);
436 
437  for (t_index j=1; j<N-1; ++j) {
438  prev_node = idx2;
439  active_nodes.remove(prev_node);
440 
441  idx2 = active_nodes.succ[0];
442  min = d[idx2];
443  for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
444  t_float tmp = D_(i, prev_node);
445 #if HAVE_DIAGNOSTIC
446 #pragma GCC diagnostic push
447 #pragma GCC diagnostic ignored "-Wfloat-equal"
448 #endif
449  if (tmp < d[i])
450  d[i] = tmp;
451  else if (fc_isnan(tmp))
452  assert(false && "fastcluster: nan error");
453 #if HAVE_DIAGNOSTIC
454 #pragma GCC diagnostic pop
455 #endif
456  if (d[i] < min) {
457  min = d[i];
458  idx2 = i;
459  }
460  }
461  for (; i<N; i=active_nodes.succ[i]) {
462  t_float tmp = D_(prev_node, i);
463 #if HAVE_DIAGNOSTIC
464 #pragma GCC diagnostic push
465 #pragma GCC diagnostic ignored "-Wfloat-equal"
466 #endif
467  if (d[i] > tmp)
468  d[i] = tmp;
469  else if (fc_isnan(tmp))
470  assert(false && "fastcluster: nan error");
471 #if HAVE_DIAGNOSTIC
472 #pragma GCC diagnostic pop
473 #endif
474  if (d[i] < min) {
475  min = d[i];
476  idx2 = i;
477  }
478  }
479  Z2.append(prev_node, idx2, min);
480  }
481 }
482 
483 /* Functions for the update of the dissimilarity array */
484 
485 inline static void f_single( t_float * const b, const t_float a ) {
486  if (*b > a) *b = a;
487 }
488 inline static void f_complete( t_float * const b, const t_float a ) {
489  if (*b < a) *b = a;
490 }
491 inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
492  *b = s*a + t*(*b);
493  #ifndef FE_INVALID
494 #if HAVE_DIAGNOSTIC
495 #pragma GCC diagnostic push
496 #pragma GCC diagnostic ignored "-Wfloat-equal"
497 #endif
498  if (fc_isnan(*b)) {
499  assert(false && "fastcluster: nan error");
500  }
501 #if HAVE_DIAGNOSTIC
502 #pragma GCC diagnostic pop
503 #endif
504  #endif
505 }
506 inline static void f_weighted( t_float * const b, const t_float a) {
507  *b = (a+*b)*.5;
508  #ifndef FE_INVALID
509 #if HAVE_DIAGNOSTIC
510 #pragma GCC diagnostic push
511 #pragma GCC diagnostic ignored "-Wfloat-equal"
512 #endif
513  if (fc_isnan(*b)) {
514  assert(false && "fastcluster: nan error");
515  }
516 #if HAVE_DIAGNOSTIC
517 #pragma GCC diagnostic pop
518 #endif
519  #endif
520 }
521 inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
522  *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v);
523  //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
524  #ifndef FE_INVALID
525 #if HAVE_DIAGNOSTIC
526 #pragma GCC diagnostic push
527 #pragma GCC diagnostic ignored "-Wfloat-equal"
528 #endif
529  if (fc_isnan(*b)) {
530  assert(false && "fastcluster: nan error");
531  }
532 #if HAVE_DIAGNOSTIC
533 #pragma GCC diagnostic pop
534 #endif
535  #endif
536 }
537 inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
538  *b = s*a - stc + t*(*b);
539  #ifndef FE_INVALID
540  if (fc_isnan(*b)) {
541  assert(false && "fastcluster: nan error");
542  }
543 #if HAVE_DIAGNOSTIC
544 #pragma GCC diagnostic pop
545 #endif
546  #endif
547 }
548 inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
549  *b = (a+(*b))*.5 - c_4;
550  #ifndef FE_INVALID
551 #if HAVE_DIAGNOSTIC
552 #pragma GCC diagnostic push
553 #pragma GCC diagnostic ignored "-Wfloat-equal"
554 #endif
555  if (fc_isnan(*b)) {
556  assert(false && "fastcluster: nan error");
557  }
558 #if HAVE_DIAGNOSTIC
559 #pragma GCC diagnostic pop
560 #endif
561  #endif
562 }
563 
564 template <method_codes method, typename t_members>
565 static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
566 /*
567  N: integer
568  D: condensed distance matrix N*(N-1)/2
569  Z2: output data structure
570 
571  This is the NN-chain algorithm, described on page 86 in the following book:
572 
573  Fionn Murtagh, Multidimensional Clustering Algorithms,
574  Vienna, Würzburg: Physica-Verlag, 1985.
575 */
576  t_index i;
577 
578  auto_array_ptr<t_index> NN_chain(N);
579  t_index NN_chain_tip = 0;
580 
581  t_index idx1, idx2;
582 
583  t_float size1, size2;
584  doubly_linked_list active_nodes(N);
585 
586  t_float min;
587 
588  for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
589  ++DD) {
590 #if HAVE_DIAGNOSTIC
591 #pragma GCC diagnostic push
592 #pragma GCC diagnostic ignored "-Wfloat-equal"
593 #endif
594  if (fc_isnan(*DD)) {
595  assert(false && "fastcluster: nan error");
596  }
597 #if HAVE_DIAGNOSTIC
598 #pragma GCC diagnostic pop
599 #endif
600  }
601 
602  #ifdef FE_INVALID
603  if (feclearexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
604  #endif
605 
606  for (t_index j=0; j<N-1; ++j) {
607  if (NN_chain_tip <= 3) {
608  NN_chain[0] = idx1 = active_nodes.start;
609  NN_chain_tip = 1;
610 
611  idx2 = active_nodes.succ[idx1];
612  min = D_(idx1,idx2);
613  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
614  if (D_(idx1,i) < min) {
615  min = D_(idx1,i);
616  idx2 = i;
617  }
618  }
619  } // a: idx1 b: idx2
620  else {
621  NN_chain_tip -= 3;
622  idx1 = NN_chain[NN_chain_tip-1];
623  idx2 = NN_chain[NN_chain_tip];
624  min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
625  } // a: idx1 b: idx2
626 
627  do {
628  NN_chain[NN_chain_tip] = idx2;
629 
630  for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
631  if (D_(i,idx2) < min) {
632  min = D_(i,idx2);
633  idx1 = i;
634  }
635  }
636  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
637  if (D_(idx2,i) < min) {
638  min = D_(idx2,i);
639  idx1 = i;
640  }
641  }
642 
643  idx2 = idx1;
644  idx1 = NN_chain[NN_chain_tip++];
645 
646  } while (idx2 != NN_chain[NN_chain_tip-2]);
647 
648  Z2.append(idx1, idx2, min);
649 
650  if (idx1>idx2) {
651  t_index tmp = idx1;
652  idx1 = idx2;
653  idx2 = tmp;
654  }
655 
656  if (method==METHOD_METR_AVERAGE ||
657  method==METHOD_METR_WARD) {
658  size1 = static_cast<t_float>(members[idx1]);
659  size2 = static_cast<t_float>(members[idx2]);
660  members[idx2] += members[idx1];
661  }
662 
663  // Remove the smaller index from the valid indices (active_nodes).
664  active_nodes.remove(idx1);
665 
666  switch (method) {
667  case METHOD_METR_SINGLE:
668  /*
669  Single linkage.
670 
671  Characteristic: new distances are never longer than the old distances.
672  */
673  // Update the distance matrix in the range [start, idx1).
674  for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
675  f_single(&D_(i, idx2), D_(i, idx1) );
676  // Update the distance matrix in the range (idx1, idx2).
677  for (; i<idx2; i=active_nodes.succ[i])
678  f_single(&D_(i, idx2), D_(idx1, i) );
679  // Update the distance matrix in the range (idx2, N).
680  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
681  f_single(&D_(idx2, i), D_(idx1, i) );
682  break;
683 
684  case METHOD_METR_COMPLETE:
685  /*
686  Complete linkage.
687 
688  Characteristic: new distances are never shorter than the old distances.
689  */
690  // Update the distance matrix in the range [start, idx1).
691  for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
692  f_complete(&D_(i, idx2), D_(i, idx1) );
693  // Update the distance matrix in the range (idx1, idx2).
694  for (; i<idx2; i=active_nodes.succ[i])
695  f_complete(&D_(i, idx2), D_(idx1, i) );
696  // Update the distance matrix in the range (idx2, N).
697  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
698  f_complete(&D_(idx2, i), D_(idx1, i) );
699  break;
700 
701  case METHOD_METR_AVERAGE: {
702  /*
703  Average linkage.
704 
705  Shorter and longer distances can occur.
706  */
707  // Update the distance matrix in the range [start, idx1).
708  t_float s = size1/(size1+size2);
709  t_float t = size2/(size1+size2);
710  for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
711  f_average(&D_(i, idx2), D_(i, idx1), s, t );
712  // Update the distance matrix in the range (idx1, idx2).
713  for (; i<idx2; i=active_nodes.succ[i])
714  f_average(&D_(i, idx2), D_(idx1, i), s, t );
715  // Update the distance matrix in the range (idx2, N).
716  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
717  f_average(&D_(idx2, i), D_(idx1, i), s, t );
718  break;
719  }
720 
721  case METHOD_METR_WEIGHTED:
722  /*
723  Weighted linkage.
724 
725  Shorter and longer distances can occur.
726  */
727  // Update the distance matrix in the range [start, idx1).
728  for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
729  f_weighted(&D_(i, idx2), D_(i, idx1) );
730  // Update the distance matrix in the range (idx1, idx2).
731  for (; i<idx2; i=active_nodes.succ[i])
732  f_weighted(&D_(i, idx2), D_(idx1, i) );
733  // Update the distance matrix in the range (idx2, N).
734  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
735  f_weighted(&D_(idx2, i), D_(idx1, i) );
736  break;
737 
738  case METHOD_METR_WARD:
739  /*
740  Ward linkage.
741 
742  Shorter and longer distances can occur, not smaller than min(d1,d2)
743  but maybe bigger than max(d1,d2).
744  */
745  // Update the distance matrix in the range [start, idx1).
746  //t_float v = static_cast<t_float>(members[i]);
747  for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
748  f_ward(&D_(i, idx2), D_(i, idx1), min,
749  size1, size2, static_cast<t_float>(members[i]) );
750  // Update the distance matrix in the range (idx1, idx2).
751  for (; i<idx2; i=active_nodes.succ[i])
752  f_ward(&D_(i, idx2), D_(idx1, i), min,
753  size1, size2, static_cast<t_float>(members[i]) );
754  // Update the distance matrix in the range (idx2, N).
755  for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
756  f_ward(&D_(idx2, i), D_(idx1, i), min,
757  size1, size2, static_cast<t_float>(members[i]) );
758  break;
759 
760  default:
761  assert(false && "fastcluster: invalid method");
762  }
763  }
764  #ifdef FE_INVALID
765  if (fetestexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
766  #endif
767 }
768 
769 class binary_min_heap {
770  /*
771  Class for a binary min-heap. The data resides in an array A. The elements of
772  A are not changed but two lists I and R of indices are generated which point
773  to elements of A and backwards.
774 
775  The heap tree structure is
776 
777  H[2*i+1] H[2*i+2]
778  \ /
779  \ /
780  ≤ ≤
781  \ /
782  \ /
783  H[i]
784 
785  where the children must be less or equal than their parent. Thus, H[0]
786  contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
787  and R[I[i]] = i.
788 
789  This implementation is not designed to handle NaN values.
790  */
791 private:
792  t_float * const A;
793  t_index size;
794  auto_array_ptr<t_index> I;
795  auto_array_ptr<t_index> R;
796 
797  // no default constructor
798  binary_min_heap();
799  // noncopyable
800  binary_min_heap(binary_min_heap const &);
801  binary_min_heap & operator=(binary_min_heap const &);
802 
803 public:
804  binary_min_heap(t_float * const A_, const t_index size_)
805  : A(A_), size(size_), I(size), R(size)
806  { // Allocate memory and initialize the lists I and R to the identity. This
807  // does not make it a heap. Call heapify afterwards!
808  for (t_index i=0; i<size; ++i)
809  R[i] = I[i] = i;
810  }
811 
812  binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
813  const t_index start)
814  : A(A_), size(size1), I(size1), R(size2)
815  { // Allocate memory and initialize the lists I and R to the identity. This
816  // does not make it a heap. Call heapify afterwards!
817  for (t_index i=0; i<size; ++i) {
818  R[i+start] = i;
819  I[i] = i + start;
820  }
821  }
822 
823  ~binary_min_heap() {}
824 
825  void heapify() {
826  // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
827  // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
828  //
829  // Complexity: Θ(size)
830  // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
831  // 3rd ed., 2009, Section 6.3 “Building a heap”
832  t_index idx;
833  for (idx=(size>>1); idx>0; ) {
834  --idx;
835  update_geq_(idx);
836  }
837  }
838 
839  inline t_index argmin() const {
840  // Return the minimal element.
841  return I[0];
842  }
843 
844  void heap_pop() {
845  // Remove the minimal element from the heap.
846  --size;
847  I[0] = I[size];
848  R[I[0]] = 0;
849  update_geq_(0);
850  }
851 
852  void remove(t_index idx) {
853  // Remove an element from the heap.
854  --size;
855  R[I[size]] = R[idx];
856  I[R[idx]] = I[size];
857  if ( H(size)<=A[idx] ) {
858  update_leq_(R[idx]);
859  }
860  else {
861  update_geq_(R[idx]);
862  }
863  }
864 
865  void replace ( const t_index idxold, const t_index idxnew,
866  const t_float val) {
867  R[idxnew] = R[idxold];
868  I[R[idxnew]] = idxnew;
869  if (val<=A[idxold])
870  update_leq(idxnew, val);
871  else
872  update_geq(idxnew, val);
873  }
874 
875  void update ( const t_index idx, const t_float val ) const {
876  // Update the element A[i] with val and re-arrange the indices to preserve
877  // the heap condition.
878  if (val<=A[idx])
879  update_leq(idx, val);
880  else
881  update_geq(idx, val);
882  }
883 
884  void update_leq ( const t_index idx, const t_float val ) const {
885  // Use this when the new value is not more than the old value.
886  A[idx] = val;
887  update_leq_(R[idx]);
888  }
889 
890  void update_geq ( const t_index idx, const t_float val ) const {
891  // Use this when the new value is not less than the old value.
892  A[idx] = val;
893  update_geq_(R[idx]);
894  }
895 
896 private:
897  void update_leq_ (t_index i) const {
898  t_index j;
899  for ( ; (i>0) && ( H(i)<H(j=(i-1)>>1) ); i=j)
900  heap_swap(i,j);
901  }
902 
903  void update_geq_ (t_index i) const {
904  t_index j;
905  for ( ; (j=2*i+1)<size; i=j) {
906  if ( H(j)>=H(i) ) {
907  ++j;
908  if ( j>=size || H(j)>=H(i) ) break;
909  }
910  else if ( j+1<size && H(j+1)<H(j) ) ++j;
911  heap_swap(i, j);
912  }
913  }
914 
915  void heap_swap(const t_index i, const t_index j) const {
916  // Swap two indices.
917  t_index tmp = I[i];
918  I[i] = I[j];
919  I[j] = tmp;
920  R[I[i]] = i;
921  R[I[j]] = j;
922  }
923 
924  inline t_float H(const t_index i) const {
925  return A[I[i]];
926  }
927 
928 };
929 
930 template <method_codes method, typename t_members>
931 static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
932  /*
933  N: integer, number of data points
934  D: condensed distance matrix N*(N-1)/2
935  Z2: output data structure
936  */
937 
938  const t_index N_1 = N-1;
939  t_index i, j; // loop variables
940  t_index idx1, idx2; // row and column indices
941 
942  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
943  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
944  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
945  // i-th row represents
946  doubly_linked_list active_nodes(N);
947  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
948  // the distance to the nearest neighbor of each point
949  t_index node1, node2; // node numbers in the output
950  t_float size1, size2; // and their cardinalities
951 
952  t_float min; // minimum and row index for nearest-neighbor search
953  t_index idx;
954 
955  for (i=0; i<N; ++i)
956  // Build a list of row ↔ node label assignments.
957  // Initially i ↦ i
958  row_repr[i] = i;
959 
960  // Initialize the minimal distances:
961  // Find the nearest neighbor of each point.
962  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
963  t_float const * DD = D;
964  for (i=0; i<N_1; ++i) {
965  min = std::numeric_limits<t_float>::infinity();
966  for (idx=j=i+1; j<N; ++j, ++DD) {
967 #if HAVE_DIAGNOSTIC
968 #pragma GCC diagnostic push
969 #pragma GCC diagnostic ignored "-Wfloat-equal"
970 #endif
971  if (*DD<min) {
972  min = *DD;
973  idx = j;
974  }
975  else if (fc_isnan(*DD))
976  assert(false && "fastcluster: nan error");
977  }
978 #if HAVE_DIAGNOSTIC
979 #pragma GCC diagnostic pop
980 #endif
981  mindist[i] = min;
982  n_nghbr[i] = idx;
983  }
984 
985  // Put the minimal distances into a heap structure to make the repeated
986  // global minimum searches fast.
987  nn_distances.heapify();
988 
989  #ifdef FE_INVALID
990  if (feclearexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
991  #endif
992 
993  // Main loop: We have N-1 merging steps.
994  for (i=0; i<N_1; ++i) {
995  /*
996  Here is a special feature that allows fast bookkeeping and updates of the
997  minimal distances.
998 
999  mindist[i] stores a lower bound on the minimum distance of the point i to
1000  all points of higher index:
1001 
1002  mindist[i] ≥ min_{j>i} D(i,j)
1003 
1004  Normally, we have equality. However, this minimum may become invalid due
1005  to the updates in the distance matrix. The rules are:
1006 
1007  1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
1008  minimum and n_nghbr[i] is a nearest neighbor.
1009 
1010  2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
1011  correct minimum. The minimum needs to be recomputed.
1012 
1013  3) mindist[i] is never bigger than the true minimum. Hence, we never
1014  miss the true minimum if we take the smallest mindist entry,
1015  re-compute the value if necessary (thus maybe increasing it) and
1016  looking for the now smallest mindist entry until a valid minimal
1017  entry is found. This step is done in the lines below.
1018 
1019  The update process for D below takes care that these rules are
1020  fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
1021  re-calculated when necessary but re-calculation is avoided whenever
1022  possible.
1023 
1024  The re-calculation of the minima makes the worst-case runtime of this
1025  algorithm cubic in N. We avoid this whenever possible, and in most cases
1026  the runtime appears to be quadratic.
1027  */
1028  idx1 = nn_distances.argmin();
1029  if (method != METHOD_METR_SINGLE) {
1030  while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) {
1031  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1032  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1033  min = D_(idx1,j);
1034  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1035  if (D_(idx1,j)<min) {
1036  min = D_(idx1,j);
1037  n_nghbr[idx1] = j;
1038  }
1039  }
1040  /* Update the heap with the new true minimum and search for the
1041  (possibly different) minimal entry. */
1042  nn_distances.update_geq(idx1, min);
1043  idx1 = nn_distances.argmin();
1044  }
1045  }
1046 
1047  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1048  idx2 = n_nghbr[idx1];
1049 
1050  // Write the newly found minimal pair of nodes to the output array.
1051  node1 = row_repr[idx1];
1052  node2 = row_repr[idx2];
1053 
1054  if (method==METHOD_METR_AVERAGE ||
1055  method==METHOD_METR_WARD ||
1056  method==METHOD_METR_CENTROID) {
1057  size1 = static_cast<t_float>(members[idx1]);
1058  size2 = static_cast<t_float>(members[idx2]);
1059  members[idx2] += members[idx1];
1060  }
1061  Z2.append(node1, node2, mindist[idx1]);
1062 
1063  // Remove idx1 from the list of active indices (active_nodes).
1064  active_nodes.remove(idx1);
1065  // Index idx2 now represents the new (merged) node with label N+i.
1066  row_repr[idx2] = N+i;
1067 
1068  // Update the distance matrix
1069  switch (method) {
1070  case METHOD_METR_SINGLE:
1071  /*
1072  Single linkage.
1073 
1074  Characteristic: new distances are never longer than the old distances.
1075  */
1076  // Update the distance matrix in the range [start, idx1).
1077  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1078  f_single(&D_(j, idx2), D_(j, idx1));
1079  if (n_nghbr[j] == idx1)
1080  n_nghbr[j] = idx2;
1081  }
1082  // Update the distance matrix in the range (idx1, idx2).
1083  for (; j<idx2; j=active_nodes.succ[j]) {
1084  f_single(&D_(j, idx2), D_(idx1, j));
1085  // If the new value is below the old minimum in a row, update
1086  // the mindist and n_nghbr arrays.
1087  if (D_(j, idx2) < mindist[j]) {
1088  nn_distances.update_leq(j, D_(j, idx2));
1089  n_nghbr[j] = idx2;
1090  }
1091  }
1092  // Update the distance matrix in the range (idx2, N).
1093  // Recompute the minimum mindist[idx2] and n_nghbr[idx2].
1094  if (idx2<N_1) {
1095  min = mindist[idx2];
1096  for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j]) {
1097  f_single(&D_(idx2, j), D_(idx1, j) );
1098  if (D_(idx2, j) < min) {
1099  n_nghbr[idx2] = j;
1100  min = D_(idx2, j);
1101  }
1102  }
1103  nn_distances.update_leq(idx2, min);
1104  }
1105  break;
1106 
1107  case METHOD_METR_COMPLETE:
1108  /*
1109  Complete linkage.
1110 
1111  Characteristic: new distances are never shorter than the old distances.
1112  */
1113  // Update the distance matrix in the range [start, idx1).
1114  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1115  f_complete(&D_(j, idx2), D_(j, idx1) );
1116  if (n_nghbr[j] == idx1)
1117  n_nghbr[j] = idx2;
1118  }
1119  // Update the distance matrix in the range (idx1, idx2).
1120  for (; j<idx2; j=active_nodes.succ[j])
1121  f_complete(&D_(j, idx2), D_(idx1, j) );
1122  // Update the distance matrix in the range (idx2, N).
1123  for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j])
1124  f_complete(&D_(idx2, j), D_(idx1, j) );
1125  break;
1126 
1127  case METHOD_METR_AVERAGE: {
1128  /*
1129  Average linkage.
1130 
1131  Shorter and longer distances can occur.
1132  */
1133  // Update the distance matrix in the range [start, idx1).
1134  t_float s = size1/(size1+size2);
1135  t_float t = size2/(size1+size2);
1136  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1137  f_average(&D_(j, idx2), D_(j, idx1), s, t);
1138  if (n_nghbr[j] == idx1)
1139  n_nghbr[j] = idx2;
1140  }
1141  // Update the distance matrix in the range (idx1, idx2).
1142  for (; j<idx2; j=active_nodes.succ[j]) {
1143  f_average(&D_(j, idx2), D_(idx1, j), s, t);
1144  if (D_(j, idx2) < mindist[j]) {
1145  nn_distances.update_leq(j, D_(j, idx2));
1146  n_nghbr[j] = idx2;
1147  }
1148  }
1149  // Update the distance matrix in the range (idx2, N).
1150  if (idx2<N_1) {
1151  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1152  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1153  min = D_(idx2,j);
1154  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1155  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1156  if (D_(idx2,j) < min) {
1157  min = D_(idx2,j);
1158  n_nghbr[idx2] = j;
1159  }
1160  }
1161  nn_distances.update(idx2, min);
1162  }
1163  break;
1164  }
1165 
1166  case METHOD_METR_WEIGHTED:
1167  /*
1168  Weighted linkage.
1169 
1170  Shorter and longer distances can occur.
1171  */
1172  // Update the distance matrix in the range [start, idx1).
1173  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1174  f_weighted(&D_(j, idx2), D_(j, idx1) );
1175  if (n_nghbr[j] == idx1)
1176  n_nghbr[j] = idx2;
1177  }
1178  // Update the distance matrix in the range (idx1, idx2).
1179  for (; j<idx2; j=active_nodes.succ[j]) {
1180  f_weighted(&D_(j, idx2), D_(idx1, j) );
1181  if (D_(j, idx2) < mindist[j]) {
1182  nn_distances.update_leq(j, D_(j, idx2));
1183  n_nghbr[j] = idx2;
1184  }
1185  }
1186  // Update the distance matrix in the range (idx2, N).
1187  if (idx2<N_1) {
1188  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1189  f_weighted(&D_(idx2, j), D_(idx1, j) );
1190  min = D_(idx2,j);
1191  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1192  f_weighted(&D_(idx2, j), D_(idx1, j) );
1193  if (D_(idx2,j) < min) {
1194  min = D_(idx2,j);
1195  n_nghbr[idx2] = j;
1196  }
1197  }
1198  nn_distances.update(idx2, min);
1199  }
1200  break;
1201 
1202  case METHOD_METR_WARD:
1203  /*
1204  Ward linkage.
1205 
1206  Shorter and longer distances can occur, not smaller than min(d1,d2)
1207  but maybe bigger than max(d1,d2).
1208  */
1209  // Update the distance matrix in the range [start, idx1).
1210  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1211  f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
1212  size1, size2, static_cast<t_float>(members[j]) );
1213  if (n_nghbr[j] == idx1)
1214  n_nghbr[j] = idx2;
1215  }
1216  // Update the distance matrix in the range (idx1, idx2).
1217  for (; j<idx2; j=active_nodes.succ[j]) {
1218  f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
1219  static_cast<t_float>(members[j]) );
1220  if (D_(j, idx2) < mindist[j]) {
1221  nn_distances.update_leq(j, D_(j, idx2));
1222  n_nghbr[j] = idx2;
1223  }
1224  }
1225  // Update the distance matrix in the range (idx2, N).
1226  if (idx2<N_1) {
1227  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1228  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1229  size1, size2, static_cast<t_float>(members[j]) );
1230  min = D_(idx2,j);
1231  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1232  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1233  size1, size2, static_cast<t_float>(members[j]) );
1234  if (D_(idx2,j) < min) {
1235  min = D_(idx2,j);
1236  n_nghbr[idx2] = j;
1237  }
1238  }
1239  nn_distances.update(idx2, min);
1240  }
1241  break;
1242 
1243  case METHOD_METR_CENTROID: {
1244  /*
1245  Centroid linkage.
1246 
1247  Shorter and longer distances can occur, not bigger than max(d1,d2)
1248  but maybe smaller than min(d1,d2).
1249  */
1250  // Update the distance matrix in the range [start, idx1).
1251  t_float s = size1/(size1+size2);
1252  t_float t = size2/(size1+size2);
1253  t_float stc = s*t*mindist[idx1];
1254  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1255  f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
1256  if (D_(j, idx2) < mindist[j]) {
1257  nn_distances.update_leq(j, D_(j, idx2));
1258  n_nghbr[j] = idx2;
1259  }
1260  else if (n_nghbr[j] == idx1)
1261  n_nghbr[j] = idx2;
1262  }
1263  // Update the distance matrix in the range (idx1, idx2).
1264  for (; j<idx2; j=active_nodes.succ[j]) {
1265  f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
1266  if (D_(j, idx2) < mindist[j]) {
1267  nn_distances.update_leq(j, D_(j, idx2));
1268  n_nghbr[j] = idx2;
1269  }
1270  }
1271  // Update the distance matrix in the range (idx2, N).
1272  if (idx2<N_1) {
1273  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1274  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1275  min = D_(idx2,j);
1276  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1277  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1278  if (D_(idx2,j) < min) {
1279  min = D_(idx2,j);
1280  n_nghbr[idx2] = j;
1281  }
1282  }
1283  nn_distances.update(idx2, min);
1284  }
1285  break;
1286  }
1287 
1288  case METHOD_METR_MEDIAN: {
1289  /*
1290  Median linkage.
1291 
1292  Shorter and longer distances can occur, not bigger than max(d1,d2)
1293  but maybe smaller than min(d1,d2).
1294  */
1295  // Update the distance matrix in the range [start, idx1).
1296  t_float c_4 = mindist[idx1]*.25;
1297  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1298  f_median(&D_(j, idx2), D_(j, idx1), c_4 );
1299  if (D_(j, idx2) < mindist[j]) {
1300  nn_distances.update_leq(j, D_(j, idx2));
1301  n_nghbr[j] = idx2;
1302  }
1303  else if (n_nghbr[j] == idx1)
1304  n_nghbr[j] = idx2;
1305  }
1306  // Update the distance matrix in the range (idx1, idx2).
1307  for (; j<idx2; j=active_nodes.succ[j]) {
1308  f_median(&D_(j, idx2), D_(idx1, j), c_4 );
1309  if (D_(j, idx2) < mindist[j]) {
1310  nn_distances.update_leq(j, D_(j, idx2));
1311  n_nghbr[j] = idx2;
1312  }
1313  }
1314  // Update the distance matrix in the range (idx2, N).
1315  if (idx2<N_1) {
1316  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1317  f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1318  min = D_(idx2,j);
1319  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1320  f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1321  if (D_(idx2,j) < min) {
1322  min = D_(idx2,j);
1323  n_nghbr[idx2] = j;
1324  }
1325  }
1326  nn_distances.update(idx2, min);
1327  }
1328  break;
1329  }
1330 
1331  default:
1332  assert(false && "fastcluster: invalid method");
1333  }
1334  }
1335  #ifdef FE_INVALID
1336  if (fetestexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
1337  #endif
1338 }
1339 
1340 /*
1341  Clustering methods for vector data
1342 */
1343 
1344 template <typename t_dissimilarity>
1345 static void MST_linkage_core_vector(const t_index N,
1346  t_dissimilarity & dist,
1347  cluster_result & Z2) {
1348 /*
1349  N: integer, number of data points
1350  dist: function pointer to the metric
1351  Z2: output data structure
1352 
1353  The basis of this algorithm is an algorithm by Rohlf:
1354 
1355  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
1356  The Computer Journal, vol. 16, 1973, p. 93–95.
1357 */
1358  t_index i;
1359  t_index idx2;
1360  doubly_linked_list active_nodes(N);
1361  auto_array_ptr<t_float> d(N);
1362 
1363  t_index prev_node;
1364  t_float min;
1365 
1366  // first iteration
1367  idx2 = 1;
1368  min = std::numeric_limits<t_float>::infinity();
1369  for (i=1; i<N; ++i) {
1370  d[i] = dist(0,i);
1371 #if HAVE_DIAGNOSTIC
1372 #pragma GCC diagnostic push
1373 #pragma GCC diagnostic ignored "-Wfloat-equal"
1374 #endif
1375  if (d[i] < min) {
1376  min = d[i];
1377  idx2 = i;
1378  }
1379  else if (fc_isnan(d[i]))
1380  assert(false && "fastcluster: nan error");
1381 #if HAVE_DIAGNOSTIC
1382 #pragma GCC diagnostic pop
1383 #endif
1384  }
1385 
1386  Z2.append(0, idx2, min);
1387 
1388  for (t_index j=1; j<N-1; ++j) {
1389  prev_node = idx2;
1390  active_nodes.remove(prev_node);
1391 
1392  idx2 = active_nodes.succ[0];
1393  min = d[idx2];
1394 
1395  for (i=idx2; i<N; i=active_nodes.succ[i]) {
1396  t_float tmp = dist(i, prev_node);
1397 #if HAVE_DIAGNOSTIC
1398 #pragma GCC diagnostic push
1399 #pragma GCC diagnostic ignored "-Wfloat-equal"
1400 #endif
1401  if (d[i] > tmp)
1402  d[i] = tmp;
1403  else if (fc_isnan(tmp))
1404  assert(false && "fastcluster: nan error");
1405 #if HAVE_DIAGNOSTIC
1406 #pragma GCC diagnostic pop
1407 #endif
1408  if (d[i] < min) {
1409  min = d[i];
1410  idx2 = i;
1411  }
1412  }
1413  Z2.append(prev_node, idx2, min);
1414  }
1415 }
1416 
1417 template <method_codes_vector method, typename t_dissimilarity>
1418 static void generic_linkage_vector(const t_index N,
1419  t_dissimilarity & dist,
1420  cluster_result & Z2) {
1421  /*
1422  N: integer, number of data points
1423  dist: function pointer to the metric
1424  Z2: output data structure
1425 
1426  This algorithm is valid for the distance update methods
1427  "Ward", "centroid" and "median" only!
1428  */
1429  const t_index N_1 = N-1;
1430  t_index i, j; // loop variables
1431  t_index idx1, idx2; // row and column indices
1432 
1433  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
1434  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
1435  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
1436  // i-th row represents
1437  doubly_linked_list active_nodes(N);
1438  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
1439  // the distance to the nearest neighbor of each point
1440  t_index node1, node2; // node numbers in the output
1441  t_float min; // minimum and row index for nearest-neighbor search
1442 
1443  for (i=0; i<N; ++i)
1444  // Build a list of row ↔ node label assignments.
1445  // Initially i ↦ i
1446  row_repr[i] = i;
1447 
1448  // Initialize the minimal distances:
1449  // Find the nearest neighbor of each point.
1450  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1451  for (i=0; i<N_1; ++i) {
1452  min = std::numeric_limits<t_float>::infinity();
1453  t_index idx;
1454  for (idx=j=i+1; j<N; ++j) {
1455  t_float tmp;
1456  switch (method) {
1457  case METHOD_VECTOR_WARD:
1458  tmp = dist.ward_initial(i,j);
1459  break;
1460  default:
1461  tmp = dist.template sqeuclidean<true>(i,j);
1462  }
1463  if (tmp<min) {
1464  min = tmp;
1465  idx = j;
1466  }
1467  }
1468  switch (method) {
1469  case METHOD_VECTOR_WARD:
1470  mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1471  break;
1472  default:
1473  mindist[i] = min;
1474  }
1475  n_nghbr[i] = idx;
1476  }
1477 
1478  // Put the minimal distances into a heap structure to make the repeated
1479  // global minimum searches fast.
1480  nn_distances.heapify();
1481 
1482  // Main loop: We have N-1 merging steps.
1483  for (i=0; i<N_1; ++i) {
1484  idx1 = nn_distances.argmin();
1485 
1486  while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1487  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1488  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1489  switch (method) {
1490  case METHOD_VECTOR_WARD:
1491  min = dist.ward(idx1,j);
1492  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1493  t_float const tmp = dist.ward(idx1,j);
1494  if (tmp<min) {
1495  min = tmp;
1496  n_nghbr[idx1] = j;
1497  }
1498  }
1499  break;
1500  default:
1501  min = dist.template sqeuclidean<true>(idx1,j);
1502  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1503  t_float const tmp = dist.template sqeuclidean<true>(idx1,j);
1504  if (tmp<min) {
1505  min = tmp;
1506  n_nghbr[idx1] = j;
1507  }
1508  }
1509  }
1510  /* Update the heap with the new true minimum and search for the (possibly
1511  different) minimal entry. */
1512  nn_distances.update_geq(idx1, min);
1513  idx1 = nn_distances.argmin();
1514  }
1515 
1516  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1517  idx2 = n_nghbr[idx1];
1518 
1519  // Write the newly found minimal pair of nodes to the output array.
1520  node1 = row_repr[idx1];
1521  node2 = row_repr[idx2];
1522 
1523  Z2.append(node1, node2, mindist[idx1]);
1524 
1525  switch (method) {
1526  case METHOD_VECTOR_WARD:
1527  case METHOD_VECTOR_CENTROID:
1528  dist.merge_inplace(idx1, idx2);
1529  break;
1530  case METHOD_VECTOR_MEDIAN:
1531  dist.merge_inplace_weighted(idx1, idx2);
1532  break;
1533  default:
1534  assert(false && "fastcluster: invalid method");
1535  }
1536 
1537  // Index idx2 now represents the new (merged) node with label N+i.
1538  row_repr[idx2] = N+i;
1539  // Remove idx1 from the list of active indices (active_nodes).
1540  active_nodes.remove(idx1); // TBD later!!!
1541 
1542  // Update the distance matrix
1543  switch (method) {
1544  case METHOD_VECTOR_WARD:
1545  /*
1546  Ward linkage.
1547 
1548  Shorter and longer distances can occur, not smaller than min(d1,d2)
1549  but maybe bigger than max(d1,d2).
1550  */
1551  // Update the distance matrix in the range [start, idx1).
1552  for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1553  if (n_nghbr[j] == idx2) {
1554  n_nghbr[j] = idx1; // invalidate
1555  }
1556  }
1557  // Update the distance matrix in the range (idx1, idx2).
1558  for ( ; j<idx2; j=active_nodes.succ[j]) {
1559  t_float const tmp = dist.ward(j, idx2);
1560  if (tmp < mindist[j]) {
1561  nn_distances.update_leq(j, tmp);
1562  n_nghbr[j] = idx2;
1563  }
1564  else if (n_nghbr[j]==idx2) {
1565  n_nghbr[j] = idx1; // invalidate
1566  }
1567  }
1568  // Find the nearest neighbor for idx2.
1569  if (idx2<N_1) {
1570  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1571  min = dist.ward(idx2,j);
1572  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1573  t_float const tmp = dist.ward(idx2,j);
1574  if (tmp < min) {
1575  min = tmp;
1576  n_nghbr[idx2] = j;
1577  }
1578  }
1579  nn_distances.update(idx2, min);
1580  }
1581  break;
1582 
1583  default:
1584  /*
1585  Centroid and median linkage.
1586 
1587  Shorter and longer distances can occur, not bigger than max(d1,d2)
1588  but maybe smaller than min(d1,d2).
1589  */
1590  for (j=active_nodes.start; j<idx2; j=active_nodes.succ[j]) {
1591  t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
1592  if (tmp < mindist[j]) {
1593  nn_distances.update_leq(j, tmp);
1594  n_nghbr[j] = idx2;
1595  }
1596  else if (n_nghbr[j] == idx2)
1597  n_nghbr[j] = idx1; // invalidate
1598  }
1599  // Find the nearest neighbor for idx2.
1600  if (idx2<N_1) {
1601  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1602  min = dist.template sqeuclidean<true>(idx2,j);
1603  for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1604  t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
1605  if (tmp < min) {
1606  min = tmp;
1607  n_nghbr[idx2] = j;
1608  }
1609  }
1610  nn_distances.update(idx2, min);
1611  }
1612  }
1613  }
1614 }
1615 
1616 template <method_codes_vector method, typename t_dissimilarity>
1617 static void generic_linkage_vector_alternative(const t_index N,
1618  t_dissimilarity & dist,
1619  cluster_result & Z2) {
1620  /*
1621  N: integer, number of data points
1622  dist: function pointer to the metric
1623  Z2: output data structure
1624 
1625  This algorithm is valid for the distance update methods
1626  "Ward", "centroid" and "median" only!
1627  */
1628  const t_index N_1 = N-1;
1629  t_index i, j=0; // loop variables
1630  t_index idx1, idx2; // row and column indices
1631 
1632  auto_array_ptr<t_index> n_nghbr(2*N-2); // array of nearest neighbors
1633  auto_array_ptr<t_float> mindist(2*N-2); // distances to the nearest neighbors
1634 
1635  doubly_linked_list active_nodes(N+N_1);
1636  binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap
1637  // structure for the distance to the nearest neighbor of each point
1638 
1639  t_float min; // minimum for nearest-neighbor searches
1640 
1641  // Initialize the minimal distances:
1642  // Find the nearest neighbor of each point.
1643  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1644  for (i=1; i<N; ++i) {
1645  min = std::numeric_limits<t_float>::infinity();
1646  t_index idx;
1647  for (idx=j=0; j<i; ++j) {
1648  t_float tmp;
1649  switch (method) {
1650  case METHOD_VECTOR_WARD:
1651  tmp = dist.ward_initial(i,j);
1652  break;
1653  default:
1654  tmp = dist.template sqeuclidean<true>(i,j);
1655  }
1656  if (tmp<min) {
1657  min = tmp;
1658  idx = j;
1659  }
1660  }
1661  switch (method) {
1662  case METHOD_VECTOR_WARD:
1663  mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1664  break;
1665  default:
1666  mindist[i] = min;
1667  }
1668  n_nghbr[i] = idx;
1669  }
1670 
1671  // Put the minimal distances into a heap structure to make the repeated
1672  // global minimum searches fast.
1673  nn_distances.heapify();
1674 
1675  // Main loop: We have N-1 merging steps.
1676  for (i=N; i<N+N_1; ++i) {
1677  /*
1678  The bookkeeping is different from the "stored matrix approach" algorithm
1679  generic_linkage.
1680 
1681  mindist[i] stores a lower bound on the minimum distance of the point i to
1682  all points of *lower* index:
1683 
1684  mindist[i] ≥ min_{j<i} D(i,j)
1685 
1686  Moreover, new nodes do not re-use one of the old indices, but they are
1687  given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
1688  new nodes are N,…,2N−2).
1689 
1690  Invalid nearest neighbors are not recognized by the fact that the stored
1691  distance is smaller than the actual distance, but the list active_nodes
1692  maintains a flag whether a node is inactive. If n_nghbr[i] points to an
1693  active node, the entries nn_distances[i] and n_nghbr[i] are valid,
1694  otherwise they must be recomputed.
1695  */
1696  idx1 = nn_distances.argmin();
1697  while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1698  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1699  n_nghbr[idx1] = j = active_nodes.start;
1700  switch (method) {
1701  case METHOD_VECTOR_WARD:
1702  min = dist.ward_extended(idx1,j);
1703  for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1704  t_float tmp = dist.ward_extended(idx1,j);
1705  if (tmp<min) {
1706  min = tmp;
1707  n_nghbr[idx1] = j;
1708  }
1709  }
1710  break;
1711  default:
1712  min = dist.sqeuclidean_extended(idx1,j);
1713  for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1714  t_float const tmp = dist.sqeuclidean_extended(idx1,j);
1715  if (tmp<min) {
1716  min = tmp;
1717  n_nghbr[idx1] = j;
1718  }
1719  }
1720  }
1721  /* Update the heap with the new true minimum and search for the (possibly
1722  different) minimal entry. */
1723  nn_distances.update_geq(idx1, min);
1724  idx1 = nn_distances.argmin();
1725  }
1726 
1727  idx2 = n_nghbr[idx1];
1728  active_nodes.remove(idx1);
1729  active_nodes.remove(idx2);
1730 
1731  Z2.append(idx1, idx2, mindist[idx1]);
1732 
1733  if (i<2*N_1) {
1734  switch (method) {
1735  case METHOD_VECTOR_WARD:
1736  case METHOD_VECTOR_CENTROID:
1737  dist.merge(idx1, idx2, i);
1738  break;
1739 
1740  case METHOD_VECTOR_MEDIAN:
1741  dist.merge_weighted(idx1, idx2, i);
1742  break;
1743 
1744  default:
1745  assert(false && "fastcluster: invalid method");
1746  }
1747 
1748  n_nghbr[i] = active_nodes.start;
1749  if (method==METHOD_VECTOR_WARD) {
1750  /*
1751  Ward linkage.
1752 
1753  Shorter and longer distances can occur, not smaller than min(d1,d2)
1754  but maybe bigger than max(d1,d2).
1755  */
1756  min = dist.ward_extended(active_nodes.start, i);
1757  for (j=active_nodes.succ[active_nodes.start]; j<i;
1758  j=active_nodes.succ[j]) {
1759  t_float tmp = dist.ward_extended(j, i);
1760  if (tmp < min) {
1761  min = tmp;
1762  n_nghbr[i] = j;
1763  }
1764  }
1765  }
1766  else {
1767  /*
1768  Centroid and median linkage.
1769 
1770  Shorter and longer distances can occur, not bigger than max(d1,d2)
1771  but maybe smaller than min(d1,d2).
1772  */
1773  min = dist.sqeuclidean_extended(active_nodes.start, i);
1774  for (j=active_nodes.succ[active_nodes.start]; j<i;
1775  j=active_nodes.succ[j]) {
1776  t_float tmp = dist.sqeuclidean_extended(j, i);
1777  if (tmp < min) {
1778  min = tmp;
1779  n_nghbr[i] = j;
1780  }
1781  }
1782  }
1783  if (idx2<active_nodes.start) {
1784  nn_distances.remove(active_nodes.start);
1785  } else {
1786  nn_distances.remove(idx2);
1787  }
1788  nn_distances.replace(idx1, i, min);
1789  }
1790  }
1791 }
1792 
1793 #if HAVE_VISIBILITY
1794 #pragma GCC visibility pop
1795 #endif
cJSON * p
Definition: cJSON.cpp:2559
return NULL
Definition: cJSON.cpp:1173
newitem type
Definition: cJSON.cpp:2739
cJSON * a
Definition: cJSON.cpp:2560
const cJSON *const b
Definition: cJSON.h:255
int index
Definition: cJSON.h:170
bool fc_isnan(double x)
Definition: fastcluster.cpp:19
IntervalValue operator<(const IntervalValue &lhs, const IntervalValue &rhs)