Static Value-Flow Analysis
Loading...
Searching...
No Matches
fastcluster_dm.cpp.inc
Go to the documentation of this file.
1/*
2 fastcluster: Fast hierarchical clustering routines for R and Python
3
4 Copyright © 2011 Daniel Müllner
5 <http://danifold.net>
6
7 This library implements various fast algorithms for hierarchical,
8 agglomerative clustering methods:
9
10 (1) Algorithms for the "stored matrix approach": the input is the array of
11 pairwise dissimilarities.
12
13 MST_linkage_core: single linkage clustering with the "minimum spanning
14 tree algorithm (Rohlfs)
15
16 NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
17 complete, average, weighted and Ward linkage (Murtagh)
18
19 generic_linkage: generic algorithm, suitable for all distance update
20 formulas (Müllner)
21
22 (2) Algorithms for the "stored data approach": the input are points in a
23 vector space.
24
25 MST_linkage_core_vector: single linkage clustering for vector data
26
27 generic_linkage_vector: generic algorithm for vector data, suitable for
28 the Ward, centroid and median methods.
29
30 generic_linkage_vector_alternative: alternative scheme for updating the
31 nearest neighbors. This method seems faster than "generic_linkage_vector"
32 for the centroid and median methods but slower for the Ward method.
33
 34 All these implementations treat infinity values correctly. They throw an
35 exception if a NaN distance value occurs.
36*/
37
38// Older versions of Microsoft Visual Studio do not have the fenv header.
39#ifdef _MSC_VER
40#if (_MSC_VER == 1500 || _MSC_VER == 1600)
41#define NO_INCLUDE_FENV
42#endif
43#endif
44// NaN detection via fenv might not work on systems with software
45// floating-point emulation (bug report for Debian armel).
46#ifdef __SOFTFP__
47#define NO_INCLUDE_FENV
48#endif
49#ifdef NO_INCLUDE_FENV
50// #pragma message("Do not use fenv header.")
51#else
52// #pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
53// TODO
54//#pragma STDC FENV_ACCESS ON
55#include <fenv.h>
56#endif
57
#include <algorithm> // for std::fill_n
#include <cassert> // for assert (NaN and fenv checks below)
#include <cmath> // for std::pow, std::sqrt
#include <cstddef> // for std::ptrdiff_t
#include <limits> // for std::numeric_limits<...>::infinity()
#include <stdexcept> // for std::runtime_error
#include <string> // for std::string
64
65#include <cfloat> // also for DBL_MAX, DBL_MIN
66#ifndef DBL_MANT_DIG
67#error The constant DBL_MANT_DIG could not be defined.
68#endif
69#define T_FLOAT_MANT_DIG DBL_MANT_DIG
70
71#ifndef LONG_MAX
72#include <climits>
73#endif
74#ifndef LONG_MAX
75#error The constant LONG_MAX could not be defined.
76#endif
77#ifndef INT_MAX
78#error The constant INT_MAX could not be defined.
79#endif
80
81#ifndef INT32_MAX
82#ifdef _MSC_VER
83#if _MSC_VER >= 1600
84#define __STDC_LIMIT_MACROS
85#include <stdint.h>
86#else
87typedef __int32 int_fast32_t;
88typedef __int64 int64_t;
89#endif
90#else
91#define __STDC_LIMIT_MACROS
92#include <stdint.h>
93#endif
94#endif
95
96#define FILL_N std::fill_n
97#ifdef _MSC_VER
98#if _MSC_VER < 1600
99#undef FILL_N
100#define FILL_N stdext::unchecked_fill_n
101#endif
102#endif
103
104// Suppress warnings about (potentially) uninitialized variables.
105#ifdef _MSC_VER
106 #pragma warning (disable:4700)
107#endif
108
109#ifndef HAVE_DIAGNOSTIC
110#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
111#define HAVE_DIAGNOSTIC 1
112#endif
113#endif
114
115#ifndef HAVE_VISIBILITY
116#if __GNUC__ >= 4
117#define HAVE_VISIBILITY 1
118#endif
119#endif
120
121/* Since the public interface is given by the Python respectively R interface,
 122 * we do not want other symbols than the interface initialization routines to be
123 * visible in the shared object file. The "visibility" switch is a GCC concept.
124 * Hiding symbols keeps the relocation table small and decreases startup time.
125 * See http://gcc.gnu.org/wiki/Visibility
126 */
127#if HAVE_VISIBILITY
128#pragma GCC visibility push(hidden)
129#endif
130
131typedef int_fast32_t t_index;
132#ifndef INT32_MAX
133#define MAX_INDEX 0x7fffffffL
134#else
135#define MAX_INDEX INT32_MAX
136#endif
137#if (LONG_MAX < MAX_INDEX)
138#error The integer format "t_index" must not have a greater range than "long int".
139#endif
140#if (INT_MAX > MAX_INDEX)
141#error The integer format "int" must not have a greater range than "t_index".
142#endif
143typedef double t_float;
144
/* Method codes.

   These codes must agree with the METHODS array in fastcluster.R and the
   dictionary mthidx in fastcluster.py.
*/
enum method_codes {
  // non-Euclidean methods ("stored matrix" approach: input is a condensed
  // distance matrix)
  METHOD_METR_SINGLE = 0,
  METHOD_METR_COMPLETE = 1,
  METHOD_METR_AVERAGE = 2,
  METHOD_METR_WEIGHTED = 3,
  METHOD_METR_WARD = 4,
  METHOD_METR_WARD_D = METHOD_METR_WARD, // alias for METHOD_METR_WARD
  METHOD_METR_CENTROID = 5,
  METHOD_METR_MEDIAN = 6,
  METHOD_METR_WARD_D2 = 7,

  // inclusive range of valid codes, for input validation
  MIN_METHOD_CODE = 0,
  MAX_METHOD_CODE = 7
};
165
enum method_codes_vector {
  // Euclidean methods ("stored data" approach: input are points in a
  // vector space)
  METHOD_VECTOR_SINGLE = 0,
  METHOD_VECTOR_WARD = 1,
  METHOD_VECTOR_CENTROID = 2,
  METHOD_VECTOR_MEDIAN = 3,

  // inclusive range of valid codes, for input validation
  MIN_METHOD_VECTOR_CODE = 0,
  MAX_METHOD_VECTOR_CODE = 3
};
176
177// self-destructing array pointer
// self-destructing array pointer
// Minimal scoped owner of a heap-allocated array (a pre-C++11 stand-in for
// std::unique_ptr<type[]>, kept compatible with the old MSVC builds this
// file supports). Noncopyable; the array is released by the destructor, by
// free(), or when init() re-allocates.
template <typename type>
class auto_array_ptr{
private:
  type * ptr;
  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
  auto_array_ptr& operator=(auto_array_ptr const &); // non copyable
public:
  // Construct an empty owner (no allocation).
  auto_array_ptr()
    : ptr(NULL)
  { }
  // Allocate an uninitialized array of the given size.
  template <typename index>
  auto_array_ptr(index const size)
    : ptr(new type[size])
  { }
  // Allocate an array of the given size and fill every element with val.
  template <typename index, typename value>
  auto_array_ptr(index const size, value const val)
    : ptr(new type[size])
  {
    FILL_N(ptr, size, val);
  }
  ~auto_array_ptr() {
    delete [] ptr; }
  // Release the array early; the object stays usable (empty).
  void free() {
    delete [] ptr;
    ptr = NULL;
  }
  // (Re-)allocate an uninitialized array of the given size.
  // Fix: release any previously owned array first, so calling init() on an
  // object that already owns memory no longer leaks it. (delete[] on a
  // null pointer is a no-op, so this is safe after the default ctor too.)
  template <typename index>
  void init(index const size) {
    delete [] ptr;
    ptr = NULL; // keep the object consistent if the allocation below throws
    ptr = new type [size];
  }
  // (Re-)allocate an array of the given size and fill it with val.
  template <typename index, typename value>
  void init(index const size, value const val) {
    init(size);
    FILL_N(ptr, size, val);
  }
  // Implicit conversion so the owner can be used wherever a raw pointer to
  // the array is expected.
  inline operator type *() const { return ptr; }
};
215
// One merge step of the dendrogram: the labels of the two clusters being
// joined and the dissimilarity at which they are joined.
struct node {
  t_index node1, node2;
  t_float dist;
};

// Order merge steps by their dissimilarity (used to sort the output).
inline bool operator< (const node a, const node b) {
  return (a.dist < b.dist);
}
224
225class cluster_result {
226private:
227 auto_array_ptr<node> Z;
228 t_index pos;
229
230public:
231 cluster_result(const t_index size)
232 : Z(size)
233 , pos(0)
234 {}
235
236 void append(const t_index node1, const t_index node2, const t_float dist) {
237 Z[pos].node1 = node1;
238 Z[pos].node2 = node2;
239 Z[pos].dist = dist;
240 ++pos;
241 }
242
243 node * operator[] (const t_index idx) const { return Z + idx; }
244
245 /* Define several methods to postprocess the distances. All these functions
246 are monotone, so they do not change the sorted order of distances. */
247
248 void sqrt() const {
249 for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
250 ZZ->dist = std::sqrt(ZZ->dist);
251 }
252 }
253
254 void sqrt(const t_float) const { // ignore the argument
255 sqrt();
256 }
257
258 void sqrtdouble(const t_float) const { // ignore the argument
259 for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
260 ZZ->dist = std::sqrt(2*ZZ->dist);
261 }
262 }
263
264 #ifdef R_pow
265 #define my_pow R_pow
266 #else
267 #define my_pow std::pow
268 #endif
269
270 void power(const t_float p) const {
271 t_float const q = 1/p;
272 for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
273 ZZ->dist = my_pow(ZZ->dist,q);
274 }
275 }
276
277 void plusone(const t_float) const { // ignore the argument
278 for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
279 ZZ->dist += 1;
280 }
281 }
282
283 void divide(const t_float denom) const {
284 for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
285 ZZ->dist /= denom;
286 }
287 }
288};
289
class doubly_linked_list {
  /*
    Class for a doubly linked list. Initially, the list is the integer range
    [0, size]. We provide a forward iterator and a method to delete an index
    from the list.

    Typical use: for (i=L.start; i<size; i=L.succ[i])
    or
    for (i=somevalue; i<size; i=L.succ[i])
  */
public:
  t_index start;                 // first active index
  auto_array_ptr<t_index> succ;  // succ[i]: next active index after i

private:
  auto_array_ptr<t_index> pred;  // pred[i]: previous active index before i
  // Not necessarily private, we just do not need it in this instance.

public:
  doubly_linked_list(const t_index size)
    // Initialize to the given size.
    : start(0)
    , succ(size+1)
    , pred(size+1)
  {
    // Link 0 → 1 → … → size; `size` acts as the one-past-the-end sentinel.
    for (t_index i=0; i<size; ++i) {
      pred[i+1] = i;
      succ[i] = i+1;
    }
    // pred[0] is never accessed!
    //succ[size] is never accessed!
  }

  ~doubly_linked_list() {}

  void remove(const t_index idx) {
    // Remove an index from the list.
    if (idx==start) {
      start = succ[idx];
    }
    else {
      succ[pred[idx]] = succ[idx];
      pred[succ[idx]] = pred[idx];
    }
    succ[idx] = 0; // Mark as inactive
  }

  // True iff idx was remove()d earlier (its successor slot was zeroed).
  bool is_inactive(t_index idx) const {
    return (succ[idx]==0);
  }
};
341
// Indexing functions
// D is the upper triangular part of a symmetric (NxN)-matrix
// We require r_ < c_ !
// D_(r,c) maps the pair (r,c) with r<c to its offset in the condensed
// row-major upper-triangle array of length N*(N-1)/2; the multiplication
// is widened to std::ptrdiff_t to avoid overflow for large N.
#define D_(r_,c_) ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
// Z is an ((N-1)x4)-array
#define Z_(_r, _c) (Z[(_r)*4 + (_c)])
348
349/*
350 Lookup function for a union-find data structure.
351
352 The function finds the root of idx by going iteratively through all
353 parent elements until a root is found. An element i is a root if
354 nodes[i] is zero. To make subsequent searches faster, the entry for
355 idx and all its parents is updated with the root element.
356 */
357class union_find {
358private:
359 auto_array_ptr<t_index> parent;
360 t_index nextparent;
361
362public:
363 union_find(const t_index size)
364 : parent(size>0 ? 2*size-1 : 0, 0)
365 , nextparent(size)
366 { }
367
368 t_index Find (t_index idx) const {
369 if (parent[idx] != 0 ) { // a → b
370 t_index p = idx;
371 idx = parent[idx];
372 if (parent[idx] != 0 ) { // a → b → c
373 do {
374 idx = parent[idx];
375 } while (parent[idx] != 0);
376 do {
377 t_index tmp = parent[p];
378 parent[p] = idx;
379 p = tmp;
380 } while (parent[p] != idx);
381 }
382 }
383 return idx;
384 }
385
386 void Union (const t_index node1, const t_index node2) {
387 parent[node1] = parent[node2] = nextparent++;
388 }
389};
390
// Exception type: a NaN was encountered among the dissimilarities.
class nan_error{};
#ifdef FE_INVALID
// Exception type: a floating-point environment (fenv) call failed.
class fenv_error{};
#endif
395
static void MST_linkage_core(const t_index N, const t_float * const D,
                             cluster_result & Z2) {
/*
  N: integer, number of data points
  D: condensed distance matrix N*(N-1)/2
  Z2: output data structure

  The basis of this algorithm is an algorithm by Rohlf:

  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
  The Computer Journal, vol. 16, 1973, p. 93–95.

  Single linkage via minimum-spanning-tree construction: in every step the
  point closest to the already-built tree is attached. d[i] caches the
  current distance of point i to the tree. A NaN distance triggers an
  assertion failure (this vendored copy asserts instead of throwing).
*/
  t_index i;
  t_index idx2;
  doubly_linked_list active_nodes(N);
  auto_array_ptr<t_float> d(N); // d[i]: current distance of point i to the tree

  t_index prev_node;
  t_float min;

  // first iteration: seed the tree with point 0 and find its nearest neighbor
  idx2 = 1;
  min = std::numeric_limits<t_float>::infinity();
  for (i=1; i<N; ++i) {
    d[i] = D[i-1]; // distances (0,i) occupy the first N-1 entries of D
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
    if (d[i] < min) {
      min = d[i];
      idx2 = i;
    }
    else if (fc_isnan(d[i]))
      assert(false && "fastcluster: nan error");
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  }
  Z2.append(0, idx2, min);

  // N-2 further steps: attach the point currently closest to the tree.
  for (t_index j=1; j<N-1; ++j) {
    prev_node = idx2; // the point attached in the previous step
    active_nodes.remove(prev_node);

    idx2 = active_nodes.succ[0];
    min = d[idx2];
    // Indices below prev_node: their distance to prev_node is D_(i, prev_node).
    for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
      t_float tmp = D_(i, prev_node);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
      if (tmp < d[i])
        d[i] = tmp;
      else if (fc_isnan(tmp))
        assert(false && "fastcluster: nan error");
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
      if (d[i] < min) {
        min = d[i];
        idx2 = i;
      }
    }
    // Indices above prev_node: their distance to prev_node is D_(prev_node, i).
    for (; i<N; i=active_nodes.succ[i]) {
      t_float tmp = D_(prev_node, i);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
      if (d[i] > tmp)
        d[i] = tmp;
      else if (fc_isnan(tmp))
        assert(false && "fastcluster: nan error");
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
      if (d[i] < min) {
        min = d[i];
        idx2 = i;
      }
    }
    Z2.append(prev_node, idx2, min);
  }
}
482
483/* Functions for the update of the dissimilarity array */
484
485inline static void f_single( t_float * const b, const t_float a ) {
486 if (*b > a) *b = a;
487}
488inline static void f_complete( t_float * const b, const t_float a ) {
489 if (*b < a) *b = a;
490}
// Average linkage (UPGMA) update: *b ← s·a + t·(*b), where s and t are the
// relative sizes of the two merged clusters (the caller passes s+t = 1 —
// see the call sites in NN_chain_core/generic_linkage).
inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
  *b = s*a + t*(*b);
  #ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    assert(false && "fastcluster: nan error");
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  #endif
}
// Weighted linkage (WPGMA) update: *b ← (a + *b)/2.
inline static void f_weighted( t_float * const b, const t_float a) {
  *b = (a+*b)*.5;
  #ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    assert(false && "fastcluster: nan error");
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  #endif
}
// Ward linkage update (Lance–Williams form):
//   *b ← ((v+s)·a − v·c + (v+t)·(*b)) / (s+t+v)
// where s, t are the sizes of the two merged clusters, v the size of the
// third cluster, and c the dissimilarity between the two merged clusters.
inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
  *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v);
  //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
  #ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    assert(false && "fastcluster: nan error");
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  #endif
}
537inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
538 *b = s*a - stc + t*(*b);
539 #ifndef FE_INVALID
540 if (fc_isnan(*b)) {
541 assert(false && "fastcluster: nan error");
542 }
543#if HAVE_DIAGNOSTIC
544#pragma GCC diagnostic pop
545#endif
546 #endif
547}
// Median linkage update: *b ← (a + *b)/2 − c_4.
// c_4 is a precomputed correction term supplied by the caller (presumably
// one quarter of the distance between the merged clusters, per the median
// formula — confirm at the call site, which lies outside this chunk).
inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
  *b = (a+(*b))*.5 - c_4;
  #ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    assert(false && "fastcluster: nan error");
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  #endif
}
563
template <method_codes method, typename t_members>
static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
/*
  N: integer
  D: condensed distance matrix N*(N-1)/2
  Z2: output data structure

  This is the NN-chain algorithm, described on page 86 in the following book:

  Fionn Murtagh, Multidimensional Clustering Algorithms,
  Vienna, Würzburg: Physica-Verlag, 1985.

  members: cluster sizes, read and updated in place for the average and
  Ward methods. D is overwritten with updated dissimilarities as clusters
  merge. Any NaN in D triggers an assertion failure up front.
*/
  t_index i;

  auto_array_ptr<t_index> NN_chain(N); // stack of chained nearest neighbors
  t_index NN_chain_tip = 0;            // number of valid entries in NN_chain

  t_index idx1, idx2;

  t_float size1, size2;
  doubly_linked_list active_nodes(N);

  t_float min;

  // Reject NaN dissimilarities before starting.
  for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
       ++DD) {
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
    if (fc_isnan(*DD)) {
      assert(false && "fastcluster: nan error");
    }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  }

  #ifdef FE_INVALID
  if (feclearexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
  #endif

  // Main loop: N-1 merging steps.
  for (t_index j=0; j<N-1; ++j) {
    if (NN_chain_tip <= 3) {
      // (Re-)start the chain from the first active node and its nearest
      // neighbor.
      NN_chain[0] = idx1 = active_nodes.start;
      NN_chain_tip = 1;

      idx2 = active_nodes.succ[idx1];
      min = D_(idx1,idx2);
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
        if (D_(idx1,i) < min) {
          min = D_(idx1,i);
          idx2 = i;
        }
      }
    }  // a: idx1   b: idx2
    else {
      // Resume the chain three entries below the reciprocal pair that was
      // merged in the previous step.
      NN_chain_tip -= 3;
      idx1 = NN_chain[NN_chain_tip-1];
      idx2 = NN_chain[NN_chain_tip];
      min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
    }  // a: idx1   b: idx2

    // Extend the chain by nearest neighbors until a reciprocal pair
    // (mutual nearest neighbors) is found; that pair is merged.
    do {
      NN_chain[NN_chain_tip] = idx2;

      for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
        if (D_(i,idx2) < min) {
          min = D_(i,idx2);
          idx1 = i;
        }
      }
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
        if (D_(idx2,i) < min) {
          min = D_(idx2,i);
          idx1 = i;
        }
      }

      idx2 = idx1;
      idx1 = NN_chain[NN_chain_tip++];

    } while (idx2 != NN_chain[NN_chain_tip-2]);

    Z2.append(idx1, idx2, min);

    // Normalize so that idx1 < idx2 from here on.
    if (idx1>idx2) {
      t_index tmp = idx1;
      idx1 = idx2;
      idx2 = tmp;
    }

    // Cluster sizes are needed by the average and Ward update formulas.
    if (method==METHOD_METR_AVERAGE ||
        method==METHOD_METR_WARD) {
      size1 = static_cast<t_float>(members[idx1]);
      size2 = static_cast<t_float>(members[idx2]);
      members[idx2] += members[idx1];
    }

    // Remove the smaller index from the valid indices (active_nodes).
    active_nodes.remove(idx1);

    // The merged cluster is represented by idx2; update row/column idx2 of D.
    switch (method) {
    case METHOD_METR_SINGLE:
      /*
      Single linkage.

      Characteristic: new distances are never longer than the old distances.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_single(&D_(i, idx2), D_(i, idx1) );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_single(&D_(i, idx2), D_(idx1, i) );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_single(&D_(idx2, i), D_(idx1, i) );
      break;

    case METHOD_METR_COMPLETE:
      /*
      Complete linkage.

      Characteristic: new distances are never shorter than the old distances.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_complete(&D_(i, idx2), D_(i, idx1) );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_complete(&D_(i, idx2), D_(idx1, i) );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_complete(&D_(idx2, i), D_(idx1, i) );
      break;

    case METHOD_METR_AVERAGE: {
      /*
      Average linkage.

      Shorter and longer distances can occur.
      */
      // Update the distance matrix in the range [start, idx1).
      t_float s = size1/(size1+size2);
      t_float t = size2/(size1+size2);
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_average(&D_(i, idx2), D_(i, idx1), s, t );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_average(&D_(i, idx2), D_(idx1, i), s, t );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_average(&D_(idx2, i), D_(idx1, i), s, t );
      break;
    }

    case METHOD_METR_WEIGHTED:
      /*
      Weighted linkage.

      Shorter and longer distances can occur.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_weighted(&D_(i, idx2), D_(i, idx1) );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_weighted(&D_(i, idx2), D_(idx1, i) );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_weighted(&D_(idx2, i), D_(idx1, i) );
      break;

    case METHOD_METR_WARD:
      /*
      Ward linkage.

      Shorter and longer distances can occur, not smaller than min(d1,d2)
      but maybe bigger than max(d1,d2).
      */
      // Update the distance matrix in the range [start, idx1).
      //t_float v = static_cast<t_float>(members[i]);
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_ward(&D_(i, idx2), D_(i, idx1), min,
               size1, size2, static_cast<t_float>(members[i]) );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_ward(&D_(i, idx2), D_(idx1, i), min,
               size1, size2, static_cast<t_float>(members[i]) );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_ward(&D_(idx2, i), D_(idx1, i), min,
               size1, size2, static_cast<t_float>(members[i]) );
      break;

    default:
      assert(false && "fastcluster: invalid method");
    }
  }
  #ifdef FE_INVALID
  // If any update produced a NaN, the FE_INVALID flag is set.
  if (fetestexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
  #endif
}
768
class binary_min_heap {
  /*
  Class for a binary min-heap. The data resides in an array A. The elements of
  A are not changed but two lists I and R of indices are generated which point
  to elements of A and backwards.

  The heap tree structure is

    H[2*i+1]   H[2*i+2]
        \         /
         \       /
          ≤     ≤
           \   /
            \ /
           H[i]

  where the children must be less or equal than their parent. Thus, H[0]
  contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
  and R[I[i]] = i.

  This implementation is not designed to handle NaN values.
  */
private:
  t_float * const A;          // external data array (values, never moved)
  t_index size;               // current number of elements in the heap
  auto_array_ptr<t_index> I;  // heap position → index into A
  auto_array_ptr<t_index> R;  // index into A → heap position
  // (invariants: H(i) = A[I[i]] and R[I[i]] = i)

  // no default constructor
  binary_min_heap();
  // noncopyable
  binary_min_heap(binary_min_heap const &);
  binary_min_heap & operator=(binary_min_heap const &);

public:
  binary_min_heap(t_float * const A_, const t_index size_)
    : A(A_), size(size_), I(size), R(size)
  { // Allocate memory and initialize the lists I and R to the identity. This
    // does not make it a heap. Call heapify afterwards!
    for (t_index i=0; i<size; ++i)
      R[i] = I[i] = i;
  }

  binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
                  const t_index start)
    : A(A_), size(size1), I(size1), R(size2)
  { // Allocate memory and initialize the lists I and R to the identity. This
    // does not make it a heap. Call heapify afterwards!
    // Here the heap covers A[start .. start+size1-1]; R has room for size2
    // entries so it can be addressed by the original array indices.
    for (t_index i=0; i<size; ++i) {
      R[i+start] = i;
      I[i] = i + start;
    }
  }

  ~binary_min_heap() {}

  void heapify() {
    // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
    // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
    //
    // Complexity: Θ(size)
    // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
    // 3rd ed., 2009, Section 6.3 “Building a heap”
    t_index idx;
    for (idx=(size>>1); idx>0; ) {
      --idx;
      update_geq_(idx);
    }
  }

  inline t_index argmin() const {
    // Return the minimal element.
    return I[0];
  }

  void heap_pop() {
    // Remove the minimal element from the heap.
    // The last element takes the root's place and is sifted down.
    --size;
    I[0] = I[size];
    R[I[0]] = 0;
    update_geq_(0);
  }

  void remove(t_index idx) {
    // Remove an element from the heap.
    // The last element takes the removed element's heap slot, then is
    // sifted up or down depending on how it compares to the removed value.
    --size;
    R[I[size]] = R[idx];
    I[R[idx]] = I[size];
    if ( H(size)<=A[idx] ) {
      update_leq_(R[idx]);
    }
    else {
      update_geq_(R[idx]);
    }
  }

  void replace ( const t_index idxold, const t_index idxnew,
                 const t_float val) {
    // Replace element idxold by idxnew with value val, re-using idxold's
    // heap slot.
    R[idxnew] = R[idxold];
    I[R[idxnew]] = idxnew;
    if (val<=A[idxold])
      update_leq(idxnew, val);
    else
      update_geq(idxnew, val);
  }

  void update ( const t_index idx, const t_float val ) const {
    // Update the element A[i] with val and re-arrange the indices to preserve
    // the heap condition.
    if (val<=A[idx])
      update_leq(idx, val);
    else
      update_geq(idx, val);
  }

  void update_leq ( const t_index idx, const t_float val ) const {
    // Use this when the new value is not more than the old value.
    A[idx] = val;
    update_leq_(R[idx]);
  }

  void update_geq ( const t_index idx, const t_float val ) const {
    // Use this when the new value is not less than the old value.
    A[idx] = val;
    update_geq_(R[idx]);
  }

private:
  void update_leq_ (t_index i) const {
    // Sift the element at heap position i up while it is smaller than its
    // parent.
    t_index j;
    for ( ; (i>0) && ( H(i)<H(j=(i-1)>>1) ); i=j)
      heap_swap(i,j);
  }

  void update_geq_ (t_index i) const {
    // Sift the element at heap position i down, swapping it with a smaller
    // child until the heap condition holds again.
    t_index j;
    for ( ; (j=2*i+1)<size; i=j) {
      if ( H(j)>=H(i) ) {
        ++j; // left child is not smaller; try the right child
        if ( j>=size || H(j)>=H(i) ) break;
      }
      else if ( j+1<size && H(j+1)<H(j) ) ++j; // pick the smaller child
      heap_swap(i, j);
    }
  }

  void heap_swap(const t_index i, const t_index j) const {
    // Swap two indices.
    t_index tmp = I[i];
    I[i] = I[j];
    I[j] = tmp;
    R[I[i]] = i;
    R[I[j]] = j;
  }

  inline t_float H(const t_index i) const {
    // Value of the element at heap position i.
    return A[I[i]];
  }

};
929
930template <method_codes method, typename t_members>
931static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
932 /*
933 N: integer, number of data points
934 D: condensed distance matrix N*(N-1)/2
935 Z2: output data structure
936 */
937
938 const t_index N_1 = N-1;
939 t_index i, j; // loop variables
940 t_index idx1, idx2; // row and column indices
941
942 auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
943 auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
944 auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
945 // i-th row represents
946 doubly_linked_list active_nodes(N);
947 binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
948 // the distance to the nearest neighbor of each point
949 t_index node1, node2; // node numbers in the output
950 t_float size1, size2; // and their cardinalities
951
952 t_float min; // minimum and row index for nearest-neighbor search
953 t_index idx;
954
955 for (i=0; i<N; ++i)
956 // Build a list of row ↔ node label assignments.
957 // Initially i ↦ i
958 row_repr[i] = i;
959
960 // Initialize the minimal distances:
961 // Find the nearest neighbor of each point.
962 // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
963 t_float const * DD = D;
964 for (i=0; i<N_1; ++i) {
965 min = std::numeric_limits<t_float>::infinity();
966 for (idx=j=i+1; j<N; ++j, ++DD) {
967#if HAVE_DIAGNOSTIC
968#pragma GCC diagnostic push
969#pragma GCC diagnostic ignored "-Wfloat-equal"
970#endif
971 if (*DD<min) {
972 min = *DD;
973 idx = j;
974 }
975 else if (fc_isnan(*DD))
976 assert(false && "fastcluster: nan error");
977 }
978#if HAVE_DIAGNOSTIC
979#pragma GCC diagnostic pop
980#endif
981 mindist[i] = min;
982 n_nghbr[i] = idx;
983 }
984
985 // Put the minimal distances into a heap structure to make the repeated
986 // global minimum searches fast.
987 nn_distances.heapify();
988
989 #ifdef FE_INVALID
990 if (feclearexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
991 #endif
992
993 // Main loop: We have N-1 merging steps.
994 for (i=0; i<N_1; ++i) {
995 /*
996 Here is a special feature that allows fast bookkeeping and updates of the
997 minimal distances.
998
999 mindist[i] stores a lower bound on the minimum distance of the point i to
1000 all points of higher index:
1001
1002 mindist[i] ≥ min_{j>i} D(i,j)
1003
1004 Normally, we have equality. However, this minimum may become invalid due
1005 to the updates in the distance matrix. The rules are:
1006
1007 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
1008 minimum and n_nghbr[i] is a nearest neighbor.
1009
1010 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
1011 correct minimum. The minimum needs to be recomputed.
1012
1013 3) mindist[i] is never bigger than the true minimum. Hence, we never
1014 miss the true minimum if we take the smallest mindist entry,
1015 re-compute the value if necessary (thus maybe increasing it) and
1016 looking for the now smallest mindist entry until a valid minimal
1017 entry is found. This step is done in the lines below.
1018
1019 The update process for D below takes care that these rules are
1020 fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
1021 re-calculated when necessary but re-calculation is avoided whenever
1022 possible.
1023
1024 The re-calculation of the minima makes the worst-case runtime of this
1025 algorithm cubic in N. We avoid this whenever possible, and in most cases
1026 the runtime appears to be quadratic.
1027 */
1028 idx1 = nn_distances.argmin();
1029 if (method != METHOD_METR_SINGLE) {
1030 while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) {
1031 // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1032 n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1033 min = D_(idx1,j);
1034 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1035 if (D_(idx1,j)<min) {
1036 min = D_(idx1,j);
1037 n_nghbr[idx1] = j;
1038 }
1039 }
1040 /* Update the heap with the new true minimum and search for the
1041 (possibly different) minimal entry. */
1042 nn_distances.update_geq(idx1, min);
1043 idx1 = nn_distances.argmin();
1044 }
1045 }
1046
1047 nn_distances.heap_pop(); // Remove the current minimum from the heap.
1048 idx2 = n_nghbr[idx1];
1049
1050 // Write the newly found minimal pair of nodes to the output array.
1051 node1 = row_repr[idx1];
1052 node2 = row_repr[idx2];
1053
1054 if (method==METHOD_METR_AVERAGE ||
1055 method==METHOD_METR_WARD ||
1056 method==METHOD_METR_CENTROID) {
1057 size1 = static_cast<t_float>(members[idx1]);
1058 size2 = static_cast<t_float>(members[idx2]);
1059 members[idx2] += members[idx1];
1060 }
1061 Z2.append(node1, node2, mindist[idx1]);
1062
1063 // Remove idx1 from the list of active indices (active_nodes).
1064 active_nodes.remove(idx1);
1065 // Index idx2 now represents the new (merged) node with label N+i.
1066 row_repr[idx2] = N+i;
1067
1068 // Update the distance matrix
1069 switch (method) {
1070 case METHOD_METR_SINGLE:
1071 /*
1072 Single linkage.
1073
1074 Characteristic: new distances are never longer than the old distances.
1075 */
1076 // Update the distance matrix in the range [start, idx1).
1077 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1078 f_single(&D_(j, idx2), D_(j, idx1));
1079 if (n_nghbr[j] == idx1)
1080 n_nghbr[j] = idx2;
1081 }
1082 // Update the distance matrix in the range (idx1, idx2).
1083 for (; j<idx2; j=active_nodes.succ[j]) {
1084 f_single(&D_(j, idx2), D_(idx1, j));
1085 // If the new value is below the old minimum in a row, update
1086 // the mindist and n_nghbr arrays.
1087 if (D_(j, idx2) < mindist[j]) {
1088 nn_distances.update_leq(j, D_(j, idx2));
1089 n_nghbr[j] = idx2;
1090 }
1091 }
1092 // Update the distance matrix in the range (idx2, N).
1093 // Recompute the minimum mindist[idx2] and n_nghbr[idx2].
1094 if (idx2<N_1) {
1095 min = mindist[idx2];
1096 for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j]) {
1097 f_single(&D_(idx2, j), D_(idx1, j) );
1098 if (D_(idx2, j) < min) {
1099 n_nghbr[idx2] = j;
1100 min = D_(idx2, j);
1101 }
1102 }
1103 nn_distances.update_leq(idx2, min);
1104 }
1105 break;
1106
1107 case METHOD_METR_COMPLETE:
1108 /*
1109 Complete linkage.
1110
1111 Characteristic: new distances are never shorter than the old distances.
1112 */
1113 // Update the distance matrix in the range [start, idx1).
1114 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1115 f_complete(&D_(j, idx2), D_(j, idx1) );
1116 if (n_nghbr[j] == idx1)
1117 n_nghbr[j] = idx2;
1118 }
1119 // Update the distance matrix in the range (idx1, idx2).
1120 for (; j<idx2; j=active_nodes.succ[j])
1121 f_complete(&D_(j, idx2), D_(idx1, j) );
1122 // Update the distance matrix in the range (idx2, N).
1123 for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j])
1124 f_complete(&D_(idx2, j), D_(idx1, j) );
1125 break;
1126
1127 case METHOD_METR_AVERAGE: {
1128 /*
1129 Average linkage.
1130
1131 Shorter and longer distances can occur.
1132 */
1133 // Update the distance matrix in the range [start, idx1).
1134 t_float s = size1/(size1+size2);
1135 t_float t = size2/(size1+size2);
1136 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1137 f_average(&D_(j, idx2), D_(j, idx1), s, t);
1138 if (n_nghbr[j] == idx1)
1139 n_nghbr[j] = idx2;
1140 }
1141 // Update the distance matrix in the range (idx1, idx2).
1142 for (; j<idx2; j=active_nodes.succ[j]) {
1143 f_average(&D_(j, idx2), D_(idx1, j), s, t);
1144 if (D_(j, idx2) < mindist[j]) {
1145 nn_distances.update_leq(j, D_(j, idx2));
1146 n_nghbr[j] = idx2;
1147 }
1148 }
1149 // Update the distance matrix in the range (idx2, N).
1150 if (idx2<N_1) {
1151 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1152 f_average(&D_(idx2, j), D_(idx1, j), s, t);
1153 min = D_(idx2,j);
1154 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1155 f_average(&D_(idx2, j), D_(idx1, j), s, t);
1156 if (D_(idx2,j) < min) {
1157 min = D_(idx2,j);
1158 n_nghbr[idx2] = j;
1159 }
1160 }
1161 nn_distances.update(idx2, min);
1162 }
1163 break;
1164 }
1165
1166 case METHOD_METR_WEIGHTED:
1167 /*
1168 Weighted linkage.
1169
1170 Shorter and longer distances can occur.
1171 */
1172 // Update the distance matrix in the range [start, idx1).
1173 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1174 f_weighted(&D_(j, idx2), D_(j, idx1) );
1175 if (n_nghbr[j] == idx1)
1176 n_nghbr[j] = idx2;
1177 }
1178 // Update the distance matrix in the range (idx1, idx2).
1179 for (; j<idx2; j=active_nodes.succ[j]) {
1180 f_weighted(&D_(j, idx2), D_(idx1, j) );
1181 if (D_(j, idx2) < mindist[j]) {
1182 nn_distances.update_leq(j, D_(j, idx2));
1183 n_nghbr[j] = idx2;
1184 }
1185 }
1186 // Update the distance matrix in the range (idx2, N).
1187 if (idx2<N_1) {
1188 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1189 f_weighted(&D_(idx2, j), D_(idx1, j) );
1190 min = D_(idx2,j);
1191 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1192 f_weighted(&D_(idx2, j), D_(idx1, j) );
1193 if (D_(idx2,j) < min) {
1194 min = D_(idx2,j);
1195 n_nghbr[idx2] = j;
1196 }
1197 }
1198 nn_distances.update(idx2, min);
1199 }
1200 break;
1201
1202 case METHOD_METR_WARD:
1203 /*
1204 Ward linkage.
1205
1206 Shorter and longer distances can occur, not smaller than min(d1,d2)
1207 but maybe bigger than max(d1,d2).
1208 */
1209 // Update the distance matrix in the range [start, idx1).
1210 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1211 f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
1212 size1, size2, static_cast<t_float>(members[j]) );
1213 if (n_nghbr[j] == idx1)
1214 n_nghbr[j] = idx2;
1215 }
1216 // Update the distance matrix in the range (idx1, idx2).
1217 for (; j<idx2; j=active_nodes.succ[j]) {
1218 f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
1219 static_cast<t_float>(members[j]) );
1220 if (D_(j, idx2) < mindist[j]) {
1221 nn_distances.update_leq(j, D_(j, idx2));
1222 n_nghbr[j] = idx2;
1223 }
1224 }
1225 // Update the distance matrix in the range (idx2, N).
1226 if (idx2<N_1) {
1227 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1228 f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1229 size1, size2, static_cast<t_float>(members[j]) );
1230 min = D_(idx2,j);
1231 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1232 f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1233 size1, size2, static_cast<t_float>(members[j]) );
1234 if (D_(idx2,j) < min) {
1235 min = D_(idx2,j);
1236 n_nghbr[idx2] = j;
1237 }
1238 }
1239 nn_distances.update(idx2, min);
1240 }
1241 break;
1242
1243 case METHOD_METR_CENTROID: {
1244 /*
1245 Centroid linkage.
1246
1247 Shorter and longer distances can occur, not bigger than max(d1,d2)
1248 but maybe smaller than min(d1,d2).
1249 */
1250 // Update the distance matrix in the range [start, idx1).
1251 t_float s = size1/(size1+size2);
1252 t_float t = size2/(size1+size2);
1253 t_float stc = s*t*mindist[idx1];
1254 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1255 f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
1256 if (D_(j, idx2) < mindist[j]) {
1257 nn_distances.update_leq(j, D_(j, idx2));
1258 n_nghbr[j] = idx2;
1259 }
1260 else if (n_nghbr[j] == idx1)
1261 n_nghbr[j] = idx2;
1262 }
1263 // Update the distance matrix in the range (idx1, idx2).
1264 for (; j<idx2; j=active_nodes.succ[j]) {
1265 f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
1266 if (D_(j, idx2) < mindist[j]) {
1267 nn_distances.update_leq(j, D_(j, idx2));
1268 n_nghbr[j] = idx2;
1269 }
1270 }
1271 // Update the distance matrix in the range (idx2, N).
1272 if (idx2<N_1) {
1273 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1274 f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1275 min = D_(idx2,j);
1276 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1277 f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1278 if (D_(idx2,j) < min) {
1279 min = D_(idx2,j);
1280 n_nghbr[idx2] = j;
1281 }
1282 }
1283 nn_distances.update(idx2, min);
1284 }
1285 break;
1286 }
1287
1288 case METHOD_METR_MEDIAN: {
1289 /*
1290 Median linkage.
1291
1292 Shorter and longer distances can occur, not bigger than max(d1,d2)
1293 but maybe smaller than min(d1,d2).
1294 */
1295 // Update the distance matrix in the range [start, idx1).
1296 t_float c_4 = mindist[idx1]*.25;
1297 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1298 f_median(&D_(j, idx2), D_(j, idx1), c_4 );
1299 if (D_(j, idx2) < mindist[j]) {
1300 nn_distances.update_leq(j, D_(j, idx2));
1301 n_nghbr[j] = idx2;
1302 }
1303 else if (n_nghbr[j] == idx1)
1304 n_nghbr[j] = idx2;
1305 }
1306 // Update the distance matrix in the range (idx1, idx2).
1307 for (; j<idx2; j=active_nodes.succ[j]) {
1308 f_median(&D_(j, idx2), D_(idx1, j), c_4 );
1309 if (D_(j, idx2) < mindist[j]) {
1310 nn_distances.update_leq(j, D_(j, idx2));
1311 n_nghbr[j] = idx2;
1312 }
1313 }
1314 // Update the distance matrix in the range (idx2, N).
1315 if (idx2<N_1) {
1316 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1317 f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1318 min = D_(idx2,j);
1319 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1320 f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1321 if (D_(idx2,j) < min) {
1322 min = D_(idx2,j);
1323 n_nghbr[idx2] = j;
1324 }
1325 }
1326 nn_distances.update(idx2, min);
1327 }
1328 break;
1329 }
1330
1331 default:
1332 assert(false && "fastcluster: invalid method");
1333 }
1334 }
1335 #ifdef FE_INVALID
1336 if (fetestexcept(FE_INVALID)) assert(false && "fastcluster: fenv error");
1337 #endif
1338}
1339
1340/*
1341 Clustering methods for vector data
1342*/
1343
1344template <typename t_dissimilarity>
1345static void MST_linkage_core_vector(const t_index N,
1346 t_dissimilarity & dist,
1347 cluster_result & Z2) {
1348/*
1349 N: integer, number of data points
1350 dist: function pointer to the metric
1351 Z2: output data structure
1352
1353 The basis of this algorithm is an algorithm by Rohlf:
1354
1355 F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
1356 The Computer Journal, vol. 16, 1973, p. 93–95.
1357*/
1358 t_index i;
1359 t_index idx2;
1360 doubly_linked_list active_nodes(N);
1361 auto_array_ptr<t_float> d(N);
1362
1363 t_index prev_node;
1364 t_float min;
1365
1366 // first iteration
1367 idx2 = 1;
1368 min = std::numeric_limits<t_float>::infinity();
1369 for (i=1; i<N; ++i) {
1370 d[i] = dist(0,i);
1371#if HAVE_DIAGNOSTIC
1372#pragma GCC diagnostic push
1373#pragma GCC diagnostic ignored "-Wfloat-equal"
1374#endif
1375 if (d[i] < min) {
1376 min = d[i];
1377 idx2 = i;
1378 }
1379 else if (fc_isnan(d[i]))
1380 assert(false && "fastcluster: nan error");
1381#if HAVE_DIAGNOSTIC
1382#pragma GCC diagnostic pop
1383#endif
1384 }
1385
1386 Z2.append(0, idx2, min);
1387
1388 for (t_index j=1; j<N-1; ++j) {
1389 prev_node = idx2;
1390 active_nodes.remove(prev_node);
1391
1392 idx2 = active_nodes.succ[0];
1393 min = d[idx2];
1394
1395 for (i=idx2; i<N; i=active_nodes.succ[i]) {
1396 t_float tmp = dist(i, prev_node);
1397#if HAVE_DIAGNOSTIC
1398#pragma GCC diagnostic push
1399#pragma GCC diagnostic ignored "-Wfloat-equal"
1400#endif
1401 if (d[i] > tmp)
1402 d[i] = tmp;
1403 else if (fc_isnan(tmp))
1404 assert(false && "fastcluster: nan error");
1405#if HAVE_DIAGNOSTIC
1406#pragma GCC diagnostic pop
1407#endif
1408 if (d[i] < min) {
1409 min = d[i];
1410 idx2 = i;
1411 }
1412 }
1413 Z2.append(prev_node, idx2, min);
1414 }
1415}
1416
1417template <method_codes_vector method, typename t_dissimilarity>
1418static void generic_linkage_vector(const t_index N,
1419 t_dissimilarity & dist,
1420 cluster_result & Z2) {
1421 /*
1422 N: integer, number of data points
1423 dist: function pointer to the metric
1424 Z2: output data structure
1425
1426 This algorithm is valid for the distance update methods
1427 "Ward", "centroid" and "median" only!
1428 */
1429 const t_index N_1 = N-1;
1430 t_index i, j; // loop variables
1431 t_index idx1, idx2; // row and column indices
1432
1433 auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
1434 auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
1435 auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
1436 // i-th row represents
1437 doubly_linked_list active_nodes(N);
1438 binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
1439 // the distance to the nearest neighbor of each point
1440 t_index node1, node2; // node numbers in the output
1441 t_float min; // minimum and row index for nearest-neighbor search
1442
1443 for (i=0; i<N; ++i)
1444 // Build a list of row ↔ node label assignments.
1445 // Initially i ↦ i
1446 row_repr[i] = i;
1447
1448 // Initialize the minimal distances:
1449 // Find the nearest neighbor of each point.
1450 // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1451 for (i=0; i<N_1; ++i) {
1452 min = std::numeric_limits<t_float>::infinity();
1453 t_index idx;
1454 for (idx=j=i+1; j<N; ++j) {
1455 t_float tmp;
1456 switch (method) {
1457 case METHOD_VECTOR_WARD:
1458 tmp = dist.ward_initial(i,j);
1459 break;
1460 default:
1461 tmp = dist.template sqeuclidean<true>(i,j);
1462 }
1463 if (tmp<min) {
1464 min = tmp;
1465 idx = j;
1466 }
1467 }
1468 switch (method) {
1469 case METHOD_VECTOR_WARD:
1470 mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1471 break;
1472 default:
1473 mindist[i] = min;
1474 }
1475 n_nghbr[i] = idx;
1476 }
1477
1478 // Put the minimal distances into a heap structure to make the repeated
1479 // global minimum searches fast.
1480 nn_distances.heapify();
1481
1482 // Main loop: We have N-1 merging steps.
1483 for (i=0; i<N_1; ++i) {
1484 idx1 = nn_distances.argmin();
1485
1486 while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1487 // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1488 n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1489 switch (method) {
1490 case METHOD_VECTOR_WARD:
1491 min = dist.ward(idx1,j);
1492 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1493 t_float const tmp = dist.ward(idx1,j);
1494 if (tmp<min) {
1495 min = tmp;
1496 n_nghbr[idx1] = j;
1497 }
1498 }
1499 break;
1500 default:
1501 min = dist.template sqeuclidean<true>(idx1,j);
1502 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1503 t_float const tmp = dist.template sqeuclidean<true>(idx1,j);
1504 if (tmp<min) {
1505 min = tmp;
1506 n_nghbr[idx1] = j;
1507 }
1508 }
1509 }
1510 /* Update the heap with the new true minimum and search for the (possibly
1511 different) minimal entry. */
1512 nn_distances.update_geq(idx1, min);
1513 idx1 = nn_distances.argmin();
1514 }
1515
1516 nn_distances.heap_pop(); // Remove the current minimum from the heap.
1517 idx2 = n_nghbr[idx1];
1518
1519 // Write the newly found minimal pair of nodes to the output array.
1520 node1 = row_repr[idx1];
1521 node2 = row_repr[idx2];
1522
1523 Z2.append(node1, node2, mindist[idx1]);
1524
1525 switch (method) {
1526 case METHOD_VECTOR_WARD:
1527 case METHOD_VECTOR_CENTROID:
1528 dist.merge_inplace(idx1, idx2);
1529 break;
1530 case METHOD_VECTOR_MEDIAN:
1531 dist.merge_inplace_weighted(idx1, idx2);
1532 break;
1533 default:
1534 assert(false && "fastcluster: invalid method");
1535 }
1536
1537 // Index idx2 now represents the new (merged) node with label N+i.
1538 row_repr[idx2] = N+i;
1539 // Remove idx1 from the list of active indices (active_nodes).
1540 active_nodes.remove(idx1); // TBD later!!!
1541
1542 // Update the distance matrix
1543 switch (method) {
1544 case METHOD_VECTOR_WARD:
1545 /*
1546 Ward linkage.
1547
1548 Shorter and longer distances can occur, not smaller than min(d1,d2)
1549 but maybe bigger than max(d1,d2).
1550 */
1551 // Update the distance matrix in the range [start, idx1).
1552 for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1553 if (n_nghbr[j] == idx2) {
1554 n_nghbr[j] = idx1; // invalidate
1555 }
1556 }
1557 // Update the distance matrix in the range (idx1, idx2).
1558 for ( ; j<idx2; j=active_nodes.succ[j]) {
1559 t_float const tmp = dist.ward(j, idx2);
1560 if (tmp < mindist[j]) {
1561 nn_distances.update_leq(j, tmp);
1562 n_nghbr[j] = idx2;
1563 }
1564 else if (n_nghbr[j]==idx2) {
1565 n_nghbr[j] = idx1; // invalidate
1566 }
1567 }
1568 // Find the nearest neighbor for idx2.
1569 if (idx2<N_1) {
1570 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1571 min = dist.ward(idx2,j);
1572 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1573 t_float const tmp = dist.ward(idx2,j);
1574 if (tmp < min) {
1575 min = tmp;
1576 n_nghbr[idx2] = j;
1577 }
1578 }
1579 nn_distances.update(idx2, min);
1580 }
1581 break;
1582
1583 default:
1584 /*
1585 Centroid and median linkage.
1586
1587 Shorter and longer distances can occur, not bigger than max(d1,d2)
1588 but maybe smaller than min(d1,d2).
1589 */
1590 for (j=active_nodes.start; j<idx2; j=active_nodes.succ[j]) {
1591 t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
1592 if (tmp < mindist[j]) {
1593 nn_distances.update_leq(j, tmp);
1594 n_nghbr[j] = idx2;
1595 }
1596 else if (n_nghbr[j] == idx2)
1597 n_nghbr[j] = idx1; // invalidate
1598 }
1599 // Find the nearest neighbor for idx2.
1600 if (idx2<N_1) {
1601 n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1602 min = dist.template sqeuclidean<true>(idx2,j);
1603 for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1604 t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
1605 if (tmp < min) {
1606 min = tmp;
1607 n_nghbr[idx2] = j;
1608 }
1609 }
1610 nn_distances.update(idx2, min);
1611 }
1612 }
1613 }
1614}
1615
1616template <method_codes_vector method, typename t_dissimilarity>
1617static void generic_linkage_vector_alternative(const t_index N,
1618 t_dissimilarity & dist,
1619 cluster_result & Z2) {
1620 /*
1621 N: integer, number of data points
1622 dist: function pointer to the metric
1623 Z2: output data structure
1624
1625 This algorithm is valid for the distance update methods
1626 "Ward", "centroid" and "median" only!
1627 */
1628 const t_index N_1 = N-1;
1629 t_index i, j=0; // loop variables
1630 t_index idx1, idx2; // row and column indices
1631
1632 auto_array_ptr<t_index> n_nghbr(2*N-2); // array of nearest neighbors
1633 auto_array_ptr<t_float> mindist(2*N-2); // distances to the nearest neighbors
1634
1635 doubly_linked_list active_nodes(N+N_1);
1636 binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap
1637 // structure for the distance to the nearest neighbor of each point
1638
1639 t_float min; // minimum for nearest-neighbor searches
1640
1641 // Initialize the minimal distances:
1642 // Find the nearest neighbor of each point.
1643 // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1644 for (i=1; i<N; ++i) {
1645 min = std::numeric_limits<t_float>::infinity();
1646 t_index idx;
1647 for (idx=j=0; j<i; ++j) {
1648 t_float tmp;
1649 switch (method) {
1650 case METHOD_VECTOR_WARD:
1651 tmp = dist.ward_initial(i,j);
1652 break;
1653 default:
1654 tmp = dist.template sqeuclidean<true>(i,j);
1655 }
1656 if (tmp<min) {
1657 min = tmp;
1658 idx = j;
1659 }
1660 }
1661 switch (method) {
1662 case METHOD_VECTOR_WARD:
1663 mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1664 break;
1665 default:
1666 mindist[i] = min;
1667 }
1668 n_nghbr[i] = idx;
1669 }
1670
1671 // Put the minimal distances into a heap structure to make the repeated
1672 // global minimum searches fast.
1673 nn_distances.heapify();
1674
1675 // Main loop: We have N-1 merging steps.
1676 for (i=N; i<N+N_1; ++i) {
1677 /*
1678 The bookkeeping is different from the "stored matrix approach" algorithm
1679 generic_linkage.
1680
1681 mindist[i] stores a lower bound on the minimum distance of the point i to
1682 all points of *lower* index:
1683
1684 mindist[i] ≥ min_{j<i} D(i,j)
1685
1686 Moreover, new nodes do not re-use one of the old indices, but they are
1687 given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
1688 new nodes are N,…,2N−2).
1689
1690 Invalid nearest neighbors are not recognized by the fact that the stored
1691 distance is smaller than the actual distance, but the list active_nodes
1692 maintains a flag whether a node is inactive. If n_nghbr[i] points to an
1693 active node, the entries nn_distances[i] and n_nghbr[i] are valid,
1694 otherwise they must be recomputed.
1695 */
1696 idx1 = nn_distances.argmin();
1697 while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1698 // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1699 n_nghbr[idx1] = j = active_nodes.start;
1700 switch (method) {
1701 case METHOD_VECTOR_WARD:
1702 min = dist.ward_extended(idx1,j);
1703 for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1704 t_float tmp = dist.ward_extended(idx1,j);
1705 if (tmp<min) {
1706 min = tmp;
1707 n_nghbr[idx1] = j;
1708 }
1709 }
1710 break;
1711 default:
1712 min = dist.sqeuclidean_extended(idx1,j);
1713 for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1714 t_float const tmp = dist.sqeuclidean_extended(idx1,j);
1715 if (tmp<min) {
1716 min = tmp;
1717 n_nghbr[idx1] = j;
1718 }
1719 }
1720 }
1721 /* Update the heap with the new true minimum and search for the (possibly
1722 different) minimal entry. */
1723 nn_distances.update_geq(idx1, min);
1724 idx1 = nn_distances.argmin();
1725 }
1726
1727 idx2 = n_nghbr[idx1];
1728 active_nodes.remove(idx1);
1729 active_nodes.remove(idx2);
1730
1731 Z2.append(idx1, idx2, mindist[idx1]);
1732
1733 if (i<2*N_1) {
1734 switch (method) {
1735 case METHOD_VECTOR_WARD:
1736 case METHOD_VECTOR_CENTROID:
1737 dist.merge(idx1, idx2, i);
1738 break;
1739
1740 case METHOD_VECTOR_MEDIAN:
1741 dist.merge_weighted(idx1, idx2, i);
1742 break;
1743
1744 default:
1745 assert(false && "fastcluster: invalid method");
1746 }
1747
1748 n_nghbr[i] = active_nodes.start;
1749 if (method==METHOD_VECTOR_WARD) {
1750 /*
1751 Ward linkage.
1752
1753 Shorter and longer distances can occur, not smaller than min(d1,d2)
1754 but maybe bigger than max(d1,d2).
1755 */
1756 min = dist.ward_extended(active_nodes.start, i);
1757 for (j=active_nodes.succ[active_nodes.start]; j<i;
1758 j=active_nodes.succ[j]) {
1759 t_float tmp = dist.ward_extended(j, i);
1760 if (tmp < min) {
1761 min = tmp;
1762 n_nghbr[i] = j;
1763 }
1764 }
1765 }
1766 else {
1767 /*
1768 Centroid and median linkage.
1769
1770 Shorter and longer distances can occur, not bigger than max(d1,d2)
1771 but maybe smaller than min(d1,d2).
1772 */
1773 min = dist.sqeuclidean_extended(active_nodes.start, i);
1774 for (j=active_nodes.succ[active_nodes.start]; j<i;
1775 j=active_nodes.succ[j]) {
1776 t_float tmp = dist.sqeuclidean_extended(j, i);
1777 if (tmp < min) {
1778 min = tmp;
1779 n_nghbr[i] = j;
1780 }
1781 }
1782 }
1783 if (idx2<active_nodes.start) {
1784 nn_distances.remove(active_nodes.start);
1785 } else {
1786 nn_distances.remove(idx2);
1787 }
1788 nn_distances.replace(idx1, i, min);
1789 }
1790 }
1791}
1792
1793#if HAVE_VISIBILITY
1794#pragma GCC visibility pop
1795#endif
cJSON * p
Definition cJSON.cpp:2559
newitem type
Definition cJSON.cpp:2739
cJSON * a
Definition cJSON.cpp:2560
const cJSON *const b
Definition cJSON.h:255
int index
Definition cJSON.h:170
#define NULL
Definition extapi.c:2
bool fc_isnan(double x)