EMAN2
Public Member Functions | Static Public Member Functions | Private Member Functions | Private Attributes | List of all members
EMAN::BoxSVDClassifier Class Reference

#include <boxingtools.h>

Public Member Functions

 BoxSVDClassifier (const vector< vector< float > > &data, const unsigned int &classes=4)
 
 ~BoxSVDClassifier ()
 
map< unsigned int, unsigned int > go ()
 

Static Public Member Functions

static map< unsigned int, unsigned int > colorMappingByClassSize (const map< unsigned int, unsigned int > &grouping)
 

Private Member Functions

map< unsigned int, unsigned int > randomSeedCluster (const gsl_matrix *const svd_coords, unsigned int matrix_dims)
 
map< unsigned int, unsigned int > getIterativeCluster (const gsl_matrix *const svd_coords, const map< unsigned int, unsigned int > &current_grouping)
 
bool setDims (const vector< vector< float > > &data)
 
vector< vector< float > > getDistances (const gsl_matrix *const svd_coords, const gsl_matrix *const ref_coords)
 
map< unsigned int, unsigned int > getMapping (const vector< vector< float > > &distances)
 

Private Attributes

const vector< vector< float > > & mData
 
unsigned int mColumns
 
unsigned int mRows
 
unsigned int mClasses
 

Detailed Description

Definition at line 116 of file boxingtools.h.

Constructor & Destructor Documentation

◆ BoxSVDClassifier()

BoxSVDClassifier::BoxSVDClassifier ( const vector< vector< float > > &  data,
const unsigned int &  classes = 4 
)

Definition at line 98 of file boxingtools.cpp.

98 :
99 mData(data), mClasses(classes)
100{
101 setDims( mData );
102}
const vector< vector< float > > & mData
Definition: boxingtools.h:130
unsigned int mClasses
Definition: boxingtools.h:135
bool setDims(const vector< vector< float > > &data)

References mData, and setDims().

◆ ~BoxSVDClassifier()

BoxSVDClassifier::~BoxSVDClassifier ( )

Definition at line 105 of file boxingtools.cpp.

106{
107
108}

Member Function Documentation

◆ colorMappingByClassSize()

map< unsigned int, unsigned int > BoxSVDClassifier::colorMappingByClassSize ( const map< unsigned int, unsigned int > &  grouping)
static

Definition at line 412 of file boxingtools.cpp.

413{
414
415 vector<unsigned int> current_mappings;
416 // Get the extent of the current mappings
417 for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it )
418 {
419 if ( find( current_mappings.begin(), current_mappings.end(), it->second ) == current_mappings.end() )
420 {
421 current_mappings.push_back( it->second );
422 }
423 }
424
425 if ( current_mappings.size() < 2 )
426 {
427 cerr << "Error, cannot call colMappingByClassSize when less than 2 classes have been specified, I think you created " << current_mappings.size() << " classes " << endl;
428 throw;
429 }
430
431 // Record how many data points are in each class.
432 map<unsigned int, unsigned int> mappings_tally;
433 for( vector<unsigned int>::const_iterator it = current_mappings.begin(); it != current_mappings.end(); ++it )
434 {
435 // First initialize each total to zero
436 mappings_tally[*it] = 0;
437 }
438
439 // Now do the actual counting
440 for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it )
441 {
442 mappings_tally[it->second] += 1;
443 }
444
445 // find the largest tally
446 unsigned int current_mapping_idx = 0;
447 map< unsigned int, unsigned int> return_map;
448 while ( mappings_tally.size() > 0 )
449 {
450#if SVD_CLASSIFIER_DEBUG
451 cout << "Printing mappings_tally" << endl;
452 print_map(mappings_tally);
453#endif
454
455 map< unsigned int, unsigned int>::iterator it = mappings_tally.begin();
456 map< unsigned int, unsigned int>::iterator track_it = mappings_tally.begin();
457 unsigned int current_max = it->second;
458 unsigned int current_idx = it->first;
459 ++it;
460 for (; it != mappings_tally.end(); ++it )
461 {
462 if ( it->second > current_max )
463 {
464 current_max = it->second;
465 current_idx = it->first;
466 track_it = it;
467 }
468 }
469
470#if SVD_CLASSIFIER_DEBUG
471 cout << "The mapping is " << current_idx << " to " << current_mapping_idx << endl;
472#endif
473 for (map< unsigned int, unsigned int>::const_iterator group_it = grouping.begin(); group_it != grouping.end(); ++group_it )
474 {
475 if ( group_it->second == current_idx )
476 {
477 return_map[group_it->first] = current_mapping_idx;
478 }
479 }
480
481 mappings_tally.erase( current_idx );
482
483 current_mapping_idx++;
484 }
485
486
487#if SVD_CLASSIFIER_DEBUG
488 cout << "Printing adjusted classification map" << endl;
489 print_map(return_map);
490#endif
491
492
493 return return_map;
494}

Referenced by EMAN::BoxingTools::classify().

◆ getDistances()

vector< vector< float > > BoxSVDClassifier::getDistances ( const gsl_matrix *const  svd_coords,
const gsl_matrix *const  ref_coords 
)
private

Definition at line 362 of file boxingtools.cpp.

363{
364 // accrue the distance data - this could be done more concisely, but there shouldn't be much cost
365 // because the data should be fairl small. By more concisely I mean, the distance data would not need
366 // to be stored, it could be determined without storing it in distances.
367 vector<vector<float> > distances;
368 for (unsigned int i = 0; i < mColumns; ++i )
369 {
370 vector<float> ith_distances;
371 for( unsigned int random_seed_idx = 0; random_seed_idx < mClasses; ++random_seed_idx )
372 {
373 float distance = 0;
374 for (unsigned int j = 0; j < mColumns; ++j )
375 {
376 float value = (float)( (gsl_matrix_get( ref_coords, random_seed_idx, j) - gsl_matrix_get( svd_coords, i, j)) );
377 distance += value * value;
378 }
379 ith_distances.push_back(distance);
380 }
381 distances.push_back(ith_distances);
382 }
383
384 return distances;
385}
unsigned int mColumns
Definition: boxingtools.h:132

References mClasses, and mColumns.

Referenced by getIterativeCluster(), and randomSeedCluster().

◆ getIterativeCluster()

map< unsigned int, unsigned int > BoxSVDClassifier::getIterativeCluster ( const gsl_matrix *const  svd_coords,
const map< unsigned int, unsigned int > &  current_grouping 
)
private

Definition at line 239 of file boxingtools.cpp.

240{
241 // Space to store the reference vectors
242 gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns );
243
244 // Assumes there are a total of mClasses in the current_groupings mapping
245 for(unsigned int i = 0; i < mClasses; ++i)
246 {
247 unsigned int tally = 0;
248 for (map< unsigned int, unsigned int>::const_iterator it = current_grouping.begin(); it != current_grouping.end(); ++it )
249 {
250 if ( it->second == i )
251 {
252 for( unsigned int j = 0; j < mColumns; ++j )
253 {
254 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, it->first, j ) + gsl_matrix_get( ref_coords, i, j));
255 }
256 ++tally;
257 }
258
259 }
260 // then normalize the the addition
261 if (tally != 0)
262 for( unsigned int j = 0; j < mColumns; ++j )
263 {
264 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( ref_coords, i, j )/((float) tally));
265 }
266 }
267
268 vector<vector<float> > distances = getDistances(svd_coords, ref_coords);
269
270#if SVD_CLASSIFIER_DEBUG
271 cout << "The distance matrix is " << endl;
272 for( unsigned int i = 0; i < distances.size(); ++i )
273 {
274 for( unsigned int j = 0; j < distances[i].size(); ++j )
275 {
276 cout << distances[i][j] << " ";
277 }
278 cout << endl;
279 }
280#endif
281
282
283 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
284 // and use that as the basis of the grouping
285 map< unsigned int, unsigned int> return_map = getMapping(distances);
286
287#if SVD_CLASSIFIER_DEBUG
288 cout << "Printing classification map" << endl;
289 print_map(return_map);
290#endif
291
292 gsl_matrix_free(ref_coords);
293
294 return return_map;
295}
map< unsigned int, unsigned int > getMapping(const vector< vector< float > > &distances)
vector< vector< float > > getDistances(const gsl_matrix *const svd_coords, const gsl_matrix *const ref_coords)

References getDistances(), getMapping(), mClasses, and mColumns.

Referenced by go().

◆ getMapping()

map< unsigned int, unsigned int > BoxSVDClassifier::getMapping ( const vector< vector< float > > &  distances)
private

Definition at line 387 of file boxingtools.cpp.

388{
389 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
390 // and use that as the basis of the grouping
391 map< unsigned int, unsigned int> return_map;
392 unsigned int vector_idx = 0;
393 for( vector<vector<float> >::const_iterator it = distances.begin(); it != distances.end(); ++it, ++vector_idx )
394 {
395 vector<float>::const_iterator mIt = it->begin();
396 float min = *mIt;
397 unsigned int min_idx = 0;
398 for ( unsigned int current_idx = 0; mIt != it->end(); ++mIt, ++current_idx )
399 {
400 if ( *mIt < min )
401 {
402 min = *mIt;
403 min_idx = current_idx;
404 }
405 }
406 return_map[vector_idx] = min_idx;
407 }
408
409 return return_map;
410}

Referenced by getIterativeCluster(), and randomSeedCluster().

◆ go()

map< unsigned int, unsigned int > BoxSVDClassifier::go ( )

Definition at line 130 of file boxingtools.cpp.

131{
132 // This is done in the constructor
133 // setDims(mData);
134
135
136 unsigned int local_columns = mColumns;
137 if ( mRows < mColumns )
138 {
139// cerr << "Warning: gsl SVD works only when m > n, you have m = " << mRows << " and n = " << mColumns << endl;
140 // This local adaptation means things will proceed the same way even if there are more columns in A then rows
141 // Every input data is still classified, just the SVD eigenvectors are found using a subset of all the data
142 local_columns = mRows;
143 }
144
145 gsl_matrix * U = gsl_matrix_calloc( mRows, local_columns );
146 gsl_matrix * A = gsl_matrix_calloc( mRows, mColumns );
147 for ( unsigned int i = 0; i < mRows; ++i )
148 {
149 for ( unsigned int j = 0; j < mColumns; ++j )
150 {
151 gsl_matrix_set( A, i, j, mData[j][i] );
152 if ( j < local_columns )
153 gsl_matrix_set( U, i, j, mData[j][i] );
154 }
155 }
156#if SVD_CLASSIFIER_DEBUG
157 printMatrix( A, mRows, mColumns, "A" );
158#endif
159
160 gsl_matrix * V = gsl_matrix_calloc( local_columns, local_columns );
161 gsl_vector * S = gsl_vector_calloc( local_columns );
162 gsl_vector * work = gsl_vector_calloc( local_columns );
163
164 if ( gsl_linalg_SV_decomp (U, V, S, work) )
165 {
166 cerr << "ERROR: gsl returned a non zero value on application of the SVD" << endl;
167 }
168
169#if SVD_CLASSIFIER_DEBUG
170 printMatrix( U, mRows, local_columns, "U" );
171 printVector( S, local_columns, "S" );
172 printMatrix( V, local_columns, local_columns, "V");
173#endif
174
175 // normalize the columns of matrix A
176 for ( unsigned int j = 0; j < mColumns; ++j )
177 {
178 float norm = 0;
179 for ( unsigned int i = 0; i < mRows; ++i )
180 {
181 norm += (float)(gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j));
182 }
183 norm = sqrtf(norm);
184 for ( unsigned int i = 0; i < mRows; ++i )
185 {
186 gsl_matrix_set( A, i, j, gsl_matrix_get(A,i,j)/norm);
187 }
188 }
189
190#if SVD_CLASSIFIER_DEBUG
191 for ( unsigned int j = 0; j < mColumns; ++j )
192 {
193 double norm = 0;
194 for ( unsigned int i = 0; i < mRows; ++i )
195 {
196 norm += gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j);
197 }
198 cout << "For column " << j << " the squared norm is " << norm << endl;
199 }
200#endif
201
202
203 gsl_matrix * svd_coords = gsl_matrix_calloc( mColumns, mColumns );
204 // Correlate the columns of A with the columns of U and store the information in a martrix called svd_coords
205 for ( unsigned int i = 0; i < mColumns; ++i )
206 {
207 for ( unsigned int j = 0; j < local_columns; ++j )
208 {
209 double result = 0.0;
210 for ( unsigned int k = 0; k < mRows; ++k )
211 {
212 result += gsl_matrix_get(A,k,i)*gsl_matrix_get(U,k,j);
213 }
214 gsl_matrix_set( svd_coords, i, j, result);
215 }
216 }
217
218#if SVD_CLASSIFIER_DEBUG
219 printMatrix( svd_coords, mColumns, mColumns, "svd_coords" );
220#endif
221
222 map< unsigned int, unsigned int> grouping = randomSeedCluster(svd_coords, mColumns);
223
224 for ( unsigned int i = 0; i < 20; ++ i )
225 {
226 grouping = getIterativeCluster(svd_coords, grouping);
227 }
228
229 gsl_matrix_free(A);
230 gsl_matrix_free(U);
231 gsl_matrix_free(V);
232 gsl_vector_free(S);
233 gsl_vector_free(work);
234 gsl_matrix_free(svd_coords);
235
236 return grouping;
237}
#define V(i, j)
Definition: analyzer.cpp:697
map< unsigned int, unsigned int > getIterativeCluster(const gsl_matrix *const svd_coords, const map< unsigned int, unsigned int > &current_grouping)
map< unsigned int, unsigned int > randomSeedCluster(const gsl_matrix *const svd_coords, unsigned int matrix_dims)

References getIterativeCluster(), mColumns, mData, mRows, randomSeedCluster(), and V.

Referenced by EMAN::BoxingTools::classify().

◆ randomSeedCluster()

map< unsigned int, unsigned int > BoxSVDClassifier::randomSeedCluster ( const gsl_matrix *const  svd_coords,
unsigned int  matrix_dims 
)
private

Definition at line 298 of file boxingtools.cpp.

299{
300 // Seed the random number generator
301 srand(static_cast<unsigned int>(time(0)));
302
303 vector<unsigned int> random_seed_indices;
304 while ( random_seed_indices.size() < mClasses )
305 {
306 unsigned int random_idx = static_cast<int>(((float)rand()/RAND_MAX)*matrix_dims);
307 if ( find( random_seed_indices.begin(), random_seed_indices.end(), random_idx ) == random_seed_indices.end() )
308 {
309 random_seed_indices.push_back( random_idx );
310 }
311 }
312
313 // Space to store the reference vectors
314 gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns );
315
316 // Put the reference vectors into a matrix to make the approach transparent to the reader
317 for(unsigned int i = 0; i < random_seed_indices.size(); ++i)
318 {
319 for( unsigned int j = 0; j < matrix_dims; ++j )
320 {
321 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, random_seed_indices[i], j ));
322 }
323 }
324
325#if SVD_CLASSIFIER_DEBUG
326 printMatrix( ref_coords, mClasses, matrix_dims, "Reference matrix in first grouping");
327#endif
328
329 // accrue the distance data - this could be done more concisely, but there shouldn't be much cost
330 // because the data should be fairl small. By more concisely I mean, the distance data would not need
331 // to be stored, it could be determined without storing it in distances.
332 vector<vector<float> > distances = getDistances(svd_coords, ref_coords);
333
334#if SVD_CLASSIFIER_DEBUG
335 cout << "The distance matrix is " << endl;
336 for( unsigned int i = 0; i < distances.size(); ++i )
337 {
338 for( unsigned int j = 0; j < distances[i].size(); ++j )
339 {
340 cout << distances[i][j] << " ";
341 }
342 cout << endl;
343 }
344#endif
345
346
347 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
348 // and use that as the basis of the grouping
349 map< unsigned int, unsigned int> return_map = getMapping(distances);
350
351#if SVD_CLASSIFIER_DEBUG
352 cout << "Printing classification map, randomly seeded" << endl;
353 print_map(return_map);
354#endif
355
356 gsl_matrix_free(ref_coords);
357
358 return return_map;
359}

References getDistances(), getMapping(), mClasses, and mColumns.

Referenced by go().

◆ setDims()

bool BoxSVDClassifier::setDims ( const vector< vector< float > > &  data)
private

Definition at line 110 of file boxingtools.cpp.

111{
112 mColumns = mData.size();
113 vector<vector<float> >::const_iterator it = data.begin();
114 mRows = it->size();
115 it++;
116 for( ; it != data.end(); ++it )
117 {
118 if ( it->size() != mRows )
119 {
120 cerr << "ERROR: can not initial the BoxSVDClassifier with vectors of un-equal lengths " << endl;
121 cerr << "The vector lengths that did not agree were " << mRows << " and " << it->size() << endl;
122 return false;
123 }
124 }
125
126 return true;
127}

References mColumns, mData, and mRows.

Referenced by BoxSVDClassifier().

Member Data Documentation

◆ mClasses

unsigned int EMAN::BoxSVDClassifier::mClasses
private

Definition at line 135 of file boxingtools.h.

Referenced by getDistances(), getIterativeCluster(), and randomSeedCluster().

◆ mColumns

unsigned int EMAN::BoxSVDClassifier::mColumns
private

Definition at line 132 of file boxingtools.h.

Referenced by getDistances(), getIterativeCluster(), go(), randomSeedCluster(), and setDims().

◆ mData

const vector<vector<float> >& EMAN::BoxSVDClassifier::mData
private

Definition at line 130 of file boxingtools.h.

Referenced by BoxSVDClassifier(), go(), and setDims().

◆ mRows

unsigned int EMAN::BoxSVDClassifier::mRows
private

Definition at line 133 of file boxingtools.h.

Referenced by go(), and setDims().


The documentation for this class was generated from the following files: