EMAN2
Public Member Functions | Static Public Member Functions | Private Member Functions | Private Attributes
EMAN::BoxSVDClassifier Class Reference

#include <boxingtools.h>

Collaboration diagram for EMAN::BoxSVDClassifier:
Collaboration graph
[legend]

List of all members.

Public Member Functions

 BoxSVDClassifier (const vector< vector< float > > &data, const unsigned int &classes=4)
 ~BoxSVDClassifier ()
map< unsigned int, unsigned int > go ()

Static Public Member Functions

static map< unsigned int,
unsigned int > 
colorMappingByClassSize (const map< unsigned int, unsigned int > &grouping)

Private Member Functions

map< unsigned int, unsigned int > randomSeedCluster (const gsl_matrix *const svd_coords, unsigned int matrix_dims)
map< unsigned int, unsigned int > getIterativeCluster (const gsl_matrix *const svd_coords, const map< unsigned int, unsigned int > &current_grouping)
bool setDims (const vector< vector< float > > &data)
vector< vector< float > > getDistances (const gsl_matrix *const svd_coords, const gsl_matrix *const ref_coords)
map< unsigned int, unsigned int > getMapping (const vector< vector< float > > &distances)

Private Attributes

const vector< vector< float > > & mData
unsigned int mColumns
unsigned int mRows
unsigned int mClasses

Detailed Description

Definition at line 120 of file boxingtools.h.


Constructor & Destructor Documentation

BoxSVDClassifier::BoxSVDClassifier ( const vector< vector< float > > &  data,
const unsigned int &  classes = 4 
)

Definition at line 102 of file boxingtools.cpp.

References mData, and setDims().

                                                                                                  :
                mData(data), mClasses(classes)
{
        setDims( mData );
}
BoxSVDClassifier::~BoxSVDClassifier ( )

Definition at line 109 of file boxingtools.cpp.

{

}

Member Function Documentation

map< unsigned int, unsigned int > BoxSVDClassifier::colorMappingByClassSize ( const map< unsigned int, unsigned int > &  grouping) [static]

Definition at line 416 of file boxingtools.cpp.

Referenced by EMAN::BoxingTools::classify().

{

        vector<unsigned int> current_mappings;
        // Get the extent of the current mappings
        for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it )
        {
                if ( find( current_mappings.begin(), current_mappings.end(), it->second ) == current_mappings.end() )
                {
                        current_mappings.push_back( it->second );
                }
        }

        if ( current_mappings.size() < 2 )
        {
                cerr << "Error, cannot call colMappingByClassSize when less than 2 classes have been specified, I think you created " << current_mappings.size() << " classes " << endl;
                throw;
        }

        // Record how many data points are in each class.
        map<unsigned int, unsigned int> mappings_tally;
        for( vector<unsigned int>::const_iterator it = current_mappings.begin(); it != current_mappings.end(); ++it )
        {
                // First initialize each total to zero
                mappings_tally[*it] = 0;
        }

        // Now do the actual counting
        for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it )
        {
                mappings_tally[it->second] += 1;
        }

        // find the largest tally
        unsigned int current_mapping_idx = 0;
        map< unsigned int, unsigned int> return_map;
        while ( mappings_tally.size() > 0 )
        {
#if SVD_CLASSIFIER_DEBUG
                cout << "Printing mappings_tally" << endl;
                print_map(mappings_tally);
#endif

                map< unsigned int, unsigned int>::iterator it = mappings_tally.begin();
                map< unsigned int, unsigned int>::iterator track_it = mappings_tally.begin();
                unsigned int current_max = it->second;
                unsigned int current_idx = it->first;
                ++it;
                for (; it != mappings_tally.end(); ++it )
                {
                        if ( it->second > current_max )
                        {
                                current_max = it->second;
                                current_idx = it->first;
                                track_it = it;
                        }
                }

#if SVD_CLASSIFIER_DEBUG
                cout << "The mapping is " << current_idx << " to " << current_mapping_idx << endl;
#endif
                for (map< unsigned int, unsigned int>::const_iterator group_it = grouping.begin(); group_it != grouping.end(); ++group_it )
                {
                        if ( group_it->second == current_idx )
                        {
                                return_map[group_it->first] = current_mapping_idx;
                        }
                }

                mappings_tally.erase( current_idx );

                current_mapping_idx++;
        }


#if SVD_CLASSIFIER_DEBUG
        cout << "Printing adjusted classification map" << endl;
        print_map(return_map);
#endif


        return return_map;
}
vector< vector< float > > BoxSVDClassifier::getDistances ( const gsl_matrix *const  svd_coords,
const gsl_matrix *const  ref_coords 
) [private]

Definition at line 366 of file boxingtools.cpp.

References mClasses, and mColumns.

Referenced by getIterativeCluster(), and randomSeedCluster().

{
        // accrue the distance data - this could be done more concisely, but there shouldn't be much cost
        // because the data should be fairl small. By more concisely I mean, the distance data would not need
        // to be stored, it could be determined without storing it in distances.
        vector<vector<float> > distances;
        for (unsigned int i = 0; i < mColumns; ++i )
        {
                vector<float> ith_distances;
                for( unsigned int random_seed_idx = 0; random_seed_idx < mClasses; ++random_seed_idx )
                {
                        float distance = 0;
                        for (unsigned int j = 0; j < mColumns; ++j )
                        {
                                float value = (float)( (gsl_matrix_get( ref_coords, random_seed_idx, j) - gsl_matrix_get( svd_coords, i, j)) );
                                distance += value * value;
                        }
                        ith_distances.push_back(distance);
                }
                distances.push_back(ith_distances);
        }

        return distances;
}
map< unsigned int, unsigned int > BoxSVDClassifier::getIterativeCluster ( const gsl_matrix *const  svd_coords,
const map< unsigned int, unsigned int > &  current_grouping 
) [private]

Definition at line 243 of file boxingtools.cpp.

References getDistances(), getMapping(), mClasses, and mColumns.

Referenced by go().

{
        // Space to store the reference vectors
        gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns );

        // Assumes there are a total of mClasses in the current_groupings mapping
        for(unsigned int i = 0; i < mClasses; ++i)
        {
                unsigned int tally = 0;
                for (map< unsigned int, unsigned int>::const_iterator it = current_grouping.begin(); it != current_grouping.end(); ++it )
                {
                        if ( it->second == i )
                        {
                                for( unsigned int j = 0; j < mColumns; ++j )
                                {
                                        gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, it->first, j ) + gsl_matrix_get( ref_coords, i, j));
                                }
                                ++tally;
                        }

                }
                // then normalize the the addition
                if (tally != 0)
                        for( unsigned int j = 0; j < mColumns; ++j )
                {
                        gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( ref_coords, i, j )/((float) tally));
                }
        }

        vector<vector<float> > distances = getDistances(svd_coords, ref_coords);

#if SVD_CLASSIFIER_DEBUG
        cout << "The distance matrix is " << endl;
        for( unsigned int i = 0; i < distances.size(); ++i )
        {
                for( unsigned int j = 0; j < distances[i].size(); ++j )
                {
                        cout << distances[i][j] << " ";
                }
                cout << endl;
        }
#endif


        // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
        // and use that as the basis of the grouping
        map< unsigned int, unsigned int> return_map = getMapping(distances);

#if SVD_CLASSIFIER_DEBUG
        cout << "Printing classification map" << endl;
        print_map(return_map);
#endif

        gsl_matrix_free(ref_coords);

        return return_map;
}
map< unsigned int, unsigned int > BoxSVDClassifier::getMapping ( const vector< vector< float > > &  distances) [private]

Definition at line 391 of file boxingtools.cpp.

References min.

Referenced by getIterativeCluster(), and randomSeedCluster().

{
        // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
        // and use that as the basis of the grouping
        map< unsigned int, unsigned int> return_map;
        unsigned int vector_idx = 0;
        for( vector<vector<float> >::const_iterator it = distances.begin(); it != distances.end(); ++it, ++vector_idx )
        {
                vector<float>::const_iterator mIt = it->begin();
                float min = *mIt;
                unsigned int min_idx = 0;
                for ( unsigned int current_idx = 0; mIt != it->end(); ++mIt, ++current_idx )
                {
                        if ( *mIt < min )
                        {
                                min = *mIt;
                                min_idx = current_idx;
                        }
                }
                return_map[vector_idx] = min_idx;
        }

        return return_map;
}
map< unsigned int, unsigned int > BoxSVDClassifier::go ( )

Definition at line 134 of file boxingtools.cpp.

References getIterativeCluster(), mColumns, mData, mRows, norm(), randomSeedCluster(), and V.

Referenced by EMAN::BoxingTools::classify().

{
        //      This is done in the constructor
        //      setDims(mData);


        unsigned int local_columns = mColumns;
        if ( mRows < mColumns )
        {
//              cerr << "Warning: gsl SVD works only when m > n, you have m = " << mRows << " and n = " << mColumns << endl;
                // This local adaptation means things will proceed the same way even if there are more columns in A then rows
                // Every input data is still classified, just the SVD eigenvectors are found using a subset of all the data
                local_columns = mRows;
        }

        gsl_matrix * U = gsl_matrix_calloc( mRows, local_columns );
        gsl_matrix * A = gsl_matrix_calloc( mRows, mColumns );
        for ( unsigned int i = 0; i < mRows; ++i )
        {
                for ( unsigned int j = 0; j < mColumns; ++j )
                {
                        gsl_matrix_set( A, i, j, mData[j][i] );
                        if ( j < local_columns )
                                gsl_matrix_set( U, i, j, mData[j][i] );
                }
        }
#if SVD_CLASSIFIER_DEBUG
        printMatrix( A, mRows, mColumns, "A" );
#endif

        gsl_matrix * V = gsl_matrix_calloc( local_columns, local_columns );
        gsl_vector * S = gsl_vector_calloc( local_columns );
        gsl_vector * work = gsl_vector_calloc( local_columns );

        if ( gsl_linalg_SV_decomp (U, V, S, work) )
        {
                cerr << "ERROR: gsl returned a non zero value on application of the SVD" << endl;
        }

#if SVD_CLASSIFIER_DEBUG
        printMatrix( U, mRows, local_columns, "U" );
        printVector( S, local_columns, "S" );
        printMatrix( V, local_columns, local_columns, "V");
#endif

        // normalize the columns of matrix A
        for ( unsigned int j = 0; j < mColumns; ++j )
        {
                float norm = 0;
                for ( unsigned int i = 0; i < mRows; ++i )
                {
                        norm += (float)(gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j));
                }
                norm = sqrtf(norm);
                for ( unsigned int i = 0; i < mRows; ++i )
                {
                        gsl_matrix_set( A, i, j, gsl_matrix_get(A,i,j)/norm);
                }
        }

#if SVD_CLASSIFIER_DEBUG
        for ( unsigned int j = 0; j < mColumns; ++j )
        {
                double norm = 0;
                for ( unsigned int i = 0; i < mRows; ++i )
                {
                        norm += gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j);
                }
                cout << "For column " << j << " the squared norm is " << norm << endl;
        }
#endif


        gsl_matrix * svd_coords = gsl_matrix_calloc( mColumns, mColumns );
        // Correlate the columns of A with the columns of U and store the information in a martrix called svd_coords
        for ( unsigned int i = 0; i < mColumns; ++i )
        {
                for ( unsigned int j = 0; j < local_columns; ++j )
                {
                        double result = 0.0;
                        for ( unsigned int k = 0; k < mRows; ++k )
                        {
                                result += gsl_matrix_get(A,k,i)*gsl_matrix_get(U,k,j);
                        }
                        gsl_matrix_set( svd_coords, i, j, result);
                }
        }

#if SVD_CLASSIFIER_DEBUG
        printMatrix( svd_coords, mColumns, mColumns, "svd_coords" );
#endif

        map< unsigned int, unsigned int> grouping = randomSeedCluster(svd_coords, mColumns);

        for ( unsigned int i = 0; i < 20; ++ i )
        {
                grouping = getIterativeCluster(svd_coords, grouping);
        }

        gsl_matrix_free(A);
        gsl_matrix_free(U);
        gsl_matrix_free(V);
        gsl_vector_free(S);
        gsl_vector_free(work);
        gsl_matrix_free(svd_coords);

        return grouping;
}
map< unsigned int, unsigned int > BoxSVDClassifier::randomSeedCluster ( const gsl_matrix *const  svd_coords,
unsigned int  matrix_dims 
) [private]

Definition at line 302 of file boxingtools.cpp.

References getDistances(), getMapping(), mClasses, and mColumns.

Referenced by go().

{
        // Seed the random number generator
        srand(static_cast<unsigned int>(time(0)));

        vector<unsigned int> random_seed_indices;
        while ( random_seed_indices.size() < mClasses )
        {
                unsigned int random_idx = static_cast<int>(((float)rand()/RAND_MAX)*matrix_dims);
                if ( find( random_seed_indices.begin(), random_seed_indices.end(), random_idx ) == random_seed_indices.end() )
                {
                        random_seed_indices.push_back( random_idx );
                }
        }

        // Space to store the reference vectors
        gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns );

        // Put the reference vectors into a matrix to make the approach transparent to the reader
        for(unsigned int i = 0; i < random_seed_indices.size(); ++i)
        {
                for( unsigned int j = 0; j < matrix_dims; ++j )
                {
                        gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, random_seed_indices[i], j ));
                }
        }

#if SVD_CLASSIFIER_DEBUG
        printMatrix( ref_coords, mClasses, matrix_dims, "Reference matrix in first grouping");
#endif

        // accrue the distance data - this could be done more concisely, but there shouldn't be much cost
        // because the data should be fairl small. By more concisely I mean, the distance data would not need
        // to be stored, it could be determined without storing it in distances.
        vector<vector<float> > distances = getDistances(svd_coords, ref_coords);

#if SVD_CLASSIFIER_DEBUG
        cout << "The distance matrix is " << endl;
        for( unsigned int i = 0; i < distances.size(); ++i )
        {
                for( unsigned int j = 0; j < distances[i].size(); ++j )
                {
                        cout << distances[i][j] << " ";
                }
                cout << endl;
        }
#endif


        // Finally decide which of the randomly chosen vectors is closest to each of the input vectors
        // and use that as the basis of the grouping
        map< unsigned int, unsigned int> return_map = getMapping(distances);

#if SVD_CLASSIFIER_DEBUG
        cout << "Printing classification map, randomly seeded" << endl;
        print_map(return_map);
#endif

        gsl_matrix_free(ref_coords);

        return return_map;
}
bool BoxSVDClassifier::setDims ( const vector< vector< float > > &  data) [private]

Definition at line 114 of file boxingtools.cpp.

References data, mColumns, mData, and mRows.

Referenced by BoxSVDClassifier().

{
        mColumns = mData.size();
        vector<vector<float> >::const_iterator it = data.begin();
        mRows = it->size();
        it++;
        for( ; it != data.end(); ++it )
        {
                if ( it->size() != mRows )
                {
                        cerr << "ERROR: can not initial the BoxSVDClassifier with vectors of un-equal lengths " << endl;
                        cerr << "The vector lengths that did not agree were " <<  mRows << " and " << it->size() << endl;
                        return false;
                }
        }

        return true;
}

Member Data Documentation

unsigned int EMAN::BoxSVDClassifier::mClasses [private]

Definition at line 139 of file boxingtools.h.

Referenced by getDistances(), getIterativeCluster(), and randomSeedCluster().

unsigned int EMAN::BoxSVDClassifier::mColumns [private]

Definition at line 136 of file boxingtools.h.

Referenced by getDistances(), getIterativeCluster(), go(), randomSeedCluster(), and setDims().

const vector<vector<float> >& EMAN::BoxSVDClassifier::mData [private]

Definition at line 134 of file boxingtools.h.

Referenced by BoxSVDClassifier(), go(), and setDims().

unsigned int EMAN::BoxSVDClassifier::mRows [private]

Definition at line 137 of file boxingtools.h.

Referenced by go(), and setDims().


The documentation for this class was generated from the following files: