程序代码参考了 CSDN 上的某篇博客,具体名称已经记不清了。
公共类型别名(typedef)的头文件 common.h
// common.h
// Shared typedefs for the string / number containers used by the listings
// that include this header.
#ifndef COMM_H
#define COMM_H

// NOTE(review): the original listing had five #include directives whose header
// names were lost in extraction. The set below covers the standard names the
// surrounding listings use unqualified (string, vector, transform/find/copy,
// cout) — confirm against the original source.
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Kept at header scope because the listings that include this file refer to
// string, vector, transform, ... without the std:: prefix.
using namespace std;

typedef vector<string> StrVec;                // vector of strings
typedef vector<int> IntVec;                   // vector of integers
typedef vector<vector<int> > Int2DVec;        // two-dimensional vector of integers
typedef vector<vector<double> > Double2DVec;  // two-dimensional vector of floating-point numbers
typedef vector<double> DoubleVec;             // vector of floating-point numbers

#endif
去除停用词
#pragma once#include "common.h"// 用于移除停止词class StopWordsHandler{public: StopWordsHandler(void); ~StopWordsHandler(void); bool IsStopWord(string& str);private: StrVec stopwords;};#include "StopWordHandler.h"string StopWordList[] = { "的", "我们","要","自己","之","将","“","”",",","(",")","后","应","到","某","后","个","是","位","新","一","两","在","中","或","有","更","好",""};//停用词int strwordlen = sizeof(StopWordList) / sizeof(StopWordList[0]);StopWordsHandler::StopWordsHandler(){ for ( int i = 0 ; i < strwordlen ; i++) stopwords.push_back(StopWordList[i]);}StopWordsHandler::~StopWordsHandler(){}bool StopWordsHandler::IsStopWord(string& str){ transform(str.begin(),str.end(),str.begin(),tolower);//确保小写化 return find(stopwords.begin(),stopwords.end(),str)!=stopwords.end();}
分词选用了最简单的分词方法,预先用空格做好了分词
#pragma once#include "Common.h"class ITokeniser{public: virtual void Partition(string input,StrVec& retWords)=0;//分词算法};#pragma once#include "Itokenisher.h"class Tokeniser :public ITokeniser{public: Tokeniser(); ~Tokeniser(); void Partition(string input , StrVec& retWords);};#include "Tokeniser.h"#include "StopWordHandler.h"#includeTokeniser::Tokeniser(){}Tokeniser::~Tokeniser(){}void Tokeniser::Partition(string input ,StrVec& retWord){ transform(input.begin() , input.end(),input.begin(),tolower); string::iterator pos = input.begin(); StopWordsHandler stopHandler; do { string temp; pos = find(input.begin() , input.end(),' '); copy(input.begin() , pos ,back_inserter(temp)); if ( !stopHandler.IsStopWord(temp)) retWord.push_back(temp); if ( pos == input.end()) break; else input.erase(input.begin() ,++pos); }while ( pos != input.end());}
TF-IDF 的计算
#pragma once#include "Itokenisher.h"#include
计算余弦相似度(聚类时的距离取 1 − 余弦相似度)
#pragma once#include "common.h"class TermVector{public: static double ComputerCosineSimilarity(const DoubleVec& vector1 , const DoubleVec& vector2 ); static double innerProduct(const DoubleVec& v1 ,const DoubleVec& v2); static double VectorLength(const DoubleVec & v);};#include "TermVector.h"#includedouble TermVector::ComputerCosineSimilarity(const DoubleVec & v1 , const DoubleVec& v2){ if ( v1.size() != v2.size()) throw string("different length"); double denom = (VectorLength(v1) * VectorLength(v2)); if ( denom == 0 ) return 0 ; else return (innerProduct(v1 , v2) / denom);}double TermVector::innerProduct(const DoubleVec & v1 , const DoubleVec& v2){ if ( v1.size() != v2.size()) throw string ("different length"); double result = 0.0f; for ( int i = 0 ; i < v1.size() ; i++) result+=v1[i]*v2[i]; return result;}double TermVector::VectorLength(const DoubleVec & v){ double sum = 0.0f; for ( int i = 0 ; i < v.size() ; i++) sum= sum+(v[i] * v[i]); return (double)sqrt(sum);}
定义 Cluster(聚类簇)类
// Cluster: one cluster of the k-means run (header followed by its implementation).
//   CurrentMembership        - indices of the data points currently in this cluster.
//   Mean                     - the cluster centre.
//   Cluster(dataindex, data) - starts the cluster with the single member `dataindex`
//                              and copies `data` into Mean.
//   UpdateMean(coordinates)  - for every index in CurrentMembership, adds that row of
//                              `coordinates` into Mean element by element; the original
//                              inline comment says the goal is the per-column mean of
//                              the member rows.
// NOTE(review): the listing is cut off inside UpdateMean (it ends at "for (int k = 0 ; k"),
// so the rest of that function and the definition of ~Cluster() are not visible here.
// NOTE(review): in the visible part Mean is added to without being reset first —
// confirm against the full source whether it is cleared before accumulation.
#pragma once#include "common.h"class Cluster{public: IntVec CurrentMembership; //该类簇的数据成员索引 DoubleVec Mean ; //该簇类的聚类中心 Cluster(); ~Cluster(); Cluster(int dataindex , DoubleVec & data); void UpdateMean(Double2DVec & coordinates);};#include "cluster.h"Cluster::Cluster(){ }Cluster::Cluster(int dataindex , DoubleVec& data){ CurrentMembership.push_back(dataindex); copy(data.begin() , data.end() ,back_inserter(Mean));}void Cluster::UpdateMean(Double2DVec & coordinates){ //根据 mcurrentmembership取得原始资料点对象 //根据该子集的均值,corrdinate是一个m* n的矩阵,其实就是要求每列的均值 for (int i = 0 ; i< CurrentMembership.size();i++) { DoubleVec& coord = coordinates[CurrentMembership[i]]; for ( int j = 0 ; j < coord.size() ; j++) Mean[j]+=coord[j]; for (int k = 0 ; k
// KMeans: k-means clustering over rows of a Double2DVec (header followed by its implementation).
// Declared members:
//   _clusters            - public container of clusters; its elements are dereferenced with
//                          -> and deleted in the destructor.
//   _coordCount          - number of data points.
//   _coordinates         - the original data.
//   _k                   - number of clusters.
//   _clusterAssignments  - cluster index currently assigned to each point.
//   _nearestCluster      - cluster index found nearest to each point in the current pass.
//   _distanceCache       - _distanceCache[i][j] is the distance from point i to cluster j's Mean.
// Visible part of the iteration loop:
//   1. UpdateMean(_coordinates) is called on a cluster.
//   2. _distanceCache is filled with getDistance(point i, _clusters[j]->Mean) for every i, j.
//   3. _nearestCluster[i] is set from NearestCluster(i).
//   4. If every point's nearest cluster equals its current assignment, the loop breaks.
//   5. Otherwise every cluster's CurrentMembership is cleared and each point is pushed into
//      its nearest cluster, which also becomes its assignment.
// getDistance(coord, center) returns 1 - TermVector::ComputerCosineSimilarity(coord, center).
// NearestCluster(ndx) returns the column index of the smallest value in _distanceCache[ndx],
//   or -1 if no value is smaller than the initial maximum.
// ~KMeans() deletes every element of _clusters and then clears the container.
// NOTE(review): this listing is damaged — template arguments are missing (e.g. "vector_clusters",
// "numeric_limits ::max()"), two system header names are missing, and everything between the
// start of the constructor's first loop and the UpdateMean call (rest of the constructor,
// InitRandom, beginning of Start) is not visible here.
#pragma once#include "common.h"class Cluster;class KMeans{public: vector_clusters; KMeans(Double2DVec& data, int K); void Start(); ~KMeans();private: int _coordCount; //数据的数量 Double2DVec _coordinates;//原始数据 int _k; //聚类的簇个数 IntVec _clusterAssignments; IntVec _nearestCluster; Double2DVec _distanceCache; void InitRandom(); static double getDistance(const DoubleVec & coord ,const DoubleVec& center); int NearestCluster(int ndx); };#include "kmean.h"#include #include "cluster.h"#include "TermVector.h"#include KMeans::KMeans(Double2DVec &data , int k ){ int i ; this->_coordinates.resize(data.size()); for ( i = 0 ; i UpdateMean(_coordinates); } //计算每个数据和每个簇类中心的距离 for ( i = 0 ; i <_coordCount ; i++) { for ( j = 0 ; j <_k ; j++) { double dist = getDistance(_coordinates[i],_clusters[j]->Mean); _distanceCache[i][j] = dist; } } //计算每个数据离簇类最近 for ( i = 0 ; i <_coordCount ; i++) _nearestCluster[i] = this->NearestCluster(i); int k = 0 ; for ( i = 0 ; i <_coordCount ; i++) { if (_nearestCluster[i] == _clusterAssignments[i]) k++; } if ( k == _coordCount) break; for ( j = 0 ; j < _k ; j++) { _clusters[j]->CurrentMembership.clear(); } for ( i = 0 ; i <_coordCount ; i++) { _clusters[_nearestCluster[i]]->CurrentMembership.push_back(i); _clusterAssignments[i] = _nearestCluster[i]; } }}double KMeans::getDistance(const DoubleVec& coord , const DoubleVec& center){ return 1 - TermVector::ComputerCosineSimilarity(coord,center);}int KMeans::NearestCluster(int ndx){ int near = -1 ; double min = numeric_limits ::max(); for ( int c = 0 ; c <_k ; c++) { double d = _distanceCache[ndx][c]; if ( d < min) { min = d ; near = c ; } } return near;}KMeans::~KMeans(){ vector ::iterator iter; for ( iter = this->_clusters.begin(); iter!=_clusters.end() ; iter++) delete (*iter); _clusters.clear();}
// main: driver for the clustering demo.
//   1. Reads c:\input.txt line by line into strVec (one entry per line).
//   2. Builds a TFIDFMeasure from strVec and a heap-allocated Tokeniser.
//   3. Sets K = 3 clusters and fills data[i] via tf.GetTermVector(i, data[i]) for each entry
//      (the original inline comment calls this the TF-IDF weight vector of document i).
//   4. Constructs KMeans(data, K) and calls Start().
//   5. Copies kmeans._clusters and loops over it, printing a dashed separator per cluster and
//      then iterating that cluster's CurrentMembership.
// NOTE(review): the listing is cut off inside the print loop and has lost template arguments
// ("vector clusters", "vector ::iterator") and one system header name, so the remainder of
// main is not visible here.
// NOTE(review): `dimension` is assigned but not used in the visible part; the Tokeniser created
// with new is not deleted in the visible code — whether TFIDFMeasure takes ownership cannot be
// determined from this listing.
#include "TF_IDF.h"#include "Tokeniser.h"#include#include "kmean.h"#include "cluster.h"int main(){ // 读入文档数据 StrVec strVec; ifstream inFile("c:\\input.txt"); string tempstr; while ( getline(inFile , tempstr)) { strVec.push_back(tempstr); } TFIDFMeasure tf(strVec , new Tokeniser()); int K =3 ; //聚类的个数 int docCount = strVec.size(); //生成k-mean的输入数据 Double2DVec data; data.resize(docCount); int dimension = tf.NumTerm(); for ( int i = 0 ; i < docCount ; i++) { tf.GetTermVector( i , data[i]); //获取第i个文档的TFIDF权重向量 } KMeans kmeans(data , K ); kmeans.Start(); vector clusters = kmeans._clusters; vector ::iterator iter; IntVec::iterator it2 ; for ( iter = clusters.begin() ; iter != clusters.end() ; iter++) { cout <<"------------------------------------" < CurrentMembership; for ( it2 = vec.begin() ; it2 != vec.end() ; it2++) cout < <
posted on 2012-04-11 16:04 阅读( ...) 评论( ...)