博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
简单的中文分词加上kmean聚类 (c++)
阅读量:5790 次
发布时间:2019-06-18

本文共 12678 字,大约阅读时间需要 42 分钟。

程序代码参考了csdn某博客,具体名字忘记了 

变量命名的头文件

// common.h -- shared type aliases used by every translation unit of the
// TF-IDF / k-means clustering program.
// (Reconstructed: the scraped source lost the header names and template
// arguments; they are recovered from how the aliases are used below.)
#ifndef COMM_H
#define COMM_H

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>

using namespace std;

typedef vector<string>          StrVec;      // vector of strings (documents / words)
typedef vector<int>             IntVec;      // vector of ints
typedef vector<vector<int> >    Int2DVec;    // 2-D int matrix
typedef vector<vector<double> > Double2DVec; // 2-D double matrix
typedef vector<double>          DoubleVec;   // vector of doubles

#endif

去除停用词语

#pragma once#include "common.h"// 用于移除停止词class StopWordsHandler{public:    StopWordsHandler(void);    ~StopWordsHandler(void);    bool IsStopWord(string& str);private:    StrVec stopwords;};#include "StopWordHandler.h"string StopWordList[] = {
"的", "我们","要","自己","之","将","“","”",",","(",")","后","应","到","某","后","个","是","位","新","一","两","在","中","或","有","更","好",""};//停用词int strwordlen = sizeof(StopWordList) / sizeof(StopWordList[0]);StopWordsHandler::StopWordsHandler(){ for ( int i = 0 ; i < strwordlen ; i++) stopwords.push_back(StopWordList[i]);}StopWordsHandler::~StopWordsHandler(){}bool StopWordsHandler::IsStopWord(string& str){ transform(str.begin(),str.end(),str.begin(),tolower);//确保小写化 return find(stopwords.begin(),stopwords.end(),str)!=stopwords.end();}

分词选用了最简单的分词方法,预先用空格做好了分词

#pragma once#include "Common.h"class ITokeniser{public:    virtual void Partition(string input,StrVec& retWords)=0;//分词算法};#pragma once#include "Itokenisher.h"class Tokeniser :public  ITokeniser{public:    Tokeniser();    ~Tokeniser();    void Partition(string input , StrVec& retWords);};#include "Tokeniser.h"#include "StopWordHandler.h"#include 
Tokeniser::Tokeniser(){}Tokeniser::~Tokeniser(){}void Tokeniser::Partition(string input ,StrVec& retWord){ transform(input.begin() , input.end(),input.begin(),tolower); string::iterator pos = input.begin(); StopWordsHandler stopHandler; do { string temp; pos = find(input.begin() , input.end(),' '); copy(input.begin() , pos ,back_inserter(temp)); if ( !stopHandler.IsStopWord(temp)) retWord.push_back(temp); if ( pos == input.end()) break; else input.erase(input.begin() ,++pos); }while ( pos != input.end());}

TFIDF的计算

#pragma once#include "Itokenisher.h"#include class TFIDFMeasure{private:    StrVec _docs; //文档集合 , 每一行字符串代表一个文档    int _numDocs; //文档数目    int _numTerms;//单词数目    StrVec _terms;//单词集合    Int2DVec _termFreq ;//每个单词出现在每份文档的频率    Double2DVec _termWeight;//每个单词在每份文档的权重    IntVec _maxTermFreq ;//记录每份文档的最大词频    IntVec _docFreq;//出现这个单词的文档频率    ITokeniser* _tokeniser;//分词器    map
_wordIndex;//单词映射表public : TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser); ~TFIDFMeasure(); inline int NumTerm( ) const { return this->_numTerms; } void GetTermVector(int doc , DoubleVec& vec);//获取项向量protected: void init();//初始化tf-idf计数 void GenerateTerms(const StrVec& ,StrVec& terms);//分词处理 void GenerateTermFrequency();//计算词频 void GenerateTermWeight();//计算词的权重 void GetWordFrequency( string & input ,map
&freq); int CountWords(string& word ,const StrVec& words); int GetTermIndex(const string& term);//查询词语对应的下标 double ComputeTermWeight(int term ,int doc);//计算词语在制定文档的频率 double GetTermFrequency(int term , int doc);//获取词语在文档的频率 double GetInverseDoucumentFrequency(int term); //计算逆文档频率 };#include "TF_IDF.h"TFIDFMeasure::~TFIDFMeasure(){ if (this->_tokeniser != NULL) { delete _tokeniser; _tokeniser = NULL; } _docs.clear(); _terms.clear(); _wordIndex.clear();}TFIDFMeasure::TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser ){ _docs = document; _numDocs = document.size(); _tokeniser = tokeniser; this->init();}void TFIDFMeasure::init(){ //初始化 this->GenerateTerms(_docs,_terms); //分词 this->_numTerms = _terms.size(); //所有文档中的词项数目 //申请空间 _maxTermFreq.resize(_numDocs); _docFreq.resize(_numTerms); _termFreq.resize(_numTerms); _termWeight.resize(_numTerms); for (int i = 0 ; i < _terms.size() ; i++) { _termWeight[i].resize(_numDocs); _termFreq[i].resize(_numDocs); _wordIndex[_terms[i]] = i; //将单词放入单词映射表中 } this->GenerateTermFrequency(); this->GenerateTermWeight();}void TFIDFMeasure::GenerateTerms(const StrVec& docs ,StrVec &terms){ for (int i = 0 ; i < docs.size() ; i++) { StrVec words; _tokeniser->Partition(docs[i] , words); //分词部分 for ( int j = 0 ; j < words.size() ; j++) { if ( find(terms.begin() , terms.end(),words[j] ) == terms.end()) terms.push_back(words[j]); } }}void TFIDFMeasure::GenerateTermFrequency(){ //计算每个单词在每份文档中出现的概率 for ( int i = 0 ; i < _numDocs ; i++) { string curDoc = _docs[i]; //当前待处理的文档 map
freq; this->GetWordFrequency(curDoc ,freq); map
::iterator iter; _maxTermFreq[i] = numeric_limits
::min(); for ( iter = freq.begin() ; iter != freq.end() ; iter++) { string word = iter->first; int wordFreq = iter->second; int termIndex = GetTermIndex(word); //单词下标 if ( termIndex == -1) continue; _termFreq[termIndex][i] = wordFreq; _docFreq[termIndex]++; if ( wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq; } }}int TFIDFMeasure::GetTermIndex(const string & term){ map
::iterator pos = _wordIndex.find(term); if ( pos != _wordIndex.end()) return pos->second; else return -1;}class WordComp {public: WordComp(string& sWord) : word(sWord) { } bool operator() (const string& lhs) { return lhs.compare(word)==0; } private: string word; };void TFIDFMeasure::GetWordFrequency( string & input , map
& freq){ //计算单词频率 transform(input.begin(),input.end(),input.begin(),tolower); StrVec temp; this->_tokeniser->Partition(input , temp); unique(temp.begin() , temp.end()); StrVec::iterator iter; for ( iter = temp.begin() ; iter != temp.end() ; iter++) { int count = CountWords(*iter , temp); //计算单词在文档中出现的次数 freq[*iter] = count; }}int TFIDFMeasure::CountWords(string & word ,const StrVec& temp){ //计算每个单词在该文档的词频数目 int ncount = 0 ; ncount = count_if(temp.begin() , temp.end() , WordComp(word)); return ncount ;}void TFIDFMeasure::GenerateTermWeight(){ for (int i = 0 ; i < _numTerms ; i++) for (int j = 0 ; j < _numDocs ; j++) _termWeight[i][j] = ComputeTermWeight( i , j );}double TFIDFMeasure::ComputeTermWeight(int term , int doc){ float tf = GetTermFrequency(term , doc); float idf = GetInverseDoucumentFrequency(term); return tf * idf ;}double TFIDFMeasure::GetTermFrequency(int term , int doc){ int freq = _termFreq[term][doc]; //词频 int maxfreq = _maxTermFreq[doc]; return ((float) freq /(float)maxfreq);}double TFIDFMeasure::GetInverseDoucumentFrequency(int term){ int df = _docFreq[term]; return log((float)(_numDocs)/(float)df);}void TFIDFMeasure::GetTermVector(int doc ,DoubleVec& vec){ vec.resize(this->_numTerms); for ( int i = 0 ; i < this->_numTerms ; i++) vec[i] = _termWeight[i][doc];}

计算余弦相似性距离

#pragma once#include "common.h"class TermVector{public:    static double ComputerCosineSimilarity(const DoubleVec& vector1 , const DoubleVec& vector2 );    static double innerProduct(const DoubleVec& v1 ,const DoubleVec& v2);    static double VectorLength(const DoubleVec & v);};#include "TermVector.h"#include 
double TermVector::ComputerCosineSimilarity(const DoubleVec & v1 , const DoubleVec& v2){ if ( v1.size() != v2.size()) throw string("different length"); double denom = (VectorLength(v1) * VectorLength(v2)); if ( denom == 0 ) return 0 ; else return (innerProduct(v1 , v2) / denom);}double TermVector::innerProduct(const DoubleVec & v1 , const DoubleVec& v2){ if ( v1.size() != v2.size()) throw string ("different length"); double result = 0.0f; for ( int i = 0 ; i < v1.size() ; i++) result+=v1[i]*v2[i]; return result;}double TermVector::VectorLength(const DoubleVec & v){ double sum = 0.0f; for ( int i = 0 ; i < v.size() ; i++) sum= sum+(v[i] * v[i]); return (double)sqrt(sum);}

定义cluster的类

#pragma once#include "common.h"class Cluster{public:    IntVec CurrentMembership; //该类簇的数据成员索引    DoubleVec Mean ; //该簇类的聚类中心    Cluster();    ~Cluster();    Cluster(int dataindex , DoubleVec & data);    void UpdateMean(Double2DVec & coordinates);};#include "cluster.h"Cluster::Cluster(){    }Cluster::Cluster(int dataindex , DoubleVec& data){    CurrentMembership.push_back(dataindex);    copy(data.begin() , data.end() ,back_inserter(Mean));}void Cluster::UpdateMean(Double2DVec & coordinates){    //根据 mcurrentmembership取得原始资料点对象    //根据该子集的均值,corrdinate是一个m* n的矩阵,其实就是要求每列的均值    for (int i = 0 ; i< CurrentMembership.size();i++)    {        DoubleVec& coord = coordinates[CurrentMembership[i]];        for ( int j = 0 ; j < coord.size() ; j++)            Mean[j]+=coord[j];        for (int k = 0 ; k 
// kmean.h -- k-means clustering with cosine distance.
#pragma once
#include "common.h"
class Cluster;

class KMeans
{
public:
    vector<Cluster*> _clusters;     // the K clusters; owned, freed in the destructor
    KMeans(Double2DVec& data, int K);
    // Runs Lloyd iterations until no point changes cluster.
    void Start();
    ~KMeans();
private:
    int _coordCount;                // number of data points
    Double2DVec _coordinates;       // the raw data, one row per point
    int _k;                         // number of clusters
    IntVec _clusterAssignments;     // current cluster of each point
    IntVec _nearestCluster;         // nearest cluster of each point this iteration
    Double2DVec _distanceCache;     // _distanceCache[point][cluster]
    void InitRandom();              // pick K random points as initial centroids
    static double getDistance(const DoubleVec& coord, const DoubleVec& center);
    int NearestCluster(int ndx);    // argmin over _distanceCache[ndx]
};

// kmean.cpp
// NOTE(review): the constructor tail, InitRandom, and the head of Start()
// were truncated in the scraped source; they are reconstructed as standard
// k-means consistent with the surviving fragments -- confirm against the
// original if available.
#include "kmean.h"
#include <time.h>
#include "cluster.h"
#include "TermVector.h"
#include <limits>
#include <stdlib.h>

KMeans::KMeans(Double2DVec& data, int k)
{
    this->_coordinates.resize(data.size());
    for (size_t i = 0; i < data.size(); i++)
        copy(data[i].begin(), data[i].end(), back_inserter(_coordinates[i]));
    _coordCount = (int)data.size();
    _k = k;
    _clusters.resize(_k);
    _clusterAssignments.resize(_coordCount, -1);
    _nearestCluster.resize(_coordCount, -1);
    _distanceCache.resize(_coordCount);
    for (int i = 0; i < _coordCount; i++)
        _distanceCache[i].resize(_k);
    InitRandom();
}

void KMeans::InitRandom()
{
    srand((unsigned)time(NULL));
    for (int i = 0; i < _k; i++)
    {
        int temp = rand() % _coordCount;        // random seed point for cluster i
        _clusterAssignments[temp] = i;
        _clusters[i] = new Cluster(temp, _coordinates[temp]);
    }
}

void KMeans::Start()
{
    for (;;)
    {
        // 1. recompute each cluster's centroid from its current members
        for (int j = 0; j < _k; j++)
            _clusters[j]->UpdateMean(_coordinates);
        // 2. distance of every point to every centroid
        for (int i = 0; i < _coordCount; i++)
            for (int j = 0; j < _k; j++)
                _distanceCache[i][j] = getDistance(_coordinates[i], _clusters[j]->Mean);
        // 3. nearest cluster per point
        for (int i = 0; i < _coordCount; i++)
            _nearestCluster[i] = this->NearestCluster(i);
        // 4. converged when no assignment changed
        int unchanged = 0;
        for (int i = 0; i < _coordCount; i++)
            if (_nearestCluster[i] == _clusterAssignments[i])
                unchanged++;
        if (unchanged == _coordCount)
            break;
        // 5. rebuild memberships from the new assignments
        for (int j = 0; j < _k; j++)
            _clusters[j]->CurrentMembership.clear();
        for (int i = 0; i < _coordCount; i++)
        {
            _clusters[_nearestCluster[i]]->CurrentMembership.push_back(i);
            _clusterAssignments[i] = _nearestCluster[i];
        }
    }
}

// Cosine distance: 1 - cosine similarity (0 = identical direction).
double KMeans::getDistance(const DoubleVec& coord, const DoubleVec& center)
{
    return 1 - TermVector::ComputerCosineSimilarity(coord, center);
}

int KMeans::NearestCluster(int ndx)
{
    int near = -1;
    double min = numeric_limits<double>::max();
    for (int c = 0; c < _k; c++)
    {
        double d = _distanceCache[ndx][c];
        if (d < min)
        {
            min = d;
            near = c;
        }
    }
    return near;
}

KMeans::~KMeans()
{
    for (vector<Cluster*>::iterator iter = this->_clusters.begin(); iter != _clusters.end(); ++iter)
        delete (*iter);
    _clusters.clear();
}
#include "TF_IDF.h"#include "Tokeniser.h"#include 
#include "kmean.h"#include "cluster.h"int main(){ // 读入文档数据 StrVec strVec; ifstream inFile("c:\\input.txt"); string tempstr; while ( getline(inFile , tempstr)) { strVec.push_back(tempstr); } TFIDFMeasure tf(strVec , new Tokeniser()); int K =3 ; //聚类的个数 int docCount = strVec.size(); //生成k-mean的输入数据 Double2DVec data; data.resize(docCount); int dimension = tf.NumTerm(); for ( int i = 0 ; i < docCount ; i++) { tf.GetTermVector( i , data[i]); //获取第i个文档的TFIDF权重向量 } KMeans kmeans(data , K ); kmeans.Start(); vector
clusters = kmeans._clusters; vector
::iterator iter; IntVec::iterator it2 ; for ( iter = clusters.begin() ; iter != clusters.end() ; iter++) { cout <<"------------------------------------" <
CurrentMembership; for ( it2 = vec.begin() ; it2 != vec.end() ; it2++) cout <
<

 

 

posted on
2012-04-11 16:04 阅读(
...) 评论(
...)

转载于:https://www.cnblogs.com/lzhenf/archive/2012/04/11/2442526.html

你可能感兴趣的文章
信号量实现进程同步
查看>>
Spring4-自动装配Beans-通过构造函数参数的数据类型按属性自动装配Bean
查看>>
win10.64位wnmp-nginx1.14.0 + PHP 5. 6.36 + MySQL 5.5.59 环境配置搭建 结合Thinkphp3.2.3
查看>>
如何查看python selenium的api
查看>>
Python_Mix*random模块,time模块,sys模块,os模块
查看>>
iframe刷新问题
查看>>
数据解码互联网行业职位
查看>>
我所见的讲的最容易理解,逻辑最强的五层网络模型,来自大神阮一峰
查看>>
vue-cli项目打包需要修改的路径问题
查看>>
js实现复选框的操作-------Day41
查看>>
数据结构化与保存
查看>>
[SpringBoot] - 配置文件的多种形式及优先级
查看>>
chrome浏览器开发者工具之同步修改至本地
查看>>
debian7 + wheezy + chromium + flashplayer
查看>>
AOP
查看>>
进阶开发——文档,缓存,ip限速
查看>>
vue中子组件需调用父组件通过异步获取的数据
查看>>
uva 11468 - Substring(AC自己主动机+概率)
查看>>
Mysql 数据备份与恢复,用户创建,授权
查看>>
沉思录
查看>>