Höhere Programmierung in der Computerlinguistik mit C++ - Vorlesung
10
Zurück
C++ Vorlesung 10 - Tokenizer - main.cpp
Die main-Funktion ruft verschiedene Methoden der Klasse Tokenizer auf, um einen Text tokenweise einzulesen.
Benötigt boost-regex.
Es ist notwendig, den Pfad zur zu verwendenden Textdatei über die Variable testFile zu setzen.
#include "Tokenizer.h"
using namespace std;
int main(int argc, char** argv) {
Tokenizer t;
string testFile = "";
wcout << "Mit Regex aus Boost" << endl;
t.resetFile(testFile);
t.tokenizeByRegex();
wcout << "Mit Split aus Boost" << endl;
t.resetFile(testFile);
t.tokenizeByStringAlgorithm_split();
wcout << "Mit eignem Split-Algorithmus" << endl;
t.resetFile(testFile);
t.tokenizeByMySplit();
wcout << "Mit IO" << endl;
t.resetFile(testFile);
t.tokenizeByStreamOperator();
}
C++ Vorlesung 10 - Tokenizer - Tokenizer.h
Header-Datei der Klasse Tokenizer.
#ifndef TOKENIZER_H
#define TOKENIZER_H
#include<fstream>
#include<iostream>
#include<time.h>
#include <boost/algorithm/string.hpp>
#include <boost/foreach.hpp>
#include <boost/regex.hpp>
#include<vector>
/**
 * Reads a text file through a wide-character input stream and counts the
 * tokens found in it, using one of several interchangeable tokenization
 * strategies. Derived classes get access to the stream and the counters.
 */
class Tokenizer {
public:
    Tokenizer();
    Tokenizer(const Tokenizer& orig);
    virtual ~Tokenizer();

    // Tokenization strategies; each one reads from the currently opened file
    // and reports every token through countToken().
    void tokenizeByStreamOperator();
    void tokenizeByStringAlgorithm_split();
    void tokenizeByMySplit();
    void tokenizeByRegex();

    // Restarts the timer and sets the token counter back to zero.
    void resetCounter();

    // Registers one token in the counter.
    void countToken(std::wstring token);

    // (Re)opens the given file and resets the counters.
    void resetFile(std::string filename);

protected:
    // Opens the given file on the stream f.
    void openFile(std::string filename);

    // Prints a progress line at fixed token-count intervals.
    void printDebugMessage();

    time_t startTime;   // time of construction or of the last counter reset
    long tokenCount;    // number of tokens counted since the last reset
    std::wifstream f;   // input stream the tokenizers read from
};
#endif
C++ Vorlesung 10 - Tokenizer - Tokenizer.cpp
Implementierungsdatei der Klasse Tokenizer.
#define NUM_OF_TOKENS_FOR_MESSAGE 1000000
#include "Tokenizer.h"
using namespace std;
/**
 * Creates a tokenizer without an opened file.
 *
 * The timer starts at the moment of construction and the token counter starts
 * at zero, so countToken() is well-defined even if resetCounter() was never
 * called beforehand.
 */
Tokenizer::Tokenizer()
    : startTime(time(NULL)), tokenCount(0) {
}
/**
 * Copy constructor.
 *
 * File streams cannot be copied, so the new object does not share the opened
 * file of orig; its stream is default-constructed. The copy therefore starts
 * out like a freshly constructed tokenizer: timer started now, counter at
 * zero. Call resetFile() on it before tokenizing.
 *
 * @param orig object being copied; none of its state is carried over.
 */
Tokenizer::Tokenizer(const Tokenizer& orig)
    : startTime(time(NULL)), tokenCount(0) {
}
/** Destructor; performs no explicit cleanup of its own. */
Tokenizer::~Tokenizer()
{
    // Intentionally empty.
}
/**
 * Prepares the tokenizer for a new run: opens the given file and then resets
 * the timer and the token counter.
 *
 * @param filename path of the text file to read.
 */
void Tokenizer::resetFile(string filename)
{
    openFile(filename);
    resetCounter();
}
void Tokenizer::countToken(std::wstring token) {
this->tokenCount++;
this->printDebugMessage();
}
void Tokenizer::resetCounter() {
this->startTime = time(NULL);
this->tokenCount = 0;
wcout << "__Resetting Counter__" << endl;
}
/**
 * Opens the given file for reading on the stream f.
 *
 * A file that is still open is closed first. Failures are reported on wcerr;
 * the stream is then left in a failed state, so the tokenizers simply read
 * nothing.
 *
 * @param filename path of the text file to open.
 */
void Tokenizer::openFile(string filename)
{
    if (f.is_open()) {
        f.close();
    }

    // A previous tokenization run ends with eofbit/failbit set. Before C++11
    // open() does not clear these flags, so without this call every run after
    // the first one would read no tokens at all.
    f.clear();

    // Install the user's locale before opening so that the character
    // conversion is in place from the very first read.
    f.imbue(locale(""));

    f.open(filename.c_str(), wifstream::in);
    if (!f.is_open()) {
        wcerr << "Could not open file: " << filename.c_str() << endl;
    }
}
void Tokenizer::printDebugMessage() {
if ( this->tokenCount % NUM_OF_TOKENS_FOR_MESSAGE == 0 ) {
long seconds = time(NULL) - this->startTime;
wcout << this->tokenCount << " Tokens ->\t" << seconds << " seconds." << endl;;
}
}
void Tokenizer::tokenizeByStreamOperator( ) {
wstring token;
while ( f >> token ) {
countToken(token);
}
}
void Tokenizer::tokenizeByMySplit( ) {
wstring line;
wstring token;
wchar_t c = ' ';
while ( getline( f,line ) ) {
wstring::size_type i = 0;
wstring::size_type j = line.find(c);
while (j != wstring::npos) {
token = line.substr( i, j-i ) ;
i = ++j;
j = line.find(c,j);
countToken(token);
if (j == string::npos) {
countToken(line.substr(i,j-i));
}
}
}
}
void Tokenizer::tokenizeByStringAlgorithm_split( ) {
wstring line;
vector<wstring> tokens;
while ( getline(f , line) ) {
boost::split(tokens, line, boost::is_any_of(".,!?; "), boost::token_compress_on);
BOOST_FOREACH(wstring token, tokens) {
countToken(token);
}
}
}
void Tokenizer::tokenizeByRegex( ) {
wstring line;
boost::wregex re(L"\\S+");
while ( getline(f,line) ) {
boost::regex_token_iterator<wstring::const_iterator > aMatch( line.begin(), line.end(), re) ;
boost::regex_token_iterator<wstring::const_iterator> noMatch ;
while (aMatch != noMatch ) {
countToken( *aMatch);
aMatch++;
}
}
}
C++ Vorlesung 10 - FrequencyList - main.cpp
Main-Methode zur Klasse FrequencyList.
Muss mit boost-regex und der Klasse Tokenizer zusammen kompiliert werden.
Der Methode frq.resetFile muss ein Pfad zu einer Textdatei als String übergeben werden.
#include "FrequencyList.h"
using namespace std;
/**
 * Builds a frequency list from a text file, sorts it and prints the 25 most
 * frequent entries, reporting how long the sorting step took.
 *
 * The path passed to frq.resetFile is empty here and has to be replaced by
 * the path of a text file.
 */
int main()
{
    FrequencyList frq;
    frq.resetFile("");
    frq.addTokens();

    // Timestamp taken right before sorting; used for the duration below.
    long sortStart = time(NULL);
    wcout << "Time before sort: " << sortStart << endl;

    frq.sort();
    wcout << "sorting took: " << time(NULL) - sortStart << " second/s." << endl;

    frq.printTopX(25);
}
C++ Vorlesung 10 - FrequencyList - FrequencyList.h
Header-Datei von FrequencyList.
Über das typedef "frqContainer" kann festgelegt werden, welche Map als Container benutzt werden soll.
Bei den Structs handelt es sich um Sortierfunktionen.
#ifndef FREQUENCYLIST_H
#define FREQUENCYLIST_H
#include "Tokenizer.h"
#include <boost/unordered_map.hpp>
#include <map>
#include <utility>
#include <algorithm>
/**
 * Frequency list built on top of Tokenizer: counts how often each token
 * occurs in the opened file and offers the result ordered by descending
 * frequency.
 */
class FrequencyList : public Tokenizer {
public:
    FrequencyList();

    // Adds one occurrence of the given token to the list.
    void addToken(std::wstring);

    // Reads all tokens from the opened file and adds each of them.
    void addTokens();

    // Fills the frequency-ordered container from the collected counts.
    void sort();

    // Alternative sorting approach based on a vector of (token, count) pairs.
    void sort2();

    // Prints at most the given number of entries of the sorted list.
    void printTopX(int);

protected:
    // Hook for normalizing a token before it is counted.
    void sanitizeToken(std::wstring&);

private:
    // Token -> number of occurrences. Change this typedef to switch the map
    // implementation used for counting.
    typedef std::map<std::wstring, long> frqContainer;
    frqContainer frq;

    // Orders frequencies from high to low. The parameters are long, matching
    // the key type of sortContainer, so large counts are not truncated.
    struct absteigend {
        bool operator()(long a, long b) const {
            return a > b;
        }
    };

    // Number of occurrences -> token, highest frequency first.
    typedef std::multimap<long, std::wstring, absteigend> sortContainer;
    sortContainer sortedFrq;

    // (token, count) pair as used by sort2().
    typedef std::pair<std::wstring, long> tuple;

    // NOTE(review): declared only; no definition is visible in this file.
    bool myPairVectorSort(tuple&, tuple&);

    // Orders (token, count) pairs by descending count. Pairs with the same
    // count are ordered by token so that the result is deterministic.
    struct absteigend2 {
        bool operator()(const tuple& a, const tuple& b) const {
            if (a.second != b.second) {
                return a.second > b.second;
            }
            return a.first < b.first;
        }
    } mySort2;
};
#endif
C++ Vorlesung 10 - FrequencyList - FrequencyList.cpp
Implementierungsdatei von FrequencyList.
#include "FrequencyList.h"
using namespace std;
/** Creates an empty frequency list; the Tokenizer base is default-constructed. */
FrequencyList::FrequencyList()
    : Tokenizer()
{
}
/**
 * Adds one occurrence of a token to the frequency map.
 *
 * The token is passed through sanitizeToken() first. A token that is not yet
 * in the map is entered with a count of 1, otherwise its count is increased
 * by one.
 *
 * @param token the token to count.
 */
void FrequencyList::addToken(wstring token)
{
    sanitizeToken(token);

    // operator[] creates a missing entry with a count of zero, so a single
    // increment covers both the new-token and the known-token case.
    ++frq[token];
}
void FrequencyList::addTokens( ) {
wstring token;
while ( f >> token ) {
addToken(token);
countToken(token);
}
}
/**
 * Hook that is called on every token before it is counted.
 *
 * The body is empty, so tokens are currently counted unchanged.
 *
 * @param token token that may be modified in place.
 */
void FrequencyList::sanitizeToken(wstring & token)
{
    // Intentionally empty.
}
void FrequencyList::sort() {
BOOST_FOREACH( frqContainer::value_type kv, frq ) {
sortedFrq.insert( sortContainer::value_type(kv.second,kv.first) );
}
}
/**
 * Alternative sorting approach: copies all (token, count) entries into a
 * vector and sorts it with std::sort using the mySort2 comparator.
 *
 * The sorted vector is a local variable and is discarded when the function
 * returns.
 */
void FrequencyList::sort2()
{
    vector< tuple > sorted(frq.begin(), frq.end());
    std::sort(sorted.begin(), sorted.end(), mySort2);
}
void FrequencyList::printTopX(int x) {
BOOST_FOREACH( sortContainer::value_type kv , sortedFrq) {
if ( x-- <= 0 ) { break ;}
wcout << kv.first << "\t" << kv.second << endl;
}
}
Zurück