cppjieba
cppjieba copied to clipboard
请教个问题:python接口调的core dump, 而C++本身可以
我的cppjieba.py
# encoding=utf-8
"""ctypes bindings for libJieba.so.

The shared library builds CPython objects (PyList_New, PyString_FromString,
...), so it must be loaded with PyDLL, which keeps the GIL held during every
call.  Loading it with cdll/CDLL releases the GIL around each call, which is
undefined behavior for code that manipulates Python objects and produces
exactly the kind of intermittent segmentation faults reported here.
"""
from ctypes import *
import os

cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()

# PyDLL, not cdll: the library uses the Python C API and needs the GIL.
lib = PyDLL(cur_dir + '/libJieba.so')

# Jieba_new returns a pointer.  Without an explicit restype, ctypes assumes
# a C int and truncates the pointer to 32 bits on 64-bit platforms — another
# independent source of crashes.  Declaring argtypes likewise keeps the
# handle from being truncated when it is passed back in.
lib.Jieba_new.restype = c_void_p
lib.Jieba_new.argtypes = [c_char_p, c_char_p, c_char_p, c_char_p, c_char_p]
lib.Jieba_cut.restype = py_object
lib.Jieba_cut.argtypes = [c_void_p, c_char_p, c_bool]
lib.Jieba_tag.restype = py_object
lib.Jieba_tag.argtypes = [c_void_p, c_char_p]
lib.Jieba_extract.restype = py_object
lib.Jieba_extract.argtypes = [c_void_p, c_char_p, c_int]
lib.Jieba_add_word.restype = c_bool
lib.Jieba_add_word.argtypes = [c_void_p, c_char_p, c_char_p, c_int]
lib.Jieba_load_dict.restype = None
lib.Jieba_load_dict.argtypes = [c_void_p, c_char_p]
lib.Jieba_dload_dict.restype = None
lib.Jieba_dload_dict.argtypes = [c_void_p, c_char_p]
class Tokenizer(object):
    """Thin Python wrapper around the Jieba handle exposed by libJieba.so."""

    def __init__(self,
                 dict_path=cur_dir + '/dict/jieba.dict.utf8',
                 model_path=cur_dir + '/dict/hmm_model.utf8',
                 user_dict_path=cur_dir + '/dict/user.dict.utf8',
                 idfPath=cur_dir + '/dict/idf.utf8',
                 stopWordPath=cur_dir + '/dict/stop_words.utf8'):
        # Opaque handle to the C++ cppjieba::Jieba instance.
        self.obj = lib.Jieba_new(dict_path, model_path, user_dict_path,
                                 idfPath, stopWordPath)

    def add_word(self, word, tag='n', num=50):
        """Insert *word* into the user dictionary with part-of-speech *tag*
        and weight *num*."""
        return lib.Jieba_add_word(self.obj, word, tag, num)

    def load_user_dicts(self, paths):
        """Load extra user dictionaries; semantics defined on the C++ side."""
        return lib.Jieba_load_dict(self.obj, paths)

    def dload_user_dicts(self, paths):
        """Delegates to Jieba_dload_dict; presumably a dynamic (re)load —
        confirm against the C++ implementation."""
        return lib.Jieba_dload_dict(self.obj, paths)

    def cut(self, sentence, hmm=True):
        """Segment *sentence* into a list of word strings."""
        return lib.Jieba_cut(self.obj, sentence, hmm)

    def pos_cut(self, sentence):
        """Segment *sentence* and return a list of (word, pos-tag) pairs."""
        pairs = []
        for token in lib.Jieba_tag(self.obj, sentence):
            # The C side encodes each item as "word/tag".  The word itself
            # may contain '/', so only the last field is the tag.
            fields = token.split('/')
            pairs.append(('/'.join(fields[:-1]), fields[-1]))
        return pairs

    def extract(self, sentence, topN):
        """Return the *topN* keywords of *sentence* as (word, weight) tuples."""
        return lib.Jieba_extract(self.obj, sentence, topN)
if __name__ == '__main__':
    model = Tokenizer()
    text = '''
全国两会是数千名中外记者关注中国发展、聚焦中国命运的“新闻发布厅”。
'''
    # Stress test: repeatedly construct tokenizers and keep them alive,
    # reproducing the intermittent crash seen after a dozen-odd iterations.
    models = []
    for _ in range(50):
        model = Tokenizer()
        segmented = model.cut(text)
        models.append(model)
我的Jieba.cpp
// cppjieba facade header (pasted inline).  Owns one dictionary trie and one
// HMM model and shares them across all segmenter strategies plus the
// keyword extractor.
// NOTE(review): the include guard spells "CPPJIEAB" (letters transposed);
// harmless while consistent, but worth fixing upstream.
#ifndef CPPJIEAB_JIEBA_H
#define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
using namespace std;
namespace cppjieba {
class Jieba {
public:
// Loads every resource file up front; construction is expensive, so
// instances should be created once and reused.
Jieba(const string& dict_path,
const string& model_path,
const string& user_dict_path,
const string& idfPath,
const string& stopWordPath)
: dict_trie_(dict_path, user_dict_path),
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
}
~Jieba() {
}
// A word together with its begin/end offsets in the original sentence.
struct LocWord {
string word;
size_t begin;
size_t end;
}; // struct LocWord
// Default segmentation: delegates to the mix segmenter; `hmm` toggles the
// HMM pass for out-of-vocabulary words.
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
// Full-mode segmentation via the full segmenter.
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.Cut(sentence, words);
}
void CutAll(const string& sentence, vector<Word>& words) const {
full_seg_.Cut(sentence, words);
}
// Search-engine-oriented segmentation via the query segmenter.
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
// Pure HMM segmentation.
void CutHMM(const string& sentence, vector<string>& words) const {
hmm_seg_.Cut(sentence, words);
}
void CutHMM(const string& sentence, vector<Word>& words) const {
hmm_seg_.Cut(sentence, words);
}
// Dictionary-only (MP) segmentation, limiting words to max_word_len.
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
}
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
}
// Segment and attach a part-of-speech tag to each word.
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
mix_seg_.Tag(sentence, words);
}
// Look up the part-of-speech tag of a single word.
string LookupTag(const string &str) const {
return mix_seg_.LookupTag(str);
}
// Insert a single user word with tag and weight; returns false on failure.
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG, const int& freq=10) {
return dict_trie_.InsertUserWordWeight(word, tag, freq);
}
// Load additional user dictionary file(s) into the shared trie.
// NOTE(review): mutating the trie while segmenters are reading it is what
// the reporter does below; whether DictTrie supports that is not visible
// here — confirm before relying on it.
void LoadUserDict(const string& filePaths){
return dict_trie_.LoadUserDict(filePaths);
}
void dLoadUserDict(const string& filePaths){
return dict_trie_.dLoadUserDict(filePaths);
}
// Propagate a new separator set to every segmenter.
void ResetSeparators(const string& s) {
//TODO
mp_seg_.ResetSeparators(s);
hmm_seg_.ResetSeparators(s);
mix_seg_.ResetSeparators(s);
full_seg_.ResetSeparators(s);
query_seg_.ResetSeparators(s);
}
const DictTrie* GetDictTrie() const {
return &dict_trie_;
}
const HMMModel* GetHMMModel() const {
return &model_;
}
private:
DictTrie dict_trie_;
HMMModel model_;
// They share the same dict trie and model
MPSegment mp_seg_;
HMMSegment hmm_seg_;
MixSegment mix_seg_;
FullSegment full_seg_;
QuerySegment query_seg_;
public:
KeywordExtractor extractor;
}; // class Jieba
} // namespace cppjieba
#endif // CPPJIEAB_JIEBA_H
using namespace cppjieba;

// Default resource paths, relative to the process working directory.
// Defined as mutable char arrays rather than `char* X = "..."`: binding a
// string literal to a non-const char* is deprecated in C++03 and ill-formed
// in C++11, and writing through it would be undefined behavior.  Arrays stay
// compatible with the extern "C" API below, which takes char* parameters.
char DICT_PATH[] = "dict/jieba.dict.utf8";
char HMM_PATH[] = "dict/hmm_model.utf8";
char USER_DICT_PATH[] = "dict/user.dict.utf8";
char IDF_PATH[] = "dict/idf.utf8";
char STOP_WORD_PATH[] = "dict/stop_words.utf8";

// NOTE(review): CPython documentation recommends `#include <Python.h>`
// (with the python2.7 include directory on the compiler search path) and
// including it before any standard headers.
#include <python2.7/Python.h>
extern "C" {
/* C ABI wrapper around cppjieba::Jieba for use from Python via ctypes.
Required interface (translated from the original Chinese note):
seg.__init__()
.add_word(word,num,tag)
.del_word(word)
.all_cut(sentence)
.cut(sentence)
.tag(sentence)
key_words()

NOTE(review): Jieba_cut, Jieba_tag and Jieba_extract build CPython objects,
so they must only run after Py_Initialize() and with the GIL held.  ctypes'
cdll/CDLL releases the GIL around every foreign call, which is undefined
behavior here and plausibly explains the reported intermittent segfaults;
the library should be loaded with ctypes.PyDLL instead.
*/
// Constructs a Jieba instance from the given resource files.  The caller
// owns the result; this API exposes no matching destructor, so instances
// created from Python are never freed.
Jieba* Jieba_new(char* dict_path,
char* model_path,
char* user_dict_path,
char* idfPath,
char* stopWordPath){
return new Jieba(dict_path, model_path, user_dict_path, idfPath, stopWordPath);
}
// Segments `sentence`; returns a NEW reference to a Python list of str.
PyObject* Jieba_cut(Jieba* segmentor, char* sentence, bool hmm = true){
//cout<< segmentor->dict_trie_ << endl;
PyObject* result = PyList_New(0);
vector<string> words;
segmentor->Cut(sentence, words, hmm);
for (vector<string>::const_iterator iter =words.begin(); iter != words.end(); iter++){
PyObject* a=PyString_FromString((*iter).c_str());
// PyList_Append takes its own reference, so ours is released right away.
PyList_Append(result,a);
Py_XDECREF(a);
}
//free(words);
return result;
}
// Inserts a user word with tag and weight; returns false on failure.
bool Jieba_add_word(Jieba* segmentor, char* word, char* tag, int weight){
return segmentor->InsertUserWord(word, tag, weight);
}
void Jieba_load_dict(Jieba* segmentor, char* path){
return segmentor->LoadUserDict(path);
}
void Jieba_dload_dict(Jieba* segmentor, char* path){
return segmentor->dLoadUserDict(path);
}
// Tags `sentence`; returns a NEW reference to a list of "word/tag" strings
// (the word may itself contain '/'; the tag is always the last field).
PyObject* Jieba_tag(Jieba *segmentor, char* sentence){
PyObject* result = PyList_New(0);
vector<pair<string, string> > tagers;
segmentor->Tag(sentence, tagers);
for (vector<pair<string, string> >::const_iterator iter =tagers.begin(); iter != tagers.end(); iter++){
/* Original tuple-building variant, disabled by the author:
PyObject* a=PyString_FromString((iter->first).c_str());
PyObject* b=PyString_FromString((iter->second).c_str());
PyObject* p=PyTuple_Pack(2,a,b);//strange — sometimes core dumped...
NOTE(review): the crashes were presumably the missing-GIL problem noted
above, not PyTuple_Pack itself; also this variant never releases a and b
(PyTuple_Pack takes its own references), so it leaks.
*/
string s=iter->first+'/'+iter->second;
PyObject* p=PyString_FromString(s.c_str());
PyList_Append(result,p);
Py_XDECREF(p);
}
return result;
}
// Extracts the topN keywords; returns a NEW reference to a list of
// (word, weight) tuples.
PyObject* Jieba_extract(Jieba *segmentor, char* sentence, int topN){
PyObject* result = PyList_New(0);
vector<pair<string, double> > keywords;
//vector<Word> keywords;
segmentor->extractor.Extract(sentence, keywords, topN);
for (vector< pair<string, double> >::const_iterator iter =keywords.begin(); iter != keywords.end(); iter++){
//cout << *iter <<endl;
//string s=(*iter).word+'/'+(*iter).weight.c_str();
PyObject* weight=PyFloat_FromDouble(iter->second);
PyObject* word=PyString_FromString(iter->first.c_str());
PyObject* p=PyTuple_Pack(2,word,weight);
// Release our references: the tuple holds word/weight, the list holds p.
PyList_Append(result,p);
Py_XDECREF(weight);
Py_XDECREF(word);
Py_XDECREF(p);
}
return result;
}
}
int main(int argc, char** argv) {
vector<Jieba*> models(100);
Jieba* segmentor=Jieba_new(DICT_PATH,HMM_PATH,"user_dicts/地方.txt",IDF_PATH,STOP_WORD_PATH);
Jieba* segmentor1=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH);
PyObject* result;
char* s = "他来到了网易杭研大厦";
for (int i=0;i<100;i++){
cout<< i<<endl;
models[i]=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH);
result= Jieba_cut(models[i],s);
}
}
Jieba.cpp 直接编译成可执行程序运行没问题；但是把 Jieba.cpp 编译成 *.so 文件后，由 cppjieba.py 去调用，
for i in range(50):
model=Tokenizer()
s=model.cut(text)
models.append(model)
会在正确运行十几次后报 segmentation fault, 发现每次的错误都出现在 MixSegment.hpp 的
while (pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs, hmm);
}
但是看core dump, 每次报错不在里面的同一行,错误类型都是
signal 11, Segmentation fault
我在想是python代码的一些行为 导致了C++变量一部分变成了空指针吗,但是有点搞不来了
您能从您的角度给我点建议,可能的问题所在吗?
还有个问题,就是我把你的DictTrie的LoadUserDict拿到public里, 如果我在初始化分词器里指定 自定义字典, 可以跑; 但是通过LoadUserDict 修改模型, 分词就会报错。 这个是目前不支持 还是 bug?
如果通过add_word 修改词典是可以分词的。
Hello,我最近写了个 Python 的封装:https://github.com/messense/cjieba-py