libsvm続き
昨日の続き。もはやlibsvm関係なくなってますが。
昨日は入力にわかち書きが必要だったので、普通の文を自動で分割できるようにした。
一応各メソッド手動でテスト書いたけど、うまいテストコードの書き方ないかなぁ。
in
# あとで1ってラベル付けてる
textOhayou = [
u"おはようございます",
u"おはようございなのよ",
u"おはようー"
]# あとで-1ってラベル付けてる
textTadaima = [
u"ただいまなのよ",
]testTexts = [
u"おはようござい",
u"ただいまー",
u"おはようです",
]
out
[ [ [1, 1, 1], 1], [[1, 1, 0, 1, 1, 1], 1], [[1, 0, 0, 0, 0, 0, 1], 1], [[0, 0, 0, 1, 1, 1, 0, 1], -1]]
[ [ [1, 1, 0, 0, 0, 0, 0, 0], 0], [[0, 0, 0, 0, 0, 0, 1, 1], 0], [[1, 0, 0, 0, 0, 0, 0, 0, 1], 0]]
([1, 1, 1, -1], [[1, 1, 1], [1, 1, 0, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 1, 1, 0, 1]])
.*
optimization finished, #iter = 7
nu = 0.035728
obj = -0.714286, rho = -0.142782
nSV = 3, nBSV = 0
Total nSV = 3
[ [1, 1, 0, 0, 0, 0, 0, 0], 0] : 1.0
[ [0, 0, 0, 0, 0, 0, 1, 1], 0] : -1.0
[ [1, 0, 0, 0, 0, 0, 0, 0, 1], 0] : 1.0
ソース
#!/usr/bin/env python # -*- coding: utf-8 -*- from svm import * import mecab mecab_path = "/usr/lib/libmecab.so.1" def split_text(class_id, text): """ in: 1, u"おはようございます", out: [ [u"おはよう", u"ござい", u"ます"], 1], """ sparse_array = mecab.sparse_all( text, mecab_path ).split("\n") word = [] for sa in sparse_array[:-2]: word.append( sa.split("\t")[0] ) result = [ word, class_id ] return result def split_texts( class_id, texts ): """ in: 1, [ u"おはようございます", u"おはようなのよ", ] out: [ [ [u"おはよう", u"ございます"], 1], [ [u"おはよう", u"なのよ"], 1], ] """ result = [] for t in texts: result.append( split_text( class_id, t.encode("utf-8") ) ) return result def get_list_index( data, list ): for i in range(len(list)): if data == list[i]: return i else: list.append( data ) return len(list)-1 def text2vector( textArrays, wordList): """ in: textArrays = [ [ [ u"おはよう", u"ござい", u"ます" ], 1], [ [ u"おはよう", u"ござい", u"なのよ"], 1], [ [ u"こんばんわ", u"なのよ"], -1], [ [ u"おはよう", u"ー"], 1] ] testTexts = [ [[ u"おはよう", u"ござい" ], 0], [[ u"こんばんわ", u"ー" ], 0] ] out: learning_data = [ [[ 1, 1, 1, 0, 0 ], 1], [[ 1, 1, 0, 1, 0 ], 1], [[ 0, 0, 0, 1, 1 ], -1], ] test_data = [ [ 1, 1, 0, 0, 0 ], [ 1, 0, 0, 1, 0 ], [ 0, 0, 0, 0, 1, 1], ] """ out_data = [] for oneText in textArrays: out_one_data = [ 0 for i in wordList] for t in oneText[0]: i = get_list_index( t, wordList ) if( i < len(out_one_data) ): out_one_data[i] = 1 else: out_one_data.append(1) out_data.append( [out_one_data, oneText[1]] ) print out_data return out_data, wordList def learn_and_predict(larning_data, test_data): new_learning_class = [] new_learning_data = [] for ld in learning_data: new_learning_class.append(ld[1]) new_learning_data.append(ld[0]) new_learning_data_set = (new_learning_class, new_learning_data) print new_learning_data_set prob = svm_problem( *new_learning_data_set ) param = svm_parameter(kernel_type = LINEAR, C = 10) m = svm_model(prob, param) for t in test_data: print t, ":", m.predict(t[0]) if __name__ == "__main__": textOhayou = [ 
u"おはようございます", u"おはようございなのよ", u"おはようー" ] textTadaima = [ u"ただいまなのよ", ] testTexts = [ u"おはようござい", u"ただいまー", u"おはようです", ] s = split_texts(1, textOhayou) s += split_texts(-1, textTadaima) testTexts = split_texts(0, testTexts) wordList = [] learning_data, wordList = text2vector( s, wordList ) test_data, wordList = text2vector( testTexts, wordList ) learn_and_predict( learning_data, test_data)
mecab.py
# -*- coding: utf-8 -*-
from ctypes import *


def sparse_all(s, mecabpath):
    """Run MeCab morphological analysis on the byte string *s*.

    Loads the MeCab shared library at *mecabpath* via ctypes, parses *s*,
    and returns the raw analysis output (one morpheme per line) as a
    byte string.
    """
    # Load the shared library every call; no handle is cached.
    lib = cdll.LoadLibrary(mecabpath)

    # Build argc/argv for the analyzer (no extra options -> plain parse).
    argc = c_int(2)
    argv = (c_char_p * 2)("mecab", "")

    # Create the tagger, run the analysis, and capture the result string.
    tagger = lib.mecab_new(argc, argv)
    raw = lib.mecab_sparse_tostr(tagger, s)
    result = c_char_p(raw).value

    # Tear the tagger down before returning.
    lib.mecab_destroy(tagger)
    return result
これでとりあえずは自分で集めたテキストを分類しやすそうだな。