White scenery @showyou, hatena

If you have any comments, you may also send twitter @shsub or @showyou.

libsvm続き

昨日の続き。もはやlibsvm関係なくなってますが。
昨日は入力にわかち書きが必要だったので、普通の文をそのまま分割できるようにした。
一応各メソッドに手動でテストを書いたけど、うまいテストコードの書き方はないかなぁ。

in

# あとで1ってラベル付けてる
textOhayou = [
u"おはようございます",
u"おはようございなのよ",
u"おはようー"
]

# あとで-1ってラベル付けてる
textTadaima = [
u"ただいまなのよ",
]

testTexts = [
u"おはようござい",
u"ただいまー",
u"おはようです",
]

out

[ [ [1, 1, 1], 1], [[1, 1, 0, 1, 1, 1], 1], [[1, 0, 0, 0, 0, 0, 1], 1], [[0, 0, 0, 1, 1, 1, 0, 1], -1]]
[ [ [1, 1, 0, 0, 0, 0, 0, 0], 0], [[0, 0, 0, 0, 0, 0, 1, 1], 0], [[1, 0, 0, 0, 0, 0, 0, 0, 1], 0]]
([1, 1, 1, -1], [[1, 1, 1], [1, 1, 0, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 1, 1, 0, 1]])
.*
optimization finished, #iter = 7
nu = 0.035728
obj = -0.714286, rho = -0.142782
nSV = 3, nBSV = 0
Total nSV = 3
[ [1, 1, 0, 0, 0, 0, 0, 0], 0] : 1.0
[ [0, 0, 0, 0, 0, 0, 1, 1], 0] : -1.0
[ [1, 0, 0, 0, 0, 0, 0, 0, 1], 0] : 1.0

ソース

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from svm import *
import mecab

mecab_path = "/usr/lib/libmecab.so.1"

def split_text(class_id, text):
    """Tokenize *text* with MeCab and pair the word list with *class_id*.

    in:
        1, u"おはようございます" (passed as a utf-8 encoded str)
    out:
        [ [u"おはよう", u"ござい", u"ます"], 1]
    """
    lines = mecab.sparse_all(text, mecab_path).split("\n")
    # Drop the last two lines: MeCab's "EOS" marker and the trailing
    # empty string produced by the final newline.  Each remaining line
    # is "surface\tfeatures"; keep only the surface form.
    words = [line.split("\t")[0] for line in lines[:-2]]
    return [words, class_id]


def split_texts( class_id, texts ):
    """Tokenize every unicode string in *texts*, labelling each with *class_id*.

    in:
        1, [
            u"おはようございます",
            u"おはようなのよ",
        ]
    out:
        [
            [ [u"おはよう", u"ございます"], 1],
            [ [u"おはよう", u"なのよ"], 1],
        ]
    """
    # split_text expects a byte string, so encode each unicode text first.
    return [split_text(class_id, text.encode("utf-8")) for text in texts]


def get_list_index( data, list ):
    """Return the index of *data* in *list*, appending it first when absent.

    Mutates *list* in place when *data* is not already present, so the
    caller's vocabulary grows as new words are seen.
    (The parameter name shadows the builtin ``list``; it is kept
    unchanged for backward compatibility with existing callers.)
    """
    # EAFP: a single C-level search instead of a manual index loop.
    # The original used a for/else where the else always ran on a full
    # pass (return, not break, exited the loop) — correct but confusing.
    try:
        return list.index(data)
    except ValueError:
        # Unseen word: grow the vocabulary and return its new slot.
        list.append(data)
        return len(list) - 1


def text2vector( textArrays, wordList):
    """Convert tokenized, labelled texts into binary bag-of-words vectors.

    The vocabulary *wordList* is grown IN PLACE as unseen words appear,
    so vectors produced later may be longer than earlier ones: each
    vector's length equals the vocabulary size at the moment it was
    finished.  (This ragged output matches the sample run shown above.)

    in:
        textArrays = [ [tokens, class_id], ... ]
            e.g. [ [ [u"おはよう", u"ござい", u"ます"], 1], ... ]
        wordList   = current vocabulary (list of words); mutated in place

    out:
        (out_data, wordList) where
        out_data = [ [vector, class_id], ... ] and
        vector[i] == 1 iff wordList[i] occurred in that text.

    Note: the previous docstring's "out" example showed test_data as bare
    vectors; in fact every row keeps its [vector, class_id] pairing.
    """
    out_data = []
    for oneText in textArrays:
        # One slot per word currently known; flip to 1 on occurrence.
        out_one_data = [0 for _ in wordList]
        for t in oneText[0]:
            i = get_list_index(t, wordList)
            if i < len(out_one_data):
                out_one_data[i] = 1
            else:
                # get_list_index just appended a new vocabulary entry,
                # so extend this vector to cover it.
                out_one_data.append(1)
        out_data.append([out_one_data, oneText[1]])

    # Debug dump.  Parenthesized single-argument form prints identically
    # under Python 2 and Python 3 (the old "print out_data" was Py2-only).
    print(out_data)
    return out_data, wordList


def learn_and_predict(larning_data, test_data):

    new_learning_class = []
    new_learning_data = []
    for ld in learning_data:
        new_learning_class.append(ld[1])
        new_learning_data.append(ld[0])
    new_learning_data_set = (new_learning_class, new_learning_data)
    print new_learning_data_set

    prob = svm_problem( *new_learning_data_set )
    param = svm_parameter(kernel_type = LINEAR, C = 10)
    m = svm_model(prob, param)
    for t in test_data:
        print t, ":", m.predict(t[0])


if __name__ == "__main__":

    # Training samples: greetings -> class +1, "tadaima" -> class -1.
    positive_texts = [
        u"おはようございます",
        u"おはようございなのよ",
        u"おはようー",
    ]
    negative_texts = [
        u"ただいまなのよ",
    ]

    # Texts to classify; their class id (0) is just a placeholder.
    unlabeled_texts = [
        u"おはようござい",
        u"ただいまー",
        u"おはようです",
    ]

    samples = split_texts(1, positive_texts) + split_texts(-1, negative_texts)
    unlabeled = split_texts(0, unlabeled_texts)

    # Build the vocabulary from the training texts first, then reuse the
    # same (possibly grown) vocabulary for the test texts.
    vocabulary = []
    learning_data, vocabulary = text2vector(samples, vocabulary)
    test_data, vocabulary = text2vector(unlabeled, vocabulary)
    learn_and_predict(learning_data, test_data)

mecab.py

# -*- coding: utf-8 -*-
from ctypes import *

def sparse_all(s,mecabpath):
    """Run MeCab's full parse over *s* and return the result as a string.

    s:         utf-8 encoded text to analyze
    mecabpath: path to the libmecab shared library
    Returns one "surface\\tfeatures" line per morpheme, ending with "EOS".
    """
    # Load the MeCab shared library via ctypes.
    lib = cdll.LoadLibrary(mecabpath)

    # Bug fix: ctypes assumes C ``int`` for undeclared return/argument
    # types, which truncates pointers on 64-bit platforms.  Declare the
    # pointer-carrying signatures before the first call.
    lib.mecab_new.restype = c_void_p
    lib.mecab_sparse_tostr.restype = c_char_p
    lib.mecab_sparse_tostr.argtypes = [c_void_p, c_char_p]
    lib.mecab_destroy.argtypes = [c_void_p]

    # argv for the analyzer (no extra option -> plain analysis).
    argc = c_int(2)
    argv = (c_char_p * 2)("mecab", "")

    # Create the tagger object.
    tagger = lib.mecab_new(argc, argv)

    # Parse the string; with restype=c_char_p the result is a str directly.
    ret = lib.mecab_sparse_tostr(tagger, s)

    # Release the tagger when done (the result was copied out above).
    lib.mecab_destroy(tagger)
    return ret

これでとりあえずは自分で集めたテキストを分類しやすそうだな。