- 论坛徽章:
- 0
|
本帖最后由 L_WC 于 2017-04-19 12:49 编辑
回复 1# zhouzhen1
首先感谢 lz,刚好前段时间有个需求要用分词,当时我是用 Perl 调 Python 的 jieba……如果这个 Perl 版好用,会很有帮助。谢谢!
刚使用了一下,
1. 有个问题想请教一下:extract 的 score 是根据什么计算出来的?
2. 在 $jieba->insert_user_word("中国人"); 之后,只有 {cut_all => 1} 模式才能输出"中国人"这个分词。
# Minimal reproduction: register a user word, then segment a sentence
# and print the resulting words joined by '/'.
use Lingua::ZH::Jieba;
binmode STDOUT, ":utf8";

my $jieba = Lingua::ZH::Jieba->new();
$jieba->insert_user_word("中国人");

my $sentence = "我来到北京清华大学,我是中国人";

# Other cut() invocations, left disabled:
#my $words_cutall = $jieba->cut($sentence, { cut_all => 1 } );
#my $words_cutall = $jieba->cut($sentence);
my $words_cutall = $jieba->cut($sentence, { no_hmm => 1 } );
print join('/', @$words_cutall), "\n";
复制代码
update 一下(2017/4/19):用 lz 的分词包写了一个文本比对的例子:
- [root@L121 tmp]# perl jieba.pl
- Similar is :0.815374248327211
- [root@L121 tmp]# cat jieba.pl
use strict;
use warnings;
use v5.10;    # provides say and the // (defined-or) operator
use Lingua::ZH::Jieba;
use Smart::Comments;
binmode STDOUT, ":utf8";

# Compare two sentences by the cosine similarity of their word-count
# vectors: dot(v1, v2) / (|v1| * |v2|), where each vector holds the number
# of occurrences of every word seen in either sentence.
my ($txt1, $txt2) = ("我来到北京清华大学,我是中国人","我来到北京清华大学,中国人民很友好");

my $jieba = Lingua::ZH::Jieba->new();

# Segment both texts with the cut_all option enabled.
my $words_cutall1 = $jieba->cut( $txt1, { cut_all => 1 } );
my $words_cutall2 = $jieba->cut( $txt2, { cut_all => 1 } );
#print join('/', @$words_cutall1), "\n";
#print join('/', @$words_cutall2), "\n";

# Per-text word counts (word => number of occurrences).
my %h_1 = get_words_count($words_cutall1);
my %h_2 = get_words_count($words_cutall2);

my $sum   = 0;    # dot product of the two count vectors
my $sum_1 = 0;    # squared length of the first vector
my $sum_2 = 0;    # squared length of the second vector

foreach my $k (get_all_words($words_cutall1, $words_cutall2)) {
    # A word missing from one text counts as zero there; reading through
    # // avoids inserting placeholder entries into the count hashes.
    my $count_1 = $h_1{$k} // 0;
    my $count_2 = $h_2{$k} // 0;

    $sum_1 += $count_1 * $count_1;
    $sum_2 += $count_2 * $count_2;
    $sum   += $count_1 * $count_2;
}

# If either text produced no words its vector has zero length; report a
# similarity of 0 instead of dying with "Illegal division by zero".
my $norm = sqrt($sum_1) * sqrt($sum_2);
say "Similar is :", ( $norm ? $sum / $norm : 0 );
# Tally how many times each word occurs in a word list.
#   $word_list - array reference of words
# Returns a flat key/value list (word => count), suitable for assigning
# straight into a hash.
sub get_words_count {
    my ($word_list) = @_;

    my %count_of;
    foreach my $word (@$word_list) {
        $count_of{$word} += 1;
    }

    return %count_of;
}
# Build the combined vocabulary of two word lists.
#   $words1, $words2 - array references of words (undef is treated as an
#                      empty list)
# Returns each distinct word exactly once, in order of first appearance
# ($words1 before $words2), so the result is the same on every run rather
# than following hash-key order. In scalar context the return value is the
# number of distinct words.
sub get_all_words {
    my ($words1, $words2) = @_;

    my %seen;
    # The post-increment is false only the first time a word is met, so
    # grep keeps the first occurrence and drops later duplicates.
    return grep { !$seen{$_}++ } @{ $words1 // [] }, @{ $words2 // [] };
}
复制代码
|
|