采用Nagao算法统计各个子字符串的频次,然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息(内部凝聚度)。
名词解释:
Nagao算法:一种快速的统计文本里所有子字符串频次的算法。详细算法可见http://www.doc88.com/p-664123446503.html
词频:该字符串在文档中出现的次数。出现次数越多越重要。
左右邻个数:文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多,说明字符串成词概率越高。
左右熵:文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标,有一定区别。
交互信息:每次将某字符串分成两部分,左半部分字符串和右半部分字符串,计算其同时出现的概率除于其各自独立出现的概率,最后取所有的划分里面概率最小值。这个值越大,说明字符串内部凝聚度越高,越可能成词。
算法具体流程:
1. 将输入文件逐行读入,按照非汉字([^\u4E00-\u9FA5]+)以及停词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”,
分成一个个字符串,代码如下:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停用词可以修改。
2. 获取所有切分后的字符串的左子串和右子串,分别加入左、右PTable
3. 对PTable排序,并计算LTable。LTable记录的是,排序后的PTable中,下一个子串同上一个子串具有相同字符的数量
4. 遍历PTable和LTable,即可得到所有子字符串的词频、左右邻
5. 根据所有子字符串的词频、左右邻结果,输出字符串的词频、左右邻个数、左右熵、交互信息
1. NagaoAlgorithm.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
|
package com.algo.word; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class NagaoAlgorithm { private int N; private List<String> leftPTable; private int [] leftLTable; private List<String> rightPTable; private int [] rightLTable; private double wordNumber; private Map<String, TFNeighbor> wordTFNeighbor; private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说" ; private NagaoAlgorithm(){ //default N = 5 N = 5 ; leftPTable = new ArrayList<String>(); rightPTable = new ArrayList<String>(); wordTFNeighbor = new HashMap<String, TFNeighbor>(); } //reverse phrase private String reverse(String phrase) { StringBuilder reversePhrase = new StringBuilder(); for ( int i = phrase.length() - 1 ; i >= 0 ; i--) reversePhrase.append(phrase.charAt(i)); return reversePhrase.toString(); } //co-prefix length of s1 and s2 private int coPrefixLength(String s1, String s2){ int coPrefixLength = 0 ; for ( int i = 0 ; i < Math.min(s1.length(), s2.length()); i++){ if (s1.charAt(i) == s2.charAt(i)) coPrefixLength++; else break ; } return coPrefixLength; } //add substring of line to pTable private void addToPTable(String line){ //split line according to consecutive none Chinese character String[] phrases = line.split( "[^\u4E00-\u9FA5]+|[" +stopwords+ "]" ); for (String phrase : phrases){ for ( int i = 0 ; i < phrase.length(); i++) rightPTable.add(phrase.substring(i)); String reversePhrase = reverse(phrase); for ( int i = 0 ; i < reversePhrase.length(); i++) leftPTable.add(reversePhrase.substring(i)); wordNumber += phrase.length(); } } //count lTable private void countLTable(){ Collections.sort(rightPTable); rightLTable = new int [rightPTable.size()]; for ( int i = 1 ; i < rightPTable.size(); i++) rightLTable[i] = coPrefixLength(rightPTable.get(i- 1 ), rightPTable.get(i)); Collections.sort(leftPTable); leftLTable = new int [leftPTable.size()]; for ( int i = 1 ; i < leftPTable.size(); i++) leftLTable[i] = coPrefixLength(leftPTable.get(i- 1 ), leftPTable.get(i)); System.out.println( "Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable" ); } //according to pTable and lTable, count statistical result: TF, neighbor distribution private void countTFNeighbor(){ //get TF and right neighbor for ( int pIndex = 0 ; pIndex < rightPTable.size(); pIndex++){ String phrase = rightPTable.get(pIndex); for ( int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = phrase.substring( 0 , length); TFNeighbor tfNeighbor = new TFNeighbor(); tfNeighbor.incrementTF(); if (phrase.length() > length) tfNeighbor.addToRightNeighbor(phrase.charAt(length)); for ( int lIndex = pIndex+ 1 ; lIndex < rightLTable.length; lIndex++){ if (rightLTable[lIndex] >= length){ tfNeighbor.incrementTF(); String coPhrase = rightPTable.get(lIndex); if (coPhrase.length() > length) tfNeighbor.addToRightNeighbor(coPhrase.charAt(length)); } else break ; } wordTFNeighbor.put(word, tfNeighbor); } } //get left neighbor for ( int pIndex = 0 ; pIndex < leftPTable.size(); pIndex++){ String phrase = leftPTable.get(pIndex); for ( int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = reverse(phrase.substring( 0 , length)); TFNeighbor tfNeighbor = wordTFNeighbor.get(word); if (phrase.length() > length) tfNeighbor.addToLeftNeighbor(phrase.charAt(length)); for ( int lIndex = pIndex + 1 ; lIndex < leftLTable.length; lIndex++){ if (leftLTable[lIndex] >= length){ String coPhrase = leftPTable.get(lIndex); if (coPhrase.length() > length) tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length)); } else break ; } } } System.out.println( "Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor" ); } //according to wordTFNeighbor, count MI of word private double countMI(String word){ if (word.length() <= 1 ) return 0 ; double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber; List<Double> mi = new ArrayList<Double>(word.length()); for ( int pos = 1 ; pos < word.length(); pos++){ String leftPart = word.substring( 0 , pos); String rightPart = word.substring(pos); double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber; double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber; mi.add(coProbability/(leftProbability*rightProbability)); } return Collections.min(mi); } //save TF, (left and right) neighbor number, neighbor entropy, mutual information private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){ try { //read stop words file Set<String> stopWords = new HashSet<String>(); BufferedReader br = new BufferedReader( new FileReader(stopList)); String line; while ((line = br.readLine()) != null ){ if (line.length() > 1 ) stopWords.add(line); } br.close(); //output words TF, neighbor info, MI BufferedWriter bw = new BufferedWriter( new FileWriter(out)); for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){ if ( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue ; TFNeighbor tfNeighbor = entry.getValue(); int tf, leftNeighborNumber, rightNeighborNumber; double mi; tf = tfNeighbor.getTF(); leftNeighborNumber = tfNeighbor.getLeftNeighborNumber(); rightNeighborNumber = tfNeighbor.getRightNeighborNumber(); mi = countMI(entry.getKey()); if (tf > Integer.parseInt(threshold[ 0 ]) && leftNeighborNumber > Integer.parseInt(threshold[ 1 ]) && rightNeighborNumber > Integer.parseInt(threshold[ 2 ]) && mi > Integer.parseInt(threshold[ 3 ]) ){ StringBuilder sb = new StringBuilder(); sb.append(entry.getKey()); sb.append( "," ).append(tf); sb.append( "," ).append(leftNeighborNumber); sb.append( "," ).append(rightNeighborNumber); sb.append( "," ).append(tfNeighbor.getLeftNeighborEntropy()); sb.append( "," ).append(tfNeighbor.getRightNeighborEntropy()); sb.append( "," ).append(mi).append( "\n" ); bw.write(sb.toString()); } } bw.close(); } catch (IOException e) { throw new RuntimeException(e); } System.out.println( "Info: [Nagao Algorithm Step 4]: having saved to file" ); } //apply nagao algorithm to input file public static void applyNagao(String[] inputs, String out, String stopList){ NagaoAlgorithm nagao = new NagaoAlgorithm(); //step 1: add phrases to PTable String line; for (String in : inputs){ try { BufferedReader br = new BufferedReader( new FileReader(in)); while ((line = br.readLine()) != null ){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println( "Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable" ); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5" .split( "," )); } public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){ NagaoAlgorithm nagao = new NagaoAlgorithm(); nagao.setN(n); String[] threshold = filter.split( "," ); if (threshold.length != 4 ){ System.out.println( "ERROR: filter must have 4 numbers, seperated with ',' " ); return ; } //step 1: add phrases to PTable String line; for (String in : inputs){ try { BufferedReader br = new BufferedReader( new FileReader(in)); while ((line = br.readLine()) != null ){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println( "Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable" ); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, threshold); } private void setN( int n){ N = n; } public static void main(String[] args) { String[] ins = { "E://test//ganfen.txt" }; applyNagao(ins, "E://test//out.txt" , "E://test//stoplist.txt" ); } } |
2. TFNeighbor.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
package com.algo.word; import java.util.HashMap; import java.util.Map; public class TFNeighbor { private int tf; private Map<Character, Integer> leftNeighbor; private Map<Character, Integer> rightNeighbor; TFNeighbor(){ leftNeighbor = new HashMap<Character, Integer>(); rightNeighbor = new HashMap<Character, Integer>(); } //add word to leftNeighbor public void addToLeftNeighbor( char word){ //leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0)); Integer number = leftNeighbor.get(word); leftNeighbor.put(word, number == null ? 1 : 1 +number); } //add word to rightNeighbor public void addToRightNeighbor( char word){ //rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0)); Integer number = rightNeighbor.get(word); rightNeighbor.put(word, number == null ? 1 : 1 +number); } //increment tf public void incrementTF(){ tf++; } public int getLeftNeighborNumber(){ return leftNeighbor.size(); } public int getRightNeighborNumber(){ return rightNeighbor.size(); } public double getLeftNeighborEntropy(){ double entropy = 0 ; int sum = 0 ; for ( int number : leftNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if (sum == 0 ) return 0 ; return Math.log(sum) - entropy/sum; } public double getRightNeighborEntropy(){ double entropy = 0 ; int sum = 0 ; for ( int number : rightNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if (sum == 0 ) return 0 ; return Math.log(sum) - entropy/sum; } public int getTF(){ return tf; } } |
3. Main.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
package com.algo.word; public class Main { public static void main(String[] args) { //if 3 arguments, first argument is input files splitting with ',' //second argument is output file //output 7 columns split with ',' , like below: //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information //third argument is stop words list if (args.length == 3 ) NagaoAlgorithm.applyNagao(args[ 0 ].split( "," ), args[ 1 ], args[ 2 ]); //if 4 arguments, forth argument is the NGram parameter N //5th argument is threshold of output words, default is "20,3,3,5" //output TF > 20 && (left | right) neighbor number > 3 && MI > 5 else if (args.length == 5 ) NagaoAlgorithm.applyNagao(args[ 0 ].split( "," ), args[ 1 ], args[ 2 ], Integer.parseInt(args[ 3 ]), args[ 4 ]); } } |
以上所述就是本文的全部内容了,希望大家能够喜欢。