回复: 用java编写的源代码--制作词表
// here is a copy based on Oliver Mason (2000)
// Corpus.java provides the GUI
// FileTokeniser splits a file into linguistic units (tokens)
// FreqList creates a frequency list
//Corpus.java
import java.awt.BorderLayout;
import java.awt.Font;
import java.awt.event.ActionEvent;
import java.io.*;
import java.util.*;
import javax.swing.AbstractAction;
import javax.swing.JFrame;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import corpus.*;
/**
 * Small Swing front end for building a word-frequency list from a corpus file.
 *
 * <p>The "BrownCorpus" menu item tokenises {@link #corpusfile} with
 * {@code FileTokeniser}, counts the tokens in a {@code FreqList}, shows the
 * result in the text area and saves it to {@code corpusfile + ".frq"}.
 * All other menu items are placeholders: no action is attached to them.
 */
public class Corpus {
    /** Path of the corpus text file that is tokenised. */
    protected String corpusfile = "browncorpus.txt";
    /** Platform line separator; assigned when the word-list action runs. */
    protected String nl;

    /** Action behind the "BrownCorpus" menu item: build, display and save the list. */
    private class SwingAction extends AbstractAction {
        SwingAction() {
            super("New Action", null);
        }

        public void actionPerformed(ActionEvent e) {
            try {
                FreqList flist = buildFreqList();
                nl = System.getProperty("line.separator");
                showAndSave(flist);
            } catch (IOException ex) {
                // Previously swallowed silently; report it so a missing or
                // unreadable corpus file does not look like an empty result.
                System.err.println("Corpus: " + ex);
            }
        }
    }

    private SwingAction action = new SwingAction();
    private JTextArea textArea;
    private JFrame frame;

    /**
     * Launch the application
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        try {
            Corpus window = new Corpus();
            window.frame.setVisible(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Create the application
     */
    public Corpus() {
        initialize();
    }

    /** Tokenises the corpus file and counts every token; the tokeniser is always closed. */
    private FreqList buildFreqList() throws IOException {
        FreqList flist = new FreqList();
        FileTokeniser ft = new FileTokeniser(corpusfile);
        try {
            while (ft.hasMoreTokens()) {
                flist.add(ft.getNextToken());
            }
        } finally {
            ft.close();
        }
        return flist;
    }

    /** Appends the list to the text area and writes it to {@code corpusfile + ".frq"}. */
    private void showAndSave(FreqList flist) throws IOException {
        String outfile = corpusfile + ".frq";
        PrintWriter pw = new PrintWriter(new FileWriter(outfile));
        try {
            Iterator words = flist.iterator();
            while (words.hasNext()) {
                String word = (String) words.next();
                int freq = flist.getFreq(word);
                textArea.append(word + "\t\t" + freq + nl);
            }
            flist.save(pw);
            // PrintWriter never throws on write failure; it has to be asked.
            if (pw.checkError()) {
                System.err.println("Corpus: error writing " + outfile);
            }
        } finally {
            pw.close();
        }
    }

    /** Creates a menu with the given title and adds it to the menu bar. */
    private static JMenu addMenu(JMenuBar bar, String title) {
        JMenu menu = new JMenu();
        menu.setText(title);
        bar.add(menu);
        return menu;
    }

    /** Creates a menu item with the given title and adds it to the menu. */
    private static JMenuItem addItem(JMenu menu, String title) {
        JMenuItem item = new JMenuItem();
        item.setText(title);
        menu.add(item);
        return item;
    }

    /**
     * Initialize the contents of the frame
     */
    protected void initialize() {
        frame = new JFrame();
        frame.getContentPane().setLayout(new BorderLayout());
        frame.setBounds(100, 100, 500, 375);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        final JScrollPane scrollPane = new JScrollPane();
        frame.getContentPane().add(scrollPane);

        textArea = new JTextArea();
        textArea.setEditable(false);
        textArea.setFont(new Font("Sans", Font.PLAIN, 14));
        scrollPane.setViewportView(textArea);

        final JMenuBar menuBar = new JMenuBar();
        frame.setJMenuBar(menuBar);

        // WordList menu
        final JMenu corpusMenu = addMenu(menuBar, "WordList");
        final JMenuItem openMenuItem = new JMenuItem();
        openMenuItem.setAction(action);
        // setAction installs the action's own name, so the label is set afterwards.
        openMenuItem.setText("BrownCorpus");
        corpusMenu.add(openMenuItem);
        corpusMenu.addSeparator();

        final JMenu sortMenu = new JMenu();
        sortMenu.setText("Sort");
        corpusMenu.add(sortMenu);
        addItem(sortMenu, "Ascend");
        addItem(sortMenu, "Descend");

        // Concordance menu
        final JMenu concordanceMenu = addMenu(menuBar, "Concordance");
        addItem(concordanceMenu, "Find...");
        addItem(concordanceMenu, "Preferences");

        // Collocation menu
        final JMenu collocationMenu = addMenu(menuBar, "Collocation");
        addItem(collocationMenu, "MI Score");
        addItem(collocationMenu, "T Score");
        addItem(collocationMenu, "Z Score");
        addItem(collocationMenu, "Preferences");

        // Help menu
        final JMenu helpMenu = addMenu(menuBar, "Help");
        addItem(helpMenu, "About");
    }
}
// FileTokeniser.java
package corpus;
import java.io.IOException;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.StringTokenizer;
public class FileTokeniser
{
private BufferedReader input=null;
private StringTokenizer tokeniser =null;
private String nextToken=null;
/**constructor*/
public
FileTokeniser(String infile)
throws IOException{
input= new BufferedReader(new FileReader(infile));
do {
String line=input.readLine();
tokeniser=new StringTokenizer(PreTokeniser.tokenise(line));
}while (!tokeniser.hasMoreTokens());
nextToken=tokeniser.nextToken();
}
/**check if more tokens are available*/
public boolean
hasMoreTokens(){
if (nextToken==null){
return(false);
}else{
return(true);
}}
/** read the next token*/
public String
getNextToken(){
String retval=nextToken;
if (tokeniser.hasMoreTokens()){//more available on this line
nextToken=tokeniser.nextToken();
}else{//read the next line
try{
nextToken=null;
String line=input.readLine();
while (line!=null && nextToken==null){
if (line!=null){
tokeniser=new StringTokenizer(PreTokeniser.tokenise(line));
if (tokeniser.hasMoreTokens()){
nextToken=tokeniser.nextToken();
}else{
line=input.readLine();
}
}else{
input.close();
}
}
}catch (IOException exc){
System.err.println("FileTokeniser: "+exc);
}
}
return(retval);
}
/**close the input file*/
public void
close()
throws IOException{
input.close();
input=null;
}
//FreqList.java
package corpus;
import java.util.*;
import java.io.*;
/**
 * An implementation of a frequency list: maps each word to the number of
 * times it has been counted and keeps the running total of all counts.
 *
 * <p>Counts are stored as one-element {@code int[]} arrays so a count can be
 * incremented in place without replacing the map entry.
 */
public class FreqList {
    /** word (String) -> one-element int[] holding its count */
    private Map storage;
    /** sum of all counts in storage */
    private int total = 0;

    /** Creates an empty frequency list. */
    public FreqList() {
        storage = new HashMap();
    }

    /**
     * Counts one occurrence of a word.
     *
     * @param word the word that was seen
     */
    public void add(String word) {
        counterFor(word)[0]++;
        total++;
    }

    /**
     * Returns how often a word has been counted.
     *
     * @param word the word to look up
     * @return its frequency, or 0 if the word is not in the list
     */
    public int getFreq(String word) {
        int[] counter = (int[]) storage.get(word);
        return counter == null ? 0 : counter[0];
    }

    /**
     * Returns the total number of counted occurrences over all words.
     *
     * @return the sum of all frequencies
     */
    public int getN() {
        return total;
    }

    /**
     * Returns an iterator over the words (Strings) in the list, in the
     * iteration order of the underlying map's key set.
     *
     * @return iterator over the stored words
     */
    public Iterator iterator() {
        return storage.keySet().iterator();
    }

    /**
     * Writes the list as one {@code word frequency} line per entry.
     *
     * @param pw the writer to print to; it is neither flushed nor closed here
     */
    public void save(PrintWriter pw) {
        Iterator entries = storage.entrySet().iterator();
        while (entries.hasNext()) {
            Map.Entry entry = (Map.Entry) entries.next();
            int[] counter = (int[]) entry.getValue();
            pw.println(entry.getKey() + " " + counter[0]);
        }
    }

    /**
     * Reads {@code word frequency} lines and adds them to this list.
     * The last token of a line is the frequency; all tokens before it form
     * the word, joined by single spaces. Frequencies of words that are
     * already present are added up. Lines with fewer than two tokens, or
     * whose last token is not an integer, are reported on System.err and
     * skipped.
     *
     * @param br the reader to load from; it is read to the end but not closed
     * @throws IOException if reading from {@code br} fails
     */
    public void load(BufferedReader br) throws IOException {
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            StringTokenizer st = new StringTokenizer(line);
            if (st.countTokens() < 2) {
                System.err.println("Insufficient line: '" + line + " '");
                continue;
            }
            StringBuffer word = new StringBuffer(st.nextToken());
            // Stop with exactly one token left: that one is the frequency.
            // (The earlier "> 2" stopped one token early and parsed part of
            // the word as the number.)
            while (st.countTokens() > 1) {
                word.append(' ');
                word.append(st.nextToken());
            }
            String number = st.nextToken();
            int freq;
            try {
                freq = Integer.parseInt(number);
            } catch (NumberFormatException exc) {
                System.err.println("Invalid frequency '" + number + "' in line: '" + line + "'");
                continue;
            }
            insert(word.toString(), freq);
        }
    }

    /** Adds {@code freq} occurrences of {@code word} to the list and to the total. */
    private void insert(String word, int freq) {
        counterFor(word)[0] += freq;
        total += freq;
    }

    /** Returns the counter cell for a word, creating a zeroed one if absent. */
    private int[] counterFor(String word) {
        int[] counter = (int[]) storage.get(word);
        if (counter == null) {
            counter = new int[1];
            storage.put(word, counter);
        }
        return counter;
    }
}
//the end