User:Kevin Baas/stat generator code/CountWords.java

From Wikipedia, the free encyclopedia
import java.util.*;
import java.io.*;

public class CountWords {
	int total_word_count = 0;
	Hashtable<String,Integer> total_word_counts = new Hashtable<String,Integer>();
	Vector<String> articles = new Vector<String>();
	public String[] sarticles;
	//Vector<Integer> article_totals;
	public String count_path = "";
	Hashtable<String,Integer> article_total_word_counts = new Hashtable<String,Integer>();
	//Hashtable<String,Hashtable<String,Integer>> article_word_counts = new Hashtable<String,Hashtable<String,Integer>>();

	public void countArticleWords(String article, String[] words) {
		articles.add(article);
		int total = 0;

		//get article word count entry
		/*
		Hashtable<String,Integer> article_word_count = article_word_counts.get(article);
		if( article_word_count == null) {
			article_word_count = new Hashtable<String,Integer>();
			article_word_counts.put(article,article_word_count);
		}*/

		Hashtable<String,Integer> article_word_count = new Hashtable<String,Integer>();
		System.out.println();

		//can be improved by adding to total word count in a second pass, from iterating through article hash table
		for( int i = 0; i < words.length; i++) {

			//get word and make sure it only contains alphabetic characters
			if( words[i] == null)
				continue;
			String word = words[i].toLowerCase();
			byte[] test = word.getBytes();
			boolean ok = true;
			if( test.length == 0)
				continue;
			if( test[0] >= '0' && test[0] <= '9') {
				for( int j = 0; j < test.length; j++) {
					if( test[j] < '0' || test[j] > '9') {
						ok = false;
						break;
					}
				}
			} else {
				for( int j = 0; j < test.length; j++) {
					if( test[j] < 'a' || test[j] > 'z') {
						ok = false;
						break;
					}
				}
			}
			if( !ok)
				continue;

			//add to word counts
			total++;
			Integer tot_count = total_word_counts.get(word);
			if( tot_count == null)
				tot_count = 0;
			tot_count++;

			Integer art_count = article_word_count.get(word);
			if( art_count == null)
				art_count = 0;
			art_count++;
			total_word_counts.put(word,tot_count);
			article_word_count.put(word,art_count);
			//System.out.print(word+" ");
		}

		//write word count totals
		article_total_word_counts.put(article,total);
		total_word_count += total;
		System.out.print(" "+total+" ");
		write_counts_as_file(article,article_word_count);
	}
	void write_counts_as_file(String article, Hashtable<String,Integer> article_word_count) {
		File f = new File(count_path+"\\"+article+".csv");
		try {
			FileOutputStream fis = new FileOutputStream(f);
			StringBuffer sb = new StringBuffer();
			Enumeration<String> e = article_word_count.keys();
			while(e.hasMoreElements()) {
				String s = e.nextElement();
				//System.out.print(s+" ");
				Integer i = article_word_count.get(s);
				sb.append((s+","+i+"\n"));
			}
			fis.write(new String(sb).getBytes());
			fis.close();
		} catch (Exception ex) { }
	}

	int[] get_doc_term_counts(String article) {
		Hashtable<String,Integer> local = new Hashtable<String,Integer>();
		int[] counts = new int[words.length];
		System.out.println(count_path+"\\"+article+".csv");
		File f = new File(count_path+"\\"+article+".csv");
		try {
			FileInputStream fis = new FileInputStream(f);
			StringBuffer sb = new StringBuffer();
			//while( fis.available() > 0) {
				byte[] bb = new byte[(int)f.length()];
				fis.read(bb);
				sb.append(new String(bb));
			//}
			fis.close();
			String s = new String(sb);
			String[] lines = s.split("\n");
			//System.out.println("lines - "+lines.length);
			for( int i = 0; i < lines.length; i++) {
				String[] fields = lines[i].split(",");
				//System.out.println(fields[0].trim()+":"+ new Integer(fields[1].trim()));
				local.put(fields[0].trim(), new Integer(fields[1].trim()));
			}
			for( int i = 0; i < words.length; i++) {
				Integer val = local.get(words[i]);
				if( val == null)
					val = 0;
				counts[i] = val;
			}
		} catch (Exception ex) {
			ex.printStackTrace();
		}
		return counts;
	}

	public String[] words = null;
	public int[] values = null;
	public double[] freqs = null;
	public void compressMainWordIndex(int words_to_add) {
		int min = 5;
		Vector<String> total_keys = new Vector<String>();
		Enumeration<String> e = total_word_counts.keys();
		while(e.hasMoreElements()) {
			String s = e.nextElement();
			if(total_word_counts.get(s) >= min)
				total_keys.add(s);
		}
		words = new String[total_keys.size()];
		double adj = words_to_add/words.length;
		values = new int[total_keys.size()];
		freqs = new double[total_keys.size()];
		for( int i = 0; i < words.length; i++) {
			words[i] = total_keys.get(i);
			values[i] = total_word_counts.get(words[i]);
			freqs[i] = (values[i]+adj)/(total_word_count+words_to_add);
		}
		sarticles = new String[articles.size()];
		for( int i = 0; i < articles.size(); i ++)
			sarticles[i] = articles.get(i);
	}
	public double[] getMeanRegressedArticleWordFreq(String article, int words_to_add) {
		return null;
		/*
		double[] art_freqs = new double[freqs.length];
		Hashtable<String,Integer> arthash = article_word_counts.get(article);
		int count = article_total_word_counts.get(article);
		for( int i = 0; i < words.length; i++) {
			Integer f = arthash.get(words[i]);
			if( f == null)
				f = 0;
			art_freqs[i] = (f+freqs[i]*words_to_add)/(count+words_to_add);
		}
		return art_freqs;
		*/
	}
	public double[] getWordSurprise(double[] article_word_frequency) {
		double[] surprise = new double[words.length];
		for( int i = 0; i < words.length; i++)
			surprise[i] = Math.log(article_word_frequency[i]/freqs[i]);
		return surprise;
	}
	public double[] get_doc_term_stats(double doc_term_count, double doc_word_count, double tot_term_count, double tot_word_count) {
		double[] stats = new double[0];

		double jpdocterm = doc_term_count / tot_word_count;//=ptermdoc * pdoc;
		double ptermdoc = doc_term_count / doc_word_count;
		double pdocterm = doc_term_count / tot_term_count;//= ptermdoc * pdoc / pterm;
		double pterm = tot_term_count / tot_word_count;
		double pdoc = doc_word_count / tot_word_count;
		double pre = pdoc*pterm;

/*
		double lpterm = -Math.log(pterm);
		double lpdoc = -Math.log(pdoc);
		double lptermdoc = -Math.log(ptermdoc);
		double lpdocterm = -Math.log(pdocterm);
		double lnpterm = -Math.log(1-pterm);
		double lnpdoc = -Math.log(1-pdoc);
		double lnptermdoc = -Math.log(1-ptermdoc);
		double lnpdocterm = -Math.log(1-pdocterm);
*/
		double mult = 10000000;
		stats = new double[]{
				mult*H(pre,ptermdoc), //0
				mult*H(pre,pterm),
				mult*H(pre,pdocterm),
				mult*H(pre,pdoc),
				mult*H(jpdocterm,ptermdoc), //4
				mult*H(jpdocterm,pterm),
				mult*H(jpdocterm,pdocterm),
				mult*H(jpdocterm,pdoc),
				mult*pdoc*H(1-pterm,1-ptermdoc), //8
				mult*pdoc*H(1-pterm,1-pterm),
				mult*pterm*H(1-pdoc,1-pdocterm),
				mult*pterm*H(1-pdoc,1-pdoc),
				mult*pdoc*H(1-ptermdoc,1-ptermdoc),  //12
				mult*pdoc*H(1-ptermdoc,1-pterm),
				mult*pterm*H(1-pdocterm,1-pdocterm),
				mult*pterm*H(1-pdocterm,1-pdoc),
				mult*H(pre,jpdocterm),  //16
				mult*H(jpdocterm,jpdocterm),

				//H(pjdocterm,pdoc),
				//H(pjdocterm,pterm),
				doc_term_count, //=total term count, doc word count
				((doc_term_count >= 1) ? 1 : 0), //=articles term is in, distinct terms in doc
				0,0,0,0,
				0,0,0,0,
				0,0,0,0,
				0,
				//mult*H(pdocterm,pdocterm),
		};
		for( int i = 0; i < 8; i++)
			stats[8+i]+=stats[i];
		for( int i = 0; i < 8; i++)
			stats[20+i] = stats[2*i+1]-stats[2*i];
		stats[28] = stats[16]-stats[1];
		stats[29] = stats[16]-stats[3];
		stats[30] = stats[17]-stats[5];
		stats[31] = stats[17]-stats[7];
		stats[32] = stats[17]-stats[16];
		return stats;
	}

	public double[][][] get_term_stats(String[] docs, int[] doc_word_counts, int[] tot_term_counts, int tot_word_count) {
		double[][] term_stats = new double[tot_term_counts.length][];
		double[][] doc_stats = new double[docs.length][];
		for( int i = 0; i < tot_term_counts.length; i++) {
			term_stats[i] = new double[]{
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,
			};
		}
		for( int i = 0; i < docs.length; i++) {
			doc_stats[i] = new double[]{
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,0,0,0,0,0,0,0,
					0,
			};
		}
		for( int i = 0; i < docs.length; i++) {
			System.out.print(".");
			if( i % 100 == 0)
				System.out.println();
			int[] doc_term_counts = get_doc_term_counts(docs[i]);
			for( int j = 0; j < doc_term_counts.length; j++) {
				double[] doc_term_stats = this.get_doc_term_stats(doc_term_counts[j], doc_word_counts[i], tot_term_counts[j], tot_word_count);
				for( int k = 0; k < doc_term_stats.length; k++) {
					doc_stats[i][k] += doc_term_stats[k];
				}
				for( int k = 0; k < doc_term_stats.length; k++)
					term_stats[j][k] += doc_term_stats[k];
			}
		}
		return new double[][][]{doc_stats,term_stats};
	}
	double H(double p,double q) {
		if( p == 0 || q == 0 || p != p || q!= q)
			return 0.0;
		return -p * Math.log(q);
	}
	double[][][] getAllStats() {
		String[] sarticles = new String[articles.size()];
		int[] art_word_counts = new int[articles.size()];
		for( int i = 0; i < sarticles.length; i++) {
			sarticles[i] = articles.get(i);
			Integer n = article_total_word_counts.get(sarticles[i]);
			if( n == null)
				n = 0;
			art_word_counts[i] = n;
		}
		return get_term_stats(sarticles,art_word_counts,values,this.total_word_count);
	}
}