Program code for building a TDM (term-document matrix)

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
*
* @author shakthydoss
*/
public class ReadingMultipleFile {
  public static  List keywordList = new  ArrayList();
  public static  int [][] countMatrix;
  static String path = "E:Colloge  studiesSEM – 7Text MiningAssignmentsAssignment -7Corpus2";
//  static String path = "Corpus2";
  public static File folder = new File(path);
  public static File[] listOfFiles = folder.listFiles();

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {

        String s ,temp;
        StringTokenizer st ;
        int countKeyword =0;

        Mystemmer stem = new Mystemmer();
       //  List keywordList = new  ArrayList();

        StopWordList swl = new StopWordList();

        BufferedWriter bw = new BufferedWriter(new FileWriter("E:Colloge  studiesSEM – 7Text MiningAssignmentsAssignment -7keywords.txt"));

          if (listOfFiles.length > 0)
          {
               for (int i = 0; i < listOfFiles.length; i++)
               {
                 if (listOfFiles[i].isFile())
                 {
                    String p1 = listOfFiles[i].getName();
                  //  System.out.println("["+i+"] " + p1);
                   BufferedReader br = new BufferedReader(new FileReader(listOfFiles[i].getPath()));

                    while((s=br.readLine())!=null)
                    {
                       st = new StringTokenizer(s, " ", false);
                       while(st.hasMoreTokens())
                       {
                           temp = st.nextToken();
                           if(swl.stopWord.contains(temp))
                           {
                               if(st.hasMoreTokens())
                               st.nextToken();
                               //System.out.println(temp);
                           }
                           else
                           {
                               if(temp.length() <=3||temp.length()>=15)
                               {
                                  if(st.hasMoreTokens())
                                  st.nextToken();
                               }
                               else
                               {
                                   temp = stem.DoSuffixStremmer(temp);
                                   // put the stemmer here
                                   if(keywordList.contains(temp)==false) //checking in keyword_array
                                   {
                                    keywordList.add(temp); // adding keyword to keyword_array
                                    bw.write(temp);
                                    countKeyword++;
                                    bw.newLine();
                                   }
                               }
                           }
                       } // while ends
                    } // while ends
                 }
              }
               bw.close();
         }
         System.out.println("");
         System.out.println("No of Documents – "+listOfFiles.length);
         System.out.println("No of keywords – "+countKeyword);
         System.out.println("");
         System.out.println("");

         countMatrix = new int[listOfFiles.length][keywordList.size()];

        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                countMatrix[i][j] =0;
            }
        }

        

           if (listOfFiles.length > 0)
          {
               for (int i = 0; i < listOfFiles.length; i++)
               {
                 if (listOfFiles[i].isFile())
                 {
                    String p1 = listOfFiles[i].getName();
                  //System.out.println("["+i+"] " + p1);
                   BufferedReader br = new BufferedReader(new FileReader(listOfFiles[i].getPath()));

                    while((s=br.readLine())!=null)
                    {
                       st = new StringTokenizer(s, " ", false);
                       while(st.hasMoreTokens())
                       {
                           temp = st.nextToken();
                           if(swl.stopWord.contains(temp))
                           {
                               if(st.hasMoreTokens())
                               st.nextToken();
                               //System.out.println(temp);
                           }
                           else
                           {
                               if(temp.length() <=3||temp.length()>=15)
                               {
                                  if(st.hasMoreTokens())
                                  st.nextToken();
                               }
                               else
                               {
                                   // put stemmer here
                                   temp = stem.DoSuffixStremmer(temp);
                                    if(keywordList.contains(temp)==true) // checking the keyword in keyword_array
                                   {
                                     //generating count matrix
                                    countMatrix[i][keywordList.indexOf(temp)] = countMatrix[i][keywordList.indexOf(temp)] + 1;
                                   }
                               }
                           }

                       } // while ends
                    } // while ends
                 }
              }
               bw.close();
            // System.out.println("no of keywords – "+ii);
         }

         System.out.println("************************** Count Matrix *************************");
         System.out.println("");

          for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(","+countMatrix[i][j]);
            }
             System.out.println(" ");
        }
          TDIDF_Matrix tM = new TDIDF_Matrix();
          tM.compute_tottal_no_words_in_doc();
          tM.compute_num_of_doc_in_which_word_i_appears();
          tM.compute_TDIDF(listOfFiles.length, keywordList.size());

       }// main closing
} // class closing

 

 

//  TDIDF_Matrix –> to build weighted matrix from count matrix

import java.text.DecimalFormat;

public class TDIDF_Matrix  extends ReadingMultipleFile{
public static double[][] tdidf;
int[] tottal_no_words_in_doc;
int[] num_of_doc_in_which_word_i_appears;
  DecimalFormat twoDForm = new DecimalFormat("0.00000");
    public TDIDF_Matrix() {
         tdidf = new double[listOfFiles.length][keywordList.size()];
         tottal_no_words_in_doc = new int[listOfFiles.length];
         num_of_doc_in_which_word_i_appears = new int[keywordList.size()];
    }

    public void compute_tottal_no_words_in_doc()
    {
        int sum = 0;
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++)
            {
                if((countMatrix[i][j])>0)
                 {
                     sum = sum+1;
                 }
            }
            tottal_no_words_in_doc[i]=sum;
            sum =0;
        }

        for (int i = 0; i < listOfFiles.length; i++) {
            System.out.println("Total no of words in document : "+i+" –> "+tottal_no_words_in_doc[i]);
        }
    }

    public void compute_num_of_doc_in_which_word_i_appears()
    {
        int sum = 0;
        for (int i = 0; i < keywordList.size(); i++) {
            for (int j = 0; j < listOfFiles.length; j++)
            {
                if((countMatrix[j][i])>0)
                 {
                     sum = sum+1;
                 }
            }
            num_of_doc_in_which_word_i_appears[i] = sum;
            sum = 0;
        }

        for (int i = 0; i < keywordList.size(); i++) {
            System.out.println("word : "+i +" occured in "+num_of_doc_in_which_word_i_appears[i]+" documents ");
        }
    }

    public void compute_TDIDF(int x , int y)
    {

        //initializing the tdidf
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size() ; j++) {
                tdidf[i][j]=0.00000;
            }

        }

       // ReadingMultipleFile re = new ReadingMultipleFile();
        for (int i = 0; i < listOfFiles.length; i++)
        {
            for (int j = 0; j < keywordList.size(); j++)
            {
               tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( 50 / num_of_doc_in_which_word_i_appears[j])))).doubleValue();
            }//for closing
        } // for closing

        System.out.println("");
        System.out.println(" ************** TFIDF Matrix **************");
        System.out.println("");
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(tdidf[i][j]+"  ,  ");
            }
            System.out.println("");
        }

        //computeSVD();
    }

    public void computeSVD()
    {
        System.out.println("");
        System.out.println(" ************** TFIDF Matrix **************");
        System.out.println("");
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(tdidf[i][j]+"  ,  ");
            }
            System.out.println("");
        }
    }

}

Click here to see my simple stemmer implementation


9 Comments

  1. sita2901 wrote
    at 4:17 AM - 1st August 2011 Permalink

    ……
    tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( 50 / num_of_doc_in_which_word_i_appears[j])))).doubleValue();
    …..
    sir, where 1 (number in “1+tottal_no_words_in_doc[i]”) comes from?? why it added by 1??, thanks before..

  2. shakthydoss wrote
    at 8:01 AM - 1st August 2011 Permalink

    Hi sita ,

    It is just for my convenience… so that I can get the value that I want.
    You can omit the 1 and continue with the native formula that I mentioned in the post.

  3. sita2901 wrote
    at 1:24 AM - 2nd August 2011 Permalink

    hhmmm.. i see,..
    oke sir, thanks a lot

  4. alinux8per wrote
    at 9:38 AM - 6th October 2011 Permalink

    hello shakthydoss;
    i have this error :

    Exception in thread “main” java.lang.NumberFormatException: For input string: “20000,00000”
    at sun.misc.FloatingDecimal.readJavaFormatString(FloatingDecimal.java:1242)
    at java.lang.Double.valueOf(Double.java:492)
    at lsimodelimplementation.TDIDF_Matrix.compute_TDIDF(TDIDF_Matrix.java:74)
    at lsimodelimplementation.ReadingMultipleFile.main(ReadingMultipleFile.java:182)
    Java Result: 1

    74 ligne is : tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( listOfFiles.length / num_of_doc_in_which_word_i_appears[j])))).doubleValue();

    and 182 ligne is : tM.compute_TDIDF(listOfFiles.length, keywordList.size());

    can you help me please
    thanks before.

  5. shakthydoss wrote
    at 3:38 AM - 8th October 2011 Permalink

    I guess the input string should be 200000000 instead of 20000,00000

  6. SUNANDA DAS wrote
    at 11:02 AM - 20th July 2012 Permalink

    Hi, I have a code in c. It runs for 10 input files but not for 1000 files for generating TDM.Can u help me to remove the error and can u give me the exe file for ur TDM?

  7. DUR4NGO wrote
    at 11:28 PM - 22nd April 2014 Permalink

    Can anyone give me the file “KeyWords.txt”, please?

  8. SOUHILA SOUSOU wrote
    at 9:19 AM - 23rd June 2015 Permalink

    plaise help me,

    I must implimenter the LSI model but I can not get a matrix term document

  9. Aditi Shetkar wrote
    at 5:31 PM - 29th October 2015 Permalink

    how to apply phrase extraction to above program to get common phrases from text file?

Post a Comment

Your email is never published nor shared. Required fields are marked *