% Turney (2001), PMI-IR synonym paper; record originates from the CogPrints archive (see url).
% Typed as @inproceedings because the record carries editor/publisher/pages, which @misc styles drop.
% NOTE(review): booktitle, series and volume were not in the original record; they were filled in
% from the published ECML 2001 proceedings -- confirm against the publisher's page before relying on them.
@inproceedings{cogprints1796,
  author    = {Turney, Peter},
  title     = {Mining the {Web} for Synonyms: {PMI-IR} versus {LSA} on {TOEFL}},
  editor    = {De Raedt, Luc and Flach, Peter},
  booktitle = {Machine Learning: {ECML} 2001},
  series    = {Lecture Notes in Computer Science},
  volume    = {2167},
  publisher = {Springer-Verlag},
  year      = {2001},
  pages     = {491--502},
  url       = {http://cogprints.org/1796/},
  keywords  = {PMI-IR, synonyms, LSA, LSI, Latent Semantic Analysis, text mining, web mining, TOEFL, mutual information},
  abstract  = {This paper presents a simple unsupervised learning algorithm for recognizing synonyms, based on statistical data acquired by querying a Web search engine. The algorithm, called PMI-IR, uses Pointwise Mutual Information (PMI) and Information Retrieval (IR) to measure the similarity of pairs of words. PMI-IR is empirically evaluated using 80 synonym test questions from the Test of English as a Foreign Language (TOEFL) and 50 synonym test questions from a collection of tests for students of English as a Second Language (ESL). On both tests, the algorithm obtains a score of 74\%. PMI-IR is contrasted with Latent Semantic Analysis (LSA), which achieves a score of 64\% on the same 80 TOEFL questions. The paper discusses potential applications of the new unsupervised learning algorithm and some implications of the results for LSA and LSI (Latent Semantic Indexing).},
}