This site has been permanently archived. This is a static copy provided by the University of Southampton.
% Turney (2001), PMI-IR synonym paper, as exported from the CogPrints archive.
% Retyped from a generic "misc" record to a proceedings paper so that the
% editor, publisher and page range are actually printed by standard styles.
% Names use the unambiguous "Last, First" form; case-sensitive words in the
% title are brace-protected against style-driven sentence casing.
@inproceedings{cogprints1796,
  author        = {Turney, Peter},
  title         = {Mining the {Web} for Synonyms: {PMI-IR} versus {LSA} on {TOEFL}},
  editor        = {De Raedt, Luc and Flach, Peter},
  booktitle     = {Proceedings of the 12th {European} Conference on Machine Learning ({ECML} 2001)},
  series        = {Lecture Notes in Computer Science},
  volume        = {2167},
  publisher     = {Springer-Verlag},
  year          = {2001},
  pages         = {491--502},
  url           = {http://cogprints.org/1796/},
  keywords      = {PMI-IR, synonyms, LSA, LSI, Latent Semantic Analysis, text mining, web mining, TOEFL, mutual information},
  abstract      = {This paper presents a simple unsupervised learning algorithm for recognizing synonyms, based on statistical data acquired by querying a Web search engine. The algorithm, called PMI-IR, uses Pointwise Mutual Information (PMI) and Information Retrieval (IR) to measure the similarity of pairs of words. PMI-IR is empirically evaluated using 80 synonym test questions from the Test of English as a Foreign Language (TOEFL) and 50 synonym test questions from a collection of tests for students of English as a Second Language (ESL). On both tests, the algorithm obtains a score of 74\%. PMI-IR is contrasted with Latent Semantic Analysis (LSA), which achieves a score of 64\% on the same 80 TOEFL questions. The paper discusses potential applications of the new unsupervised learning algorithm and some implications of the results for LSA and LSI (Latent Semantic Indexing).},
  internal-note = {booktitle, series and volume were added during review and are not part of the original CogPrints export; verify against the published volume},
}