{ "eprintid": "156", "rev_number": "5", "eprint_status": "archive", "dir": "disk0/00/00/01/56", "datestamp": "2009-04-06 19:13:27", "lastmod": "2009-04-14 04:37:12", "status_changed": "2009-04-06 19:13:27", "type": "conference_item", "metadata_visibility": "show", "item_issues_count": "0", "creators": { "item": { "name": { "family": "Sato", "given": "Satoshi" }, "id": "", "country": "", "affiliation": "Nagoya University" } }, "track": "Poster Session", "title": "Crawling English-Japanese Person-Name Transliterations from the Web", "ispublished": "pub", "full_text_status": "public", "pres_type": "poster", "abstract": "Automatic compilation of lexicon is a dream of lexicon compilers as well as lexicon users. This paper proposes a system that crawls English-Japanese person-name transliterations from the Web, which works as a back-end collector for automatic compilation of bilingual person-name lexicon. Our crawler collected 561K transliterations in five months. From them, an English-Japanese person-name lexicon with 406K entries has been compiled by an automatic post processing. This lexicon is much larger than other similar resources including English-Japanese lexicon of HeiNER obtained from Wikipedia. Names written in Latin script are transliterated into one in Katakana script according to their pronunciations. English-Japanese transliteration of person name is difficult because of several reasons, such as limited coverage of existing bilingual lexicons, non-English (e.g., French and German) person names appeared in English texts, and spelling variants in Katakana script. 2. There is a possibility that we can compile a large English-Japanese person-name lexicon from the Web, because a lot of transliteration instances of person names exist on the Web. Actually, human translators use the Web as a virtual low-quality bilingual lexicon. 3. New person names are produced; new person-name transliterations are produced every day. \nHuman translators hope for frequent updates of the bilingual person-name lexicon. This paper proposes a system that crawls English-Japanese person-name transliterations from the Web, which works as a back-end collector for automatic lexicon compilation. From collected transliterations, a bilingual person-name lexicon is produced by an automatic post processing. This attempt of automatic lexicon compilation can be viewed as a conversion from a virtual low-quality bilingual lexicon (i.e., the Web) to a real high-quality bilingual lexicon.", "date": "2009-04", "pagerange": "1151-1151", "event_title": "18th International World Wide Web Conference", "event_location": "Madrid, Spain", "event_dates": "April 20th-24th, 2009", "event_type": "conference", "refereed": "TRUE", "documents": { "document": { "docid": "156", "rev_number": "4", "eprintid": "156", "pos": "1", "format": "application/pdf", "language": "en", "security": "public", "main": "p1151.pdf", "content": "published", "files": { "file": { "filename": "p1151.pdf", "filesize": "595016", "url": "http://www2009.eprints.org/156/1/p1151.pdf" } } } } }