# !/usr/bin/env python # -*- encoding: utf-8 -*- import nltk import MeCab import urllib2 from urllib2 import HTTPError from itertools import chain def tfidf(doc,docs): """対象の文書と全文の形態素解析した単語リストを指定すると対象の文書のTF-IDFを返す""" tokens = list(chain.from_iterable(docs)) #flatten A = nltk.TextCollection(docs) token_types = set(tokens) return [{"word":token_type,"tfidf":A.tf_idf(token_type, doc)} for token_type
