diff --git "a/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" "b/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" new file mode 100644 index 0000000..cc4f56a --- /dev/null +++ "b/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" @@ -0,0 +1,446 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 8.4 텍스트 분류 실습 - 20 뉴스그룹 분류" + ], + "metadata": { + "id": "GOvjlMRqZI11" + } + }, + { + "cell_type": "markdown", + "source": [ + "텍스트 정규화" + ], + "metadata": { + "id": "2wCY1627ZOH5" + } + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "rMIlisWbW-El" + }, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_20newsgroups\n", + "\n", + "news_data = fetch_20newsgroups(subset='all', random_state=156)" + ] + }, + { + "cell_type": "code", + "source": [ + "print(news_data.keys())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O0ZXtG5bYesX", + "outputId": "e73a7702-d3b8-4338-e974-3fe2bbc3314d" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "print('target 클래스의 값과 분포도 \\n', pd.Series(news_data.target).value_counts().sort_index())\n", + "print('target 클래스의 이름들 \\n', news_data.target_names)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PXyjXJ9lYjXK", + "outputId": "e243500a-fc4b-4794-f51f-ec3fba0e46a8" + }, + "execution_count": 4, + "outputs": [ + { + 
"output_type": "stream", + "name": "stdout", + "text": [ + "target 클래스의 값과 분포도 \n", + " 0 799\n", + "1 973\n", + "2 985\n", + "3 982\n", + "4 963\n", + "5 988\n", + "6 975\n", + "7 990\n", + "8 996\n", + "9 994\n", + "10 999\n", + "11 991\n", + "12 984\n", + "13 990\n", + "14 987\n", + "15 997\n", + "16 910\n", + "17 940\n", + "18 775\n", + "19 628\n", + "Name: count, dtype: int64\n", + "target 클래스의 이름들 \n", + " ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(news_data.data[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tCiPPzmiYqvq", + "outputId": "634a916e-def3-4596-86a7-6d4e06684ffa" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "From: egreen@east.sun.com (Ed Green - Pixel Cruncher)\n", + "Subject: Re: Observation re: helmets\n", + "Organization: Sun Microsystems, RTP, NC\n", + "Lines: 21\n", + "Distribution: world\n", + "Reply-To: egreen@east.sun.com\n", + "NNTP-Posting-Host: laser.east.sun.com\n", + "\n", + "In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:\n", + "> \n", + "> The question for the day is re: passenger helmets, if you don't know for \n", + ">certain who's gonna ride with you (like say you meet them at a .... church \n", + ">meeting, yeah, that's the ticket)... What are some guidelines? Should I just \n", + ">pick up another shoei in my size to have a backup helmet (XL), or should I \n", + ">maybe get an inexpensive one of a smaller size to accomodate my likely \n", + ">passenger? 
\n", + "\n", + "If your primary concern is protecting the passenger in the event of a\n", + "crash, have him or her fitted for a helmet that is their size. If your\n", + "primary concern is complying with stupid helmet laws, carry a real big\n", + "spare (you can put a big or small head in a big helmet, but not in a\n", + "small one).\n", + "\n", + "---\n", + "Ed Green, former Ninjaite |I was drinking last night with a biker,\n", + " Ed.Green@East.Sun.COM |and I showed him a picture of you. I said,\n", + "DoD #0111 (919)460-8302 |\"Go on, get to know her, you'll like her!\"\n", + " (The Grateful Dead) --> |It seemed like the least I could do...\n", + "\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.datasets import fetch_20newsgroups\n", + "\n", + "# subset='train'으로 학습용 데이터만 추출, remove=('headers', 'footers', ’quotes')로 내용만 추출\n", + "train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),\n", + " random_state=156)\n", + "\n", + "X_train = train_news.data\n", + "y_train = train_news.target\n", + "\n", + "# subset='test'으로 테스트 데이터만 추출, remove=('headers', 'footers', ’quotes')로 내용만 추출\n", + "test_news= fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),\n", + " random_state=156)\n", + "\n", + "X_test = test_news.data\n", + "y_test = test_news.target\n", + "print('학습 데이터 크기 {0}, 테스트 데이터 크기 {1}'.format(len(train_news.data),\n", + "len(test_news.data)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zkXYOqXfYrLt", + "outputId": "e9aa5445-67bc-4efa-fcb1-fc1cea5951a0" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "학습 데이터 크기 11314, 테스트 데이터 크기 7532\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "피처 벡터화 변환과 머신러닝 모델 학습/예측/평가" + ], + "metadata": { + "id": "mmPq7Dn_ZQf_" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import 
CountVectorizer\n", + "\n", + "# Convert the text into count-based feature vectors.\n", + "cnt_vect = CountVectorizer()\n", + "cnt_vect.fit(X_train)\n", + "X_train_cnt_vect = cnt_vect.transform(X_train)\n", + "\n", + "# Vectorize the test data with the CountVectorizer that was fit on the training data.\n", + "X_test_cnt_vect = cnt_vect.transform(X_test)\n", + "\n", + "print('학습 데이터 텍스트의 CountVectorizer Shape:', X_train_cnt_vect.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xpIIUUm_ZSp0", + "outputId": "b8a1a8c4-8c22-4365-8097-044eb0336e6b" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "학습 데이터 텍스트의 CountVectorizer Shape: (11314, 101631)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Train, predict and evaluate with LogisticRegression.\n", + "lr_clf = LogisticRegression(solver='liblinear')\n", + "lr_clf.fit(X_train_cnt_vect, y_train)\n", + "pred = lr_clf.predict(X_test_cnt_vect)\n", + "print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test,pred)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4tIXfdDGZhP_", + "outputId": "19a00921-0642-46d9-e171-9d3d1af58ddf" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CountVectorized Logistic Regression의 예측 정확도는 0.617\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "# Apply TF-IDF vectorization to the training and test data sets.\n", + "tfidf_vect = TfidfVectorizer()\n", + "tfidf_vect.fit(X_train)\n", + "X_train_tfidf_vect = tfidf_vect.transform(X_train)\n", + "X_test_tfidf_vect = tfidf_vect.transform(X_test)\n", + "\n", + "# Train, predict and evaluate with LogisticRegression.\n", + 
"lr_clf = LogisticRegression(solver='liblinear')\n", + "lr_clf.fit(X_train_tfidf_vect, y_train)\n", + "pred = lr_clf.predict(X_test_tfidf_vect)\n", + "print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aI7O4HCcZh1L", + "outputId": "e5bf91a3-b556-4eab-ed0d-a65628d19107" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "TF-IDF Logistic Regression의 예측 정확도는 0.678\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Add English stop-word filtering, widen ngram_range from the default (1, 1) to (1, 2) and set max_df=300 before vectorizing.\n", + "tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300 )\n", + "tfidf_vect.fit(X_train)\n", + "X_train_tfidf_vect = tfidf_vect.transform(X_train)\n", + "X_test_tfidf_vect = tfidf_vect.transform(X_test)\n", + "\n", + "lr_clf = LogisticRegression(solver='liblinear')\n", + "lr_clf.fit(X_train_tfidf_vect, y_train)\n", + "pred = lr_clf.predict(X_test_tfidf_vect)\n", + "print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lOUTsoV7aJ9k", + "outputId": "e6013964-533e-45b2-cd30-1feca4ef1395" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.690\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "# 최적 C 값 도출 튜닝 수행. 
CV는 3 폴드 세트로 설정.\n", + "params = { 'C':[0.01, 0.1, 1, 5, 10]}\n", + "grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1 )\n", + "grid_cv_lr.fit(X_train_tfidf_vect, y_train)\n", + "print('Logistic Regression best C parameter :', grid_cv_lr.best_params_ )\n", + "\n", + "# Predict with the fitted grid search and evaluate accuracy on the test set.\n", + "pred = grid_cv_lr.predict(X_test_tfidf_vect)\n", + "print('TF-IDF Vectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dVikW7bLaRtl", + "outputId": "7e43ce42-0e11-4b20-f071-760cb08498a9" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n", + "Logistic Regression best C parameter : {'C': 10}\n", + "TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.704\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "사이킷런 파이프라인 사용 및 GridSearchCV와의 결합" + ], + "metadata": { + "id": "fIAzFqIra1mL" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.pipeline import Pipeline\n", + "pipeline = Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english')),\n", + " ('lr_clf', LogisticRegression(random_state=156))])" + ], + "metadata": { + "id": "FETLrEuVa7Q9" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "# Build a Pipeline with a TfidfVectorizer step named tfidf_vect and a LogisticRegression step named lr_clf.\n", + "pipeline = Pipeline([\n", + " ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)),\n", + " ('lr_clf', LogisticRegression(solver='liblinear', C=10))\n", + "])\n", + "\n", + "# No separate TfidfVectorizer fit()/transform() or LogisticRegression fit()/predict() calls\n", + "# are needed.\n", + "# pipeline.fit() and pipeline.predict() run vectorization and model training/prediction in one go.\n", + 
"pipeline.fit(X_train, y_train)\n", + "pred = pipeline.predict(X_test)\n", + "print('Pipeline 을 통한 Logistic Regression 의 예측 정확도는 {0:.3f}'.format (\n", + " accuracy_score(y_test, pred)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6xx9B83JbtTt", + "outputId": "2e5675d3-fa2b-4b11-9236-36a69c209e60" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Pipeline 을 통한 Logistic Regression 의 예측 정확도는 0.704\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git "a/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" "b/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" new file mode 100644 index 0000000..7a9e120 --- /dev/null +++ "b/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.ipynb" @@ -0,0 +1,1884 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# **텍스트 분석**" + ], + "metadata": { + "id": "uCMcq_L_S0nr" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 1. 텍스트 분석 이해" + ], + "metadata": { + "id": "Cz3p8YTOS9tU" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 2. 
텍스트 사전 준비 작업 - 텍스트 정규화" + ], + "metadata": { + "id": "RMzxwhqrTIr7" + } + }, + { + "cell_type": "markdown", + "source": [ + "문장 토큰화" + ], + "metadata": { + "id": "fTDvTsw6TODx" + } + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mQ0yjKhCSsTi", + "outputId": "d68ead0b-8e45-4c29-ac17-ec0079fec32a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + " 3\n", + "['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']\n" + ] + } + ], + "source": [ + "from nltk import sent_tokenize\n", + "import nltk\n", + "nltk.download('punkt')\n", + "nltk.download('punkt_tab')\n", + "\n", + "text_sample = 'The Matrix is everywhere its all around us, here even in this room. \\\n", + " You can see it out your window or on your television. 
\\\n", + " You feel it when you go to work, or go to church or pay your taxes.'\n", + "sentences = sent_tokenize(text=text_sample)\n", + "print(type(sentences),len(sentences))\n", + "print(sentences)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "단어 토큰화" + ], + "metadata": { + "id": "NjeFglEOTqxI" + } + }, + { + "cell_type": "code", + "source": [ + "from nltk import word_tokenize\n", + "\n", + "sentence = \"The Matrix is everywhere its all around us, here even in this room.\"\n", + "words = word_tokenize(sentence)\n", + "print(type(words), len(words))\n", + "print(words)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QQJhNXavTUbU", + "outputId": "ea2537a9-b575-41b0-abb7-5a2f39663770" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " 15\n", + "['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk import word_tokenize, sent_tokenize\n", + "\n", + "# 여러 개의 문장으로 된 입력 데이터를 문장별로 단어 토큰화하게 만드는 함수 생성\n", + "def tokenize_text(text):\n", + "\n", + " # 문장별로 분리 토큰\n", + " sentences = sent_tokenize(text)\n", + " # 분리된 문장별 단어 토큰화\n", + " word_tokens = [word_tokenize(sentence) for sentence in sentences]\n", + " return word_tokens\n", + "\n", + "# 여러 문장에 대해 문장별 단어 토큰화 수행.\n", + "word_tokens = tokenize_text(text_sample)\n", + "print(type(word_tokens), len(word_tokens))\n", + "print(word_tokens)\n", + "" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RVxijaXUUAfM", + "outputId": "68009e31-2b57-4d96-9728-a89eb1002a96" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " 3\n", + "[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 
'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "스톱 워드 제거" + ], + "metadata": { + "id": "N7aWA9oRUQGX" + } + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('stopwords')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fW_ILC9FURN7", + "outputId": "cf4c334e-2b58-4071-e339-ddcafd2d8eb8" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('영어 stop words 개수:', len(nltk.corpus.stopwords.words('english')))\n", + "print(nltk.corpus.stopwords.words('english')[:20])\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-w6haR3ZUTO2", + "outputId": "7e3807de-7f3e-4798-88c8-b77563542127" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "영어 stop words 개수: 198\n", + "['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', \"aren't\", 'as', 'at', 'be', 'because', 'been']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "\n", + "stopwords = nltk.corpus.stopwords.words('english')\n", + "all_tokens = []\n", + "# 위 예제에서 3개의 문장별로 얻은 word_tokens list에 대해 스톱 워드를 제거하는 반복문\n", + "for sentence in word_tokens:\n", + " filtered_words=[]\n", + " # 개별 문장별로 토큰화된 문장 list에 대해 스톱 워드를 제거하는 반복문\n", + " for word in sentence:\n", + " # 소문자로 모두 변환합니다.\n", + " word = word.lower()\n", + " # 
토큰화된 개별 단어가 스톱 워드의 단어에 포함되지 않으면 word_tokens에 추가\n", + " if word not in stopwords:\n", + " filtered_words.append(word)\n", + " all_tokens.append(filtered_words)\n", + "\n", + "print(all_tokens)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dtZJy7u7UZ9o", + "outputId": "18aeca77-fae0-4a1f-af2c-b25ec8d3c982" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Stemming과 Lemmatization" + ], + "metadata": { + "id": "ILkdNK-fU2cj" + } + }, + { + "cell_type": "code", + "source": [ + "from nltk.stem import LancasterStemmer\n", + "stemmer = LancasterStemmer()\n", + "\n", + "print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))\n", + "print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))\n", + "print(stemmer.stem('happier'), stemmer.stem('happiest'))\n", + "print(stemmer.stem('fancier'), stemmer.stem('fanciest'))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8ppfMZmoU1SC", + "outputId": "b2c40807-2eff-421c-a207-a056fc7bbdee" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "work work work\n", + "amus amus amus\n", + "happy happiest\n", + "fant fanciest\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "import nltk\n", + "nltk.download('wordnet')\n", + "\n", + "lemma = WordNetLemmatizer()\n", + "print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v'))\n", + "print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))\n", + "print(lemma.lemmatize('fancier', 'a'), 
lemma.lemmatize('fanciest', 'a'))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "21ukCWsNU_PY", + "outputId": "d25abbd4-bc7c-4f18-9d4b-2a67e3a1d80a" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "amuse amuse amuse\n", + "happy happy\n", + "fancy fancy\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 3. Bag of Words - BOW" + ], + "metadata": { + "id": "EVFM5d11VWRL" + } + }, + { + "cell_type": "markdown", + "source": [ + "희소 행렬 - COO 형식" + ], + "metadata": { + "id": "VwC4I6cWVdXi" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "\n", + "dense = np.array([[3, 0, 1], [0, 2, 0]])" + ], + "metadata": { + "id": "1n-6Scc-VUe8" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from scipy import sparse\n", + "\n", + "# Non-zero values of the dense matrix\n", + "data = np.array([3, 1, 2])\n", + "\n", + "# Row and column positions of those values, as separate arrays\n", + "row_pos = np.array([0, 0, 1 ])\n", + "col_pos = np.array([0, 2, 1 ])\n", + "\n", + "# Build the sparse matrix in COO format with scipy.sparse.coo_matrix\n", + "sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))" + ], + "metadata": { + "id": "BYPUYKoJVg57" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sparse_coo.toarray()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4A469JXuVmU8", + "outputId": "ae0ec7ab-af02-4477-b7e0-2cd0bed50fc0" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[3, 0, 1],\n", + " [0, 2, 0]])" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "희소 행렬 - CSR 형식" + ], + "metadata": { + "id": 
"ymEOmjqtVvfh" + } + }, + { + "cell_type": "code", + "source": [ + "[ [0, 0, 1, 0, 0, 5], [1, 4, 0, 3, 2, 5], [0, 6, 0, 3, 0, 0], [2, 0, 0, 0, 0, 0], [0, 0, 0, 7, 0,\n", + "8], [1, 0, 0, 0, 0, 0] ]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YK62y4jXVu1o", + "outputId": "be626bea-62ed-4524-8ca9-3561fc2ac664" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[0, 0, 1, 0, 0, 5],\n", + " [1, 4, 0, 3, 2, 5],\n", + " [0, 6, 0, 3, 0, 0],\n", + " [2, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 7, 0, 8],\n", + " [1, 0, 0, 0, 0, 0]]" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from scipy import sparse\n", + "\n", + "dense2 = np.array([[0, 0, 1, 0, 0, 5],\n", + " [1, 4, 0, 3, 2, 5],\n", + " [0, 6, 0, 3, 0, 0],\n", + " [2, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 7, 0, 8],\n", + " [1, 0, 0, 0, 0, 0]])\n", + "\n", + "# Non-zero values of dense2, listed row by row\n", + "data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1 ])\n", + "\n", + "# Row and column positions of those values, as separate arrays\n", + "row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])\n", + "col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])\n", + "\n", + "# Convert to COO format\n", + "sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))\n", + "\n", + "# Index pointers: start offset of each row within data2, with the total count as the final entry\n", + "row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])\n", + "\n", + "# Convert to CSR format\n", + "sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))\n", + "\n", + "print('COO 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인')\n", + "print(sparse_coo.toarray())\n", + "print('CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인')\n", + "print(sparse_csr.toarray())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "69hfeAnhV9YS", + "outputId": "df4ad7a2-defb-4bf8-c629-a26d261f7f28" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "COO 변환된 데이터가 
제대로 되었는지 다시 Dense로 출력 확인\n", + "[[0 0 1 0 0 5]\n", + " [1 4 0 3 2 5]\n", + " [0 6 0 3 0 0]\n", + " [2 0 0 0 0 0]\n", + " [0 0 0 7 0 8]\n", + " [1 0 0 0 0 0]]\n", + "CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인\n", + "[[0 0 1 0 0 5]\n", + " [1 4 0 3 2 5]\n", + " [0 6 0 3 0 0]\n", + " [2 0 0 0 0 0]\n", + " [0 0 0 7 0 8]\n", + " [1 0 0 0 0 0]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "dense3 = np.array([[0, 0, 1, 0, 0, 5],\n", + " [1, 4, 0, 3, 2, 5],\n", + " [0, 6, 0, 3, 0, 0],\n", + " [2, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 7, 0, 8],\n", + " [1, 0, 0, 0, 0, 0]])\n", + "coo = sparse.coo_matrix(dense3)\n", + "csr = sparse.csr_matrix(dense3)" + ], + "metadata": { + "id": "_P4y5zPFWBdj" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 5. 감성 분석" + ], + "metadata": { + "id": "svu6eh11Xhyu" + } + }, + { + "cell_type": "markdown", + "source": [ + "지도학습 기반 감성 분석 실습 - IMDB 영화평" + ], + "metadata": { + "id": "6JARAdUhcixI" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "review_df = pd.read_csv('/content/labeledTrainData.tsv', header=0, sep='\\t', quoting=3)\n", + "review_df.head(3)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "DPGBkDokXRWj", + "outputId": "34c4a25b-5af1-47a5-fdf1-60336ece1046" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id sentiment review\n", + "0 \"5814_8\" 1 \"With all this stuff going down at the moment ...\n", + "1 \"2381_9\" 1 \"\\\"The Classic War of the Worlds\\\" by Timothy ...\n", + "2 \"7759_3\" 0 \"The film starts with a manager (Nicholas Bell..." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsentimentreview
0\"5814_8\"1\"With all this stuff going down at the moment ...
1\"2381_9\"1\"\\\"The Classic War of the Worlds\\\" by Timothy ...
2\"7759_3\"0\"The film starts with a manager (Nicholas Bell...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "review_df", + "summary": "{\n \"name\": \"review_df\",\n \"rows\": 25000,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 25000,\n \"samples\": [\n \"\\\"2570_3\\\"\",\n \"\\\"4897_8\\\"\",\n \"\\\"8485_3\\\"\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24904,\n \"samples\": [\n \"\\\"This is not my favorite WIP (\\\\\\\"Women in Prison\\\\\\\"), but it is one of the most famous films in the sub-genre. It is was produced by Roger Corman, who at this point had already produced a few WIPs. It is obvious that the film tries to play with the established formula. The movie takes place in an USA prison, not in a \\\\\\\"banana republic\\\\\\\" like most WIP films. I'm not sure if that was a wise move, but it is an acceptable change of pace. Writer-director Demme really gets into his job, always digging for new ways to present a familiar scenario. In fact, he is a little too ambitious for his own good. The filmmaker creates a few surreal dream sequences that are borderline pretentious but it is fun to see how hard he tries to put this film above your average chicks-in-chains flick. But do not worry, Demme still operates within the parameters of the sub-genre. There is plenty of nudity and violence, something that will satisfy hardcore fans. The film is a little slow, but it is very entertaining. The cast is good. 
Roberta Collins is a WIP veteran, so she does not need an introduction, and Barbara Steel is a hoot as the wheelchair-bound crazy warden. Pam Grier is sorely missed, though.\\\"\",\n \"\\\"In 1972, after his wife left to go her own way, Elvis Presley began dating Linda Thompson. Miss Thompson, a good-humored, long haired, lovely, statuesque beauty queen, is charted to fill a void in Elvis' life. When Elvis' divorce became final, Linda was already in place as the legendary performer's live-in girlfriend and travel companion until 1976.

This is a gaudy look at their love affair and companionship. Linda whole-heartedly tending to her lover's needs and desires. And even putting up with his swallowing medications by the handful and introducing her to her own love affair with valium. At times this movie is harsh and dark of heart; a very unattractive look at the 'King' and his queen.

Don Johnson is absolutely awful as Elvis. Over acting to the hilt is not attractive. Stephanie Zimbalist lacks the classiness of Linda, but does the job pretty well. Supporting cast includes: John Crawford, Ruta Lee, and Rick Lenz. Watching this twice is more than enough for me, but don't let this review stop you from checking it out. For most Elvis fans that I have conferred with, this is not a favored presentation.\\\"\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(review_df['review'][0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y_bNZVCwYiqh", + "outputId": "b4334a98-a35a-4b95-c1d4-120cbab7fdc8" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.

Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.

The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.

Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.

Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import re\n", + "\n", + "#
html 태그는 replace 함수로 공백으로 변환\n", + "review_df['review'] = review_df['review'].str.replace('<br />', ' ')\n", + "\n", + "# 파이썬의 정규 표현식 모듈인 re를 이용해 영어 문자열이 아닌 문자는 모두 공백으로 변환\n", + "review_df['review'] = review_df['review'].apply( lambda x : re.sub(\"[^a-zA-Z]\", \" \", x) )\n" + ], + "metadata": { + "id": "4vkQ1HR0Ylet" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "class_df = review_df['sentiment']\n", + "feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)\n", + "\n", + "X_train, X_test, y_train, y_test= train_test_split(feature_df, class_df, test_size=0.3, random_state=156)\n", + "X_train.shape, X_test.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ib3REJ93Y1cK", + "outputId": "d395bfb4-1a19-427a-900e-9349ae35c0c8" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((17500, 1), (7500, 1))" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, roc_auc_score\n", + "\n", + "# 스톱 워드는 English, ngram은 (1, 2)로 설정해 CountVectorization 수행.\n", + "# LogisticRegression의 C는 10으로 설정.\n", + "pipeline = Pipeline([\n", + " ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2) )),\n", + " ('lr_clf', LogisticRegression(solver='liblinear', C=10))])\n", + "\n", + "# Pipeline 객체를 이용해 fit(), predict()로 학습/예측 수행. 
predict_proba()는 roc_auc 때문에 수행.\n", + "pipeline.fit(X_train['review'], y_train)\n", + "pred = pipeline.predict(X_test['review'])\n", + "pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]\n", + "\n", + "print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZhIMQONUZEH9", + "outputId": "77afa467-d3df-4c17-8918-b30e26738da0" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "예측 정확도는 0.8817, ROC-AUC는 0.9483\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "J7L12EjFcfhA" + } + }, + { + "cell_type": "markdown", + "source": [ + "SentiWordNet을 이용한 감성 분석" + ], + "metadata": { + "id": "NPSKtxo_cpJa" + } + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('all')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cHfmrCxEcfHO", + "outputId": "9f626f38-39bc-44be-ae46-918aa77f2218" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading collection 'all'\n", + "[nltk_data] | \n", + "[nltk_data] | Downloading package abc to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/abc.zip.\n", + "[nltk_data] | Downloading package alpino to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/alpino.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger_eng to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping\n", + "[nltk_data] | taggers/averaged_perceptron_tagger_eng.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n", + 
"[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping\n", + "[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger_rus to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping\n", + "[nltk_data] | taggers/averaged_perceptron_tagger_rus.zip.\n", + "[nltk_data] | Downloading package basque_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/basque_grammars.zip.\n", + "[nltk_data] | Downloading package bcp47 to /root/nltk_data...\n", + "[nltk_data] | Downloading package biocreative_ppi to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n", + "[nltk_data] | Downloading package bllip_wsj_no_aux to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n", + "[nltk_data] | Downloading package book_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/book_grammars.zip.\n", + "[nltk_data] | Downloading package brown to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/brown.zip.\n", + "[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/brown_tei.zip.\n", + "[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cess_cat.zip.\n", + "[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cess_esp.zip.\n", + "[nltk_data] | Downloading package chat80 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/chat80.zip.\n", + "[nltk_data] | Downloading package city_database to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/city_database.zip.\n", + "[nltk_data] | Downloading package cmudict to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cmudict.zip.\n", + "[nltk_data] | Downloading package 
comparative_sentences to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n", + "[nltk_data] | Downloading package comtrans to /root/nltk_data...\n", + "[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/conll2000.zip.\n", + "[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/conll2002.zip.\n", + "[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n", + "[nltk_data] | Downloading package crubadan to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/crubadan.zip.\n", + "[nltk_data] | Downloading package dependency_treebank to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n", + "[nltk_data] | Downloading package dolch to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/dolch.zip.\n", + "[nltk_data] | Downloading package english_wordnet to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/english_wordnet.zip.\n", + "[nltk_data] | Downloading package europarl_raw to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/europarl_raw.zip.\n", + "[nltk_data] | Downloading package extended_omw to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package floresta to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/floresta.zip.\n", + "[nltk_data] | Downloading package framenet_v15 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/framenet_v15.zip.\n", + "[nltk_data] | Downloading package framenet_v17 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/framenet_v17.zip.\n", + "[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/gazetteers.zip.\n", + "[nltk_data] | Downloading package genesis to /root/nltk_data...\n", + 
"[nltk_data] | Unzipping corpora/genesis.zip.\n", + "[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/gutenberg.zip.\n", + "[nltk_data] | Downloading package ieer to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ieer.zip.\n", + "[nltk_data] | Downloading package inaugural to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/inaugural.zip.\n", + "[nltk_data] | Downloading package indian to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/indian.zip.\n", + "[nltk_data] | Downloading package jeita to /root/nltk_data...\n", + "[nltk_data] | Downloading package kimmo to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/kimmo.zip.\n", + "[nltk_data] | Downloading package knbc to /root/nltk_data...\n", + "[nltk_data] | Downloading package large_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/large_grammars.zip.\n", + "[nltk_data] | Downloading package lin_thesaurus to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n", + "[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/mac_morpho.zip.\n", + "[nltk_data] | Downloading package machado to /root/nltk_data...\n", + "[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n", + "[nltk_data] | Downloading package maxent_ne_chunker to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n", + "[nltk_data] | Downloading package maxent_ne_chunker_tab to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping chunkers/maxent_ne_chunker_tab.zip.\n", + "[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n", + "[nltk_data] | Downloading package maxent_treebank_pos_tagger_tab to\n", + "[nltk_data] | 
/root/nltk_data...\n", + "[nltk_data] | Unzipping\n", + "[nltk_data] | taggers/maxent_treebank_pos_tagger_tab.zip.\n", + "[nltk_data] | Downloading package mock_corpus to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/mock_corpus.zip.\n", + "[nltk_data] | Downloading package moses_sample to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/moses_sample.zip.\n", + "[nltk_data] | Downloading package movie_reviews to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/movie_reviews.zip.\n", + "[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/mte_teip5.zip.\n", + "[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n", + "[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n", + "[nltk_data] | Downloading package names to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/names.zip.\n", + "[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n", + "[nltk_data] | Downloading package nonbreaking_prefixes to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n", + "[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/nps_chat.zip.\n", + "[nltk_data] | Downloading package omw to /root/nltk_data...\n", + "[nltk_data] | Downloading package omw-1.4 to /root/nltk_data...\n", + "[nltk_data] | Downloading package opinion_lexicon to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n", + "[nltk_data] | Downloading package panlex_swadesh to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package paradigms to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/paradigms.zip.\n", + "[nltk_data] | Downloading package pe08 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pe08.zip.\n", + "[nltk_data] | Downloading package perluniprops to\n", + 
"[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping misc/perluniprops.zip.\n", + "[nltk_data] | Downloading package pil to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pil.zip.\n", + "[nltk_data] | Downloading package pl196x to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pl196x.zip.\n", + "[nltk_data] | Downloading package porter_test to /root/nltk_data...\n", + "[nltk_data] | Unzipping stemmers/porter_test.zip.\n", + "[nltk_data] | Downloading package ppattach to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ppattach.zip.\n", + "[nltk_data] | Downloading package problem_reports to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/problem_reports.zip.\n", + "[nltk_data] | Downloading package product_reviews_1 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n", + "[nltk_data] | Downloading package product_reviews_2 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n", + "[nltk_data] | Downloading package propbank to /root/nltk_data...\n", + "[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pros_cons.zip.\n", + "[nltk_data] | Downloading package ptb to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ptb.zip.\n", + "[nltk_data] | Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] | Package punkt is already up-to-date!\n", + "[nltk_data] | Downloading package punkt_tab to /root/nltk_data...\n", + "[nltk_data] | Package punkt_tab is already up-to-date!\n", + "[nltk_data] | Downloading package qc to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/qc.zip.\n", + "[nltk_data] | Downloading package reuters to /root/nltk_data...\n", + "[nltk_data] | Downloading package rslp to /root/nltk_data...\n", + "[nltk_data] | Unzipping stemmers/rslp.zip.\n", + "[nltk_data] | Downloading package rte to 
/root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/rte.zip.\n", + "[nltk_data] | Downloading package sample_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/sample_grammars.zip.\n", + "[nltk_data] | Downloading package semcor to /root/nltk_data...\n", + "[nltk_data] | Downloading package senseval to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/senseval.zip.\n", + "[nltk_data] | Downloading package sentence_polarity to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n", + "[nltk_data] | Downloading package sentiwordnet to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n", + "[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/shakespeare.zip.\n", + "[nltk_data] | Downloading package sinica_treebank to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n", + "[nltk_data] | Downloading package smultron to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/smultron.zip.\n", + "[nltk_data] | Downloading package snowball_data to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package spanish_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n", + "[nltk_data] | Downloading package state_union to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/state_union.zip.\n", + "[nltk_data] | Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] | Package stopwords is already up-to-date!\n", + "[nltk_data] | Downloading package subjectivity to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/subjectivity.zip.\n", + "[nltk_data] | Downloading package swadesh to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/swadesh.zip.\n", + "[nltk_data] | Downloading 
package switchboard to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/switchboard.zip.\n", + "[nltk_data] | Downloading package tagsets to /root/nltk_data...\n", + "[nltk_data] | Unzipping help/tagsets.zip.\n", + "[nltk_data] | Downloading package tagsets_json to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping help/tagsets_json.zip.\n", + "[nltk_data] | Downloading package timit to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/timit.zip.\n", + "[nltk_data] | Downloading package toolbox to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/toolbox.zip.\n", + "[nltk_data] | Downloading package treebank to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/treebank.zip.\n", + "[nltk_data] | Downloading package twitter_samples to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/twitter_samples.zip.\n", + "[nltk_data] | Downloading package udhr to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/udhr.zip.\n", + "[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/udhr2.zip.\n", + "[nltk_data] | Downloading package unicode_samples to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/unicode_samples.zip.\n", + "[nltk_data] | Downloading package universal_tagset to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/universal_tagset.zip.\n", + "[nltk_data] | Downloading package universal_treebanks_v20 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package vader_lexicon to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package verbnet to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/verbnet.zip.\n", + "[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/verbnet3.zip.\n", + "[nltk_data] | Downloading package webtext to /root/nltk_data...\n", + "[nltk_data] | 
Unzipping corpora/webtext.zip.\n", + "[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n", + "[nltk_data] | Unzipping models/wmt15_eval.zip.\n", + "[nltk_data] | Downloading package word2vec_sample to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/word2vec_sample.zip.\n", + "[nltk_data] | Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] | Package wordnet is already up-to-date!\n", + "[nltk_data] | Downloading package wordnet2021 to /root/nltk_data...\n", + "[nltk_data] | Downloading package wordnet2022 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/wordnet2022.zip.\n", + "[nltk_data] | Downloading package wordnet31 to /root/nltk_data...\n", + "[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n", + "[nltk_data] | Downloading package words to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/words.zip.\n", + "[nltk_data] | Downloading package ycoe to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ycoe.zip.\n", + "[nltk_data] | \n", + "[nltk_data] Done downloading collection all\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk.corpus import wordnet as wn\n", + "\n", + "term = 'present'\n", + "\n", + "# 'present'라는 단어로 wordnet의 synsets 생성.\n", + "synsets = wn.synsets(term)\n", + "print('synsets() 반환 type :', type(synsets))\n", + "print('synsets() 반환 값 갯수:', len(synsets))\n", + "print('synsets() 반환 값 :', synsets)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FyqXFemrc1MF", + "outputId": "c1ede3d4-d89c-44cf-c353-2dfcef155a6c" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "synsets() 반환 type : \n", + "synsets() 반환 값 갯수: 18\n", + "synsets() 
반환 값 : [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "for synset in synsets :\n", + " print('##### Synset name : ', synset.name(), '#####')\n", + " print('POS :', synset.lexname())\n", + " print('Definition:', synset.definition())\n", + " print('Lemmas:', synset.lemma_names())\n", + "" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M5s4qKY5dJhi", + "outputId": "586eb8fd-1d7f-4757-81c9-d44e60cd39c3" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "##### Synset name : present.n.01 #####\n", + "POS : noun.time\n", + "Definition: the period of time that is happening now; any continuous stretch of time including the moment of speech\n", + "Lemmas: ['present', 'nowadays']\n", + "##### Synset name : present.n.02 #####\n", + "POS : noun.possession\n", + "Definition: something presented as a gift\n", + "Lemmas: ['present']\n", + "##### Synset name : present.n.03 #####\n", + "POS : noun.communication\n", + "Definition: a verb tense that expresses actions or states at the time of speaking\n", + "Lemmas: ['present', 'present_tense']\n", + "##### Synset name : show.v.01 #####\n", + "POS : verb.perception\n", + "Definition: give an exhibition of to an interested audience\n", + "Lemmas: ['show', 'demo', 'exhibit', 'present', 'demonstrate']\n", + "##### Synset name : present.v.02 #####\n", + "POS : verb.communication\n", + "Definition: bring forward and present to the mind\n", + "Lemmas: ['present', 'represent', 'lay_out']\n", + "##### Synset 
name : stage.v.01 #####\n", + "POS : verb.creation\n", + "Definition: perform (a play), especially on a stage\n", + "Lemmas: ['stage', 'present', 'represent']\n", + "##### Synset name : present.v.04 #####\n", + "POS : verb.possession\n", + "Definition: hand over formally\n", + "Lemmas: ['present', 'submit']\n", + "##### Synset name : present.v.05 #####\n", + "POS : verb.stative\n", + "Definition: introduce\n", + "Lemmas: ['present', 'pose']\n", + "##### Synset name : award.v.01 #####\n", + "POS : verb.possession\n", + "Definition: give, especially as an honor or reward\n", + "Lemmas: ['award', 'present']\n", + "##### Synset name : give.v.08 #####\n", + "POS : verb.possession\n", + "Definition: give as a present; make a gift of\n", + "Lemmas: ['give', 'gift', 'present']\n", + "##### Synset name : deliver.v.01 #####\n", + "POS : verb.communication\n", + "Definition: deliver (a speech, oration, or idea)\n", + "Lemmas: ['deliver', 'present']\n", + "##### Synset name : introduce.v.01 #####\n", + "POS : verb.communication\n", + "Definition: cause to come to know personally\n", + "Lemmas: ['introduce', 'present', 'acquaint']\n", + "##### Synset name : portray.v.04 #####\n", + "POS : verb.creation\n", + "Definition: represent abstractly, for example in a painting, drawing, or sculpture\n", + "Lemmas: ['portray', 'present']\n", + "##### Synset name : confront.v.03 #####\n", + "POS : verb.communication\n", + "Definition: present somebody with something, usually to accuse or criticize\n", + "Lemmas: ['confront', 'face', 'present']\n", + "##### Synset name : present.v.12 #####\n", + "POS : verb.communication\n", + "Definition: formally present a debutante, a representative of a country, etc.\n", + "Lemmas: ['present']\n", + "##### Synset name : salute.v.06 #####\n", + "POS : verb.communication\n", + "Definition: recognize with a gesture prescribed by a military regulation; assume a prescribed position\n", + "Lemmas: ['salute', 'present']\n", + "##### Synset name : present.a.01 
#####\n", + "POS : adj.all\n", + "Definition: temporal sense; intermediate between past and future; now existing or happening or in consideration\n", + "Lemmas: ['present']\n", + "##### Synset name : present.a.02 #####\n", + "POS : adj.all\n", + "Definition: being or existing in a specified place\n", + "Lemmas: ['present']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# synset 객체를 단어별로 생성합니다.\n", + "tree = wn.synset('tree.n.01')\n", + "lion = wn.synset('lion.n.01')\n", + "tiger = wn.synset('tiger.n.02')\n", + "cat = wn.synset('cat.n.01')\n", + "dog = wn.synset('dog.n.01')\n", + "\n", + "entities = [tree, lion, tiger, cat, dog]\n", + "similarities = []\n", + "entity_names = [entity.name().split('.')[0] for entity in entities]\n", + "\n", + "# 단어별 synset을 반복하면서 다른 단어의 synset과 유사도를 측정합니다.\n", + "for entity in entities:\n", + " similarity = [round(entity.path_similarity(compared_entity), 2)\n", + " for compared_entity in entities]\n", + " similarities.append(similarity)\n", + "\n", + "# 개별 단어별 유사도를 DataFrame 형태로 저장합니다.\n", + "similarity_df = pd.DataFrame(similarities, columns=entity_names, index=entity_names)\n", + "similarity_df\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "W4iRtpX3dXXZ", + "outputId": "65a022db-6fa7-44cd-99c1-b85e695f0575" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " tree lion tiger cat dog\n", + "tree 1.00 0.07 0.07 0.08 0.12\n", + "lion 0.07 1.00 0.33 0.25 0.17\n", + "tiger 0.07 0.33 1.00 0.25 0.17\n", + "cat 0.08 0.25 0.25 1.00 0.20\n", + "dog 0.12 0.17 0.17 0.20 1.00" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
treeliontigercatdog
tree1.000.070.070.080.12
lion0.071.000.330.250.17
tiger0.070.331.000.250.17
cat0.080.250.251.000.20
dog0.120.170.170.201.00
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "similarity_df", + "summary": "{\n \"name\": \"similarity_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"tree\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40971941618624813,\n \"min\": 0.07,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.07,\n 0.12,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"lion\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3683476618630828,\n \"min\": 0.07,\n \"max\": 1.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.0,\n 0.17,\n 0.33\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tiger\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3683476618630828,\n \"min\": 0.07,\n \"max\": 1.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.33,\n 0.17,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cat\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.36664696916789047,\n \"min\": 0.08,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.25,\n 0.2,\n 0.08\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"dog\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3745263675630862,\n \"min\": 0.12,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.17,\n 1.0,\n 0.12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "from nltk.corpus import sentiwordnet as swn\n", + "\n", + "senti_synsets = list(swn.senti_synsets('slow'))\n", + "print('senti_synsets() 반환 type :', type(senti_synsets))\n", + "print('senti_synsets() 반환 값 갯수:', len(senti_synsets))\n", + "print('senti_synsets() 반환 값 :', senti_synsets)" + ], + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BR9Z8f7Nd3Hl", + "outputId": "e1532be3-9a83-4791-bcc7-c8f084fa921c" + }, + "execution_count": 32, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "senti_synsets() 반환 type : <class 'list'>\n", + "senti_synsets() 반환 값 갯수: 11\n", + "senti_synsets() 반환 값 : [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "from nltk.corpus import sentiwordnet as swn\n", + "\n", + "father = swn.senti_synset('father.n.01')\n", + "print('father 긍정감성 지수:', father.pos_score())\n", + "print('father 부정감성 지수:', father.neg_score())\n", + "print('father 객관성 지수:', father.obj_score())\n", + "print('\\n')\n", + "fabulous = swn.senti_synset('fabulous.a.01')\n", + "print('fabulous 긍정감성 지수:', fabulous.pos_score())\n", + "print('fabulous 부정감성 지수:', fabulous.neg_score())\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JDb3tmP-d6eN", + "outputId": "5a534857-c7bc-4276-b6f0-3d7ab13e2597" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "father 긍정감성 지수: 0.0\n", + "father 부정감성 지수: 0.0\n", + "father 객관성 지수: 1.0\n", + "\n", + "\n", + "fabulous 긍정감성 지수: 0.875\n", + "fabulous 부정감성 지수: 0.125\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "SentiWordNet을 이용한 영화 감상평 감성 분석" + ], + "metadata": { + "id": "G2dM_3xgeRQ5" + } + }, + { + "cell_type": "code", + "source": [ + "from nltk.corpus import wordnet as wn\n", + "# 간단한 NLTK PennTreebank Tag를 기반으로 WordNet 기반의 품사 Tag로 변환\n", + "def penn_to_wn(tag):\n", + " if tag.startswith('J'):\n", + " return wn.ADJ\n", + " elif 
tag.startswith('N'):\n", + " return wn.NOUN\n", + " elif tag.startswith('R'):\n", + " return wn.ADV\n", + " elif tag.startswith('V'):\n", + " return wn.VERB" + ], + "metadata": { + "id": "n9yFiyoxeUuB" + }, + "execution_count": 34, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import sentiwordnet as swn\n", + "from nltk import sent_tokenize, word_tokenize, pos_tag\n", + "\n", + "def swn_polarity(text):\n", + " # 감성 지수 초기화\n", + " sentiment = 0.0\n", + " tokens_count = 0\n", + "\n", + " lemmatizer = WordNetLemmatizer()\n", + " raw_sentences = sent_tokenize(text)\n", + " # 분해된 문장별로 단어 토큰 -> 품사 태깅 후에 SentiSynset 생성 -> 감성 지수 합산\n", + " for raw_sentence in raw_sentences:\n", + " # NTLK 기반의 품사 태깅 문장 추출\n", + " tagged_sentence = pos_tag(word_tokenize(raw_sentence))\n", + " for word, tag in tagged_sentence:\n", + "\n", + " # WordNet 기반 품사 태깅과 어근 추출\n", + " wn_tag = penn_to_wn(tag)\n", + " if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):\n", + " continue\n", + " lemma = lemmatizer.lemmatize(word, pos=wn_tag)\n", + " if not lemma:\n", + " continue\n", + " # 어근을 추출한 단어와 WordNet 기반 품사 태깅을 입력해 Synset 객체를 생성.\n", + " synsets = wn.synsets(lemma, pos=wn_tag)\n", + " if not synsets:\n", + " continue\n", + " # sentiwordnet의 감성 단어 분석으로 감성 synset 추출\n", + " # 모든 단어에 대해 긍정 감성 지수는 +로 부정 감성 지수는 -로 합산해 감성 지수 계산.\n", + " synset = synsets[0]\n", + " swn_synset = swn.senti_synset(synset.name())\n", + " sentiment += (swn_synset.pos_score() - swn_synset.neg_score())\n", + " tokens_count += 1\n", + "\n", + " if not tokens_count:\n", + " return 0\n", + "\n", + " # 총 score가 0 이상일 경우 긍정(Positive) 1, 그렇지 않을 경우 부정(Negative) 0 반환\n", + " if sentiment >= 0 :\n", + " return 1\n", + "\n", + " return 0" + ], + "metadata": { + "id": "uusU5PKvepE0" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "review_df['preds'] = review_df['review'].apply( lambda x : swn_polarity(x) )\n", + 
"y_target = review_df['sentiment'].values\n", + "preds = review_df['preds'].values" + ], + "metadata": { + "id": "Wb06Szi2f9eV" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score, confusion_matrix, precision_score\n", + "from sklearn.metrics import recall_score, f1_score, roc_auc_score\n", + "import numpy as np\n", + "\n", + "print(confusion_matrix(y_target, preds))\n", + "print(\"정확도:\", np.round(accuracy_score(y_target, preds), 4))\n", + "print(\"정밀도:\", np.round(precision_score(y_target, preds), 4))\n", + "print(\"재현율:\", np.round(recall_score(y_target, preds), 4))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mESgwNQ0gaMY", + "outputId": "8d7dc835-8272-4665-fc06-a1659f62619a" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[12500 0]\n", + " [12500 0]]\n", + "정확도: 0.5\n", + "정밀도: 0.0\n", + "재현율: 0.0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 
# %% [markdown]
# Sentiment analysis with VADER

# %%
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0])
print(senti_scores)

# %%
# Shared analyzer, created on first use so that scoring many reviews does not
# construct a new SentimentIntensityAnalyzer for every single call.
_VADER_ANALYZER = None


def vader_polarity(review, threshold=0.1, analyzer=None):
    """Classify ``review`` as positive (1) or negative (0) with VADER.

    Args:
        review: Raw review text.
        threshold: Cut-off applied to the VADER ``compound`` score.
        analyzer: Optional object exposing ``polarity_scores(text)``; when
            omitted, one shared ``SentimentIntensityAnalyzer`` is created on
            first use and reused on later calls.

    Returns:
        1 if the ``compound`` score is greater than or equal to ``threshold``,
        otherwise 0.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    scores = analyzer.polarity_scores(review)

    # Note the comparison is inclusive: compound == threshold maps to 1.
    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0
    return final_sentiment


# Run vader_polarity() on every record and store the result in 'vader_preds'.
review_df['vader_preds'] = review_df['review'].apply(lambda x: vader_polarity(x, 0.1))
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

print(confusion_matrix(y_target, vader_preds))
print("정확도:", np.round(accuracy_score(y_target, vader_preds), 4))
print("정밀도:", np.round(precision_score(y_target, vader_preds), 4))
print("재현율:", np.round(recall_score(y_target, vader_preds), 4))
"a1072a47-1278-4e66-9489-e7c9bae25e26" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[ 6704 5796]\n", + " [ 1881 10619]]\n", + "정확도: 0.6929\n", + "정밀도: 0.6469\n", + "재현율: 0.8495\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "L5czdaWbigVQ" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git "a/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.pdf" "b/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.pdf" new file mode 100644 index 0000000..b38e5f7 Binary files /dev/null and "b/Week12_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\204\270\354\227\260.pdf" differ