diff --git "a/Week13_\354\230\210\354\212\265\352\263\274\354\240\234_\352\266\214\354\247\200\354\233\220.ipynb" "b/Week13_\354\230\210\354\212\265\352\263\274\354\240\234_\352\266\214\354\247\200\354\233\220.ipynb" new file mode 100644 index 0000000..769ff66 --- /dev/null +++ "b/Week13_\354\230\210\354\212\265\352\263\274\354\240\234_\352\266\214\354\247\200\354\233\220.ipynb" @@ -0,0 +1,496 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "##1. LDA 기반 토픽 모델링 (20 뉴스그룹)" + ], + "metadata": { + "id": "C7izBqtjNoCq" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QdgfVsTfNc7m", + "outputId": "a448bba1-f61a-4a1c-d8c7-634a0fba494e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CountVectorizer Shape: (7862, 1000)\n", + "(8, 1000)\n", + "Topic # 0\n", + "10 year medical health 1993 20 12 disease cancer team patients research number new 11\n", + "Topic # 1\n", + "don just like know think good time ve does way really people want ll right\n", + "Topic # 2\n", + "image file jpeg output program gif images format files color entry use bit 03 02\n", + "Topic # 3\n", + "armenian armenians turkish people said turkey armenia government genocide turks muslim russian greek azerbaijan killed\n", + "Topic # 4\n", + "israel jews dos jewish israeli dos dos arab state people arabs palestinian adl ed anti peace\n", + "Topic # 5\n", + "edu com available graphics ftp window use mail data motif software version pub information server\n", + "Topic # 6\n", + "god people jesus church believe say christ does christian think christians did know bible man\n", + "Topic # 7\n", + "thanks use using does help like display need problem know 
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Args:
        model: Fitted estimator exposing ``components_``; each row is treated
            as the word weights of one topic.
        feature_names: Indexable that maps a column index to its word/n-gram.
        no_top_words: Number of top-weighted features printed per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print('Topic #', topic_no)
        # argsort is ascending, so reverse it and keep the leading entries.
        best_columns = weights.argsort()[::-1][:no_top_words]
        print(' '.join(feature_names[col] for col in best_columns))


# Eight newsgroups to model: motorcycles, baseball, graphics, X Window,
# Middle-East politics, Christianity, electronics and medicine.
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', 'comp.windows.x',
        'talk.politics.mideast', 'soc.religion.christian', 'sci.electronics', 'sci.med']

# Fetch only the categories listed in cats, with headers, footers and quotes removed.
news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=0,
)

# LDA is fed count-based vectors (not TF-IDF); unigrams and bigrams, capped at 1000 features.
count_vect = CountVectorizer(
    max_df=0.95,
    max_features=1000,
    min_df=2,
    stop_words='english',
    ngram_range=(1, 2),
)
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

# Build and train the LDA model with one component per newsgroup.
lda = LatentDirichletAllocation(n_components=8, random_state=0)
lda.fit(feat_vect)

print(lda.components_.shape)

# Column index -> word lookup taken from the fitted CountVectorizer.
feature_names = count_vect.get_feature_names_out()

# Show the 15 most relevant words of each topic.
display_topics(lda, feature_names, 15)
# 1. Download the dataset zip file.
# NOTE(review): the output recorded with this notebook shows this URL answering
# "404 Not Found", after which unzip could not find the archive. Confirm the
# current download location before relying on the cells that read this data.
!wget https://archive.ics.uci.edu/static/public/288/opinosis+opinion+frasl+review.zip

# 2. Extract the archive (stored inside the opinosis_data folder).
!unzip -q opinosis+opinion+frasl+review.zip -d opinosis_data
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 700)

# NLTK data needed by word_tokenize and the WordNet lemmatizer.
# Newer NLTK releases resolve word_tokenize through the 'punkt_tab' resource
# while older ones use 'punkt', so both are requested.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Translation table that deletes every character in string.punctuation.
remove_punct_dict = str.maketrans('', '', string.punctuation)
lemmar = WordNetLemmatizer()


def LemTokens(tokens):
    """Return the WordNet lemma of each token, in the original order."""
    return [lemmar.lemmatize(token) for token in tokens]


def LemNormalize(text):
    """Lower-case ``text``, drop punctuation, tokenize it and lemmatize the tokens.

    Used as the ``tokenizer`` callable of the TfidfVectorizer created below.
    """
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))


path = '/content/opinosis_data/OpinosisDataset1.0/topics'

# glob returns paths in arbitrary order; sorting keeps the row order of
# document_df (and anything derived from it) stable between runs.
all_files = sorted(glob.glob(os.path.join(path, "*.data")))
if not all_files:
    # Say so explicitly instead of silently producing an empty DataFrame.
    print('No *.data files found under {0}; document_df will be empty.'.format(path))

filename_list = []
opinion_text = []

# Read every topic file and keep its name plus its full text.
for file_ in all_files:
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')
    # os.path.basename works on both Linux (Colab) and Windows paths.
    filename_ = os.path.basename(file_)
    filename = filename_.split('.')[0]

    filename_list.append(filename)
    opinion_text.append(df.to_string())

document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})

# TF-IDF vectorizer using the lemmatizing tokenizer defined above.
tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english',
                             ngram_range=(1, 2), min_df=0.05, max_df=0.85)

# (Note: the code below can only run once the actual data files are present.)
# feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

# Cluster the documents into 3 groups.
# km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
# km_cluster.fit(feature_vect)
# document_df['cluster_label'] = km_cluster.labels_
# document_df.sort_values(by='cluster_label')
"\n", + "# (주의: 실제 데이터가 있어야 아래 코드 실행 가능)\n", + "# feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])\n", + "\n", + "# 3개의 집합으로 군집화 수행\n", + "# km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)\n", + "# km_cluster.fit(feature_vect)\n", + "# document_df['cluster_label'] = km_cluster.labels_\n", + "# document_df.sort_values(by='cluster_label')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cdX_GCmJOgJc", + "outputId": "7364ada8-0fd2-41dd-c552-00a2ecf35c29" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "##3. 군집별 핵심 단어 추출" + ], + "metadata": { + "id": "8e634y1bPAJx" + } + }, + { + "cell_type": "code", + "source": [ + "# 군집별 top n 핵심 단어, 그 단어의 중심 위치 상댓값, 대상 파일명을 반환함.\n", + "def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):\n", + " cluster_details = {}\n", + " # cluster_centers_ array의 값이 큰 순으로 정렬된 인덱스 값을 반환\n", + " centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:, ::-1]\n", + "\n", + " for cluster_num in range(clusters_num):\n", + " cluster_details[cluster_num] = {}\n", + " cluster_details[cluster_num]['cluster'] = cluster_num\n", + "\n", + " top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]\n", + " top_features = [feature_names[ind] for ind in top_feature_indexes]\n", + " top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()\n", + "\n", + " # DataFrame에서 클러스터 번호에 해당하는 파일명 추출 (조건식 == 오타 수정)\n", + " filenames = cluster_data[cluster_data['cluster_label'] == cluster_num]['filename']\n", + " filenames = filenames.values.tolist()\n", + "\n", + " 
cluster_details[cluster_num]['top_features'] = top_features\n", + " cluster_details[cluster_num]['top_features_value'] = top_feature_values\n", + " cluster_details[cluster_num]['filenames'] = filenames\n", + "\n", + " return cluster_details\n", + "\n", + "def print_cluster_details(cluster_details):\n", + " for cluster_num, cluster_detail in cluster_details.items():\n", + " print('####### Cluster {0}'.format(cluster_num))\n", + " print('Top features:', cluster_detail['top_features'])\n", + " print('Reviews 파일명:', cluster_detail['filenames'][:7])\n", + " print('==================================================')\n", + "\n", + "# (실제 실행을 위한 코드 예시 - 주석 처리됨)\n", + "# feature_names = tfidf_vect.get_feature_names_out()\n", + "# cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df,\n", + "# feature_names=feature_names, clusters_num=3, top_n_features=10)\n", + "# print_cluster_details(cluster_details)" + ], + "metadata": { + "id": "CEXwMPTrOrcZ" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "##4. 문서 유사도 측정 (Cosine Similarity)" + ], + "metadata": { + "id": "VcnH9U9hO74m" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# 1. 
def cos_similarity(v1, v2):
    """Cosine similarity of two vectors, computed directly from the formula.

    Args:
        v1: First vector (array-like). Row-shaped input such as ``(1, n)`` is
            flattened before use.
        v2: Second vector (array-like) with the same number of elements.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        length the result is ``0.0`` instead of ``nan``.

    Raises:
        ValueError: If the two vectors have different numbers of elements
            (raised by ``np.dot``).
    """
    a = np.asarray(v1).ravel()
    b = np.asarray(v2).ravel()

    # np.linalg.norm replaces the built-in sum(), which iterated element by
    # element and reduced only the first axis of multi-dimensional input.
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; avoid a division by zero.
        return 0.0

    return np.dot(a, b) / norm_product
# Three example sentences to compare with each other.
doc_list = [
    'if you take the blue pill, the story ends',
    'if you take the red pill, you stay in Wonderland',
    'if you take the red pill, I show you how deep the rabbit hole goes',
]

tfidf_vect_simple = TfidfVectorizer()
feature_vect_simple = tfidf_vect_simple.fit_transform(doc_list)
print('feature_vect_simple shape:', feature_vect_simple.shape)

# Convert to a dense matrix, flatten documents 0 and 1 to 1-D arrays and
# check them with the hand-written cos_similarity function.
feature_vect_dense = feature_vect_simple.todense()
vect1, vect2 = (np.array(feature_vect_dense[row]).reshape(-1,) for row in (0, 1))
similarity_simple = cos_similarity(vect1, vect2)
print('문장 1, 문장 2 Cosine 유사도: {0:.3f}'.format(similarity_simple))

# 2. Same measurement through scikit-learn's cosine_similarity API:
#    first the first document against every document, then all pairs.
similarity_simple_pair = cosine_similarity(feature_vect_simple[0], feature_vect_simple)
print('사이킷런 API 첫번째 문서 유사도:\n', similarity_simple_pair)

similarity_all_pair = cosine_similarity(feature_vect_simple, feature_vect_simple)
print('사이킷런 API 전체 문서 유사도:\n', similarity_all_pair)

# (Reference) template for visualising similarities inside one cluster of the
# review data from the earlier section.
# sorted_index = similarity_pair.argsort()[:, ::-1]
# sorted_index = sorted_index[:, 1:]  # exclude the document itself
# sns.barplot(x='similarity', y='filename', data=hotel_1_sim_df)
# plt.title(comparison_docname)
# Download the training data (ratings_train.txt) and the test data (ratings_test.txt).
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
def _load_ratings(path):
    """Read one tab-separated ratings file and prepare it for vectorization.

    Missing values are replaced with a single space, every run of digits in
    the ``document`` column is replaced with a space, and the ``id`` column
    is dropped. A new DataFrame is returned.
    """
    ratings = pd.read_csv(path, sep='\t', encoding='utf-8').fillna(' ')
    ratings['document'] = ratings['document'].apply(lambda text: re.sub(r"\d+", " ", text))
    return ratings.drop('id', axis=1)


# Load and preprocess both splits. If a file is missing, report it and stop
# here: continuing would fail later with a NameError or silently reuse frames
# left over from an earlier run of this cell.
try:
    train_df = _load_ratings('ratings_train.txt')
    test_df = _load_ratings('ratings_test.txt')
except FileNotFoundError:
    print("데이터 파일이 없습니다.")
    raise

# Morphological analyzer used as the tokenizer.
okt = Okt()


def tw_tokenizer(text):
    """Split ``text`` into morphemes with the module-level Okt analyzer."""
    return okt.morphs(text)


# TF-IDF vectorization. fit_transform learns the vocabulary and produces the
# training matrix in one pass, so the corpus is tokenized only once.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1, 2), min_df=3, max_df=0.9)
tfidf_matrix_train = tfidf_vect.fit_transform(train_df['document'])

# Logistic regression tuned over C with 3-fold cross-validation.
lg_clf = LogisticRegression(random_state=0, solver='liblinear')
params = {'C': [1, 3.5, 4.5, 5.5, 10]}

grid_cv = GridSearchCV(lg_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(tfidf_matrix_train, train_df['label'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

# Predict on the test split with the vectorizer fitted on the training data.
tfidf_matrix_test = tfidf_vect.transform(test_df['document'])
best_estimator = grid_cv.best_estimator_
preds = best_estimator.predict(tfidf_matrix_test)
print('최종 정확도:', accuracy_score(test_df['label'], preds))