diff --git "a/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\354\225\210\354\204\234\354\230\201.ipynb" "b/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\354\225\210\354\204\234\354\230\201.ipynb"
new file mode 100644
index 0000000..3add843
--- /dev/null
+++ "b/Week12_\353\263\265\354\212\265\352\263\274\354\240\234_\354\225\210\354\204\234\354\230\201.ipynb"
@@ -0,0 +1,450 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "69255bb6-215c-4d7b-9690-5608080cff14",
+   "metadata": {},
+   "source": [
+    "# **8.4 텍스트 분류 실습 _ 20 뉴스그룹 분류**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72225395-4610-4bb2-a366-ff81483b66ac",
+   "metadata": {},
+   "source": [
+    "## 텍스트 정규화"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "10dc285e-8598-48e2-a3e4-3b17baeb9702",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "컴퓨터에 있는 하위 파일들을 다이렉트로 읽어오는 중...\n",
+      "🎉 드디어 강제 돌파 성공! 데이터 개수: 18846개\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "from sklearn.datasets import get_data_home, load_files\n",
+    "from sklearn.utils import Bunch\n",
+    "\n",
+    "# 1. Locate the already-extracted train/test folders. get_data_home() returns\n",
+    "#    scikit-learn's data directory (default ~/scikit_learn_data, overridable with the\n",
+    "#    SCIKIT_LEARN_DATA environment variable), so no user-specific absolute path is hardcoded.\n",
+    "base_dir = os.path.join(get_data_home(), '20news_home')\n",
+    "train_dir = os.path.join(base_dir, '20news-bydate-train')\n",
+    "test_dir = os.path.join(base_dir, '20news-bydate-test')\n",
+    "\n",
+    "print(\"컴퓨터에 있는 하위 파일들을 다이렉트로 읽어오는 중...\")\n",
+    "\n",
+    "# 2. Read both folders straight from disk with load_files (no network access).\n",
+    "train_bunch = load_files(train_dir, encoding='latin1', decode_error='replace')\n",
+    "test_bunch = load_files(test_dir, encoding='latin1', decode_error='replace')\n",
+    "\n",
+    "# 3. Merge train and test into a single Bunch named news_data. The targets are\n",
+    "#    concatenated into one NumPy array so the labels stay array-like after merging.\n",
+    "news_data = Bunch(\n",
+    "    data=train_bunch.data + test_bunch.data,\n",
+    "    target=np.concatenate([train_bunch.target, test_bunch.target]),\n",
+    "    target_names=train_bunch.target_names,\n",
+    "    DESCR=train_bunch.DESCR,\n",
+    ")\n",
+    "\n",
+    "print(f\"🎉 드디어 강제 돌파 성공! 데이터 개수: {len(news_data.data)}개\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e4f1336f-0a46-490d-ab04-c1d4119eb104",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_keys(['data', 'target', 'target_names', 'DESCR'])\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show which fields the merged Bunch exposes.\n",
+    "bunch_keys = news_data.keys()\n",
+    "print(bunch_keys)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b74907ac-899b-447d-b590-88c7b6b6840f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "target 클래스의 값과 분포도 \n",
+      " 0     799\n",
+      "1     973\n",
+      "2     985\n",
+      "3     982\n",
+      "4     963\n",
+      "5     988\n",
+      "6     975\n",
+      "7     990\n",
+      "8     996\n",
+      "9     994\n",
+      "10    999\n",
+      "11    991\n",
+      "12    984\n",
+      "13    990\n",
+      "14    987\n",
+      "15    997\n",
+      "16    910\n",
+      "17    940\n",
+      "18    775\n",
+      "19    628\n",
+      "Name: count, dtype: int64\n",
+      "target 클래스의 이름들 \n",
+      " ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Number of documents per class label, ordered by label index.\n",
+    "class_counts = pd.Series(news_data.target).value_counts().sort_index()\n",
+    "print('target 클래스의 값과 분포도 \\n', class_counts)\n",
+    "print('target 클래스의 이름들 \\n', news_data.target_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fac4d712-2f94-4079-8582-e44bb938f4ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "From: cubbie@garnet.berkeley.edu (                               )\n",
+      "Subject: Re: Cubs behind Marlins? How?\n",
+      "Article-I.D.: agate.1pt592$f9a\n",
+      "Organization: University of California, Berkeley\n",
+      "Lines: 12\n",
+      "NNTP-Posting-Host: garnet.berkeley.edu\n",
+      "\n",
+      "\n",
+      "gajarsky@pilot.njin.net writes:\n",
+      "\n",
+      "morgan and guzman will have era's 1 run higher than last year, and\n",
+      " the cubs will be idiots and not pitch harkey as much as hibbard.\n",
+      " castillo won't be good (i think he's a stud pitcher)\n",
+      "\n",
+      "       This season so far, Morgan and Guzman helped to lead the Cubs\n",
+      "       at top in ERA, even better than THE rotation at Atlanta.\n",
+      "       Cubs ERA at 0.056 while Braves at 0.059. We know it is early\n",
+      "       in the season, we Cubs fans have learned how to enjoy the\n",
+      "       short triumph while it is still there.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the first document of the merged data as-is.\n",
+    "first_post = news_data.data[0]\n",
+    "print(first_post)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e844be0c-e76f-4b07-b85c-c64f6f669a6d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'list'>\n",
+      "학습 데이터 크기 11314 , 테스트 데이터 크기 7532\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "\n",
+    "# Passed as remove=... to both calls so only the message bodies are kept.\n",
+    "stripped_parts = ('headers', 'footers', 'quotes')\n",
+    "\n",
+    "# Training split (subset='train').\n",
+    "train_news = fetch_20newsgroups(subset='train', remove=stripped_parts, random_state=156)\n",
+    "X_train, y_train = train_news.data, train_news.target\n",
+    "print(type(X_train))\n",
+    "\n",
+    "# Test split (subset='test'), stripped the same way.\n",
+    "test_news = fetch_20newsgroups(subset='test', remove=stripped_parts, random_state=156)\n",
+    "X_test, y_test = test_news.data, test_news.target\n",
+    "print('학습 데이터 크기 {0} , 테스트 데이터 크기 {1}'.format(len(X_train), len(X_test)))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29c20b15-1da8-4976-97a7-255cf1dfa86e",
+   "metadata": {},
+   "source": [
+    "## 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "3560c193-0455-4d67-b741-c84a247cc528",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "학습 데이터 Text의 CountVectorizer Shape: (11314, 101631)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "\n",
+    "# Learn the vocabulary on the training text and build its count matrix in one step.\n",
+    "cnt_vect = CountVectorizer()\n",
+    "X_train_cnt_vect = cnt_vect.fit_transform(X_train)\n",
+    "\n",
+    "# The test text is only transformed, reusing the vectorizer fitted on the training data.\n",
+    "X_test_cnt_vect = cnt_vect.transform(X_test)\n",
+    "\n",
+    "print('학습 데이터 Text의 CountVectorizer Shape:', X_train_cnt_vect.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e2ec03f8-67c6-4217-94b8-66b5d93112dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CountVectorized Logistic Regression 의 예측 정확도는 0.617\n"
+     ]
+    }
+   ],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "# NOTE: no category is given, so this silences every warning from here on.\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "# Train logistic regression on the count vectors, then score it on the test set.\n",
+    "lr_clf = LogisticRegression(solver='liblinear')\n",
+    "lr_clf.fit(X_train_cnt_vect, y_train)\n",
+    "pred = lr_clf.predict(X_test_cnt_vect)\n",
+    "cnt_accuracy = accuracy_score(y_test, pred)\n",
+    "print('CountVectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(cnt_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7c86c7ba-f760-45ca-ab66-c4e4be84b1e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TF-IDF Logistic Regression 의 예측 정확도는 0.678\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Fit TF-IDF on the training text, then vectorize the train and test sets with it.\n",
+    "tfidf_vect = TfidfVectorizer().fit(X_train)\n",
+    "X_train_tfidf_vect = tfidf_vect.transform(X_train)\n",
+    "X_test_tfidf_vect = tfidf_vect.transform(X_test)\n",
+    "\n",
+    "# Train and evaluate logistic regression on the TF-IDF features.\n",
+    "lr_clf = LogisticRegression(solver='liblinear').fit(X_train_tfidf_vect, y_train)\n",
+    "pred = lr_clf.predict(X_test_tfidf_vect)\n",
+    "tfidf_accuracy = accuracy_score(y_test, pred)\n",
+    "print('TF-IDF Logistic Regression 의 예측 정확도는 {0:.3f}'.format(tfidf_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "3ef52dd2-69fa-4749-bef4-f3386c122d7c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.690\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Add English stop-word filtering, widen ngram_range from the default (1,1) to (1,2),\n",
+    "# and pass max_df=300 before vectorizing again.\n",
+    "tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300).fit(X_train)\n",
+    "X_train_tfidf_vect = tfidf_vect.transform(X_train)\n",
+    "X_test_tfidf_vect = tfidf_vect.transform(X_test)\n",
+    "\n",
+    "lr_clf = LogisticRegression(solver='liblinear').fit(X_train_tfidf_vect, y_train)\n",
+    "pred = lr_clf.predict(X_test_tfidf_vect)\n",
+    "ngram_accuracy = accuracy_score(y_test, pred)\n",
+    "print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(ngram_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "e27c5266-cc72-44f0-bc8e-707e2995c0b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n",
+      "Logistic Regression best C parameter : {'C': 10}\n",
+      "TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.704\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "\n",
+    "# Search over candidate C values with 3-fold cross-validation.\n",
+    "params = {'C': [0.01, 0.1, 1, 5, 10]}\n",
+    "grid_cv_lr = GridSearchCV(\n",
+    "    lr_clf,\n",
+    "    param_grid=params,\n",
+    "    cv=3,\n",
+    "    scoring='accuracy',\n",
+    "    verbose=1,\n",
+    ")\n",
+    "grid_cv_lr.fit(X_train_tfidf_vect, y_train)\n",
+    "print('Logistic Regression best C parameter :', grid_cv_lr.best_params_)\n",
+    "\n",
+    "# Predict with the fitted grid-search object and report test accuracy.\n",
+    "pred = grid_cv_lr.predict(X_test_tfidf_vect)\n",
+    "grid_accuracy = accuracy_score(y_test, pred)\n",
+    "print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(grid_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df7454ab-4446-4ba7-86e4-b1d7fde0050a",
+   "metadata": {},
+   "source": [
+    "## 사이킷런 파이프라인(Pipeline) 사용 및 GridSearchCV와의 결합"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "2d18bb46-729c-492d-b39b-be44d48fba75",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pipeline을 통한 Logistic Regression 의 예측 정확도는 0.704\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.pipeline import Pipeline\n",
+    "\n",
+    "# Chain the TF-IDF vectorizer (step name 'tfidf_vect') and the classifier (step name 'lr_clf').\n",
+    "pipeline_steps = [\n",
+    "    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)),\n",
+    "    ('lr_clf', LogisticRegression(solver='liblinear', C=10)),\n",
+    "]\n",
+    "pipeline = Pipeline(pipeline_steps)\n",
+    "\n",
+    "# The pipeline takes the raw text directly: a single fit() and predict() cover both\n",
+    "# the vectorization and the model, so no separate vectorizer calls are needed here.\n",
+    "pipeline.fit(X_train, y_train)\n",
+    "pred = pipeline.predict(X_test)\n",
+    "pipeline_accuracy = accuracy_score(y_test, pred)\n",
+    "print('Pipeline을 통한 Logistic Regression 의 예측 정확도는 {0:.3f}'.format(pipeline_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "2a781e44-e8c2-47d8-9d66-eaf96c496c47",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fitting 3 folds for each of 27 candidates, totalling 81 fits\n",
+      "{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.7550828826229531\n",
+      "Pipeline을 통한 Logistic Regression 의 예측 정확도는 0.702\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.pipeline import Pipeline\n",
+    "\n",
+    "# Base pipeline; the values being tuned are supplied by the grid below.\n",
+    "pipeline = Pipeline(steps=[\n",
+    "    ('tfidf_vect', TfidfVectorizer(stop_words='english')),\n",
+    "    ('lr_clf', LogisticRegression(solver='liblinear')),\n",
+    "])\n",
+    "\n",
+    "# Grid keys are written as '<step name>__<parameter>' (two underscores) so each value\n",
+    "# is addressed to the matching pipeline step.\n",
+    "params = {\n",
+    "    'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],\n",
+    "    'tfidf_vect__max_df': [100, 300, 700],\n",
+    "    'lr_clf__C': [1, 5, 10],\n",
+    "}\n",
+    "\n",
+    "# Hand the Pipeline object itself (rather than a bare estimator) to GridSearchCV.\n",
+    "grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)\n",
+    "grid_cv_pipe.fit(X_train, y_train)\n",
+    "print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)\n",
+    "\n",
+    "pred = grid_cv_pipe.predict(X_test)\n",
+    "pipe_grid_accuracy = accuracy_score(y_test, pred)\n",
+    "print('Pipeline을 통한 Logistic Regression 의 예측 정확도는 {0:.3f}'.format(pipe_grid_accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10e2e7d6-b559-4b72-8b1a-5c534fec28ee",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.25"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}