From f52bac0d4f2b112ee5fb42fca3f80275adf95dc5 Mon Sep 17 00:00:00 2001 From: EasternDay <849919718@qq.com> Date: Tue, 30 Jun 2020 11:08:38 +0800 Subject: [PATCH] =?UTF-8?q?=E7=95=A5=E4=BD=9C=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...1825101045-\346\235\250\347\245\211.ipynb" | 169 ++++++++++-------- 1 file changed, 92 insertions(+), 77 deletions(-) diff --git "a/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb" "b/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb" index 85fb073..a82e75f 100644 --- "a/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb" +++ "b/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb" @@ -95,8 +95,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-06-22T00:57:31.811677Z", - "start_time": "2020-06-22T00:57:31.407717Z" + "end_time": "2020-06-30T02:54:55.500263Z", + "start_time": "2020-06-30T02:54:54.444909Z" } }, "outputs": [], @@ -157,8 +157,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-06-22T00:57:31.901434Z", - "start_time": "2020-06-22T00:57:31.812674Z" + "end_time": "2020-06-30T02:54:55.580119Z", + "start_time": "2020-06-30T02:54:55.501297Z" } }, "outputs": [ @@ -301,8 +301,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-06-22T00:57:32.040039Z", - "start_time": "2020-06-22T00:57:31.903398Z" + "end_time": "2020-06-30T02:54:55.705015Z", + "start_time": "2020-06-30T02:54:55.581124Z" } }, "outputs": [ @@ -421,7 +421,7 @@ " lambda x: x.split(\"Includes:\")[0].replace(\"| About: \", \"\").split(\",\"))\n", "# 提取【针对公司】中的简写\n", "Content_data[\"针对公司\"] = Content_data[\"针对公司\"].map(\n", - " lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\")==-1])\n", + " lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\") == -1])\n", "\n", "# 显示前五行内容\n", "Content_data.head()" @@ -436,11 +436,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-06-22T00:58:16.698088Z", - "start_time": "2020-06-22T00:58:14.472842Z" + "end_time": "2020-06-30T02:58:07.744754Z", + "start_time": "2020-06-30T02:58:06.465822Z" } }, "outputs": [ @@ -466,76 +466,58 @@ " \n", " \n", " 标题\n", - " 针对公司_x\n", - " 发布时间_x\n", " 作者\n", " 评论数\n", - " 发布时间_y\n", - " 针对公司_y\n", " 摘要\n", " 针对公司\n", + " 发布时间\n", " \n", " \n", " \n", " \n", " Micron Technology: Insanely Cheap Stock Given Its High Earnings Quality\n", " Micron Technology: Insanely Cheap Stock Given ...\n", - " MU\n", - " Dec. 31, 2018, 7:57 PM\n", " Ruerd Heeg\n", " 75\n", - " Dec. 31, 2018 7:57 PM\n", - " [MU, MU, MU, MU, MU]\n", " SummaryLast year, a combination of relatively ...\n", - " [None]\n", + " MU\n", + " Dec. 31, 2018 7:57 PM\n", " \n", " \n", " Molson Coors Seems Attractive At These Valuations\n", " Molson Coors Seems Attractive At These Valuations\n", - " TAP\n", - " Dec. 31, 2018, 7:44 PM\n", " Sanjit Deepalam\n", " 16\n", - " Dec. 31, 2018 7:44 PM\n", - " [TAP, TAP, TAP]\n", " SummaryMolson Coors's stock has fallen over 30...\n", - " [None]\n", + " TAP\n", + " Dec. 31, 2018 7:44 PM\n", " \n", " \n", " Gerdau: The Brazilian Play On U.S. Steel\n", " Gerdau: The Brazilian Play On U.S. Steel\n", - " GGB\n", - " Dec. 31, 2018, 7:10 PM\n", " Shannon Bruce\n", " 1\n", - " Dec. 31, 2018 7:10 PM\n", - " [GGB, GGB, GGB]\n", " SummaryGerdau is delivering good results, incl...\n", - " [None]\n", + " GGB\n", + " Dec. 31, 2018 7:10 PM\n", " \n", " \n", " Will Apple Get Its Mojo Back?\n", " Will Apple Get Its Mojo Back?\n", - " AAPL\n", - " Dec. 31, 2018, 5:36 PM\n", " TipRanks\n", " 68\n", - " Dec. 31, 2018 5:36 PM\n", - " [AAPL, AAPL, AAPL]\n", " SummaryApple has been resting on a reputation ...\n", - " [None]\n", + " AAPL\n", + " Dec. 31, 2018 5:36 PM\n", " \n", " \n", " Lululemon Stock Looks Compelling On This Dip\n", " Lululemon Stock Looks Compelling On This Dip\n", - " LULU\n", - " Dec. 31, 2018, 5:26 PM\n", " L&F Capital Management\n", " 4\n", - " Dec. 31, 2018 5:26 PM\n", - " [LULU, LULU, LULU]\n", " SummaryLululemon stock had a strong 2018 but f...\n", - " [None]\n", + " LULU\n", + " Dec. 31, 2018 5:26 PM\n", " \n", " \n", "\n", @@ -549,20 +531,6 @@ "Will Apple Get Its Mojo Back? Will Apple Get Its Mojo Back? \n", "Lululemon Stock Looks Compelling On This Dip Lululemon Stock Looks Compelling On This Dip \n", "\n", - " 针对公司_x \\\n", - "Micron Technology: Insanely Cheap Stock Given I... MU \n", - "Molson Coors Seems Attractive At These Valuations TAP \n", - "Gerdau: The Brazilian Play On U.S. Steel GGB \n", - "Will Apple Get Its Mojo Back? AAPL \n", - "Lululemon Stock Looks Compelling On This Dip LULU \n", - "\n", - " 发布时间_x \\\n", - "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018, 7:57 PM \n", - "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018, 7:44 PM \n", - "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018, 7:10 PM \n", - "Will Apple Get Its Mojo Back? Dec. 31, 2018, 5:36 PM \n", - "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018, 5:26 PM \n", - "\n", " 作者 \\\n", "Micron Technology: Insanely Cheap Stock Given I... Ruerd Heeg \n", "Molson Coors Seems Attractive At These Valuations Sanjit Deepalam \n", @@ -577,20 +545,6 @@ "Will Apple Get Its Mojo Back? 68 \n", "Lululemon Stock Looks Compelling On This Dip 4 \n", "\n", - " 发布时间_y \\\n", - "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n", - "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n", - "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n", - "Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n", - "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM \n", - "\n", - " 针对公司_y \\\n", - "Micron Technology: Insanely Cheap Stock Given I... [MU, MU, MU, MU, MU] \n", - "Molson Coors Seems Attractive At These Valuations [TAP, TAP, TAP] \n", - "Gerdau: The Brazilian Play On U.S. Steel [GGB, GGB, GGB] \n", - "Will Apple Get Its Mojo Back? [AAPL, AAPL, AAPL] \n", - "Lululemon Stock Looks Compelling On This Dip [LULU, LULU, LULU] \n", - "\n", " 摘要 \\\n", "Micron Technology: Insanely Cheap Stock Given I... SummaryLast year, a combination of relatively ... \n", "Molson Coors Seems Attractive At These Valuations SummaryMolson Coors's stock has fallen over 30... \n", @@ -598,15 +552,22 @@ "Will Apple Get Its Mojo Back? SummaryApple has been resting on a reputation ... \n", "Lululemon Stock Looks Compelling On This Dip SummaryLululemon stock had a strong 2018 but f... \n", "\n", - " 针对公司 \n", - "Micron Technology: Insanely Cheap Stock Given I... [None] \n", - "Molson Coors Seems Attractive At These Valuations [None] \n", - "Gerdau: The Brazilian Play On U.S. Steel [None] \n", - "Will Apple Get Its Mojo Back? [None] \n", - "Lululemon Stock Looks Compelling On This Dip [None] " + " 针对公司 \\\n", + "Micron Technology: Insanely Cheap Stock Given I... MU \n", + "Molson Coors Seems Attractive At These Valuations TAP \n", + "Gerdau: The Brazilian Play On U.S. Steel GGB \n", + "Will Apple Get Its Mojo Back? AAPL \n", + "Lululemon Stock Looks Compelling On This Dip LULU \n", + "\n", + " 发布时间 \n", + "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n", + "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n", + "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n", + "Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n", + "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM " ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -616,17 +577,71 @@ " Title_data, Content_data, right_on='标题', left_index=True, how='outer')\n", "\n", "\n", - "def Update(x):\n", + "def UpdateCompany(x):\n", " try:\n", - " return np.unique(x[\"针对公司_y\"].append(x[\"针对公司_x\"]))\n", + " if(x[\"针对公司_x\"]):\n", + " return x[\"针对公司_x\"]\n", + " else:\n", + " return x[\"针对公司_y\"][0]\n", " except:\n", " return np.NAN\n", "\n", + "\n", + "def UpdateTime(x):\n", + " try:\n", + " return np.unique([x[\"发布时间_x\"].strip(), x[\"发布时间_y\"].strip()])[0]\n", + " except:\n", + " return np.NAN\n", + "\n", + "\n", "Title_Content_data['针对公司'] = Title_Content_data[['针对公司_x', '针对公司_y']].apply(\n", - " lambda x: Update(x), axis=1)\n", + " lambda x: UpdateCompany(x), axis=1)\n", + "\n", + "Title_Content_data['发布时间'] = Title_Content_data[[\"发布时间_x\", \"发布时间_y\"]].apply(\n", + " lambda x: UpdateTime(x), axis=1)\n", + "\n", + "# pd.to_datetime(df)\n", + "\n", + "# 删除无用列\n", + "Title_Content_data.drop([\"发布时间_x\", \"发布时间_y\"], axis=1, inplace=True)\n", + "Title_Content_data.drop([\"针对公司_x\", \"针对公司_y\"], axis=1, inplace=True)\n", + "\n", + "# 删除无用行\n", + "Title_Content_data.dropna(axis=0, how='any', subset=[\n", + " \"发布时间\", \"针对公司\", \"评论数\"], inplace=True)\n", "\n", "Title_Content_data.head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 回帖聚合\n", + "这是网友在各文章下的回复内容 \n", + "Title:各文章的标题;空标题的,用最靠近的有内容的下方标题 \n", + "Content:回复的全文字内容" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 创建一个空的 DataFrame\n", + "Reply_data = pd.DataFrame(columns=['字段', '标题1'])\n", + "\n", + "for root, dirs, files in os.walk(os.path.join(os.getcwd(), \"\")):\n", + " for file in files:\n", + " if(file.endswith('.xlsx')):\n", + " # 获取文件路径\n", + " data_xls = pd.read_excel(os.path.join(root, file), index_col=0)\n", + " data_xls.to_csv(os.path.join(root, file).replace(\n", + " '.xlsx', '.csv'), encoding='utf-8')\n", + " print(os.path.join(root, file), \"转化成功\")\n", + " os.remove(os.path.join(root, file))" + ] } ], "metadata": { -- GitLab