提交 f52bac0d 编写于 作者: 东方怂天's avatar 东方怂天

略作修改

上级 29cc6ce7
......@@ -95,8 +95,8 @@
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-22T00:57:31.811677Z",
"start_time": "2020-06-22T00:57:31.407717Z"
"end_time": "2020-06-30T02:54:55.500263Z",
"start_time": "2020-06-30T02:54:54.444909Z"
}
},
"outputs": [],
......@@ -157,8 +157,8 @@
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-22T00:57:31.901434Z",
"start_time": "2020-06-22T00:57:31.812674Z"
"end_time": "2020-06-30T02:54:55.580119Z",
"start_time": "2020-06-30T02:54:55.501297Z"
}
},
"outputs": [
......@@ -301,8 +301,8 @@
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-22T00:57:32.040039Z",
"start_time": "2020-06-22T00:57:31.903398Z"
"end_time": "2020-06-30T02:54:55.705015Z",
"start_time": "2020-06-30T02:54:55.581124Z"
}
},
"outputs": [
......@@ -421,7 +421,7 @@
" lambda x: x.split(\"Includes:\")[0].replace(\"| About: \", \"\").split(\",\"))\n",
"# 提取【针对公司】中的简写\n",
"Content_data[\"针对公司\"] = Content_data[\"针对公司\"].map(\n",
" lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\")==-1])\n",
" lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\") == -1])\n",
"\n",
"# 显示前五行内容\n",
"Content_data.head()"
......@@ -436,11 +436,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-22T00:58:16.698088Z",
"start_time": "2020-06-22T00:58:14.472842Z"
"end_time": "2020-06-30T02:58:07.744754Z",
"start_time": "2020-06-30T02:58:06.465822Z"
}
},
"outputs": [
......@@ -466,76 +466,58 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>标题</th>\n",
" <th>针对公司_x</th>\n",
" <th>发布时间_x</th>\n",
" <th>作者</th>\n",
" <th>评论数</th>\n",
" <th>发布时间_y</th>\n",
" <th>针对公司_y</th>\n",
" <th>摘要</th>\n",
" <th>针对公司</th>\n",
" <th>发布时间</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Micron Technology: Insanely Cheap Stock Given Its High Earnings Quality</th>\n",
" <td>Micron Technology: Insanely Cheap Stock Given ...</td>\n",
" <td>MU</td>\n",
" <td>Dec. 31, 2018, 7:57 PM</td>\n",
" <td>Ruerd Heeg</td>\n",
" <td>75</td>\n",
" <td>Dec. 31, 2018 7:57 PM</td>\n",
" <td>[MU, MU, MU, MU, MU]</td>\n",
" <td>SummaryLast year, a combination of relatively ...</td>\n",
" <td>[None]</td>\n",
" <td>MU</td>\n",
" <td>Dec. 31, 2018 7:57 PM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Molson Coors Seems Attractive At These Valuations</th>\n",
" <td>Molson Coors Seems Attractive At These Valuations</td>\n",
" <td>TAP</td>\n",
" <td>Dec. 31, 2018, 7:44 PM</td>\n",
" <td>Sanjit Deepalam</td>\n",
" <td>16</td>\n",
" <td>Dec. 31, 2018 7:44 PM</td>\n",
" <td>[TAP, TAP, TAP]</td>\n",
" <td>SummaryMolson Coors's stock has fallen over 30...</td>\n",
" <td>[None]</td>\n",
" <td>TAP</td>\n",
" <td>Dec. 31, 2018 7:44 PM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gerdau: The Brazilian Play On U.S. Steel</th>\n",
" <td>Gerdau: The Brazilian Play On U.S. Steel</td>\n",
" <td>GGB</td>\n",
" <td>Dec. 31, 2018, 7:10 PM</td>\n",
" <td>Shannon Bruce</td>\n",
" <td>1</td>\n",
" <td>Dec. 31, 2018 7:10 PM</td>\n",
" <td>[GGB, GGB, GGB]</td>\n",
" <td>SummaryGerdau is delivering good results, incl...</td>\n",
" <td>[None]</td>\n",
" <td>GGB</td>\n",
" <td>Dec. 31, 2018 7:10 PM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Will Apple Get Its Mojo Back?</th>\n",
" <td>Will Apple Get Its Mojo Back?</td>\n",
" <td>AAPL</td>\n",
" <td>Dec. 31, 2018, 5:36 PM</td>\n",
" <td>TipRanks</td>\n",
" <td>68</td>\n",
" <td>Dec. 31, 2018 5:36 PM</td>\n",
" <td>[AAPL, AAPL, AAPL]</td>\n",
" <td>SummaryApple has been resting on a reputation ...</td>\n",
" <td>[None]</td>\n",
" <td>AAPL</td>\n",
" <td>Dec. 31, 2018 5:36 PM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lululemon Stock Looks Compelling On This Dip</th>\n",
" <td>Lululemon Stock Looks Compelling On This Dip</td>\n",
" <td>LULU</td>\n",
" <td>Dec. 31, 2018, 5:26 PM</td>\n",
" <td>L&amp;F Capital Management</td>\n",
" <td>4</td>\n",
" <td>Dec. 31, 2018 5:26 PM</td>\n",
" <td>[LULU, LULU, LULU]</td>\n",
" <td>SummaryLululemon stock had a strong 2018 but f...</td>\n",
" <td>[None]</td>\n",
" <td>LULU</td>\n",
" <td>Dec. 31, 2018 5:26 PM</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
......@@ -549,20 +531,6 @@
"Will Apple Get Its Mojo Back? Will Apple Get Its Mojo Back? \n",
"Lululemon Stock Looks Compelling On This Dip Lululemon Stock Looks Compelling On This Dip \n",
"\n",
" 针对公司_x \\\n",
"Micron Technology: Insanely Cheap Stock Given I... MU \n",
"Molson Coors Seems Attractive At These Valuations TAP \n",
"Gerdau: The Brazilian Play On U.S. Steel GGB \n",
"Will Apple Get Its Mojo Back? AAPL \n",
"Lululemon Stock Looks Compelling On This Dip LULU \n",
"\n",
" 发布时间_x \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018, 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018, 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018, 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018, 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018, 5:26 PM \n",
"\n",
" 作者 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Ruerd Heeg \n",
"Molson Coors Seems Attractive At These Valuations Sanjit Deepalam \n",
......@@ -577,20 +545,6 @@
"Will Apple Get Its Mojo Back? 68 \n",
"Lululemon Stock Looks Compelling On This Dip 4 \n",
"\n",
" 发布时间_y \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM \n",
"\n",
" 针对公司_y \\\n",
"Micron Technology: Insanely Cheap Stock Given I... [MU, MU, MU, MU, MU] \n",
"Molson Coors Seems Attractive At These Valuations [TAP, TAP, TAP] \n",
"Gerdau: The Brazilian Play On U.S. Steel [GGB, GGB, GGB] \n",
"Will Apple Get Its Mojo Back? [AAPL, AAPL, AAPL] \n",
"Lululemon Stock Looks Compelling On This Dip [LULU, LULU, LULU] \n",
"\n",
" 摘要 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... SummaryLast year, a combination of relatively ... \n",
"Molson Coors Seems Attractive At These Valuations SummaryMolson Coors's stock has fallen over 30... \n",
......@@ -598,15 +552,22 @@
"Will Apple Get Its Mojo Back? SummaryApple has been resting on a reputation ... \n",
"Lululemon Stock Looks Compelling On This Dip SummaryLululemon stock had a strong 2018 but f... \n",
"\n",
" 针对公司 \n",
"Micron Technology: Insanely Cheap Stock Given I... [None] \n",
"Molson Coors Seems Attractive At These Valuations [None] \n",
"Gerdau: The Brazilian Play On U.S. Steel [None] \n",
"Will Apple Get Its Mojo Back? [None] \n",
"Lululemon Stock Looks Compelling On This Dip [None] "
" 针对公司 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... MU \n",
"Molson Coors Seems Attractive At These Valuations TAP \n",
"Gerdau: The Brazilian Play On U.S. Steel GGB \n",
"Will Apple Get Its Mojo Back? AAPL \n",
"Lululemon Stock Looks Compelling On This Dip LULU \n",
"\n",
" 发布时间 \n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM "
]
},
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
......@@ -616,17 +577,71 @@
" Title_data, Content_data, right_on='标题', left_index=True, how='outer')\n",
"\n",
"\n",
"def Update(x):\n",
"def UpdateCompany(x):\n",
" try:\n",
" return np.unique(x[\"针对公司_y\"].append(x[\"针对公司_x\"]))\n",
" if(x[\"针对公司_x\"]):\n",
" return x[\"针对公司_x\"]\n",
" else:\n",
" return x[\"针对公司_y\"][0]\n",
" except:\n",
" return np.NAN\n",
"\n",
"\n",
"def UpdateTime(x):\n",
" try:\n",
" return np.unique([x[\"发布时间_x\"].strip(), x[\"发布时间_y\"].strip()])[0]\n",
" except:\n",
" return np.NAN\n",
"\n",
"\n",
"Title_Content_data['针对公司'] = Title_Content_data[['针对公司_x', '针对公司_y']].apply(\n",
" lambda x: Update(x), axis=1)\n",
" lambda x: UpdateCompany(x), axis=1)\n",
"\n",
"Title_Content_data['发布时间'] = Title_Content_data[[\"发布时间_x\", \"发布时间_y\"]].apply(\n",
" lambda x: UpdateTime(x), axis=1)\n",
"\n",
"# pd.to_datetime(df)\n",
"\n",
"# 删除无用列\n",
"Title_Content_data.drop([\"发布时间_x\", \"发布时间_y\"], axis=1, inplace=True)\n",
"Title_Content_data.drop([\"针对公司_x\", \"针对公司_y\"], axis=1, inplace=True)\n",
"\n",
"# 删除无用行\n",
"Title_Content_data.dropna(axis=0, how='any', subset=[\n",
" \"发布时间\", \"针对公司\", \"评论数\"], inplace=True)\n",
"\n",
"Title_Content_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.2 回帖聚合\n",
"这是网友在各文章下的回复内容 \n",
"Title:各文章的标题;空标题的,用最靠近的有内容的下方标题 \n",
"Content:回复的全文字内容"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 创建一个空的 DataFrame\n",
"Reply_data = pd.DataFrame(columns=['字段', '标题1'])\n",
"\n",
"for root, dirs, files in os.walk(os.path.join(os.getcwd(), \"\")):\n",
" for file in files:\n",
" if(file.endswith('.xlsx')):\n",
" # 获取文件路径\n",
" data_xls = pd.read_excel(os.path.join(root, file), index_col=0)\n",
" data_xls.to_csv(os.path.join(root, file).replace(\n",
" '.xlsx', '.csv'), encoding='utf-8')\n",
" print(os.path.join(root, file), \"转化成功\")\n",
" os.remove(os.path.join(root, file))"
]
}
],
"metadata": {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册