提交 f52bac0d 编写于 作者: 东方怂天's avatar 东方怂天

略作修改

上级 29cc6ce7
...@@ -95,8 +95,8 @@ ...@@ -95,8 +95,8 @@
"execution_count": 1, "execution_count": 1,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2020-06-22T00:57:31.811677Z", "end_time": "2020-06-30T02:54:55.500263Z",
"start_time": "2020-06-22T00:57:31.407717Z" "start_time": "2020-06-30T02:54:54.444909Z"
} }
}, },
"outputs": [], "outputs": [],
...@@ -157,8 +157,8 @@ ...@@ -157,8 +157,8 @@
"execution_count": 2, "execution_count": 2,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2020-06-22T00:57:31.901434Z", "end_time": "2020-06-30T02:54:55.580119Z",
"start_time": "2020-06-22T00:57:31.812674Z" "start_time": "2020-06-30T02:54:55.501297Z"
} }
}, },
"outputs": [ "outputs": [
...@@ -301,8 +301,8 @@ ...@@ -301,8 +301,8 @@
"execution_count": 3, "execution_count": 3,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2020-06-22T00:57:32.040039Z", "end_time": "2020-06-30T02:54:55.705015Z",
"start_time": "2020-06-22T00:57:31.903398Z" "start_time": "2020-06-30T02:54:55.581124Z"
} }
}, },
"outputs": [ "outputs": [
...@@ -421,7 +421,7 @@ ...@@ -421,7 +421,7 @@
" lambda x: x.split(\"Includes:\")[0].replace(\"| About: \", \"\").split(\",\"))\n", " lambda x: x.split(\"Includes:\")[0].replace(\"| About: \", \"\").split(\",\"))\n",
"# 提取【针对公司】中的简写\n", "# 提取【针对公司】中的简写\n",
"Content_data[\"针对公司\"] = Content_data[\"针对公司\"].map(\n", "Content_data[\"针对公司\"] = Content_data[\"针对公司\"].map(\n",
" lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\")==-1])\n", " lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\") == -1])\n",
"\n", "\n",
"# 显示前五行内容\n", "# 显示前五行内容\n",
"Content_data.head()" "Content_data.head()"
...@@ -436,11 +436,11 @@ ...@@ -436,11 +436,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 10,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2020-06-22T00:58:16.698088Z", "end_time": "2020-06-30T02:58:07.744754Z",
"start_time": "2020-06-22T00:58:14.472842Z" "start_time": "2020-06-30T02:58:06.465822Z"
} }
}, },
"outputs": [ "outputs": [
...@@ -466,76 +466,58 @@ ...@@ -466,76 +466,58 @@
" <tr style=\"text-align: right;\">\n", " <tr style=\"text-align: right;\">\n",
" <th></th>\n", " <th></th>\n",
" <th>标题</th>\n", " <th>标题</th>\n",
" <th>针对公司_x</th>\n",
" <th>发布时间_x</th>\n",
" <th>作者</th>\n", " <th>作者</th>\n",
" <th>评论数</th>\n", " <th>评论数</th>\n",
" <th>发布时间_y</th>\n",
" <th>针对公司_y</th>\n",
" <th>摘要</th>\n", " <th>摘要</th>\n",
" <th>针对公司</th>\n", " <th>针对公司</th>\n",
" <th>发布时间</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
" <tr>\n", " <tr>\n",
" <th>Micron Technology: Insanely Cheap Stock Given Its High Earnings Quality</th>\n", " <th>Micron Technology: Insanely Cheap Stock Given Its High Earnings Quality</th>\n",
" <td>Micron Technology: Insanely Cheap Stock Given ...</td>\n", " <td>Micron Technology: Insanely Cheap Stock Given ...</td>\n",
" <td>MU</td>\n",
" <td>Dec. 31, 2018, 7:57 PM</td>\n",
" <td>Ruerd Heeg</td>\n", " <td>Ruerd Heeg</td>\n",
" <td>75</td>\n", " <td>75</td>\n",
" <td>Dec. 31, 2018 7:57 PM</td>\n",
" <td>[MU, MU, MU, MU, MU]</td>\n",
" <td>SummaryLast year, a combination of relatively ...</td>\n", " <td>SummaryLast year, a combination of relatively ...</td>\n",
" <td>[None]</td>\n", " <td>MU</td>\n",
" <td>Dec. 31, 2018 7:57 PM</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>Molson Coors Seems Attractive At These Valuations</th>\n", " <th>Molson Coors Seems Attractive At These Valuations</th>\n",
" <td>Molson Coors Seems Attractive At These Valuations</td>\n", " <td>Molson Coors Seems Attractive At These Valuations</td>\n",
" <td>TAP</td>\n",
" <td>Dec. 31, 2018, 7:44 PM</td>\n",
" <td>Sanjit Deepalam</td>\n", " <td>Sanjit Deepalam</td>\n",
" <td>16</td>\n", " <td>16</td>\n",
" <td>Dec. 31, 2018 7:44 PM</td>\n",
" <td>[TAP, TAP, TAP]</td>\n",
" <td>SummaryMolson Coors's stock has fallen over 30...</td>\n", " <td>SummaryMolson Coors's stock has fallen over 30...</td>\n",
" <td>[None]</td>\n", " <td>TAP</td>\n",
" <td>Dec. 31, 2018 7:44 PM</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>Gerdau: The Brazilian Play On U.S. Steel</th>\n", " <th>Gerdau: The Brazilian Play On U.S. Steel</th>\n",
" <td>Gerdau: The Brazilian Play On U.S. Steel</td>\n", " <td>Gerdau: The Brazilian Play On U.S. Steel</td>\n",
" <td>GGB</td>\n",
" <td>Dec. 31, 2018, 7:10 PM</td>\n",
" <td>Shannon Bruce</td>\n", " <td>Shannon Bruce</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>Dec. 31, 2018 7:10 PM</td>\n",
" <td>[GGB, GGB, GGB]</td>\n",
" <td>SummaryGerdau is delivering good results, incl...</td>\n", " <td>SummaryGerdau is delivering good results, incl...</td>\n",
" <td>[None]</td>\n", " <td>GGB</td>\n",
" <td>Dec. 31, 2018 7:10 PM</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>Will Apple Get Its Mojo Back?</th>\n", " <th>Will Apple Get Its Mojo Back?</th>\n",
" <td>Will Apple Get Its Mojo Back?</td>\n", " <td>Will Apple Get Its Mojo Back?</td>\n",
" <td>AAPL</td>\n",
" <td>Dec. 31, 2018, 5:36 PM</td>\n",
" <td>TipRanks</td>\n", " <td>TipRanks</td>\n",
" <td>68</td>\n", " <td>68</td>\n",
" <td>Dec. 31, 2018 5:36 PM</td>\n",
" <td>[AAPL, AAPL, AAPL]</td>\n",
" <td>SummaryApple has been resting on a reputation ...</td>\n", " <td>SummaryApple has been resting on a reputation ...</td>\n",
" <td>[None]</td>\n", " <td>AAPL</td>\n",
" <td>Dec. 31, 2018 5:36 PM</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>Lululemon Stock Looks Compelling On This Dip</th>\n", " <th>Lululemon Stock Looks Compelling On This Dip</th>\n",
" <td>Lululemon Stock Looks Compelling On This Dip</td>\n", " <td>Lululemon Stock Looks Compelling On This Dip</td>\n",
" <td>LULU</td>\n",
" <td>Dec. 31, 2018, 5:26 PM</td>\n",
" <td>L&amp;F Capital Management</td>\n", " <td>L&amp;F Capital Management</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>Dec. 31, 2018 5:26 PM</td>\n",
" <td>[LULU, LULU, LULU]</td>\n",
" <td>SummaryLululemon stock had a strong 2018 but f...</td>\n", " <td>SummaryLululemon stock had a strong 2018 but f...</td>\n",
" <td>[None]</td>\n", " <td>LULU</td>\n",
" <td>Dec. 31, 2018 5:26 PM</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
...@@ -549,20 +531,6 @@ ...@@ -549,20 +531,6 @@
"Will Apple Get Its Mojo Back? Will Apple Get Its Mojo Back? \n", "Will Apple Get Its Mojo Back? Will Apple Get Its Mojo Back? \n",
"Lululemon Stock Looks Compelling On This Dip Lululemon Stock Looks Compelling On This Dip \n", "Lululemon Stock Looks Compelling On This Dip Lululemon Stock Looks Compelling On This Dip \n",
"\n", "\n",
" 针对公司_x \\\n",
"Micron Technology: Insanely Cheap Stock Given I... MU \n",
"Molson Coors Seems Attractive At These Valuations TAP \n",
"Gerdau: The Brazilian Play On U.S. Steel GGB \n",
"Will Apple Get Its Mojo Back? AAPL \n",
"Lululemon Stock Looks Compelling On This Dip LULU \n",
"\n",
" 发布时间_x \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018, 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018, 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018, 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018, 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018, 5:26 PM \n",
"\n",
" 作者 \\\n", " 作者 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Ruerd Heeg \n", "Micron Technology: Insanely Cheap Stock Given I... Ruerd Heeg \n",
"Molson Coors Seems Attractive At These Valuations Sanjit Deepalam \n", "Molson Coors Seems Attractive At These Valuations Sanjit Deepalam \n",
...@@ -577,20 +545,6 @@ ...@@ -577,20 +545,6 @@
"Will Apple Get Its Mojo Back? 68 \n", "Will Apple Get Its Mojo Back? 68 \n",
"Lululemon Stock Looks Compelling On This Dip 4 \n", "Lululemon Stock Looks Compelling On This Dip 4 \n",
"\n", "\n",
" 发布时间_y \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM \n",
"\n",
" 针对公司_y \\\n",
"Micron Technology: Insanely Cheap Stock Given I... [MU, MU, MU, MU, MU] \n",
"Molson Coors Seems Attractive At These Valuations [TAP, TAP, TAP] \n",
"Gerdau: The Brazilian Play On U.S. Steel [GGB, GGB, GGB] \n",
"Will Apple Get Its Mojo Back? [AAPL, AAPL, AAPL] \n",
"Lululemon Stock Looks Compelling On This Dip [LULU, LULU, LULU] \n",
"\n",
" 摘要 \\\n", " 摘要 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... SummaryLast year, a combination of relatively ... \n", "Micron Technology: Insanely Cheap Stock Given I... SummaryLast year, a combination of relatively ... \n",
"Molson Coors Seems Attractive At These Valuations SummaryMolson Coors's stock has fallen over 30... \n", "Molson Coors Seems Attractive At These Valuations SummaryMolson Coors's stock has fallen over 30... \n",
...@@ -598,15 +552,22 @@ ...@@ -598,15 +552,22 @@
"Will Apple Get Its Mojo Back? SummaryApple has been resting on a reputation ... \n", "Will Apple Get Its Mojo Back? SummaryApple has been resting on a reputation ... \n",
"Lululemon Stock Looks Compelling On This Dip SummaryLululemon stock had a strong 2018 but f... \n", "Lululemon Stock Looks Compelling On This Dip SummaryLululemon stock had a strong 2018 but f... \n",
"\n", "\n",
" 针对公司 \n", " 针对公司 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... [None] \n", "Micron Technology: Insanely Cheap Stock Given I... MU \n",
"Molson Coors Seems Attractive At These Valuations [None] \n", "Molson Coors Seems Attractive At These Valuations TAP \n",
"Gerdau: The Brazilian Play On U.S. Steel [None] \n", "Gerdau: The Brazilian Play On U.S. Steel GGB \n",
"Will Apple Get Its Mojo Back? [None] \n", "Will Apple Get Its Mojo Back? AAPL \n",
"Lululemon Stock Looks Compelling On This Dip [None] " "Lululemon Stock Looks Compelling On This Dip LULU \n",
"\n",
" 发布时间 \n",
"Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
"Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
"Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
"Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
"Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM "
] ]
}, },
"execution_count": 5, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -616,17 +577,71 @@ ...@@ -616,17 +577,71 @@
" Title_data, Content_data, right_on='标题', left_index=True, how='outer')\n", " Title_data, Content_data, right_on='标题', left_index=True, how='outer')\n",
"\n", "\n",
"\n", "\n",
"def Update(x):\n", "def UpdateCompany(x):\n",
" try:\n", " try:\n",
" return np.unique(x[\"针对公司_y\"].append(x[\"针对公司_x\"]))\n", " if(x[\"针对公司_x\"]):\n",
" return x[\"针对公司_x\"]\n",
" else:\n",
" return x[\"针对公司_y\"][0]\n",
" except:\n", " except:\n",
" return np.NAN\n", " return np.NAN\n",
"\n", "\n",
"\n",
"def UpdateTime(x):\n",
" try:\n",
" return np.unique([x[\"发布时间_x\"].strip(), x[\"发布时间_y\"].strip()])[0]\n",
" except:\n",
" return np.NAN\n",
"\n",
"\n",
"Title_Content_data['针对公司'] = Title_Content_data[['针对公司_x', '针对公司_y']].apply(\n", "Title_Content_data['针对公司'] = Title_Content_data[['针对公司_x', '针对公司_y']].apply(\n",
" lambda x: Update(x), axis=1)\n", " lambda x: UpdateCompany(x), axis=1)\n",
"\n",
"Title_Content_data['发布时间'] = Title_Content_data[[\"发布时间_x\", \"发布时间_y\"]].apply(\n",
" lambda x: UpdateTime(x), axis=1)\n",
"\n",
"# pd.to_datetime(df)\n",
"\n",
"# 删除无用列\n",
"Title_Content_data.drop([\"发布时间_x\", \"发布时间_y\"], axis=1, inplace=True)\n",
"Title_Content_data.drop([\"针对公司_x\", \"针对公司_y\"], axis=1, inplace=True)\n",
"\n",
"# 删除无用行\n",
"Title_Content_data.dropna(axis=0, how='any', subset=[\n",
" \"发布时间\", \"针对公司\", \"评论数\"], inplace=True)\n",
"\n", "\n",
"Title_Content_data.head()" "Title_Content_data.head()"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.2 回帖聚合\n",
"这是网友在各文章下的回复内容 \n",
"Title:各文章的标题;空标题的,用最靠近的有内容的下方标题 \n",
"Content:回复的全文字内容"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 创建一个空的 DataFrame\n",
"Reply_data = pd.DataFrame(columns=['字段', '标题1'])\n",
"\n",
"for root, dirs, files in os.walk(os.path.join(os.getcwd(), \"\")):\n",
" for file in files:\n",
" if(file.endswith('.xlsx')):\n",
" # 获取文件路径\n",
" data_xls = pd.read_excel(os.path.join(root, file), index_col=0)\n",
" data_xls.to_csv(os.path.join(root, file).replace(\n",
" '.xlsx', '.csv'), encoding='utf-8')\n",
" print(os.path.join(root, file), \"转化成功\")\n",
" os.remove(os.path.join(root, file))"
]
} }
], ],
"metadata": { "metadata": {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册