From f52bac0d4f2b112ee5fb42fca3f80275adf95dc5 Mon Sep 17 00:00:00 2001
From: EasternDay <849919718@qq.com>
Date: Tue, 30 Jun 2020 11:08:38 +0800
Subject: [PATCH] =?UTF-8?q?=E7=95=A5=E4=BD=9C=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
...1825101045-\346\235\250\347\245\211.ipynb" | 169 ++++++++++--------
1 file changed, 92 insertions(+), 77 deletions(-)
diff --git "a/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb" "b/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb"
index 85fb073..a82e75f 100644
--- "a/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb"
+++ "b/\344\270\200\347\217\255-1825101045-\346\235\250\347\245\211.ipynb"
@@ -95,8 +95,8 @@
"execution_count": 1,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-06-22T00:57:31.811677Z",
- "start_time": "2020-06-22T00:57:31.407717Z"
+ "end_time": "2020-06-30T02:54:55.500263Z",
+ "start_time": "2020-06-30T02:54:54.444909Z"
}
},
"outputs": [],
@@ -157,8 +157,8 @@
"execution_count": 2,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-06-22T00:57:31.901434Z",
- "start_time": "2020-06-22T00:57:31.812674Z"
+ "end_time": "2020-06-30T02:54:55.580119Z",
+ "start_time": "2020-06-30T02:54:55.501297Z"
}
},
"outputs": [
@@ -301,8 +301,8 @@
"execution_count": 3,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-06-22T00:57:32.040039Z",
- "start_time": "2020-06-22T00:57:31.903398Z"
+ "end_time": "2020-06-30T02:54:55.705015Z",
+ "start_time": "2020-06-30T02:54:55.581124Z"
}
},
"outputs": [
@@ -421,7 +421,7 @@
" lambda x: x.split(\"Includes:\")[0].replace(\"| About: \", \"\").split(\",\"))\n",
"# 提取【针对公司】中的简写\n",
"Content_data[\"针对公司\"] = Content_data[\"针对公司\"].map(\n",
- " lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\")==-1])\n",
+ " lambda x: [\"\".join(re.findall('[(](.*?)[)]', i, re.S)) for i in x if not i.find(\"(\") == -1])\n",
"\n",
"# 显示前五行内容\n",
"Content_data.head()"
@@ -436,11 +436,11 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 10,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-06-22T00:58:16.698088Z",
- "start_time": "2020-06-22T00:58:14.472842Z"
+ "end_time": "2020-06-30T02:58:07.744754Z",
+ "start_time": "2020-06-30T02:58:06.465822Z"
}
},
"outputs": [
@@ -466,76 +466,58 @@
"
\n",
" | \n",
" 标题 | \n",
- " 针对公司_x | \n",
- " 发布时间_x | \n",
" 作者 | \n",
" 评论数 | \n",
- " 发布时间_y | \n",
- " 针对公司_y | \n",
" 摘要 | \n",
" 针对公司 | \n",
+ " 发布时间 | \n",
"
\n",
" \n",
" \n",
" \n",
" Micron Technology: Insanely Cheap Stock Given Its High Earnings Quality | \n",
" Micron Technology: Insanely Cheap Stock Given ... | \n",
- " MU | \n",
- " Dec. 31, 2018, 7:57 PM | \n",
" Ruerd Heeg | \n",
" 75 | \n",
- " Dec. 31, 2018 7:57 PM | \n",
- " [MU, MU, MU, MU, MU] | \n",
" SummaryLast year, a combination of relatively ... | \n",
- " [None] | \n",
+ " MU | \n",
+ " Dec. 31, 2018 7:57 PM | \n",
"
\n",
" \n",
" Molson Coors Seems Attractive At These Valuations | \n",
" Molson Coors Seems Attractive At These Valuations | \n",
- " TAP | \n",
- " Dec. 31, 2018, 7:44 PM | \n",
" Sanjit Deepalam | \n",
" 16 | \n",
- " Dec. 31, 2018 7:44 PM | \n",
- " [TAP, TAP, TAP] | \n",
" SummaryMolson Coors's stock has fallen over 30... | \n",
- " [None] | \n",
+ " TAP | \n",
+ " Dec. 31, 2018 7:44 PM | \n",
"
\n",
" \n",
" Gerdau: The Brazilian Play On U.S. Steel | \n",
" Gerdau: The Brazilian Play On U.S. Steel | \n",
- " GGB | \n",
- " Dec. 31, 2018, 7:10 PM | \n",
" Shannon Bruce | \n",
" 1 | \n",
- " Dec. 31, 2018 7:10 PM | \n",
- " [GGB, GGB, GGB] | \n",
" SummaryGerdau is delivering good results, incl... | \n",
- " [None] | \n",
+ " GGB | \n",
+ " Dec. 31, 2018 7:10 PM | \n",
"
\n",
" \n",
" Will Apple Get Its Mojo Back? | \n",
" Will Apple Get Its Mojo Back? | \n",
- " AAPL | \n",
- " Dec. 31, 2018, 5:36 PM | \n",
" TipRanks | \n",
" 68 | \n",
- " Dec. 31, 2018 5:36 PM | \n",
- " [AAPL, AAPL, AAPL] | \n",
" SummaryApple has been resting on a reputation ... | \n",
- " [None] | \n",
+ " AAPL | \n",
+ " Dec. 31, 2018 5:36 PM | \n",
"
\n",
" \n",
" Lululemon Stock Looks Compelling On This Dip | \n",
" Lululemon Stock Looks Compelling On This Dip | \n",
- " LULU | \n",
- " Dec. 31, 2018, 5:26 PM | \n",
" L&F Capital Management | \n",
" 4 | \n",
- " Dec. 31, 2018 5:26 PM | \n",
- " [LULU, LULU, LULU] | \n",
" SummaryLululemon stock had a strong 2018 but f... | \n",
- " [None] | \n",
+ " LULU | \n",
+ " Dec. 31, 2018 5:26 PM | \n",
"
\n",
" \n",
"\n",
@@ -549,20 +531,6 @@
"Will Apple Get Its Mojo Back? Will Apple Get Its Mojo Back? \n",
"Lululemon Stock Looks Compelling On This Dip Lululemon Stock Looks Compelling On This Dip \n",
"\n",
- " 针对公司_x \\\n",
- "Micron Technology: Insanely Cheap Stock Given I... MU \n",
- "Molson Coors Seems Attractive At These Valuations TAP \n",
- "Gerdau: The Brazilian Play On U.S. Steel GGB \n",
- "Will Apple Get Its Mojo Back? AAPL \n",
- "Lululemon Stock Looks Compelling On This Dip LULU \n",
- "\n",
- " 发布时间_x \\\n",
- "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018, 7:57 PM \n",
- "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018, 7:44 PM \n",
- "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018, 7:10 PM \n",
- "Will Apple Get Its Mojo Back? Dec. 31, 2018, 5:36 PM \n",
- "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018, 5:26 PM \n",
- "\n",
" 作者 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... Ruerd Heeg \n",
"Molson Coors Seems Attractive At These Valuations Sanjit Deepalam \n",
@@ -577,20 +545,6 @@
"Will Apple Get Its Mojo Back? 68 \n",
"Lululemon Stock Looks Compelling On This Dip 4 \n",
"\n",
- " 发布时间_y \\\n",
- "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
- "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
- "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
- "Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
- "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM \n",
- "\n",
- " 针对公司_y \\\n",
- "Micron Technology: Insanely Cheap Stock Given I... [MU, MU, MU, MU, MU] \n",
- "Molson Coors Seems Attractive At These Valuations [TAP, TAP, TAP] \n",
- "Gerdau: The Brazilian Play On U.S. Steel [GGB, GGB, GGB] \n",
- "Will Apple Get Its Mojo Back? [AAPL, AAPL, AAPL] \n",
- "Lululemon Stock Looks Compelling On This Dip [LULU, LULU, LULU] \n",
- "\n",
" 摘要 \\\n",
"Micron Technology: Insanely Cheap Stock Given I... SummaryLast year, a combination of relatively ... \n",
"Molson Coors Seems Attractive At These Valuations SummaryMolson Coors's stock has fallen over 30... \n",
@@ -598,15 +552,22 @@
"Will Apple Get Its Mojo Back? SummaryApple has been resting on a reputation ... \n",
"Lululemon Stock Looks Compelling On This Dip SummaryLululemon stock had a strong 2018 but f... \n",
"\n",
- " 针对公司 \n",
- "Micron Technology: Insanely Cheap Stock Given I... [None] \n",
- "Molson Coors Seems Attractive At These Valuations [None] \n",
- "Gerdau: The Brazilian Play On U.S. Steel [None] \n",
- "Will Apple Get Its Mojo Back? [None] \n",
- "Lululemon Stock Looks Compelling On This Dip [None] "
+ " 针对公司 \\\n",
+ "Micron Technology: Insanely Cheap Stock Given I... MU \n",
+ "Molson Coors Seems Attractive At These Valuations TAP \n",
+ "Gerdau: The Brazilian Play On U.S. Steel GGB \n",
+ "Will Apple Get Its Mojo Back? AAPL \n",
+ "Lululemon Stock Looks Compelling On This Dip LULU \n",
+ "\n",
+ " 发布时间 \n",
+ "Micron Technology: Insanely Cheap Stock Given I... Dec. 31, 2018 7:57 PM \n",
+ "Molson Coors Seems Attractive At These Valuations Dec. 31, 2018 7:44 PM \n",
+ "Gerdau: The Brazilian Play On U.S. Steel Dec. 31, 2018 7:10 PM \n",
+ "Will Apple Get Its Mojo Back? Dec. 31, 2018 5:36 PM \n",
+ "Lululemon Stock Looks Compelling On This Dip Dec. 31, 2018 5:26 PM "
]
},
- "execution_count": 5,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -616,17 +577,71 @@
" Title_data, Content_data, right_on='标题', left_index=True, how='outer')\n",
"\n",
"\n",
- "def Update(x):\n",
+ "def UpdateCompany(x):\n",
" try:\n",
- " return np.unique(x[\"针对公司_y\"].append(x[\"针对公司_x\"]))\n",
+ "        # Prefer the company from the title table (_x); otherwise fall back to\n",
+ "        # the first company extracted from the content table (_y).\n",
+ "        preferred = x[\"针对公司_x\"]\n",
+ "        return preferred if preferred else x[\"针对公司_y\"][0]\n",
" except:\n",
" return np.NAN\n",
"\n",
+ "\n",
+ "def UpdateTime(x):  # one 发布时间 per row: the lexicographically smaller of the two stripped strings\n",
+ "    try:\n",
+ "        return np.unique([x[\"发布时间_x\"].strip(), x[\"发布时间_y\"].strip()])[0]\n",
+ "    except (AttributeError, TypeError):  # a side missing after the outer merge is NaN, which has no .strip()\n",
+ "        return np.nan  # the np.NAN alias was removed in NumPy 2.0\n",
+ "\n",
+ "\n",
"Title_Content_data['针对公司'] = Title_Content_data[['针对公司_x', '针对公司_y']].apply(\n",
- " lambda x: Update(x), axis=1)\n",
+ " lambda x: UpdateCompany(x), axis=1)\n",
+ "\n",
+ "# Collapse the two merged timestamp columns into a single 发布时间 column.\n",
+ "Title_Content_data['发布时间'] = Title_Content_data[[\"发布时间_x\", \"发布时间_y\"]].apply(UpdateTime, axis=1)\n",
+ "\n",
+ "# TODO(review): 发布时间 is left as text; a pd.to_datetime conversion was only sketched here.\n",
+ "\n",
+ "# Drop the per-source _x/_y columns; only the combined columns are kept.\n",
+ "Title_Content_data.drop(\n",
+ "    columns=[\"发布时间_x\", \"发布时间_y\", \"针对公司_x\", \"针对公司_y\"], inplace=True)\n",
+ "\n",
+ "# Drop rows that are missing a timestamp, a company or a comment count.\n",
+ "Title_Content_data.dropna(\n",
+ "    axis=0, how='any', subset=[\"发布时间\", \"针对公司\", \"评论数\"], inplace=True)\n",
"\n",
"Title_Content_data.head()"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.2 回帖聚合\n",
+ "这是网友在各文章下的回复内容 \n",
+ "Title:各文章的标题;空标题的,用最靠近的有内容的下方标题 \n",
+ "Content:回复的全文字内容"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create an empty DataFrame\n",
+ "Reply_data = pd.DataFrame(columns=['字段', '标题1'])\n",
+ "\n",
+ "# Convert every .xlsx under the working directory to a UTF-8 .csv, then delete the source .xlsx\n",
+ "for root, dirs, files in os.walk(os.getcwd()):\n",
+ "    for file in files:\n",
+ "        if file.endswith('.xlsx'):\n",
+ "            xlsx_path = os.path.join(root, file)\n",
+ "            csv_path = os.path.splitext(xlsx_path)[0] + '.csv'  # swap only the extension, never a folder name\n",
+ "            pd.read_excel(xlsx_path, index_col=0).to_csv(csv_path, encoding='utf-8')\n",
+ "            print(xlsx_path, \"转化成功\")\n",
+ "            os.remove(xlsx_path)"
+ ]
}
],
"metadata": {
--
GitLab