iMet Data Analysis all

From: https://www.kaggle.com/ji648513181/imet-data-analysis-all

Author: Dongfei Ji

In [1]:
import numpy as np 
import pandas as pd 
import pylab as plt
import seaborn as sns
import os
import cv2
import warnings
warnings.filterwarnings("ignore")
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
['test', 'train', 'train.csv', 'labels.csv', 'sample_submission.csv']

Number of samples

In [2]:
train_df = pd.read_csv('../input/train.csv')
labels_df = pd.read_csv('../input/labels.csv')
submission_df = pd.read_csv('../input/sample_submission.csv')
print("Train data shape -  rows:",train_df.shape[0]," columns:", train_df.shape[1])
print("Lables shape -      rows:",labels_df.shape[0]," columns:", labels_df.shape[1])
print("submission csv shape -  rows:",submission_df.shape[0]," columns:", submission_df.shape[1])
Train data shape -  rows: 109237  columns: 2
Lables shape -      rows: 1103  columns: 2
submission csv shape -  rows: 7443  columns: 2

训练集共有109237张训练图片,测试集共有7443张训练图片,全部共有1103种标签

Most frequent attributes

In [3]:
attribute_ids=train_df['attribute_ids'].values
attribute_ids
Out[3]:
array(['147 616 813', '51 616 734 813', '776', ..., '156 763', '121 433',
       '462 733 813 1020'], dtype=object)

将多标签变为单标签列表

In [4]:
attribute_ids=train_df['attribute_ids'].values
attributes=[]
for attribute_items in [x.split(' ') for x in attribute_ids]:
        for attribute in attribute_items:
            attributes.append(int(attribute))
attributes[0:10]
Out[4]:
[147, 616, 813, 51, 616, 734, 813, 776, 51, 671]

id与标签对应

In [5]:
att_df=pd.DataFrame(attributes,columns=['attribute_id'])
att_df=att_df.merge(labels_df)
att_df.head(5)
Out[5]:
attribute_id attribute_name
0 147 culture::french
1 147 culture::french
2 147 culture::french
3 147 culture::french
4 147 culture::french
In [6]:
frequent=att_df['attribute_name'].value_counts()[:30].to_frame()
frequent
Out[6]:
attribute_name
tag::men 19970
tag::women 14281
culture::french 13522
culture::italian 10375
culture::american 9151
tag::flowers 8419
culture::british 7615
culture::japan 7394
tag::utilitarian objects 6564
culture::egyptian 6542
tag::portraits 5955
tag::trees 5591
culture::china 5382
tag::leaves 5259
culture::german 5163
culture::turkish or venice 4416
tag::inscriptions 3890
tag::birds 3692
tag::human figures 3665
tag::textile fragments 3570
culture::islamic 3500
tag::mythical creatures 3005
tag::profiles 2552
tag::animals 2548
tag::writing systems 2327
tag::clothing and accessories 2180
tag::books 2097
tag::landscapes 2075
culture::greek 2050
tag::bowls 2045
In [7]:
plt.figure(figsize=(8,8))
most=sns.barplot(x=frequent.index,y="attribute_name",data=frequent, palette="rocket")
most.set_xlabel("Kind of label",fontsize=14)
most.set_ylabel("Count",fontsize=14)
sns.despine()
most.set_xticklabels(most.get_xticklabels(), rotation=90,fontsize=14)
plt.title("Most frequent attributes",fontsize=18)
plt.show()
In [8]:
infrequent=att_df['attribute_name'].value_counts(ascending=True)[:50].to_frame()

plt.figure(figsize=(12,8))
inmost=sns.barplot(x=infrequent.index,y="attribute_name",data=infrequent, palette="rocket")
inmost.set_xlabel("Kind of label",fontsize=14)
inmost.set_ylabel("Count",fontsize=14)
sns.despine()
inmost.set_xticklabels(most.get_xticklabels(), rotation=90,fontsize=14)
plt.title("Most infrequent attributes",fontsize=18)
plt.show()
In [9]:
train_df['Number of Tags']=train_df['attribute_ids'].apply(lambda x: len(x.split(' ')))
train_df.head(5)
Out[9]:
id attribute_ids Number of Tags
0 1000483014d91860 147 616 813 3
1 1000fe2e667721fe 51 616 734 813 4
2 1001614cb89646ee 776 1
3 10041eb49b297c08 51 671 698 813 1092 5
4 100501c227f8beea 13 404 492 903 1093 5
In [10]:
f, ax = plt.subplots(figsize=(6, 6))
ax=sns.countplot(x="Number of Tags",data=train_df,palette="GnBu_d")
ax.set_xlabel("Number of Tags",fontsize=14)
ax.set_ylabel("Count",fontsize=14)
sns.despine()
plt.title("Number of Tags distribution",fontsize=18)
plt.show()

Analysis of picture size

In [11]:
os.listdir("../input/train")[:300]
Out[11]:
['e232597c213332cd.png',
 '4c3e9596dafb4d13.png',
 '4712bc2351789604.png',
 'be77f39c438a3448.png',
 '2ffa9ece3a622644.png',
 '9caa967cba20461c.png',
 'a4a099220bcafb7.png',
 '5c6675ae34aa5307.png',
 '371705db8277fa72.png',
 '6c934d937be3d500.png',
 '2f305db4e14e9246.png',
 '635e5a6ef19476c.png',
 '3c741a858038662a.png',
 '427adcd60e9806d6.png',
 'bc27cf7895f091a4.png',
 'ebe7cfe23ef42381.png',
 '7e72aa4da10c88fa.png',
 '6f832ae1dc442271.png',
 '8976e4085e0311ad.png',
 '26dead0af2c91eea.png',
 'd1f06ce6e699be9a.png',
 '40b57b35d555db7.png',
 '2199ed8eb58fca1f.png',
 '7898ef2c82605a92.png',
 '27264becef963ebb.png',
 '5551981a15b8e9af.png',
 '954c8b0f3fef2385.png',
 'cebaa4eb910a49b6.png',
 'e0badcf2041a0859.png',
 '6008ccc1dbb8d761.png',
 'a1d4c91aee8c7062.png',
 'f43f42bd25a10667.png',
 '13ebc5cfca334c15.png',
 '1a87eda029016ca3.png',
 '1c6ad56c157d7993.png',
 'ff054b114999b028.png',
 'e96f60edc02803f1.png',
 'd1ab29aac6fdee4c.png',
 '9b93782c9f3dfd05.png',
 '5806bfd2d402f33b.png',
 'e2fe00ccb4855bc7.png',
 '4efb5f9a4e2941af.png',
 '3d8472aa3d5f5092.png',
 'bb912d2c5541d696.png',
 '95b5f4165fdca319.png',
 'ac4f3f450bc66946.png',
 '23009df4dd791ff4.png',
 '6ea31f5f4ce4637.png',
 '104aa6660ce67379.png',
 '5eb7dc30076f50b7.png',
 'c794412b9b9adcc5.png',
 '5bb570fbe29c0bad.png',
 'dca39ed4ba26957f.png',
 '7335b0723ddc0f0c.png',
 'e6f63b8cd1fe5e51.png',
 'ca272f0fa9e03a18.png',
 'f3efbfb49416bf8.png',
 '3f928efeda393fe8.png',
 'b1d2ecc19e70541d.png',
 'bf7bedeae4328cb0.png',
 'ee987ea55ad8a26.png',
 'c149bbfeff38fdf8.png',
 '7aaa6ac55a43b155.png',
 '1f153e76257e4cd7.png',
 'bc2a764e277e8967.png',
 'f4e4551eea61fd5f.png',
 '370cec159a0810a0.png',
 'b67625ee244ce519.png',
 '3f7ae35a659b58c8.png',
 'ea1fd52334692a08.png',
 '585b8464a24608bc.png',
 'ee824cb316c27c7.png',
 'ab269c85169b3f07.png',
 'ba3b20c2245093bc.png',
 '436c3f0ee214ba25.png',
 'f0325250a4f7a4bc.png',
 'e64f8c5d2c7b6985.png',
 'a51dba81b6da9dcd.png',
 '336499bee0c9c1a.png',
 'f45a2fc24672681.png',
 'e4e54182a9a44603.png',
 '2b94052c6acf09d8.png',
 '8bc9721165a43c31.png',
 '837d501dd610128.png',
 '8a4c98c54673fb88.png',
 '94fc05caa726ee4f.png',
 '54ca3f63133828f1.png',
 'dd6e76656dab76d7.png',
 '35dacb38701b34d6.png',
 '13fa29d779de9a05.png',
 'dcad68e17221aafe.png',
 'f9583a8f68808ad0.png',
 'd69f2cd11cfe973d.png',
 'b0b117be7d19a87a.png',
 'a5e5eb39a5410de8.png',
 'e08dfcb10aa8b3ec.png',
 '9d344519c730f854.png',
 '1a1db256429a816e.png',
 'f417b9f13e1997df.png',
 '7a441c57612553dd.png',
 '6086b72a53844320.png',
 'eb0f88696cae6b2.png',
 '3528b62505020e4a.png',
 '21ac6bed1a71a1c9.png',
 '5efad4efed6a2f3b.png',
 '4d325ae5931aedfb.png',
 'e1212d3043295761.png',
 '8a53d2f4522e681a.png',
 '778a9d25dca22c96.png',
 'dd579aeab9965f09.png',
 '10ccb3d736c8c029.png',
 '862a99a40deb5c.png',
 '23f0d17cb49d4410.png',
 'cfbc80dd0c5dd8fa.png',
 '5d8b08712b9b9bea.png',
 'f0308916cf9ed119.png',
 'ab8febf8c5644239.png',
 '868230c2dfae4ae9.png',
 'e8896d4b6f32d176.png',
 '1599d3c01fe9fcc0.png',
 'f97eba2c549e3544.png',
 '5aa3ed9f9421270b.png',
 'e5f71e0b9872f015.png',
 '5aa74cf6ec0d6899.png',
 '4b5904efd01ccd62.png',
 'b502be6eaeb8e98c.png',
 '6fdba3aec0fbebc7.png',
 '6658fc19a920dfd4.png',
 '826b4c6a66cc416b.png',
 'd78d801d1bb3329f.png',
 '1e27d89fc74c996.png',
 '4114d20c687146a9.png',
 'c6906c15ea7c5668.png',
 '4428cf847e8ab2cd.png',
 '3b690568039674d.png',
 '786ac4e3bea7261a.png',
 '9c63f00b5d5198e3.png',
 '4fc2bdc602346d8a.png',
 '2c03c66b1a21abd0.png',
 '552a91998e1fab1e.png',
 'd26cd76add681ee1.png',
 'e780a45c059d1071.png',
 '52f60a3126ee2d0a.png',
 '9df266ad121fe5d5.png',
 '86b44fe790e96b39.png',
 'e6def4b27fd33f63.png',
 '11c1c907367ce613.png',
 'f8ef6a39aeddae.png',
 'a3dc369461888215.png',
 'e637629e3b729f69.png',
 '6b4bd9706c85a9f.png',
 'bbe9080bc5e60673.png',
 '846d9809fb073c57.png',
 '6a72fcd8ed309af7.png',
 'afdd48c1cd056925.png',
 'b424c01906bfd7a0.png',
 '7a34ac6e53300d43.png',
 '485dd3270506c102.png',
 'f11c5c9750f6a07f.png',
 '75dff9e16483ff4f.png',
 'a042ff313c2dc6ac.png',
 '54006542184d4501.png',
 '4b2c571aacabc49e.png',
 'fa3bcf05c89a9bb9.png',
 'f67d6be841cfdb01.png',
 'd1a36137b004cee2.png',
 'f5649805cf85a20c.png',
 '512d35a4e2bfc1bc.png',
 'd8d7a1e1c35c63be.png',
 'b14481c6ea1dde1.png',
 '46aee29a5e61d83e.png',
 'dfc7385c1e70c08f.png',
 'c4f71263c4b724ff.png',
 'aa4dc5629e2e0aa3.png',
 '3b32e5c6c799280.png',
 'ec17096ab73d61a0.png',
 'ef056a464340bf1d.png',
 'ba172b59f7169496.png',
 '74b2476e67c133d7.png',
 '8cb270d453a169a5.png',
 'd843f522cec155f3.png',
 'f5cc02dc2269db9.png',
 '5c02b6d426423688.png',
 '67fd3960049e2192.png',
 '72be9847db7875b1.png',
 '70c1bf4654b5491f.png',
 'c95073e27194303c.png',
 '95c30977597a51d6.png',
 '28e063b00a3e21ca.png',
 '5c7e85f4ff759c9e.png',
 'd8d4c2347495b5c4.png',
 '8a39684383944758.png',
 'fb21bc1120cbafa.png',
 '732d506a5fede0ca.png',
 'ed869325258b8ece.png',
 '60d1b79bfd794a49.png',
 'f847090c4eddf026.png',
 '67e1e575b0be4a00.png',
 '147268a7e30596f.png',
 '4322f4d08643e31b.png',
 'c14b7011e79ddac9.png',
 'ae1e9cb6fdb8a11a.png',
 'cdb7bbf52450c007.png',
 '3e21275e8dec9c4d.png',
 'c03e305a642e5625.png',
 '1e69cf75e644688d.png',
 '6f589130e4a9637f.png',
 '8918fe5f7702855a.png',
 'f7f066f8e97323f8.png',
 'a8535f9ad6d081fb.png',
 'e18eed36cf876a.png',
 'ec441780b52df3e7.png',
 'a7ca6699bd7c9e59.png',
 '6fc8e83be78db77b.png',
 '5d7455c7baa00bd0.png',
 '428ae9cd2d33c041.png',
 'ec3ba89913f10c10.png',
 '558d9bad43ab5ece.png',
 'd4fd520f6c31da71.png',
 '532bd5ed076926fa.png',
 '7b667730d5b80aa7.png',
 '8a33d042aaf45aa9.png',
 '29eb628f53bbf799.png',
 '5f07ec28c3d1ba58.png',
 'cc73cdd8e6c04b0e.png',
 '2df0adc098c1205c.png',
 '2d52d9a3aca75bea.png',
 'bb42d37e725974bb.png',
 'df69c7bfb7c85467.png',
 '5741c6272b3d6b15.png',
 'b2942a59d23d5643.png',
 'c936cad80fba61ca.png',
 'fe9887139c167505.png',
 'c47521cb77621c51.png',
 '949f5506e5574d76.png',
 'a476e9a341ebba93.png',
 '9ee994883ca84545.png',
 '927401919eb134a8.png',
 'ab723914e13c750d.png',
 '992e6cdff459d02f.png',
 'a4f37da686994633.png',
 '9d75b00f5ba0ccc6.png',
 '1f253f5ec9df80a2.png',
 'fa5f499d91fbafd9.png',
 'a5fa4d382ce9b229.png',
 'fdcce410730ad6bc.png',
 '295ef604d2a6b4da.png',
 '4200b92dfe69229f.png',
 '77846ed4c6dd9468.png',
 'd8c7e28181f7b621.png',
 '2ca8fd3a62a9c8e6.png',
 '14ccb898ed25f7e7.png',
 '36d2dfd3308ef0c8.png',
 'f723490b8ee60bd7.png',
 '55dc97a09220af33.png',
 'a26e40ea1d6f287f.png',
 '8e733304c978a901.png',
 '6ba62d417afb832f.png',
 '2b014905adeb8336.png',
 'c3bc54edca501e52.png',
 'f89183f149ce2239.png',
 'edb2e9856669fc67.png',
 'ee63cab97a115d52.png',
 '7573010bcf4f3408.png',
 '75b5ed473ab4624c.png',
 'e9cdab86019977e6.png',
 '2e4f6895eace10fd.png',
 'e3c9f23b47fb7cc7.png',
 '7066a369cf9fd605.png',
 'f826cf50041c31af.png',
 'f5fb9d9301445eed.png',
 '62a15389ee80cd9c.png',
 'c2a5d4588c157e91.png',
 '56a281912bc43895.png',
 '695576c475ea3ea1.png',
 'c82b69888d9e030c.png',
 '1857a32dab4fdcc4.png',
 '5bf9da847eec7b63.png',
 '625421974a0bfde4.png',
 '5aaef17c6e52790c.png',
 'ddc8f473d92f9e84.png',
 'b1813420d424aa60.png',
 'fe5582d7f7113082.png',
 '5a19eb4382b157db.png',
 '3addd7deb0850b38.png',
 '14bb89209d34b47a.png',
 '8fb5f8522ff176cb.png',
 'eb1dab8e726d9b20.png',
 '45da64eda7c1950c.png',
 '8b66b0dd7609c017.png',
 'b4f195b970bce936.png',
 '781f622c997afeb2.png',
 '80ba6093e203a523.png',
 '10f31be59a446685.png',
 '5edf5fde5fa91436.png',
 '7f542c98dcfa6d6d.png',
 '71fe15a7b5556e99.png',
 '78bf52a15bb7263b.png',
 '686181199d9f456c.png',
 '63a46338abbd13d.png']
In [12]:
width = []
height = []
for img_name in os.listdir("../input/train/")[-3000:]:
    shape = cv2.imread("../input/train/%s" % img_name).shape
    height.append(shape[0])
    width.append(shape[1])
size = pd.DataFrame({'height':height, 'width':width})
sns.jointplot("height", "width", size, kind='reg')  
plt.show()
In [13]:
width=[]
height=[]
for img_name in os.listdir("../input/train")[:3000]:
    img = cv2.imread("../input/train/%s" % img_name)
    width.append(img.shape[0])
    height.append(img.shape[1])
print(width[:5])
print(height[:5])
[300, 300, 309, 445, 355]
[361, 352, 300, 300, 300]
In [14]:
size_df = pd.DataFrame({'width':width,'height':height})
In [15]:
size_df.head(5)
Out[15]:
width height
0 300 361
1 300 352
2 309 300
3 445 300
4 355 300
In [16]:
print(size_df.count())
width     3000
height    3000
dtype: int64
In [17]:
sns.lmplot('width','height',size_df, fit_reg=False) 
plt.show()
In [18]:
print('The average width is '+str(np.mean(size_df.width)))
print('The median width is '+str(np.median(size_df.width)))
print('The average height is '+str(np.mean(size_df.height)))
print('The average height is '+str(np.mean(size_df.height)))
The average width is 371.408
The median width is 301.0
The average height is 376.635
The average height is 376.635

Pictures with most frequents

In [19]:
most_fre=train_df[train_df['Number of Tags']>9]
least_fre=train_df[train_df['Number of Tags']<2]
most_fre
Out[19]:
id attribute_ids Number of Tags
4625 1a311e499220e8cc 189 306 538 633 701 734 813 971 973 1092 10
20646 3d74ff2317aa3a8b 18 161 331 534 734 813 833 847 1062 1092 10
38081 63e46b86851b6bfe 51 420 519 573 733 734 813 923 971 1092 10
55029 88c7e0e377a69d27 92 477 519 616 671 734 813 912 1035 1092 369 11
57837 8f0df4e2b81262cd 40 147 420 477 591 671 709 813 834 996 10
69600 a94284e9d464adfa 51 477 538 616 637 784 813 822 975 1092 10
In [20]:
count=1
plt.figure(figsize=[30,20])
for img_name in most_fre['id'].values[:6]:
    img = cv2.imread("../input/train/%s.png" % img_name)
    plt.subplot(2, 3, count)
    plt.imshow(img)
    count+=1
plt.show()
    

Pictures with least attributes

In [21]:
count = 1
plt.figure(figsize=[30,20])
for img_name in least_fre['id'].values[:6]:
    img = cv2.imread("../input/train/%s.png" % img_name)
    plt.subplot(2, 3, count)
    plt.imshow(img)
    count += 1
plt.show
Out[21]:
<function matplotlib.pyplot.show(*args, **kw)>
In [22]:
train_df.head(3)
Out[22]:
id attribute_ids Number of Tags
0 1000483014d91860 147 616 813 3
1 1000fe2e667721fe 51 616 734 813 4
2 1001614cb89646ee 776 1
多标签问题,其中标签序号对应实际意义在lables.csv文件
In [23]:
labels_df.head(3)
Out[23]:
attribute_id attribute_name
0 0 culture::abruzzi
1 1 culture::achaemenid
2 2 culture::aegean

lables是将attribute_id进行编号来对应name

In [24]:
submission_df.head(3)
Out[24]:
id attribute_ids
0 10023b2cc4ed5f68 0 1 2
1 100fbe75ed8fd887 0 1 2
2 101b627524a04f19 0 1 2
In [25]:
missing = train_df.isnull().sum()
all_val = train_df.count()
print(missing)
print(all_val)
missing_train_df = pd.concat([missing, all_val], axis=1, keys=['Missing', 'All'])
missing_train_df
id                0
attribute_ids     0
Number of Tags    0
dtype: int64
id                109237
attribute_ids     109237
Number of Tags    109237
dtype: int64
Out[25]:
Missing All
id 0 109237
attribute_ids 0 109237
Number of Tags 0 109237

训练数据中无任何缺失值

In [26]:
image_names=sorted(os.listdir("../input/train"))
print(image_names[0])
print(image_names[1])
print(image_names[2])
print(image_names[3])
1000483014d91860.png
1000fe2e667721fe.png
1001614cb89646ee.png
10041eb49b297c08.png
In [27]:
train_img=cv2.imread("../input/train/1000483014d91860.png")
plt.imshow(train_img)
plt.axis('off')
train_img.shape
Out[27]:
(300, 339, 3)

其中第一张图片大小300*339 3个颜色通道,下面提取数据。 提图像路径与其对应的标签一一对应且元组化,使用Numpy对其进行序列化。

In [28]:
train_df
Out[28]:
id attribute_ids Number of Tags
0 1000483014d91860 147 616 813 3
1 1000fe2e667721fe 51 616 734 813 4
2 1001614cb89646ee 776 1
3 10041eb49b297c08 51 671 698 813 1092 5
4 100501c227f8beea 13 404 492 903 1093 5
5 10050ed12fbad46d 189 279 774 800 1051 5
6 100543a032517972 188 1034 2
7 1006665c0aad488 1010 1053 2
8 1007057734dba6df 189 541 542 993 4
9 1008abd71f3ed5bc 70 776 794 813 1046 1092 6
10 1008c7837081f985 79 1062 2
11 1009f5737fc77f2 188 668 754 3
12 100a0dcde728cb36 51 675 2
13 100a58282c6584bf 147 716 903 1092 4
14 100b45b7c4020f5d 161 489 704 1100 4
15 100bb499d37d0751 188 535 2
16 100bbf5e832083d3 51 212 426 586 940 5
17 100d750286e85bf3 13 616 2
18 100e1e65a6d7850e 737 1009 1046 3
19 100ef61c00e1b5d3 25 161 784 1059 4
20 100efeead4d0f90c 189 542 670 813 949 1092 6
21 100f00204964e81d 147 189 418 742 1072 1092 6
22 1010af4eeb3f95ee 194 480 483 485 813 1099 6
23 1012a188a1fd6166 13 813 896 3
24 1014806043fa009 156 1084 2
25 1014ac8807369589 103 180 573 3
26 1014daea6638d09a 147 1039 2
27 101534933c122e23 25 161 584 616 627 738 868 7
28 1015ddcd27215ca6 51 212 1059 3
29 101659003bdb1499 835 1059 2
... ... ... ...
109207 ffec5791dc172baf 13 879 2
109208 ffed849cb177f26c 13 896 1092 1093 4
109209 ffee19e34e27d86e 147 477 835 1007 4
109210 ffef15e85ff1ea17 194 487 2
109211 fff02bab32073629 156 813 2
109212 fff10c8e135c00ab 584 369 2
109213 fff10f2dd40adaa 149 1035 2
109214 fff30f4d700df04e 25 161 413 425 440 1059 6
109215 fff37e9edfea080a 111 813 989 3
109216 fff4519bee1e950c 639 1
109217 fff4816aa6de9926 189 412 733 813 993 5
109218 fff4aa8d4a6fa07a 79 612 671 3
109219 fff4f747cf87b90 121 484 2
109220 fff5dcfe2e8a69c3 131 835 934 949 1072 5
109221 fff600d362e3e868 13 477 2
109222 fff7393a3eac3a33 13 492 742 988 1092 5
109223 fff73d548fe603f3 194 404 655 3
109224 fff792d0d1a053e2 188 597 723 3
109225 fff99448a6e7ce33 156 733 938 3
109226 fffb443e70171976 189 541 542 813 4
109227 fffb6ec0e3cbed0b 156 813 2
109228 fffc78822e379d29 157 465 2
109229 fffcac2f6e70232 189 734 813 3
109230 fffd47c6fedf4d3c 127 616 655 813 1092 5
109231 fffdae8164c9cfff 189 583 1064 3
109232 fffedb8a287c9f55 121 432 724 3
109233 ffff04c4482c28d2 189 511 813 896 4
109234 ffff3e66a42ab868 156 763 2
109235 ffff45b237a32bd5 121 433 2
109236 ffffbf00586b8e37 462 733 813 1020 4

109237 rows × 3 columns

In [29]:
train_dataset_info = []
for name, labels in zip(train_df['id'],train_df['attribute_ids'].str.split(' ')):
    train_dataset_info.append({
         'path':os.path.join('../input/train', name),
        'labels':np.array([int(label) for label in labels])})
train_dataset_info = np.array(train_dataset_info)
train_dataset_info
Out[29]:
array([{'path': '../input/train/1000483014d91860', 'labels': array([147, 616, 813])},
       {'path': '../input/train/1000fe2e667721fe', 'labels': array([ 51, 616, 734, 813])},
       {'path': '../input/train/1001614cb89646ee', 'labels': array([776])},
       ...,
       {'path': '../input/train/ffff3e66a42ab868', 'labels': array([156, 763])},
       {'path': '../input/train/ffff45b237a32bd5', 'labels': array([121, 433])},
       {'path': '../input/train/ffffbf00586b8e37', 'labels': array([ 462,  733,  813, 1020])}],
      dtype=object)
In [30]:
len(train_dataset_info)
Out[30]:
109237
In [31]:
first=train_df['attribute_ids'].str.split(' ')
train_df['first'] = first
train_df.head(10)
Out[31]:
id attribute_ids Number of Tags first
0 1000483014d91860 147 616 813 3 [147, 616, 813]
1 1000fe2e667721fe 51 616 734 813 4 [51, 616, 734, 813]
2 1001614cb89646ee 776 1 [776]
3 10041eb49b297c08 51 671 698 813 1092 5 [51, 671, 698, 813, 1092]
4 100501c227f8beea 13 404 492 903 1093 5 [13, 404, 492, 903, 1093]
5 10050ed12fbad46d 189 279 774 800 1051 5 [189, 279, 774, 800, 1051]
6 100543a032517972 188 1034 2 [188, 1034]
7 1006665c0aad488 1010 1053 2 [1010, 1053]
8 1007057734dba6df 189 541 542 993 4 [189, 541, 542, 993]
9 1008abd71f3ed5bc 70 776 794 813 1046 1092 6 [70, 776, 794, 813, 1046, 1092]

记录一下单标签统计方法

In [32]:
th10 = pd.DataFrame(train_df.attribute_ids.value_counts().head(10))
th10.reset_index(level=0, inplace=True)
th10.columns = ['attribute_ids','count']
th10
Out[32]:
attribute_ids count
0 13 405 896 1092 1158
1 813 896 586
2 194 1034 489
3 13 552 482
4 121 1059 465
5 121 433 425
6 13 626 365
7 79 1059 352
8 13 813 896 339
9 121 1039 332