From: https://www.kaggle.com/ttahara/eda-compare-number-of-culture-and-tag-attributes
Author: Tawara
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Raise pandas' display limits (number of columns, column width, number of
# rows) used when frames are rendered.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 500
import os
# Print the entries found directly under the input directory.
print(os.listdir("../input"))
import glob
import json
from matplotlib import pyplot as plt
import seaborn as sns
from collections import Counter
import gc
%matplotlib inline
# Load the three CSV tables from ../input. Note that the "test" frame is read
# from sample_submission.csv.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/sample_submission.csv")
labels_df = pd.read_csv("../input/labels.csv")

# For each split print a header, the number of rows in its frame, and a tally
# of the file extensions found in its image directory.
for split_header, split_frame, image_dir in (
    ("[train]", train_df, "../input/train/"),
    ("[test]", test_df, "../input/test/"),
):
    print(split_header)
    print(len(split_frame))
    print(Counter(file_name.split(".")[-1] for file_name in os.listdir(image_dir)))

# Number of rows in the label table (bare expression: only displayed when
# this runs as the last statement of a notebook cell).
len(labels_df)
# Derive each label's attribute type from its name: the text before the
# first "::" in attribute_name.
labels_df["attribute_type"] = labels_df.attribute_name.apply(lambda x: x.split("::")[0])

# Number of labels per attribute type, printed and drawn as a bar chart.
attribute_type_counts = labels_df["attribute_type"].value_counts()
print(attribute_type_counts)
attribute_type_counts.plot.bar()

# Row indexes occupied by each attribute type.
labels_df.query("attribute_type == 'culture'").index
labels_df.query("attribute_type == 'tag'").index

# First ten labels of each type. Rows are selected by their attribute type
# instead of a hard-coded row position, so the selection stays correct even
# if the number or order of labels changes.
labels_df[labels_df.attribute_type == "culture"].head(10)
labels_df[labels_df.attribute_type == "tag"].head(10)
# Multi-hot matrix: row i has a 1 in column j when train sample i lists
# attribute id j in its space-separated attribute_ids string.
# NOTE(review): attribute ids are used directly as column positions, which
# assumes id j corresponds to row j of labels_df -- confirm against labels.csv.
train_attr_ohot = np.zeros((len(train_df), len(labels_df)), dtype=int)
for idx, attr_arr in enumerate(train_df.attribute_ids.str.split(" ")):
    train_attr_ohot[idx, [int(attr_id) for attr_id in attr_arr]] = 1

# Human-readable version of each sample's attributes: the names of all
# columns set to 1, joined with ", ".
names_arr = labels_df.attribute_name.values
train_df["attribute_names"] = [", ".join(names_arr[arr == 1]) for arr in train_attr_ohot]

# Column masks for the two attribute types, derived from the label-name
# prefixes rather than from a hard-coded split position.
is_culture_col = labels_df.attribute_name.str.startswith("culture::", na=False).values
is_tag_col = labels_df.attribute_name.str.startswith("tag::", na=False).values

# Per-sample attribute counts: overall, culture-only and tag-only.
train_df["attr_num"] = train_attr_ohot.sum(axis=1)
train_df["culture_attr_num"] = train_attr_ohot[:, is_culture_col].sum(axis=1)
train_df["tag_attr_num"] = train_attr_ohot[:, is_tag_col].sum(axis=1)
train_df.head()

# How many samples have each attribute count, ordered by count.
train_df.attr_num.value_counts().sort_index()
train_df.culture_attr_num.value_counts().sort_index()
train_df.tag_attr_num.value_counts().sort_index()
# Three stacked count plots: total, culture-only and tag-only attribute
# counts per train sample.
fig = plt.figure(figsize=(15, 10))
fig.subplots_adjust(hspace=0.4)

# The counts are passed as `x=` and every plot names its target axes
# explicitly, so no plot depends on matplotlib's implicit "current axes" or
# on how seaborn interprets a positional argument.
ax1 = fig.add_subplot(3, 1, 1)
sns.countplot(x=train_df.attr_num, ax=ax1)
ax1.set_title("number of attributes each art has")

ax2 = fig.add_subplot(3, 1, 2)
sns.countplot(x=train_df.culture_attr_num, ax=ax2)
ax2.set_title("number of 'culture' attributes each art has")

ax3 = fig.add_subplot(3, 1, 3)
sns.countplot(x=train_df.tag_attr_num, ax=ax3)
ax3.set_title("number of 'tag' attributes each art has")
# Joint table of culture count (rows) versus tag count (columns). With
# aggfunc=len over attr_num, each cell is the number of train rows that have
# that combination of counts.
train_df.pivot_table(
    values="attr_num",
    index="culture_attr_num",
    columns="tag_attr_num",
    aggfunc=len,
)

# Share of train rows at each culture / tag attribute count, ordered by count.
train_df["culture_attr_num"].value_counts(normalize=True).sort_index()
train_df["tag_attr_num"].value_counts(normalize=True).sort_index()
The distribution of the number of culture attributes differs from that of the number of tag attributes.
99% of artworks have 0, 1, or 2 culture attributes; moreover, 80% have exactly 1.
On the other hand, the number of tag attributes falls off more gently from 1 to 5. Very few artworks have no tag attribute.
I think these observations may be useful for deciding the thresholds of classifiers.
Next, I show the artworks that have many culture or tag attributes.
# The 15 train rows with the most culture attributes, then the 15 with the
# most tag attributes (largest counts first).
train_df.sort_values("culture_attr_num", ascending=False).iloc[:15]
train_df.sort_values("tag_attr_num", ascending=False).iloc[:15]
It is difficult for me to find anything from these tables. Let's look at the images of the artworks in these tables.
from PIL import Image


def _split_attr_names(attr_names):
    """Split a ", "-joined attribute-name string into culture and tag values.

    Each name is classified by its prefix ("culture" or "tag"); the value kept
    is the text after the last "::". Returns ``(cultures, tags)`` as two lists
    in their original order.
    """
    cultures, tags = [], []
    for name in attr_names.split(", "):
        value = name.split("::")[-1]
        if name.startswith("culture"):
            cultures.append(value)
        elif name.startswith("tag"):
            tags.append(value)
    return cultures, tags


def _show_top_arts(sort_column, figsize, n_arts=15, n_cols=3):
    """Plot the ``n_arts`` train images with the largest ``sort_column`` value.

    Images are read from ../input/train/<id>.png and laid out row by row on a
    grid with ``n_cols`` columns; each title lists the art id and its culture
    and tag attribute values. Returns the created figure.
    """
    top_rows = train_df.sort_values(by=sort_column, ascending=False)[
        ["id", "attribute_names"]
    ].values[:n_arts]
    n_rows = -(-n_arts // n_cols)  # ceiling division

    fig = plt.figure(figsize=figsize)
    fig.subplots_adjust(wspace=0.6, hspace=0.6)
    for i, (art_id, attr_names) in enumerate(top_rows):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        # The context manager closes the file even if drawing fails.
        with Image.open("../input/train/{}.png".format(art_id)) as im:
            ax.imshow(im)
        attr_culture, attr_tag = _split_attr_names(attr_names)
        ax.set_title("art id: {}\nculture: {}\ntag: {}".format(art_id, attr_culture, attr_tag))
    return fig


# Arts with the most culture attributes, then arts with the most tag
# attributes (figure sizes kept as in the original cells).
fig = _show_top_arts("culture_attr_num", figsize=(5 * 5, 5 * 6))
fig = _show_top_arts("tag_attr_num", figsize=(5 * 6, 5 * 5))
Since I have little knowledge of art, I cannot validate the culture attributes.
How about the tag attributes? They are relatively interpretable, but they could probably be split into several types.
Therefore, I think it may be useful for classification to consider the type of each tag attribute.
With respect to the number of attributes, pictures tend to have more tag attributes because of the objects painted on them.
My assumption is that the number of tag attributes depends on the type of artwork. Let's check several examples.
# For each tag-attribute count from 1 to 5, draw one figure showing the first
# 49 train images that have exactly that many tag attributes on a 7x7 grid.
# Each title lists the art id and its culture and tag attribute values.
for n_tags in range(1, 6):
    fig = plt.figure(figsize=(5 * 8, 5 * 7))
    fig.subplots_adjust(wspace=0.6, hspace=0.6)
    sample_rows = train_df[train_df.tag_attr_num == n_tags][["id", "attribute_names"]].values[:49]
    for i, (art_id, attr_names) in enumerate(sample_rows):
        ax = fig.add_subplot(7, 7, i + 1)
        # The context manager closes the file even if drawing fails.
        with Image.open("../input/train/{}.png".format(art_id)) as im:
            ax.imshow(im)
        # Names look like "<type>::<value>"; keep the value, grouped by type.
        attr_split = attr_names.split(", ")
        attr_culture = [x.split("::")[-1] for x in attr_split if x.startswith("culture")]
        attr_tag = [x.split("::")[-1] for x in attr_split if x.startswith("tag")]
        ax.set_title("art id: {}\nculture: {}\ntag: {}".format(art_id, attr_culture, attr_tag))
It seems that the more tag attributes an artwork has, the more complex it is.
Most artworks with one tag attribute are single objects such as ornaments.
In contrast, most artworks with five tag attributes are pictures or objects with complex designs.
Maybe we can predict the number of tag attributes from the complexity of the artwork?