sklearn Dummy Baseline

From: https://www.kaggle.com/eseymour/sklearn-dummy-baseline

Author: Edward Seymour

Score: 0.039

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import csv
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.dummy import DummyClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
datadir = '../input'

with open(os.path.join(datadir, 'train.csv')) as f:
    next(f) # Eat the first line of input
    train_labels = [[int(label)-1 for label in line[1].split(' ')] for line in csv.reader(f, delimiter=',')]
    
y_train = np.zeros((len(train_labels), max(label for labels in train_labels for label in labels)+1))
for i, labels in enumerate(train_labels):
    y_train[i, labels] = 1.
    
baselineClassifier = DummyClassifier(strategy='most_frequent', random_state=0xdeadbeef)
baselineClassifier.fit(y_train, y_train)

images = [image.split('.')[0] for image in os.listdir(os.path.join(datadir, 'test'))]
predictions = baselineClassifier.predict(images)

with open('submission.csv', 'w') as f:
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerow(['id', 'attribute_ids'])
    for image, prediction in zip(images, predictions):
        labels = []
        for i, label in enumerate(prediction):
            if label:
                labels.append(str(i+1))
        csv_writer.writerow([image, ' '.join(labels)])

# Any results you write to the current directory are saved as output.