NOTA: En algunas situaciones esto puede significar una pérdida de información; para más detalle sobre esto, ver las referencias 1 y 2.
Conceptualmente sería así:
Script:
import numpy as np
import pandas as pd
from scipy.stats.mstats import mquantiles


def qbinarizer(dataset, not_feature, qt=np.linspace(0, 1, 11)):
    """Discretize continuous features into one-hot encoded quantile bins.

    Every column of ``dataset`` that is not listed in ``not_feature`` is cut
    at the quantiles given by ``qt`` and each resulting interval becomes one
    indicator column named ``<column>_<interval>``.

    Args:
        dataset: DataFrame holding the numeric features to discretize.
        not_feature: Column names to leave out (ids, dates, ...).
        qt: Probabilities (between 0 and 1) at which the bin edges are
            computed. Defaults to deciles.

    Returns:
        A DataFrame sharing the index of ``dataset`` with one indicator
        column per (feature, quantile interval) pair. Columns with fewer than
        two distinct non-missing values produce no output columns.
    """
    print('binarizing dataset...')
    headers = [col for col in dataset.columns if col not in not_feature]

    # Collect the per-feature frames and concatenate once at the end; growing
    # the result with pd.concat inside the loop copies it on every iteration.
    frames = [pd.DataFrame(index=dataset.index)]

    for i, col in enumerate(headers):
        # Quantiles are computed on the observed values only: NaNs are not
        # masked by mquantiles and would otherwise leak into the bin edges.
        observed = dataset[col].dropna()
        if observed.nunique() > 1:
            quantiles = mquantiles(observed.to_numpy(), qt)
            # np.unique both de-duplicates and sorts the edges (heavily tied
            # data yields repeated quantiles, which pd.cut rejects).
            edges = np.unique(np.asarray(quantiles, dtype=float))
            # At least two edges are needed to form a single interval.
            if len(edges) > 1:
                binned = pd.cut(dataset[col], edges, include_lowest=True)
                # Stringify the intervals so get_dummies encodes only the
                # bins that actually occur.
                # NOTE(review): rows with missing values fall in no bin; after
                # stringification they may surface as their own indicator
                # column depending on the pandas version - confirm.
                labels = binned.to_frame().astype(str)
                frames.append(pd.get_dummies(labels))
        if i % 20 == 0:
            # Report progress against the feature columns only, not against
            # every column of the input frame.
            print(i + 1, 'feature binarized.',
                  str(len(headers) - i - 1) + ' remain')

    df_new = pd.concat(frames, axis=1)
    print('binarized dataset shape: ', df_new.shape)
    return df_new


def main():
    """Download the sample payments CSV, binarize it and save the result."""
    path = 'https://www.dropbox.com/s/3g4rz8a0lpvkag4/payment.csv?dl=1'
    data = pd.read_csv(path)
    not_feature = ['fecha', 'user_id']
    data_bin = qbinarizer(data, not_feature, qt=np.linspace(0, 1, 5))
    data_bin.to_csv('data_bin.csv')


if __name__ == '__main__':
    main()
Referencias
1. https://stats.stackexchange.com/questions/68834/what-is-the-benefit-of-breaking-up-a-continuous-predictor-variable
2. http://biostat.mc.vanderbilt.edu/wiki/Main/CatContinuous
No hay comentarios:
Publicar un comentario