Giter VIP home page Giter VIP logo

dsml-toolkit's Introduction

Data Science and Machine Learning Tool-Kit

Important Python scripts and functions for Data Science and Machine Learning:

Feature Engineering

Click to view code

def feature_interactions(df, continuous_features=None, categorical_features=None):
    """Add per-category summary statistics of continuous features to ``df``.

    For every (categorical, continuous) pair, the continuous column is grouped
    by the categorical column and each group's statistic is mapped back onto
    the rows. The new columns are named
    ``{categorical}_interact_{continuous}_{stat}`` where ``stat`` is one of
    mean, count, median, sum, max, min, std, var and skew (created in that
    order).

    Args:
        df: DataFrame containing every listed column. It is modified in place.
        continuous_features: Names of the columns to aggregate.
        categorical_features: Names of the columns to group by.

    Returns:
        The same ``df`` object, with the new columns added.

    Raises:
        AssertionError: If either feature list is missing or empty.
    """
    # ``None`` defaults replace the former shared mutable ``[]`` defaults.
    if continuous_features is None:
        continuous_features = []
    if categorical_features is None:
        categorical_features = []

    # Raised explicitly instead of via ``assert`` so the check still runs under
    # ``python -O``; the exception type is kept for existing callers.
    if not (len(continuous_features) > 0 and len(categorical_features) > 0):
        raise AssertionError("Please specify continuous and/or categorical variables")

    stat_names = ('mean', 'count', 'median', 'sum', 'max', 'min', 'std', 'var', 'skew')

    for cat_feat in categorical_features:
        for cont_feat in continuous_features:
            # Group once per pair and reuse the grouping for every statistic.
            grouped = df.groupby(cat_feat)[cont_feat]
            for stat in stat_names:
                per_group = getattr(grouped, stat)().to_dict()
                df[f'{cat_feat}_interact_{cont_feat}_{stat}'] = df[cat_feat].map(per_group)

    return df

Click to view code

def feature_combinations(df: pd.DataFrame, features: list):
    """Add pairwise string combinations of ``features`` to ``df``.

    For every feature except the last one in ``features``, a column named
    ``{feature}_X_{other}`` is created for each other feature. It holds the two
    columns cast to ``str`` and joined by ``'_X_'``. Features whose own name
    already contains ``'_X_'`` are skipped as the right-hand side, except for
    the immediate neighbour in the list, which is always combined. Finally a
    ``master_combination_X_`` column receives the row-wise ``sum`` over all of
    ``features``.

    Args:
        df: DataFrame containing every column named in ``features``. It is
            modified in place.
        features: Column names (strings) to combine.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    last_index = len(features) - 1

    for i, base in enumerate(tqdm(features)):
        if i == last_index:
            # The last feature is never used as the left-hand side.
            continue

        # Cast once per base feature instead of once per combination.
        base_as_str = df[base].astype(str)

        neighbour = features[i + 1]
        df[f'{base}_X_{neighbour}'] = base_as_str + '_X_' + df[neighbour].astype(str)

        # Walk the list itself rather than a set so the order of the created
        # columns is the same on every run (set order of strings varies with
        # hash randomisation).
        for feat in features:
            if feat == base or '_X_' in feat:
                continue
            df[f'{base}_X_{feat}'] = base_as_str + '_X_' + df[feat].astype(str)

    # and now for all features
    # The string 'sum' keeps the DataFrame.sum behaviour that pandas applied
    # when given the builtin ``sum``, without the deprecation warning.
    # NOTE(review): for string columns this presumably concatenates the values
    # without a separator -- confirm that is the intended "master" key.
    df['master_combination_X_'] = df[features].agg('sum', axis=1)

    return df

Other

Click to view code

# Grid of box plots: one subplot per numerical column, split by 'cleared_cat'.
cols, rows = 3, 2

fig, axes = plt.subplots(rows, cols, figsize=(16,12))

columns = ['CreditLimit', 'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue']


for index, col in enumerate(columns):
    # Draw on the Axes that plt.subplots already created (row-major order)
    # instead of re-selecting each one through pyplot's implicit state.
    ax = axes.flat[index]
    sns.boxplot(x='cleared_cat', y=col, data=data, ax=ax)
    ax.set_title(f"{col}")

fig.suptitle("Numerical columns in relation to Cleared status")
plt.show()

Click to view code

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits them.

    Integer columns (int16/int32/int64) are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns (float16/float32/float64) are cast to float16 or float32
    when min and max lie inside that type's finite range, otherwise to
    float64. Columns of any other dtype are left untouched.

    NOTE: the float check looks only at the value range, not at precision, so
    downcast float values may be rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the memory usage before and after.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (also reached for all-NaN columns,
                # where every comparison is false).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, reduction))
    return df

Click to view code

from time import strftime

def get_current_timestamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    stamp_format = '%Y%m%d_%H%M%S'
    return strftime(stamp_format)

import os

# Timestamped file name so submissions written at different times get
# distinct paths.
submission_fname = f'submissions/stacking_{get_current_timestamp()}.csv'
print(submission_fname)

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(submission_fname), exist_ok=True)
submission_df.to_csv(submission_fname, index=False)

dsml-toolkit's People

Contributors

lyraxvincent avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, making everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.