# Data collector

In [3]:
#### Imports ####
#################

### basics
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import copy


### yahoo finance
import yfinance as yf

### pandas
import pandas as pd
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix


### data optimization library
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage,risk_matrix
from pypfopt import plotting

In [4]:
#### Functions ####
###################
def data_from_names(stocks_names):
    '''
    Download the closing-price history of several tickers into one table.

    For every name, `yf.Ticker(name).history(period="10y")` is queried and
    only its `Close` column is kept. The resulting series are aligned on
    their index and placed side by side, one column per ticker.

    Arguments:
    stocks_names: (iterable of str) ticker symbols as named by yahoo finance.

    Returns:
    (Dataframe) index: the dates returned by yahoo finance,
                cols: one column of closing prices per requested ticker,
                      in the order the names were given.
                An empty Dataframe when no names are given.
    '''
    # Collect every series first and concatenate once: growing a frame
    # inside the loop re-copies everything downloaded so far on each pass.
    closes = [
        yf.Ticker(name).history(period="10y").Close.rename(str(name))
        for name in stocks_names
    ]

    # pd.concat refuses an empty list, so keep the "nothing requested"
    # result explicit.
    if not closes:
        return pd.DataFrame()

    return pd.concat(closes, axis=1)


def _date_label(parts):
    '''Format [year, month, day] as a 'YYYY-MM-DD' string, zero-padding month and day.'''
    year, month, day = (str(part) for part in parts)
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"


def _find_date_position(dates, parts, role):
    '''Return the position of the date given by `parts` in `dates`, or raise ValueError.'''
    label = _date_label(parts)
    try:
        return dates.index(label)
    except ValueError:
        raise ValueError(
            f"The {role} date {label} does not exist in the dataset, try another"
        ) from None


def split_train_test(dataset,first,end):
    '''
    Split a price table into a train part and a test part by date.

    The train part holds the rows from `first` (inclusive) up to `end`
    (exclusive); the test part holds every row from `end` (inclusive) to
    the end of the table. Both dates must match a row of the index
    exactly (compared as 'YYYY-MM-DD').

    Arguments:
    dataset: (Dataframe) index:(Timestamp), cols:(float) stock price values
    first: ((list) shape=(3)) [(int) year,(int) month,(int) day]
    end:   ((list) shape=(3)) [(int) year,(int) month,(int) day]

    Returns:
    (train, test): two Dataframes sliced by position from `dataset`.

    Raises:
    ValueError: when `first` or `end` is not a date present in the index
                (or does not have exactly three components).
    '''
    # Compare on the calendar date only, ignoring any time-of-day part.
    dates = [stamp.strftime('%Y-%m-%d') for stamp in dataset.index]

    # Fail right away with the offending date instead of printing a hint
    # and crashing later on an undefined index variable.
    idx_1 = _find_date_position(dates, first, "begin")
    idx_2 = _find_date_position(dates, end, "end")

    # .iloc makes it explicit that these are row positions, not labels.
    return dataset.iloc[idx_1:idx_2], dataset.iloc[idx_2:]


def data_cleaner(data, max_missing=0.3):
    '''
    Drop, in place, the columns that are missing too much data.

    The fraction of null values is computed for every column; columns
    whose fraction is strictly greater than `max_missing` are removed
    from `data` itself. Columns at or below the threshold are kept even
    if they still contain some nulls. A table without any nulls is left
    untouched.

    Arguments:
    data: (Dataframe) cols:(float) stock price values. Modified in place.
    max_missing: (float) largest tolerated fraction of nulls per column,
                 between 0 and 1. Defaults to 0.3 (30 percent).

    Returns:
    None. The caller keeps using the same `data` object.
    '''
    # Mean of the boolean null mask == fraction of missing rows per column.
    missing_fractions = data.isnull().mean()

    if not (missing_fractions > 0).any():
        # Nothing is missing, so there is nothing to clean.
        return

    print("There are NULLS in dataframe")

    drop_list = list(missing_fractions[missing_fractions > max_missing].index)
    if drop_list:
        print(f"Removing columns missing more than {max_missing:.0%} of their data: {drop_list}")
        data.drop(columns=drop_list, inplace=True)

    return





    

### Read and Export

In [5]:
# Fetch the closing-price history for the ten tickers below.
data = data_from_names(
    [
        "ITOT",
        "IVV",
        "QQQ",
        "VTI",
        "IJR",
        "VPL",
        "VWO",
        "SUSA",
        "HYEM",
        "MGV",
    ]
)

# The return value is discarded, so any cleaning has to happen on `data` itself.
data_cleaner(data)

# Persist the table; the "data" directory must already exist.
data.to_csv("data/assets.csv")

Descarto las columnas a las que les falta más del 30% de los datos
