# Source code for fifa_preprocessing

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This module provides methods conceived to preprocess data stored
in a csv file etc., with the intent to perform data analysis and
Machine Learning.

It was originally created to preprocess data from the EA Sports'
FIFA 19 for a Machine Learning project to predict players' wages
by regression. Therefore it contains functions that can be
universally used for data preprocessing but also functions that
are made specifically with the FIFA 19 data set in mind.

This module requires that `pandas` be installed within the Python
environment this module is being run in.

Functions
---------
exclude_goalkeepers(data_frame)
    Delete goalkeepers from the data.
money_format(money)
    Return integer value of the monetary amount.
rating_format(rating)
    Express rating string as an integer.
work_format(work)
    Code amplitude string as an integer.
to_int(not_int)
    Convert floating point numbers to integers (NaN becomes 0).
apply_format(data_frame, column_names, format_method)
    Apply `format_method` to `data_frame` columns.
to_dummy(data_frame, column_names)
    Dummy code categorical variables.
split_work_rate(data_frame)
    Split the players' work rate column.
preprocess(data)
    Preprocess a DataFrame holding the FIFA 19 data set.
"""
import math
import pandas as pd


def exclude_goalkeepers(data_frame):
    """Remove goalkeepers and return the DataFrame.

    Find every row of `data_frame` whose 'Position' column equals "GK"
    and drop it. The rows are dropped in place, so the object passed in
    is itself modified and that same object is returned.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        DataFrame containing FIFA19 data set including goalkeepers. It
        must have a 'Position' column.

    Returns
    -------
    data_frame : pandas.DataFrame
        The same DataFrame object with the goalkeepers' rows removed.
        The remaining rows keep their original index labels.

    Raises
    ------
    KeyError
        If `data_frame` has no 'Position' column.

    Notes
    -----
    This function can be used when preprocessing the FIFA19 dataset to
    perform Machine Learning as a goalkeeper is a peculiar player and
    all his properties vary compared to other positions on the field.

    Rows are removed by index label, so the index of `data_frame` is
    expected to be unique.

    Examples
    --------
    >>> data = pd.read_csv("data.csv")
    >>> print(data[['Name', 'Position']][0:5]) #print first few rows
                    Name Position
    0           L. Messi       RF
    1  Cristiano Ronaldo       ST
    2          Neymar Jr       LW
    3             De Gea       GK
    4       K. De Bruyne      RCM
    >>> data = exclude_goalkeepers(data)
    >>> print(data[['Name', 'Position']][0:5]) #print the same number of rows
                    Name Position
    0           L. Messi       RF
    1  Cristiano Ronaldo       ST
    2          Neymar Jr       LW
    4       K. De Bruyne      RCM
    5          E. Hazard       LF
    """
    is_goalkeeper = data_frame['Position'] == 'GK'
    goalkeeper_labels = data_frame.loc[is_goalkeeper].index
    # Drop in place: callers may rely on the passed-in frame being updated.
    data_frame.drop(goalkeeper_labels, inplace=True)
    return data_frame


def money_format(money):
    """Return the integer value of a monetary amount string.

    Remove the euro currency sign from `money` and the letter expressing
    the order of magnitude, i.e. "K" for thousands or "M" for millions,
    then convert what is left into an integer. Returned values are
    expressed in thousands of euros; any fractional part of a thousand
    is truncated.

    Parameters
    ----------
    money : str
        String containing monetary amount with a currency sign and
        order of magnitude abbreviation.

    Returns
    -------
    money : int
        Integer value of the monetary amount.

    Raises
    ------
    ValueError
        If the text left after stripping the sign and the magnitude
        letter is not a number.

    Notes
    -----
    An amount without a magnitude letter (e.g. "€0") is converted as if
    it were already expressed in thousands.

    Examples
    --------
    >>> v = money_format("€500K")
    >>> print(v)
    500

    >>> v = money_format("€70.5M")
    >>> print(v)
    70500

    >>> v = money_format("€1.5K")
    >>> print(v)
    1
    """
    amount = money.replace('€', '')
    if 'M' in amount:
        # Millions -> thousands.
        return int(float(amount.replace('M', '')) * 1000)
    # Go through float so fractional amounts such as "1.5K" are accepted
    # instead of raising ValueError in int().
    return int(float(amount.replace('K', '')))

    
def rating_format(rating):
    """Return an integer equal to the string represented sum.

    Convert a string expressing a sum of two integers delimited by a
    plus sign, e.g. 81+3, into an integer equal to the sum of the two
    numbers. A string without a plus sign is converted to an integer
    directly.

    Parameters
    ----------
    rating : str
        String representing a sum of two integers with a plus sign
        inbetween, or a single integer.

    Returns
    -------
    rating : int
        Integer value of the sum of the numbers. Integer is equal to zero
        if the input is not a string (e.g. a NaN marking a missing value).

    Raises
    ------
    ValueError
        If a part of the string is not a valid integer.

    Notes
    -----
    This function is used in the FIFA19 data set to convert the player's
    special rating format in the game to an integer to get the proper
    understanding of the data when performing Machine Learning.

    Examples
    --------
    >>> r = rating_format("81+3")
    >>> print(r)
    84
    """
    # isinstance (rather than an exact type check) also accepts str
    # subclasses, which would otherwise silently become 0.
    if not isinstance(rating, str):
        return 0
    base, plus, bonus = rating.partition('+')
    if plus:
        return int(base) + int(bonus)
    return int(base)


def work_format(work):
    """Return a numerical interpretation of a categorical variable.

    Take in a value describing the amplitude of a phenomenon and return
    an integer code for it: 2 for "High", 1 for "Medium" and 0 for
    anything else, e.g. "Low".

    Parameters
    ----------
    work : str
        String representing a categorical variable.

    Returns
    -------
    int
        Integer value of the work rate: 0, 1 or 2.

    Notes
    -----
    This function is used on the FIFA19 data set to convert the description
    of player's work rate ("High", "Medium" and "Low") into an integer in
    order to enable some of the Machine Learning algorithms to make use of
    the properties.

    The comparison is case sensitive and every unrecognised value maps
    to 0.

    Examples
    --------
    >>> w = work_format("High")
    >>> print(w)
    2

    >>> w = work_format("Low")
    >>> print(w)
    0

    >>> w = work_format("Poor")
    >>> print(w)
    0
    """
    if work == 'High':
        return 2
    if work == 'Medium':
        return 1
    return 0


def to_int(not_int):
    """Return the integer value of a floating point number.

    Return the integer part of a floating point number, i.e. the number
    truncated toward zero (for non-negative numbers this equals the
    floor). Return 0 if `not_int` is a NaN.

    Parameters
    ----------
    not_int : not_integer
        Not_integer means all those types -- float, NaN -- to be converted
        into an integer.

    Returns
    -------
    int
        Integer value of the parameter.

    See Also
    --------
    numpy.nan : Nan stands for not a number.

    Examples
    --------
    >>> n = to_int(17.5)
    >>> print(n)
    17

    >>> import numpy
    >>> n = to_int(numpy.nan)
    >>> print(n)
    0
    """
    # NaN cannot be converted by int(), so map it to 0 explicitly.
    return 0 if math.isnan(not_int) else int(not_int)


def apply_format(data_frame, column_names, format_method):
    """Apply a formatting function to a DataFrame column and return.

    Simplify applying format modifications to the data stored in columns
    of `data_frame`. Apply `format_method` to every value of each column
    of `data_frame` whose label is listed in `column_names`, replacing
    the column with the result. Return the DataFrame with the applied
    changes.

    Entries of `column_names` that are not strings or that are not
    columns of `data_frame` are skipped silently. If `format_method` is
    not callable, `data_frame` is returned unchanged.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        DataFrame containing the data to be modified. It is modified in
        place.
    column_names : list
        List of string labels of columns in `data_frame` to be modified.
    format_method : function
        Function to be applied to the columns of `data_frame`, whose labels
        are listed in `column_names`.

    Returns
    -------
    data_frame : pandas.DataFrame
        The passed in DataFrame with the formatting changes applied to
        its columns.

    See Also
    --------
    pandas.Series.apply

    Examples
    --------
    >>> data = pd.read_csv("data.csv")
    >>> print(data[['Wage']][0:3]) #print first few lines
        Wage
    0  €565K
    1  €405K
    2  €290K
    >>> data = apply_format(data, ['Wage'], money_format)
    >>> print(data[['Wage']][0:3])
       Wage
    0   565
    1   405
    2   290
    """
    # The callable check does not depend on the column: do it once.
    if not callable(format_method):
        return data_frame
    for column in column_names:
        if isinstance(column, str) and column in data_frame:
            # Assign the column as a whole. `.loc[:, column] = ...` writes
            # into the existing column in recent pandas versions and keeps
            # its old dtype (e.g. float or object), so converted values
            # would not end up in an integer column.
            data_frame[column] = data_frame[column].apply(format_method)
    return data_frame


def to_dummy(data_frame, column_names):
    """Return the DataFrame with dummy coded categorical variables.

    For each column of `data_frame` listed in `column_names`, append one
    dummy (indicator) column per distinct value of that column and
    remove the original categorical column. Return the modified
    DataFrame.

    Entries of `column_names` that are not strings or that are not
    columns of the DataFrame are skipped silently.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        DataFrame containing the data to be dummy coded.
    column_names : list
        List of string labels of columns in `data_frame` to be dummy coded.

    Returns
    -------
    data_frame : pandas.DataFrame
        A DataFrame with the dummy coded categorical variables. The new
        columns are appended at the end and are labelled with the bare
        category values (no column-name prefix).

    See Also
    --------
    pandas.get_dummies, pandas.concat

    Notes
    -----
    Thanks to dummy coding, statistical analysis may be performed on
    categorical data.

    [1] "Dummy coding refers to the process of coding a categorical
    variable into dichotomous variables. For example, we may have data
    about participants' religion, with each participant coded as follows:

        A categorical or nominal variable with three categories

                        ==========  ====
                        Religion    Code
                        ==========  ====
                        Christian   1
                        Muslim      2
                        Atheist     3
                        ==========  ====

    This is a nominal variable (see level of measurement) which would be
    inappropriate as a predictor in MLR. However, this variable could be
    represented using a series of three dichotomous variables (coded as
    0 or 1), as follows:

        Full dummy coding for a categorical variable with three categories"

                ==========  ==========  ==========  ==========
                Religion    Christian   Muslim      Atheist
                ==========  ==========  ==========  ==========
                Christian   1           0           0
                Muslim      0           1           0
                Atheist     0           0           1
                ==========  ==========  ==========  ==========

    Because the dummy columns carry no prefix, identical category values
    in two different columns produce duplicate column labels.

    References
    ----------
    [1] https://en.wikiversity.org/wiki/Dummy_variable_(statistics)
    """
    for column in column_names:
        if not isinstance(column, str) or column not in data_frame:
            continue
        indicator_columns = pd.get_dummies(data_frame[column])
        # concat/drop build new frames, so the caller's object is untouched.
        data_frame = pd.concat([data_frame, indicator_columns], axis=1)
        data_frame = data_frame.drop([column], axis=1)
    return data_frame


def split_work_rate(data_frame):
    """Split 'Work Rate' column into two and return the DataFrame.

    Split the 'Work Rate' column of `data_frame` on the "/ " separator
    into 'Defensive Work Rate' (first part) and 'Offensive Work Rate'
    (second part), append the two columns, remove 'Work Rate', apply the
    `work_format` function to the new columns and return the modified
    DataFrame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        DataFrame containing a 'Work Rate' column to be split.

    Returns
    -------
    data_frame : pandas.DataFrame
        The DataFrame with the 'Work Rate' column split into formatted
        defensive and offensive work rate columns.

    Raises
    ------
    KeyError
        If `data_frame` has no 'Work Rate' column.

    See Also
    --------
    apply_format, work_format

    Notes
    -----
    This function can be used to split and format work rate column into
    defensive and offensive work rates of a player as they are stored in
    one column in the FIFA19 data set.

    Examples
    --------
    >>> data = pd.read_csv("data.csv")
    >>> print(data[['Work Rate']][0:3]) #print first few rows
            Work Rate
    0  Medium/ Medium
    1       High/ Low
    2    High/ Medium
    >>> data = split_work_rate(data)
    >>> print(data[['Defensive Work Rate', 'Offensive Work Rate']][0:3])
       Defensive Work Rate  Offensive Work Rate
    0                    1                    1
    1                    2                    0
    2                    2                    1
    """
    new_labels = {0: 'Defensive Work Rate', 1: 'Offensive Work Rate'}
    # NOTE(review): the first component is labelled defensive and the
    # second offensive; confirm this matches the data set's ordering.
    parts = data_frame['Work Rate'].str.split('/ ', expand=True)
    parts = parts.rename(columns=new_labels)
    data_frame = pd.concat([data_frame, parts], axis=1).drop('Work Rate', axis=1)
    return apply_format(data_frame, list(new_labels.values()), work_format)


def preprocess(data):
    """Preprocess data to enable its analysis.

    Perform preprocessing on the FIFA 19 data set. Drop irrelevant
    attributes. Convert attribute types, e.g. categorical data into numerical
    or floating point numbers carrying integers into integers. Manage column
    representation of attributes. Return the preprocessed DataFrame,
    ready to perform data analysis on it.

    The steps are, in order: drop unused columns, remove goalkeepers,
    convert the rating and skill columns, drop rows with missing values,
    convert monetary amounts, convert discrete float columns to
    integers, dummy code categorical columns and split the work rate.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to preprocess. It must contain the columns named in this
        function and follow the column order the positional slices
        below rely on.

    Returns
    -------
    data : pandas.DataFrame
        Preprocessed data, ready to perform analysis on it. This is a
        new DataFrame; all work is done on the result of the initial
        column drop.

    Raises
    ------
    KeyError
        If one of the columns to be dropped is missing from `data`.

    See Also
    --------
    pandas.DataFrame.drop, pandas.DataFrame.dropna
    exclude_goalkeepers, apply_format, money_format, to_int,
    to_dummy, split_work_rate
    """
    # Drop useless attributes.
    # Unnamed: 0 is an index (0 - n).
    # ID is FIFA19's internal id.
    # Photo, Flag and Club Logo are images.
    # Real Face - Yes/No value if the game uses a 3D scan of the actual face of the player.
    # Loaned From is usually missing, duration of the contract and
    # date of joining the club are not essential.
    # Name and body parameters are not correlated to wage.
    data = data.drop(['Unnamed: 0', 'ID', 'Name', 'Photo', 'Flag', 'Club Logo',
                      'Loaned From', 'Height', 'Weight', 'Body Type', 'Real Face',
                      'Joined', 'Contract Valid Until'], axis=1)

    # Exclude goalkeepers as they constitute a special class of players and may
    # confuse algorithms analyzing it.
    data = exclude_goalkeepers(data)

    # Compute ratings on specific positions on the field and on football skills.
    # The slices are positional (counted after the drop above); presumably
    # 15-40 are the "81+3" style position ratings and 41-74 the numeric
    # skill attributes -- TODO confirm against the data set layout.
    # Whole-column assignment is used so the columns take the integer dtype
    # of the converted values; `.loc[:, label] = ...` writes into the existing
    # column in recent pandas versions and keeps its object/float dtype.
    for label in data.columns[15:41]:
        data[label] = data[label].apply(rating_format)
    for label in data.columns[41:75]:
        data[label] = data[label].apply(to_int)

    # Drop rows with missing values. Missing ratings and skills were already
    # turned into 0 above, so they do not cause a row to be dropped here.
    data.dropna(inplace=True)

    # Convert monetary amounts.
    data = apply_format(data, ['Wage', 'Value', 'Release Clause'], money_format)

    # Convert floats to int as the nature of this information ('Jersey Number',
    # 'International Reputation', 'Skill Moves', 'Weak Foot') is discrete.
    data = apply_format(data, ['Jersey Number', 'International Reputation',
                               'Skill Moves', 'Weak Foot'], to_int)

    # Convert categorical data to dummy variables in order to enable its analysis.
    data = to_dummy(data, ['Preferred Foot', 'Club', 'Position', 'Nationality'])

    # Split work rate into defensive and offensive work rate.
    data = split_work_rate(data)

    return data