Objective of this notebook:

  • Apply webscrap by myself
  • Visualise FIFA21 player data
  • Unsupervised learning to predict players' position

Note book summary:

  • Create python script (BeautifulSoup) to scrap FIFA21 EPL players data from https://sofifa.com/
  • Clean data
  • Gained information from the data
  • Simplify position
  • Train test split via stratified sampling
  • Predict position => 90% accuracy and F1 score for the test set via SVM
  • Midfielders are the position that are hardest to predict

Import python module

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

import time
import re

from random import randrange

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Webscrap FIFA21 EPL players' stats

First, check if you can webscrap. sofifa.com indicate you can webscrap through their robots.txt

Webscrap procedure:

  • Search prem players in Fifa21
  • Results are stored in table, 60 per page. up to 600
  • open each pages, grab the link to pages for individual players

Note: scrap responsibly. Add a random 2-6 seconds delay in between access to webpages

Grab list of EPL players

links = []

for x in range(1):
    
    # make a search and scrap the webpage --------------------------------------------------
    base_url = 'https://sofifa.com/players?type=all&lg%5B%5D=13'
    
    
    if x > 0:
        add_url ='&offset='+str(x*60)
    else:
        add_url=''

    r = requests.get(base_url+add_url)
    
    # call the soup function to make the html readable --------------------------------------
    webpage = bs(r.content)
    table = webpage.find('table')
    rows = table.find_all('tr')
    
    for row in rows[1::]:
        link = row.find_all("a",{"class":"tooltip"})[0]
        links.append(link['href'])
    
    # print progress on scrapping link
    print("Page",x,"done")
    
    #Be sure to pause
    time.sleep(randrange(2,6))
Page 0 done

Number of players in EPL:

len(links)
60

The top 10 players, sorted by overall rating:

links[:10]
['/player/192985/kevin-de-bruyne/210012/',
 '/player/212831/alisson-ramses-becker/210012/',
 '/player/209331/mohamed-salah/210012/',
 '/player/208722/sadio-mane/210012/',
 '/player/203376/virgil-van-dijk/210012/',
 '/player/153079/sergio-aguero/210012/',
 '/player/202126/harry-kane/210012/',
 '/player/215914/ngolo-kante/210012/',
 '/player/210257/ederson-santana-de-moraes/210012/',
 '/player/202652/raheem-sterling/210012/']

Ok, we have 644 players registered in EPL in the FIFA21 game. That is 32 players per club which is about right

Scrap individual player stats

Note: I have scrapped them before using the same function below. For for this demo, I will only scrap the first 10 players

links3 = links[:10]

Players to scrap (show first 5):

links3[:5]
['/player/192985/kevin-de-bruyne/210012/',
 '/player/212831/alisson-ramses-becker/210012/',
 '/player/209331/mohamed-salah/210012/',
 '/player/208722/sadio-mane/210012/',
 '/player/203376/virgil-van-dijk/210012/']

Use function to scrap

list_stats_top10 = []

def player_name(weblink):
    return weblink.find('h1').text

def nat(weblink):
    return weblink.find('div',{"class":"meta bp3-text-overflow-ellipsis"}).a['title']
    
def dob_wh(weblink):
    stuff = weblink.find('div',{"class":"meta bp3-text-overflow-ellipsis"}).text
    temp = stuff.split("(")[1]
    dob = temp.split(")")[0]
    temp2 = temp.split(")")[1].split(" ")
    height = temp2[1]
    weight = temp2[2]
    return dob, height, weight

def club_info(weblink):
    club = weblink.find(text = re.compile('Player Specialities')).parent.findNext('h5').text
    jersey = weblink.find(text = re.compile('Jersey Number')).next
    c_valid = weblink.find(text = re.compile('Contract Valid Until')).next
    c_value = weblink.find('section',{"class":"card spacing"}).find(text = re.compile('Value')).previous.previous
    wage = weblink.find('section',{"class":"card spacing"}).find(text = re.compile('Wage')).previous.previous
    return club, jersey, c_valid, c_value, wage

def player_stats(weblink):
    best_pos = weblink.find(text = re.compile('Best Position')).next.text
    best_rating = weblink.find(text = re.compile('Best Overall Rating')).next.text
    return best_pos, best_rating
    
def player_stats_detail(weblink):
    
    # Attacking stats
    temp = weblink.find(text = re.compile('Attacking')).parent.parent.find_all("li")
    keys = ['Crossing','Finishing','Heading Accuracy','Short Passing','Volleys']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    # skill stats
    temp = weblink.find(text = re.compile('Attacking')).parent.parent.parent.find_next_sibling("div").find_all("li")
    keys = ['Dribbling','Curve','FK Accuracy','Long Passing','Ball Control']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    # movement stats
    temp = weblink.find(text = re.compile('Movement')).parent.parent.find_all("li")
    keys = ['Acceleration','Spring Speed','Agility','Reactions','Balance']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text

    # power stats
    temp = weblink.find(text = re.compile('Power')).parent.parent.find_all("li")
    keys = ['Shot Power','Jumping','Stamina','Strength','Long Shots']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    
    # mentality stats
    temp = weblink.find(text = re.compile('Mentality')).parent.parent.find_all("li")
    keys = ['Aggression','Interceptions','Positioning','Vision','Penalties','Composure']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    # defending stats
    temp = weblink.find(text = re.compile('Defending')).parent.parent.find_all("li")
    keys = ['Defensive Awareness','Standing Tackle','Sliding Tackle']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    # goalkeeping stats
    temp = weblink.find(text = re.compile('Goalkeeping')).parent.parent.find_all("li")
    keys = ['GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes']
    for index,attrs in enumerate(temp):
        temp2 = attrs.find_all('span')
        stats[keys[index]] = temp2[0].text
    
    # traits stats
    try:
        temp = weblink.find(text = re.compile('Traits')).parent.parent.find_all("li")
        for attrs in temp:
            if 'Traits' in stats:
                stats['Traits'].append(attrs.text)
            else:
                stats['Traits'] = [attrs.text]
    except:
        stats['Traits'] = None
    
for index,link in enumerate(links3):
    # make a search and scrap the webpage --------------------------------------------------
    base_url = 'https://sofifa.com/'
    add_url = link
    r = requests.get(base_url+add_url)
    weblink = bs(r.content)
    
    stats={}
    
    # get player name
    name = player_name(weblink)
    stats['Player_name'] = name
    
    # get nationality
    nationality = nat(weblink)
    stats['Nationality'] = nationality
    
    # get dob, weight, height
    dob, height, weight = dob_wh(weblink)
    stats['dob'] = dob
    stats['height'] = height
    stats['weight'] = weight
    
    # get club info
    club, jersey, c_valid, c_value, wage = club_info(weblink)
    stats['club'] = club
    stats['jersey'] = jersey
    stats['c_valid'] = c_valid
    stats['c_value'] = c_value
    stats['wage'] = wage
    
    # add general player stats
    pos, rating = player_stats(weblink)
    stats['pos'] = pos
    stats['rating'] = rating
    
    # add player stats detail
    player_stats_detail(weblink)
    
    
    # print progress --------------------------
    list_stats_top10.append(stats)
    
    if index % 10 ==0:
        print(index)
    
    #Be sure to pause between accessing pages
    time.sleep(randrange(2,6))
    
0

Save JSON data of FIFA21 EPL players

import json

def save_data(title,data):
    with open(title, 'w', encoding ='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent = 2)
    
def load_data(title):
    with open(title, encoding ='utf-8') as f:
        return json.load(f)
#save_data('FIFA21_EPL_top10.json', list_stats_top10)

Load data for later use

import os
os.chdir("C:/Users/Riyan Aditya/Desktop/ML_learning/Project6_EPL_20192020")
FIFA21_data = load_data('FIFA21_EPL.json')

Lets see the first data, Kevin De Bruyne

FIFA21_data[0]
{'Player_name': 'K. De Bruyne',
 'Nationality': 'Belgium',
 'dob': 'Jun 28, 1991',
 'height': '5\'11"',
 'weight': '154lbs',
 'club': 'Manchester City',
 'jersey': '17',
 'c_valid': '2023',
 'c_value': '€129M',
 'wage': '€370K',
 'pos': 'CAM',
 'rating': '91',
 'Crossing': '94',
 'Finishing': '82',
 'Heading Accuracy': '55',
 'Short Passing': '94',
 'Volleys': '82',
 'Dribbling': '88',
 'Curve': '85',
 'FK Accuracy': '83',
 'Long Passing': '93',
 'Ball Control': '92',
 'Acceleration': '77',
 'Spring Speed': '76',
 'Agility': '78',
 'Reactions': '91',
 'Balance': '76',
 'Shot Power': '91',
 'Jumping': '63',
 'Stamina': '89',
 'Strength': '74',
 'Long Shots': '91',
 'Aggression': '76',
 'Interceptions': '66',
 'Positioning': '88',
 'Vision': '94',
 'Penalties': '84',
 'Composure': '91',
 'Defensive Awareness': '68',
 'Standing Tackle': '65',
 'Sliding Tackle': '53',
 'GK Diving': '15',
 'GK Handling': '13',
 'GK Kicking': '5',
 'GK Positioning': '10',
 'GK Reflexes': '13',
 'Traits': ['Injury Prone',
  'Leadership',
  'Early Crosser',
  'Long Passer (AI)',
  'Long Shot Taker (AI)',
  'Playmaker (AI)',
  'Outside Foot Shot']}

Data cleaning

df = pd.DataFrame(FIFA21_data)

Few things to do:

  • Everything is string. Convert to numeric when needed to be, especially the individual stats
  • Convert DOB to datetime
  • Convert height and weight to the SI unit
  • Convert value and wages to the right unit (eg: no M and no K)
  • What to do with traits in players? Perhaps ignore for now

Traits

First, lets look at the traits

  • How many unique traits are there?
  • Proportion of players that have traits?
  • Is it worth keeping?

Players with no traits:

df['Traits'].isnull().values.ravel().sum()
197

199 from 644 players (~30%) do not have any traits

Top 5 players that have no traits:

df[df['Traits'].isnull()][:5].Player_name
6        N. Kanté
32          Rodri
94     D. Sánchez
115      N. Matić
124      J. Evans
Name: Player_name, dtype: object

Wow. Kante doesnt have any traits? This could be a mistake from Fifa21 database

Unique traits:

df.Traits.explode().unique() 
array(['Injury Prone', 'Leadership', 'Early Crosser', 'Long Passer (AI)',
       'Long Shot Taker (AI)', 'Playmaker (AI)', 'Outside Foot Shot',
       'GK Long Throw', 'Rushes Out Of Goal', 'Finesse Shot',
       'Speed Dribbler (AI)', 'Chip Shot (AI)', 'Flair', 'Solid Player',
       'Power Header', None, 'Comes For Crosses', 'Team Player',
       'Dives Into Tackles (AI)', 'Technical Dribbler (AI)',
       'Cautious With Crosses', 'Saves with Feet', 'Long Throw-in',
       'Power Free-Kick', 'Giant Throw-in', nan], dtype=object)

print("number of unique traits :",len(df.Traits.explode().unique()))
number of unique traits : 26

26 unique traits. Probably too long if I expand the column similar to how a OHE works

Is it worth keeping?

Probably not. Remove them for now

df2 = df.copy()
df2 = df2.drop(labels='Traits', axis=1)

df2.columns
Index(['Player_name', 'Nationality', 'dob', 'height', 'weight', 'club',
       'jersey', 'c_valid', 'c_value', 'wage', 'pos', 'rating', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
       'Acceleration', 'Spring Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Defensive Awareness', 'Standing Tackle', 'Sliding Tackle',
       'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning',
       'GK Reflexes'],
      dtype='object')

# fix mistake with column name
df2.rename(columns={'Spring Speed':'Sprint Speed'}, inplace=True)

Convert to numeric

DOB to datetime

df2['dob'] = pd.to_datetime(df2['dob'])
df2.dob.dtype
dtype('<M8[ns]')

Height to numeric & SI

def height_conversion(ht):
    # current format is x'x"
    ht2 = ht.split("'")
    ft = float(ht2[0])
    inc = float(ht2[1].replace("\"",""))   # " is a special character
    return round(((12*ft)+inc)*2.54,0)

df2['height'] = df2['height'].apply(lambda x:height_conversion(x))

df2.height.dtype
dtype('float64')

Weight to numeric & SI

def weight_conversion(wt):
    # current format is xxxlbs
    wt2 = wt.split('lbs')
    return round(float(wt2[0])*0.453592,0)

df2['weight'] = df2['weight'].apply(lambda x:weight_conversion(x))

df2.weight.dtype
dtype('float64')

Contract value and wage to numeric

value_dict = {"K":1000, "M":1000000 }

def money_conversion(money):
    # current format is €xxxxK
    money = money.replace('€','')
    
    if money[-1] in value_dict:
        num, values = money[:-1], money[-1]
        return (float(num)* value_dict[values])

df2['c_value'] = df2['c_value'].apply(lambda x:money_conversion(x))
df2['wage'] = df2['wage'].apply(lambda x:money_conversion(x))

df2.c_value.dtype,  df2.wage.dtype
(dtype('float64'), dtype('float64'))

Convert jersey, rating and individual stats attributes to numeric

df2['jersey'] = pd.to_numeric(df2['jersey'], errors='coerce')
df2['rating'] = pd.to_numeric(df2['rating'], errors='coerce')

for col in  df2.columns[12::]:
    df2[col] = pd.to_numeric(df2[col], errors='coerce')

df3 = df2.copy()

Top players of Fifa21

Top rated players from each club

Note that there can be multiple top rated players per club

idx = df3.groupby(['club'])['rating'].transform(max) == df3['rating']
top_rated = df3[idx][['club','Player_name','pos','rating']]
top_rated.sort_values('club')
club Player_name pos rating
11 Arsenal P. Aubameyang ST 87
105 Aston Villa J. Grealish CAM 83
194 Brighton & Hove Albion L. Trossard CAM 80
129 Brighton & Hove Albion M. Ryan GK 80
193 Brighton & Hove Albion L. Dunk CB 80
77 Burnley N. Pope GK 82
6 Chelsea N. Kanté CDM 88
63 Crystal Palace W. Zaha CF 83
51 Everton Allan CDM 85
76 Fulham A. Areola GK 82
84 Leeds United Raphinha LM 82
68 Leeds United Rodrigo ST 82
20 Leicester City J. Vardy ST 87
45 Leicester City W. Ndidi CB 87
1 Liverpool Alisson GK 90
2 Liverpool M. Salah RW 90
3 Liverpool S. Mané LW 90
4 Liverpool V. van Dijk CB 90
0 Manchester City K. De Bruyne CAM 91
18 Manchester United Bruno Fernandes CAM 88
59 Newcastle United M. Dúbravka GK 83
156 Sheffield United S. Berge CDM 80
187 Sheffield United J. Egan CB 80
189 Sheffield United J. O'Connell CB 80
92 Southampton D. Ings ST 81
145 Southampton J. Ward-Prowse CM 81
10 Tottenham Hotspur H. Kane ST 88
9 Tottenham Hotspur H. Son LM 88
153 West Bromwich Albion B. Ivanović CB 79
96 West Ham United T. Souček CDM 82
97 West Ham United S. Haller ST 82
47 Wolverhampton Wanderers R. Jiménez ST 84
46 Wolverhampton Wanderers Rui Patrício GK 84

Top rated players per pos

These positions are based on what FIFA21 recommend as "Best Position"

idx = df3.groupby(['pos'])['rating'].transform(max) == df3['rating']
top_rated = df3[idx][['pos','club','Player_name','rating']]

# create custom sort so this makes positional sense
custom_dict = {'GK':0, 'CB':1, 'LB':2, 'RB':3, 'LWB':4, 'RWB':5, 'CDM':6, 'CM':7, 'CAM':8, 'RM':9, 'LM':10,
              'RW':11, 'LW':12, 'CF':13, 'ST':14}
top_rated['rank'] = top_rated['pos'].map(custom_dict)
top_pos = top_rated.sort_values('rank')
top_pos.drop(labels=['rank'], axis=1)
pos club Player_name rating
1 GK Liverpool Alisson 90
4 CB Liverpool V. van Dijk 90
13 LB Liverpool A. Robertson 87
12 RB Liverpool T. Alexander-Arnold 87
89 LWB Manchester City B. Mendy 82
54 RWB Manchester City João Cancelo 84
6 CDM Chelsea N. Kanté 88
23 CM Manchester United P. Pogba 86
50 CM Chelsea M. Kovačić 86
0 CAM Manchester City K. De Bruyne 91
34 RM Manchester United M. Rashford 86
9 LM Tottenham Hotspur H. Son 88
2 RW Liverpool M. Salah 90
3 LW Liverpool S. Mané 90
15 CF Liverpool Roberto Firmino 87
5 ST Manchester City S. Agüero 89

Some weird result here. Right forward maybe, but I would not put Rasford as a Right milfielder

Top rated players per original country

Unique nationalities of Prem players

df3.Nationality.unique().shape
(60,)

List of top players per nationality of origin:

idx = df3.groupby(['Nationality'])['rating'].transform(max) == df3['rating']
top_rated = df3[idx][['Nationality','club','Player_name','pos','rating']]
top_rated.sort_values('Nationality')
Nationality club Player_name pos rating
33 Algeria Manchester City R. Mahrez RW 85
5 Argentina Manchester City S. Agüero ST 89
129 Australia Brighton & Hove Albion M. Ryan GK 80
390 Austria Leicester City C. Fuchs CDM 74
0 Belgium Manchester City K. De Bruyne CAM 91
... ... ... ... ... ...
112 Ukraine Manchester City O. Zinchenko CM 81
80 United States Chelsea C. Pulisic CAM 82
43 Uruguay Manchester United E. Cavani ST 84
60 Wales Tottenham Hotspur G. Bale RW 83
297 Zimbabwe Aston Villa M. Nakamba CDM 76

64 rows × 5 columns

Top player based on some stats

Best crosser

df3[['Player_name','club','pos','Crossing']].loc[df3['Crossing'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
Crossing                    94
Name: 0, dtype: object

Best short passer

df3[['Player_name','club','pos','Short Passing']].loc[df3['Short Passing'].idxmax()]
Player_name         K. De Bruyne
club             Manchester City
pos                          CAM
Short Passing                 94
Name: 0, dtype: object

Best long passer

df3[['Player_name','club','pos','Long Passing']].loc[df3['Long Passing'].idxmax()]
Player_name        K. De Bruyne
club            Manchester City
pos                         CAM
Long Passing                 93
Name: 0, dtype: object

Best header

df3[['Player_name','club','pos','Heading Accuracy']].loc[df3['Heading Accuracy'].idxmax()]
Player_name         O. Giroud
club                  Chelsea
pos                        ST
Heading Accuracy           90
Name: 130, dtype: object

Best finisher

df3[['Player_name','club','pos','Finishing']].loc[df3['Finishing'].idxmax()]
Player_name          S. Agüero
club           Manchester City
pos                         ST
Finishing                   94
Name: 5, dtype: object

Best FK taker

df3[['Player_name','club','pos','FK Accuracy']].loc[df3['FK Accuracy'].idxmax()]
Player_name    J. Ward-Prowse
club              Southampton
pos                        CM
FK Accuracy                91
Name: 145, dtype: object

Best PK taker

df3[['Player_name','club','pos','Penalties']].loc[df3['Penalties'].idxmax()]
Player_name                 R. Jiménez
club           Wolverhampton Wanderers
pos                                 ST
Penalties                           92
Name: 47, dtype: object

Best volleyer

df3[['Player_name','club','pos','Volleys']].loc[df3['Volleys'].idxmax()]
Player_name    J. Rodríguez
club                Everton
pos                     CAM
Volleys                  90
Name: 67, dtype: object

Highest shot power

df3[['Player_name','club','pos','Shot Power']].loc[df3['Shot Power'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
Shot Power                  91
Name: 0, dtype: object

Speed merchant and acceleration

df3[['Player_name','club','pos','Sprint Speed']].loc[df3['Sprint Speed'].idxmax()]
Player_name                Adama Traoré
club            Wolverhampton Wanderers
pos                                  RM
Sprint Speed                         96
Name: 155, dtype: object

df3[['Player_name','club','pos','Acceleration']].loc[df3['Acceleration'].idxmax()]
Player_name                Adama Traoré
club            Wolverhampton Wanderers
pos                                  RM
Acceleration                         97
Name: 155, dtype: object

Dribbler

df3[['Player_name','club','pos','Dribbling']].loc[df3['Dribbling'].idxmax()]
Player_name     Bernardo Silva
club           Manchester City
pos                         RW
Dribbling                   92
Name: 17, dtype: object

df3[['Player_name','club','pos','Ball Control']].loc[df3['Ball Control'].idxmax()]
Player_name        K. De Bruyne
club            Manchester City
pos                         CAM
Ball Control                 92
Name: 0, dtype: object

Best stamina

df3[['Player_name','club','pos','Stamina']].loc[df3['Stamina'].idxmax()]
Player_name    N. Kanté
club            Chelsea
pos                 CDM
Stamina              96
Name: 6, dtype: object

Best strength

df3[['Player_name','club','pos','Strength']].loc[df3['Strength'].idxmax()]
Player_name                    W. Boly
club           Wolverhampton Wanderers
pos                                 CB
Strength                            93
Name: 126, dtype: object

Best positioning

df3[['Player_name','club','pos','Positioning']].loc[df3['Positioning'].idxmax()]
Player_name          S. Agüero
club           Manchester City
pos                         ST
Positioning                 94
Name: 5, dtype: object

Best vision

df3[['Player_name','club','pos','Vision']].loc[df3['Vision'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
Vision                      94
Name: 0, dtype: object

Best composure

df3[['Player_name','club','pos','Composure']].loc[df3['Composure'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
Composure                   91
Name: 0, dtype: object

Best defensive awareness

df3[['Player_name','club','pos','Defensive Awareness']].loc[df3['Defensive Awareness'].idxmax()]
Player_name            V. van Dijk
club                     Liverpool
pos                             CB
Defensive Awareness             93
Name: 4, dtype: object

Interceptions

df3[['Player_name','club','pos','Interceptions']].loc[df3['Interceptions'].idxmax()]
Player_name      N. Kanté
club              Chelsea
pos                   CDM
Interceptions          91
Name: 6, dtype: object

Best sliding tackle

df3[['Player_name','club','pos','Sliding Tackle']].loc[df3['Sliding Tackle'].idxmax()]
Player_name          A. Wan-Bissaka
club              Manchester United
pos                              RB
Sliding Tackle                   90
Name: 57, dtype: object

Best GK reflexes

df3[['Player_name','club','pos','GK Reflexes']].loc[df3['GK Reflexes'].idxmax()]
Player_name            H. Lloris
club           Tottenham Hotspur
pos                           GK
GK Reflexes                   90
Name: 19, dtype: object

Best GK kicking

df3[['Player_name','club','pos','GK Kicking']].loc[df3['GK Kicking'].idxmax()]
Player_name            Ederson
club           Manchester City
pos                         GK
GK Kicking                  93
Name: 7, dtype: object

Tallest player (in cm)

df3[['Player_name','club','pos','height']].loc[df3['height'].idxmax()]
Player_name    N. Pope
club           Burnley
pos                 GK
height             201
Name: 77, dtype: object

Heaviest player (kg)

df3[['Player_name','club','pos','weight']].loc[df3['weight'].idxmax()]
Player_name         W. Morgan
club           Leicester City
pos                        CB
weight                    101
Name: 389, dtype: object

Top player based on wages and salaries

Highest wages

df3[['Player_name','club','pos','wage']].loc[df3['wage'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
wage                    370000
Name: 0, dtype: object

Highest contract values

df3[['Player_name','club','pos','c_value']].loc[df3['c_value'].idxmax()]
Player_name       K. De Bruyne
club           Manchester City
pos                        CAM
c_value               1.29e+08
Name: 0, dtype: object

Top earner per club

idx = df3.groupby(['club'])['wage'].transform(max) == df3['wage']
top_rated = df3[idx][['club','Player_name','pos','rating','wage']]
top_rated.sort_values('club')
club Player_name pos rating wage
11 Arsenal P. Aubameyang ST 87 170000.0
165 Aston Villa R. Barkley CAM 81 81000.0
194 Brighton & Hove Albion L. Trossard CAM 80 56000.0
193 Brighton & Hove Albion L. Dunk CB 80 56000.0
192 Burnley C. Wood ST 78 63000.0
6 Chelsea N. Kanté CDM 88 190000.0
63 Crystal Palace W. Zaha CF 83 89000.0
51 Everton Allan CDM 85 115000.0
226 Fulham A. Mitrović ST 79 90000.0
68 Leeds United Rodrigo ST 82 140000.0
20 Leicester City J. Vardy ST 87 170000.0
2 Liverpool M. Salah RW 90 250000.0
3 Liverpool S. Mané LW 90 250000.0
0 Manchester City K. De Bruyne CAM 91 370000.0
18 Manchester United Bruno Fernandes CAM 88 195000.0
173 Newcastle United C. Wilson ST 78 48000.0
231 Sheffield United J. Fleck CM 77 38000.0
230 Sheffield United O. Norwood CM 77 38000.0
92 Southampton D. Ings ST 81 75000.0
10 Tottenham Hotspur H. Kane ST 88 220000.0
365 West Bromwich Albion M. Phillips RM 74 65000.0
97 West Ham United S. Haller ST 82 58000.0
47 Wolverhampton Wanderers R. Jiménez ST 84 140000.0

Total wages per club

grouped = df3.groupby('club')['wage'].sum().reset_index()
grouped.sort_values('wage', ascending=False)
club wage
11 Manchester City 3552000.0
12 Manchester United 2984000.0
10 Liverpool 2964000.0
16 Tottenham Hotspur 2503000.0
4 Chelsea 2371000.0
0 Arsenal 2116000.0
6 Everton 1835000.0
9 Leicester City 1817000.0
7 Fulham 1642000.0
8 Leeds United 1375000.0
19 Wolverhampton Wanderers 1299000.0
5 Crystal Palace 1212000.0
17 West Bromwich Albion 1103000.0
1 Aston Villa 1102000.0
2 Brighton & Hove Albion 1049000.0
3 Burnley 982000.0
13 Newcastle United 978000.0
15 Southampton 918000.0
18 West Ham United 833000.0
14 Sheffield United 652000.0

I am not sure the wages in FIFA21 reflect the real wages of these players

Feature engineering

We have 16 different positions. We need to simplify this.

Let's simplify those to the 4 traditional positions: Goalkeeper, Defender, Midfielder, Forward. For simplicity:

  • GK: GK
  • DF: CB, LB, RB, LWB, RWB
  • MF: CDM, CM, CAM, RM, LM,
  • FW: RW, LW, CF, ST

Simplify position

def repos(pos):
    if pos == 'GK':
        return 'GK'
    elif pos[-1] == 'B':
        return 'DF'
    elif pos[-1] == 'M':
        return 'MF'
    else:
        return 'FW'

df3['pos2'] = df3.apply(lambda x: repos(x['pos']), axis=1)
        

pos_count = df3['pos2'].value_counts()
pos_count
MF    240
DF    232
FW    104
GK     67
Name: pos2, dtype: int64

import plotly.express as px
from IPython.display import HTML

cat_order = ['GK','DF','MF','FW']
fig = px.bar(pos_count.reindex(cat_order))
fig.update_layout(yaxis_title="Count")
fig.update_layout(xaxis_title="Position")
fig.update_layout(showlegend=False)
#fig.show()
HTML(fig.to_html(include_plotlyjs='cdn'))

Plot of variables vs simpler position

wage, rating per position

fig2 = px.scatter(df3, x="rating", y="wage", color="pos2", hover_data=['Player_name'])
HTML(fig2.to_html(include_plotlyjs='cdn'))

Interesting trend here. Seems like FW tend to be the expensive one, while MF are cheap

Rating vs position

df3['pos2'] = pd.Categorical(df3['pos2'], ['GK','DF','MF','FW'])

fig3 = px.box(df3.sort_values("pos2"), x="pos2", y="rating", color = 'pos2', points="all", hover_data=['Player_name'])
fig3.update_layout(xaxis_title="Position")
fig3.update_layout(showlegend=False)
HTML(fig3.to_html(include_plotlyjs='cdn'))

Every position have players from 55 to 90s rating

Wage vs position

fig4 = px.box(df3.sort_values("pos2"), x="pos2", y="wage", color = 'pos2', points="all", hover_data=['Player_name'])
fig4.update_layout(xaxis_title="Position")
fig4.update_layout(showlegend=False)
HTML(fig4.to_html(include_plotlyjs='cdn'))

KDB is an outlier in term of his FIFA21 salary

pd.pivot_table(df3, index = 'pos2', values = 'wage')
wage
pos2
GK 42522.388060
DF 47622.807018
MF 53268.907563
FW 66365.384615

Forward gets higher wages. Make sense. Goalscoring attribute is a premium

Correlation map

df4 = df3.copy()
df4 = df4[['pos2','height', 'weight','c_value', 'wage', 'rating', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Defensive Awareness', 'Standing Tackle', 'Sliding Tackle',
       'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning',
       'GK Reflexes']]
df4.shape
(643, 40)
import seaborn as sns

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

corr = df4.corr()

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>

Some insights:

  • Contract value and wages strongly correlated with attacking attributes => attakeer tends to have higher wages
  • Interesting how height and weight tend to be correlated with Goalkeeping stats. Perhaps Goalie tends to be taller and heavier
  • A lot of individual stats attributes seem to be correlated to each other

Perhaps, instead of predicting positions through 30+ different attributes, we can simplify this to the 6 attributes that FIFA Ultimate team use. These stats are the following:

  • Pace
  • Shooting
  • Passing
  • Dribbling
  • Defending
  • Physical
  • Goalkeeping

Based on: https://www.fifauteam.com/fifa-20-attributes-guide/

They are a combination based on several attributes. For example, shooting is made up of: finishing, long sots, penalties, positioning, shot power, volleys. These stats are weighted, but for simplicity I will just average them.

Note that Keepers usually have their own attributes, but I am going to make up keeping attributes which are simply the average of all the keepers attributes here.

Simplify stats

df4['pace'] = df4.loc[:,['Acceleration','Sprint Speed'] ].mean(axis=1)
df4['shooting'] = df4.loc[:,['Finishing','Long shots','Penalties','Positioning','Shot Power','Volleys']].mean(axis=1)
df4['passing'] = df4.loc[:,['Crossing','Curve','FK Accuracy','Long Passing','Short Passing','Vision']].mean(axis=1)
df4['dribbling'] = df4.loc[:,['Agility','Balance','Ball Control','Composure','Dribbling','Reactions']].mean(axis=1)
df4['defending'] = df4.loc[:,['Heading Accuracy','Interceptions','Defensive Awareness','Sliding Tackle','Standing Tackle']].mean(axis=1)
df4['physical'] = df4.loc[:,['Aggression','Jumping','Stamina','Strength']].mean(axis=1)
df4['goalkeeping'] = df4.loc[:,['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning','GK Reflexes']].mean(axis=1)
C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\pandas\core\indexing.py:1418: FutureWarning:


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike

And again remove unnecessary column

df5 = df4[['pos2', 'height', 'weight', 'c_value', 'wage', 'rating', 
           'pace', 'shooting', 'passing', 'dribbling', 'defending','physical', 'goalkeeping']]

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

corr = df5.corr()

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>
corr['rating'].sort_values(ascending=False)
rating         1.000000
wage           0.792763
c_value        0.690792
dribbling      0.558178
physical       0.557454
passing        0.508782
shooting       0.420499
defending      0.308034
pace           0.227797
weight         0.182961
height         0.034694
goalkeeping    0.031497
Name: rating, dtype: float64

fig5 = px.scatter(df5, x="shooting", y="passing", color="pos2", width = 600, height = 600)
HTML(fig5.to_html(include_plotlyjs='cdn'))

This looks promising for unsupervised learning per position

Data cleaning

Remove NAN in wages

There are 7 players with no info on wages. They will be removed

df5 = df5[df5['wage'].notna()]

Model selection

Create label encoder for positions

from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
df5['pos2'] = encoder.fit_transform(df5['pos2'])
positions2 = encoder.inverse_transform([0,1,2,3])
positions2
array(['DF', 'FW', 'GK', 'MF'], dtype=object)

Split to train test

Split test and training set. use stratified sampling

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index in split.split(df5, df5.loc[:,'pos2']):
    strat_train_set = df5.iloc[train_index]
    strat_test_set = df5.iloc[test_index]
strat_train_set['pos2'].value_counts()/len(strat_train_set)
3    0.373281
0    0.357564
1    0.163065
2    0.106090
Name: pos2, dtype: float64
strat_test_set['pos2'].value_counts()/len(strat_test_set)
3    0.375000
0    0.359375
1    0.164062
2    0.101562
Name: pos2, dtype: float64
strat_train_set.shape, strat_test_set.shape
((509, 13), (128, 13))

Split to Xtrain, ytrain, Xtest, ytest

X_train = strat_train_set.copy().drop('pos2', axis=1)
y_train = strat_train_set['pos2']
X_test = strat_test_set.copy().drop('pos2', axis=1)
y_test = strat_test_set['pos2']

Numerical pipeline for standardisation

Numerical pipeline to standardise all x parameters

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer

num_pipeline = Pipeline([('std_scaler',StandardScaler())])
X_train_tr = num_pipeline.fit_transform(X_train)

Logistic regression

from sklearn.linear_model import LogisticRegressionCV

#scores
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,roc_auc_score,auc,f1_score  
from sklearn.model_selection import cross_val_score,learning_curve,GridSearchCV,validation_curve
LR = LogisticRegressionCV(cv=3,random_state=20, solver='liblinear', max_iter=1000)
clf_lr = LR.fit(X_train_tr,y_train)
y_pred_train_lr = clf_lr.predict(X_train_tr)
positions2
array(['DF', 'FW', 'GK', 'MF'], dtype=object)

fig = plt.figure(figsize=(10,7))
sns.set(font_scale=1.4)

cf = confusion_matrix(y_train, y_pred_train_lr)
df_cm_lr = pd.DataFrame(cf,index=positions2, columns=positions2)
heatmap = sns.heatmap(df_cm_lr, annot=True, fmt="d", annot_kws={"size": 16})
plt.title('Logistic Regression')
plt.ylabel('True label')
plt.xlabel('Predicted label')

print(" Accuracy: ",accuracy_score(y_train, y_pred_train_lr))
print(" F1 score: ",f1_score(y_train, y_pred_train_lr,average='weighted'))
 Accuracy:  0.9056974459724951
 F1 score:  0.905553244116887

Wow, this is much better than k-means model

plt.figure()
plt.title('Learning curve Logistic regression')
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(clf_lr, X_train_tr, y_train)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
plt.legend(loc="best")
C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:672: UserWarning:

The least populated class in y has only 2 members, which is less than n_splits=3.

<matplotlib.legend.Legend at 0x22b3c217b88>

KNN model

from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
param_grid = {'n_neighbors':np.arange(1,15)}
KNN = GridSearchCV(knn_model, param_grid, cv=3)
best_clf_knn = KNN.fit(X_train_tr,y_train)

def clf_performance(classifier,model_name):
    print(model_name)
    print('Best_score: '+str(classifier.best_score_))
    print('Best_parameters: '+str(classifier.best_params_))
clf_performance(best_clf_knn,'K Nearest Neighbors')
K Nearest Neighbors
Best_score: 0.8408980160111382
Best_parameters: {'n_neighbors': 14}
y_pred_train_knn = best_clf_knn.predict(X_train_tr)

fig = plt.figure(figsize=(10,7))
sns.set(font_scale=1.4)

cf = confusion_matrix(y_train, y_pred_train_knn)
df_cm_knn = pd.DataFrame(cf,index=positions2, columns=positions2)
heatmap = sns.heatmap(df_cm_knn, annot=True, fmt="d", annot_kws={"size": 16})
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('K nearest neighbour')

print(" Accuracy: ",accuracy_score(y_train, y_pred_train_knn))
print(" F1 score: ",f1_score(y_train, y_pred_train_knn,average='weighted'))
 Accuracy:  0.8722986247544204
 F1 score:  0.8708993540631961

plt.figure()
plt.title('Learning curve K nearest neighbour')
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(best_clf_knn, X_train_tr, y_train)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
plt.legend(loc="best")
C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:672: UserWarning:

The least populated class in y has only 2 members, which is less than n_splits=3.

<matplotlib.legend.Legend at 0x22b3c4ac1c8>

Random Forest

from sklearn.ensemble import RandomForestClassifier

gridsearch_forest = RandomForestClassifier()

params = {
    "n_estimators": [1, 10, 100],
    "max_depth": [5,8,15], #2,3,5 85 #5,8,10 88 #5 8 15 89
    "min_samples_leaf" : [1, 2, 4]}

RF = GridSearchCV(gridsearch_forest, param_grid=params, cv=3 )
best_clf_rf = RF.fit(X_train_tr,y_train)
clf_performance(best_clf_rf,'Random Forest')
Random Forest
Best_score: 0.86248984801021
Best_parameters: {'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 100}
y_pred_train_rf = best_clf_rf.predict(X_train_tr)

fig = plt.figure(figsize=(10,7))
sns.set(font_scale=1.4)

cf = confusion_matrix(y_train, y_pred_train_rf)
df_cm_rf = pd.DataFrame(cf,index=positions2, columns=positions2)
heatmap = sns.heatmap(df_cm_rf, annot=True, fmt="d", annot_kws={"size": 16})
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Random forest')

print(" Accuracy: ",accuracy_score(y_train, y_pred_train_rf))
print(" F1 score: ",f1_score(y_train, y_pred_train_rf,average='weighted'))
 Accuracy:  0.9705304518664047
 F1 score:  0.97040738131732

plt.figure()
plt.title('Learning curve Random Forest')
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(best_clf_rf, X_train_tr, y_train)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
plt.legend(loc="best")
C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:672: UserWarning:

The least populated class in y has only 2 members, which is less than n_splits=3.

<matplotlib.legend.Legend at 0x22b3c0dd388>

SVM

from sklearn.svm import SVC

SVM = SVC(kernel='linear', C=1)
clf_svm = SVM.fit(X_train_tr,y_train)
y_pred_train_svm = clf_svm.predict(X_train_tr)

fig = plt.figure(figsize=(10,7))
sns.set(font_scale=1.4)

cf = confusion_matrix(y_train, y_pred_train_svm)
df_cm_svm = pd.DataFrame(cf,index=positions2, columns=positions2)
heatmap = sns.heatmap(df_cm_svm, annot=True, fmt="d", annot_kws={"size": 16})
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Support Vector Machine')

print(" Accuracy: ",accuracy_score(y_train, y_pred_train_svm))
print(" F1 score: ",f1_score(y_train, y_pred_train_svm,average='weighted'))
 Accuracy:  0.9155206286836935
 F1 score:  0.9154216694655551

plt.figure()
plt.title('Learning curve SVM')
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(clf_svm, X_train_tr, y_train)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x22b3b22d448>

Summary

data_matrix = [["Model","accuracy","F1-score"],
         ['Logistic regression',accuracy_score(y_train, y_pred_train_lr).round(3),f1_score(y_train, y_pred_train_lr,average='weighted').round(3)],
         ['K nearest neighbour',accuracy_score(y_train, y_pred_train_knn).round(3),f1_score(y_train, y_pred_train_knn,average='weighted').round(3)],
         ['Random forest',accuracy_score(y_train, y_pred_train_rf).round(3),f1_score(y_train, y_pred_train_rf,average='weighted').round(3)],
         ['Support vector machine',accuracy_score(y_train, y_pred_train_svm).round(3),f1_score(y_train, y_pred_train_svm,average='weighted').round(3)]
              ]

data_matrix
[['Model', 'accuracy', 'F1-score'],
 ['Logistic regression', 0.906, 0.906],
 ['K nearest neighbour', 0.872, 0.871],
 ['Random forest', 0.971, 0.97],
 ['Support vector machine', 0.916, 0.915]]

Seems like random forest is the best based on this training set

fig,axn = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(12,12))
fig.suptitle('Confusion matrix for training dataset')

ax = plt.subplot(2, 2, 1)
sns.heatmap(df_cm_lr, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Logistic regression')
ax.set_aspect('equal')
plt.ylabel('True label')

ax = plt.subplot(2, 2, 2)
sns.heatmap(df_cm_knn, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('K nearest neighbour')
ax.set_aspect('equal')

ax = plt.subplot(2, 2, 3)
sns.heatmap(df_cm_rf, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Random Forest')
ax.set_aspect('equal')
plt.ylabel('True label')
plt.xlabel('Predicted label')

ax = plt.subplot(2, 2, 4)
sns.heatmap(df_cm_svm, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Support Vector Machine')
ax.set_aspect('equal')
plt.xlabel('Predicted label')

#sns.heatmap(df_cm_rf, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax[2])
#sns.heatmap(df_cm_svm, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax[3])
#plt.title('Logistic Regression')


plt.show()

Insights:

  • All models can predict goalkeeper right. That is expected
  • Midfielder is harder to predict. Not surprised, considering a lot of midfielders may have skills like defenders or forwards

Evaluate models with test set

Prepare Xtest

X_test_tr = num_pipeline.fit_transform(X_test)

Make prediction

y_pred_test_lr = clf_lr.predict(X_test_tr)
y_pred_test_knn = best_clf_knn.predict(X_test_tr)
y_pred_test_rf = best_clf_rf.predict(X_test_tr)
y_pred_test_svm = clf_svm.predict(X_test_tr)

Accuracy and F1 score

data_matrix = [["Model","accuracy","F1-score"],
         ['Logistic regression',accuracy_score(y_test, y_pred_test_lr).round(3),f1_score(y_test, y_pred_test_lr,average='weighted').round(3)],
         ['K nearest neighbour',accuracy_score(y_test, y_pred_test_knn).round(3),f1_score(y_test, y_pred_test_knn,average='weighted').round(3)],
         ['Random forest',accuracy_score(y_test, y_pred_test_rf).round(3),f1_score(y_test, y_pred_test_rf,average='weighted').round(3)],
         ['Support vector machine',accuracy_score(y_test, y_pred_test_svm).round(3),f1_score(y_test, y_pred_test_svm,average='weighted').round(3)]
              ]

data_matrix
[['Model', 'accuracy', 'F1-score'],
 ['Logistic regression', 0.898, 0.898],
 ['K nearest neighbour', 0.852, 0.848],
 ['Random forest', 0.867, 0.865],
 ['Support vector machine', 0.906, 0.905]]

For the test set, SVM is the best model (it was RF in the training set)

Confusion matrix

cf = confusion_matrix(y_test, y_pred_test_lr)
df_cm_lr_t = pd.DataFrame(cf,index=positions2, columns=positions2)
cf = confusion_matrix(y_test, y_pred_test_knn)
df_cm_knn_t = pd.DataFrame(cf,index=positions2, columns=positions2)
cf = confusion_matrix(y_test, y_pred_test_rf)
df_cm_rf_t = pd.DataFrame(cf,index=positions2, columns=positions2)
cf = confusion_matrix(y_test, y_pred_test_svm)
df_cm_svm_t = pd.DataFrame(cf,index=positions2, columns=positions2)

fig,axn = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(12,12))
fig.suptitle('Confusion matrix for test dataset')

ax = plt.subplot(2, 2, 1)
sns.heatmap(df_cm_lr_t, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Logistic regression')
ax.set_aspect('equal')
plt.ylabel('True label')

ax = plt.subplot(2, 2, 2)
sns.heatmap(df_cm_knn_t, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('K nearest neighbour')
ax.set_aspect('equal')

ax = plt.subplot(2, 2, 3)
sns.heatmap(df_cm_rf_t, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Random Forest')
ax.set_aspect('equal')
plt.ylabel('True label')
plt.xlabel('Predicted label')

ax = plt.subplot(2, 2, 4)
sns.heatmap(df_cm_svm_t, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax, cbar=False)
ax.set_title('Support Vector Machine')
ax.set_aspect('equal')
plt.xlabel('Predicted label')

#sns.heatmap(df_cm_rf, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax[2])
#sns.heatmap(df_cm_svm, annot=True, fmt="d", annot_kws={"size": 16}, ax = ax[3])
#plt.title('Logistic Regression')


plt.show()