Introduction

What is Call of Duty/Warzone?

Call of Duty is a very popular video game series published by Activision. Its free-to-play title Warzone has recently surged in popularity alongside the rise of the battle royale genre, and with that rise has come a large, highly competitive community. These players began to notice that the game's lack of a visible ranking system did not match their matchmaking experience, leading many to wonder whether a hidden skill-based matchmaking system is present in the game.

Why would someone care about their matchmaking?

In general, gamers care about their gaming experience. It's obviously not fun to consistently lose, but it's also not fun to consistently win. Finding the balance is very important to staying interested in a game over the long term, so from both a consumer and a developer standpoint, matchmaking is integral to keeping a video game relevant. That said, not being able to see your performance in relation to matchmaking removes a significant part of the experience. Other very popular video games such as League of Legends, Apex Legends, Valorant, and even FIFA all show players their ranking and progression through a ranked tier system. No such system exists in Warzone, which leads players to question the skill levels of their opponents and of themselves.

In addition, Activision may have a financial incentive to modify matchmaking, especially at the content creator level: content creators hold great influence over potential customers, and giving them a good experience might attract more players.

What exactly is skill based matchmaking?

Skill-based matchmaking is a system that matches players in a game based on some ranking. This ranking can be whatever metric the developers choose, but it is often implemented as an Elo rating or a custom MMR (matchmaking rating) tailored to the game's qualities.
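For intuition, here is a minimal sketch of a textbook Elo update. This is purely illustrative; we have no knowledge of what rating formula, if any, Warzone uses internally.

```python
# Illustrative Elo-style rating update (not Warzone's actual system).

def elo_expected(rating_a: float, rating_b: float) -> float:
    """Expected score of player A against player B under the Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))

def elo_update(rating: float, expected: float, actual: float, k: float = 32) -> float:
    """Move a rating toward the observed result; K controls the step size."""
    return rating + k * (actual - expected)

# Example: a 1200-rated player beats a 1400-rated player (actual score = 1).
exp = elo_expected(1200, 1400)
print(round(elo_update(1200, exp, 1.0)))  # rating rises to ~1224
```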

What is the goal of our project?

We will be exploring two main questions for this project.

The first question is "Is there skill-based matchmaking in Call of Duty: Warzone?". This question is a very common one amongst the COD user base, and as members of that user base ourselves, we wanted to find an answer.

The second question is arguably juicier because it puts Activision in the hot seat. We will be trying to answer "Does Activision purposefully lower the matchmaking difficulty of content creators?".

Because the game has a very passionate community, the answers to these questions can inform the feedback fans give to developers. As for the content creators, players who are also avid viewers of COD on Twitch or YouTube might rethink their opinions of whoever they watch.

Data Collection

Due to the specific nature of this project, we had to find creative ways to collect data regarding Call of Duty matchmaking information. Luckily, there is a Call of Duty API that enables developers to look at past match data and stats for specific players. However, Call of Duty's official API requires accounts to set their visibility to public for their profiles to be viewable. Fortunately, some third-party APIs aggregate data across games and paint a clearer picture of players' statistics. One of these is WZStats.gg (Warzone Stats), which shows detailed per-match data. By using their website and its API, we are able to get data on many players to help inform our research questions.

Because setting your profile to public visibility is a manual process, it is likely that more skilled players are the ones with visible profiles. This potentially limits our visibility into the player skill spectrum. If there is skill-based matchmaking, the initial accounts we analyze, and their respective game histories, will be biased towards higher skill tiers, because these players care more about their stats than lower-skilled players and are more likely to set their profiles to public. However, if there is no skill-based matchmaking, then game lobbies will be entirely random as far as skill is concerned (there could be other factors such as network latency and geographic location). With random lobbies, we should hypothetically be able to tap into the entire spectrum of players if we analyze enough games.

As noted, there is no existing database, so we needed to write code to create one. We first assembled a list of profiles with public visibility, including some of our own accounts as well as those of pro players and content creators. As mentioned previously, looking at content creators' accounts could provide insight into our second question: whether the matchmaking skill level of content creators' lobbies is lower.

The data collection process ended up being quite complicated for us. In fact, we spent 6 hours on this and had to try it about seven times. Oops. So what went wrong? Our initial collection process was built on a graph-theoretic view of the problem. Specifically, we wanted to perform a breadth-first traversal over accounts to sample the player base as effectively as possible. The plan was to start with the 10 seed accounts described earlier, treat each player as a new node (eliminating those already visited), and continue to analyze each person's previous 20 matches. This analysis, or rather data gathering, included capturing all (up to) 150 players per lobby and the lifetime KDs for each player in the lobby. Where was our logic faulty? Breadth-first search only works under the assumption that there is no skill-based matchmaking. If there is no skill-based matchmaking, then the breadth-first search lets us branch away from the current lobby to various different skill levels quite quickly, with few degrees of separation, if any at all. However, if there is skill-based matchmaking, then we would be stuck in the same region of the skill distribution and unable to reach the rest of the player base unless our initial seed accounts were perfectly distributed across the spectrum of players, which they are not.

So, we moved to method two. Method two pivots away from the breadth-first search towards a restricted graph traversal with a split factor of 2. More specifically, to achieve better sampling, starting at one player we randomly sample 2 players from their most recent match lobby. If we can successfully sample 2 accounts with public data settings, we add them to our queue and then repeat the process we performed on the initial player. We aim to do this repeatedly to achieve 11 degrees of separation from the original account. We arbitrarily selected a professional content creator's account, NICKMERCS, as the initial account and let the program run overnight, sampling 2^11 accounts. The following diagram shows how we designed our system to collect data.

[Diagram: the split-factor-2 sampling traversal, starting from the seed account]
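In code, the core of this traversal looks roughly like the sketch below. The lobby and privacy lookups here are synthetic stand-ins of our own naming (not real wzstats.gg calls), so only the control flow is meaningful.

```python
from collections import deque
import random

MAX_DEPTH = 11     # degrees of separation from the seed account
SPLIT_FACTOR = 2   # players sampled per lobby

# Synthetic stand-ins for the wzstats.gg lookups described below, so the
# traversal logic can be run and inspected on its own.
def get_lobby_players(player: str) -> list[str]:
    return [f"{player}-{i}" for i in range(150)]  # fake 150-player lobby

def is_public(player: str) -> bool:
    return random.random() < 0.5  # assume roughly half of profiles are public

def traverse(seed: str) -> set[str]:
    visited = {seed}
    queue = deque([(seed, 0)])  # (player, degrees of separation)
    while queue:
        player, depth = queue.popleft()
        if depth >= MAX_DEPTH:
            continue
        candidates = [p for p in get_lobby_players(player)
                      if p not in visited and is_public(p)]
        # Randomly sample up to 2 public accounts from the lobby and recurse.
        for nxt in random.sample(candidates, min(SPLIT_FACTOR, len(candidates))):
            visited.add(nxt)
            queue.append((nxt, depth + 1))
    return visited

print(len(traverse("NICKMERCS")))  # ~2^11 accounts at depth 11 on this synthetic data
```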

We wrote a script, "wzstats.py", to scrape data from wzstats.gg, the data-aggregating site mentioned previously in this writeup.
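A condensed sketch of wzstats.py follows; the endpoint URL and JSON field names here are illustrative placeholders, not wzstats.gg's documented API.

```python
# wzstats.py (abridged sketch). The base URL and JSON field names below are
# illustrative placeholders, not wzstats.gg's documented API.
import requests

BASE_URL = "https://wzstats.gg/api"  # placeholder base URL

def get_player_matches(platform: str, username: str, limit: int = 20) -> list[dict]:
    """Fetch a player's most recent matches (up to `limit`)."""
    resp = requests.get(f"{BASE_URL}/player/{platform}/{username}/matches",
                        params={"limit": limit}, timeout=30)
    resp.raise_for_status()
    return resp.json()["matches"]

def get_lobby(match_id: str) -> list[dict]:
    """Fetch every player (up to 150) in a match lobby with lifetime stats."""
    resp = requests.get(f"{BASE_URL}/match/{match_id}", timeout=30)
    resp.raise_for_status()
    # Each entry is assumed to carry username, platform, and lifetime_kd;
    # private accounts come back with those fields missing (NaN downstream).
    return resp.json()["players"]

def getTopPlayers() -> list[dict]:
    """Fetch the site's top-rated players (function name kept from our script)."""
    resp = requests.get(f"{BASE_URL}/leaderboard", timeout=30)
    resp.raise_for_status()
    return resp.json()["players"]
```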

The following is a snippet of the dataset that we built.
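That snippet comes from a quick pandas inspection along these lines; the CSV name and the match_id column are our own naming, while username, platform, and lifetime_kd are the fields discussed throughout this writeup.

```python
import pandas as pd

# Columns assumed: match_id, username, platform, lifetime_kd.
# Private accounts show NaN for username/platform.
df = pd.read_csv("warzone_lobbies.csv")
print(df.head())
print(df.shape)
```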

For our second question, we want to look specifically at the games of pro and content creator players. The following code iterates through the top accounts and gets their games. Luckily, we wrote a function above called getTopPlayers() that interacts with the Warzone Stats API to get the top-rated players in the Warzone system. Let's get their data and write it to a CSV for future use.
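A sketch of that loop, assuming getTopPlayers() returns records with username and platform fields, and reusing the hypothetical get_player_matches() helper from the wzstats.py sketch above:

```python
import pandas as pd
from wzstats import getTopPlayers, get_player_matches  # our scraper module

rows = []
for player in getTopPlayers():  # assumed to yield {"username": ..., "platform": ...}
    for match in get_player_matches(player["platform"], player["username"]):
        match["top_player"] = player["username"]  # tag each game with its source account
        rows.append(match)

pd.DataFrame(rows).to_csv("top_player_games.csv", index=False)
```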

Missing Data

From looking at the head of our dataset, we can immediately see that there are lots of NaN values. This might be alarming at first, but there is actually a very good reason for it. As we mentioned previously, not all accounts have public data available. Specifically, a player must go into their account settings and toggle this for every console linked to their Activision/Call of Duty account. This means that people who care about their stats will probably go through the trouble of toggling this setting so that websites like Warzone Stats can display their aggregated history.

This raises the question of what type of missing data this is. Our initial hunch is that the data is Missing at Random: specifically, that the missing username and platform fields are related to a player's lifetime_kd. This reasoning is plausible because players who care about their statistics probably also play the game a lot, and thus might have a higher lifetime_kd. Let's check this theory with some code!
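A minimal version of that check, assuming (as in our dataset) that private accounts are missing username and platform but still carry a lifetime_kd scraped from the lobby data:

```python
# Split on the username field: private accounts have it blank, but their
# lifetime_kd is still recorded from the lobby data.
private_kd = df.loc[df["username"].isna(), "lifetime_kd"].dropna()
public_kd = df.loc[df["username"].notna(), "lifetime_kd"].dropna()

print("public:  mean", public_kd.mean(), "std", public_kd.std())
print("private: mean", private_kd.mean(), "std", private_kd.std())
```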

We currently have two distinct datasets: entries that are private and have missing data, and entries that are public and do not. Since we created this database, we know that the missing data results from whether or not someone has toggled their data privacy settings to public within the Call of Duty account settings. However, we believe there is a deeper correlation: a relationship between a player's skill and whether or not they have public data. From a practical standpoint, it makes sense that players who play the game a lot care about their stats being publicly available. Playing a lot and being passionate about the game does not mean someone is good at it, but there is an argument to be made that skill scales with time played, at least to an extent (some people might forever be bad at the game, unfortunately).

So how do we decide whether two datasets are different? We will use a T-test, since we will effectively be comparing the mean KDs of players who are public with those of players who are private.

What are the assumptions made by a T-test?

  1. Data sets are independent.
  2. Data sets are (approximately) normally distributed.
  3. Data sets have a similar amount of variance within each group being compared (a.k.a. homogeneity of variance)

Regarding our data sets, we know that they are independent because a player cannot be both public and private. We will also assume that the KDs are approximately normally distributed. This assumption is definitely more of a stretch, as the KDs are somewhat skewed right, but there is no distinct heavy tail and there is a heavy concentration of KDs within 3 standard deviations of the mean, which allows us to be more confident in this assumption. We will write a little code to test this assumption below. The third assumption refers to the amount of variance in each data set, which we can also check below with some simple code.
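A sketch of those checks, using skewness as a rough normality proxy and Levene's test for the variance assumption (Levene's test is our choice here; any similar homogeneity test would do):

```python
from scipy import stats

# Assumption 2: approximate normality (skew near 0 is encouraging).
for name, kd in [("public", public_kd), ("private", private_kd)]:
    print(name, "skew:", stats.skew(kd), "variance:", kd.var())

# Assumption 3: homogeneity of variance. A large p-value here is
# consistent with the two groups having similar variances.
print(stats.levene(public_kd, private_kd))
```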

We can see that the variances here are nearly proportional to the means of each data set and are quite similar to each other. This supports the third assumption, especially given the very similar standard deviations. Next we can check whether the Empirical Rule (that 95% of data lies within 2 standard deviations of the mean) holds.
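A simple way to check this, pooling all lifetime KDs in the dataset:

```python
# Empirical Rule check: fraction of lifetime KDs within k standard deviations.
kd = df["lifetime_kd"].dropna()
mu, sigma = kd.mean(), kd.std()
for k in (1, 2, 3):
    within = ((kd - mu).abs() <= k * sigma).mean()
    print(f"within {k} sd: {within:.1%}")
```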

From the outputted print statements, we can see that the Empirical Rule holds; in fact, if you test the other components of the rule, over 70% of our data lies within 1 standard deviation of the mean and just over 99% lies within 3 standard deviations.

With these three assumptions addressed, let's establish some basic statistics before moving on to the T-test.

For additional resources regarding T-tests and other concepts for this section, see the following links:
https://www.investopedia.com/terms/e/empirical-rule.asp
https://www.investopedia.com/terms/t/t-test.asp
https://www.statisticshowto.com/probability-and-statistics/t-test/
https://www.scribbr.com/statistics/t-test/
https://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm

Here comes the T-test! The final step is to decide which type of T-test we want: paired, two-sample, or one-sample. We will go with a two-sample test since our sets are independent. Since we only care to show that the two sets differ, specifically that their means are different enough, we will use a two-tailed T-test. See the Scribbr link above for clearer explanations of when to use each type of test.
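The call itself is a one-liner with scipy; we use Welch's variant (equal_var=False), which is what yields the fractional degrees of freedom reported below.

```python
from scipy import stats

# Two-sample, two-tailed T-test on lifetime KD: public vs. private accounts.
t_stat, p_value = stats.ttest_ind(public_kd, private_kd, equal_var=False)
print(f"t = {t_stat:.3f}, p = {p_value:.3g}")
```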

From analyzing the average KDs of users with missing data and users without, we can see that the averages are very different. But just how significant is the difference? We performed a two-sided T-test on our two sets of data and found a T statistic of -135.158 with 53232.47 degrees of freedom for the difference between the means of these two subsets. This corresponds to a p-value indistinguishable from zero, which is extremely strong evidence of a difference between the average KDs of players with public and private data settings. In conclusion, the missing data is Missing at Random (MAR).

Exploratory Data Analysis (EDA) and Data Visualization

A basic histogram doesn't actually show us much. We can see that most KDs fall between 0 and 5, but there are definitely some significantly higher outliers. In practice, this could mean that in the 2048 games we analyzed, we encountered players who are either insanely good, better than any professional ever, or players who are hacking. Maintaining an incredibly high lifetime KD is very difficult because of the randomness of games; even some of the best players still have KDs around the 6-10 mark.

When we look at players whose KDs are over 6, we still see a high concentration in the 6-10 KD range. However, we also see a second cluster of players at the 20+ KD mark. Because of the very low frequency of these players, we can reasonably treat them as outliers. A single such player can shift a lobby's average KD by up to 35/150 ≈ 0.23, which is a large amount of skew, but because we only see 15 of them across 2048 games, the overall effect is negligible.

We can see that there are 15 users with a lifetime KD over 10. All 15 are actually private accounts. This is quite interesting because we previously showed that players with higher KDs tend to set their data settings to public. These players seem to be the best 15 by far, and yet their privacy settings are still set to private. While it is entirely possible that some extremely good players do not care enough to change the setting, it is unlikely that ALL 15 follow the same logic.

Moreover, in practice, having a KD that high as a LIFETIME KD, not a SINGLE GAME KD, is extremely unlikely. It would require players to drop 10+, 20+, or even 30+ kills per game consistently while limiting their deaths to 1 or 2. Note that for KD calculations, to avoid divide-by-zero errors, COD counts 0 deaths as 1 death (i.e. 35 kills and 0 deaths = 35 kills and 1 death). Because even professional players and content creators are unable to achieve this level of success, we can reasonably assume that these 15 players are one of two things: either brand new accounts with maybe 1 or 2 insanely good games, in which case their lifetime KD and single game KDs might be very similar, OR hackers who rack up lots of kills over many games using aimbot and other cheats.

Also note that we said brand new account and not brand new player. Players can have numerous accounts, and it is possible that a professional player, content creator, or anyone else created a new account and had a very good first game (or several), but the odds of this are low for another reason. In Call of Duty: Warzone, one way to improve your chances of winning is to level up your guns and unlock new attachments and other perks (https://www.dexerto.com/call-of-duty/best-warzone-loadouts-class-setup-1342383/). This can only be done by playing the game for an extensive amount of time, usually requiring tens if not hundreds of games to complete all the achievements necessary to level up your equipment and profile. A very good player on a brand new account still faces this hurdle and is severely disadvantaged when entering a game for the first time. Thus, from a practical standpoint, it is more likely that these users are hackers or bots and not legitimate players.

Now that we have talked about the outliers, let's look back towards the more realistic end of the player spectrum.

We are curious about the frequency of certain KDs: are there KD values that are very frequent and others that are very infrequent? We can see from the histogram that KDs generally cluster around 1, but we wonder if we can get a clearer picture by looking a bit more closely.

We can see that there are 2 major outliers here where roughly 250 people share the exact same KD, which is very unlikely to happen at this scale. Let's take a look at what those KDs are.
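A quick frequency count along these lines surfaces them:

```python
# Count how many players share each exact lifetime KD value.
kd_counts = df["lifetime_kd"].dropna().value_counts()
print(kd_counts.head(10))          # most common exact KD values
print(kd_counts[kd_counts > 100])  # KD values shared by more than 100 players
```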

We ended up looking for all KD values shared by over 100 people. Very surprisingly, the most common KDs by far fall on {0, 1/3, 1/2, 2/3, 1}. In practice, this is probably a sign of players who have played their first game or only a handful of games, as these KDs are very common over a small number of games. Another possibility is some rounding on the API side for players with limited data. Interestingly, if you expand the range to query KDs shared by over 75 people, the values that appear are also still very round numbers/decimals.

Let's also do a bit of analysis specifically on the lobbies of games involving high-skill players.

Hypothesis Testing and Evaluation of Null Model

If we assume that COD Warzone does not matchmake lobbies on the basis of skill, or KD, we should expect that lobbies are a random sampling of individuals from the distribution of KD. We will call this the null model. We will analyze how well actual observations correspond to this theoretical model to determine its fitness.

To read more about convolutions and the statistical theory behind them, see the following links:
https://www.statlect.com/glossary/convolutions
https://www.youtube.com/watch?v=P3ZcJEy84ps

Given our null model, if the ~2000 actual observations of lobby average lifetime KDs fall under this model, they should be uniformly distributed over the percentiles (the CDF at each observed value) given by the model. To test this, we calculate the percentile of each observation by computing the theoretical distribution of the lobby average for each lobby size and then evaluating that CDF at the actual observed lobby average KD.

Again, if the null model is correct, and lobbies are a simple random sample of the population of players, we should see uniformly distributed percentiles according to this model over the sample of lobby average KDs. The idea behind this method is that if F(x) is the CDF of a continuous random variable X, then F(X) is a uniformly distributed random variable over [0,1]. This means that for a sample X1, ..., Xn, the values F(X1), ..., F(Xn) should be a sample from the uniform distribution over [0,1].
https://math.stackexchange.com/questions/868400/showing-that-y-has-a-uniform-distribution-if-y-fx-where-f-is-the-cdf-of-contin
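A sketch of the percentile computation follows, with one simplification: where our analysis computes each lobby-average distribution exactly via convolutions, this sketch substitutes the CLT normal approximation N(mu, sigma^2/n). It also assumes lobbies are identified by a match_id column, and the Kolmogorov-Smirnov test at the end is one way to quantify departure from uniformity.

```python
import numpy as np
from scipy import stats

# Population distribution of lifetime KD, pooling all sampled players.
kd = df["lifetime_kd"].dropna().to_numpy()
mu, sigma = kd.mean(), kd.std()

# Size and average lifetime KD of each observed lobby.
lobby = df.dropna(subset=["lifetime_kd"]).groupby("match_id")["lifetime_kd"]
sizes, means = lobby.size().to_numpy(), lobby.mean().to_numpy()

# Under the null model, the average KD of a lobby of size n is approximately
# N(mu, sigma^2 / n); each lobby's percentile is the CDF at its observed mean.
percentiles = stats.norm.cdf(means, loc=mu, scale=sigma / np.sqrt(sizes))

# If lobbies were random samples, these percentiles would be Uniform(0, 1).
print(stats.kstest(percentiles, "uniform"))
```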

These percentiles are not uniformly distributed, so it is extremely unlikely that lobbies are generated in a way that randomly samples the population of players, i.e. one that ignores skill.

So, from the above statistical analysis, we can see that, regarding question 1, there is definitely some factor influencing the creation of game lobbies. This does not prove that there is skill-based matchmaking; rather, it shows that since lobbies are not random samples of the player base, something else is going on, which could well be skill-based matchmaking.

Now, let's do some EDA regarding question two.

Let's refresh our memories as to how the Top Players' DataFrame and the General Population DataFrame are formatted.

Similar to the procedure we performed for the missing data section, we again want to test whether we can confidently determine that the Top Players' data differs from the General Population's data. Specifically, we want to see whether Top Players' games are different from the General Population's games, and whether this can be determined with confidence (i.e., the probability that the difference is due to chance is very low).

So we will resort to another T-test. Let's go over the assumptions again.

  1. Data sets are independent.
  2. Data sets are (approximately) normally distributed.
  3. Data sets have a similar amount of variance within each group being compared (a.k.a. homogeneity of variance)

By the construction of our sample, these data sets are independent.

We can see at face value that the means and standard deviations are very different when we compare top players' games to the general population's games. Let's now conduct a T-test again to check whether we can confidently say that the two data sets differ.
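A sketch of that comparison, reusing the Welch T-test from the missing-data section and assuming the top players' games were saved with the same schema as the general dataset:

```python
import pandas as pd
from scipy import stats

top_games = pd.read_csv("top_player_games.csv")  # written earlier; assumed same schema as df

# Per-lobby average lifetime KD for each group.
top_lobby_kd = (top_games.dropna(subset=["lifetime_kd"])
                         .groupby("match_id")["lifetime_kd"].mean())
gen_lobby_kd = (df.dropna(subset=["lifetime_kd"])
                  .groupby("match_id")["lifetime_kd"].mean())

# Same two-tailed Welch T-test as before.
print(stats.ttest_ind(top_lobby_kd, gen_lobby_kd, equal_var=False))
```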

Based on these T-test results, we can say with strong likelihood that the top players' games and the general population's games are significantly different. We will address this more directly in the conclusion.

Conclusion

So, now that we have done all of this analysis and exploration, what are the results?

We started the project inspired by two main questions:
1) Is there skill-based matchmaking in Call of Duty: Warzone lobbies?
2) Do Top Players and Content Creators face easier lobbies than the general population, to help Activision boost sales?

This created the two null hypotheses:
1) Lobbies are random samples of the general population
2) Top Players' and Content Creators' lobbies are no different in difficulty (as measured by average lobby lifetime KD) from the general population's

To address these questions, we first examined our datasets for missing data. We learned about the difference between public and private data settings on a player's account and how this creates missing data in our dataset. We were curious about the relationship of KD to the missing data, so after looking at some basic statistics for the two populations of players, we performed a T-test to see if we could confidently say that the two sets were significantly different.

Once we confirmed the type of missing data, we moved on to Exploratory Data Analysis. This mostly involved creating histograms for various frames of our dataset. We noticed a really long right tail in the data and decided to look at it more closely, since most of the data was around 1 yet we had KDs over 20. We discussed what this meant in practice and identified data that most likely comes from hackers. While we could have discarded this data, hackers are unfortunately a part of the game and can make real lobbies much harder. They are also not impossible to defeat, as very good players can outplay hackers frequently. So, we decided to leave them in the dataset, since they are still valid data points, albeit ones ruining the game, which is not central to our analysis.

We then looked at the presumably non-hacker data with KDs <= 6. We created a scatter plot of the number of people with each unique KD and found that the most common KDs were suspiciously round numbers. We explained that this might be caused by newer players with fewer games, and then took a brief graphical look at the Top Players' games before moving on to hypothesis testing.

To examine the null hypothesis that lobbies are not matchmade using skill-based criteria, we first constructed a null model for lobby construction that ignores skill: one that randomly samples from the entire population of KDs. We calculated the percentile of each observed lobby within this model and found that these percentiles were not uniformly distributed over 0%-100%, so it is likely that lobbies are made using criteria that consider skill. In essence, the model poorly predicted the observed lobbies, which indicates some influential factor in matchmaking that may or may not directly involve skill. Thus, we rejected the null hypothesis, since we showed that lobbies are not actually random samples of the player base.

After concluding that hypothesis test and model argument for question 1, we moved to question 2 to see what insights we could develop. Similar to the missing-data T-test procedure, we performed another T-test, this time looking to show that the mean lobby lifetime KDs of Top Players' games were in fact different from those of general players' games. Our T-test confirmed this. Comparing the averages of the Top Players' games and the general players' games, we saw that the Top Players actually face significantly more difficult opponents, since their lobbies had a much higher average lifetime KD. Thus, we rejected the null hypothesis, since we can now confidently determine that Top Players actually face harder lobbies than the general population. Interestingly, this also further supports our conclusion to question 1, since we have shown that being higher skilled yields higher skilled lobbies. While this does not let us conclude that the matchmaking factor is actually skill, it definitely supports that argument and could serve as a basis for further testing.

Important Note
Throughout these conclusions, you will see that we never directly claim that skill-based matchmaking exists; rather, we argue that some form of matchmaking bias does exist. The reason is that, while KD is the main metric we used for analysis, we cannot actually prove that KD alone explains the distribution and differences between lobbies. For example, the time of day when a game occurs could heavily influence the spectrum of players online. A lobby created during the late-night/early-morning hours is likely to contain better players staying up late to grind the game, whereas a lobby created during the early evening hours is likely to be a better sample of the player base, since more casual players are likely to be online. Alternatively, weeknight games might have significantly fewer casual players, who might only play on weekends. Unfortunately, we do not have datetime data for the lobbies we sampled, so analyzing this aspect was beyond the scope of our analysis. However, it remains grounds for further study, because we have certainly shown that some factor heavily influences the creation of game lobbies, despite Activision Blizzard claiming that there is no Skill Based Matchmaking.

Thank you!

Written by Alex Coppens, Luke Stuart, and Sandeep Ramesh