9. Statistics¶

데이터 표현¶

num_friends = [100,49,41,40,25,21,21,19,19,18,18,16,15,15,15,
               15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,
               10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,
               9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,
               7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,
               6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
               5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,
               3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,
               2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]

Counter와 plt.bar()를 이용하여 데이터 표현

%matplotlib inline

from collections import Counter
from matplotlib import pyplot as plt

# Tally how many users have each friend count.
friend_counts = Counter(num_friends)
# x positions 0..100; 100 is the largest friend count in the data.
xs = range(101)
# A Counter yields 0 for values that never occur, so every x gets a bar height.
ys = [friend_counts[x] for x in xs]
plt.bar(xs, ys)
plt.show()

통계량¶

자료로부터 몇 가지 통계량을 계산해 보자.

num_points = len(num_friends)
print(num_points)

204

# Largest friend count in the sample (name spelled consistently with
# smallest_value / second_largest_value used by the neighbouring cells).
largest_value = max(num_friends)
print(largest_value)

100

smallest_value = min(num_friends) 
print(smallest_value)

1

# Ascending copy of the data; sorted() leaves num_friends itself untouched.
# NOTE(review): "vaule" is a misspelling of "value". Later cells read this
# name, so a rename has to update every use together.
sorted_vaule = sorted(num_friends) 
print(sorted_vaule)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, 18, 18, 19, 19, 21, 21, 25, 40, 41, 49, 100]

smallest_value = sorted_vaule[0]
print(smallest_value)

1

second_smallest_value = sorted_vaule[1]
print(second_smallest_value)

1

second_largest_value = sorted_vaule[-2]
print(second_largest_value)

49

중심 성향 - 평균¶

# Python2.x에서는 from __future__ import division가 필요하나 
# Python3.x에서는 필요없음
from __future__ import division
def mean(x):
    """Return the arithmetic mean of the values in x.

    An empty x raises ZeroDivisionError, because the sum is divided by
    len(x).
    """
    total = sum(x)
    return total / len(x)

mean(num_friends)    #7.333333

7.333333333333333

중심 성향 - 중앙값¶

def median(v):
    """Return the middle value of v once its elements are sorted.

    With an odd number of elements the single middle element is returned
    unchanged; with an even number, the average of the two middle elements
    is returned.
    """
    # divmod gives the middle index and whether the length is odd.
    middle, is_odd = divmod(len(v), 2)
    ordered = sorted(v)

    if is_odd:
        return ordered[middle]
    # Even length: the two central elements straddle the midpoint.
    return (ordered[middle - 1] + ordered[middle]) / 2

median(num_friends)     #6.0

6.0

표본백분위수 - percentile¶

표본의 제 100p 백분위수 계산

데이터를 작은 것부터 크기순으로 나열한다.
(표본크기)×(비율)=𝑛𝑝를 구한다.
만일 𝑛𝑝가 정수가 아니면, 다음 정수로 올림하고 그에 대응되는 순서화된 값을 찾는다.
만일 𝑛𝑝가 𝑘인 정수이면, 𝑘번째와 (𝑘+1)번째 순서화된 값의 평균을 구한다.

def percentile(x, p):
    """Return the sample 100p-th percentile of x, for a fraction p in [0, 1].

    The rank n*p is computed from the sample size n:
      * if n*p is not an integer, it is rounded up and the value at that
        (1-based) position of the sorted data is returned;
      * if n*p is an integer k, the average of the k-th and (k+1)-th sorted
        values is returned.
    At the boundaries there is only one neighbouring value, so p = 0 returns
    the smallest value and p = 1 returns the largest.

    Raises ValueError if x is empty or p lies outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1 (inclusive)")

    sorted_x = sorted(x)
    n = len(sorted_x)
    if n == 0:
        raise ValueError("percentile requires at least one data point")

    rank = n * p
    nearest = int(round(rank))

    # Compare with a tolerance: products such as 100 * 0.07 come out as
    # 7.000000000000001 and must still be treated as the integer rank 7.
    if abs(rank - nearest) < 1e-9:
        if nearest == 0:
            return sorted_x[0]      # no "0-th" value to average with
        if nearest == n:
            return sorted_x[-1]     # no "(n+1)-th" value to average with
        return (sorted_x[nearest - 1] + sorted_x[nearest]) / 2

    # Non-integer rank: the 1-based position ceil(rank) is the 0-based
    # index int(rank).
    return sorted_x[int(rank)]

percentile(num_friends, 0.10)   # 1

1

percentile(num_friends, 0.25)   # 3

3.0

percentile(num_friends, 0.75)   # 9

9.0

최빈값 – mode¶

def mode(x):
    """Return every value that occurs most often in x, as a list.

    The list has more than one entry when several values tie for the
    highest frequency.
    """
    tally = Counter(x)
    highest = max(tally.values())
    return [value for value in tally if tally[value] == highest]

mode(num_friends)    # 1 and 6

[6, 1]

범위 – range¶

def data_range(x):
    """Return the spread of x: its largest value minus its smallest."""
    highest = max(x)
    lowest = min(x)
    return highest - lowest

data_range(num_friends)     # 99

99

사분범위 - Interquartile range¶

def interquartile_range(x):
    """Return the 75th percentile of x minus its 25th percentile."""
    upper_quartile = percentile(x, 0.75)
    lower_quartile = percentile(x, 0.25)
    return upper_quartile - lower_quartile

interquartile_range(num_friends)   # 6

6.0

표본 분산 - variance¶

Linear algebra 단원에서 작성한 sum_of_squares 함수를 이용한다.

def dot(v, w):
    """Return the sum of the pairwise products of v and w.

    Elements are paired with zip, so any surplus elements of the longer
    input are ignored.
    """
    products = (a * b for a, b in zip(v, w))
    return sum(products)

def sum_of_squares(v):
    """Return the dot product of v with itself."""
    squared_total = dot(v, v)
    return squared_total

def de_mean(x):
    """Return a new list holding each element of x minus the mean of x.

    The shifted values therefore have mean 0.
    """
    center = mean(x)
    return [value - center for value in x]

def variance(x):
    """Return the sample variance of x.

    The sum of squared deviations from the mean is divided by n - 1, so x
    needs at least two elements.
    """
    degrees_of_freedom = len(x) - 1
    squared_deviation_total = sum_of_squares(de_mean(x))
    return squared_deviation_total / degrees_of_freedom

variance(num_friends)   # 81.54

81.54351395730716

표준 편차¶

import math

def standard_deviation(x):
    """Return the sample standard deviation: the square root of variance(x)."""
    spread = variance(x)
    return math.sqrt(spread)

standard_deviation(num_friends)    # 9.03

9.03014473623248

공분산¶

daily_minutes = [1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,
                 54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,
                 35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,
                 21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,
                 40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,
                 35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,
                 29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,
                 17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,
                 25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,
                 44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,
                 35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,
                 20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,
                 24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,
                 29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,
                 14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,
                 33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,
                 31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,
                 17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,
                 23.48,8.38,27.81,32.35,23.84]

공분산

def covariance(x, y):
    """Return the sample covariance of the paired observations x and y.

    The deviations of x and y from their own means are multiplied pairwise,
    summed, and divided by n - 1.

    Raises ValueError if x and y differ in length. Without this check the
    pairwise product would silently drop the surplus elements while the
    means and n - 1 were still computed from the full inputs.
    """
    n = len(x)
    if n != len(y):
        raise ValueError("covariance requires x and y to have the same length")
    return dot(de_mean(x), de_mean(y)) / (n - 1)

covariance(num_friends, daily_minutes)   # 22.43

22.425435139573064

상관관계¶

def correlation(x, y):
    """Return the covariance of x and y divided by both standard deviations.

    If either standard deviation is not positive, 0 is returned instead.
    """
    sd_x = standard_deviation(x)
    sd_y = standard_deviation(y)
    # Guard clause: with no variation in either input, report zero correlation.
    if not (sd_x > 0 and sd_y > 0):
        return 0
    return covariance(x, y) / sd_x / sd_y

correlation(num_friends, daily_minutes)  # 0.25

0.24736957366478218

scatter plot으로 데이터 확인¶

import matplotlib.pyplot as plt
plt.scatter(num_friends, daily_minutes)
plt.show()

Outlier¶

100명의 친구를 가진 사람은 outlier라고 간주해 보자.
- 상관관계(correlation)는 outlier에 민감함.

# Same scatter plot as before, with an arrow labelling the 100-friend point.
plt.scatter(num_friends, daily_minutes)
# NOTE(review): the first entries of the two lists are 100 friends and
# 1 minute, so the point sits at (100, 1); xy=(100, 0) aims just below it.
# Confirm whether that is intended.
# xytext is an offset from xy, measured in points (textcoords='offset points').
plt.annotate("outlier", xy=(100, 0), xytext=(-40, 20), 
             textcoords='offset points', arrowprops={"arrowstyle" : "->"})
plt.show()

# Position of the 100-friend user; the same position is dropped from both
# lists so the remaining observations stay paired.
outlier = num_friends.index(100)
num_friends_good = num_friends[:outlier] + num_friends[outlier + 1:]
daily_minutes_good = daily_minutes[:outlier] + daily_minutes[outlier + 1:]
correlation(num_friends_good, daily_minutes_good)   # 0.57

0.5736792115665573

plt.scatter(num_friends_good, daily_minutes_good)
plt.show()

numpy를 이용한 평균¶

평균은 배열의 mean 메서드(x.mean()) 또는 np.mean 함수(np.mean(x))로 구할 수 있다.

import numpy as np
x = np.arange(10)
print(x.mean())                 #4.5
print(np.mean(x))              #4.5

4.5
4.5

행렬 형식의 데이터의 평균

# 2x3 matrix with rows [0, 1, 2] and [3, 4, 5].
b = np.array([[0,1,2], [3,4,5]])
print(b.mean())    # mean of all six elements
print(b.mean(0))   # axis 0: one mean per column (3 values)
print(b.mean(1))   # axis 1: one mean per row (2 values)

2.5
[1.5 2.5 3.5]
[1. 4.]

numpy를 이용한 중앙값¶

중앙값은 np.median 함수로만 제공된다 (배열 메서드 x.median()은 없음).

# 4x5 matrix of random draws; no seed is set, so the printed values change
# from run to run.
x = np.random.randn(4, 5)
print(np.median(x))      # median over all 20 elements
print(np.median(x, 0))   # axis 0: one median per column (5 values)
print(np.median(x, 1))   # axis 1: one median per row (4 values)

0.7032520518358474
[0.27776393 0.98835516 0.72885183 0.00309088 0.70325205]
[0.13311176 0.70179697 0.90549454 0.38980393]

여기서 np.random.randn(4, 5)는 표준정규분포를 따르는 난수로 이루어진 4×5 행렬을 생성한다.

numpy를 이용한 분산과 표준편차¶

np.var 함수와 np.std 함수를 이용하여 각각 분산과 표준편차를 계산한다. (아래 예제는 np.std만 사용)

# Fresh 4x5 random matrix; values differ on every run because no seed is set.
x = np.random.randn(4, 5)
print(np.std(x))      # standard deviation over all 20 elements
print(np.std(x, 0))   # axis 0: one standard deviation per column (5 values)
print(np.std(x, 1))   # axis 1: one standard deviation per row (4 values)

1.2125938608042772
[1.03717881 0.9030792  1.21249364 0.43835015 1.11308511]
[1.01231363 1.21633294 0.59926683 1.52108491]

ddof 인자를 이용하여 자유도 설정 가능 (기본값 : ddof=0)

print(np.std(x, ddof=1))   # ddof=1: divide by N-1 when computing the standard deviation

1.2440950510565472

numpy를 이용한 상관계수¶

np.corrcoef(x)는 x가 2차원 행렬일 때, 각 행들간의 상관계수 행렬 계산

x = np.random.randn(3, 4)
print(np.corrcoef(x))

[[ 1.         -0.31716391 -0.00406439]
 [-0.31716391  1.          0.58301553]
 [-0.00406439  0.58301553  1.        ]]

np.corrcoef(x, y)는 x, y가 각각 1차원 array일 때 x와 y간의 상관계수 행렬 계산

print(np.corrcoef(x[0], x[1]))

[[ 1.         -0.31716391]
 [-0.31716391  1.        ]]

공분산 행렬은 np.cov()를 이용하여 계산

Python Programming (9) - 통계

9. Statistics¶

데이터 표현¶

통계량¶

중심 성향 - 평균¶

중심 성향 - 중앙값¶

표본백분위수 - percentile¶

최빈값 – mode¶

범위 – range¶

사분범위 - Interquartile range¶

표본 분산 - variance¶

표준 편차¶

공분산¶

상관관계¶

scatter plot으로 데이터 확인¶

Outlier¶

numpy를 이용한 평균¶

numpy를 이용한 중앙값¶

numpy를 이용한 분산과 표준편차¶

numpy를 이용한 상관계수¶