Math Gamification

 

Materials for foreign and Russian-speaking students studying the Data Analysis course in English 
 

Basic discrete distributions \ Базовые дискретные распределения 

Классификатор Байеса 

 

library(e1071)

# Load the built-in Titanic contingency table.
data("Titanic")

# Naive Bayes classifier: `Survived` is the response, every other
# variable in the table is used as a predictor.
m <- naiveBayes(Survived ~ ., data = Titanic)

# Predicted class for every row of the table converted to a data frame.
predict(m, newdata = as.data.frame(Titanic))

Эмпирические законы распределения

 

# Hand-built empirical distribution function of a small sample.
# The sample is the binomial(20, 1/2) CDF evaluated at 1, ..., 7.
x <- pbinom(1:7, size = 20, prob = 1 / 2)
n <- length(x)
x <- sort(x)
vals <- unique(x)

# Step function: at each distinct sample value it takes the cumulative share
# of observations up to and including that value; it is 0 to the left of the
# smallest value and 1 to the right of the largest one.
rval <- approxfun(
  x = vals,
  y = cumsum(tabulate(match(x, vals))) / n,
  method = "constant",
  f = 0,
  yleft = 0,
  yright = 1,
  ties = "ordered"
)
plot(rval, ylab = "F(x)")

Basic continuous distributions \ Базовые непрерывные распределения

R scripts \ Код

Wrkshp 12 - Integrate if pdf is known distribution

 

# Density of the uniform distribution on [0, 3].
integrand <- function(x) {
  dunif(x, min = 0, max = 3)
}

# Probability mass of the intervals [2, 3] and [0, 1] under that density.
integrate(f = integrand, lower = 2, upper = 3)
integrate(f = integrand, lower = 0, upper = 1)

Дискретные случайные величины

 

# Discrete uniform distribution: a six-sided die
s <- seq_len(6)
p <- rep(1 / 6, times = 6)
sum(p) # sanity check: the probabilities must sum to 1
plot(s, p, type = "h", col = "red")

# Expected value
Ms <- sum(s * p)

# Variance, computed two ways
Ds <- sum(s^2 * p) - Ms^2    # formula 1: E(X^2) - (E X)^2
DDs <- sum((s - Ms)^2 * p)   # formula 2: E((X - E X)^2)

Uniform distribution pdf&cdf \ Плотность и кумулята равномерного закона

 

# Density of Unif(1, 2), drawn on [-1, 3].
curve(
  dunif(x, min = 1, max = 2),
  from = -1, to = 3,
  main = "PDF for Unif(1,2)", xlab = "x", ylab = "f(x)"
)

# Distribution function of Unif(1, 2), drawn on the same range.
curve(
  punif(x, min = 1, max = 2),
  from = -1, to = 3,
  main = "CDF for Unif(1,2)", xlab = "x", ylab = "F(x)"
)

Функции распределения

 

library(mosaic)

# Normal distribution with mean 1 and sd 1, drawn as a red density.
plotDist("norm", mean = 1, sd = 1, kind = "density", col = "red", under = TRUE)
# Normal distribution (no parameters given) drawn as a CDF.
plotDist("norm", kind = "cdf")
# Exponential distribution (no parameters given) drawn as a histogram.
plotDist("exp", kind = "histogram")
# Binomial distribution; the two unnamed values are passed on as its
# parameters (presumably size = 25 and prob = 0.25 - confirm).
plotDist("binom", 25, .25)
          

Приведение к стандартному нормальному

 распределению 

 

library(mosaic)

# Draw the normal distribution with plotDist's defaults (no parameters given).
plotDist("norm")

# Density of the standard normal distribution.
integrand <- function(x) {
  dnorm(x, mean = 0, sd = 1)
}

# Upper-tail probability beyond the standardised point (52.5 - 45) / sqrt(18).
integrate(f = integrand, lower = (52.5 - 45) / sqrt(18), upper = Inf)

Correlation matrix \ Корреляционная матрица 

 

library(corrplot)

# `dataset_name` is a placeholder: substitute the data set to analyse.
cormat <- cor(as.matrix(dataset_name))
corrplot(cormat, method = "number", order = "FPC")

# Alternative: a corrgram with ellipses below the diagonal, correlation
# coefficients above it and density plots on the diagonal.
library(corrgram)
corrgram(
  dataset_name,
  font.labels = 6,
  lower.panel = panel.ellipse,
  upper.panel = panel.cor,
  diag.panel = panel.density
)

Нелинейная связь

 

library(devtools)

# Install nlcor from GitHub only when it is not available yet; reinstalling
# (and forcing) on every run is slow and needs network access each time.
if (!requireNamespace("nlcor", quietly = TRUE)) {
  devtools::install_github("r-lib/remotes")
  install_github("ProcessMiner/nlcor")
}
library(nlcor)

# Sample data: b rises and then falls as a increases.
a <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
b <- c(1, 1, 2, 3, 4, 5, 7, 5, 4, 3, 2, 1, 1)
plot(a, b, lwd = 10)

# Linear (Pearson) correlation for comparison.
cor(a, b)

# Nonlinear correlation estimate; plt = TRUE also requests the plot object.
ab <- nlcor(a, b, plt = TRUE)
ab$cor.estimate
print(ab$cor.plot)

Portfolio of 2 stocks \ Портфель двух активов

 

# Quadratic risk of a two-asset portfolio with weights x1 and x2:
#   s1^2 * x1^2 + s2^2 * x2^2 + 2 * ro * s1 * s2 * x1 * x2
# (s1, s2 are presumably the assets' standard deviations and ro their
# correlation - confirm against the course material).
risk <- function(x1, x2, s1 = 0.05, s2 = 0.14, ro = 0.36) {
  own_1 <- s1^2 * x1^2
  own_2 <- s2^2 * x2^2
  cross <- 2 * ro * s1 * s2 * x1 * x2
  own_1 + own_2 + cross
}

# Same objective taking the weights as one vector, as constrOptim expects.
gb_risk <- function(x) {
  risk(x1 = x[1], x2 = x[2])
}

# Linear constraints in the form constraint.mat %*% x - b >= 0:
#   -x1 - x2 >= -1, x1 >= 0, x2 >= 0, 0.16 * x1 + 0.23 * x2 >= 0.1
constraint.mat <- matrix(
  c(
    -1, -1,
    1, 0,
    0, 1,
    0.16, 0.23
  ),
  ncol = 2, byrow = TRUE
)
b <- c(-1, 0, 0, 0.1)

# Minimise the risk from the starting point (0.4, 0.4); no gradient supplied.
constrOptim(
  theta = c(0.4, 0.4),
  f = gb_risk,
  grad = NULL,
  ui = constraint.mat,
  ci = b
)

Wrkshp 1 - Basic Statistic Functions \Основные статистики

 

# Basic descriptive statistics of the sample 1, 2, ..., 30.
a <- 1:30
summary(a) # stats at a glance

mean(a)
median(a)

var(a) # variance

sd(a) # standard deviation

# Two separate statements: `min(a), max(a)` on one line is a syntax error.
min(a)
max(a)
quantile(a)

IQR(a) # interquartile range
boxplot(a) # box-and-whiskers plot

Combinatorial Formulas \ Комбинаторика

 

# Computing the number of combinations C(n, k)
n <- 16
k <- 14
C1 <- factorial(n) / (factorial(k) * factorial(n - k))

# or, with the built-in binomial coefficient
C2 <- choose(n, k)

# Excel equivalent (a spreadsheet formula, not R code, so it is kept as a
# comment). The denominator needs its own parentheses:
#   =ФАКТР(n)/(ФАКТР(k)*ФАКТР(n-k))
# ФАКТР is the Russian-locale name of Excel's FACT function.

# Permutations

library(combinat)
permn(c("A", "B", "C")) # all orderings of the three letters
permn(2:5)              # all orderings of 2, 3, 4, 5

How to solve \ Файлы решений

Symbolic calculus\Символьные вычисления

 

# Older approach: symbolic calculus through rSymPy.
library(rSymPy) # old version
# Start the Java VM (.jinit is presumably provided by rJava, which rSymPy
# relies on - confirm).
.jinit()
# Declare the symbols used in the integrand.
sympy("var('x')")
sympy("var('y')")
sympy("var('C')")

# Double integral of 0.5*x + C*y with y from 0 to 2 and x from 0 to 1.
sympy("integrate(0.5*x + C*y,(y,0,2),(x,0,1))")

 

library(Ryacas) # newer alternative to rSymPy

# The expression 0.75 * (2 - 2 * x^2) as a yacas symbol.
f3y <- ysym("0.75*(2-2*x^2)")

# Symbolic integral with respect to x (the original line had a stray closing
# parenthesis, which made it a syntax error).
integrate(f3y, "x")

shared by Владислав Бычков МБНиА24-1

 

Exponential distribution pdf&cdf \ Плотность и кумулята экспоненциального закона

 

# Exponential distribution evaluated on the integer grid 0, 1, ..., 10.
x <- 0:10
rate <- 5 # rate = lambda = 1 / E(X)

# Density, then distribution function, drawn as lines.
plot(x, dexp(x, rate), type = "l")
plot(x, pexp(x, rate), type = "l")

Bernoulli Distribution - Modeling the outcome of a single trial with two possible outcomes (success/failure).

Example: getting a head in a single coin toss

R

pmf: dbinom(x, size=1, prob) 

cmf: pbinom(q, size=1, prob)

quantile: qbinom(p, size=1, prob)

Excel

pmf:

Binom.Dist(number_success, number_trials=1, prob, 0)

cmf:

Binom.Dist(number_success, number_trials=1, prob, 1)

Binom.Inv(trials=1, probability_s, alpha)

 

 

Binomial Distribution - Modeling the number of successes in a series of independent trials (success/failure).

Example: number of heads in a series of coin tosses

R

pmf: dbinom(x, size, prob) 

cmf: pbinom(q, size, prob)

quantile: qbinom(p, size, prob)

Excel

pmf:

Binom.Dist(number_success, number_trials, prob, 0)

cmf:

Binom.Dist(number_success, number_trials, prob, 1)

quantile:

Binom.Inv(trials, probability_s, alpha)

 

Geometric Distribution - Modeling the number of failures before the first success in independent trials

Example: number of misses before the first hit on a target

R

pmf: dgeom(x, prob)      cmf: pgeom(x, prob)  

quantile: qgeom(p, prob)

Excel -----------------

 

Hypergeometric Distribution - Selection without replacement from a finite population 

Example: ballot selection, lotto

R

pmf: dhyper(x, m, n, k) cmf: phyper(q, m, n, k)

quantile: qhyper(p, m, n, k)

Excel

pmf: HYPGEOM.DIST(sample_s,number_sample,population_s,number_pop, 0)

cmf: HYPGEOM.DIST(sample_s,number_sample,population_s,number_pop,1)

quantile: -------

 

Uniform Distribution - Modeling equally likely events over a specified interval

Example: The waiting time for a train that arrives every 15 minutes follows a continuous uniform distribution between 0 and 15 minutes

R

pdf: dunif(x, min, max)    cdf: punif(q, min, max)

quantile: qunif(p, min, max)

 

Exponential Distribution - Modeling the time until an event occurs

Example: time between arrivals

R

pdf: dexp(x, rate)    cdf: pexp(x, rate)

quantile: qexp(p, rate)

Excel

pdf, cdf: EXPON.DIST(x, lambda, cumulative)

 

Normal Distribution - Modeling random variables with central tendency

Example: height, weight.

R

pdf: dnorm(x, mean, sd) cdf: pnorm(q, mean, sd)

quantile: qnorm(p, mean, sd)

Excel

pdf, cdf: Norm.Dist(x, mean, sd, cumulative)

quantile: Norm.Inv(p, mean, sd)

 

 

Log-Normal Distribution - Describes a variable whose logarithm is normally distributed

Example: positive continuous random variables  such as income, stock prices 

R

pdf: dlnorm(x, meanlog, sdlog)

 

Student's t-Distribution - Used in statistical tests with small samples or unknown variance

Example: hypothesis testing, confidence interval

R

pdf: dt(x, df)   cdf: pt(q, df)

quantile: qt(p, df)

Excel

pdf, cdf: T.DIST(x, degrees_freedom, cumulative)

Right tail probability: T.DIST.RT(x, degrees_freedom)

Two tails probability: T.DIST.2T(x, degrees_freedom)

quantile: T.INV(probability, degrees_freedom)

 

Negative Binomial distribution models the number of failures until a specified number of successes occurs in a series of independent Bernoulli trials

Example: number of tails before the 5th head in a series of coin tosses

R

pmf: dnbinom(x, size, prob) 

cmf: pnbinom(q, size, prob)

quantile: qnbinom(p, size, prob)

Excel

pmf: NEGBINOM.DIST(number_f,number_s,probability_s,0)

cmf: NEGBINOM.DIST(number_f,number_s,probability_s,1)

quantile: ------- (Excel has no built-in NEGBINOM.INV function)

 

Poisson Distribution - Modeling the number of events in a fixed period of time

Example: number of calls per hour

R

pmf: dpois(x, lambda)    cmf: ppois(q, lambda)

quantile: qpois(p, lambda)

Excel

pmf: POISSON.DIST(x,mean,0)

cmf: POISSON.DIST(x,mean,1)

quantile: -------

 

  • Биномиальная случайная величина определяет вероятность m успехов в n испытаниях. Биномиальная случайная величина - выборка с возвращением, т.е. вероятность успеха остаётся постоянной для всей серии испытаний.
  • Гипергеометрическое распределение - это выборка без возвращения (вероятность успеха изменяется после каждого испытания).
  • Геометрическая случайная величина - вероятность m испытаний до первого успеха (включая первый успех); в R функции dgeom/pgeom/qgeom считают число неудач до первого успеха, т.е. m - 1. Геометрическое распределение - это дискретный вариант экспоненциального распределения
  • Отрицательное биномиальное распределение - обобщение геометрического распределения. Оно описывает распределение количества неудач k в независимых испытаниях, исход которых распределен по Бернулли с вероятностью успеха p, до наступления r успехов в сумме.

 

Generalization \ Обобщение 

Discrete Uniform Distribution - Modeling equally likely events over a specified interval

Example: a fair die

Flat probability mass function (every outcome has the same probability)

 

 

Standard Normal Distribution - Normal Distribution with mean=0 and sd=1 

Example: height, weight.

R

pdf: dnorm(x) cdf: pnorm(q)

quantile: qnorm(p)

Excel

pdf, cdf: Norm.S.Dist(z, cumulative)

quantile: Norm.S.Inv(p)

 

 

Pareto Distribution - A model for rare but impactful events

Example: wealth, crises, disasters

R

library(actuar)

pdf: dpareto1(x, shape = α, min = xₘᵢₙ) cdf: ppareto1(q, shape = α, min = xₘᵢₙ)

quantile: qpareto1(p, shape = α, min = xₘᵢₙ)

Excel ----------

Wrkshp 2 - Combinatorics \Комбинаторика

 

# Counting formulas (a, n and k are expected to be defined beforehand).
factorial(a)          # factorial
choose(n, k)          # number of combinations

prod((n - k + 1):n)   # n * (n - 1) * ... * (n - k + 1)
combn(8, 5)           # all combinations of 8 choose 5

n^k                   # arrangements with repetition

choose(n + k - 1, k)  # combinations with repetition

# Enumerating permutations and subsets with the combinat package.
library(combinat)

permn(3)              # all permutations of (1, 2, 3)

combn(n, k)           # all subsets of size k from n

 

Wrkshp 11 - Integrate to get F(x) from f(x)

 

# Integrand 1 / x^2.
I <- function(x) {
  1 / x^2
}
integrate(f = I, lower = 2, upper = 3)

# or, with an anonymous function

integrate(function(x) 1 / x^2, lower = 2, upper = 3)

# Constant C for which C / x^2 integrates to 1 over [2, 3]

C <- 1 / integrate(function(x) 1 / x^2, lower = 2, upper = 3)$value

Двойной интеграл

 

library(rSymPy) # symbolic calculations
# Start the Java VM (.jinit is presumably provided by rJava, which rSymPy
# relies on - confirm).
.jinit()
# Declare the symbols used in the integrand.
sympy("var('x')")
sympy("var('y')")
sympy("var('C')")

# Double integral of 0.5*x + C*y with y from 0 to 2 and x from 0 to 1.
sympy("integrate(0.5*x + C*y,(y,0,2),(x,0,1))")