第课:基知识
第二课:基运算
第三课:画图
第四课:数整理数分析
第五课:统计分析
第六课:分布产生
第七课:求导极值计算
第八课:Bootstrap方法
第九课:MCMC算法
第课 基知识
. R历史特点
Objectoriented 语言
二. R系统介绍
安装镜
Installing packages
library(survival) # 加载程序包
library(helpsurvival) # 显示程序包 survival 里容
help(mean) #查关函数 mean 帮助
search() # 出系统里已加载程序包
searchpaths() # 出系统里已加载程序包路径
Commands are separated either by semicolon or a new line
三. Data objects
# There are 7 basic types of data objects
# Vector Matrix Array List Factor Time series Data frame
# creating vectors
# vector type character numeric integer logical complex
my1
rep(NA5)
rep(c(TTF)2)
rep(c(yesno)c(42))
15
124
11
seq(pipi 5)
seq(pipi length10)
seq(1by05 length10)
a1 vector(logical3)
a2 numeric(4)
a3 complex(5)
a4 character(6)
#what’s the difference between
mat1 < rep(14rep(34))
mat2 < rep(143)
scan()
scan(what) #扫描字符型变量
numberedletters < letters
names(numberedletters) < paste(obs126sep)
# generating matrix
mat1 < matrix(112 ncol3 byrowT)
dimnames(mat1) list(c(row 1row 2row 3row 4)c(col 1col 2col 3))
mat2 < matrix(112 ncol4 dimnameslist(NULLpaste(col14)))
mat < rep(14rep(34))
dim(mat) < c(34)
dimnames(mat) < list(paste(rowletters[13])paste(colLETTERS[14]))
rbind(c(2006882433)c(20108327115))
dim(mat)
# array
array(c(181118111118)dimc(243))
#list
attributes(iris)
grp < c(rep(111)rep(210))
thw < c(450760325495285450460375310245350340300310270300360405290)
heartlist < list(groupgrp thwthw descripheart data)
names(heartlist)
names(heartlist) < c(grouptotal heart weight descrip)
# factors
classlist < c(male female male male malefemale female male female)
classlist factor(classlist)
factor(c(HiLoMedHiHiLo)levelsc(LoHi)labelsc(LowDoseHighDose))
cut(statex77[Murder] breaksc(0816))
cut(statex77[Murder] breaks2)
cut(statex77[Murder] c(0816) labelsc(LowHigh))
# data frames
cars readtable(D93carstxt headerTRUE)
cars1 asmatrix(cars)
cars1
a 19
b letters[19]
classlist < c(malefemalemalemalemalefemalemalemalemale)
classlist factor(classlist)
all dataframe(ab classlist)
isfactor(allclasslist) #列保持属性
mylogical < sample(c(TF) size20 replaceT)
mycomplex < rnorm(20) + runif(20)*1i
mynumeric < rnorm(20)
mymatrix < matrix(rnorm(40) ncol2)
mydf1 cbind(mylogical mycomplex mynumeric mymatrix) #矩阵
mydf dataframe(mylogical mycomplex mynumeric mymatrix)
#指令dataframe
mydf < dataframe(a rnorm(20) b sample(c(TF) size20 replaceT))
#missing and special values
> 50
Inf
> 00
NaN # not a number
> a c(123NA4) # NA represents missing value
> mean(a)
[1] NA
a c(asiadaNA) # NA 代表字符型missing value
a c(TFTT NA)
# NULL 代表空值
a NULL # a is a empty vector
for( i in 1 10) a[i] i^2
# testing and coercing
letter factor (letters)
> isvector(letter) # test whether letter is a vector
[1] FALSE
> letter asvector(letter) # coerce letter into a vector
ismatrix()
asmatrix()
isdataframe()
asdataframe()
islist()
aslist()
………
writetable(lizard ERnewRdata)
new readtable(ERnewRdata)
save(alizardnew x fileERmydumprda)
load(fileERmydumprda)
sink(ERnew22txt)
> lizard
> ls()
> sink()
四. Data manipulation
逻辑运算符号
符号
意义
等
等
>
等
>
<
<
等
# indexing vectors
myseq 1100
myseq[15] # 取出第第五元素
myseq[rep(25)]
myseq[c(12045)]
my c(14789)
my[c(35)] # 掉第三第五列
myseq seq(1 by3 length20)
myseq > 30 # 逻辑值量
myseq[myseq >30] # 出myseq里等30值
x < c(19 30 41 26 36 23 28 32 66 76 74 10)
x[ x > 2]
x[ x > 2 & x < 4]
x [ c(FFT)]
char c(mondaytuesdaywednesdaythursdayfriday)
names(char) c(v1v2v3v4v5)
char[v1] # 取出名v1元素
char[] # 显示元素
char[8] # 指标出界
stateabb
stateabb[Alaska]
statename
names(stateabb) < statename
stateabb[Alaska]
statename[ grep(So* statename) ]
# indexing data frames matrices and lists
J matrix( c(12 15 6 10 2 9 2 7 19 14 11 19) nrow3 byrowTRUE)
J[9]
J[ c(13) c(24) ]
J[ c(24) ]
J[ 1 c(24) ]
J[13 dropF]
statex77 # Data sets related to the 50 states of the United States of America
help(statex77)
illit < statex77[150 3] #美国1970年文盲率
illit < statex77[ 3]
statex77[illit > 2 35]
statex77[Arizona Area] # Arizona 州面积
stackx[c(FFT) grep(Wa* dimnames(stackx)[[2]]) dropF]
state dataframe(statex77)
dimnames(state)
statelifeexp # statex77[Life Exp] 较
heartlistgroup
heartlist[[1]]
# combining data frames or matrices
a matrix(120 ncol 5)
b matrix(160 ncol 5)
d matrix(172 nrow 12)
c1 rbind(ab)
c2 cbind(bd)
# combining data frames
state_no 150
state dataframe(statestate_no)
# sorting
a 150
sort(a)
sort(a decreasingTRUE)
a round( rnorm(20))
order(a)
aa state[order(stateArea statePopulation stateIncome decreasingTRUE)]
第二课 基运算
. Vectorized calculation
数学运算
符号
意义
+
加法
减法
*
法
法
^ **
幂次
*
矩阵相
J matrix(11025)
sqrt(J)
J+3
a < 5*(03)
x < c(246810)
y < 15
z ((3*x^2+2*y)((x+y)*(xy)))^(05)
x 110
x < 2 | x > 4
x > 2 & x < 4
x c(19 30 41263623283266767410)
y x
any(x > 1) && all( y < 0)
any(x > 1) && any (y > 0 )
all( x < 8 ) || 2 > 7
# Pearson chisquared Statistic
X^2 ∑ (f_{ij} – e_{ij})^2 e_{ij} e_{ij} f_{i}f_{j}f_{}
f matrix(120 ncol4)
fi f * rep(1ncol(f)) # 计算列联表fPearson卡方统计量:
fj rep(1nrow(f)) * f
e < (fi * fj)sum(fi)
x2 sum((fe)^2e)
二.apply and tapply
O < Orange
col_mean apply(O[23] 2 mean)
a array(1100 c(2510))
apply(a c(12) mean)
attach(iris)
tapply(iris[1] Species mean)
三 sapply应
x < list(a110betaexp(33)logicc(TRUEFALSETRUEFALSE))
lapply(xmean) #函数mean应x分量
lapply(110 sqrt) # lapply 量
lapply(110 function(x) x^2) # lapply 定义函数
# sapply(x fun … simplify TRUE) #运算结果矩阵
sapply(110 sqrt) # 结果量
sapply(110 function(x) x^2)
sample(10) # 数列110机置换
# 产生mn置换结果存入mxn矩阵?
sapply(17 function(o) sample(10))
t(sapply(17function(o) sample(10)))
X< function(mn) t(sapply(1mfunction(o) sample(n))) #o 哑变量t 表示转置
四 n元素机置换中少动点概率少?
Y
Y[Y>0] #包含Y零元素子量
Y[Y0] # 动点
length(Y[Y0]) #动点数
fixedpoints < function(mn) sapply(1m function(o) { Z < sample(n) 1n length(Z[Z0]) })
smry
q75 quantile(X075)
c(mumusigmasigmaminaq25q25q50q50q75q75maxb)
}
FP
apply(FP2smry)
t( apply(FP2smry))
五较处理量两种方法
循环语句
a NULL
for ( i in 1index) a[i] ……
#函数格式
# functionname < function(x) {………………………………}
fixedpointsfast
timestart
cat('User time elapsed (fast)'timeused[1] '\n')
fixedpointsslow < function(mn)
{ FP < rep(0m) #gives a zero vector of length m
for (j in 1m) { W < sample(n)
k < 0
for ( i in 1n) { if (W[i]i) k < k+1}
FP[j] < k }
FP }
timestart
cat('User time elapsed (slow)'timeused[1] '\n')
#the fast approach takes only 60 as long as the slow approach
六 单位正方形凸型
CL < function(o) floor(657*runif(1))+1 # random color
k < 110
sum(k) # quick way to add all integers from 1 to 10
v < sqrt(110)
sum(v) # quick way to add the square roots of 1 to 10
sum(v[k]) # another quick way to do the same (but why do this)
v < c(0v) # prepends the vector v with a 0
v
v[k+1]v[k] # the vector of successive differences of square roots
sum((v[k+1]v[k])^2) # sum of squared successive differences
cnvx < function(n) { X < matrix(runif(2*n) ncol2)
x11(width5 height55)
plot(X xlim01 ylim01 pch22)
h < chull(X)
h < c(h h[1])
lines(X[h ]colCL()lwd3)
}
cnvx(12)
cnvx(12)
cnvx(12)
# 单位圆凸型
rdisk < function(q) { R < matrix(runif(10000) ncol1) #产生单位圆点半径
Theta < matrix(runif(1000002*pi) ncol1) #产生单位圆点夹角
X < cbind(R^q*cos(Theta)R^q*sin(Theta))
x11(width5height55)
plot(Xxlimc(11)ylimc(11)pch)
uppcrcl < function(x) sqrt(1x^2)
lowcrcl < function(x) sqrt(1x^2)
curve(uppcrclfrom1to1addT)
curve(lowcrclfrom1to1addT) }
rdisk(1) # too much weight at center
rdisk(12) # uniformly distributed (can be proved via calculus)
rdisk(14) # too much weight at boundary
cnvx < function(n) { # convex hull of n points in the unit disk
R < matrix(runif(n) ncol1)
Theta < matrix(runif(n02*pi) ncol1)
X < cbind(sqrt(R)*cos(Theta)sqrt(R)*sin(Theta))
x11(width5height55)
plot(Xxlimc(11)ylimc(11)pch22)
h < chull(X) h < c(h h[1])
lines(X[h ]colCL()lwd3)
uppcrcl < function(x) sqrt(1x^2)
lowcrcl < function(x) sqrt(1x^2)
curve(uppcrclfrom1to1addT)
curve(lowcrclfrom1to1addT)
}
cnvx(12)
第三课 画图
.常统计作图
# plot
x < seq(5 5 1)
y < x^2
plot(x y pchX mainMain Title subSubtitle xlabX Axis Label ylabY Axis Label xlimc(8 8) typeo lty2)
plot(x y axesF typeb pchx xlab ylab)
axis(1 atc(0 1 2 pi 4 5 2*pi) labelsc(0 1 2 Pi 4 5 2 Pi) pos0)
axis(2 atc(1 05 0 025 05 075 1) adj1)
abline(hc(1 05 05 1) lty3)
text(pi 01 sin(pi)0 adj0)
title(The sine function\nfrom 0 to 2 Pi)
x 55 # generate −5 −4 3 4 5
y x^2 # y equals x squared
par(mfrowc(3 2)) # set a multiple figure screen
plot(x y) # and create the graphs
plot(x y type l) # in different styles
plot(x y type b)
plot(x y type h)
plot(x y type o)
plot(x y type n)
mtext(Different options for the plot parameter type side3 outerT line1)
par(mfrowc(11))
plot(1100 sin(110010))
plot(x < sort(rnorm(47)) type s main plot(x type \s\))
points(x cex 5 col dark red)
plot(cars) # cars 第列x量第二列y量
lines(lowess(cars)) # locally –weighted polynomial regression滑曲线
# 现图加图形指令
abline(a b) # Line with intercept a and slope b
abline(h c) # vertical line
abline(v c) # horizontal line
abline(lmobject)
arrows(x1 y1 x2 y2) #Arrows from (x1 y1) to (x2 y2)
points(x y) # Points at the coordinates given by x and y
lines(x y) #Lines through the points given by x and y
polygon(x y) #Shaded polygon figure
segments(x1 y1 x2 y2) #Disconnected line segments from (x1 y1) to (x2 y2)
# barplot
AirPassengers
air matrix(AirPassengers 12 12)
dimnames(air)[[2]] paste(19 4960 sep)
barplot(air names colnames(air) mainMonthly Airline Passenger Numbers 19491960)
barplot(air names colnames(air) anglec(45135) density10 col 1) # different style
# boxplot
boxplot(air[9] names 1957)
par(mfrowc(21))
boxplot(air[9] names 1957)
hist(air[9])
boxplot(air names colnames(air))
# hist
library(ISwR)
data(energy)
attach(energy)
expendlean expend[stature lean]
expendobese expend[stature obese]
par(mfrowc(21))
hist(expendlean breaks 10 xlim c(513) ylimc(04) col white )
hist(expendobese breaks 10 xlim c(513) ylimc(04) col grey )
boxplot(expend ~ stature)
二更复杂作图
# contour
# 二元标准正态等高线图
x seq(55 length50)
y x
f function(xy) exp(05*x^2 05*y^2)
z outer(x x f)
contour(x y z levelsseq(min(z)max(z)length10))
f function(xy) exp(05*x^2 05*y^2 08*x*y)
z outer(x x f)
contour(x y z levelsseq(min(z)max(z)length10))
x seq(55 length50)
f function(xy) exp(05*x^2 05*y^2)
z outer(x x f)
persp(z)
persp(z d 5 theta 160 phi 30)
f function(xy) exp(05*x^2 05*y^2 08*x*y)
z outer(x x f)
persp(z)
persp(z d 5 theta 160 phi 30)
三硬币赌博
假设游戏规:
1断掷均匀硬币直正面反面差绝值3时局结束
2果决定参加游戏次掷硬币时需付元钱允许中途退出
3次游戏结束时8元钱
问期收入少?
n < 120
x < rbinom(n105)
cumsum(x)
1ncumsum(x) # n次掷硬币反面数
abs(1n2*cumsum(x)) # n次掷硬币中正面反面绝值
d < abs(1n2*cumsum(x))
which(d3) # 绝值等三
min(c(which(d3)Inf)) # 绝值等三
rndtrials < function(mn) { sapply(1m function(o) {
D < abs(1n2*cumsum(rbinom(n105)))
min(c(which(D3)Inf)) }) }
TM < rndtrials(50000120)
smry(TM)
# 局均时间接理值 9期收入 1均说损失元钱
hist(TM[TM<50] freqF breaks25505 xlimc(050)borderdarkblue plotT)
graphicsoff() # close plot
四两硬币赌博 (The Game of Craps)
掷两骰子直列发生:
· 果第次掷出711第次掷出4568910 掷出7前掷出相结果赢
· 果第次掷出2312第次掷出4568910 掷出相结果前掷出7 输
请问赢概率?均掷骰子次数少?
rndrolls < function(n) # sequence of n rolls of two dice
{ S < sample(6nreplaceT) + sample(6nreplaceT) S }
rndrolls(80)
hist(rndrolls(50000) freqF breaks15125 xlimc(014) borderdarkblue plotT)
# triangular distribution for two dice
simcraps < function(SQNC) {
dur < 1
if (SQNC[1]7 | SQNC[1]11) win <1
else if (SQNC[1]2 | SQNC[1]3 | SQNC[1]12) win < 0
else # first roll is (>4 & <6) | (>8 & <10)
{ TMP < SQNC[2length(SQNC)]
rtrninit < min(c(which(TMPSQNC[1])Inf))
hitseven < min(c(which(TMP7) Inf))
if (rtrninit < hitseven) { win < 1 dur < 1+rtrninit }
else { win < 0 dur < 1+hitseven }
}
c(winwindurdur) }
SQNC < rndrolls(80)
simcraps(SQNC)
rndtrials < function(mn) t(sapply(1m function(o) simcraps(rndrolls(n))))
WD < rndtrials(5000080)
t(apply(WD2summary)) # theoretical probability of winning is 244495 0493
X < WD[2][WD[1]1]
dim(X) < c(length(X)1)
t(apply(X2summary)) # assuming a win what is duration of game
Y < WD[2][WD[1]0]
dim(Y) < c(length(Y)1)
t(apply(Y2summary)) # assuming a loss what is duration of game
{ hist(WD[2][WD[2]<20]freqF breaks05205ylimc(005) borderdarkblueplotT)
hx < hist(X[X<20]breaks05205plotF)density
for (j in 120) lines(c(j04j+04) c(hx[j]hx[j])colred) #赢次局数
hy < hist(Y[Y<20]breaks05205plotF)density
for (j in 120) lines(c(j04j+04) c(hy[j]hy[j])colgreen) #输掉次局数
}
graphicsoff() # close plot
五.解方程子分解
## while loops are sometimes unavoidable
# Newton's method for computing zeroes of a function
# Solve the equation x 2*(1 exp(x))
{ curve(2*(1 exp(x)) from0 to2 colred lwd2)
curve(1*x from0 to2 addTRUE colblue lwd2) }
options(digits15)
g < function(x) x 2*(1 exp(x))
uniroot(g lower15 upper18 tol10^(15))root # builtin function
x < 16 # 果没 uniroot 函数 while 循环
j < 0
del < 1
while(abs(del)> 10^(15) & j<1000) {
del < (x 2*(1 exp(x)))(1 2*exp(x)) # Newton's iteration method
x < x del
j < j+1
cat(j x \n) }
options(digits6)
# Here is a famous example from number theory
primes < function(n) { x < 1n
x[1] < 0
p < 1
m < floor(sqrt(n)) # pupdate occurs just prior to p
x[seq(p^2np)] < 0 # composites p^2 p^2+pp(p+1) p^2+2pp(p+2)
x[x>0] }
primes(100)
2912 # 29 modulo 12 that is remainder of 29 when divided by 12
n < 45 # let's factor 45 ()
tbl < primes(45)
tbl
fac < NULL
# first iteration ntbl 0s occur where any factor of 45 (3 or 5) resides
tbl < tbl[ntbl0] tbl n < nprod(tbl) # 4515 3
n
fac < c(factbl) # 3 & 5 added to list
fac
# second iteration ntbl # 0s occur where any factor of 3 (3 only) resides
tbl < tbl[ntbl0] tbl n < nprod(tbl) # 33 1 n
fac < c(factbl) # another 3 added to list
fac
# start of third iteration ntbl 1 has no prime factors hence no 0s occur
length(tbl < tbl[ntbl0]) # new tbl is empty
sort(fac) # put list in increasing order stop
prmfctrs < function(n) { tbl < primes(n)
fac < NULL
while (length(tbl < tbl[ntbl0]) > 0)
{ n < nprod(tbl)
fac < c(factbl) }
sort(fac) }
prmfctrs(12600)
第四课 数整理数分析
Data set BANK
bank0 < readdelim(Dbanktxt headerT)
T < summary(bank0)
bank < subset(bank0select c(idjobcatsalbeginjobtimepreexp))
str(bank) # str 函数出R object 结构(structure)
R读入数时默认变量均数值型非数值型变量解释 factors
str(bankgender) summary(bankgender) # 4空值没名字
str(bankminority) summary(bankminority) # 希数值变量属性变量
str(banksalary) summary(banksalary) # 希属性变量数值变量
str(bankbdate) summary(bankbdate) # 希属性变量时间变量
summary(bank) # 表起错
希改进数 BANK 描述
count < function(x) sum(isna(x)) # 计算样量
count(bankgender) # 未正确处理缺省值
GENDER < bankgender #处理缺省值
levels(GENDER) < c(ascharacter(NA) Female Male) # NA NA变字符型
count(GENDER)
dob asDate(bankbdate format mdY) #生日转数值变量
str(dob) dob[110] dob[432436] # 缺失时间转化 NA
count(dob)
tod < SysDate() # today's date c(julian(dob[1])julian(tod))
AGE < (julian(tod)julian(dob))36525 # 111970 应 julian0
summary(AGE)
MINORITY < factor(bankminoritylevels10) # minority 转factor(属性变量)
levels(MINORITY) levels(MINORITY) < c(YesNo)
str(bankminority)
str(MINORITY)
summary(MINORITY)
x < ascharacter(banksalary) #先 salary 转字符型掉转数值型
y < sub('\\'''x)
z < sub('\\'''y)
SALARY < asnumeric(z)
SALARY
summary(SALARY digits8)
EDUC bankeduc
BANK dataframe(GENDER AGE EDUC SALARY MINORITY) #全部合起
summary(BANK)
tapply(SALARYGENDERsummarydigits8) # the four NA's are tossed out appalling difference in mean salaries
# categorize EDUC and add a new column to the data frame BANK
EDUCAT cut(EDUC c(0121625)rightFALSE) #右端点包括区间
levels(EDUCAT) c(no HSno BA>BA)
BANK dataframe(BANK EDUCAT) # object时出现左右两端
BANK
# crosstabs between EDUCAT and GENDER
es < table(EDUCATGENDER)
print(proptable(es1)*100digits4) # 行百分
print(proptable(es2)*100digits4) # 列百分
print(es*100sum(es)digits3) # 总体中百分
# salary means by EDUCAT (the 4 sexless employees included this time)
tapply(SALARYEDUCATsummarydigits8)
# a sample histogram plot to finish things off
hist(AGEborderdarkbluecolpeachpuff)
二 数 93cars分析
cars0 < readtable(D93carstxt nastrings*headerT)
library(prettyR)
stats < c(meansdminq25q50q75maxvalidn)
cars < subset(cars0select c(manumodeltypepricectympghwympgcylwghtdmst))
cars # 1 cyl cell marked * in cars0 changed to NA in cars
summary(carstype)
summary(carsprice[carsdmst0]) # price of foreign cars
attach(cars)
by(pricedmstsummary)
by(wghtdmstsummary) # wish more compact display of these & more
frgncars < subset(carsdmst0select c(pricectympghwympgwght))
dmstcars < subset(carsdmst1select c(pricectympghwympgwght))
describe(dmstcarsnumdescstats) # stats vector defined at top
# Welch approximation used for the case of unequal variances
ttest(frgncarsprice dmstcarsprice)
par(mfrowc(21)) # 两直方图张作图纸张张面
hist(frgncarspricebreaksseq(5655)xlimc(565)ylimc(020)colorange)
hist(dmstcarspricebreaksseq(5655)xlimc(565)ylimc(020)colgreen)
par(mfrowc(11)) # reset
# 行盒状图
boxplot(frgncarsctympg
dmstcarsctympg
frgncarshwympg
dmstcarshwympg
borderred
namesc(Foreign CityDomestic City Foreign HighwayDomestic Highway) mainMiles per Gallon Performancecexaxis07)
allcars < subset(carsselect c(pricectympghwympgwght))
plot(allcars) # 数值型变量数值型变量太
# 称散点图矩阵(scatterplot matrix) 网格图形
# (Trellis graphics) & 探索性数分析(EDA)
cor(wghtctympg)
{ plot(wghtctympgxlabWeightylabCity MPGmainCity MPG versus Weight)
wc < lm(ctympg~wght) # 线性模型
abline(wccolredlwd2) # 画出二直线
}
zidentify(wghtctympg) # 三观察值异常 city MPG值
# 左点击点 obs number
z # 结束右点击选择 stop
par(mfrowc(32)mex05) # 开做图纸中放3X2图边缘压缩
plot(wcwhich16) # Cook 距离度量观测值回系数影响
par(mfrowc(11)mex1)} # 回默认设定
plot(wcwhich16) # 次张图
z # obs numbers for the three influential points
cars[z] # same as cars[c(394283)] inclusion
WGHT < wght[z] # same as wght[c(394283)] exclusion
CTYMPG < ctympg[z] # 掉三影响数
WC < lm(CTYMPG~WGHT) # 掉三点效果
summary(WC) # 线性模型结果总结(R^2 07754 > 07109)
ab
{ par(mfrowc(32)mex05) plot(WCwhich16) }
{ plot(WGHTCTYMPGxlabWeightylabCity MPGmainCity MPG versus Weight
(minus 3 outliers))
abline(wccolredlwd2) # old least squares line
abline(WCcolbluelwd2) } # new least squares line
三 prettyR
library(prettyR)
describe(BANK)
describe(BANKnumdescc(meansdminmedianmaxvalidn))
q25 < function(X) quantile(Xprobs025narmTRUE) # wrapper functions
q50 < function(X) quantile(Xprobs050narmTRUE)
q75 < function(X) quantile(Xprobs075narmTRUE)
describe(BANKnumdescc(meansdminq25q50q75maxvalidn))
brkdn(SALARY~GENDERBANKnumdescc(meansdvalidn))
# categorize EDUC and add a new column to the data frame BANK
EDUCAT cut(BANKEDUC c(0121625)rightFALSE)
levels(EDUCAT) c(no HSno BA>BA)
# crosstabs between EDUCAT and GENDER
ES < calculatextab(EDUCATGENDER)
printxtab(ES) # counts row percentages column percentages as well as marginal sums
brkdn(SALARY~GENDERBANKnumdescc(meansdvalidn))
四数
stdv function(x) { if (NROW(x)>1) sigma < round(sd(x)2) else sigma < 0
sigma }
smry < function(X){ mu < mean(X) sigma < stdv(X) c(mumusigmasigma)}
patients < c(00710211983070120080014004001500071201198307213009002000500200
00909031983066110070137003000000050705198307414008201300900000
00501151982080180096014020015000050618198207017008401400800400
0050703198306414008401400800200)
id < substr(patients13)
date < asDate(substr(patients411) format mdY)
hr < asnumeric(substr(patients1214)) sbp < asnumeric(substr(patients1517))
dbp < asnumeric(substr(patients1820)) dx < substr(patients2123)
docfee < asnumeric(substr(patients2427)) labfee < asnumeric(substr(patients2831))
tapply(hridmean) tapply(hridstdv)
# 希结果更紧凑表现出现
PATIENTS < dataframe(idhrsbpdbpdocfeelabfee)
str(PATIENTS)
smry(PATIENTS[id'005'26]) smry(PATIENTS[id'007'26]) smry(PATIENTS[id'009'26])
by(PATIENTS[26] id smry) by(PATIENTS[26] id summary)
# 计算 HR SBP DBP 观测值第观测值差
HrSbpDbp < dataframe(iddatehrsbpdbp)
order(HrSbpDbpid HrSbpDbpdate) # order of sorted rows
HrSbpDbpSorted < HrSbpDbp[order(HrSbpDbpid HrSbpDbpdate) ]
HrSbpDbpSorted # original row numbers still appear at left
ID < HrSbpDbpSorted[1] HR < HrSbpDbpSorted[3] SBP < HrSbpDbpSorted[4]
DBP < HrSbpDbpSorted[5]
which(ID005) # remember 'which' function from r5R & r6R
f < function(Xy) X[max(which(IDy))] X[min(which(IDy))] # use ID not id
c( f(HR005) f(HR007) f(HR009) ) # use HR not hr
c( f(SBP005) f(SBP007) f(SBP009) ) # use SBP not sbp
c( f(DBP005) f(DBP007) f(DBP009) ) # use DBP not dbp
TailHead < function(v) v[nrow(v)] v[1]
by(HrSbpDbpSorted[3] ID TailHead) by(HrSbpDbpSorted[4] ID TailHead)
by(HrSbpDbpSorted[5] ID TailHead)
# now want gaps (in days) between every pair of adjacent visits
DATE < HrSbpDbpSorted[2]
diff(DATE[which(ID005)]) # use DATE not date
diff(DATE[which(ID007)]) diff(DATE[which(ID009)])
by(DATE ID diff)
五掉 duplicated 值
attach(cars)
cars[order(price)] # sort in terms of increasing price
cars[order(ctympghwympg)] # increasing ctymp increasing hwympg
cars[order(ctympghwympg)] # increasing ctymp decreasing hwympg
cars[duplicated(type)] # gives exactly one (the first) of each type of car
type[124]
duplicated(type[124]) # T if duplicate F if not
duplicated(type[124]) # opposite of above
MT < subset(carsselectc(manutype)) MT[110] duplicated(MT[110])
cars[duplicated(MT)] # gives exactly one (the first) of each (manutype) of car
# another function 'unique' at first glance seems like it might do the same
unique(type)
detach(cars)
第五课 统计分析
.ttest
### Daily energy intake in kJ for 11 women (Altman 1991)
dailyintake < c(52605470564061806390651568057515751582308770)
plot(dailyintake) # unsatisfactory
plot(dailyintakexlabsubjectxaxtnmainEnergy intake for 11 subjects)
axis(1atseq(1111)) # puts one tick mark per subject
#Does the women's energy intake deviate significantly from a recommended value of 7725?
ttest(dailyintake mu7725) # we reject H0 at 5 significance level
# Same question but want a 99 confidence interval instead
ttest(dailyintakemu7725 conflevel099) # we accept H1 at 1 level
# Is the women's energy intake significantly less than a recommended value of 7725 kJ
ttest(dailyintake mu7725conflevel099 altl) # we reject H0 at 1 level
### Comparing energy expenditure between lean & obese women
### Twosample t test
library(ISwR)
data(energy) # energy expenditure in groups of lean & obese women
str(energy) # structure of data set energy
summary(energy) # summary
str(energyexpend) # overview of one variable at a time
summary(energyexpend)
tapply(energyexpendenergystaturesummary) # crosstabs
plot(energyexpend~energystature mainPlot of Energy Data) # parallel boxplots
attach(energy)
tapply(expendstaturesummary)
plot(expend~staturemainPlot of Energy Data)
search() # our dataset now added (no 2 in search path)
# Is there a significant difference in energy levels between the two groups
ttest(expend~stature) # 假定方差相等
ttest(expend~stature varequalT) # 假定方差相等
# F test for comparing variances
vartest(expend~stature)
detach(energy) # tidy up
search()
### Simple linear regression
data(thuesen)
attach(thuesen) # ventricular shortening velocity & blood glucose for diabetes patients
thuesen # another dataframe
str(thuesen)
summary(thuesen)
lm(shortvelocity~bloodglucose) #出非常简短结果
#利 extractor functions
• summary(lm(shortvelocity~bloodglucose))
• plot(shortvelocity~bloodglucosemainPlot of Thuesen Data)
• abline(lm(shortvelocity~bloodglucose) colred lwd2)
• fitted(lm(shortvelocity~bloodglucose))
• resid(lm(shortvelocity~bloodglucose))
FIT < fitted(lm(shortvelocity~bloodglucose))
RES < resid(lm(shortvelocity~bloodglucose))
segments(bloodglucoseFITbloodglucoseshortvelocity) # (x1y1x2y2)
getOption(naaction)
options(naactionnaexclude) # pads FIT & RES with NAs in 16th slot
FIT < fitted(lm(shortvelocity~bloodglucose))
RES < resid(lm(shortvelocity~bloodglucose))
FIT
RES
plot(shortvelocity~bloodglucosemainPlot of Thuesen Data)
abline(lm(shortvelocity~bloodglucose) colred lwd2)
segments(bloodglucoseFITbloodglucoseshortvelocity) # (x1y1x2y2)
{ qqnorm(RES) # graphical test of residual normality
qqline(RES) # anchor line at the quartiles }
cor(bloodglucoseshortvelocity) # fails because of missing datapoint
cor(bloodglucoseshortvelocityusecompleteobs) # different option
options(naactionnaomit) # tidy up (return to default setting)
二卡方检验
# 192436 44 of worker agree monitoring is unethical
# 40121 33 of bosses agree monitoring is unethical
M < matrix(c(1922444081) ncol2 byrowTRUE)
chisqtest(M correctFALSE)
chisqtest(M) # default value of 'correct' is TRUE
chisqtest(M simulatepvalueTRUE B10^6)
# random sampling from set of all contingency tables with given marginals
# pvalue here seems close to Fisher exact test pvalue 00369 (from SAS)
# CHISQ(32153023OPTIONSAGREE)
# reproduces the table on pp 9798 of Cody & Smith
# McNemar's test for paired data
# antismoking commercial effectiveness before & after
M < matrix(c(32153023) ncol2 byrowTRUE)
mcnemartest(M correctFALSE)
# the commercial was effective in changing attitudes (with 00253 sig)
mcnemartest(M) # cannot find in SAS (at least at the moment)
第六课 分布产生
Inverse CDF method
# Our examples here will be distributions on the interval [11]
(c(01827))^(13) (c(1827))^(13)
cbrt < function(x) { y < (abs(x))^(13) y[x<0] < y[x<0] y} cbrt(c(278101827))
1 Ushaped distribution
rndfcn1 < function(n) { S < cbrt(2*runif(n)1) S }
hist(rndfcn1(50000)freqFbreaksseq(11005)ylimc(015)borderdarkblueplotT)
f1 < function(x) ifelse(abs(x)<1 3*x^22 0)
curve(f1from1to1addTcolredlwd2)
2 upsidedown Ushaped distribution
rndfcn2 < function(n) { S < 2*rbeta(n22)1 S } # beta(22) rv
hist(rndfcn2(50000)freqFbreaksseq(11005)ylimc(0075)borderdarkblueplotT)
f2 < function(x) ifelse(abs(x)<1 (34)*(1x^2) 0)
curve(f2from1to1addTcolredlwd2)
3 Vshaped distribution
SQRT < function(x) { y < (abs(x))^(12) y[x<0] < y[x<0] y} SQRT(c(9410149))
rndfcn3 < function(n) { S < SQRT(2*runif(n)1) S }
hist(rndfcn3(50000)freqFbreaksseq(11005)ylimc(01)borderdarkblueplotT)
f3 < function(x) ifelse(abs(x)<1abs(x)0)
curve(f3from1to1addTcolredlwd2)
4 upsidedown Vshaped (or triangular) distribution
rndfcn4 < function(n) { S < runif(n)+runif(n)1 S} # sum of two unif rvs
hist(rndfcn4(50000)freqFbreaksseq(11005)ylimc(01)borderdarkblueplotT)
f4 < function(x) ifelse(abs(x)<11abs(x)0)
curve(f4from1to1addTcolredlwd2)
5 symmetric bimodal distribution
g5 < function(yx) (14)*(3*y^5 + 5*y^3 + 2) x
h5 < function(x) uniroot(g5 lower1 upper1 tol10^(15) xx)root
h5 < Vectorize(h5)
rndfcn5 < function(n) { S < h5(runif(n)) S }
hist(rndfcn5(20000)freqFbreaksseq(11005)ylimc(01)borderdarkblueplotT)
f5 < function(x) (154)*x^2*(1x^2)
curve(f5from1to1addTcolredlwd2)
6 asymmetric unimodal distribution
g6 < function(yx) (sqrt(2)16)*(73*y)*(y+1)^(32) x
h6 < function(x) uniroot(g6 lower1 upper1 tol10^(15) xx)root
h6 < Vectorize(h6)
rndfcn6 < function(n) { S < h6(runif(n)) S }
hist(rndfcn6(20000)freqFbreaksseq(11005)ylimc(01)borderdarkblueplotT)
f6 < function(x) (15*sqrt(2)32)*(1x)*(x+1)^(12)
curve(f6from1to1addTcolredlwd2)
# focus on the last example what is the maximum value of f6
optimize(f6intervalc(11)maximumTRUE) # simple numerical routine
# likewise what is the maximum value of f5
optimize(f5intervalc(11)maximumTRUE) # gives just one of the two max points
二 Acceptancerejection method
RjctSmpl6 < function (n) { xvec < runif(n 1 1)
yvec < runif(n 0 5*sqrt(3)12)
zvec < xvec[yvec < f6(xvec)]
zvec} # key idea of rejection Z ~ f6
NROW(RjctSmpl6(10000)) # number of random points (> 67)
hist(RjctSmpl6(30000)[120000]freqFbreaksseq(11005)ylimc(01)
borderdarkblueplotTRUE)
curve(f6from1to1addTcolredlwd2) # much faster
RjctSmpl5 < function (n) { xvec < runif(n 1 1)
yvec < runif(n 0 1516)
zvec < xvec[yvec < f5(xvec)] } # key idea of rejection Z ~ f5
NROW(RjctSmpl5(10000)) # number of random points (> 50)
hist(RjctSmpl5(40000)[120000]freqFbreaksseq(11005)ylimc(01)
borderdarkblueplotT)
curve(f5from1to1addTcolredlwd2) # again much faster
#### Mixing Distributions
F6 < function(n) RjctSmpl6(floor(3*n2))[1n]
rndfcn66 < function(n){ s < rbinom(n105) # same as testing whether runif(n)<12
S < s*F6(n)+(1s)*(F6(n)+4) S}
hist(rndfcn66(20000)freqFbreaksseq(1501)ylimc(005)borderdarkblueplotT)
f66l < function(x) f6(x)2
f66r < function(x) f6(4x)2
curve(f66lfrom1to1addTcolredlwd2)
curve(f66rfrom3 to5addTcolredlwd2) # a mixture plus reflection
rndfcn44 < function(n) { s < rbinom(n108) # same as testing whether runif(n)<45
S < s*rndfcn4(n)+(1s)*(rndfcn4(n)2+25) S }
hist(rndfcn44(20000)freqFbreaksseq(1301)ylimc(008)borderdarkblueplotT)
f44l < function(x) (45)*f4(x)
f44r < function(x) f4(52*x)25
curve(f44lfrom1to1addTRUEcolredlwd2)
curve(f44rfrom2 to3addTRUEcolredlwd2)
# another mixture with right triangle half the baseheight of the left
rndfcn7 < function(n){ s < rbinom(n101) # same as testing whether runif(n)<110
S < rnorm(100007*s) S }
F7 < rndfcn7(20000)
F77 < F7[F7>4 & F7<11]
hist(F77freqFbreaksseq(41102)ylimc(004)borderdarkblueplotT)
f7 < function(x) 09*dnorm(x0)+01*dnorm(x7) # both rv have variance 1
curve(f7from4to11addTcolredlwd2)
# another mixture this one involving two normal densities
第七课 求导极值计算
求导
# R can take symbolic derivatives here is an example
D(expression((15*sqrt(2)32)*(1x)*(x+1)^(12))x)
# to use this expression for numerical work must use eval command
Df < function(x) eval(D(expression((15*sqrt(2)32)*(1x)*(x+1)^(12))x))
Df(seq(1105))
uniroot(Df lower05 upper05 tol10^(15))root
# 种方法计算导数
fp < deriv(~(15*sqrt(2)32)*(1x)*(x+1)^(12)x function(x) {})
fp(seq(1105)) # values of both f and f' appear
fp(seq(1105))[15] # values of only f appear
attr(fp(seq(1105)) gradient) # values of only f' appear
vp < attr(fp(seq(1105)) gradient)
dim(vp) < 5 # listing f' values in a row
# 二阶导数 deriv3
fpp < deriv3(~(15*sqrt(2)32)*(1x)*(x+1)^(12)x function(x) {})
fpp(seq(1105)) # values of f f' and f'' appear
二函数nlminb 求优
# Rosenbrock banana function B(xy)100*(yx^2)^2+(1x)^2
x < seq(22005) y < seq(22005)
B < function(xy) 100*(yx^2)^2+(1x)^2
z < outer(x y B) # outer product ie B applied to each (xy)
persp(x y z zlim c(01000) theta 15 phi 30 expand 05 col c(greyred)
ticktype detailed main Rosenbrock banana surface plot I)
persp(x y z zlim c(01000) theta 0 phi 45 expand 05 col c(greyred)
ticktype detailed main Rosenbrock banana surface plot II)
persp(x y z zlim c(01000) theta 25 phi 60 expand 05 col c(greyred)
ticktype detailed main Rosenbrock banana surface plot III)
# the expand parameter governs to what degree the box opens to us
filledcontour(x y z color colorRampPalette(c(greenbluedarkorange))
plottitle title(main Rosenbrock banana contour plot xlab x ylab y))
# Use 'nlminb' the most general optimization routine in R which uses a quasiNewton method
F < deriv3(~ 100*(yx^2)^2+(1x)^2 c(x y) function(x y) {})
# F contains not only information on B but also B' and B''
q0 < c(x12 y1) # initial guess values (traditional)
F(q0[1]q0[2]) # without the cleaning this is a mess
banobj < function(q) { Q < F(q[1] q[2]) sum(Q) }
bangrd < function(q) { Q < F(q[1] q[2]) colSums(attr(Q gradient))}
banhsn < function(q) { Q < F(q[1] q[2]) colSums(attr(Q hessian))}
banobj(q0) bangrd(q0) banhsn(q0)
bansol < nlminb(q0 banobj bangrd banhsn) # call to nlminb here
bansol[c(parobjective)]
# hence the minimum occurs at (11) and has value 0
# call to nlminb but suppressing the Hessian (nlimb thus uses a numerical estimate of it instead)
bansol < nlminb(q0 banobj bangrd)
bansol[c(parobjective)]
# suppressing the gradient as well (nlimb thus uses a numerical estimate of it instead)
bansol < nlminb(q0 banobj)
bansol[c(parobjective)]
# repeat the above but observe each step of the optimization
bansol < nlminb(q0 banobj bangrd banhsn controllist(traceTRUE))
bansoliterations # can also obtain the number of iterations this way
bansol < nlminb(q0 banobj bangrd controllist(traceTRUE)) # Hessian suppressed
bansoliterations
bansol < nlminb(q0 banobj controllist(traceTRUE)) # gradient suppressed too
bansoliterations
# 'optim' is another general purpose optimizer and comes in different flavors
# 1 NelderMead method the default (no derivatives used but relatively slow)
# 2 BFGS (Broyden Fletcher Goldfarb & Shanno) a quasiNewton method
# 3 CG (conjugate gradients method)
# 4 LBFGSB (limited memory modification of BFGS with box constraints)
# 5 SANN (simulated annealing)
# which is better 'nlminb' or 'optim'
# some history 'nlminb' was based on Fortran functions dmnfb dmngb & dmnhb (1980s) and # # # # # # appeared in SPLUS was ported over to R fairly recently 'optim' evolved independently in R thus # # it overlaps somewhat with 'nlminb'
三 数拟合正态分布
# Given a unimodal data histogram we wish to fit the best possible normal distribution (via maximum likelihood) FACT dnorm((Xmu)sigma)sigma dnorm(Xmusigma) always
# must use the singleargument version (the LH side) for 'nlminb'
library(MASS)
waiting < c(68 81 62 75 57 80 59 62 74 74 68 73 55 70 67 75 75 73 73 57 85 73 66 63 81 93 70 65 55 68)
truehist(waiting xlimc(35110) ymax006 h5 colwhiteborderdarkblue)
F < deriv3( ~ log(dnorm((xm1)s1)s1) c(m1 s1) function(x m1 s1) {})
# F contains not only information on L but also L' and L''
q0 < c(m175 s110) # initial guess values for m1 s1 read off plot
mixobj < function(x q) {Q < F(x q[1] q[2]) sum(Q) }
# sum over all datapoints x_i later when we call 'nlminb'
mixgrd < function(x q) {Q < F(x q[1] q[2]) colSums(attr(Q gradient))}
# sum over x not over the qs
mixhsn < function(x q) { Q < F(x q[1] q[2]) colSums(attr(Q hessian))}
# sum over x not over the qs
mixobj(70 q0) # a scalar
mixobj(68q0)+mixobj(72q0) # test that sum is working OK
mixobj(c(6872)q0)
mixgrd(70 q0) # a 1x2 matrix (row vector of partial derivatives)
mixgrd(68q0)+mixgrd(72q0) # test that colSums is working OK
mixgrd(c(6872)q0)
mixhsn(70 q0) # a 2x2 symmetric matrix (matrix of mixed partials)
mixhsn(68q0)+mixhsn(72q0) # test that colSums is working OK
mixhsn(c(6872)q0)
mixsol < nlminb(q0 mixobj mixgrd mixhsn lower c(Inf 0) upper c(Inf Inf) x waiting)
# x is further supplied to mixobj (since x is not included in q)
mixsol[c(parobjective)]par
c(mean(waiting)sqrt(2930)*sd(waiting)) # for sd sum is normalized by n129
# completely unsurprising agreement the MLEs of normal mean & variance
# are indeed the data sample mean and sample variance (normalized by n30)
q < mixsol[par]par
f < function(x q) { v < NULL
for (w in x) v < c(vexp(mixobj(wq)))
v}
waitfit < list(x seq(3511001) y f(seq(3511001) q))
truehist(waiting xlimc(35110) ymax006 h5 colwhiteborderblack)
lines(waitfitcoldarkmagentalwd2)
四 指数化线性模型非线性模型
q05 < function(X) quantile(Xprobs005narmTRUEnamesF)
q95 < function(X) quantile(Xprobs095narmTRUEnamesF)
# 计算残差标准差
RMSE < function(vk) { n < length(v)
sqrt(sum(v^2)(nk)) } # k number of model parameters
Year < 115
Sales < c(301320372423500608721826978113513151530180021522491)
YearSales < dataframe(YearSales)
plot(YearSalesxaxtnmainTime Series for Sales)
axis(1atseq(1151))
## 两参数模型先 log(Sales)作线性回然取指数
logYearSalesfit < lm(log(Sales)~Year)
summary(logYearSalesfit)
# 残差标准差 002996中度 13 R^2 09983
predict(logYearSalesfit) # log(Years)预测值
plot(Yearlog(Sales))
lines(Yearpredict(logYearSalesfit)colredlwd2)
# 三种外计算残差方法
RMSE(predict(logYearSalesfit) log(Sales) 2)
sqrt(1413)*sd(logYearSalesfitres)
predict(logYearSalesfit sefitTRUE)residualscale # sefitTRUE required here
exp(predict(logYearSalesfit)) # predicted values of Years
plot(YearSales)
lines(Yearexp(predict(logYearSalesfit))colredlwd2)
coef(logYearSalesfit) # c( log(a) b) where log(Sales) ~ log(a)+ b*Year
# exponentiating both sides obtain Sales ~ a*exp(b*Year)
Forecast < exp(predict(logYearSalesfit))
Diffrncs < Sales exp(predict(logYearSalesfit))
cbind(Forecast Diffrncs)
exp(predict(logYearSalesfitlist(Year1617))) # 插值法2年值
## TwoParameter Model use nonlinear regression on Sales
YearSalesfit2 < nls(Sales ~ a*exp(b*Year) YearSales start list(a250 b015) algorithm port trace TRUE)
summary(YearSalesfit2) ## 残差标准差 1549中度13
RMSE(predict(YearSalesfit2) Sales 2)
# slightly different starting values almost identical outcome no R^2 output in summary(nls)
plot(YearSales)
lines(Yearpredict(YearSalesfit2)colbluelwd2)
Forecast < predict(YearSalesfit2)
Diffrncs < Sales predict(YearSalesfit2)
cbind(Forecast Diffrncs)
predict(YearSalesfit2list(Year1617)) # extrapolation into future
sqrt(sum(Diffrncs^2)15) # consistent with Hesse Solver rmse
sqrt(1315)*RMSE(Forecast Sales 2)
# important parameter estimates coming from nls are _not_ the same as parameter estimates coming from exponentiated lm
# RMSE from nls is smaller than RMSE from exponentiated lm (obviously)
# from R 292 help At present sefit and interval are ignored
# bootstrap replicates of YearSales
YearSales[sample(NROW(YearSales)replaceT)]
YearSales[sample(NROW(YearSales)replaceT)]
YearSales[sample(NROW(YearSales)replaceT)]
f < function(m) { t(sapply(1m function(o)
{ coef(nls(Sales ~ a*exp(b*Year) YearSales[sample(NROW(YearSales)replaceT)]
start list(a250 b015) algorithm port)) } )) }
F < f(1000)
c(coef(summary(YearSalesfit2))[11] mean(F[1]))
c(coef(summary(YearSalesfit2))[21] mean(F[2]))
c(coef(summary(YearSalesfit2))[12] sd(F[1]))
c(coef(summary(YearSalesfit2))[22] sd(F[2]))
hist(F[1] brseq(14952995by1) borderdarkblue plotT)
hist(F[2] brseq(012018by00005) borderdarkblue plotT)
c(q05(F[1]) q95(F[1]))
c(q05(F[2]) q95(F[2]))
# called CI using percentile bootstrap method
## ThreeParameter Model use nonlinear regression on Sales
YearSalesfit3 < nls(Sales ~ a*exp(b*Year)+c YearSales start list(a250 b015 c20)
algorithm port trace TRUE)
summary(YearSalesfit3) # note residual standard error is 1558 on 12 degrees of freedom
RMSE(predict(YearSalesfit3) Sales 3)
Forecast < predict(YearSalesfit3)
Diffrncs < Sales predict(YearSalesfit3)
cbind(Forecast Diffrncs)
predict(YearSalesfit3list(Year1617)) # extrapolation into future
sqrt(sum(Diffrncs^2)15) # consistent with Hesse Solver rmse
sqrt(1215)*RMSE(Forecast Sales 3)
# 13936 < 14424 hence 3par model is indeed better than 2par
第八课 – Bootstrap方法
中位数分布 (假定样量奇数)
q05 < function(X) quantile(X probs005 narmTRUE namesF)
q95 < function(X) quantile(X probs095 narmTRUE
namesF)
# weighting function
w < function(nj) {m<(n1)2 pbinom(mn(j1)n)pbinom(mnjn) }
# weighting vector (symmetric about middle)
W < function(n) { m<(n1)2 v < NULL
for (j in 1(m+1)) v < c(vw(nj))
for (j in m1) v < c(vw(nj))
v}
X < c(03040505060917) # cell lifetimes
hist(X freqF breaksseq(00519501)xlimc(02)ylimc(03)
mainHistogramdensity plots for cell lifetime data
xlababsolute differences of sister cell lifetimes ylabdensity)
M < median(X) # sample median
#How good of an estimate of the true median is this What is its bias What is its uncertainty
n length(X)
MEAN < X*W(n) # theoretical mean of M
SIGMA < sqrt(X^2*W(n)(X*W(n))^2) # theoretical stddev of M
# How to verify these sampling results Use bootstrapping
# Fundamental idea due to Efron
# the population is to the sample as the sample is to the bootstrap samples
# Fact w(nj) is theoretical probability that bootstrap medianjth order statistic of original
# sample
# bootstrap using 'sample' function
f < function(m) sapply(1m function(o) median(sample(XreplaceT)))
Y < f(50000)
library(MASS) # usercontributed library
{ truehist(Y h01 x0005 # alternative plotting tool (in MASS)
mainHistogram of bootstrap distribution for median
xlababsolute differences of sister cell lifetimes
ylabdensitycolwhiteborderdarkblue)
lines(c(026034) c(10*w(n1)10*w(n1)) colred)
lines(c(036044) c(10*w(n2)10*w(n2)) colred)
lines(c(046054) c(10*sum(w(n34))10*sum(w(n34))) colred)
lines(c(056064) c(10*w(n5)10*w(n5)) colred)
lines(c(086094) c(10*w(n6)10*w(n6)) colred)
lines(c(166174) c(10*w(n7)10*w(n7)) colred) }
c(MEANmean(Y)) c(SIGMAsd(Y))
# several different definitions of 90 confidence intervals
· c(2*MMEAN1645*SIGMA2*MMEAN+1645*SIGMA)
# biascorrected CI using normal approximation
· c(q05(Y)q95(Y)) # called CI using percentile bootstrap method
· c(2*Mq95(Y)2*Mq05(Y)) # called CI using basic bootstrap method
# there is considerable controversy concerning use of bootstrap confidence intervals
# asymptotically you can do better than the three types of CIs mentioned here (normal percent & basic) but for small samples things are more complicated
# biascorrected accelerated percentile and studentized methods are available in
# usercontributed library boot
二 LawSchool Data Set置信区间计算
LSATGPA < readtable(DLawSchooltxt nastrings*headerT)
attach(LSATGPA)
plot(LSAT GPA)
rho < cor(LSAT GPA)
# What is the uncertainty associated with this estimate of crosscorrelation
n < length(LSAT) # sample size clearly plays a role
(1rho^2)sqrt(n3) # classical standard error
cortest(LSAT GPA)confint[12] # 95 CI based on normal theory
z < sqrt(n3)*(12)*log((1+rho)(1rho)) # Fisher z transformation
c(tanh((zqnorm(0975))sqrt(n3))tanh((z+qnorm(0975))sqrt(n3)))
# classical 95 confidence interval for rho
# bootstrap replicates of YearTime
LSATGPA[sample(NROW(LSATGPA)replaceT)]
# bootstrap using 'sample' function
f < function(m) { sapply(1m function(o) {
lsatgpa < LSATGPA[sample(NROW (LSATGPA)replaceT)]
lsat < lsatgpa[1]
gpa < lsatgpa[2]
cor(lsat gpa) } ) }
F < f(3200) summary(F) sd(F) # standard error for bootstrap ( > classical result)
hist(F brseq(051by001) ylimc(04) freqFALSE borderdarkblue plotTRUE)
g < function(r) (n2)*gamma(n1)*(1rho^2)^((n1)2)*(1r^2)^((n4)2)
(sqrt(2*pi)*gamma(n12)*(1rho*r)^(n32))
curve(gfrom0to1addTcolredlwd2)
# theoretical density (for normal case) has a similar shape but falls off more quickly at the
# upper tail
c(q05(F) q95(F)) # called CI using percentile bootstrap method
cortest(LSAT GPA)confint[12]
# not surprisingly the CI upperlower limits are both higher than the normal predictions
三 Olympic Dataset
YearTime < readtable(EOlympictxt headerT)
# single space (not tab) between columns use readtable (we used readdelim before)
attach(YearTime)
plot(YearTime)
YearTimefit < nls(Time ~ a+b*exp(c*(Year1900)) YearTime start list(a200 b40 c001) algorithm port trace TRUE)
summary(YearTimefit) # how were pvalues computed
# slightly different starting values almost identical outcome
plot(YearTime) lines(Yearpredict(YearTimefit)colbluelwd2)
# from online help theory used to find the distribution of the standard errors and of the
# residual standard error (for t ratios) is based on linearization and is approximate maybe #_very_ approximate
coef(YearTimefit) # coefficients only
coef(summary(YearTimefit)) # plus uncertainties
# bootstrap replicates of YearTime
YearTime[sample(NROW(YearTime)replaceT)]
# bootstrap using 'sample' function
f < function(m) { t(sapply(1m function(o)
{coef(nls(Time ~ a+b*exp(c*(Year1900))
YearTime[sample(NROW(YearTime)replaceT)]
start list(a206 b42 c001)
algorithm port)) } )) }
F < f(1000)
c(coef(summary(YearTimefit))[11] mean(F[1]))
c(coef(summary(YearTimefit))[21] mean(F[2]))
c(coef(summary(YearTimefit))[31] mean(F[3]))
c(coef(summary(YearTimefit))[12] sd(F[1]))
c(coef(summary(YearTimefit))[22] sd(F[2]))
c(coef(summary(YearTimefit))[32] sd(F[3]))
hist(F[1] brseq(052495by2) borderdarkblue plotT)
hist(F[2] brseq(051495by1) borderdarkblue plotT)
hist(F[3] brseq(01001by0001) borderdarkblue plotT)
c(q05(F[1]) q95(F[1])) c(q05(F[2]) q95(F[2])) c(q05(F[3]) q95(F[3]))
# called CI using percentile bootstrap method
第九课 MCMC算法
.马氏链极限分布
# 马氏链两状态
x c(12)
# 转移矩阵
A matrix( c(02080307) 2 byrowTRUE)
#令A极限分布sta (pq)sta sta*A sta(311811)(02730727)
# solve(Ab) 求解 Ax b 解
# 求矩阵转移矩阵 A 稳分布 x x 满足 xA x
# 转置 A’x’ x’ x’ A’ 特征量
eigen(t(A))
eigen(t(A))values
eigen(t(A))vectors
p0 eigen(t(A))vectors[1]
p0 p0 sum(p0)
# 定初始状态 x 求状态
state function(xA) sample(12 1 prob A[x])
# 马氏链前 n 步状态
tran_state function(xn) { y NULL
for(i in 1n)
{ x sample(12 1 probA[x])
y c(yx) }
y}
x01
mc1 tran_state(x05000)
hist(mc1 freqFALSE)
table(mc1)5000
hist(mc1[(11000)]freqFALSE ) # burnin
table(mc1[(11000)])4000
# 般状态 {12…m} 转移矩阵 A 马氏链 x 出发前 n 状# 态
tran_state function(mAxn)
{ y x #初始状态
for(i in 2n) {
p A[x ] # 目前状态出发转移概率
x sample(1m 1 probp) # 新状态
y c(yx) }
y }
二 连续分布例子
ffunction(x)
03*(12)*exp(x^28) + 07*exp((x10)^22)
curve(ffrom10to20)
# 均匀分布转移矩阵
chain function(xn)
{ y x #初始状态
for(i in 2n)
{ x1 runif(1 minx3 maxx+3)
a (f(x1) f(x))*I(x13
else x sample(c(xx1) 1 probc(1aa))
y c(yx) }
y }
ss chain(030000)
hist(ssbreaksseq(1020by05)freqFALSE ylimc(004))
ffunction(x)
(1sqrt(2*pi))*(03*(12)*exp(x^28) + 07*exp((x10)^22))
curve(ffrom10to20addT)
# 正态转移矩阵
chain function(xn)
{ y x #初始状态
for(i in 2n)
{ x1 rnorm(1 meanx sd3)
a f(x1) f(x)
if ( a > 1) x x1
else x sample(c(xx1) 1 probc(1aa))
y c(yx) }
y }
ss chain(030000)
hist(ssbreaksseq(1020by05)freqFALSE ylimc(004))
curve(ffrom10to20addT)
三 逆卡方分布
f1 function(x) C*x^{n2}*exp(a(2*x)) # C a constant unknown
f2 function(x) x^{n2}*exp(a(2*x)) # ignoring the constant C
# to simulate draws from above distribution with n 5 df and scaling factor a 4
f2 function(x) x^{52}*exp(2x)
curve(f2 from0 to 10)
chain function(xn)
{ yx
for(i in 2n)
{ x1 runif(1 min0 maxx+3)
a ((f2(x1) *(x+3))( f2(x)*(x1+3)))*I(0
else x sample(c(xx1) 1 probc(1aa))
yc(yx)}
y}
ss chain(130000)
hist(ssbreaksseq(040by05)freqFALSE ylimc(002))
curve(f2from0to20addT)
ss chain(130000)
m max( hist(ssbreaksseq(040by05)freqFALSE ylimc(005))density )
f2 function(x) (m01)*x^{52}*exp(2x)
hist(ssbreaksseq(040by05)freqFALSE ylimc(008))
curve(f2from0to20addT)
文档香网(httpswwwxiangdangnet)户传
《香当网》用户分享的内容,不代表《香当网》观点或立场,请自行判断内容的真实性和可靠性!
该内容是文档的文本内容,更好的格式请下载文档