**R 就像一台計算機**
# Print a string to the console.
print("Hello R")
## [1] "Hello R"
# Addition.
8 + 7
## [1] 15
# Exponentiation.
6^5
## [1] 7776
# Division.
8 / 2
## [1] 4
# Square root.
sqrt(25)
## [1] 5
# Logarithm of 100 in base 10.
log(100, base = 10)
## [1] 2
# Element-wise product of two vectors, then the total.
sum(c(100, 200, 500) * c(-0.1, 0.2, 0.3))
## [1] 180
# Remainder of integer division (modulo).
5 %% 2
## [1] 1
**R 裡面所有的東西都是一個 object**
# Weights applied to each payoff (they sum to 1).
prob <- c(0.2, 0.5, 0.3)
# Payoff paired position-by-position with prob.
payoff <- c(500, 100, 400)
# Expected payoff: multiply element-wise, then add up the products.
expected_payoff <- sum(prob * payoff)
邏輯運算子:&(and)、|(or)、!(not)、==(等於)、!=(不等於)
# A small numeric vector used by the examples that follow.
x <- c(1, 20, 30, 55, 66)
length(x) # number of elements in the vector
## [1] 5
sum(x) # sum of the elements
## [1] 172
prod(x) # product of the elements
## [1] 2178000
cumsum(x) # cumulative sum
## [1] 1 21 51 106 172
cumprod(x) # cumulative product
## [1] 1 20 600 33000 2178000
sort(x) # the values in ascending order
## [1] 1 20 30 55 66
rank(x) # rank of each element (its position in sorted order), not the sorted values
## [1] 1 2 3 4 5
x[2] # second element
## [1] 20
x[6] # indexing past the end yields NA rather than an error
## [1] NA
x[c(1, 2)] # several elements at once
## [1] 1 20
mean(x) # arithmetic mean
## [1] 34.4
max(x)
## [1] 66
min(x)
## [1] 1
var(x) # sample variance
## [1] 691.3
median(x)
## [1] 30
sum(x)
## [1] 172
sd(x) # sample standard deviation
## [1] 26.29258
不使用 sd() 函數,要如何計算標準差呢?
# Standard deviation computed by hand, without calling sd().
n <- length(x)
# Two-pass form: sum the squared deviations from the mean, divide by n - 1
# (sample variance), then take the square root. Subtracting the mean first
# avoids the cancellation that sum(x^2) - n * mean(x)^2 can suffer when the
# values are large relative to their spread.
x.sd <- sqrt(sum((x - mean(x))^2) / (n - 1)) # 26.29
library(readr)
# Load the athlete data set from the project's asset folder.
athlete <- read_csv(file = "asset/athlete_all.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_integer(),
## country = col_character(),
## year = col_integer(),
## ID = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_integer(),
## Height = col_integer(),
## NOC = col_character(),
## Games = col_character(),
## Season = col_character(),
## City = col_character(),
## Sport = col_character(),
## Event = col_character(),
## Medal = col_character(),
## food_suppiy = col_integer(),
## income_GDP = col_integer(),
## democracy_score = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not a
## multiple of vector length (arg 1)
## Warning: 42 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1166 female_school no trailing characters r 'asset/athlete_all.csv' file 2 1167 female_school no trailing characters r 'asset/athlete_all.csv' row 3 1168 female_school no trailing characters r 'asset/athlete_all.csv' col 4 1169 female_school no trailing characters r 'asset/athlete_all.csv' expected 5 1170 female_school no trailing characters r 'asset/athlete_all.csv'

## See problems(...) for more details.
# Drop the first column: the unnamed column that read_csv filled in as 'X1'.
athlete <- athlete[, -1]
資料摘要
# Summarise every column of athlete: character columns report length and
# class, numeric columns report quartiles, mean and the number of NAs.
summary(athlete)
## country year ID Name
## Length:271116 Min. :1896 Min. : 1 Length:271116
## Class :character 1st Qu.:1960 1st Qu.: 34643 Class :character
## Mode :character Median :1988 Median : 68205 Mode :character
## Mean :1978 Mean : 68249
## 3rd Qu.:2002 3rd Qu.:102097
## Max. :2016 Max. :135571
##
## Sex Age Height Weight
## Length:271116 Min. :10.00 Min. :127.0 Min. : 25.0
## Class :character 1st Qu.:21.00 1st Qu.:168.0 1st Qu.: 60.0
## Mode :character Median :24.00 Median :175.0 Median : 70.0
## Mean :25.56 Mean :175.3 Mean : 70.7
## 3rd Qu.:28.00 3rd Qu.:183.0 3rd Qu.: 79.0
## Max. :97.00 Max. :226.0 Max. :214.0
## NA's :9474 NA's :60171 NA's :62875
## NOC Games Season City
## Length:271116 Length:271116 Length:271116 Length:271116
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Sport Event Medal ce_rate
## Length:271116 Length:271116 Length:271116 Min. : 17.00
## Class :character Class :character Class :character 1st Qu.: 47.60
## Mode :character Mode :character Mode :character Median : 51.70
## Mean : 55.43
## 3rd Qu.: 58.20
## Max. :118.00
## NA's :83417
## electricity female_school food_suppiy health_expense
## Min. :3.500e+09 Min. : 15.40 Min. :1440 Min. : 1.69
## 1st Qu.:7.350e+10 1st Qu.: 96.50 1st Qu.:2890 1st Qu.: 5.71
## Median :2.260e+11 Median :100.00 Median :3130 Median : 7.76
## Mean :6.796e+11 Mean : 96.84 Mean :3090 Mean : 7.88
## 3rd Qu.:6.020e+11 3rd Qu.:103.00 3rd Qu.:3390 3rd Qu.: 9.49
## Max. :6.140e+12 Max. :130.00 Max. :3810 Max. :20.60
## NA's :166832 NA's :131069 NA's :122790 NA's :206269
## income_GDP baby_pw co2_em child_mortality
## Min. : 263 Min. :1.12 Min. : 0.01 Min. : 2.10
## 1st Qu.: 7390 1st Qu.:1.61 1st Qu.: 3.04 1st Qu.: 7.60
## Median : 14800 Median :2.04 Median : 6.48 Median : 19.60
## Mean : 19366 Mean :2.52 Mean : 7.34 Mean : 52.64
## 3rd Qu.: 30100 3rd Qu.:2.87 3rd Qu.:10.00 3rd Qu.: 60.60
## Max. :136000 Max. :8.30 Max. :61.80 Max. :536.00
## NA's :45550 NA's :46166 NA's :59753 NA's :45763
## inflation unemployment_15 bc_w_death democracy_score
## Min. : -17.10 Min. : 0.14 Min. : 0.53 Min. :-10.00
## 1st Qu.: 1.67 1st Qu.: 4.65 1st Qu.:15.50 1st Qu.: 3.00
## Median : 3.92 Median : 6.83 Median :20.20 Median : 10.00
## Mean : 17.17 Mean : 7.56 Mean :19.48 Mean : 5.43
## 3rd Qu.: 8.46 3rd Qu.: 9.62 3rd Qu.:23.80 3rd Qu.: 10.00
## Max. :6040.00 Max. :35.50 Max. :46.60 Max. : 10.00
## NA's :115730 NA's :154684 NA's :103013 NA's :79678
# Cross-tabulate country against medal type and show the first 10 rows.
# NOTE(review): the output has no NA column, so rows without a medal are
# presumably excluded by table()'s default NA handling — confirm.
head(table(athlete$country,athlete$Medal), 10)
##
## Bronze Gold Silver
## 30. Februar 0 0 0
## A North American Team 4 0 0
## Acipactli 0 0 0
## Acturus 0 0 0
## Afghanistan 2 0 0
## Akatonbo 0 0 0
## Alain IV 0 0 0
## Albania 0 0 0
## Alcaid 0 0 0
## Alcyon-6 0 0 0
確認資料結構
# Check that athlete is a data frame (the tibble returned by read_csv passes).
is.data.frame(athlete)
## [1] TRUE
基本資料探索
# Number of rows.
nrow(athlete)
## [1] 271116
# Number of columns.
ncol(athlete)
## [1] 28
# Both dimensions at once: rows first, then columns.
dim(athlete)
## [1] 271116 28
# Class of the year column.
class(athlete[["year"]])
## [1] "integer"
# Preview the first six rows of the data.
head(athlete)
## # A tibble: 6 x 28
## country year ID Name Sex Age Height Weight NOC Games Season City
## <chr> <int> <int> <chr> <chr> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 30. Fe… 1952 34666 Hara… M 23 167 70 AUT 1952… Summer Hels…
## 2 30. Fe… 1952 127384 Hara… M 44 176 NA AUT 1952… Summer Hels…
## 3 A Nort… 1900 33106 Jos … M 38 NA NA MEX 1900… Summer Paris
## 4 A Nort… 1900 33107 Jos … M 42 NA NA MEX 1900… Summer Paris
## 5 A Nort… 1900 33109 Jos … M 44 NA NA MEX 1900… Summer Paris
## 6 A Nort… 1900 131733 Will… M NA NA NA USA 1900… Summer Paris
## # … with 16 more variables: Sport <chr>, Event <chr>, Medal <chr>,
## # ce_rate <dbl>, electricity <dbl>, female_school <dbl>, food_suppiy <int>,
## # health_expense <dbl>, income_GDP <int>, baby_pw <dbl>, co2_em <dbl>,
## # child_mortality <dbl>, inflation <dbl>, unemployment_15 <dbl>,
## # bc_w_death <dbl>, democracy_score <int>
names(athlete) # list the variable (column) names
## [1] "country" "year" "ID" "Name"
## [5] "Sex" "Age" "Height" "Weight"
## [9] "NOC" "Games" "Season" "City"
## [13] "Sport" "Event" "Medal" "ce_rate"
## [17] "electricity" "female_school" "food_suppiy" "health_expense"
## [21] "income_GDP" "baby_pw" "co2_em" "child_mortality"
## [25] "inflation" "unemployment_15" "bc_w_death" "democracy_score"
傳回資料框架變數
# Single-bracket indexing by position keeps the table structure:
# the result is a one-column tibble, not a vector.
athlete[2]
## # A tibble: 271,116 x 1
## year
## <int>
## 1 1952
## 2 1952
## 3 1900
## 4 1900
## 5 1900
## 6 1900
## 7 1964
## 8 1964
## 9 1964
## 10 1948
## # … with 271,106 more rows
# Single-bracket indexing by column name also returns a one-column tibble.
athlete["year"]
## # A tibble: 271,116 x 1
## year
## <int>
## 1 1952
## 2 1952
## 3 1900
## 4 1900
## 5 1900
## 6 1900
## 7 1964
## 8 1964
## 9 1964
## 10 1948
## # … with 271,106 more rows
傳回向量
# `$` extracts the column itself as a vector; show its first 10 values.
head(athlete$year, 10)
## [1] 1952 1952 1900 1900 1900 1900 1964 1964 1964 1948
# Extract the second column as a plain vector by position.
# `[[` is used because athlete is a tibble: athlete[, 2] would stay a
# one-column tibble instead of dropping to a vector as a base data.frame does.
head(athlete[[2]], 10)
## [1] 1952 1952 1900 1900 1900 1900 1964 1964 1964 1948
# Load the NOC-to-region lookup table.
region <- read_csv(file = "asset/noc_regions.csv")
## Parsed with column specification:
## cols(
## NOC = col_character(),
## region = col_character(),
## notes = col_character()
## )
# Drop the third column (notes), keeping NOC and region.
region <- region[, -3]
以共同欄位 NOC 合併兩個資料表(all.x 保留 athlete 的所有列)
# Left join: attach the region to each athlete row via the shared NOC column.
# all.x = TRUE keeps athlete rows that have no match in region (their region
# columns become NA). TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
ar <- merge(athlete, region, by = "NOC", all.x = TRUE)
# First batch of four records, entered as two separate groups of columns.
weight <- c(65, 75, 80, 87)
height <- c(155, 165, 175, 185)
gender <- c("F", "F", "M", "M")
major <- c("chinese", "math", "BM", "finance")
age <- c(20, 27, 28, 29)
nsysu <- data.frame(weight, height, gender)
nsysu2 <- data.frame(major, age)
# cbind() glues the two tables side by side (same rows, more columns).
nsysu <- cbind(nsysu, nsysu2)
# Second batch of four records, this time with all five columns at once.
weight <- c(85, 95, 100, 87)
height <- c(155, 155, 145, 185)
gender <- c("M", "F", "M", "M")
major <- c("chinese", "math", "BM", "finance")
age <- c(25, 25, 25, 24)
nsysu2 <- data.frame(weight, height, gender, major, age)
# rbind() stacks the new rows under the existing ones (same columns, more rows).
nsysu <- rbind(nsysu, nsysu2)