read file

# Load the car data set from the working directory.
auto <- read.csv(file = 'car.csv')

# Per-column overview of the freshly loaded data frame.
summary(object = auto)

##     buying             maint              doors             persons         
##  Length:1728        Length:1728        Length:1728        Length:1728       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    lug_boot            safety           car_accept       
##  Length:1728        Length:1728        Length:1728       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

# Every column of the car data is categorical, so all of them become factors.
# `doors` and `persons` were previously passed through as.numeric(), which
# turned their non-numeric level into NA (432 and 576 rows respectively, with
# "NAs introduced by coercion" warnings). Keeping them as factors preserves
# every observation.
categorical_cols <- c(
  "buying", "maint", "doors", "persons", "lug_boot", "safety", "car_accept"
)
auto[categorical_cols] <- lapply(auto[categorical_cols], as.factor)

# NOTE(review): the echoed output below assumes the non-numeric labels are
# "5more" (doors) and "more" (persons), as in the UCI Car Evaluation data --
# confirm by re-running against car.csv.
summary(auto)

##    buying      maint       doors     persons     lug_boot    safety   
##  high :432   high :432   2    :432   2   :576   big  :576   high:576  
##  low  :432   low  :432   3    :432   4   :576   med  :576   low :576  
##  med  :432   med  :432   4    :432   more:576   small:576   med :576  
##  vhigh:432   vhigh:432   5more:432                                    
##  car_accept  
##  acc  : 384  
##  good :  69  
##  unacc:1210  
##  vgood:  65

# Cross-tabulate lug_boot (rows) against car_accept (columns).
auto_freq <- table(auto[['lug_boot']], auto[['car_accept']])

Conditional percentage, my way

# Conditional distribution of lug_boot within each car_accept class: every
# column of auto_freq is divided by its own total and scaled to a percentage.
# Iterating over colnames(auto_freq) avoids hard-coding the class labels, and
# vapply() returns the result already oriented as lug_boot x car_accept, so no
# transpose or manual colnames<- step is needed.
percentage_auto_freq <- vapply(
  colnames(auto_freq),
  function(class_label) {
    counts <- auto_freq[, class_label]
    counts / sum(counts)
  },
  numeric(nrow(auto_freq))
) * 100
percentage_auto_freq

##            acc     good    unacc    vgood
## big   37.50000 34.78261 30.41322 61.53846
## med   35.15625 34.78261 32.39669 38.46154
## small 27.34375 30.43478 37.19008  0.00000

Conditional percentage, the teacher's way

# Express every entry of X as a percentage of the total of X.
cond_percent <- function(X) {
  share <- X / sum(X)
  share * 100
}

# Apply the helper column by column (MARGIN = 2) to the contingency table.
percentage_auto_freq <- apply(X = auto_freq, MARGIN = 2, FUN = cond_percent)
percentage_auto_freq

##        
##              acc     good    unacc    vgood
##   big   37.50000 34.78261 30.41322 61.53846
##   med   35.15625 34.78261 32.39669 38.46154
##   small 27.34375 30.43478 37.19008  0.00000

library(RColorBrewer)

# Palette used only by the disabled alternative plot below; the active barplot
# uses explicit colour names.
coul <- brewer.pal(5, "Set2")

# Use a 1 x 2 plotting grid for this figure and remember the previous
# graphical parameters, so they can be restored once the figure is drawn.
old_par <- par(mfrow = c(1, 2))

# Alternative view (disabled): lug_boot on the x-axis, one bar per class.
# barplot(t(auto_freq), legend.text = TRUE, xlab = "Lug Boot",
#         ylab = "Frequency", main = "Luggage Boot by Car Accept",
#         col = coul, beside = TRUE)

# Grouped bars: one group per car_accept class, one bar per lug_boot level.
# The legend labels come from the table itself instead of being hard-coded.
barplot(
  auto_freq,
  xlab = "Car Accept",
  ylab = "Frequency",
  main = "Car Accept by Luggage Boot",
  beside = TRUE,
  legend.text = rownames(auto_freq),
  col = c("lightblue", "pink", "lightgreen")
)

# Restore the previous layout so later plots are not squeezed into the grid.
par(old_par)

# Show the raw contingency table that feeds the test below.
print(auto_freq)

##        
##         acc good unacc vgood
##   big   144   24   368    40
##   med   135   24   392    25
##   small 105   21   450     0

# Pearson chi-squared test on the lug_boot x car_accept table.
chisq.test(x = auto_freq)

## 
##  Pearson's Chi-squared test
## 
## data:  auto_freq
## X-squared = 53.282, df = 6, p-value = 1.029e-09

Movies

# Load the movies data set from the working directory.
movies <- read.csv(file = 'movies.csv')

# Per-column overview of the freshly loaded data frame.
summary(object = movies)

##     Movie            LeadStudio        RottenTomatoes  AudienceScore  
##  Length:612         Length:612         Min.   : 0.00   Min.   :19.00  
##  Class :character   Class :character   1st Qu.:26.00   1st Qu.:48.00  
##  Mode  :character   Mode  :character   Median :47.50   Median :60.00  
##                                        Mean   :49.14   Mean   :60.41  
##                                        3rd Qu.:72.00   3rd Qu.:73.00  
##                                        Max.   :99.00   Max.   :96.00  
##     Story              Genre           TheatersOpenWeek OpeningWeekend   
##  Length:612         Length:612         Min.   :   2     Min.   :  0.032  
##  Class :character   Class :character   1st Qu.:2424     1st Qu.:  8.360  
##  Mode  :character   Mode  :character   Median :2858     Median : 14.780  
##                                        Mean   :2717     Mean   : 22.558  
##                                        3rd Qu.:3332     3rd Qu.: 27.762  
##                                        Max.   :4468     Max.   :169.190  
##  BOAvgOpenWeekend DomesticGross     ForeignGross       WorldGross      
##  Min.   : 1003    Min.   :  0.97   Min.   :   0.01   Min.   :   4.677  
##  1st Qu.: 3828    1st Qu.: 25.03   1st Qu.:  16.82   1st Qu.:  43.610  
##  Median : 5978    Median : 44.47   Median :  46.66   Median :  91.375  
##  Mean   : 8046    Mean   : 73.58   Mean   :  96.94   Mean   : 170.377  
##  3rd Qu.: 9715    3rd Qu.: 94.97   3rd Qu.: 103.43   3rd Qu.: 203.060  
##  Max.   :93230    Max.   :760.50   Max.   :2021.00   Max.   :2781.500  
##      Budget      Profitability       OpenProfit           Year     
##  Min.   :  0.5   Min.   :  18.17   Min.   :   0.34   Min.   :2007  
##  1st Qu.: 20.0   1st Qu.: 150.79   1st Qu.:  21.90   1st Qu.:2008  
##  Median : 38.5   Median : 253.78   Median :  37.41   Median :2009  
##  Mean   : 57.2   Mean   : 355.40   Mean   :  56.39   Mean   :2009  
##  3rd Qu.: 75.0   3rd Qu.: 394.61   3rd Qu.:  59.18   3rd Qu.:2010  
##  Max.   :300.0   Max.   :6694.40   Max.   :1368.00   Max.   :2011

# Columns treated as categorical.
factor_cols <- c('LeadStudio', 'Story', 'Genre', 'Year')

# Columns treated as numeric measurements.
numeric_cols <- c(
  'RottenTomatoes', 'AudienceScore', 'TheatersOpenWeek', 'OpeningWeekend',
  'BOAvgOpenWeekend', 'DomesticGross', 'ForeignGross', 'WorldGross',
  'Budget', 'Profitability', 'OpenProfit'
)

# Convert each group in one pass; `Movie` is deliberately left untouched.
movies[factor_cols] <- lapply(movies[factor_cols], as.factor)
movies[numeric_cols] <- lapply(movies[numeric_cols], as.numeric)

# Scatter plot of AudienceScore against RottenTomatoes.
plot(movies$RottenTomatoes, movies$AudienceScore)

# Pearson correlation between the two scores.
cor(movies$AudienceScore, movies$RottenTomatoes)

## [1] 0.6881901

# Simple linear regression of AudienceScore on RottenTomatoes. Supplying the
# data frame through `data =` (instead of `movies$...` inside the formula)
# gives the coefficients clean names and lets predict() accept `newdata`.
model <- lm(AudienceScore ~ RottenTomatoes, data = movies)
model

## 
## Call:
## lm(formula = AudienceScore ~ RottenTomatoes, data = movies)
## 
## Coefficients:
##    (Intercept)  RottenTomatoes  
##        39.5875          0.4238

# Redraw the scatter plot and overlay the fitted regression line.
plot(movies$RottenTomatoes, movies$AudienceScore)
abline(model)

library(MASS)

# Raw residuals of the fitted model.
residual <- resid(model)
# Studentized residuals, computed by studres().
stu.residual <- studres(model)

# Studentized residuals plotted against RottenTomatoes.
plot(
  x = movies$RottenTomatoes,
  y = stu.residual,
  main = "Studentized Residual Plot",
  xlab = 'Rotten Tomatoes',
  ylab = 'Residual'
)

# Reference lines: dashed red at -2 and 2, solid blue at 0.
abline(
  h = c(-2, 0, 2),
  col = c("red", "blue", "red"),
  lty = c(2, 1, 2),
  lwd = c(3, 2, 3)
)
# A second, default-styled line at zero is drawn on top of the blue one.
abline(h = 0)

# Histogram of the studentized residuals.
hist(stu.residual)

ERTL COMPANY

# Counts per column A, B and C; each vector holds three rows of the table.
A <- c(15, 15, 20)
B <- c(13, 11, 25)
C <- c(21, 13, 12)

# Assemble the 3 x 3 table and start the totals table as a copy of it.
company <- data.frame(A = A, B = B, C = C)
company_total <- company

add total by column

# Append a "Total" row holding the sum of each column. colSums() works on the
# data frame directly, so neither apply() nor a wrapper around sum() is needed.
company_total <- rbind(company, colSums(company))
row.names(company_total) <- c("Minor Defect", "Major Defect", "Good", "Total")
company_total

add total by row

# Append a "Total" column holding the sum of each row (including the totals
# row, whose entry becomes the grand total). rowSums() replaces apply() plus
# a wrapper around sum(), and naming the argument sets the column name
# directly.
company_total <- cbind(company_total, Total = rowSums(company_total))
company_total

# Row sums of the original 3 x 3 table.
Total <- rowSums(company)

# company_total is replaced here by the original table plus a Total column
# (the totals row built earlier is not carried over).
company_total <- cbind(company, Total = Total)
total <- company_total[['Total']]

# Share of each row in the grand total, as a percentage.
prop.table(total) * 100

## [1] 33.79310 26.89655 39.31034

#total

# Label the rows of the table.
rownames(company) <- c('Minor Defect', 'Major Defect', 'Good')
print(company)

# Pearson chi-squared test on the labelled table.
chisq.test(x = company)

## 
##  Pearson's Chi-squared test
## 
## data:  company
## X-squared = 7.2248, df = 4, p-value = 0.1245

Exploratory data analysis: Car Evaluation and Movies datasets

Andres Viloria

2022-10-29

read file

Conditional percentage, my way

Conditional percentage, the teacher's way

Movies

ERTL COMPANY

add total by column

add total by row