#https://www.kaggle.com/carolzhangdc/predict-imdb-score-with-data-mining-algorithms
library(dplyr)
library(tidyverse)
library(forecast)
library(leaps)
library(forecast)
library(caret)
library(car)
library(data.table)
library(VIM)
library(corrplot)
library(ggplot2)
library(ggcorrplot)
library(plotly)
library(ggrepel)
library(caret)
library(ggthemes)
library(rpart) # Popular decision tree algorithm
library(rattle) # Fancy tree plot
library(readr)
movie_metadata <- read_csv("movie_metadata.csv")
sum(duplicated(movie_metadata))
## [1] 45
movie_metadata <- movie_metadata[!duplicated(movie_metadata),] #removing duplicate row
str(movie_metadata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 4998 obs. of 28 variables:
## $ color : chr "Color" "Color" "Color" "Color" ...
## $ director_name : chr "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
## $ num_critic_for_reviews : num 723 302 602 813 NA 462 392 324 635 375 ...
## $ duration : num 178 169 148 164 NA 132 156 100 141 153 ...
## $ director_facebook_likes : num 0 563 0 22000 131 475 0 15 0 282 ...
## $ actor_3_facebook_likes : num 855 1000 161 23000 NA 530 4000 284 19000 10000 ...
## $ actor_2_name : chr "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
## $ actor_1_facebook_likes : num 1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
## $ gross : num 7.61e+08 3.09e+08 2.00e+08 4.48e+08 NA ...
## $ genres : chr "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
## $ actor_1_name : chr "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
## $ movie_title : chr "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
## $ num_voted_users : num 886204 471220 275868 1144337 8 ...
## $ cast_total_facebook_likes: num 4834 48350 11700 106759 143 ...
## $ actor_3_name : chr "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
## $ facenumber_in_poster : num 0 0 1 0 0 1 0 1 4 3 ...
## $ plot_keywords : chr "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
## $ movie_imdb_link : chr "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
## $ num_user_for_reviews : num 3054 1238 994 2701 NA ...
## $ language : chr "English" "English" "English" "English" ...
## $ country : chr "USA" "USA" "UK" "USA" ...
## $ content_rating : chr "PG-13" "PG-13" "PG-13" "PG-13" ...
## $ budget : num 2.37e+08 3.00e+08 2.45e+08 2.50e+08 NA ...
## $ title_year : num 2009 2007 2015 2012 NA ...
## $ actor_2_facebook_likes : num 936 5000 393 23000 12 632 11000 553 21000 11000 ...
## $ imdb_score : num 7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
## $ aspect_ratio : num 1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
## $ movie_facebook_likes : num 33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
#dealing with missing data
sum(is.na(movie_metadata)) #2674 of null values
## [1] 2674
colSums(sapply(movie_metadata, is.na))
## color director_name num_critic_for_reviews
## 19 103 49
## duration director_facebook_likes actor_3_facebook_likes
## 15 103 23
## actor_2_name actor_1_facebook_likes gross
## 13 7 874
## genres actor_1_name movie_title
## 0 7 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 23
## facenumber_in_poster plot_keywords movie_imdb_link
## 13 152 0
## num_user_for_reviews language country
## 21 12 5
## content_rating budget title_year
## 301 487 107
## actor_2_facebook_likes imdb_score aspect_ratio
## 13 0 327
## movie_facebook_likes
## 0
missing.values <- aggr(movie_metadata, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2) #visualize missing value
##
## Variables sorted by number of missings:
## Variable Count
## gross 0.174869948
## budget 0.097438976
## aspect_ratio 0.065426170
## content_rating 0.060224090
## plot_keywords 0.030412165
## title_year 0.021408563
## director_name 0.020608243
## director_facebook_likes 0.020608243
## num_critic_for_reviews 0.009803922
## actor_3_facebook_likes 0.004601841
## actor_3_name 0.004601841
## num_user_for_reviews 0.004201681
## color 0.003801521
## duration 0.003001200
## actor_2_name 0.002601040
## facenumber_in_poster 0.002601040
## actor_2_facebook_likes 0.002601040
## language 0.002400960
## actor_1_facebook_likes 0.001400560
## actor_1_name 0.001400560
## country 0.001000400
## genres 0.000000000
## movie_title 0.000000000
## num_voted_users 0.000000000
## cast_total_facebook_likes 0.000000000
## movie_imdb_link 0.000000000
## imdb_score 0.000000000
## movie_facebook_likes 0.000000000
#Gross is missing 17% or data and budget is 9% of data, hence we just have to remove them
movie_metadata <- movie_metadata[!is.na(movie_metadata$gross),]
movie_metadata <- movie_metadata[!is.na(movie_metadata$budget),]
colSums(sapply(movie_metadata, is.na))
## color director_name num_critic_for_reviews
## 2 0 1
## duration director_facebook_likes actor_3_facebook_likes
## 1 0 10
## actor_2_name actor_1_facebook_likes gross
## 5 3 0
## genres actor_1_name movie_title
## 0 3 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 10
## facenumber_in_poster plot_keywords movie_imdb_link
## 6 31 0
## num_user_for_reviews language country
## 0 3 0
## content_rating budget title_year
## 51 0 0
## actor_2_facebook_likes imdb_score aspect_ratio
## 5 0 74
## movie_facebook_likes
## 0
missing.values <- aggr(movie_metadata, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2) #visualize missing value
##
## Variables sorted by number of missings:
## Variable Count
## aspect_ratio 0.0191858958
## content_rating 0.0132227120
## plot_keywords 0.0080373347
## actor_3_facebook_likes 0.0025926886
## actor_3_name 0.0025926886
## facenumber_in_poster 0.0015556132
## actor_2_name 0.0012963443
## actor_2_facebook_likes 0.0012963443
## actor_1_facebook_likes 0.0007778066
## actor_1_name 0.0007778066
## language 0.0007778066
## color 0.0005185377
## num_critic_for_reviews 0.0002592689
## duration 0.0002592689
## director_name 0.0000000000
## director_facebook_likes 0.0000000000
## gross 0.0000000000
## genres 0.0000000000
## movie_title 0.0000000000
## num_voted_users 0.0000000000
## cast_total_facebook_likes 0.0000000000
## movie_imdb_link 0.0000000000
## num_user_for_reviews 0.0000000000
## country 0.0000000000
## budget 0.0000000000
## title_year 0.0000000000
## imdb_score 0.0000000000
## movie_facebook_likes 0.0000000000
# aspect ratio still has 74 missing value, lets inspect that
table(movie_metadata$aspect_ratio)
##
## 1.18 1.33 1.37 1.5 1.66 1.75 1.77 1.78 1.85 2 2.2 2.24 2.35 2.39 2.4 2.55
## 1 19 50 1 40 2 1 41 1600 3 10 1 1995 11 3 1
## 2.76 16
## 3 1
movie_metadata$aspect_ratio[is.na(movie_metadata$aspect_ratio)] <- 0 # creplacing the null aspect ratio with 0
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 1.85]) #checking the mean of gross
## [1] 44123725
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 2.35])
## [1] 58306242
mean(movie_metadata$gross[movie_metadata$aspect_ratio != 1.85 & movie_metadata$aspect_ratio != 2.35])
## [1] 36073031
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 0]) # We can tell that with aspect ratio being null value the gross will be significantly less. We will keep this as is aspect ratio has a hiarchical structure. The higher Aspecct ratio the higher the gross are.
## [1] 2605095
summary(movie_metadata$gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 162 6754898 27829874 50912638 65452312 760505847
# deal with other missing variable
colSums(sapply(movie_metadata, is.na)) #lets start with content rating
## color director_name num_critic_for_reviews
## 2 0 1
## duration director_facebook_likes actor_3_facebook_likes
## 1 0 10
## actor_2_name actor_1_facebook_likes gross
## 5 3 0
## genres actor_1_name movie_title
## 0 3 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 10
## facenumber_in_poster plot_keywords movie_imdb_link
## 6 31 0
## num_user_for_reviews language country
## 0 3 0
## content_rating budget title_year
## 51 0 0
## actor_2_facebook_likes imdb_score aspect_ratio
## 5 0 0
## movie_facebook_likes
## 0
table(movie_metadata$content_rating)
##
## Approved G GP M NC-17 Not Rated Passed PG
## 17 91 1 2 6 42 3 573
## PG-13 R Unrated X
## 1314 1723 24 10
movie_metadata <-movie_metadata [!is.na(movie_metadata$content_rating),] # removing null value in movie_metadata
sum(is.na(movie_metadata$content_rating)) # double checking the sum of missing value
## [1] 0
table(movie_metadata$content_rating)
##
## Approved G GP M NC-17 Not Rated Passed PG
## 17 91 1 2 6 42 3 573
## PG-13 R Unrated X
## 1314 1723 24 10
#replacing rating with the modern rating
movie_metadata$content_rating[movie_metadata$content_rating == "M"] <- "PG"
movie_metadata$content_rating[movie_metadata$content_rating == "GP"] <- "PG"
movie_metadata$content_rating[movie_metadata$content_rating == "X"] <- "NC-17"
table(movie_metadata$content_rating)
##
## Approved G NC-17 Not Rated Passed PG PG-13 R
## 17 91 16 42 3 576 1314 1723
## Unrated
## 24
#replacing the rest of rating with the most common rating of R
movie_metadata$content_rating[movie_metadata$content_rating == "Approved"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Not Rated"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Passed"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Unrated"] <- "R"
table(movie_metadata$content_rating)
##
## G NC-17 PG PG-13 R
## 91 16 576 1314 1809
# Now lets look the rest
colSums(sapply(movie_metadata, is.na))
## color director_name num_critic_for_reviews
## 2 0 1
## duration director_facebook_likes actor_3_facebook_likes
## 0 0 6
## actor_2_name actor_1_facebook_likes gross
## 2 1 0
## genres actor_1_name movie_title
## 0 1 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 6
## facenumber_in_poster plot_keywords movie_imdb_link
## 6 21 0
## num_user_for_reviews language country
## 0 2 0
## content_rating budget title_year
## 0 0 0
## actor_2_facebook_likes imdb_score aspect_ratio
## 2 0 0
## movie_facebook_likes
## 0
# replace NA with column average for facenumber_in_poster
movie_metadata$facenumber_in_poster[is.na(movie_metadata$facenumber_in_poster)] <- round(mean(movie_metadata$facenumber_in_poster, na.rm = TRUE))
# convert 0s into NAs for other predictors
movie_metadata[,c(5,6,8,13,24,26)][movie_metadata[,c(5,6,8,13,24,26)] == 0] <- NA
# impute missing value with column mean
movie_metadata$num_critic_for_reviews[is.na(movie_metadata$num_critic_for_reviews)] <- round(mean(movie_metadata$num_critic_for_reviews, na.rm = TRUE))
movie_metadata$duration[is.na(movie_metadata$duration)] <- round(mean(movie_metadata$duration, na.rm = TRUE))
movie_metadata$director_facebook_likes[is.na(movie_metadata$director_facebook_likes)] <- round(mean(movie_metadata$director_facebook_likes, na.rm = TRUE))
movie_metadata$actor_3_facebook_likes[is.na(movie_metadata$actor_3_facebook_likes)] <- round(mean(movie_metadata$actor_3_facebook_likes, na.rm = TRUE))
movie_metadata$actor_1_facebook_likes[is.na(movie_metadata$actor_1_facebook_likes)] <- round(mean(movie_metadata$actor_1_facebook_likes, na.rm = TRUE))
movie_metadata$cast_total_facebook_likes[is.na(movie_metadata$cast_total_facebook_likes)] <- round(mean(movie_metadata$cast_total_facebook_likes, na.rm = TRUE))
movie_metadata$actor_2_facebook_likes[is.na(movie_metadata$actor_2_facebook_likes)] <- round(mean(movie_metadata$actor_2_facebook_likes, na.rm = TRUE))
movie_metadata$movie_facebook_likes[is.na(movie_metadata$movie_facebook_likes)] <- round(mean(movie_metadata$movie_facebook_likes, na.rm = TRUE))
colSums(sapply(movie_metadata, is.na)) #check out the result (We still have some left)
## color director_name num_critic_for_reviews
## 2 0 0
## duration director_facebook_likes actor_3_facebook_likes
## 0 0 0
## actor_2_name actor_1_facebook_likes gross
## 2 0 0
## genres actor_1_name movie_title
## 0 1 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 6
## facenumber_in_poster plot_keywords movie_imdb_link
## 0 21 0
## num_user_for_reviews language country
## 0 2 0
## content_rating budget title_year
## 0 0 0
## actor_2_facebook_likes imdb_score aspect_ratio
## 0 0 0
## movie_facebook_likes
## 0
# See does language matter
table(movie_metadata$language) # We can tell mostly is English, hence it shouldn't matter to us.
##
## Aboriginal Arabic Aramaic Bosnian Cantonese Czech Danish
## 2 1 1 1 7 1 3
## Dari Dutch English Filipino French German Hebrew
## 2 3 3644 1 34 11 2
## Hindi Hungarian Indonesian Italian Japanese Kazakh Korean
## 5 1 2 7 10 1 5
## Mandarin Maya Mongolian None Norwegian Persian Portuguese
## 14 1 1 1 4 3 5
## Romanian Russian Spanish Thai Vietnamese Zulu
## 1 1 24 3 1 1
movie_metadata <- movie_metadata[,-c(20) ] # We drop language column as a result
# See does Color matter
colSums(sapply(movie_metadata, is.na))
## color director_name num_critic_for_reviews
## 2 0 0
## duration director_facebook_likes actor_3_facebook_likes
## 0 0 0
## actor_2_name actor_1_facebook_likes gross
## 2 0 0
## genres actor_1_name movie_title
## 0 1 0
## num_voted_users cast_total_facebook_likes actor_3_name
## 0 0 6
## facenumber_in_poster plot_keywords movie_imdb_link
## 0 21 0
## num_user_for_reviews country content_rating
## 0 0 0
## budget title_year actor_2_facebook_likes
## 0 0 0
## imdb_score aspect_ratio movie_facebook_likes
## 0 0 0
table(movie_metadata$color)
##
## Black and White Color
## 124 3680
movie_metadata <- movie_metadata[,-1] # It does not matter, hence we decided to drop it.
# There are 30 more missing value (Since it is so small (< 1%) compare to our sample, we decided to drop it.)
colSums(sapply(movie_metadata, is.na))
## director_name num_critic_for_reviews duration
## 0 0 0
## director_facebook_likes actor_3_facebook_likes actor_2_name
## 0 0 2
## actor_1_facebook_likes gross genres
## 0 0 0
## actor_1_name movie_title num_voted_users
## 1 0 0
## cast_total_facebook_likes actor_3_name facenumber_in_poster
## 0 6 0
## plot_keywords movie_imdb_link num_user_for_reviews
## 21 0 0
## country content_rating budget
## 0 0 0
## title_year actor_2_facebook_likes imdb_score
## 0 0 0
## aspect_ratio movie_facebook_likes
## 0 0
sum(is.na(movie_metadata))
## [1] 30
movie_metadata <- na.omit(movie_metadata)
colSums(sapply(movie_metadata, is.na))
## director_name num_critic_for_reviews duration
## 0 0 0
## director_facebook_likes actor_3_facebook_likes actor_2_name
## 0 0 0
## actor_1_facebook_likes gross genres
## 0 0 0
## actor_1_name movie_title num_voted_users
## 0 0 0
## cast_total_facebook_likes actor_3_name facenumber_in_poster
## 0 0 0
## plot_keywords movie_imdb_link num_user_for_reviews
## 0 0 0
## country content_rating budget
## 0 0 0
## title_year actor_2_facebook_likes imdb_score
## 0 0 0
## aspect_ratio movie_facebook_likes
## 0 0
# Now lets look at language
table(movie_metadata$country) # We can tell most movie are from US and UK, we will create three catergory. (US, UK and Other)
##
## Afghanistan Argentina Aruba Australia Belgium
## 1 3 1 40 1
## Brazil Canada Chile China Colombia
## 5 60 1 13 1
## Czech Republic Denmark Finland France Georgia
## 3 9 1 102 1
## Germany Greece Hong Kong Hungary Iceland
## 79 1 13 2 1
## India Indonesia Iran Ireland Israel
## 5 1 4 7 2
## Italy Japan Mexico Netherlands New Line
## 11 15 6 3 1
## New Zealand Norway Official site Peru Philippines
## 11 4 1 1 1
## Poland Romania Russia South Africa South Korea
## 1 2 3 3 8
## Spain Taiwan Thailand UK USA
## 21 2 4 315 3008
## West Germany
## 1
movie_metadata$country[movie_metadata$country != "USA" & movie_metadata$country !="UK"] <- "Others"
table(movie_metadata$country)
##
## Others UK USA
## 456 315 3008
library(stringr)
movie_metadata$movie_title <- gsub("Â", "", as.character(factor(movie_metadata$movie_title)))
str_trim(movie_metadata$movie_title, side = "right")
# create a new data frame
genres.df <- as.data.frame(movie_metadata[,c("genres", "gross")])
# separate different genres into new columns
genres.df$Action <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Action") 1 else 0)
genres.df$Adventure <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Adventure") 1 else 0)
genres.df$Animation <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Animation") 1 else 0)
genres.df$Biography <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Biography") 1 else 0)
genres.df$Comedy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Comedy") 1 else 0)
genres.df$Crime <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Crime") 1 else 0)
genres.df$Documentary <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Documentary") 1 else 0)
genres.df$Drama <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Drama") 1 else 0)
genres.df$Family <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Family") 1 else 0)
genres.df$Fantasy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Fantasy") 1 else 0)
genres.df$`Film-Noir` <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Film-Noir") 1 else 0)
genres.df$History <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "History") 1 else 0)
genres.df$Horror <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Horror") 1 else 0)
genres.df$Musical <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Musical") 1 else 0)
genres.df$Mystery <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Mystery") 1 else 0)
genres.df$News <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "News") 1 else 0)
genres.df$Romance <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Romance") 1 else 0)
genres.df$`Sci-Fi` <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sci-Fi") 1 else 0)
genres.df$Short <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Short") 1 else 0)
genres.df$Sport <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sport") 1 else 0)
genres.df$Thriller <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Thriller") 1 else 0)
genres.df$War <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "War") 1 else 0)
genres.df$Western <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Western") 1 else 0)
# get the mean of Gross for different genres
means <- rep(0,23)
for (i in 1:23) {
means[i] <- mean(genres.df$gross[genres.df[i+2]==1])
}
head(means)
## [1] 77289281 100214983 109074453 36175004 50842175 39192003
# plot the means
barplot(means/10^6, main = "Average gross for different genres")
head(genres.df$action)
## NULL
# We found out genre does matter to the Gross, hence we combine both df
movie_metadata <- cbind(movie_metadata, genres.df)
movie_metadata <- movie_metadata[, -c(9, 27, 28)]
hist(movie_metadata$title_year) # find out movie release before 1980 is probably irrelevent, hence we remove any movie that is release before 1980
movie_metadata <- movie_metadata[movie_metadata$title_year>= 1980, ]
movie_metadata %>%
plot_ly(x = ~movie_facebook_likes, y = ~gross, color = ~content_rating , mode = "markers", text = ~content_rating, alpha = 0.7, type = "scatter")
movie_metadata$ROI <- round(movie_metadata$gross / movie_metadata$budget *100, 2)
movie_metadata$profitable <- ifelse(movie_metadata$ROI > 1, 1, 0)
summary(movie_metadata$ROI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 46.7 106.9 584.8 215.0 719448.6
summary(movie_metadata$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.9718 1.0000 1.0000
summary(movie_metadata$gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 162 7667175 28902322 51809423 66438272 760505847
# Looking at the top 25 gross movie ROI
movie_metadata %>%
filter(budget > 100000) %>%
arrange(desc(gross)) %>%
top_n(25, gross) %>%
ggplot(aes(x = budget/10^6, y= ROI)) + geom_point() + geom_smooth() + geom_text_repel(aes(label = movie_title), size = 3) +
labs(x = "Budget in Millions ($)", y = "ROI (%)", title = "Top 25 movie ROI base on gross" )+
theme_economist() +
scale_color_economist()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
uniqueN(movie_metadata$director_name)
## [1] 1643
uniqueN(movie_metadata$actor_1_name)
## [1] 1409
uniqueN(movie_metadata$actor_3_name)
## [1] 2557
uniqueN(movie_metadata$actor_2_name)
## [1] 2150
uniqueN(movie_metadata$plot_keywords)
## [1] 3621
# all of them are all different, hence it make no sense to use them to predict. We decided to drop movie title, plot_keywords and movie_imdb_link too.
final_df <- subset(movie_metadata, select = -c(director_name, actor_2_name, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link))
#Checking for genre that is not usable and drop it
sum(uniqueN(final_df$Action))
## [1] 2
sum(uniqueN(final_df$Adventure))
## [1] 2
sum(uniqueN(final_df$Animation))
## [1] 2
sum(uniqueN(final_df$Biography))
## [1] 2
sum(uniqueN(final_df$Comedy))
## [1] 2
sum(uniqueN(final_df$Crime))
## [1] 2
sum(uniqueN(final_df$Documentary))
## [1] 2
sum(uniqueN(final_df$Drama))
## [1] 2
sum(uniqueN(final_df$Family))
## [1] 2
sum(uniqueN(final_df$Fantasy))
## [1] 2
sum(uniqueN(final_df$`Film-Noir`)) # Remove
## [1] 1
sum(uniqueN(final_df$History))
## [1] 2
sum(uniqueN(final_df$Horror))
## [1] 2
sum(uniqueN(final_df$Musical))
## [1] 2
sum(uniqueN(final_df$Mystery))
## [1] 2
sum(uniqueN(final_df$News)) # Remove
## [1] 1
sum(uniqueN(final_df$Romance))
## [1] 2
sum(uniqueN(final_df$`Sci-Fi`))
## [1] 2
sum(uniqueN(final_df$Short)) # Remove
## [1] 1
sum(uniqueN(final_df$Thriller))
## [1] 2
sum(uniqueN(final_df$War))
## [1] 2
sum(uniqueN(final_df$Western))
## [1] 2
final_df <- subset(final_df, select = -c(`Film-Noir`, News, Short))
# factor country and content rating
final_df$country <- (as.factor(final_df$country))
final_df$content_rating <- (as.factor(final_df$content_rating))
str(final_df)
## 'data.frame': 3686 obs. of 40 variables:
## $ num_critic_for_reviews : num 723 302 602 813 462 392 324 635 375 673 ...
## $ duration : num 178 169 148 164 132 156 100 141 153 183 ...
## $ director_facebook_likes : num 959 563 959 22000 475 959 15 959 282 959 ...
## $ actor_3_facebook_likes : num 855 1000 161 23000 530 4000 284 19000 10000 2000 ...
## $ actor_1_facebook_likes : num 1000 40000 11000 27000 640 24000 799 26000 25000 15000 ...
## $ gross : num 7.61e+08 3.09e+08 2.00e+08 4.48e+08 7.31e+07 ...
## $ num_voted_users : num 886204 471220 275868 1144337 212204 ...
## $ cast_total_facebook_likes: num 4834 48350 11700 106759 1873 ...
## $ facenumber_in_poster : num 0 0 1 0 1 0 1 4 3 0 ...
## $ num_user_for_reviews : num 3054 1238 994 2701 738 ...
## $ country : Factor w/ 3 levels "Others","UK",..: 3 3 2 3 3 3 3 3 2 3 ...
## $ content_rating : Factor w/ 5 levels "G","NC-17","PG",..: 4 4 4 4 4 4 3 4 3 4 ...
## $ budget : num 2.37e+08 3.00e+08 2.45e+08 2.50e+08 2.64e+08 ...
## $ title_year : num 2009 2007 2015 2012 2012 ...
## $ actor_2_facebook_likes : num 936 5000 393 23000 632 11000 553 21000 11000 4000 ...
## $ imdb_score : num 7.9 7.1 6.8 8.5 6.6 6.2 7.8 7.5 7.5 6.9 ...
## $ aspect_ratio : num 1.78 2.35 2.35 2.35 2.35 2.35 1.85 2.35 2.35 2.35 ...
## $ movie_facebook_likes : num 33000 0 85000 164000 24000 0 29000 118000 10000 197000 ...
## $ Action : num 1 1 1 1 1 1 0 1 0 1 ...
## $ Adventure : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Animation : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Biography : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Crime : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Documentary : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Family : num 0 0 0 0 0 0 1 0 1 0 ...
## $ Fantasy : num 1 1 0 0 0 0 1 0 1 0 ...
## $ History : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Mystery : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Romance : num 0 0 0 0 0 1 1 0 0 0 ...
## $ Sci-Fi : num 1 0 0 0 1 0 0 1 0 1 ...
## $ Sport : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Thriller : num 0 0 1 1 0 0 0 0 0 0 ...
## $ War : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ROI : num 320.9 103.1 81.7 179.2 27.7 ...
## $ profitable : num 1 1 1 1 1 1 1 1 1 1 ...
final_df <- subset(final_df, select = -c(ROI, profitable)) #removing ROI and Profitable column
# try basic lm
reg1 <- lm(gross ~ . , data = final_df)
summary(reg1)
##
## Call:
## lm(formula = gross ~ ., data = final_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -352431861 -21863672 -2288584 16609028 451581160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.607e+09 2.678e+08 5.998 2.19e-09 ***
## num_critic_for_reviews 1.173e+05 1.187e+04 9.880 < 2e-16 ***
## duration 3.545e+05 4.369e+04 8.115 6.56e-16 ***
## director_facebook_likes -1.466e+03 2.623e+02 -5.591 2.43e-08 ***
## actor_3_facebook_likes -8.800e+03 1.105e+03 -7.964 2.20e-15 ***
## actor_1_facebook_likes -7.596e+03 6.722e+02 -11.301 < 2e-16 ***
## num_voted_users 1.723e+02 9.803e+00 17.577 < 2e-16 ***
## cast_total_facebook_likes 7.555e+03 6.703e+02 11.270 < 2e-16 ***
## facenumber_in_poster -6.501e+05 3.819e+05 -1.702 0.088795 .
## num_user_for_reviews 1.783e+04 3.193e+03 5.584 2.52e-08 ***
## countryUK -7.906e+05 3.375e+06 -0.234 0.814819
## countryUSA 1.545e+07 2.346e+06 6.586 5.17e-11 ***
## content_ratingNC-17 -2.991e+07 1.361e+07 -2.197 0.028059 *
## content_ratingPG 3.271e+06 5.824e+06 0.562 0.574369
## content_ratingPG-13 6.601e+06 6.539e+06 1.010 0.312789
## content_ratingR -1.554e+07 6.549e+06 -2.373 0.017701 *
## budget 5.707e-03 3.266e-03 1.747 0.080669 .
## title_year -8.231e+05 1.333e+05 -6.173 7.45e-10 ***
## actor_2_facebook_likes -7.023e+03 7.122e+02 -9.861 < 2e-16 ***
## imdb_score -6.373e+05 9.928e+05 -0.642 0.520967
## aspect_ratio -3.154e+05 1.837e+06 -0.172 0.863713
## movie_facebook_likes -8.589e+01 5.079e+01 -1.691 0.090894 .
## Action 8.479e+06 2.149e+06 3.946 8.11e-05 ***
## Adventure 1.185e+07 2.356e+06 5.032 5.08e-07 ***
## Animation 2.099e+07 4.356e+06 4.817 1.51e-06 ***
## Biography -1.543e+06 3.406e+06 -0.453 0.650554
## Comedy 7.262e+06 2.008e+06 3.617 0.000302 ***
## Crime -4.187e+06 2.212e+06 -1.893 0.058464 .
## Documentary 1.262e+07 6.756e+06 1.868 0.061891 .
## Drama -1.105e+07 1.956e+06 -5.649 1.74e-08 ***
## Family 2.859e+07 3.997e+06 7.151 1.03e-12 ***
## Fantasy 2.481e+06 2.414e+06 1.028 0.304094
## History -7.452e+06 4.360e+06 -1.709 0.087526 .
## Horror -6.120e+06 2.962e+06 -2.066 0.038914 *
## Musical -4.368e+06 5.286e+06 -0.826 0.408680
## Mystery -1.591e+06 2.638e+06 -0.603 0.546362
## Romance 3.840e+06 1.930e+06 1.989 0.046737 *
## `Sci-Fi` -7.444e+06 2.470e+06 -3.013 0.002602 **
## Sport 2.449e+06 3.883e+06 0.631 0.528296
## Thriller 2.486e+06 2.103e+06 1.182 0.237195
## War 3.321e+06 4.165e+06 0.797 0.425306
## Western -1.059e+07 6.317e+06 -1.676 0.093821 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44340000 on 3644 degrees of freedom
## Multiple R-squared: 0.599, Adjusted R-squared: 0.5945
## F-statistic: 132.7 on 41 and 3644 DF, p-value: < 2.2e-16
final_df_matrix <- as.matrix(final_df[, c(-11,-12,-39)]) #getting rid of factor variable (country, content rating and gross_category)
m <- cor(final_df_matrix)
corrplot(m, method = "circle", type = "upper", order = "hclust")
# Cut gross in to percentile (cut it by 0 to 25 percentile,25 to 75 percentile, Above 75 percentile)
summary(final_df$gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 162 7667175 28902322 51809423 66438272 760505847
final_df$gross_catogorical <- cut(final_df$gross/10^6, breaks = c(0, 7, 66, 761)) # change gross in millions
sum(is.na(final_df$gross_catogorical)) # double checking the completeness of data
## [1] 0
# partition data
set.seed(1) # set seed for reproducing the partition
train.index <- sample(c(1:3686), 3686*0.6)
#Create and set aside the remaining 40% of the data, to be used after omitting unhelpful data points and unnecessary variables.
train.df <- final_df[train.index,]
valid.df <- final_df[-train.index,]
library(rpart)
reg2 <- lm(gross ~ . , data = train.df)
summary(reg2)
##
## Call:
## lm(formula = gross ~ ., data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220666456 -14855093 469990 12121669 433329462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.742e+08 2.843e+08 2.723 0.00652 **
## num_critic_for_reviews 5.344e+04 1.297e+04 4.121 3.91e-05 ***
## duration 7.498e+04 4.632e+04 1.619 0.10564
## director_facebook_likes -1.103e+03 2.717e+02 -4.058 5.13e-05 ***
## actor_3_facebook_likes -8.193e+03 1.139e+03 -7.192 8.75e-13 ***
## actor_1_facebook_likes -7.209e+03 6.855e+02 -10.516 < 2e-16 ***
## num_voted_users 9.902e+01 9.928e+00 9.974 < 2e-16 ***
## cast_total_facebook_likes 7.124e+03 6.844e+02 10.408 < 2e-16 ***
## facenumber_in_poster -1.892e+05 4.118e+05 -0.459 0.64597
## num_user_for_reviews 1.346e+04 3.258e+03 4.132 3.74e-05 ***
## countryUK -5.033e+05 3.559e+06 -0.141 0.88755
## countryUSA 4.982e+06 2.532e+06 1.968 0.04923 *
## content_ratingNC-17 -5.453e+06 1.380e+07 -0.395 0.69289
## content_ratingPG 6.331e+06 5.718e+06 1.107 0.26838
## content_ratingPG-13 1.109e+07 6.553e+06 1.692 0.09086 .
## content_ratingR -1.501e+05 6.609e+06 -0.023 0.98189
## budget 2.543e-02 9.090e-03 2.798 0.00519 **
## title_year -3.978e+05 1.415e+05 -2.811 0.00499 **
## actor_2_facebook_likes -7.289e+03 7.208e+02 -10.112 < 2e-16 ***
## imdb_score -2.214e+05 1.031e+06 -0.215 0.82996
## aspect_ratio -1.241e+06 1.729e+06 -0.718 0.47309
## movie_facebook_likes 8.997e+01 5.518e+01 1.630 0.10316
## Action 3.320e+06 2.218e+06 1.497 0.13460
## Adventure 1.111e+07 2.514e+06 4.419 1.04e-05 ***
## Animation 1.901e+07 4.549e+06 4.178 3.06e-05 ***
## Biography 6.611e+05 3.575e+06 0.185 0.85331
## Comedy 2.055e+06 2.123e+06 0.968 0.33331
## Crime -1.879e+06 2.338e+06 -0.804 0.42156
## Documentary 1.050e+07 6.868e+06 1.528 0.12655
## Drama -2.059e+06 2.073e+06 -0.993 0.32070
## Family 1.169e+07 4.316e+06 2.710 0.00679 **
## Fantasy 3.309e+06 2.519e+06 1.313 0.18919
## History -2.166e+06 4.485e+06 -0.483 0.62927
## Horror -4.988e+05 3.104e+06 -0.161 0.87232
## Musical -9.219e+05 5.304e+06 -0.174 0.86203
## Mystery 4.281e+05 2.837e+06 0.151 0.88008
## Romance 2.912e+06 1.990e+06 1.463 0.14366
## `Sci-Fi` -1.527e+06 2.622e+06 -0.582 0.56043
## Sport 3.114e+06 3.927e+06 0.793 0.42800
## Thriller -4.775e+05 2.216e+06 -0.215 0.82941
## War -3.906e+06 4.204e+06 -0.929 0.35294
## Western -8.329e+06 6.226e+06 -1.338 0.18112
## gross_catogorical(7,66] 1.009e+07 2.150e+06 4.691 2.89e-06 ***
## gross_catogorical(66,761] 8.641e+07 3.006e+06 28.750 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35520000 on 2167 degrees of freedom
## Multiple R-squared: 0.7416, Adjusted R-squared: 0.7364
## F-statistic: 144.6 on 43 and 2167 DF, p-value: < 2.2e-16
vif(reg2)
## GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews 4.322856 1 2.079148
## duration 1.766013 1 1.328914
## director_facebook_likes 1.168806 1 1.081113
## actor_3_facebook_likes 8.175606 1 2.859302
## actor_1_facebook_likes 271.233874 1 16.469180
## num_voted_users 3.812219 1 1.952490
## cast_total_facebook_likes 383.393845 1 19.580445
## facenumber_in_poster 1.142121 1 1.068701
## num_user_for_reviews 3.024278 1 1.739045
## country 1.247509 2 1.056844
## content_rating 4.482774 4 1.206267
## budget 1.221955 1 1.105421
## title_year 2.128414 1 1.458909
## actor_2_facebook_likes 21.846923 1 4.674069
## imdb_score 2.033370 1 1.425963
## aspect_ratio 1.114447 1 1.055674
## movie_facebook_likes 2.251619 1 1.500540
## Action 1.623187 1 1.274043
## Adventure 1.782783 1 1.335209
## Animation 2.007143 1 1.416737
## Biography 1.275232 1 1.129262
## Comedy 1.893194 1 1.375934
## Crime 1.424992 1 1.193730
## Documentary 1.178901 1 1.085772
## Drama 1.882076 1 1.371888
## Family 3.352694 1 1.831036
## Fantasy 1.311481 1 1.145199
## History 1.303287 1 1.141616
## Horror 1.524613 1 1.234752
## Musical 1.153285 1 1.073911
## Mystery 1.207384 1 1.098810
## Romance 1.238638 1 1.112941
## `Sci-Fi` 1.328006 1 1.152391
## Sport 1.100352 1 1.048977
## Thriller 1.751239 1 1.323344
## War 1.234893 1 1.111258
## Western 1.058230 1 1.028703
## gross_catogorical 2.118734 2 1.206477
reg2 <- lm(gross ~ . -actor_1_facebook_likes -cast_total_facebook_likes -actor_2_facebook_likes -gross_catogorical , data = train.df) # Removed colinear variables
reg.step <- step(reg2, direction = "both")
## Start: AIC=77988.88
## gross ~ (num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + actor_1_facebook_likes + num_voted_users +
## cast_total_facebook_likes + facenumber_in_poster + num_user_for_reviews +
## country + content_rating + budget + title_year + actor_2_facebook_likes +
## imdb_score + aspect_ratio + movie_facebook_likes + Action +
## Adventure + Animation + Biography + Comedy + Crime + Documentary +
## Drama + Family + Fantasy + History + Horror + Musical + Mystery +
## Romance + `Sci-Fi` + Sport + Thriller + War + Western + gross_catogorical) -
## actor_1_facebook_likes - cast_total_facebook_likes - actor_2_facebook_likes -
## gross_catogorical
##
## Df Sum of Sq RSS AIC
## - movie_facebook_likes 1 4.0329e+12 4.4483e+18 77987
## - Fantasy 1 3.2144e+13 4.4483e+18 77987
## - War 1 4.5981e+13 4.4483e+18 77987
## - aspect_ratio 1 8.5974e+13 4.4484e+18 77987
## - Sport 1 2.8756e+14 4.4486e+18 77987
## - Mystery 1 2.8912e+14 4.4486e+18 77987
## - Biography 1 3.4704e+14 4.4486e+18 77987
## - imdb_score 1 1.2778e+15 4.4495e+18 77988
## - Documentary 1 1.3585e+15 4.4496e+18 77988
## - Thriller 1 1.6021e+15 4.4499e+18 77988
## - facenumber_in_poster 1 1.6807e+15 4.4500e+18 77988
## - History 1 2.2026e+15 4.4505e+18 77988
## - Musical 1 2.4439e+15 4.4507e+18 77988
## - Romance 1 2.6570e+15 4.4509e+18 77988
## - Crime 1 3.5636e+15 4.4518e+18 77989
## <none> 4.4483e+18 77989
## - Horror 1 4.4982e+15 4.4528e+18 77989
## - Western 1 4.5774e+15 4.4528e+18 77989
## - `Sci-Fi` 1 1.2077e+16 4.4603e+18 77993
## - Adventure 1 2.2116e+16 4.4704e+18 77998
## - Action 1 2.5694e+16 4.4740e+18 78000
## - num_user_for_reviews 1 2.6205e+16 4.4745e+18 78000
## - Comedy 1 2.6999e+16 4.4753e+18 78000
## - director_facebook_likes 1 3.0964e+16 4.4792e+18 78002
## - Animation 1 3.1588e+16 4.4799e+18 78003
## - Drama 1 3.2914e+16 4.4812e+18 78003
## - budget 1 3.4660e+16 4.4829e+18 78004
## - Family 1 5.3064e+16 4.5013e+18 78013
## - title_year 1 6.5325e+16 4.5136e+18 78019
## - duration 1 6.5963e+16 4.5142e+18 78019
## - actor_3_facebook_likes 1 6.7538e+16 4.5158e+18 78020
## - country 2 1.3960e+17 4.5879e+18 78053
## - num_critic_for_reviews 1 1.3687e+17 4.5851e+18 78054
## - content_rating 4 2.0491e+17 4.6532e+18 78080
## - num_voted_users 1 3.7183e+17 4.8201e+18 78164
##
## Step: AIC=77986.88
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + aspect_ratio + Action + Adventure +
## Animation + Biography + Comedy + Crime + Documentary + Drama +
## Family + Fantasy + History + Horror + Musical + Mystery +
## Romance + `Sci-Fi` + Sport + Thriller + War + Western
##
## Df Sum of Sq RSS AIC
## - Fantasy 1 3.1535e+13 4.4483e+18 77985
## - War 1 4.6563e+13 4.4483e+18 77985
## - aspect_ratio 1 8.8307e+13 4.4484e+18 77985
## - Mystery 1 2.8702e+14 4.4486e+18 77985
## - Sport 1 2.8751e+14 4.4486e+18 77985
## - Biography 1 3.4775e+14 4.4486e+18 77985
## - imdb_score 1 1.2884e+15 4.4496e+18 77986
## - Documentary 1 1.3653e+15 4.4496e+18 77986
## - Thriller 1 1.5999e+15 4.4499e+18 77986
## - facenumber_in_poster 1 1.6798e+15 4.4500e+18 77986
## - History 1 2.2119e+15 4.4505e+18 77986
## - Musical 1 2.4408e+15 4.4507e+18 77986
## - Romance 1 2.6584e+15 4.4509e+18 77986
## - Crime 1 3.5716e+15 4.4518e+18 77987
## <none> 4.4483e+18 77987
## - Horror 1 4.5047e+15 4.4528e+18 77987
## - Western 1 4.5796e+15 4.4529e+18 77987
## + movie_facebook_likes 1 4.0329e+12 4.4483e+18 77989
## - `Sci-Fi` 1 1.2081e+16 4.4604e+18 77991
## - Adventure 1 2.2113e+16 4.4704e+18 77996
## - Action 1 2.5696e+16 4.4740e+18 77998
## - Comedy 1 2.7007e+16 4.4753e+18 77998
## - num_user_for_reviews 1 2.7302e+16 4.4756e+18 77998
## - director_facebook_likes 1 3.0965e+16 4.4792e+18 78000
## - Animation 1 3.1593e+16 4.4799e+18 78001
## - Drama 1 3.2923e+16 4.4812e+18 78001
## - budget 1 3.4658e+16 4.4829e+18 78002
## - Family 1 5.3107e+16 4.5014e+18 78011
## - title_year 1 6.5545e+16 4.5138e+18 78017
## - duration 1 6.6129e+16 4.5144e+18 78018
## - actor_3_facebook_likes 1 6.7938e+16 4.5162e+18 78018
## - country 2 1.3962e+17 4.5879e+18 78051
## - num_critic_for_reviews 1 1.7630e+17 4.6246e+18 78071
## - content_rating 4 2.0500e+17 4.6533e+18 78078
## - num_voted_users 1 4.0036e+17 4.8486e+18 78175
##
## Step: AIC=77984.9
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + aspect_ratio + Action + Adventure +
## Animation + Biography + Comedy + Crime + Documentary + Drama +
## Family + History + Horror + Musical + Mystery + Romance +
## `Sci-Fi` + Sport + Thriller + War + Western
##
## Df Sum of Sq RSS AIC
## - War 1 4.7999e+13 4.4484e+18 77983
## - aspect_ratio 1 8.7974e+13 4.4484e+18 77983
## - Mystery 1 2.9023e+14 4.4486e+18 77983
## - Sport 1 2.9815e+14 4.4486e+18 77983
## - Biography 1 3.5696e+14 4.4487e+18 77983
## - imdb_score 1 1.2769e+15 4.4496e+18 77984
## - Documentary 1 1.3801e+15 4.4497e+18 77984
## - Thriller 1 1.6363e+15 4.4499e+18 77984
## - facenumber_in_poster 1 1.6647e+15 4.4500e+18 77984
## - History 1 2.1918e+15 4.4505e+18 77984
## - Musical 1 2.4655e+15 4.4508e+18 77984
## - Romance 1 2.6563e+15 4.4510e+18 77984
## - Crime 1 3.5412e+15 4.4518e+18 77985
## <none> 4.4483e+18 77985
## - Western 1 4.5552e+15 4.4529e+18 77985
## - Horror 1 4.6017e+15 4.4529e+18 77985
## + Fantasy 1 3.1535e+13 4.4483e+18 77987
## + movie_facebook_likes 1 3.4244e+12 4.4483e+18 77987
## - `Sci-Fi` 1 1.2140e+16 4.4604e+18 77989
## - Adventure 1 2.2115e+16 4.4704e+18 77994
## - Action 1 2.5664e+16 4.4740e+18 77996
## - num_user_for_reviews 1 2.7272e+16 4.4756e+18 77996
## - Comedy 1 2.7417e+16 4.4757e+18 77996
## - director_facebook_likes 1 3.0941e+16 4.4792e+18 77998
## - Animation 1 3.1604e+16 4.4799e+18 77999
## - Drama 1 3.2909e+16 4.4812e+18 77999
## - budget 1 3.4634e+16 4.4829e+18 78000
## - Family 1 5.4050e+16 4.5024e+18 78010
## - title_year 1 6.5624e+16 4.5139e+18 78015
## - duration 1 6.6389e+16 4.5147e+18 78016
## - actor_3_facebook_likes 1 6.8086e+16 4.5164e+18 78016
## - country 2 1.3959e+17 4.5879e+18 78049
## - num_critic_for_reviews 1 1.7647e+17 4.6248e+18 78069
## - content_rating 4 2.0540e+17 4.6537e+18 78077
## - num_voted_users 1 4.0038e+17 4.8487e+18 78173
##
## Step: AIC=77982.92
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + aspect_ratio + Action + Adventure +
## Animation + Biography + Comedy + Crime + Documentary + Drama +
## Family + History + Horror + Musical + Mystery + Romance +
## `Sci-Fi` + Sport + Thriller + Western
##
## Df Sum of Sq RSS AIC
## - aspect_ratio 1 8.6248e+13 4.4484e+18 77981
## - Sport 1 2.8396e+14 4.4486e+18 77981
## - Mystery 1 2.8783e+14 4.4486e+18 77981
## - Biography 1 3.6622e+14 4.4487e+18 77981
## - imdb_score 1 1.2733e+15 4.4496e+18 77982
## - Documentary 1 1.4134e+15 4.4498e+18 77982
## - Thriller 1 1.6205e+15 4.4500e+18 77982
## - facenumber_in_poster 1 1.6816e+15 4.4500e+18 77982
## - History 1 2.1740e+15 4.4505e+18 77982
## - Musical 1 2.4824e+15 4.4508e+18 77982
## - Romance 1 2.6439e+15 4.4510e+18 77982
## - Crime 1 3.6614e+15 4.4520e+18 77983
## <none> 4.4484e+18 77983
## - Western 1 4.5648e+15 4.4529e+18 77983
## - Horror 1 4.6249e+15 4.4530e+18 77983
## + War 1 4.7999e+13 4.4483e+18 77985
## + Fantasy 1 3.2971e+13 4.4483e+18 77985
## + movie_facebook_likes 1 3.9571e+12 4.4484e+18 77985
## - `Sci-Fi` 1 1.2271e+16 4.4606e+18 77987
## - Adventure 1 2.2070e+16 4.4704e+18 77992
## - Action 1 2.6045e+16 4.4744e+18 77994
## - num_user_for_reviews 1 2.7280e+16 4.4756e+18 77994
## - Comedy 1 2.7384e+16 4.4757e+18 77994
## - director_facebook_likes 1 3.0898e+16 4.4793e+18 77996
## - Animation 1 3.1727e+16 4.4801e+18 77997
## - Drama 1 3.2872e+16 4.4812e+18 77997
## - budget 1 3.4961e+16 4.4833e+18 77998
## - Family 1 5.4112e+16 4.5025e+18 78008
## - title_year 1 6.5582e+16 4.5139e+18 78013
## - duration 1 6.7159e+16 4.5155e+18 78014
## - actor_3_facebook_likes 1 6.8120e+16 4.5165e+18 78015
## - country 2 1.3979e+17 4.5881e+18 78047
## - num_critic_for_reviews 1 1.7653e+17 4.6249e+18 78067
## - content_rating 4 2.0603e+17 4.6544e+18 78075
## - num_voted_users 1 4.0035e+17 4.8487e+18 78171
##
## Step: AIC=77980.96
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + Action + Adventure + Animation +
## Biography + Comedy + Crime + Documentary + Drama + Family +
## History + Horror + Musical + Mystery + Romance + `Sci-Fi` +
## Sport + Thriller + Western
##
## Df Sum of Sq RSS AIC
## - Mystery 1 2.7752e+14 4.4487e+18 77979
## - Sport 1 2.8189e+14 4.4487e+18 77979
## - Biography 1 3.5950e+14 4.4488e+18 77979
## - imdb_score 1 1.2664e+15 4.4497e+18 77980
## - Documentary 1 1.3995e+15 4.4498e+18 77980
## - Thriller 1 1.5842e+15 4.4500e+18 77980
## - facenumber_in_poster 1 1.6942e+15 4.4501e+18 77980
## - History 1 2.1804e+15 4.4506e+18 77980
## - Musical 1 2.4907e+15 4.4509e+18 77980
## - Romance 1 2.6233e+15 4.4511e+18 77980
## - Crime 1 3.6941e+15 4.4521e+18 77981
## <none> 4.4484e+18 77981
## - Western 1 4.5997e+15 4.4530e+18 77981
## - Horror 1 4.6478e+15 4.4531e+18 77981
## + aspect_ratio 1 8.6248e+13 4.4484e+18 77983
## + War 1 4.6273e+13 4.4484e+18 77983
## + Fantasy 1 3.2608e+13 4.4484e+18 77983
## + movie_facebook_likes 1 6.2308e+12 4.4484e+18 77983
## - `Sci-Fi` 1 1.2254e+16 4.4607e+18 77985
## - Adventure 1 2.1998e+16 4.4704e+18 77990
## - Action 1 2.5974e+16 4.4744e+18 77992
## - num_user_for_reviews 1 2.7299e+16 4.4757e+18 77992
## - Comedy 1 2.7552e+16 4.4760e+18 77993
## - director_facebook_likes 1 3.0917e+16 4.4794e+18 77994
## - Animation 1 3.1730e+16 4.4802e+18 77995
## - Drama 1 3.2874e+16 4.4813e+18 77995
## - budget 1 3.4925e+16 4.4834e+18 77996
## - Family 1 5.4172e+16 4.5026e+18 78006
## - title_year 1 6.6211e+16 4.5147e+18 78012
## - duration 1 6.7138e+16 4.5156e+18 78012
## - actor_3_facebook_likes 1 6.8277e+16 4.5167e+18 78013
## - country 2 1.4000e+17 4.5884e+18 78045
## - num_critic_for_reviews 1 1.7670e+17 4.6251e+18 78065
## - content_rating 4 2.0595e+17 4.6544e+18 78073
## - num_voted_users 1 4.0034e+17 4.8488e+18 78169
##
## Step: AIC=77979.1
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + Action + Adventure + Animation +
## Biography + Comedy + Crime + Documentary + Drama + Family +
## History + Horror + Musical + Romance + `Sci-Fi` + Sport +
## Thriller + Western
##
## Df Sum of Sq RSS AIC
## - Sport 1 2.6132e+14 4.4490e+18 77977
## - Biography 1 3.1941e+14 4.4490e+18 77977
## - imdb_score 1 1.2422e+15 4.4500e+18 77978
## - Documentary 1 1.3477e+15 4.4501e+18 77978
## - facenumber_in_poster 1 1.7171e+15 4.4504e+18 77978
## - Thriller 1 1.9942e+15 4.4507e+18 77978
## - History 1 2.2494e+15 4.4510e+18 77978
## - Romance 1 2.5431e+15 4.4513e+18 77978
## - Musical 1 2.5577e+15 4.4513e+18 77978
## - Crime 1 3.6672e+15 4.4524e+18 77979
## <none> 4.4487e+18 77979
## - Horror 1 4.5106e+15 4.4532e+18 77979
## - Western 1 4.6405e+15 4.4534e+18 77979
## + Mystery 1 2.7752e+14 4.4484e+18 77981
## + aspect_ratio 1 7.5933e+13 4.4486e+18 77981
## + War 1 4.4061e+13 4.4487e+18 77981
## + Fantasy 1 3.5784e+13 4.4487e+18 77981
## + movie_facebook_likes 1 3.4141e+12 4.4487e+18 77981
## - `Sci-Fi` 1 1.2138e+16 4.4609e+18 77983
## - Adventure 1 2.1834e+16 4.4706e+18 77988
## - Action 1 2.5815e+16 4.4745e+18 77990
## - Comedy 1 2.7275e+16 4.4760e+18 77991
## - num_user_for_reviews 1 2.7282e+16 4.4760e+18 77991
## - director_facebook_likes 1 3.0816e+16 4.4795e+18 77992
## - Animation 1 3.1644e+16 4.4804e+18 77993
## - Drama 1 3.3286e+16 4.4820e+18 77994
## - budget 1 3.4838e+16 4.4836e+18 77994
## - Family 1 5.4701e+16 4.5034e+18 78004
## - title_year 1 6.5973e+16 4.5147e+18 78010
## - duration 1 6.7669e+16 4.5164e+18 78010
## - actor_3_facebook_likes 1 6.8219e+16 4.5169e+18 78011
## - country 2 1.3973e+17 4.5885e+18 78043
## - num_critic_for_reviews 1 1.7650e+17 4.6252e+18 78063
## - content_rating 4 2.0692e+17 4.6556e+18 78072
## - num_voted_users 1 4.0049e+17 4.8492e+18 78168
##
## Step: AIC=77977.23
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + Action + Adventure + Animation +
## Biography + Comedy + Crime + Documentary + Drama + Family +
## History + Horror + Musical + Romance + `Sci-Fi` + Thriller +
## Western
##
## Df Sum of Sq RSS AIC
## - Biography 1 4.1719e+14 4.4494e+18 77975
## - imdb_score 1 1.2042e+15 4.4502e+18 77976
## - Documentary 1 1.3627e+15 4.4503e+18 77976
## - facenumber_in_poster 1 1.7438e+15 4.4507e+18 77976
## - Thriller 1 1.9147e+15 4.4509e+18 77976
## - History 1 2.3470e+15 4.4513e+18 77976
## - Romance 1 2.4468e+15 4.4514e+18 77976
## - Musical 1 2.6772e+15 4.4517e+18 77977
## - Crime 1 3.8215e+15 4.4528e+18 77977
## <none> 4.4490e+18 77977
## - Horror 1 4.6056e+15 4.4536e+18 77978
## - Western 1 4.7242e+15 4.4537e+18 77978
## + Sport 1 2.6132e+14 4.4487e+18 77979
## + Mystery 1 2.5694e+14 4.4487e+18 77979
## + aspect_ratio 1 7.4432e+13 4.4489e+18 77979
## + Fantasy 1 4.5922e+13 4.4489e+18 77979
## + War 1 3.1100e+13 4.4489e+18 77979
## + movie_facebook_likes 1 3.2817e+12 4.4490e+18 77979
## - `Sci-Fi` 1 1.2226e+16 4.4612e+18 77981
## - Adventure 1 2.1597e+16 4.4706e+18 77986
## - Action 1 2.5808e+16 4.4748e+18 77988
## - Comedy 1 2.7152e+16 4.4761e+18 77989
## - num_user_for_reviews 1 2.7227e+16 4.4762e+18 77989
## - director_facebook_likes 1 3.0676e+16 4.4797e+18 77990
## - Animation 1 3.1505e+16 4.4805e+18 77991
## - Drama 1 3.3239e+16 4.4822e+18 77992
## - budget 1 3.4855e+16 4.4838e+18 77992
## - Family 1 5.4705e+16 4.5037e+18 78002
## - title_year 1 6.5905e+16 4.5149e+18 78008
## - duration 1 6.7957e+16 4.5169e+18 78009
## - actor_3_facebook_likes 1 6.8158e+16 4.5171e+18 78009
## - country 2 1.4073e+17 4.5897e+18 78042
## - num_critic_for_reviews 1 1.7627e+17 4.6253e+18 78061
## - content_rating 4 2.0953e+17 4.6585e+18 78071
## - num_voted_users 1 4.0023e+17 4.8492e+18 78166
##
## Step: AIC=77975.44
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + imdb_score + Action + Adventure + Animation +
## Comedy + Crime + Documentary + Drama + Family + History +
## Horror + Musical + Romance + `Sci-Fi` + Thriller + Western
##
## Df Sum of Sq RSS AIC
## - imdb_score 1 1.1478e+15 4.4505e+18 77974
## - Documentary 1 1.2940e+15 4.4507e+18 77974
## - Thriller 1 1.7117e+15 4.4511e+18 77974
## - facenumber_in_poster 1 1.8270e+15 4.4512e+18 77974
## - History 1 2.0160e+15 4.4514e+18 77974
## - Romance 1 2.3211e+15 4.4517e+18 77975
## - Musical 1 2.6576e+15 4.4521e+18 77975
## - Crime 1 3.7831e+15 4.4532e+18 77975
## <none> 4.4494e+18 77975
## - Horror 1 4.7716e+15 4.4542e+18 77976
## - Western 1 4.8806e+15 4.4543e+18 77976
## + Biography 1 4.1719e+14 4.4490e+18 77977
## + Sport 1 3.5910e+14 4.4490e+18 77977
## + Mystery 1 2.0949e+14 4.4492e+18 77977
## + aspect_ratio 1 6.8181e+13 4.4493e+18 77977
## + Fantasy 1 6.0103e+13 4.4493e+18 77977
## + War 1 3.7142e+13 4.4494e+18 77977
## + movie_facebook_likes 1 4.0606e+12 4.4494e+18 77977
## - `Sci-Fi` 1 1.2356e+16 4.4618e+18 77980
## - Adventure 1 2.1280e+16 4.4707e+18 77984
## - Action 1 2.5543e+16 4.4749e+18 77986
## - Comedy 1 2.6737e+16 4.4761e+18 77987
## - num_user_for_reviews 1 2.6874e+16 4.4763e+18 77987
## - director_facebook_likes 1 3.0546e+16 4.4799e+18 77989
## - Animation 1 3.1706e+16 4.4811e+18 77989
## - Drama 1 3.2959e+16 4.4824e+18 77990
## - budget 1 3.4569e+16 4.4840e+18 77991
## - Family 1 5.4318e+16 4.5037e+18 78000
## - title_year 1 6.5936e+16 4.5153e+18 78006
## - actor_3_facebook_likes 1 6.8316e+16 4.5177e+18 78007
## - duration 1 6.8377e+16 4.5178e+18 78007
## - country 2 1.4032e+17 4.5897e+18 78040
## - num_critic_for_reviews 1 1.7755e+17 4.6269e+18 78060
## - content_rating 4 2.1245e+17 4.6618e+18 78071
## - num_voted_users 1 4.0029e+17 4.8497e+18 78164
##
## Step: AIC=77974.01
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + Action + Adventure + Animation + Comedy + Crime +
## Documentary + Drama + Family + History + Horror + Musical +
## Romance + `Sci-Fi` + Thriller + Western
##
## Df Sum of Sq RSS AIC
## - Documentary 1 1.0056e+15 4.4516e+18 77973
## - facenumber_in_poster 1 1.7479e+15 4.4523e+18 77973
## - Thriller 1 1.9457e+15 4.4525e+18 77973
## - History 1 2.0946e+15 4.4526e+18 77973
## - Romance 1 2.3658e+15 4.4529e+18 77973
## - Musical 1 2.7113e+15 4.4533e+18 77973
## - Crime 1 3.8816e+15 4.4544e+18 77974
## <none> 4.4505e+18 77974
## - Horror 1 4.1831e+15 4.4547e+18 77974
## - Western 1 4.8615e+15 4.4554e+18 77974
## + imdb_score 1 1.1478e+15 4.4494e+18 77975
## + Biography 1 3.6076e+14 4.4502e+18 77976
## + Sport 1 3.0830e+14 4.4502e+18 77976
## + Mystery 1 1.9355e+14 4.4504e+18 77976
## + aspect_ratio 1 6.3185e+13 4.4505e+18 77976
## + Fantasy 1 4.2645e+13 4.4505e+18 77976
## + War 1 3.4808e+13 4.4505e+18 77976
## + movie_facebook_likes 1 1.4066e+13 4.4505e+18 77976
## - `Sci-Fi` 1 1.2015e+16 4.4626e+18 77978
## - Adventure 1 2.1799e+16 4.4723e+18 77983
## - Action 1 2.6988e+16 4.4775e+18 77985
## - Comedy 1 2.7916e+16 4.4785e+18 77986
## - num_user_for_reviews 1 2.9249e+16 4.4798e+18 77986
## - director_facebook_likes 1 3.0499e+16 4.4810e+18 77987
## - Animation 1 3.0568e+16 4.4811e+18 77987
## - budget 1 3.4891e+16 4.4854e+18 77989
## - Drama 1 3.7849e+16 4.4884e+18 77991
## - Family 1 5.5435e+16 4.5060e+18 77999
## - title_year 1 6.5205e+16 4.5157e+18 78004
## - duration 1 6.7229e+16 4.5178e+18 78005
## - actor_3_facebook_likes 1 6.9123e+16 4.5197e+18 78006
## - country 2 1.4794e+17 4.5985e+18 78042
## - num_critic_for_reviews 1 1.8301e+17 4.6336e+18 78061
## - content_rating 4 2.1754e+17 4.6681e+18 78072
## - num_voted_users 1 4.3668e+17 4.8872e+18 78179
##
## Step: AIC=77972.51
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + Action + Adventure + Animation + Comedy + Crime +
## Drama + Family + History + Horror + Musical + Romance + `Sci-Fi` +
## Thriller + Western
##
## Df Sum of Sq RSS AIC
## - Thriller 1 1.6961e+15 4.4532e+18 77971
## - facenumber_in_poster 1 1.8538e+15 4.4534e+18 77971
## - History 1 2.0540e+15 4.4536e+18 77972
## - Romance 1 2.1269e+15 4.4537e+18 77972
## - Musical 1 2.8303e+15 4.4544e+18 77972
## - Crime 1 4.0053e+15 4.4556e+18 77972
## <none> 4.4516e+18 77973
## - Horror 1 4.7955e+15 4.4563e+18 77973
## - Western 1 4.9457e+15 4.4565e+18 77973
## + Documentary 1 1.0056e+15 4.4505e+18 77974
## + imdb_score 1 8.5938e+14 4.4507e+18 77974
## + Sport 1 3.2054e+14 4.4512e+18 77974
## + Biography 1 3.1020e+14 4.4512e+18 77974
## + Mystery 1 1.6037e+14 4.4514e+18 77974
## + Fantasy 1 5.8879e+13 4.4515e+18 77974
## + War 1 5.8641e+13 4.4515e+18 77974
## + aspect_ratio 1 5.4630e+13 4.4515e+18 77974
## + movie_facebook_likes 1 2.1531e+13 4.4515e+18 77974
## - `Sci-Fi` 1 1.2332e+16 4.4639e+18 77977
## - Adventure 1 2.1254e+16 4.4728e+18 77981
## - Action 1 2.6327e+16 4.4779e+18 77984
## - Comedy 1 2.6912e+16 4.4785e+18 77984
## - num_user_for_reviews 1 2.9605e+16 4.4812e+18 77985
## - director_facebook_likes 1 3.0523e+16 4.4821e+18 77986
## - Animation 1 3.0527e+16 4.4821e+18 77986
## - budget 1 3.4664e+16 4.4862e+18 77988
## - Drama 1 4.1788e+16 4.4933e+18 77991
## - Family 1 5.4439e+16 4.5060e+18 77997
## - title_year 1 6.4227e+16 4.5158e+18 78002
## - duration 1 6.6638e+16 4.5182e+18 78003
## - actor_3_facebook_likes 1 6.8944e+16 4.5205e+18 78004
## - country 2 1.4823e+17 4.5998e+18 78041
## - num_critic_for_reviews 1 1.8201e+17 4.6336e+18 78059
## - content_rating 4 2.1748e+17 4.6690e+18 78070
## - num_voted_users 1 4.3610e+17 4.8877e+18 78177
##
## Step: AIC=77971.35
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + Action + Adventure + Animation + Comedy + Crime +
## Drama + Family + History + Horror + Musical + Romance + `Sci-Fi` +
## Western
##
## Df Sum of Sq RSS AIC
## - Romance 1 1.8610e+15 4.4551e+18 77970
## - facenumber_in_poster 1 2.0338e+15 4.4553e+18 77970
## - History 1 2.3556e+15 4.4556e+18 77971
## - Crime 1 2.7784e+15 4.4560e+18 77971
## - Musical 1 2.9300e+15 4.4562e+18 77971
## <none> 4.4532e+18 77971
## - Horror 1 4.2430e+15 4.4575e+18 77971
## - Western 1 5.2594e+15 4.4585e+18 77972
## + Thriller 1 1.6961e+15 4.4516e+18 77973
## + imdb_score 1 1.0821e+15 4.4522e+18 77973
## + Documentary 1 7.5593e+14 4.4525e+18 77973
## + Mystery 1 4.9279e+14 4.4528e+18 77973
## + Sport 1 2.0575e+14 4.4530e+18 77973
## + Biography 1 1.4115e+14 4.4531e+18 77973
## + Fantasy 1 9.8935e+13 4.4531e+18 77973
## + War 1 3.6413e+13 4.4532e+18 77973
## + aspect_ratio 1 2.3296e+13 4.4532e+18 77973
## + movie_facebook_likes 1 1.0915e+13 4.4532e+18 77973
## - `Sci-Fi` 1 1.1611e+16 4.4649e+18 77975
## - Adventure 1 2.0853e+16 4.4741e+18 77980
## - Comedy 1 2.5235e+16 4.4785e+18 77982
## - num_user_for_reviews 1 2.9208e+16 4.4825e+18 77984
## - Action 1 2.9350e+16 4.4826e+18 77984
## - director_facebook_likes 1 3.0264e+16 4.4835e+18 77984
## - Animation 1 3.0456e+16 4.4837e+18 77984
## - budget 1 3.4468e+16 4.4877e+18 77986
## - Drama 1 4.2272e+16 4.4955e+18 77990
## - Family 1 5.4475e+16 4.5077e+18 77996
## - title_year 1 6.4805e+16 4.5181e+18 78001
## - duration 1 6.5925e+16 4.5192e+18 78002
## - actor_3_facebook_likes 1 6.8978e+16 4.5222e+18 78003
## - country 2 1.4803e+17 4.6013e+18 78040
## - num_critic_for_reviews 1 1.8461e+17 4.6379e+18 78059
## - content_rating 4 2.1704e+17 4.6703e+18 78069
## - num_voted_users 1 4.3477e+17 4.8880e+18 78175
##
## Step: AIC=77970.28
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + facenumber_in_poster +
## num_user_for_reviews + country + content_rating + budget +
## title_year + Action + Adventure + Animation + Comedy + Crime +
## Drama + Family + History + Horror + Musical + `Sci-Fi` +
## Western
##
## Df Sum of Sq RSS AIC
## - facenumber_in_poster 1 2.1370e+15 4.4572e+18 77969
## - History 1 2.5516e+15 4.4577e+18 77970
## - Musical 1 2.7665e+15 4.4579e+18 77970
## - Crime 1 3.6139e+15 4.4587e+18 77970
## <none> 4.4551e+18 77970
## - Horror 1 4.9157e+15 4.4600e+18 77971
## - Western 1 5.2391e+15 4.4603e+18 77971
## + Romance 1 1.8610e+15 4.4532e+18 77971
## + Thriller 1 1.4302e+15 4.4537e+18 77972
## + imdb_score 1 1.1335e+15 4.4540e+18 77972
## + Documentary 1 5.8001e+14 4.4545e+18 77972
## + Mystery 1 3.8670e+14 4.4547e+18 77972
## + Sport 1 1.2648e+14 4.4550e+18 77972
## + Biography 1 8.9985e+13 4.4550e+18 77972
## + Fantasy 1 8.3104e+13 4.4550e+18 77972
## + War 1 2.7952e+13 4.4551e+18 77972
## + aspect_ratio 1 1.7869e+13 4.4551e+18 77972
## + movie_facebook_likes 1 1.2970e+13 4.4551e+18 77972
## - `Sci-Fi` 1 1.2150e+16 4.4673e+18 77974
## - Adventure 1 2.0029e+16 4.4751e+18 77978
## - Comedy 1 2.8348e+16 4.4835e+18 77982
## - Action 1 2.8536e+16 4.4836e+18 77982
## - num_user_for_reviews 1 2.9677e+16 4.4848e+18 77983
## - Animation 1 2.9692e+16 4.4848e+18 77983
## - director_facebook_likes 1 3.0995e+16 4.4861e+18 77984
## - budget 1 3.4952e+16 4.4901e+18 77986
## - Drama 1 4.0843e+16 4.4960e+18 77988
## - Family 1 5.3871e+16 4.5090e+18 77995
## - title_year 1 6.4987e+16 4.5201e+18 78000
## - duration 1 6.6669e+16 4.5218e+18 78001
## - actor_3_facebook_likes 1 6.9790e+16 4.5249e+18 78003
## - country 2 1.4780e+17 4.6029e+18 78038
## - num_critic_for_reviews 1 1.8355e+17 4.6387e+18 78058
## - content_rating 4 2.2414e+17 4.6793e+18 78071
## - num_voted_users 1 4.3316e+17 4.8883e+18 78173
##
## Step: AIC=77969.34
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Crime + Drama + Family +
## History + Horror + Musical + `Sci-Fi` + Western
##
## Df Sum of Sq RSS AIC
## - History 1 2.3983e+15 4.4596e+18 77969
## - Musical 1 2.9804e+15 4.4602e+18 77969
## - Crime 1 3.6160e+15 4.4609e+18 77969
## <none> 4.4572e+18 77969
## - Horror 1 4.3916e+15 4.4616e+18 77970
## - Western 1 5.0109e+15 4.4623e+18 77970
## + facenumber_in_poster 1 2.1370e+15 4.4551e+18 77970
## + Romance 1 1.9642e+15 4.4553e+18 77970
## + Thriller 1 1.5933e+15 4.4557e+18 77971
## + imdb_score 1 1.0469e+15 4.4562e+18 77971
## + Documentary 1 6.5132e+14 4.4566e+18 77971
## + Mystery 1 4.2554e+14 4.4568e+18 77971
## + Sport 1 1.4960e+14 4.4571e+18 77971
## + Biography 1 1.2530e+14 4.4571e+18 77971
## + Fantasy 1 5.9970e+13 4.4572e+18 77971
## + War 1 4.2712e+13 4.4572e+18 77971
## + aspect_ratio 1 2.2261e+13 4.4572e+18 77971
## + movie_facebook_likes 1 1.1143e+13 4.4572e+18 77971
## - `Sci-Fi` 1 1.1941e+16 4.4692e+18 77973
## - Adventure 1 2.0083e+16 4.4773e+18 77977
## - Comedy 1 2.6524e+16 4.4838e+18 77980
## - Action 1 2.8494e+16 4.4857e+18 77981
## - director_facebook_likes 1 3.0634e+16 4.4879e+18 77982
## - num_user_for_reviews 1 3.0713e+16 4.4880e+18 77983
## - Animation 1 3.1840e+16 4.4891e+18 77983
## - budget 1 3.5259e+16 4.4925e+18 77985
## - Drama 1 4.0198e+16 4.4974e+18 77987
## - Family 1 5.3703e+16 4.5109e+18 77994
## - duration 1 6.5038e+16 4.5223e+18 77999
## - title_year 1 6.7694e+16 4.5249e+18 78001
## - actor_3_facebook_likes 1 6.8237e+16 4.5255e+18 78001
## - country 2 1.4765e+17 4.6049e+18 78037
## - num_critic_for_reviews 1 1.8468e+17 4.6419e+18 78057
## - content_rating 4 2.2472e+17 4.6820e+18 78070
## - num_voted_users 1 4.3203e+17 4.8893e+18 78172
##
## Step: AIC=77968.52
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Crime + Drama + Family +
## Horror + Musical + `Sci-Fi` + Western
##
## Df Sum of Sq RSS AIC
## - Musical 1 2.8721e+15 4.4625e+18 77968
## - Crime 1 3.0385e+15 4.4627e+18 77968
## <none> 4.4596e+18 77969
## - Horror 1 4.2310e+15 4.4639e+18 77969
## - Western 1 4.9049e+15 4.4645e+18 77969
## + History 1 2.3983e+15 4.4572e+18 77969
## + Romance 1 2.1552e+15 4.4575e+18 77969
## + facenumber_in_poster 1 1.9838e+15 4.4577e+18 77970
## + Thriller 1 1.8689e+15 4.4578e+18 77970
## + imdb_score 1 1.1605e+15 4.4585e+18 77970
## + Documentary 1 5.8740e+14 4.4591e+18 77970
## + Mystery 1 5.7960e+14 4.4591e+18 77970
## + Sport 1 1.6501e+14 4.4595e+18 77970
## + War 1 5.7753e+13 4.4596e+18 77970
## + aspect_ratio 1 2.4368e+13 4.4596e+18 77971
## + Fantasy 1 2.2358e+13 4.4596e+18 77971
## + movie_facebook_likes 1 2.2346e+13 4.4596e+18 77971
## + Biography 1 1.5081e+12 4.4596e+18 77971
## - `Sci-Fi` 1 1.1225e+16 4.4709e+18 77972
## - Adventure 1 2.0125e+16 4.4798e+18 77976
## - Comedy 1 2.7588e+16 4.4872e+18 77980
## - Action 1 2.8002e+16 4.4876e+18 77980
## - num_user_for_reviews 1 3.1073e+16 4.4907e+18 77982
## - director_facebook_likes 1 3.1108e+16 4.4908e+18 77982
## - Animation 1 3.1149e+16 4.4908e+18 77982
## - budget 1 3.5501e+16 4.4951e+18 77984
## - Drama 1 4.1657e+16 4.5013e+18 77987
## - Family 1 5.4182e+16 4.5138e+18 77993
## - duration 1 6.2640e+16 4.5223e+18 77997
## - title_year 1 6.7647e+16 4.5273e+18 78000
## - actor_3_facebook_likes 1 6.7697e+16 4.5273e+18 78000
## - country 2 1.5174e+17 4.6114e+18 78039
## - num_critic_for_reviews 1 1.8419e+17 4.6438e+18 78056
## - content_rating 4 2.2499e+17 4.6846e+18 78069
## - num_voted_users 1 4.3603e+17 4.8957e+18 78173
##
## Step: AIC=77967.95
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Crime + Drama + Family +
## Horror + `Sci-Fi` + Western
##
## Df Sum of Sq RSS AIC
## - Crime 1 2.8983e+15 4.4654e+18 77967
## <none> 4.4625e+18 77968
## - Horror 1 4.2230e+15 4.4667e+18 77968
## - Western 1 4.7235e+15 4.4672e+18 77968
## + Musical 1 2.8721e+15 4.4596e+18 77969
## + History 1 2.2900e+15 4.4602e+18 77969
## + facenumber_in_poster 1 2.1896e+15 4.4603e+18 77969
## + Thriller 1 1.9871e+15 4.4605e+18 77969
## + Romance 1 1.9764e+15 4.4605e+18 77969
## + imdb_score 1 1.1962e+15 4.4613e+18 77969
## + Mystery 1 6.8682e+14 4.4618e+18 77970
## + Documentary 1 6.8622e+14 4.4618e+18 77970
## + Sport 1 2.6435e+14 4.4623e+18 77970
## + Fantasy 1 5.0408e+13 4.4625e+18 77970
## + War 1 3.8888e+13 4.4625e+18 77970
## + aspect_ratio 1 2.7666e+13 4.4625e+18 77970
## + movie_facebook_likes 1 1.2489e+13 4.4625e+18 77970
## + Biography 1 2.3060e+12 4.4625e+18 77970
## - `Sci-Fi` 1 1.0855e+16 4.4734e+18 77971
## - Adventure 1 2.0935e+16 4.4834e+18 77976
## - Comedy 1 2.7771e+16 4.4903e+18 77980
## - Action 1 2.8423e+16 4.4909e+18 77980
## - Animation 1 2.9790e+16 4.4923e+18 77981
## - num_user_for_reviews 1 3.0096e+16 4.4926e+18 77981
## - director_facebook_likes 1 3.0981e+16 4.4935e+18 77981
## - budget 1 3.5163e+16 4.4977e+18 77983
## - Drama 1 4.2383e+16 4.5049e+18 77987
## - Family 1 5.3637e+16 4.5162e+18 77992
## - duration 1 6.1933e+16 4.5244e+18 77996
## - title_year 1 6.6923e+16 4.5294e+18 77999
## - actor_3_facebook_likes 1 6.8435e+16 4.5309e+18 78000
## - country 2 1.5076e+17 4.6133e+18 78037
## - num_critic_for_reviews 1 1.8431e+17 4.6468e+18 78055
## - content_rating 4 2.2272e+17 4.6852e+18 78068
## - num_voted_users 1 4.4117e+17 4.9037e+18 78174
##
## Step: AIC=77967.38
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Drama + Family + Horror +
## `Sci-Fi` + Western
##
## Df Sum of Sq RSS AIC
## - Horror 1 3.2316e+15 4.4686e+18 77967
## <none> 4.4654e+18 77967
## - Western 1 4.4137e+15 4.4698e+18 77968
## + Crime 1 2.8983e+15 4.4625e+18 77968
## + Musical 1 2.7318e+15 4.4627e+18 77968
## + Romance 1 2.7155e+15 4.4627e+18 77968
## + facenumber_in_poster 1 2.2051e+15 4.4632e+18 77968
## + History 1 1.7412e+15 4.4637e+18 77969
## + imdb_score 1 1.1488e+15 4.4643e+18 77969
## + Documentary 1 8.5266e+14 4.4646e+18 77969
## + Thriller 1 5.3317e+14 4.4649e+18 77969
## + Sport 1 4.3251e+14 4.4650e+18 77969
## + Mystery 1 3.8510e+14 4.4650e+18 77969
## + aspect_ratio 1 6.0915e+13 4.4654e+18 77969
## + movie_facebook_likes 1 3.5240e+13 4.4654e+18 77969
## + Fantasy 1 4.0157e+12 4.4654e+18 77969
## + War 1 1.8843e+12 4.4654e+18 77969
## + Biography 1 9.6705e+11 4.4654e+18 77969
## - `Sci-Fi` 1 9.6181e+15 4.4750e+18 77970
## - Adventure 1 2.3818e+16 4.4892e+18 77977
## - Action 1 2.6001e+16 4.4914e+18 77978
## - Animation 1 2.9412e+16 4.4948e+18 77980
## - Comedy 1 2.9763e+16 4.4952e+18 77980
## - director_facebook_likes 1 3.1226e+16 4.4966e+18 77981
## - num_user_for_reviews 1 3.1466e+16 4.4969e+18 77981
## - budget 1 3.5566e+16 4.5010e+18 77983
## - Drama 1 4.0802e+16 4.5062e+18 77985
## - Family 1 5.3801e+16 4.5192e+18 77992
## - duration 1 6.1907e+16 4.5273e+18 77996
## - title_year 1 6.6504e+16 4.5319e+18 77998
## - actor_3_facebook_likes 1 6.9460e+16 4.5349e+18 78000
## - country 2 1.5000e+17 4.6154e+18 78036
## - num_critic_for_reviews 1 1.8446e+17 4.6499e+18 78055
## - content_rating 4 2.3850e+17 4.7039e+18 78074
## - num_voted_users 1 4.3845e+17 4.9039e+18 78172
##
## Step: AIC=77966.98
## gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Drama + Family + `Sci-Fi` +
## Western
##
## Df Sum of Sq RSS AIC
## <none> 4.4686e+18 77967
## - Western 1 4.2657e+15 4.4729e+18 77967
## + Horror 1 3.2316e+15 4.4654e+18 77967
## + Romance 1 3.1799e+15 4.4655e+18 77967
## + Musical 1 2.7491e+15 4.4659e+18 77968
## + Crime 1 1.9069e+15 4.4667e+18 77968
## + facenumber_in_poster 1 1.7368e+15 4.4669e+18 77968
## + History 1 1.7065e+15 4.4669e+18 77968
## + Documentary 1 1.2831e+15 4.4674e+18 77968
## + imdb_score 1 5.4383e+14 4.4681e+18 77969
## + Sport 1 5.3766e+14 4.4681e+18 77969
## + Thriller 1 3.8645e+14 4.4683e+18 77969
## + Mystery 1 1.9824e+14 4.4684e+18 77969
## + aspect_ratio 1 7.4767e+13 4.4686e+18 77969
## + Fantasy 1 7.1609e+13 4.4686e+18 77969
## + movie_facebook_likes 1 4.5236e+13 4.4686e+18 77969
## + Biography 1 1.9161e+13 4.4686e+18 77969
## + War 1 4.4471e+12 4.4686e+18 77969
## - `Sci-Fi` 1 1.0276e+16 4.4789e+18 77970
## - Adventure 1 2.5574e+16 4.4942e+18 77978
## - Animation 1 2.9419e+16 4.4981e+18 77979
## - num_user_for_reviews 1 2.9864e+16 4.4985e+18 77980
## - Action 1 3.0313e+16 4.4990e+18 77980
## - director_facebook_likes 1 3.0501e+16 4.4991e+18 77980
## - budget 1 3.6625e+16 4.5053e+18 77983
## - Drama 1 3.7633e+16 4.5063e+18 77984
## - Comedy 1 3.7954e+16 4.5066e+18 77984
## - Family 1 5.3487e+16 4.5221e+18 77991
## - title_year 1 6.3808e+16 4.5325e+18 77996
## - duration 1 6.6815e+16 4.5355e+18 77998
## - actor_3_facebook_likes 1 7.0152e+16 4.5388e+18 77999
## - country 2 1.4968e+17 4.6183e+18 78036
## - num_critic_for_reviews 1 1.8127e+17 4.6499e+18 78053
## - content_rating 4 2.4881e+17 4.7175e+18 78079
## - num_voted_users 1 4.6184e+17 4.9305e+18 78182
summary(reg.step)
##
## Call:
## lm(formula = gross ~ num_critic_for_reviews + duration + director_facebook_likes +
## actor_3_facebook_likes + num_voted_users + num_user_for_reviews +
## country + content_rating + budget + title_year + Action +
## Adventure + Animation + Comedy + Drama + Family + `Sci-Fi` +
## Western, data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340852907 -22273257 -2022764 17472968 450406303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.839e+09 3.362e+08 5.469 5.03e-08 ***
## num_critic_for_reviews 1.249e+05 1.326e+04 9.421 < 2e-16 ***
## duration 3.149e+05 5.505e+04 5.720 1.21e-08 ***
## director_facebook_likes -1.329e+03 3.439e+02 -3.865 0.000115 ***
## actor_3_facebook_likes 3.115e+03 5.316e+02 5.861 5.31e-09 ***
## num_voted_users 1.657e+02 1.102e+01 15.038 < 2e-16 ***
## num_user_for_reviews 1.506e+04 3.938e+03 3.824 0.000135 ***
## countryUK -2.080e+06 4.490e+06 -0.463 0.643267
## countryUSA 2.024e+07 3.052e+06 6.632 4.15e-11 ***
## content_ratingNC-17 -4.034e+07 1.723e+07 -2.341 0.019321 *
## content_ratingPG 2.285e+06 7.041e+06 0.325 0.745567
## content_ratingPG-13 3.308e+06 8.052e+06 0.411 0.681249
## content_ratingR -2.052e+07 8.017e+06 -2.560 0.010541 *
## budget 4.848e-02 1.145e-02 4.235 2.38e-05 ***
## title_year -9.392e+05 1.680e+05 -5.590 2.56e-08 ***
## Action 9.963e+06 2.586e+06 3.853 0.000120 ***
## Adventure 1.090e+07 3.080e+06 3.539 0.000411 ***
## Animation 2.127e+07 5.603e+06 3.795 0.000151 ***
## Comedy 9.915e+06 2.300e+06 4.311 1.70e-05 ***
## Drama -9.978e+06 2.324e+06 -4.293 1.84e-05 ***
## Family 2.707e+07 5.290e+06 5.118 3.37e-07 ***
## `Sci-Fi` -7.190e+06 3.205e+06 -2.243 0.024990 *
## Western -1.132e+07 7.831e+06 -1.445 0.148544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45190000 on 2188 degrees of freedom
## Multiple R-squared: 0.5776, Adjusted R-squared: 0.5734
## F-statistic: 136 on 22 and 2188 DF, p-value: < 2.2e-16
vif(reg.step)
## GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews 2.792028 1 1.670936
## duration 1.541081 1 1.241403
## director_facebook_likes 1.156544 1 1.075427
## actor_3_facebook_likes 1.099715 1 1.048673
## num_voted_users 2.899775 1 1.702873
## num_user_for_reviews 2.730457 1 1.652409
## country 1.067460 2 1.016454
## content_rating 3.368236 4 1.163926
## budget 1.197485 1 1.094297
## title_year 1.853400 1 1.361396
## Action 1.362788 1 1.167385
## Adventure 1.653866 1 1.286027
## Animation 1.880984 1 1.371490
## Comedy 1.372139 1 1.171384
## Drama 1.461923 1 1.209100
## Family 3.112236 1 1.764153
## `Sci-Fi` 1.226409 1 1.107434
## Western 1.034322 1 1.017016
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$gross)
## ME RMSE MAE MPE MAPE
## Test set -646609.8 47890105 29563770 10409.66 13770.11
par(mfrow=c(2,2))
plot(reg.step)
all.residuals <- (valid.df$gross - reg.step.pred)/10^6
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.pred, "Actual" = valid.df$gross,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 341348703 760505847 419.15714
## 2 202928599 309404152 106.47555
## 6 212255541 336530303 124.27476
## 8 270894608 458991599 188.09699
## 18 177001550 241063875 64.06232
## 20 149962221 255108370 105.14615
## 21 223505491 262030663 38.52517
## 24 128802190 70083519 -58.71867
## 25 189834993 218051260 28.21627
## 26 257703404 658672302 400.96890
## 30 228756733 304360277 75.60354
## 31 189792120 373377893 183.58577
## 32 250840534 408992272 158.15174
## 33 190501522 334185206 143.68368
## 42 136277311 116593191 -19.68412
## 43 242638182 414984497 172.34631
## 47 266079517 233914986 -32.16453
## 50 123354065 144812796 21.45873
## 52 189220019 101785482 -87.43454
## 57 253271582 223806889 -29.46469
Final Linear model with a R-square of 0.5734
library(rpart)
library(rpart.plot)
class.tree <- rpart(gross_catogorical ~. -gross, data = train.df, method = "class")
prp(class.tree, type = 1, extra = "auto", split.font = 1, varlen = 0)
fancyRpartPlot(class.tree)
class.tree.pred <- predict(class.tree, valid.df, type = "class")
#accuracy(class.tree, valid.df$gross)
confusionMatrix(class.tree.pred, as.factor(valid.df$gross_catogorical))
## Confusion Matrix and Statistics
##
## Reference
## Prediction (0,7] (7,66] (66,761]
## (0,7] 214 75 4
## (7,66] 146 582 146
## (66,761] 5 78 225
##
## Overall Statistics
##
## Accuracy : 0.6922
## 95% CI : (0.6679, 0.7157)
## No Information Rate : 0.4983
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4891
##
## Mcnemar's Test P-Value : 1.868e-09
##
## Statistics by Class:
##
## Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity 0.5863 0.7918 0.6000
## Specificity 0.9288 0.6054 0.9245
## Pos Pred Value 0.7304 0.6659 0.7305
## Neg Pred Value 0.8723 0.7454 0.8715
## Prevalence 0.2475 0.4983 0.2542
## Detection Rate 0.1451 0.3946 0.1525
## Detection Prevalence 0.1986 0.5925 0.2088
## Balanced Accuracy 0.7576 0.6986 0.7623
Classification tree accuracy = 0.6922
# Cross Validation
set.seed(2)
crossvalid_ct <- rpart(gross_catogorical ~. - gross, data = train.df, method = "class", cp = 0.001, minsplit = 5, xval = 5)
printcp(crossvalid_ct)
##
## Classification tree:
## rpart(formula = gross_catogorical ~ . - gross, data = train.df,
## method = "class", cp = 0.001, minsplit = 5, xval = 5)
##
## Variables actually used in tree construction:
## [1] actor_1_facebook_likes actor_2_facebook_likes
## [3] actor_3_facebook_likes budget
## [5] cast_total_facebook_likes Comedy
## [7] content_rating country
## [9] Crime director_facebook_likes
## [11] duration facenumber_in_poster
## [13] Family Fantasy
## [15] imdb_score movie_facebook_likes
## [17] num_critic_for_reviews num_user_for_reviews
## [19] num_voted_users Romance
## [21] title_year War
##
## Root node error: 1073/2211 = 0.4853
##
## n= 2211
##
## CP nsplit rel error xerror xstd
## 1 0.1081081 0 1.00000 1.00000 0.021902
## 2 0.0251631 3 0.67568 0.75023 0.021086
## 3 0.0195713 4 0.65051 0.71668 0.020871
## 4 0.0177074 5 0.63094 0.70177 0.020767
## 5 0.0158434 6 0.61323 0.70457 0.020787
## 6 0.0102516 7 0.59739 0.68313 0.020630
## 7 0.0083877 11 0.55638 0.67102 0.020536
## 8 0.0074557 12 0.54800 0.66449 0.020484
## 9 0.0069897 13 0.54054 0.65331 0.020392
## 10 0.0052811 16 0.51817 0.64306 0.020305
## 11 0.0046598 19 0.50233 0.63653 0.020248
## 12 0.0037279 20 0.49767 0.63653 0.020248
## 13 0.0034172 29 0.46319 0.63560 0.020240
## 14 0.0027959 35 0.43802 0.64119 0.020289
## 15 0.0023299 48 0.40168 0.64958 0.020360
## 16 0.0022367 52 0.39236 0.66542 0.020491
## 17 0.0018639 67 0.35508 0.66636 0.020499
## 18 0.0016775 106 0.28239 0.67661 0.020580
## 19 0.0015533 111 0.27400 0.68500 0.020644
## 20 0.0013979 119 0.26002 0.68220 0.020623
## 21 0.0013048 140 0.23020 0.68406 0.020637
## 22 0.0012426 146 0.22181 0.68966 0.020679
## 23 0.0010000 168 0.19012 0.70643 0.020800
# prune tree with the smallest tree within 1 xstd of min. error (13 split is the best)
pruned_ct <- prune(crossvalid_ct,
cp = 0.0069897)
length(pruned_ct$frame$var[pruned_ct$frame$var == "<leaf>"])
## [1] 17
prp(pruned_ct)
fancyRpartPlot(pruned_ct)
# Apply prune tree model with prediction
pruned_ct_pred <- predict(pruned_ct, train.df, type = "class")
confusionMatrix(pruned_ct_pred, train.df$gross_catogorical)
## Confusion Matrix and Statistics
##
## Reference
## Prediction (0,7] (7,66] (66,761]
## (0,7] 355 98 5
## (7,66] 154 889 138
## (66,761] 10 151 411
##
## Overall Statistics
##
## Accuracy : 0.7485
## 95% CI : (0.7299, 0.7665)
## No Information Rate : 0.5147
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5889
##
## Mcnemar's Test P-Value : 0.002096
##
## Statistics by Class:
##
## Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity 0.6840 0.7812 0.7419
## Specificity 0.9391 0.7279 0.9028
## Pos Pred Value 0.7751 0.7528 0.7185
## Neg Pred Value 0.9064 0.7583 0.9128
## Prevalence 0.2347 0.5147 0.2506
## Detection Rate 0.1606 0.4021 0.1859
## Detection Prevalence 0.2071 0.5341 0.2587
## Balanced Accuracy 0.8116 0.7545 0.8224
pruned_ct_pred <- predict(pruned_ct, valid.df, type = "class")
confusionMatrix(pruned_ct_pred, valid.df$gross_catogorical)
## Confusion Matrix and Statistics
##
## Reference
## Prediction (0,7] (7,66] (66,761]
## (0,7] 232 83 5
## (7,66] 122 546 123
## (66,761] 11 106 247
##
## Overall Statistics
##
## Accuracy : 0.6949
## 95% CI : (0.6707, 0.7183)
## No Information Rate : 0.4983
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.505
##
## Mcnemar's Test P-Value : 0.0121
##
## Statistics by Class:
##
## Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity 0.6356 0.7429 0.6587
## Specificity 0.9207 0.6689 0.8936
## Pos Pred Value 0.7250 0.6903 0.6786
## Neg Pred Value 0.8848 0.7237 0.8848
## Prevalence 0.2475 0.4983 0.2542
## Detection Rate 0.1573 0.3702 0.1675
## Detection Prevalence 0.2169 0.5363 0.2468
## Balanced Accuracy 0.7782 0.7059 0.7762
Accuracy for testing dataset is 0.7752 Accuracy for validation dataset is 0.6949
library(FNN)
# initialize normalized training, validation, test data, complete data frames to originals
train.norm <- train.df
valid.norm <- valid.df
# use preProcess() from the caret package to normalize predictors.
norm.values <- preProcess(train.df[, -39-6], method=c("center", "scale"))
train.norm[, -39-6] <- predict(norm.values, train.df[, -39-6])
valid.norm[, -39-6] <- predict(norm.values, valid.df[, -39-6])
valid.norm <- valid.norm[ -c(11,12,6) ]
train.norm <- train.norm[ -c(11,12,6) ]
#Find the best k
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 36, 1), accuracy = rep(0, 36))
# compute knn for different k on validation data.
for(i in 1:36) {
knn.pred <- knn(train.norm[, -36], valid.norm[, -36],
cl = train.norm[, 36], k = i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm[, 36])$overall[1]
}
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
accuracy.df
## k accuracy
## 1 1 0.5518644
## 2 2 0.5254237
## 3 3 0.5708475
## 4 4 0.5844068
## 5 5 0.5803390
## 6 6 0.5911864
## 7 7 0.5945763
## 8 8 0.5925424
## 9 9 0.5884746
## 10 10 0.5871186
## 11 11 0.5877966
## 12 12 0.5966102
## 13 13 0.6061017
## 14 14 0.6122034
## 15 15 0.6020339
## 16 16 0.6108475
## 17 17 0.6033898
## 18 18 0.6027119
## 19 19 0.5979661
## 20 20 0.6033898
## 21 21 0.6013559
## 22 22 0.5993220
## 23 23 0.6054237
## 24 24 0.6020339
## 25 25 0.6013559
## 26 26 0.6040678
## 27 27 0.6013559
## 28 28 0.6040678
## 29 29 0.5986441
## 30 30 0.6006780
## 31 31 0.5932203
## 32 32 0.5945763
## 33 33 0.5945763
## 34 34 0.5925424
## 35 35 0.5938983
## 36 36 0.5972881
#plot accuracy for each K
ggplot(accuracy.df, aes(y = accuracy, x = k)) + geom_point() + geom_line()
knn.pred <- knn(train.norm[, -36], valid.norm[, -36],
cl = train.norm[, 36], k = 14)
confusionMatrix(knn.pred, valid.norm[, 36])
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction (0,7] (7,66] (66,761]
## (0,7] 142 90 11
## (7,66] 212 585 188
## (66,761] 11 60 176
##
## Overall Statistics
##
## Accuracy : 0.6122
## 95% CI : (0.5868, 0.6372)
## No Information Rate : 0.4983
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3358
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity 0.38904 0.7959 0.4693
## Specificity 0.90901 0.4595 0.9355
## Pos Pred Value 0.58436 0.5939 0.7126
## Neg Pred Value 0.81899 0.6939 0.8379
## Prevalence 0.24746 0.4983 0.2542
## Detection Rate 0.09627 0.3966 0.1193
## Detection Prevalence 0.16475 0.6678 0.1675
## Balanced Accuracy 0.64903 0.6277 0.7024
#From the accuracy result, we found that the best k is 5. However, the highest accuracy is only 0.438
From the accuracy result, we found that the best k is 14. However, the highest accuracy is only 0.6122
library(ggthemes)
# Change point shapes and line types by groups
ggplot(accuracy.df, aes(y = accuracy, x = k))+
geom_point(color="yellow")+
labs(title="Plot Accuracy for each K Value",x="K Value", y = "Accuracy")+
theme_economist() +
scale_color_economist()+
geom_step(color="yellow")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(5)
# I removed 'Sci-Fi' column because the system said it could not find this column.
rf <- randomForest(gross_catogorical ~ . -gross -`Sci-Fi`- Documentary - Western - Musical - History - Sport - War - Biography - Mystery - Animation - Fantasy, data = train.df, mtry = 10, ntree = 1500)
# Show model error (We can see 500 tree is sufficient enough)
plot(rf)
legend('topright', colnames(rf$err.rate), col=1:4, fill=1:4)
#tuning mtry ( Number of variables randomly sampled as candidates at each split)
tune_rf <- tuneRF(train.df[,-c(39,6)], train.df[,39], mtrystart = 5, ntreeTry = 500, stepFactor=1.5, trace = TRUE, plot = TRUE, dobest = TRUE)
## mtry = 6 OOB error = 25.37%
## Searching left ...
## mtry = 4 OOB error = 25.64%
## -0.01069519 0.05
## Searching right ...
## mtry = 9 OOB error = 25.19%
## 0.007130125 0.05
#install packages for further steps
library(ggthemes)
#relative variable importance by plotting the mean decrease in Gini calculated across all trees
# Get importance
importance <- importance(rf)
# ??? what is MeanDcreaseGini???
varImportance <- data.frame(Variables = row.names(importance),
Importance = round(importance[ ,'MeanDecreaseGini'],2))
# Create a rank variable based on importance
rankImportance <- varImportance %>%
mutate(Rank = paste0('#',dense_rank(desc(Importance))))
# Use ggplot2 to visualize the relative importance of variables
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
y = Importance, fill = Importance)) +
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'red') +
labs(x = 'Variables') +
coord_flip() +
theme_few()
# Apply Model
set.seed(632)
# apply model on validation set
rf.pred.valid <- predict(rf, valid.df)
# generate confusion matrix for validation data
confusionMatrix(rf.pred.valid, valid.df$gross_catogorical)
## Confusion Matrix and Statistics
##
## Reference
## Prediction (0,7] (7,66] (66,761]
## (0,7] 249 57 2
## (7,66] 113 613 127
## (66,761] 3 65 246
##
## Overall Statistics
##
## Accuracy : 0.7512
## 95% CI : (0.7283, 0.7731)
## No Information Rate : 0.4983
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5894
##
## Mcnemar's Test P-Value : 2.041e-08
##
## Statistics by Class:
##
## Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity 0.6822 0.8340 0.6560
## Specificity 0.9468 0.6757 0.9382
## Pos Pred Value 0.8084 0.7186 0.7834
## Neg Pred Value 0.9006 0.8039 0.8889
## Prevalence 0.2475 0.4983 0.2542
## Detection Rate 0.1688 0.4156 0.1668
## Detection Prevalence 0.2088 0.5783 0.2129
## Balanced Accuracy 0.8145 0.7548 0.7971
Random Forest accuracy = 0.7512
# Create data frame with profit
final_df_profit <- final_df
final_df_profit$profitable <- as.factor(ifelse(final_df$gross-final_df$budget > 0, 1 ,0))
# partition data
set.seed(3) # set seed for reproducing the partition
train.index <- sample(c(1:3686), 3686*0.6)
#Create and set aside the remaining 40% of the data, to be used after omitting unhelpful data points and unnecessary variables.
train.df.logistic <- final_df_profit[train.index,]
valid.df.logistic <- final_df_profit[-train.index,]
# Try using logistic model to predict wether a movie will profit or not (Profit = 1, Did not profit = 0)
logistic_reg <- glm(profitable~. - gross - gross_catogorical, data = train.df.logistic, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
##
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical, family = "binomial",
## data = train.df.logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2825 -0.9054 0.1644 0.9050 2.3584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.222e+02 1.932e+01 6.327 2.50e-10 ***
## num_critic_for_reviews 2.561e-03 9.408e-04 2.723 0.00648 **
## duration -4.634e-03 3.549e-03 -1.306 0.19161
## director_facebook_likes -6.474e-05 2.038e-05 -3.177 0.00149 **
## actor_3_facebook_likes -9.209e-05 8.556e-05 -1.076 0.28179
## actor_1_facebook_likes -1.212e-04 5.521e-05 -2.195 0.02817 *
## num_voted_users 1.111e-05 1.334e-06 8.329 < 2e-16 ***
## cast_total_facebook_likes 1.162e-04 5.518e-05 2.105 0.03526 *
## facenumber_in_poster 4.203e-02 2.845e-02 1.477 0.13958
## num_user_for_reviews 3.613e-04 2.989e-04 1.209 0.22680
## countryUK -3.157e-02 2.419e-01 -0.130 0.89618
## countryUSA 1.013e+00 1.743e-01 5.812 6.17e-09 ***
## content_ratingNC-17 -7.967e-01 9.670e-01 -0.824 0.40998
## content_ratingPG 1.824e-01 4.047e-01 0.451 0.65230
## content_ratingPG-13 -2.681e-01 4.575e-01 -0.586 0.55790
## content_ratingR -9.660e-01 4.582e-01 -2.109 0.03499 *
## budget -1.883e-08 2.192e-09 -8.590 < 2e-16 ***
## title_year -6.189e-02 9.629e-03 -6.428 1.29e-10 ***
## actor_2_facebook_likes -1.230e-04 5.939e-05 -2.071 0.03837 *
## imdb_score 2.075e-01 7.229e-02 2.870 0.00410 **
## aspect_ratio -3.657e-02 1.079e-01 -0.339 0.73458
## movie_facebook_likes 1.472e-06 4.390e-06 0.335 0.73744
## Action 3.988e-02 1.534e-01 0.260 0.79487
## Adventure -1.843e-01 1.691e-01 -1.090 0.27559
## Animation 6.898e-02 3.086e-01 0.224 0.82314
## Biography 6.661e-02 2.430e-01 0.274 0.78398
## Comedy 3.599e-01 1.368e-01 2.630 0.00853 **
## Crime -2.366e-01 1.500e-01 -1.578 0.11463
## Documentary 9.126e-01 4.445e-01 2.053 0.04004 *
## Drama -1.602e-01 1.360e-01 -1.178 0.23887
## Family 7.028e-01 2.863e-01 2.455 0.01409 *
## Fantasy -3.485e-01 1.769e-01 -1.970 0.04880 *
## History -3.397e-04 3.026e-01 -0.001 0.99910
## Horror 1.047e+00 2.160e-01 4.849 1.24e-06 ***
## Musical -4.638e-01 3.575e-01 -1.297 0.19453
## Mystery -6.504e-02 1.792e-01 -0.363 0.71662
## Romance 2.818e-01 1.298e-01 2.171 0.02992 *
## `Sci-Fi` -8.684e-01 1.802e-01 -4.819 1.44e-06 ***
## Sport -2.819e-01 2.716e-01 -1.038 0.29920
## Thriller 1.193e-01 1.453e-01 0.821 0.41166
## War -2.531e-01 3.303e-01 -0.766 0.44350
## Western -6.909e-01 4.646e-01 -1.487 0.13702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3053.4 on 2210 degrees of freedom
## Residual deviance: 2354.0 on 2169 degrees of freedom
## AIC: 2438
##
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
## GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews 4.336826 1 2.082505
## duration 1.910592 1 1.382242
## director_facebook_likes 1.115568 1 1.056205
## actor_3_facebook_likes 7.081803 1 2.661166
## actor_1_facebook_likes 423.623912 1 20.582126
## num_voted_users 4.358102 1 2.087607
## cast_total_facebook_likes 535.683721 1 23.144842
## facenumber_in_poster 1.152229 1 1.073419
## num_user_for_reviews 2.795533 1 1.671985
## country 1.188991 2 1.044226
## content_rating 4.102662 4 1.192980
## budget 3.093648 1 1.758877
## title_year 2.064589 1 1.436868
## actor_2_facebook_likes 19.762006 1 4.445448
## imdb_score 1.957286 1 1.399030
## aspect_ratio 1.095206 1 1.046521
## movie_facebook_likes 2.006638 1 1.416558
## Action 1.738799 1 1.318635
## Adventure 1.703976 1 1.305364
## Animation 1.719364 1 1.311245
## Biography 1.327803 1 1.152303
## Comedy 1.812076 1 1.346134
## Crime 1.420812 1 1.191978
## Documentary 1.171711 1 1.082456
## Drama 1.843044 1 1.357587
## Family 3.297168 1 1.815811
## Fantasy 1.331065 1 1.153718
## History 1.347422 1 1.160785
## Horror 1.507041 1 1.227616
## Musical 1.110534 1 1.053819
## Mystery 1.175515 1 1.084212
## Romance 1.223927 1 1.106312
## `Sci-Fi` 1.283822 1 1.133059
## Sport 1.126163 1 1.061208
## Thriller 1.787914 1 1.337129
## War 1.238787 1 1.113008
## Western 1.047854 1 1.023647
logistic_reg <- glm(profitable ~. - gross - gross_catogorical - facenumber_in_poster - content_rating - aspect_ratio -movie_facebook_likes - Action -Adventure -Animation -Biography -History - Mystery - Sport - Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - duration - Musical, data = train.df.logistic, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
##
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical - facenumber_in_poster -
## content_rating - aspect_ratio - movie_facebook_likes - Action -
## Adventure - Animation - Biography - History - Mystery - Sport -
## Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes -
## cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews -
## duration - Musical, family = "binomial", data = train.df.logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2428 -0.9385 0.1826 0.9414 2.1974
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.108e+02 1.777e+01 6.238 4.42e-10 ***
## num_critic_for_reviews 2.478e-03 8.204e-04 3.020 0.002525 **
## director_facebook_likes -6.588e-05 1.987e-05 -3.315 0.000917 ***
## num_voted_users 1.154e-05 1.126e-06 10.248 < 2e-16 ***
## countryUK 1.201e-01 2.332e-01 0.515 0.606468
## countryUSA 1.076e+00 1.690e-01 6.367 1.92e-10 ***
## budget -1.661e-08 1.789e-09 -9.287 < 2e-16 ***
## title_year -5.642e-02 8.859e-03 -6.369 1.90e-10 ***
## imdb_score 1.101e-01 6.573e-02 1.676 0.093830 .
## Comedy 3.787e-01 1.191e-01 3.180 0.001473 **
## Crime -3.399e-01 1.321e-01 -2.572 0.010106 *
## Documentary 9.266e-01 4.265e-01 2.173 0.029813 *
## Drama -2.435e-01 1.259e-01 -1.935 0.052977 .
## Family 1.189e+00 1.839e-01 6.467 9.97e-11 ***
## Fantasy -3.031e-01 1.705e-01 -1.778 0.075447 .
## Horror 8.879e-01 2.014e-01 4.407 1.05e-05 ***
## Romance 3.304e-01 1.233e-01 2.680 0.007364 **
## `Sci-Fi` -7.666e-01 1.707e-01 -4.490 7.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3053.4 on 2210 degrees of freedom
## Residual deviance: 2421.2 on 2193 degrees of freedom
## AIC: 2457.2
##
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
## GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews 3.430066 1 1.852044
## director_facebook_likes 1.095915 1 1.046859
## num_voted_users 3.208795 1 1.791311
## country 1.112021 2 1.026900
## budget 2.192059 1 1.480560
## title_year 1.805832 1 1.343812
## imdb_score 1.661448 1 1.288972
## Comedy 1.417410 1 1.190550
## Crime 1.140841 1 1.068101
## Documentary 1.123695 1 1.060045
## Drama 1.633249 1 1.277986
## Family 1.414124 1 1.189170
## Fantasy 1.280177 1 1.131449
## Horror 1.349041 1 1.161482
## Romance 1.148672 1 1.071761
## `Sci-Fi` 1.198800 1 1.094897
#Apply logistics regression model
logistic_reg_pred <- predict(logistic_reg, valid.df.logistic[, - 40], type = "response")
confusionMatrix(as.factor(ifelse(logistic_reg_pred > 0.5 ,1, 0)), as.factor(valid.df.logistic$profitable))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 505 204
## 1 212 554
##
## Accuracy : 0.718
## 95% CI : (0.6942, 0.7408)
## No Information Rate : 0.5139
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4353
##
## Mcnemar's Test P-Value : 0.7314
##
## Sensitivity : 0.7043
## Specificity : 0.7309
## Pos Pred Value : 0.7123
## Neg Pred Value : 0.7232
## Prevalence : 0.4861
## Detection Rate : 0.3424
## Detection Prevalence : 0.4807
## Balanced Accuracy : 0.7176
##
## 'Positive' Class : 0
##
Logistic regression Accuracy = 0.718
logistic_reg <- glm(profitable ~. - gross - gross_catogorical - facenumber_in_poster - content_rating - aspect_ratio -movie_facebook_likes - Action -Adventure -Animation -Biography -History - Mystery - Sport - Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - duration - Musical, data = train.df.logistic[-c(647,2817),], family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
##
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical - facenumber_in_poster -
## content_rating - aspect_ratio - movie_facebook_likes - Action -
## Adventure - Animation - Biography - History - Mystery - Sport -
## Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes -
## cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews -
## duration - Musical, family = "binomial", data = train.df.logistic[-c(647,
## 2817), ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2411 -0.9376 0.1828 0.9416 2.1976
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.111e+02 1.777e+01 6.250 4.11e-10 ***
## num_critic_for_reviews 2.473e-03 8.203e-04 3.014 0.002578 **
## director_facebook_likes -6.606e-05 1.987e-05 -3.324 0.000886 ***
## num_voted_users 1.153e-05 1.126e-06 10.242 < 2e-16 ***
## countryUK 1.200e-01 2.332e-01 0.515 0.606834
## countryUSA 1.078e+00 1.690e-01 6.377 1.81e-10 ***
## budget -1.660e-08 1.789e-09 -9.282 < 2e-16 ***
## title_year -5.654e-02 8.861e-03 -6.381 1.76e-10 ***
## imdb_score 1.107e-01 6.573e-02 1.684 0.092225 .
## Comedy 3.768e-01 1.191e-01 3.164 0.001557 **
## Crime -3.424e-01 1.322e-01 -2.591 0.009581 **
## Documentary 9.237e-01 4.265e-01 2.166 0.030337 *
## Drama -2.426e-01 1.259e-01 -1.928 0.053884 .
## Family 1.188e+00 1.839e-01 6.461 1.04e-10 ***
## Fantasy -3.040e-01 1.705e-01 -1.783 0.074551 .
## Horror 8.861e-01 2.015e-01 4.398 1.09e-05 ***
## Romance 3.281e-01 1.233e-01 2.661 0.007792 **
## `Sci-Fi` -7.678e-01 1.707e-01 -4.497 6.88e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3051.8 on 2209 degrees of freedom
## Residual deviance: 2420.1 on 2192 degrees of freedom
## AIC: 2456.1
##
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
## GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews 3.429202 1 1.851810
## director_facebook_likes 1.096059 1 1.046929
## num_voted_users 3.208691 1 1.791282
## country 1.112211 2 1.026944
## budget 2.191782 1 1.480467
## title_year 1.804937 1 1.343479
## imdb_score 1.661278 1 1.288906
## Comedy 1.417260 1 1.190487
## Crime 1.141222 1 1.068280
## Documentary 1.123681 1 1.060038
## Drama 1.632540 1 1.277709
## Family 1.414042 1 1.189135
## Fantasy 1.280181 1 1.131451
## Horror 1.348977 1 1.161455
## Romance 1.148907 1 1.071871
## `Sci-Fi` 1.198888 1 1.094937
#Apply logistics regression model
logistic_reg_pred <- predict(logistic_reg, valid.df.logistic[, - 40], type = "response")
confusionMatrix(as.factor(ifelse(logistic_reg_pred > 0.5 ,1, 0)), as.factor(valid.df.logistic$profitable))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 505 204
## 1 212 554
##
## Accuracy : 0.718
## 95% CI : (0.6942, 0.7408)
## No Information Rate : 0.5139
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4353
##
## Mcnemar's Test P-Value : 0.7314
##
## Sensitivity : 0.7043
## Specificity : 0.7309
## Pos Pred Value : 0.7123
## Neg Pred Value : 0.7232
## Prevalence : 0.4861
## Detection Rate : 0.3424
## Detection Prevalence : 0.4807
## Balanced Accuracy : 0.7176
##
## 'Positive' Class : 0
##