#https://www.kaggle.com/carolzhangdc/predict-imdb-score-with-data-mining-algorithms

library(dplyr)
library(tidyverse)
library(forecast)
library(leaps)
library(forecast)
library(caret)
library(car)
library(data.table)
library(VIM)
library(corrplot)
library(ggplot2)
library(ggcorrplot)
library(plotly)
library(ggrepel)
library(caret)
library(ggthemes)
library(rpart)                      # Popular decision tree algorithm
library(rattle)                 # Fancy tree plot

Read Datasets and removing duplicate datas

library(readr)
movie_metadata <- read_csv("movie_metadata.csv")
sum(duplicated(movie_metadata))
## [1] 45
movie_metadata <- movie_metadata[!duplicated(movie_metadata),] #removing duplicate row
str(movie_metadata)
## Classes 'tbl_df', 'tbl' and 'data.frame':    4998 obs. of  28 variables:
##  $ color                    : chr  "Color" "Color" "Color" "Color" ...
##  $ director_name            : chr  "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
##  $ num_critic_for_reviews   : num  723 302 602 813 NA 462 392 324 635 375 ...
##  $ duration                 : num  178 169 148 164 NA 132 156 100 141 153 ...
##  $ director_facebook_likes  : num  0 563 0 22000 131 475 0 15 0 282 ...
##  $ actor_3_facebook_likes   : num  855 1000 161 23000 NA 530 4000 284 19000 10000 ...
##  $ actor_2_name             : chr  "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
##  $ actor_1_facebook_likes   : num  1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
##  $ gross                    : num  7.61e+08 3.09e+08 2.00e+08 4.48e+08 NA ...
##  $ genres                   : chr  "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
##  $ actor_1_name             : chr  "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
##  $ movie_title              : chr  "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
##  $ num_voted_users          : num  886204 471220 275868 1144337 8 ...
##  $ cast_total_facebook_likes: num  4834 48350 11700 106759 143 ...
##  $ actor_3_name             : chr  "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
##  $ facenumber_in_poster     : num  0 0 1 0 0 1 0 1 4 3 ...
##  $ plot_keywords            : chr  "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
##  $ movie_imdb_link          : chr  "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
##  $ num_user_for_reviews     : num  3054 1238 994 2701 NA ...
##  $ language                 : chr  "English" "English" "English" "English" ...
##  $ country                  : chr  "USA" "USA" "UK" "USA" ...
##  $ content_rating           : chr  "PG-13" "PG-13" "PG-13" "PG-13" ...
##  $ budget                   : num  2.37e+08 3.00e+08 2.45e+08 2.50e+08 NA ...
##  $ title_year               : num  2009 2007 2015 2012 NA ...
##  $ actor_2_facebook_likes   : num  936 5000 393 23000 12 632 11000 553 21000 11000 ...
##  $ imdb_score               : num  7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
##  $ aspect_ratio             : num  1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
##  $ movie_facebook_likes     : num  33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
#dealing with missing data 
sum(is.na(movie_metadata)) #2674 of null values
## [1] 2674
colSums(sapply(movie_metadata, is.na))
##                     color             director_name    num_critic_for_reviews 
##                        19                       103                        49 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                        15                       103                        23 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                        13                         7                       874 
##                    genres              actor_1_name               movie_title 
##                         0                         7                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                        23 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                        13                       152                         0 
##      num_user_for_reviews                  language                   country 
##                        21                        12                         5 
##            content_rating                    budget                title_year 
##                       301                       487                       107 
##    actor_2_facebook_likes                imdb_score              aspect_ratio 
##                        13                         0                       327 
##      movie_facebook_likes 
##                         0
missing.values <- aggr(movie_metadata, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2) #visualize missing value

## 
##  Variables sorted by number of missings: 
##                   Variable       Count
##                      gross 0.174869948
##                     budget 0.097438976
##               aspect_ratio 0.065426170
##             content_rating 0.060224090
##              plot_keywords 0.030412165
##                 title_year 0.021408563
##              director_name 0.020608243
##    director_facebook_likes 0.020608243
##     num_critic_for_reviews 0.009803922
##     actor_3_facebook_likes 0.004601841
##               actor_3_name 0.004601841
##       num_user_for_reviews 0.004201681
##                      color 0.003801521
##                   duration 0.003001200
##               actor_2_name 0.002601040
##       facenumber_in_poster 0.002601040
##     actor_2_facebook_likes 0.002601040
##                   language 0.002400960
##     actor_1_facebook_likes 0.001400560
##               actor_1_name 0.001400560
##                    country 0.001000400
##                     genres 0.000000000
##                movie_title 0.000000000
##            num_voted_users 0.000000000
##  cast_total_facebook_likes 0.000000000
##            movie_imdb_link 0.000000000
##                 imdb_score 0.000000000
##       movie_facebook_likes 0.000000000
#Gross is missing 17% or data and budget is 9% of data, hence we just have to remove them
movie_metadata <- movie_metadata[!is.na(movie_metadata$gross),]
movie_metadata <- movie_metadata[!is.na(movie_metadata$budget),]
colSums(sapply(movie_metadata, is.na))
##                     color             director_name    num_critic_for_reviews 
##                         2                         0                         1 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                         1                         0                        10 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                         5                         3                         0 
##                    genres              actor_1_name               movie_title 
##                         0                         3                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                        10 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                         6                        31                         0 
##      num_user_for_reviews                  language                   country 
##                         0                         3                         0 
##            content_rating                    budget                title_year 
##                        51                         0                         0 
##    actor_2_facebook_likes                imdb_score              aspect_ratio 
##                         5                         0                        74 
##      movie_facebook_likes 
##                         0
missing.values <- aggr(movie_metadata, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2) #visualize missing value

## 
##  Variables sorted by number of missings: 
##                   Variable        Count
##               aspect_ratio 0.0191858958
##             content_rating 0.0132227120
##              plot_keywords 0.0080373347
##     actor_3_facebook_likes 0.0025926886
##               actor_3_name 0.0025926886
##       facenumber_in_poster 0.0015556132
##               actor_2_name 0.0012963443
##     actor_2_facebook_likes 0.0012963443
##     actor_1_facebook_likes 0.0007778066
##               actor_1_name 0.0007778066
##                   language 0.0007778066
##                      color 0.0005185377
##     num_critic_for_reviews 0.0002592689
##                   duration 0.0002592689
##              director_name 0.0000000000
##    director_facebook_likes 0.0000000000
##                      gross 0.0000000000
##                     genres 0.0000000000
##                movie_title 0.0000000000
##            num_voted_users 0.0000000000
##  cast_total_facebook_likes 0.0000000000
##            movie_imdb_link 0.0000000000
##       num_user_for_reviews 0.0000000000
##                    country 0.0000000000
##                     budget 0.0000000000
##                 title_year 0.0000000000
##                 imdb_score 0.0000000000
##       movie_facebook_likes 0.0000000000
# aspect ratio still has 74 missing value, lets inspect that
table(movie_metadata$aspect_ratio)
## 
## 1.18 1.33 1.37  1.5 1.66 1.75 1.77 1.78 1.85    2  2.2 2.24 2.35 2.39  2.4 2.55 
##    1   19   50    1   40    2    1   41 1600    3   10    1 1995   11    3    1 
## 2.76   16 
##    3    1
movie_metadata$aspect_ratio[is.na(movie_metadata$aspect_ratio)] <- 0 # creplacing the null aspect ratio with 0 
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 1.85]) #checking the mean of gross 
## [1] 44123725
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 2.35])
## [1] 58306242
mean(movie_metadata$gross[movie_metadata$aspect_ratio != 1.85 & movie_metadata$aspect_ratio != 2.35])
## [1] 36073031
mean(movie_metadata$gross[movie_metadata$aspect_ratio == 0]) # We can tell that with aspect ratio being null value the gross will be significantly less. We will keep this as is aspect ratio has a hiarchical structure. The higher Aspecct ratio the higher the gross are. 
## [1] 2605095
summary(movie_metadata$gross)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       162   6754898  27829874  50912638  65452312 760505847
# deal with other missing variable
colSums(sapply(movie_metadata, is.na)) #lets start with content rating 
##                     color             director_name    num_critic_for_reviews 
##                         2                         0                         1 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                         1                         0                        10 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                         5                         3                         0 
##                    genres              actor_1_name               movie_title 
##                         0                         3                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                        10 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                         6                        31                         0 
##      num_user_for_reviews                  language                   country 
##                         0                         3                         0 
##            content_rating                    budget                title_year 
##                        51                         0                         0 
##    actor_2_facebook_likes                imdb_score              aspect_ratio 
##                         5                         0                         0 
##      movie_facebook_likes 
##                         0
table(movie_metadata$content_rating)
## 
##  Approved         G        GP         M     NC-17 Not Rated    Passed        PG 
##        17        91         1         2         6        42         3       573 
##     PG-13         R   Unrated         X 
##      1314      1723        24        10
movie_metadata <-movie_metadata [!is.na(movie_metadata$content_rating),] # removing null value in movie_metadata
sum(is.na(movie_metadata$content_rating)) # double checking the sum of missing value
## [1] 0
table(movie_metadata$content_rating)
## 
##  Approved         G        GP         M     NC-17 Not Rated    Passed        PG 
##        17        91         1         2         6        42         3       573 
##     PG-13         R   Unrated         X 
##      1314      1723        24        10
#replacing rating with the modern rating
movie_metadata$content_rating[movie_metadata$content_rating == "M"] <- "PG"
movie_metadata$content_rating[movie_metadata$content_rating == "GP"] <- "PG"
movie_metadata$content_rating[movie_metadata$content_rating == "X"] <- "NC-17"
table(movie_metadata$content_rating)
## 
##  Approved         G     NC-17 Not Rated    Passed        PG     PG-13         R 
##        17        91        16        42         3       576      1314      1723 
##   Unrated 
##        24
#replacing the rest of rating with the most common rating of R
movie_metadata$content_rating[movie_metadata$content_rating == "Approved"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Not Rated"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Passed"] <- "R"
movie_metadata$content_rating[movie_metadata$content_rating == "Unrated"] <- "R"
table(movie_metadata$content_rating)
## 
##     G NC-17    PG PG-13     R 
##    91    16   576  1314  1809
# Now lets look the rest
colSums(sapply(movie_metadata, is.na)) 
##                     color             director_name    num_critic_for_reviews 
##                         2                         0                         1 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                         0                         0                         6 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                         2                         1                         0 
##                    genres              actor_1_name               movie_title 
##                         0                         1                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                         6 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                         6                        21                         0 
##      num_user_for_reviews                  language                   country 
##                         0                         2                         0 
##            content_rating                    budget                title_year 
##                         0                         0                         0 
##    actor_2_facebook_likes                imdb_score              aspect_ratio 
##                         2                         0                         0 
##      movie_facebook_likes 
##                         0
# replace NA with column average for facenumber_in_poster
movie_metadata$facenumber_in_poster[is.na(movie_metadata$facenumber_in_poster)] <- round(mean(movie_metadata$facenumber_in_poster, na.rm = TRUE))
# convert 0s into NAs for other predictors
movie_metadata[,c(5,6,8,13,24,26)][movie_metadata[,c(5,6,8,13,24,26)] == 0] <- NA
# impute missing value with column mean
movie_metadata$num_critic_for_reviews[is.na(movie_metadata$num_critic_for_reviews)] <- round(mean(movie_metadata$num_critic_for_reviews, na.rm = TRUE))
movie_metadata$duration[is.na(movie_metadata$duration)] <- round(mean(movie_metadata$duration, na.rm = TRUE))
movie_metadata$director_facebook_likes[is.na(movie_metadata$director_facebook_likes)] <- round(mean(movie_metadata$director_facebook_likes, na.rm = TRUE))
movie_metadata$actor_3_facebook_likes[is.na(movie_metadata$actor_3_facebook_likes)] <- round(mean(movie_metadata$actor_3_facebook_likes, na.rm = TRUE))
movie_metadata$actor_1_facebook_likes[is.na(movie_metadata$actor_1_facebook_likes)] <- round(mean(movie_metadata$actor_1_facebook_likes, na.rm = TRUE))
movie_metadata$cast_total_facebook_likes[is.na(movie_metadata$cast_total_facebook_likes)] <- round(mean(movie_metadata$cast_total_facebook_likes, na.rm = TRUE))
movie_metadata$actor_2_facebook_likes[is.na(movie_metadata$actor_2_facebook_likes)] <- round(mean(movie_metadata$actor_2_facebook_likes, na.rm = TRUE))
movie_metadata$movie_facebook_likes[is.na(movie_metadata$movie_facebook_likes)] <- round(mean(movie_metadata$movie_facebook_likes, na.rm = TRUE))
colSums(sapply(movie_metadata, is.na)) #check out the result (We still have some left)
##                     color             director_name    num_critic_for_reviews 
##                         2                         0                         0 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                         0                         0                         0 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                         2                         0                         0 
##                    genres              actor_1_name               movie_title 
##                         0                         1                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                         6 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                         0                        21                         0 
##      num_user_for_reviews                  language                   country 
##                         0                         2                         0 
##            content_rating                    budget                title_year 
##                         0                         0                         0 
##    actor_2_facebook_likes                imdb_score              aspect_ratio 
##                         0                         0                         0 
##      movie_facebook_likes 
##                         0
# See does language matter 
table(movie_metadata$language) # We can tell mostly is English, hence it shouldn't matter to us.
## 
## Aboriginal     Arabic    Aramaic    Bosnian  Cantonese      Czech     Danish 
##          2          1          1          1          7          1          3 
##       Dari      Dutch    English   Filipino     French     German     Hebrew 
##          2          3       3644          1         34         11          2 
##      Hindi  Hungarian Indonesian    Italian   Japanese     Kazakh     Korean 
##          5          1          2          7         10          1          5 
##   Mandarin       Maya  Mongolian       None  Norwegian    Persian Portuguese 
##         14          1          1          1          4          3          5 
##   Romanian    Russian    Spanish       Thai Vietnamese       Zulu 
##          1          1         24          3          1          1
movie_metadata <- movie_metadata[,-c(20) ] # We drop language column as a result

# See does Color matter
colSums(sapply(movie_metadata, is.na)) 
##                     color             director_name    num_critic_for_reviews 
##                         2                         0                         0 
##                  duration   director_facebook_likes    actor_3_facebook_likes 
##                         0                         0                         0 
##              actor_2_name    actor_1_facebook_likes                     gross 
##                         2                         0                         0 
##                    genres              actor_1_name               movie_title 
##                         0                         1                         0 
##           num_voted_users cast_total_facebook_likes              actor_3_name 
##                         0                         0                         6 
##      facenumber_in_poster             plot_keywords           movie_imdb_link 
##                         0                        21                         0 
##      num_user_for_reviews                   country            content_rating 
##                         0                         0                         0 
##                    budget                title_year    actor_2_facebook_likes 
##                         0                         0                         0 
##                imdb_score              aspect_ratio      movie_facebook_likes 
##                         0                         0                         0
table(movie_metadata$color)
## 
## Black and White           Color 
##             124            3680
movie_metadata <- movie_metadata[,-1] # It does not matter, hence we decided to drop it. 

# There are 30 more missing value (Since it is so small (< 1%) compare to our sample, we decided to drop it.)
colSums(sapply(movie_metadata, is.na)) 
##             director_name    num_critic_for_reviews                  duration 
##                         0                         0                         0 
##   director_facebook_likes    actor_3_facebook_likes              actor_2_name 
##                         0                         0                         2 
##    actor_1_facebook_likes                     gross                    genres 
##                         0                         0                         0 
##              actor_1_name               movie_title           num_voted_users 
##                         1                         0                         0 
## cast_total_facebook_likes              actor_3_name      facenumber_in_poster 
##                         0                         6                         0 
##             plot_keywords           movie_imdb_link      num_user_for_reviews 
##                        21                         0                         0 
##                   country            content_rating                    budget 
##                         0                         0                         0 
##                title_year    actor_2_facebook_likes                imdb_score 
##                         0                         0                         0 
##              aspect_ratio      movie_facebook_likes 
##                         0                         0
sum(is.na(movie_metadata))
## [1] 30
movie_metadata <- na.omit(movie_metadata)
colSums(sapply(movie_metadata, is.na)) 
##             director_name    num_critic_for_reviews                  duration 
##                         0                         0                         0 
##   director_facebook_likes    actor_3_facebook_likes              actor_2_name 
##                         0                         0                         0 
##    actor_1_facebook_likes                     gross                    genres 
##                         0                         0                         0 
##              actor_1_name               movie_title           num_voted_users 
##                         0                         0                         0 
## cast_total_facebook_likes              actor_3_name      facenumber_in_poster 
##                         0                         0                         0 
##             plot_keywords           movie_imdb_link      num_user_for_reviews 
##                         0                         0                         0 
##                   country            content_rating                    budget 
##                         0                         0                         0 
##                title_year    actor_2_facebook_likes                imdb_score 
##                         0                         0                         0 
##              aspect_ratio      movie_facebook_likes 
##                         0                         0
# Now lets look at language 
table(movie_metadata$country) # We can tell most movie are from US and UK, we will create three catergory. (US, UK and Other)
## 
##    Afghanistan      Argentina          Aruba      Australia        Belgium 
##              1              3              1             40              1 
##         Brazil         Canada          Chile          China       Colombia 
##              5             60              1             13              1 
## Czech Republic        Denmark        Finland         France        Georgia 
##              3              9              1            102              1 
##        Germany         Greece      Hong Kong        Hungary        Iceland 
##             79              1             13              2              1 
##          India      Indonesia           Iran        Ireland         Israel 
##              5              1              4              7              2 
##          Italy          Japan         Mexico    Netherlands       New Line 
##             11             15              6              3              1 
##    New Zealand         Norway  Official site           Peru    Philippines 
##             11              4              1              1              1 
##         Poland        Romania         Russia   South Africa    South Korea 
##              1              2              3              3              8 
##          Spain         Taiwan       Thailand             UK            USA 
##             21              2              4            315           3008 
##   West Germany 
##              1
movie_metadata$country[movie_metadata$country != "USA"  & movie_metadata$country !="UK"] <- "Others"
table(movie_metadata$country) 
## 
## Others     UK    USA 
##    456    315   3008

Tidy up movie Title

library(stringr)
movie_metadata$movie_title <- gsub("Â", "", as.character(factor(movie_metadata$movie_title)))
str_trim(movie_metadata$movie_title, side = "right")

Work on Genres

# create a new data frame
genres.df <- as.data.frame(movie_metadata[,c("genres", "gross")])
# separate different genres into new columns
genres.df$Action <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Action") 1 else 0)
genres.df$Adventure <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Adventure") 1 else 0)
genres.df$Animation <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Animation") 1 else 0)
genres.df$Biography <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Biography") 1 else 0)
genres.df$Comedy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Comedy") 1 else 0)
genres.df$Crime <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Crime") 1 else 0)
genres.df$Documentary <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Documentary") 1 else 0)
genres.df$Drama <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Drama") 1 else 0)
genres.df$Family <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Family") 1 else 0)
genres.df$Fantasy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Fantasy") 1 else 0)
genres.df$`Film-Noir` <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Film-Noir") 1 else 0)
genres.df$History <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "History") 1 else 0)
genres.df$Horror <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Horror") 1 else 0)
genres.df$Musical <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Musical") 1 else 0)
genres.df$Mystery <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Mystery") 1 else 0)
genres.df$News <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "News") 1 else 0)
genres.df$Romance <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Romance") 1 else 0)
genres.df$`Sci-Fi` <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sci-Fi") 1 else 0)
genres.df$Short <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Short") 1 else 0)
genres.df$Sport <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sport") 1 else 0)
genres.df$Thriller <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Thriller") 1 else 0)
genres.df$War <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "War") 1 else 0)
genres.df$Western <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Western") 1 else 0)
# get the mean of Gross for different genres
means <- rep(0,23)
for (i in 1:23) {
  means[i] <- mean(genres.df$gross[genres.df[i+2]==1])
}
head(means)
## [1]  77289281 100214983 109074453  36175004  50842175  39192003
# plot the means
barplot(means/10^6, main = "Average gross for different genres")

head(genres.df$action)
## NULL
# We found out genre does matter to the Gross, hence we combine both df
movie_metadata <- cbind(movie_metadata, genres.df)
movie_metadata <- movie_metadata[, -c(9, 27, 28)]

looking at movie release year histogram

hist(movie_metadata$title_year) # find out movie release before 1980 is probably irrelevent, hence we remove any movie that is release before 1980

movie_metadata <- movie_metadata[movie_metadata$title_year>= 1980, ] 

Little visualization before predicting

movie_metadata %>%
  plot_ly(x = ~movie_facebook_likes, y = ~gross, color = ~content_rating , mode = "markers", text = ~content_rating, alpha = 0.7, type = "scatter")
movie_metadata$ROI <- round(movie_metadata$gross / movie_metadata$budget *100, 2)
movie_metadata$profitable <- ifelse(movie_metadata$ROI > 1, 1, 0)
summary(movie_metadata$ROI)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0     46.7    106.9    584.8    215.0 719448.6
summary(movie_metadata$profit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9718  1.0000  1.0000
summary(movie_metadata$gross)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       162   7667175  28902322  51809423  66438272 760505847
# Looking at the top 25 gross movie ROI
movie_metadata %>% 
  filter(budget > 100000) %>% 
  arrange(desc(gross)) %>%
  top_n(25, gross) %>%
  ggplot(aes(x = budget/10^6, y= ROI)) + geom_point() + geom_smooth() + geom_text_repel(aes(label = movie_title), size = 3) +
  labs(x = "Budget in Millions ($)", y = "ROI (%)", title = "Top 25 movie ROI base on gross" )+ 
  theme_economist() + 
  scale_color_economist()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Checking out the does the name of director and actors matter

uniqueN(movie_metadata$director_name)
## [1] 1643
uniqueN(movie_metadata$actor_1_name)
## [1] 1409
uniqueN(movie_metadata$actor_3_name)
## [1] 2557
uniqueN(movie_metadata$actor_2_name)
## [1] 2150
uniqueN(movie_metadata$plot_keywords)
## [1] 3621
# all of them are all different, hence it make no sense to use them to predict. We decided to drop movie title, plot_keywords and movie_imdb_link too. 
final_df <- subset(movie_metadata, select = -c(director_name, actor_2_name, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link))

Change catergorical variable to catergorical and finalize the final_df

#Checking for genre that is not usable and drop it
sum(uniqueN(final_df$Action))
## [1] 2
sum(uniqueN(final_df$Adventure))
## [1] 2
sum(uniqueN(final_df$Animation))
## [1] 2
sum(uniqueN(final_df$Biography))
## [1] 2
sum(uniqueN(final_df$Comedy))
## [1] 2
sum(uniqueN(final_df$Crime))
## [1] 2
sum(uniqueN(final_df$Documentary))
## [1] 2
sum(uniqueN(final_df$Drama))
## [1] 2
sum(uniqueN(final_df$Family))
## [1] 2
sum(uniqueN(final_df$Fantasy))
## [1] 2
sum(uniqueN(final_df$`Film-Noir`)) # Remove
## [1] 1
sum(uniqueN(final_df$History))
## [1] 2
sum(uniqueN(final_df$Horror))
## [1] 2
sum(uniqueN(final_df$Musical))
## [1] 2
sum(uniqueN(final_df$Mystery))
## [1] 2
sum(uniqueN(final_df$News)) # Remove
## [1] 1
sum(uniqueN(final_df$Romance))
## [1] 2
sum(uniqueN(final_df$`Sci-Fi`))
## [1] 2
sum(uniqueN(final_df$Short)) # Remove
## [1] 1
sum(uniqueN(final_df$Thriller))
## [1] 2
sum(uniqueN(final_df$War))
## [1] 2
sum(uniqueN(final_df$Western))
## [1] 2
final_df <- subset(final_df, select = -c(`Film-Noir`, News, Short))

# factor country and content rating
final_df$country <- (as.factor(final_df$country))
final_df$content_rating <- (as.factor(final_df$content_rating))
str(final_df)
## 'data.frame':    3686 obs. of  40 variables:
##  $ num_critic_for_reviews   : num  723 302 602 813 462 392 324 635 375 673 ...
##  $ duration                 : num  178 169 148 164 132 156 100 141 153 183 ...
##  $ director_facebook_likes  : num  959 563 959 22000 475 959 15 959 282 959 ...
##  $ actor_3_facebook_likes   : num  855 1000 161 23000 530 4000 284 19000 10000 2000 ...
##  $ actor_1_facebook_likes   : num  1000 40000 11000 27000 640 24000 799 26000 25000 15000 ...
##  $ gross                    : num  7.61e+08 3.09e+08 2.00e+08 4.48e+08 7.31e+07 ...
##  $ num_voted_users          : num  886204 471220 275868 1144337 212204 ...
##  $ cast_total_facebook_likes: num  4834 48350 11700 106759 1873 ...
##  $ facenumber_in_poster     : num  0 0 1 0 1 0 1 4 3 0 ...
##  $ num_user_for_reviews     : num  3054 1238 994 2701 738 ...
##  $ country                  : Factor w/ 3 levels "Others","UK",..: 3 3 2 3 3 3 3 3 2 3 ...
##  $ content_rating           : Factor w/ 5 levels "G","NC-17","PG",..: 4 4 4 4 4 4 3 4 3 4 ...
##  $ budget                   : num  2.37e+08 3.00e+08 2.45e+08 2.50e+08 2.64e+08 ...
##  $ title_year               : num  2009 2007 2015 2012 2012 ...
##  $ actor_2_facebook_likes   : num  936 5000 393 23000 632 11000 553 21000 11000 4000 ...
##  $ imdb_score               : num  7.9 7.1 6.8 8.5 6.6 6.2 7.8 7.5 7.5 6.9 ...
##  $ aspect_ratio             : num  1.78 2.35 2.35 2.35 2.35 2.35 1.85 2.35 2.35 2.35 ...
##  $ movie_facebook_likes     : num  33000 0 85000 164000 24000 0 29000 118000 10000 197000 ...
##  $ Action                   : num  1 1 1 1 1 1 0 1 0 1 ...
##  $ Adventure                : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Animation                : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Biography                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Comedy                   : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Crime                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Documentary              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Drama                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Family                   : num  0 0 0 0 0 0 1 0 1 0 ...
##  $ Fantasy                  : num  1 1 0 0 0 0 1 0 1 0 ...
##  $ History                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Horror                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Musical                  : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Mystery                  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Romance                  : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ Sci-Fi                   : num  1 0 0 0 1 0 0 1 0 1 ...
##  $ Sport                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Thriller                 : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ War                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ROI                      : num  320.9 103.1 81.7 179.2 27.7 ...
##  $ profitable               : num  1 1 1 1 1 1 1 1 1 1 ...
final_df <- subset(final_df, select = -c(ROI, profitable)) #removing ROI and Profitable column

# try basic lm
reg1 <- lm(gross ~ . , data = final_df)
summary(reg1)
## 
## Call:
## lm(formula = gross ~ ., data = final_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -352431861  -21863672   -2288584   16609028  451581160 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.607e+09  2.678e+08   5.998 2.19e-09 ***
## num_critic_for_reviews     1.173e+05  1.187e+04   9.880  < 2e-16 ***
## duration                   3.545e+05  4.369e+04   8.115 6.56e-16 ***
## director_facebook_likes   -1.466e+03  2.623e+02  -5.591 2.43e-08 ***
## actor_3_facebook_likes    -8.800e+03  1.105e+03  -7.964 2.20e-15 ***
## actor_1_facebook_likes    -7.596e+03  6.722e+02 -11.301  < 2e-16 ***
## num_voted_users            1.723e+02  9.803e+00  17.577  < 2e-16 ***
## cast_total_facebook_likes  7.555e+03  6.703e+02  11.270  < 2e-16 ***
## facenumber_in_poster      -6.501e+05  3.819e+05  -1.702 0.088795 .  
## num_user_for_reviews       1.783e+04  3.193e+03   5.584 2.52e-08 ***
## countryUK                 -7.906e+05  3.375e+06  -0.234 0.814819    
## countryUSA                 1.545e+07  2.346e+06   6.586 5.17e-11 ***
## content_ratingNC-17       -2.991e+07  1.361e+07  -2.197 0.028059 *  
## content_ratingPG           3.271e+06  5.824e+06   0.562 0.574369    
## content_ratingPG-13        6.601e+06  6.539e+06   1.010 0.312789    
## content_ratingR           -1.554e+07  6.549e+06  -2.373 0.017701 *  
## budget                     5.707e-03  3.266e-03   1.747 0.080669 .  
## title_year                -8.231e+05  1.333e+05  -6.173 7.45e-10 ***
## actor_2_facebook_likes    -7.023e+03  7.122e+02  -9.861  < 2e-16 ***
## imdb_score                -6.373e+05  9.928e+05  -0.642 0.520967    
## aspect_ratio              -3.154e+05  1.837e+06  -0.172 0.863713    
## movie_facebook_likes      -8.589e+01  5.079e+01  -1.691 0.090894 .  
## Action                     8.479e+06  2.149e+06   3.946 8.11e-05 ***
## Adventure                  1.185e+07  2.356e+06   5.032 5.08e-07 ***
## Animation                  2.099e+07  4.356e+06   4.817 1.51e-06 ***
## Biography                 -1.543e+06  3.406e+06  -0.453 0.650554    
## Comedy                     7.262e+06  2.008e+06   3.617 0.000302 ***
## Crime                     -4.187e+06  2.212e+06  -1.893 0.058464 .  
## Documentary                1.262e+07  6.756e+06   1.868 0.061891 .  
## Drama                     -1.105e+07  1.956e+06  -5.649 1.74e-08 ***
## Family                     2.859e+07  3.997e+06   7.151 1.03e-12 ***
## Fantasy                    2.481e+06  2.414e+06   1.028 0.304094    
## History                   -7.452e+06  4.360e+06  -1.709 0.087526 .  
## Horror                    -6.120e+06  2.962e+06  -2.066 0.038914 *  
## Musical                   -4.368e+06  5.286e+06  -0.826 0.408680    
## Mystery                   -1.591e+06  2.638e+06  -0.603 0.546362    
## Romance                    3.840e+06  1.930e+06   1.989 0.046737 *  
## `Sci-Fi`                  -7.444e+06  2.470e+06  -3.013 0.002602 ** 
## Sport                      2.449e+06  3.883e+06   0.631 0.528296    
## Thriller                   2.486e+06  2.103e+06   1.182 0.237195    
## War                        3.321e+06  4.165e+06   0.797 0.425306    
## Western                   -1.059e+07  6.317e+06  -1.676 0.093821 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44340000 on 3644 degrees of freedom
## Multiple R-squared:  0.599,  Adjusted R-squared:  0.5945 
## F-statistic: 132.7 on 41 and 3644 DF,  p-value: < 2.2e-16

Checking for final_df correlation

final_df_matrix <- as.matrix(final_df[, c(-11,-12,-39)]) #getting rid of factor variable (country, content rating and gross_category)
m <- cor(final_df_matrix)
corrplot(m, method = "circle", type = "upper", order = "hclust")

Start predicting

# Cut gross in to percentile (cut it by 0 to 25 percentile,25 to 75 percentile, Above 75 percentile)
summary(final_df$gross)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       162   7667175  28902322  51809423  66438272 760505847
final_df$gross_catogorical <- cut(final_df$gross/10^6, breaks = c(0, 7, 66, 761)) # change gross in millions
sum(is.na(final_df$gross_catogorical)) # double checking the completeness of data
## [1] 0
# partition data
set.seed(1)  # set seed for reproducing the partition
train.index <- sample(c(1:3686), 3686*0.6)  

#Create and set aside the remaining 40% of the data, to be used after omitting unhelpful data points and unnecessary variables.
train.df <- final_df[train.index,]
valid.df <- final_df[-train.index,]

library(rpart)

Linear model (Don’t use this model, since our target variable are now factor variable)

reg2 <- lm(gross ~ . , data = train.df)
summary(reg2)
## 
## Call:
## lm(formula = gross ~ ., data = train.df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -220666456  -14855093     469990   12121669  433329462 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.742e+08  2.843e+08   2.723  0.00652 ** 
## num_critic_for_reviews     5.344e+04  1.297e+04   4.121 3.91e-05 ***
## duration                   7.498e+04  4.632e+04   1.619  0.10564    
## director_facebook_likes   -1.103e+03  2.717e+02  -4.058 5.13e-05 ***
## actor_3_facebook_likes    -8.193e+03  1.139e+03  -7.192 8.75e-13 ***
## actor_1_facebook_likes    -7.209e+03  6.855e+02 -10.516  < 2e-16 ***
## num_voted_users            9.902e+01  9.928e+00   9.974  < 2e-16 ***
## cast_total_facebook_likes  7.124e+03  6.844e+02  10.408  < 2e-16 ***
## facenumber_in_poster      -1.892e+05  4.118e+05  -0.459  0.64597    
## num_user_for_reviews       1.346e+04  3.258e+03   4.132 3.74e-05 ***
## countryUK                 -5.033e+05  3.559e+06  -0.141  0.88755    
## countryUSA                 4.982e+06  2.532e+06   1.968  0.04923 *  
## content_ratingNC-17       -5.453e+06  1.380e+07  -0.395  0.69289    
## content_ratingPG           6.331e+06  5.718e+06   1.107  0.26838    
## content_ratingPG-13        1.109e+07  6.553e+06   1.692  0.09086 .  
## content_ratingR           -1.501e+05  6.609e+06  -0.023  0.98189    
## budget                     2.543e-02  9.090e-03   2.798  0.00519 ** 
## title_year                -3.978e+05  1.415e+05  -2.811  0.00499 ** 
## actor_2_facebook_likes    -7.289e+03  7.208e+02 -10.112  < 2e-16 ***
## imdb_score                -2.214e+05  1.031e+06  -0.215  0.82996    
## aspect_ratio              -1.241e+06  1.729e+06  -0.718  0.47309    
## movie_facebook_likes       8.997e+01  5.518e+01   1.630  0.10316    
## Action                     3.320e+06  2.218e+06   1.497  0.13460    
## Adventure                  1.111e+07  2.514e+06   4.419 1.04e-05 ***
## Animation                  1.901e+07  4.549e+06   4.178 3.06e-05 ***
## Biography                  6.611e+05  3.575e+06   0.185  0.85331    
## Comedy                     2.055e+06  2.123e+06   0.968  0.33331    
## Crime                     -1.879e+06  2.338e+06  -0.804  0.42156    
## Documentary                1.050e+07  6.868e+06   1.528  0.12655    
## Drama                     -2.059e+06  2.073e+06  -0.993  0.32070    
## Family                     1.169e+07  4.316e+06   2.710  0.00679 ** 
## Fantasy                    3.309e+06  2.519e+06   1.313  0.18919    
## History                   -2.166e+06  4.485e+06  -0.483  0.62927    
## Horror                    -4.988e+05  3.104e+06  -0.161  0.87232    
## Musical                   -9.219e+05  5.304e+06  -0.174  0.86203    
## Mystery                    4.281e+05  2.837e+06   0.151  0.88008    
## Romance                    2.912e+06  1.990e+06   1.463  0.14366    
## `Sci-Fi`                  -1.527e+06  2.622e+06  -0.582  0.56043    
## Sport                      3.114e+06  3.927e+06   0.793  0.42800    
## Thriller                  -4.775e+05  2.216e+06  -0.215  0.82941    
## War                       -3.906e+06  4.204e+06  -0.929  0.35294    
## Western                   -8.329e+06  6.226e+06  -1.338  0.18112    
## gross_catogorical(7,66]    1.009e+07  2.150e+06   4.691 2.89e-06 ***
## gross_catogorical(66,761]  8.641e+07  3.006e+06  28.750  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35520000 on 2167 degrees of freedom
## Multiple R-squared:  0.7416, Adjusted R-squared:  0.7364 
## F-statistic: 144.6 on 43 and 2167 DF,  p-value: < 2.2e-16
vif(reg2)
##                                 GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews      4.322856  1        2.079148
## duration                    1.766013  1        1.328914
## director_facebook_likes     1.168806  1        1.081113
## actor_3_facebook_likes      8.175606  1        2.859302
## actor_1_facebook_likes    271.233874  1       16.469180
## num_voted_users             3.812219  1        1.952490
## cast_total_facebook_likes 383.393845  1       19.580445
## facenumber_in_poster        1.142121  1        1.068701
## num_user_for_reviews        3.024278  1        1.739045
## country                     1.247509  2        1.056844
## content_rating              4.482774  4        1.206267
## budget                      1.221955  1        1.105421
## title_year                  2.128414  1        1.458909
## actor_2_facebook_likes     21.846923  1        4.674069
## imdb_score                  2.033370  1        1.425963
## aspect_ratio                1.114447  1        1.055674
## movie_facebook_likes        2.251619  1        1.500540
## Action                      1.623187  1        1.274043
## Adventure                   1.782783  1        1.335209
## Animation                   2.007143  1        1.416737
## Biography                   1.275232  1        1.129262
## Comedy                      1.893194  1        1.375934
## Crime                       1.424992  1        1.193730
## Documentary                 1.178901  1        1.085772
## Drama                       1.882076  1        1.371888
## Family                      3.352694  1        1.831036
## Fantasy                     1.311481  1        1.145199
## History                     1.303287  1        1.141616
## Horror                      1.524613  1        1.234752
## Musical                     1.153285  1        1.073911
## Mystery                     1.207384  1        1.098810
## Romance                     1.238638  1        1.112941
## `Sci-Fi`                    1.328006  1        1.152391
## Sport                       1.100352  1        1.048977
## Thriller                    1.751239  1        1.323344
## War                         1.234893  1        1.111258
## Western                     1.058230  1        1.028703
## gross_catogorical           2.118734  2        1.206477
reg2 <- lm(gross ~ . -actor_1_facebook_likes -cast_total_facebook_likes -actor_2_facebook_likes -gross_catogorical , data = train.df) # Removed colinear variables

reg.step <- step(reg2, direction = "both")
## Start:  AIC=77988.88
## gross ~ (num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + actor_1_facebook_likes + num_voted_users + 
##     cast_total_facebook_likes + facenumber_in_poster + num_user_for_reviews + 
##     country + content_rating + budget + title_year + actor_2_facebook_likes + 
##     imdb_score + aspect_ratio + movie_facebook_likes + Action + 
##     Adventure + Animation + Biography + Comedy + Crime + Documentary + 
##     Drama + Family + Fantasy + History + Horror + Musical + Mystery + 
##     Romance + `Sci-Fi` + Sport + Thriller + War + Western + gross_catogorical) - 
##     actor_1_facebook_likes - cast_total_facebook_likes - actor_2_facebook_likes - 
##     gross_catogorical
## 
##                           Df  Sum of Sq        RSS   AIC
## - movie_facebook_likes     1 4.0329e+12 4.4483e+18 77987
## - Fantasy                  1 3.2144e+13 4.4483e+18 77987
## - War                      1 4.5981e+13 4.4483e+18 77987
## - aspect_ratio             1 8.5974e+13 4.4484e+18 77987
## - Sport                    1 2.8756e+14 4.4486e+18 77987
## - Mystery                  1 2.8912e+14 4.4486e+18 77987
## - Biography                1 3.4704e+14 4.4486e+18 77987
## - imdb_score               1 1.2778e+15 4.4495e+18 77988
## - Documentary              1 1.3585e+15 4.4496e+18 77988
## - Thriller                 1 1.6021e+15 4.4499e+18 77988
## - facenumber_in_poster     1 1.6807e+15 4.4500e+18 77988
## - History                  1 2.2026e+15 4.4505e+18 77988
## - Musical                  1 2.4439e+15 4.4507e+18 77988
## - Romance                  1 2.6570e+15 4.4509e+18 77988
## - Crime                    1 3.5636e+15 4.4518e+18 77989
## <none>                                  4.4483e+18 77989
## - Horror                   1 4.4982e+15 4.4528e+18 77989
## - Western                  1 4.5774e+15 4.4528e+18 77989
## - `Sci-Fi`                 1 1.2077e+16 4.4603e+18 77993
## - Adventure                1 2.2116e+16 4.4704e+18 77998
## - Action                   1 2.5694e+16 4.4740e+18 78000
## - num_user_for_reviews     1 2.6205e+16 4.4745e+18 78000
## - Comedy                   1 2.6999e+16 4.4753e+18 78000
## - director_facebook_likes  1 3.0964e+16 4.4792e+18 78002
## - Animation                1 3.1588e+16 4.4799e+18 78003
## - Drama                    1 3.2914e+16 4.4812e+18 78003
## - budget                   1 3.4660e+16 4.4829e+18 78004
## - Family                   1 5.3064e+16 4.5013e+18 78013
## - title_year               1 6.5325e+16 4.5136e+18 78019
## - duration                 1 6.5963e+16 4.5142e+18 78019
## - actor_3_facebook_likes   1 6.7538e+16 4.5158e+18 78020
## - country                  2 1.3960e+17 4.5879e+18 78053
## - num_critic_for_reviews   1 1.3687e+17 4.5851e+18 78054
## - content_rating           4 2.0491e+17 4.6532e+18 78080
## - num_voted_users          1 3.7183e+17 4.8201e+18 78164
## 
## Step:  AIC=77986.88
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + aspect_ratio + Action + Adventure + 
##     Animation + Biography + Comedy + Crime + Documentary + Drama + 
##     Family + Fantasy + History + Horror + Musical + Mystery + 
##     Romance + `Sci-Fi` + Sport + Thriller + War + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Fantasy                  1 3.1535e+13 4.4483e+18 77985
## - War                      1 4.6563e+13 4.4483e+18 77985
## - aspect_ratio             1 8.8307e+13 4.4484e+18 77985
## - Mystery                  1 2.8702e+14 4.4486e+18 77985
## - Sport                    1 2.8751e+14 4.4486e+18 77985
## - Biography                1 3.4775e+14 4.4486e+18 77985
## - imdb_score               1 1.2884e+15 4.4496e+18 77986
## - Documentary              1 1.3653e+15 4.4496e+18 77986
## - Thriller                 1 1.5999e+15 4.4499e+18 77986
## - facenumber_in_poster     1 1.6798e+15 4.4500e+18 77986
## - History                  1 2.2119e+15 4.4505e+18 77986
## - Musical                  1 2.4408e+15 4.4507e+18 77986
## - Romance                  1 2.6584e+15 4.4509e+18 77986
## - Crime                    1 3.5716e+15 4.4518e+18 77987
## <none>                                  4.4483e+18 77987
## - Horror                   1 4.5047e+15 4.4528e+18 77987
## - Western                  1 4.5796e+15 4.4529e+18 77987
## + movie_facebook_likes     1 4.0329e+12 4.4483e+18 77989
## - `Sci-Fi`                 1 1.2081e+16 4.4604e+18 77991
## - Adventure                1 2.2113e+16 4.4704e+18 77996
## - Action                   1 2.5696e+16 4.4740e+18 77998
## - Comedy                   1 2.7007e+16 4.4753e+18 77998
## - num_user_for_reviews     1 2.7302e+16 4.4756e+18 77998
## - director_facebook_likes  1 3.0965e+16 4.4792e+18 78000
## - Animation                1 3.1593e+16 4.4799e+18 78001
## - Drama                    1 3.2923e+16 4.4812e+18 78001
## - budget                   1 3.4658e+16 4.4829e+18 78002
## - Family                   1 5.3107e+16 4.5014e+18 78011
## - title_year               1 6.5545e+16 4.5138e+18 78017
## - duration                 1 6.6129e+16 4.5144e+18 78018
## - actor_3_facebook_likes   1 6.7938e+16 4.5162e+18 78018
## - country                  2 1.3962e+17 4.5879e+18 78051
## - num_critic_for_reviews   1 1.7630e+17 4.6246e+18 78071
## - content_rating           4 2.0500e+17 4.6533e+18 78078
## - num_voted_users          1 4.0036e+17 4.8486e+18 78175
## 
## Step:  AIC=77984.9
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + aspect_ratio + Action + Adventure + 
##     Animation + Biography + Comedy + Crime + Documentary + Drama + 
##     Family + History + Horror + Musical + Mystery + Romance + 
##     `Sci-Fi` + Sport + Thriller + War + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - War                      1 4.7999e+13 4.4484e+18 77983
## - aspect_ratio             1 8.7974e+13 4.4484e+18 77983
## - Mystery                  1 2.9023e+14 4.4486e+18 77983
## - Sport                    1 2.9815e+14 4.4486e+18 77983
## - Biography                1 3.5696e+14 4.4487e+18 77983
## - imdb_score               1 1.2769e+15 4.4496e+18 77984
## - Documentary              1 1.3801e+15 4.4497e+18 77984
## - Thriller                 1 1.6363e+15 4.4499e+18 77984
## - facenumber_in_poster     1 1.6647e+15 4.4500e+18 77984
## - History                  1 2.1918e+15 4.4505e+18 77984
## - Musical                  1 2.4655e+15 4.4508e+18 77984
## - Romance                  1 2.6563e+15 4.4510e+18 77984
## - Crime                    1 3.5412e+15 4.4518e+18 77985
## <none>                                  4.4483e+18 77985
## - Western                  1 4.5552e+15 4.4529e+18 77985
## - Horror                   1 4.6017e+15 4.4529e+18 77985
## + Fantasy                  1 3.1535e+13 4.4483e+18 77987
## + movie_facebook_likes     1 3.4244e+12 4.4483e+18 77987
## - `Sci-Fi`                 1 1.2140e+16 4.4604e+18 77989
## - Adventure                1 2.2115e+16 4.4704e+18 77994
## - Action                   1 2.5664e+16 4.4740e+18 77996
## - num_user_for_reviews     1 2.7272e+16 4.4756e+18 77996
## - Comedy                   1 2.7417e+16 4.4757e+18 77996
## - director_facebook_likes  1 3.0941e+16 4.4792e+18 77998
## - Animation                1 3.1604e+16 4.4799e+18 77999
## - Drama                    1 3.2909e+16 4.4812e+18 77999
## - budget                   1 3.4634e+16 4.4829e+18 78000
## - Family                   1 5.4050e+16 4.5024e+18 78010
## - title_year               1 6.5624e+16 4.5139e+18 78015
## - duration                 1 6.6389e+16 4.5147e+18 78016
## - actor_3_facebook_likes   1 6.8086e+16 4.5164e+18 78016
## - country                  2 1.3959e+17 4.5879e+18 78049
## - num_critic_for_reviews   1 1.7647e+17 4.6248e+18 78069
## - content_rating           4 2.0540e+17 4.6537e+18 78077
## - num_voted_users          1 4.0038e+17 4.8487e+18 78173
## 
## Step:  AIC=77982.92
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + aspect_ratio + Action + Adventure + 
##     Animation + Biography + Comedy + Crime + Documentary + Drama + 
##     Family + History + Horror + Musical + Mystery + Romance + 
##     `Sci-Fi` + Sport + Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - aspect_ratio             1 8.6248e+13 4.4484e+18 77981
## - Sport                    1 2.8396e+14 4.4486e+18 77981
## - Mystery                  1 2.8783e+14 4.4486e+18 77981
## - Biography                1 3.6622e+14 4.4487e+18 77981
## - imdb_score               1 1.2733e+15 4.4496e+18 77982
## - Documentary              1 1.4134e+15 4.4498e+18 77982
## - Thriller                 1 1.6205e+15 4.4500e+18 77982
## - facenumber_in_poster     1 1.6816e+15 4.4500e+18 77982
## - History                  1 2.1740e+15 4.4505e+18 77982
## - Musical                  1 2.4824e+15 4.4508e+18 77982
## - Romance                  1 2.6439e+15 4.4510e+18 77982
## - Crime                    1 3.6614e+15 4.4520e+18 77983
## <none>                                  4.4484e+18 77983
## - Western                  1 4.5648e+15 4.4529e+18 77983
## - Horror                   1 4.6249e+15 4.4530e+18 77983
## + War                      1 4.7999e+13 4.4483e+18 77985
## + Fantasy                  1 3.2971e+13 4.4483e+18 77985
## + movie_facebook_likes     1 3.9571e+12 4.4484e+18 77985
## - `Sci-Fi`                 1 1.2271e+16 4.4606e+18 77987
## - Adventure                1 2.2070e+16 4.4704e+18 77992
## - Action                   1 2.6045e+16 4.4744e+18 77994
## - num_user_for_reviews     1 2.7280e+16 4.4756e+18 77994
## - Comedy                   1 2.7384e+16 4.4757e+18 77994
## - director_facebook_likes  1 3.0898e+16 4.4793e+18 77996
## - Animation                1 3.1727e+16 4.4801e+18 77997
## - Drama                    1 3.2872e+16 4.4812e+18 77997
## - budget                   1 3.4961e+16 4.4833e+18 77998
## - Family                   1 5.4112e+16 4.5025e+18 78008
## - title_year               1 6.5582e+16 4.5139e+18 78013
## - duration                 1 6.7159e+16 4.5155e+18 78014
## - actor_3_facebook_likes   1 6.8120e+16 4.5165e+18 78015
## - country                  2 1.3979e+17 4.5881e+18 78047
## - num_critic_for_reviews   1 1.7653e+17 4.6249e+18 78067
## - content_rating           4 2.0603e+17 4.6544e+18 78075
## - num_voted_users          1 4.0035e+17 4.8487e+18 78171
## 
## Step:  AIC=77980.96
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + Action + Adventure + Animation + 
##     Biography + Comedy + Crime + Documentary + Drama + Family + 
##     History + Horror + Musical + Mystery + Romance + `Sci-Fi` + 
##     Sport + Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Mystery                  1 2.7752e+14 4.4487e+18 77979
## - Sport                    1 2.8189e+14 4.4487e+18 77979
## - Biography                1 3.5950e+14 4.4488e+18 77979
## - imdb_score               1 1.2664e+15 4.4497e+18 77980
## - Documentary              1 1.3995e+15 4.4498e+18 77980
## - Thriller                 1 1.5842e+15 4.4500e+18 77980
## - facenumber_in_poster     1 1.6942e+15 4.4501e+18 77980
## - History                  1 2.1804e+15 4.4506e+18 77980
## - Musical                  1 2.4907e+15 4.4509e+18 77980
## - Romance                  1 2.6233e+15 4.4511e+18 77980
## - Crime                    1 3.6941e+15 4.4521e+18 77981
## <none>                                  4.4484e+18 77981
## - Western                  1 4.5997e+15 4.4530e+18 77981
## - Horror                   1 4.6478e+15 4.4531e+18 77981
## + aspect_ratio             1 8.6248e+13 4.4484e+18 77983
## + War                      1 4.6273e+13 4.4484e+18 77983
## + Fantasy                  1 3.2608e+13 4.4484e+18 77983
## + movie_facebook_likes     1 6.2308e+12 4.4484e+18 77983
## - `Sci-Fi`                 1 1.2254e+16 4.4607e+18 77985
## - Adventure                1 2.1998e+16 4.4704e+18 77990
## - Action                   1 2.5974e+16 4.4744e+18 77992
## - num_user_for_reviews     1 2.7299e+16 4.4757e+18 77992
## - Comedy                   1 2.7552e+16 4.4760e+18 77993
## - director_facebook_likes  1 3.0917e+16 4.4794e+18 77994
## - Animation                1 3.1730e+16 4.4802e+18 77995
## - Drama                    1 3.2874e+16 4.4813e+18 77995
## - budget                   1 3.4925e+16 4.4834e+18 77996
## - Family                   1 5.4172e+16 4.5026e+18 78006
## - title_year               1 6.6211e+16 4.5147e+18 78012
## - duration                 1 6.7138e+16 4.5156e+18 78012
## - actor_3_facebook_likes   1 6.8277e+16 4.5167e+18 78013
## - country                  2 1.4000e+17 4.5884e+18 78045
## - num_critic_for_reviews   1 1.7670e+17 4.6251e+18 78065
## - content_rating           4 2.0595e+17 4.6544e+18 78073
## - num_voted_users          1 4.0034e+17 4.8488e+18 78169
## 
## Step:  AIC=77979.1
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + Action + Adventure + Animation + 
##     Biography + Comedy + Crime + Documentary + Drama + Family + 
##     History + Horror + Musical + Romance + `Sci-Fi` + Sport + 
##     Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Sport                    1 2.6132e+14 4.4490e+18 77977
## - Biography                1 3.1941e+14 4.4490e+18 77977
## - imdb_score               1 1.2422e+15 4.4500e+18 77978
## - Documentary              1 1.3477e+15 4.4501e+18 77978
## - facenumber_in_poster     1 1.7171e+15 4.4504e+18 77978
## - Thriller                 1 1.9942e+15 4.4507e+18 77978
## - History                  1 2.2494e+15 4.4510e+18 77978
## - Romance                  1 2.5431e+15 4.4513e+18 77978
## - Musical                  1 2.5577e+15 4.4513e+18 77978
## - Crime                    1 3.6672e+15 4.4524e+18 77979
## <none>                                  4.4487e+18 77979
## - Horror                   1 4.5106e+15 4.4532e+18 77979
## - Western                  1 4.6405e+15 4.4534e+18 77979
## + Mystery                  1 2.7752e+14 4.4484e+18 77981
## + aspect_ratio             1 7.5933e+13 4.4486e+18 77981
## + War                      1 4.4061e+13 4.4487e+18 77981
## + Fantasy                  1 3.5784e+13 4.4487e+18 77981
## + movie_facebook_likes     1 3.4141e+12 4.4487e+18 77981
## - `Sci-Fi`                 1 1.2138e+16 4.4609e+18 77983
## - Adventure                1 2.1834e+16 4.4706e+18 77988
## - Action                   1 2.5815e+16 4.4745e+18 77990
## - Comedy                   1 2.7275e+16 4.4760e+18 77991
## - num_user_for_reviews     1 2.7282e+16 4.4760e+18 77991
## - director_facebook_likes  1 3.0816e+16 4.4795e+18 77992
## - Animation                1 3.1644e+16 4.4804e+18 77993
## - Drama                    1 3.3286e+16 4.4820e+18 77994
## - budget                   1 3.4838e+16 4.4836e+18 77994
## - Family                   1 5.4701e+16 4.5034e+18 78004
## - title_year               1 6.5973e+16 4.5147e+18 78010
## - duration                 1 6.7669e+16 4.5164e+18 78010
## - actor_3_facebook_likes   1 6.8219e+16 4.5169e+18 78011
## - country                  2 1.3973e+17 4.5885e+18 78043
## - num_critic_for_reviews   1 1.7650e+17 4.6252e+18 78063
## - content_rating           4 2.0692e+17 4.6556e+18 78072
## - num_voted_users          1 4.0049e+17 4.8492e+18 78168
## 
## Step:  AIC=77977.23
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + Action + Adventure + Animation + 
##     Biography + Comedy + Crime + Documentary + Drama + Family + 
##     History + Horror + Musical + Romance + `Sci-Fi` + Thriller + 
##     Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Biography                1 4.1719e+14 4.4494e+18 77975
## - imdb_score               1 1.2042e+15 4.4502e+18 77976
## - Documentary              1 1.3627e+15 4.4503e+18 77976
## - facenumber_in_poster     1 1.7438e+15 4.4507e+18 77976
## - Thriller                 1 1.9147e+15 4.4509e+18 77976
## - History                  1 2.3470e+15 4.4513e+18 77976
## - Romance                  1 2.4468e+15 4.4514e+18 77976
## - Musical                  1 2.6772e+15 4.4517e+18 77977
## - Crime                    1 3.8215e+15 4.4528e+18 77977
## <none>                                  4.4490e+18 77977
## - Horror                   1 4.6056e+15 4.4536e+18 77978
## - Western                  1 4.7242e+15 4.4537e+18 77978
## + Sport                    1 2.6132e+14 4.4487e+18 77979
## + Mystery                  1 2.5694e+14 4.4487e+18 77979
## + aspect_ratio             1 7.4432e+13 4.4489e+18 77979
## + Fantasy                  1 4.5922e+13 4.4489e+18 77979
## + War                      1 3.1100e+13 4.4489e+18 77979
## + movie_facebook_likes     1 3.2817e+12 4.4490e+18 77979
## - `Sci-Fi`                 1 1.2226e+16 4.4612e+18 77981
## - Adventure                1 2.1597e+16 4.4706e+18 77986
## - Action                   1 2.5808e+16 4.4748e+18 77988
## - Comedy                   1 2.7152e+16 4.4761e+18 77989
## - num_user_for_reviews     1 2.7227e+16 4.4762e+18 77989
## - director_facebook_likes  1 3.0676e+16 4.4797e+18 77990
## - Animation                1 3.1505e+16 4.4805e+18 77991
## - Drama                    1 3.3239e+16 4.4822e+18 77992
## - budget                   1 3.4855e+16 4.4838e+18 77992
## - Family                   1 5.4705e+16 4.5037e+18 78002
## - title_year               1 6.5905e+16 4.5149e+18 78008
## - duration                 1 6.7957e+16 4.5169e+18 78009
## - actor_3_facebook_likes   1 6.8158e+16 4.5171e+18 78009
## - country                  2 1.4073e+17 4.5897e+18 78042
## - num_critic_for_reviews   1 1.7627e+17 4.6253e+18 78061
## - content_rating           4 2.0953e+17 4.6585e+18 78071
## - num_voted_users          1 4.0023e+17 4.8492e+18 78166
## 
## Step:  AIC=77975.44
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + imdb_score + Action + Adventure + Animation + 
##     Comedy + Crime + Documentary + Drama + Family + History + 
##     Horror + Musical + Romance + `Sci-Fi` + Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - imdb_score               1 1.1478e+15 4.4505e+18 77974
## - Documentary              1 1.2940e+15 4.4507e+18 77974
## - Thriller                 1 1.7117e+15 4.4511e+18 77974
## - facenumber_in_poster     1 1.8270e+15 4.4512e+18 77974
## - History                  1 2.0160e+15 4.4514e+18 77974
## - Romance                  1 2.3211e+15 4.4517e+18 77975
## - Musical                  1 2.6576e+15 4.4521e+18 77975
## - Crime                    1 3.7831e+15 4.4532e+18 77975
## <none>                                  4.4494e+18 77975
## - Horror                   1 4.7716e+15 4.4542e+18 77976
## - Western                  1 4.8806e+15 4.4543e+18 77976
## + Biography                1 4.1719e+14 4.4490e+18 77977
## + Sport                    1 3.5910e+14 4.4490e+18 77977
## + Mystery                  1 2.0949e+14 4.4492e+18 77977
## + aspect_ratio             1 6.8181e+13 4.4493e+18 77977
## + Fantasy                  1 6.0103e+13 4.4493e+18 77977
## + War                      1 3.7142e+13 4.4494e+18 77977
## + movie_facebook_likes     1 4.0606e+12 4.4494e+18 77977
## - `Sci-Fi`                 1 1.2356e+16 4.4618e+18 77980
## - Adventure                1 2.1280e+16 4.4707e+18 77984
## - Action                   1 2.5543e+16 4.4749e+18 77986
## - Comedy                   1 2.6737e+16 4.4761e+18 77987
## - num_user_for_reviews     1 2.6874e+16 4.4763e+18 77987
## - director_facebook_likes  1 3.0546e+16 4.4799e+18 77989
## - Animation                1 3.1706e+16 4.4811e+18 77989
## - Drama                    1 3.2959e+16 4.4824e+18 77990
## - budget                   1 3.4569e+16 4.4840e+18 77991
## - Family                   1 5.4318e+16 4.5037e+18 78000
## - title_year               1 6.5936e+16 4.5153e+18 78006
## - actor_3_facebook_likes   1 6.8316e+16 4.5177e+18 78007
## - duration                 1 6.8377e+16 4.5178e+18 78007
## - country                  2 1.4032e+17 4.5897e+18 78040
## - num_critic_for_reviews   1 1.7755e+17 4.6269e+18 78060
## - content_rating           4 2.1245e+17 4.6618e+18 78071
## - num_voted_users          1 4.0029e+17 4.8497e+18 78164
## 
## Step:  AIC=77974.01
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + Action + Adventure + Animation + Comedy + Crime + 
##     Documentary + Drama + Family + History + Horror + Musical + 
##     Romance + `Sci-Fi` + Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Documentary              1 1.0056e+15 4.4516e+18 77973
## - facenumber_in_poster     1 1.7479e+15 4.4523e+18 77973
## - Thriller                 1 1.9457e+15 4.4525e+18 77973
## - History                  1 2.0946e+15 4.4526e+18 77973
## - Romance                  1 2.3658e+15 4.4529e+18 77973
## - Musical                  1 2.7113e+15 4.4533e+18 77973
## - Crime                    1 3.8816e+15 4.4544e+18 77974
## <none>                                  4.4505e+18 77974
## - Horror                   1 4.1831e+15 4.4547e+18 77974
## - Western                  1 4.8615e+15 4.4554e+18 77974
## + imdb_score               1 1.1478e+15 4.4494e+18 77975
## + Biography                1 3.6076e+14 4.4502e+18 77976
## + Sport                    1 3.0830e+14 4.4502e+18 77976
## + Mystery                  1 1.9355e+14 4.4504e+18 77976
## + aspect_ratio             1 6.3185e+13 4.4505e+18 77976
## + Fantasy                  1 4.2645e+13 4.4505e+18 77976
## + War                      1 3.4808e+13 4.4505e+18 77976
## + movie_facebook_likes     1 1.4066e+13 4.4505e+18 77976
## - `Sci-Fi`                 1 1.2015e+16 4.4626e+18 77978
## - Adventure                1 2.1799e+16 4.4723e+18 77983
## - Action                   1 2.6988e+16 4.4775e+18 77985
## - Comedy                   1 2.7916e+16 4.4785e+18 77986
## - num_user_for_reviews     1 2.9249e+16 4.4798e+18 77986
## - director_facebook_likes  1 3.0499e+16 4.4810e+18 77987
## - Animation                1 3.0568e+16 4.4811e+18 77987
## - budget                   1 3.4891e+16 4.4854e+18 77989
## - Drama                    1 3.7849e+16 4.4884e+18 77991
## - Family                   1 5.5435e+16 4.5060e+18 77999
## - title_year               1 6.5205e+16 4.5157e+18 78004
## - duration                 1 6.7229e+16 4.5178e+18 78005
## - actor_3_facebook_likes   1 6.9123e+16 4.5197e+18 78006
## - country                  2 1.4794e+17 4.5985e+18 78042
## - num_critic_for_reviews   1 1.8301e+17 4.6336e+18 78061
## - content_rating           4 2.1754e+17 4.6681e+18 78072
## - num_voted_users          1 4.3668e+17 4.8872e+18 78179
## 
## Step:  AIC=77972.51
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + Action + Adventure + Animation + Comedy + Crime + 
##     Drama + Family + History + Horror + Musical + Romance + `Sci-Fi` + 
##     Thriller + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Thriller                 1 1.6961e+15 4.4532e+18 77971
## - facenumber_in_poster     1 1.8538e+15 4.4534e+18 77971
## - History                  1 2.0540e+15 4.4536e+18 77972
## - Romance                  1 2.1269e+15 4.4537e+18 77972
## - Musical                  1 2.8303e+15 4.4544e+18 77972
## - Crime                    1 4.0053e+15 4.4556e+18 77972
## <none>                                  4.4516e+18 77973
## - Horror                   1 4.7955e+15 4.4563e+18 77973
## - Western                  1 4.9457e+15 4.4565e+18 77973
## + Documentary              1 1.0056e+15 4.4505e+18 77974
## + imdb_score               1 8.5938e+14 4.4507e+18 77974
## + Sport                    1 3.2054e+14 4.4512e+18 77974
## + Biography                1 3.1020e+14 4.4512e+18 77974
## + Mystery                  1 1.6037e+14 4.4514e+18 77974
## + Fantasy                  1 5.8879e+13 4.4515e+18 77974
## + War                      1 5.8641e+13 4.4515e+18 77974
## + aspect_ratio             1 5.4630e+13 4.4515e+18 77974
## + movie_facebook_likes     1 2.1531e+13 4.4515e+18 77974
## - `Sci-Fi`                 1 1.2332e+16 4.4639e+18 77977
## - Adventure                1 2.1254e+16 4.4728e+18 77981
## - Action                   1 2.6327e+16 4.4779e+18 77984
## - Comedy                   1 2.6912e+16 4.4785e+18 77984
## - num_user_for_reviews     1 2.9605e+16 4.4812e+18 77985
## - director_facebook_likes  1 3.0523e+16 4.4821e+18 77986
## - Animation                1 3.0527e+16 4.4821e+18 77986
## - budget                   1 3.4664e+16 4.4862e+18 77988
## - Drama                    1 4.1788e+16 4.4933e+18 77991
## - Family                   1 5.4439e+16 4.5060e+18 77997
## - title_year               1 6.4227e+16 4.5158e+18 78002
## - duration                 1 6.6638e+16 4.5182e+18 78003
## - actor_3_facebook_likes   1 6.8944e+16 4.5205e+18 78004
## - country                  2 1.4823e+17 4.5998e+18 78041
## - num_critic_for_reviews   1 1.8201e+17 4.6336e+18 78059
## - content_rating           4 2.1748e+17 4.6690e+18 78070
## - num_voted_users          1 4.3610e+17 4.8877e+18 78177
## 
## Step:  AIC=77971.35
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + Action + Adventure + Animation + Comedy + Crime + 
##     Drama + Family + History + Horror + Musical + Romance + `Sci-Fi` + 
##     Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Romance                  1 1.8610e+15 4.4551e+18 77970
## - facenumber_in_poster     1 2.0338e+15 4.4553e+18 77970
## - History                  1 2.3556e+15 4.4556e+18 77971
## - Crime                    1 2.7784e+15 4.4560e+18 77971
## - Musical                  1 2.9300e+15 4.4562e+18 77971
## <none>                                  4.4532e+18 77971
## - Horror                   1 4.2430e+15 4.4575e+18 77971
## - Western                  1 5.2594e+15 4.4585e+18 77972
## + Thriller                 1 1.6961e+15 4.4516e+18 77973
## + imdb_score               1 1.0821e+15 4.4522e+18 77973
## + Documentary              1 7.5593e+14 4.4525e+18 77973
## + Mystery                  1 4.9279e+14 4.4528e+18 77973
## + Sport                    1 2.0575e+14 4.4530e+18 77973
## + Biography                1 1.4115e+14 4.4531e+18 77973
## + Fantasy                  1 9.8935e+13 4.4531e+18 77973
## + War                      1 3.6413e+13 4.4532e+18 77973
## + aspect_ratio             1 2.3296e+13 4.4532e+18 77973
## + movie_facebook_likes     1 1.0915e+13 4.4532e+18 77973
## - `Sci-Fi`                 1 1.1611e+16 4.4649e+18 77975
## - Adventure                1 2.0853e+16 4.4741e+18 77980
## - Comedy                   1 2.5235e+16 4.4785e+18 77982
## - num_user_for_reviews     1 2.9208e+16 4.4825e+18 77984
## - Action                   1 2.9350e+16 4.4826e+18 77984
## - director_facebook_likes  1 3.0264e+16 4.4835e+18 77984
## - Animation                1 3.0456e+16 4.4837e+18 77984
## - budget                   1 3.4468e+16 4.4877e+18 77986
## - Drama                    1 4.2272e+16 4.4955e+18 77990
## - Family                   1 5.4475e+16 4.5077e+18 77996
## - title_year               1 6.4805e+16 4.5181e+18 78001
## - duration                 1 6.5925e+16 4.5192e+18 78002
## - actor_3_facebook_likes   1 6.8978e+16 4.5222e+18 78003
## - country                  2 1.4803e+17 4.6013e+18 78040
## - num_critic_for_reviews   1 1.8461e+17 4.6379e+18 78059
## - content_rating           4 2.1704e+17 4.6703e+18 78069
## - num_voted_users          1 4.3477e+17 4.8880e+18 78175
## 
## Step:  AIC=77970.28
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + facenumber_in_poster + 
##     num_user_for_reviews + country + content_rating + budget + 
##     title_year + Action + Adventure + Animation + Comedy + Crime + 
##     Drama + Family + History + Horror + Musical + `Sci-Fi` + 
##     Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - facenumber_in_poster     1 2.1370e+15 4.4572e+18 77969
## - History                  1 2.5516e+15 4.4577e+18 77970
## - Musical                  1 2.7665e+15 4.4579e+18 77970
## - Crime                    1 3.6139e+15 4.4587e+18 77970
## <none>                                  4.4551e+18 77970
## - Horror                   1 4.9157e+15 4.4600e+18 77971
## - Western                  1 5.2391e+15 4.4603e+18 77971
## + Romance                  1 1.8610e+15 4.4532e+18 77971
## + Thriller                 1 1.4302e+15 4.4537e+18 77972
## + imdb_score               1 1.1335e+15 4.4540e+18 77972
## + Documentary              1 5.8001e+14 4.4545e+18 77972
## + Mystery                  1 3.8670e+14 4.4547e+18 77972
## + Sport                    1 1.2648e+14 4.4550e+18 77972
## + Biography                1 8.9985e+13 4.4550e+18 77972
## + Fantasy                  1 8.3104e+13 4.4550e+18 77972
## + War                      1 2.7952e+13 4.4551e+18 77972
## + aspect_ratio             1 1.7869e+13 4.4551e+18 77972
## + movie_facebook_likes     1 1.2970e+13 4.4551e+18 77972
## - `Sci-Fi`                 1 1.2150e+16 4.4673e+18 77974
## - Adventure                1 2.0029e+16 4.4751e+18 77978
## - Comedy                   1 2.8348e+16 4.4835e+18 77982
## - Action                   1 2.8536e+16 4.4836e+18 77982
## - num_user_for_reviews     1 2.9677e+16 4.4848e+18 77983
## - Animation                1 2.9692e+16 4.4848e+18 77983
## - director_facebook_likes  1 3.0995e+16 4.4861e+18 77984
## - budget                   1 3.4952e+16 4.4901e+18 77986
## - Drama                    1 4.0843e+16 4.4960e+18 77988
## - Family                   1 5.3871e+16 4.5090e+18 77995
## - title_year               1 6.4987e+16 4.5201e+18 78000
## - duration                 1 6.6669e+16 4.5218e+18 78001
## - actor_3_facebook_likes   1 6.9790e+16 4.5249e+18 78003
## - country                  2 1.4780e+17 4.6029e+18 78038
## - num_critic_for_reviews   1 1.8355e+17 4.6387e+18 78058
## - content_rating           4 2.2414e+17 4.6793e+18 78071
## - num_voted_users          1 4.3316e+17 4.8883e+18 78173
## 
## Step:  AIC=77969.34
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Crime + Drama + Family + 
##     History + Horror + Musical + `Sci-Fi` + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - History                  1 2.3983e+15 4.4596e+18 77969
## - Musical                  1 2.9804e+15 4.4602e+18 77969
## - Crime                    1 3.6160e+15 4.4609e+18 77969
## <none>                                  4.4572e+18 77969
## - Horror                   1 4.3916e+15 4.4616e+18 77970
## - Western                  1 5.0109e+15 4.4623e+18 77970
## + facenumber_in_poster     1 2.1370e+15 4.4551e+18 77970
## + Romance                  1 1.9642e+15 4.4553e+18 77970
## + Thriller                 1 1.5933e+15 4.4557e+18 77971
## + imdb_score               1 1.0469e+15 4.4562e+18 77971
## + Documentary              1 6.5132e+14 4.4566e+18 77971
## + Mystery                  1 4.2554e+14 4.4568e+18 77971
## + Sport                    1 1.4960e+14 4.4571e+18 77971
## + Biography                1 1.2530e+14 4.4571e+18 77971
## + Fantasy                  1 5.9970e+13 4.4572e+18 77971
## + War                      1 4.2712e+13 4.4572e+18 77971
## + aspect_ratio             1 2.2261e+13 4.4572e+18 77971
## + movie_facebook_likes     1 1.1143e+13 4.4572e+18 77971
## - `Sci-Fi`                 1 1.1941e+16 4.4692e+18 77973
## - Adventure                1 2.0083e+16 4.4773e+18 77977
## - Comedy                   1 2.6524e+16 4.4838e+18 77980
## - Action                   1 2.8494e+16 4.4857e+18 77981
## - director_facebook_likes  1 3.0634e+16 4.4879e+18 77982
## - num_user_for_reviews     1 3.0713e+16 4.4880e+18 77983
## - Animation                1 3.1840e+16 4.4891e+18 77983
## - budget                   1 3.5259e+16 4.4925e+18 77985
## - Drama                    1 4.0198e+16 4.4974e+18 77987
## - Family                   1 5.3703e+16 4.5109e+18 77994
## - duration                 1 6.5038e+16 4.5223e+18 77999
## - title_year               1 6.7694e+16 4.5249e+18 78001
## - actor_3_facebook_likes   1 6.8237e+16 4.5255e+18 78001
## - country                  2 1.4765e+17 4.6049e+18 78037
## - num_critic_for_reviews   1 1.8468e+17 4.6419e+18 78057
## - content_rating           4 2.2472e+17 4.6820e+18 78070
## - num_voted_users          1 4.3203e+17 4.8893e+18 78172
## 
## Step:  AIC=77968.52
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Crime + Drama + Family + 
##     Horror + Musical + `Sci-Fi` + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Musical                  1 2.8721e+15 4.4625e+18 77968
## - Crime                    1 3.0385e+15 4.4627e+18 77968
## <none>                                  4.4596e+18 77969
## - Horror                   1 4.2310e+15 4.4639e+18 77969
## - Western                  1 4.9049e+15 4.4645e+18 77969
## + History                  1 2.3983e+15 4.4572e+18 77969
## + Romance                  1 2.1552e+15 4.4575e+18 77969
## + facenumber_in_poster     1 1.9838e+15 4.4577e+18 77970
## + Thriller                 1 1.8689e+15 4.4578e+18 77970
## + imdb_score               1 1.1605e+15 4.4585e+18 77970
## + Documentary              1 5.8740e+14 4.4591e+18 77970
## + Mystery                  1 5.7960e+14 4.4591e+18 77970
## + Sport                    1 1.6501e+14 4.4595e+18 77970
## + War                      1 5.7753e+13 4.4596e+18 77970
## + aspect_ratio             1 2.4368e+13 4.4596e+18 77971
## + Fantasy                  1 2.2358e+13 4.4596e+18 77971
## + movie_facebook_likes     1 2.2346e+13 4.4596e+18 77971
## + Biography                1 1.5081e+12 4.4596e+18 77971
## - `Sci-Fi`                 1 1.1225e+16 4.4709e+18 77972
## - Adventure                1 2.0125e+16 4.4798e+18 77976
## - Comedy                   1 2.7588e+16 4.4872e+18 77980
## - Action                   1 2.8002e+16 4.4876e+18 77980
## - num_user_for_reviews     1 3.1073e+16 4.4907e+18 77982
## - director_facebook_likes  1 3.1108e+16 4.4908e+18 77982
## - Animation                1 3.1149e+16 4.4908e+18 77982
## - budget                   1 3.5501e+16 4.4951e+18 77984
## - Drama                    1 4.1657e+16 4.5013e+18 77987
## - Family                   1 5.4182e+16 4.5138e+18 77993
## - duration                 1 6.2640e+16 4.5223e+18 77997
## - title_year               1 6.7647e+16 4.5273e+18 78000
## - actor_3_facebook_likes   1 6.7697e+16 4.5273e+18 78000
## - country                  2 1.5174e+17 4.6114e+18 78039
## - num_critic_for_reviews   1 1.8419e+17 4.6438e+18 78056
## - content_rating           4 2.2499e+17 4.6846e+18 78069
## - num_voted_users          1 4.3603e+17 4.8957e+18 78173
## 
## Step:  AIC=77967.95
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Crime + Drama + Family + 
##     Horror + `Sci-Fi` + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Crime                    1 2.8983e+15 4.4654e+18 77967
## <none>                                  4.4625e+18 77968
## - Horror                   1 4.2230e+15 4.4667e+18 77968
## - Western                  1 4.7235e+15 4.4672e+18 77968
## + Musical                  1 2.8721e+15 4.4596e+18 77969
## + History                  1 2.2900e+15 4.4602e+18 77969
## + facenumber_in_poster     1 2.1896e+15 4.4603e+18 77969
## + Thriller                 1 1.9871e+15 4.4605e+18 77969
## + Romance                  1 1.9764e+15 4.4605e+18 77969
## + imdb_score               1 1.1962e+15 4.4613e+18 77969
## + Mystery                  1 6.8682e+14 4.4618e+18 77970
## + Documentary              1 6.8622e+14 4.4618e+18 77970
## + Sport                    1 2.6435e+14 4.4623e+18 77970
## + Fantasy                  1 5.0408e+13 4.4625e+18 77970
## + War                      1 3.8888e+13 4.4625e+18 77970
## + aspect_ratio             1 2.7666e+13 4.4625e+18 77970
## + movie_facebook_likes     1 1.2489e+13 4.4625e+18 77970
## + Biography                1 2.3060e+12 4.4625e+18 77970
## - `Sci-Fi`                 1 1.0855e+16 4.4734e+18 77971
## - Adventure                1 2.0935e+16 4.4834e+18 77976
## - Comedy                   1 2.7771e+16 4.4903e+18 77980
## - Action                   1 2.8423e+16 4.4909e+18 77980
## - Animation                1 2.9790e+16 4.4923e+18 77981
## - num_user_for_reviews     1 3.0096e+16 4.4926e+18 77981
## - director_facebook_likes  1 3.0981e+16 4.4935e+18 77981
## - budget                   1 3.5163e+16 4.4977e+18 77983
## - Drama                    1 4.2383e+16 4.5049e+18 77987
## - Family                   1 5.3637e+16 4.5162e+18 77992
## - duration                 1 6.1933e+16 4.5244e+18 77996
## - title_year               1 6.6923e+16 4.5294e+18 77999
## - actor_3_facebook_likes   1 6.8435e+16 4.5309e+18 78000
## - country                  2 1.5076e+17 4.6133e+18 78037
## - num_critic_for_reviews   1 1.8431e+17 4.6468e+18 78055
## - content_rating           4 2.2272e+17 4.6852e+18 78068
## - num_voted_users          1 4.4117e+17 4.9037e+18 78174
## 
## Step:  AIC=77967.38
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Drama + Family + Horror + 
##     `Sci-Fi` + Western
## 
##                           Df  Sum of Sq        RSS   AIC
## - Horror                   1 3.2316e+15 4.4686e+18 77967
## <none>                                  4.4654e+18 77967
## - Western                  1 4.4137e+15 4.4698e+18 77968
## + Crime                    1 2.8983e+15 4.4625e+18 77968
## + Musical                  1 2.7318e+15 4.4627e+18 77968
## + Romance                  1 2.7155e+15 4.4627e+18 77968
## + facenumber_in_poster     1 2.2051e+15 4.4632e+18 77968
## + History                  1 1.7412e+15 4.4637e+18 77969
## + imdb_score               1 1.1488e+15 4.4643e+18 77969
## + Documentary              1 8.5266e+14 4.4646e+18 77969
## + Thriller                 1 5.3317e+14 4.4649e+18 77969
## + Sport                    1 4.3251e+14 4.4650e+18 77969
## + Mystery                  1 3.8510e+14 4.4650e+18 77969
## + aspect_ratio             1 6.0915e+13 4.4654e+18 77969
## + movie_facebook_likes     1 3.5240e+13 4.4654e+18 77969
## + Fantasy                  1 4.0157e+12 4.4654e+18 77969
## + War                      1 1.8843e+12 4.4654e+18 77969
## + Biography                1 9.6705e+11 4.4654e+18 77969
## - `Sci-Fi`                 1 9.6181e+15 4.4750e+18 77970
## - Adventure                1 2.3818e+16 4.4892e+18 77977
## - Action                   1 2.6001e+16 4.4914e+18 77978
## - Animation                1 2.9412e+16 4.4948e+18 77980
## - Comedy                   1 2.9763e+16 4.4952e+18 77980
## - director_facebook_likes  1 3.1226e+16 4.4966e+18 77981
## - num_user_for_reviews     1 3.1466e+16 4.4969e+18 77981
## - budget                   1 3.5566e+16 4.5010e+18 77983
## - Drama                    1 4.0802e+16 4.5062e+18 77985
## - Family                   1 5.3801e+16 4.5192e+18 77992
## - duration                 1 6.1907e+16 4.5273e+18 77996
## - title_year               1 6.6504e+16 4.5319e+18 77998
## - actor_3_facebook_likes   1 6.9460e+16 4.5349e+18 78000
## - country                  2 1.5000e+17 4.6154e+18 78036
## - num_critic_for_reviews   1 1.8446e+17 4.6499e+18 78055
## - content_rating           4 2.3850e+17 4.7039e+18 78074
## - num_voted_users          1 4.3845e+17 4.9039e+18 78172
## 
## Step:  AIC=77966.98
## gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Drama + Family + `Sci-Fi` + 
##     Western
## 
##                           Df  Sum of Sq        RSS   AIC
## <none>                                  4.4686e+18 77967
## - Western                  1 4.2657e+15 4.4729e+18 77967
## + Horror                   1 3.2316e+15 4.4654e+18 77967
## + Romance                  1 3.1799e+15 4.4655e+18 77967
## + Musical                  1 2.7491e+15 4.4659e+18 77968
## + Crime                    1 1.9069e+15 4.4667e+18 77968
## + facenumber_in_poster     1 1.7368e+15 4.4669e+18 77968
## + History                  1 1.7065e+15 4.4669e+18 77968
## + Documentary              1 1.2831e+15 4.4674e+18 77968
## + imdb_score               1 5.4383e+14 4.4681e+18 77969
## + Sport                    1 5.3766e+14 4.4681e+18 77969
## + Thriller                 1 3.8645e+14 4.4683e+18 77969
## + Mystery                  1 1.9824e+14 4.4684e+18 77969
## + aspect_ratio             1 7.4767e+13 4.4686e+18 77969
## + Fantasy                  1 7.1609e+13 4.4686e+18 77969
## + movie_facebook_likes     1 4.5236e+13 4.4686e+18 77969
## + Biography                1 1.9161e+13 4.4686e+18 77969
## + War                      1 4.4471e+12 4.4686e+18 77969
## - `Sci-Fi`                 1 1.0276e+16 4.4789e+18 77970
## - Adventure                1 2.5574e+16 4.4942e+18 77978
## - Animation                1 2.9419e+16 4.4981e+18 77979
## - num_user_for_reviews     1 2.9864e+16 4.4985e+18 77980
## - Action                   1 3.0313e+16 4.4990e+18 77980
## - director_facebook_likes  1 3.0501e+16 4.4991e+18 77980
## - budget                   1 3.6625e+16 4.5053e+18 77983
## - Drama                    1 3.7633e+16 4.5063e+18 77984
## - Comedy                   1 3.7954e+16 4.5066e+18 77984
## - Family                   1 5.3487e+16 4.5221e+18 77991
## - title_year               1 6.3808e+16 4.5325e+18 77996
## - duration                 1 6.6815e+16 4.5355e+18 77998
## - actor_3_facebook_likes   1 7.0152e+16 4.5388e+18 77999
## - country                  2 1.4968e+17 4.6183e+18 78036
## - num_critic_for_reviews   1 1.8127e+17 4.6499e+18 78053
## - content_rating           4 2.4881e+17 4.7175e+18 78079
## - num_voted_users          1 4.6184e+17 4.9305e+18 78182
summary(reg.step)
## 
## Call:
## lm(formula = gross ~ num_critic_for_reviews + duration + director_facebook_likes + 
##     actor_3_facebook_likes + num_voted_users + num_user_for_reviews + 
##     country + content_rating + budget + title_year + Action + 
##     Adventure + Animation + Comedy + Drama + Family + `Sci-Fi` + 
##     Western, data = train.df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -340852907  -22273257   -2022764   17472968  450406303 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.839e+09  3.362e+08   5.469 5.03e-08 ***
## num_critic_for_reviews   1.249e+05  1.326e+04   9.421  < 2e-16 ***
## duration                 3.149e+05  5.505e+04   5.720 1.21e-08 ***
## director_facebook_likes -1.329e+03  3.439e+02  -3.865 0.000115 ***
## actor_3_facebook_likes   3.115e+03  5.316e+02   5.861 5.31e-09 ***
## num_voted_users          1.657e+02  1.102e+01  15.038  < 2e-16 ***
## num_user_for_reviews     1.506e+04  3.938e+03   3.824 0.000135 ***
## countryUK               -2.080e+06  4.490e+06  -0.463 0.643267    
## countryUSA               2.024e+07  3.052e+06   6.632 4.15e-11 ***
## content_ratingNC-17     -4.034e+07  1.723e+07  -2.341 0.019321 *  
## content_ratingPG         2.285e+06  7.041e+06   0.325 0.745567    
## content_ratingPG-13      3.308e+06  8.052e+06   0.411 0.681249    
## content_ratingR         -2.052e+07  8.017e+06  -2.560 0.010541 *  
## budget                   4.848e-02  1.145e-02   4.235 2.38e-05 ***
## title_year              -9.392e+05  1.680e+05  -5.590 2.56e-08 ***
## Action                   9.963e+06  2.586e+06   3.853 0.000120 ***
## Adventure                1.090e+07  3.080e+06   3.539 0.000411 ***
## Animation                2.127e+07  5.603e+06   3.795 0.000151 ***
## Comedy                   9.915e+06  2.300e+06   4.311 1.70e-05 ***
## Drama                   -9.978e+06  2.324e+06  -4.293 1.84e-05 ***
## Family                   2.707e+07  5.290e+06   5.118 3.37e-07 ***
## `Sci-Fi`                -7.190e+06  3.205e+06  -2.243 0.024990 *  
## Western                 -1.132e+07  7.831e+06  -1.445 0.148544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45190000 on 2188 degrees of freedom
## Multiple R-squared:  0.5776, Adjusted R-squared:  0.5734 
## F-statistic:   136 on 22 and 2188 DF,  p-value: < 2.2e-16
vif(reg.step)
##                             GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews  2.792028  1        1.670936
## duration                1.541081  1        1.241403
## director_facebook_likes 1.156544  1        1.075427
## actor_3_facebook_likes  1.099715  1        1.048673
## num_voted_users         2.899775  1        1.702873
## num_user_for_reviews    2.730457  1        1.652409
## country                 1.067460  2        1.016454
## content_rating          3.368236  4        1.163926
## budget                  1.197485  1        1.094297
## title_year              1.853400  1        1.361396
## Action                  1.362788  1        1.167385
## Adventure               1.653866  1        1.286027
## Animation               1.880984  1        1.371490
## Comedy                  1.372139  1        1.171384
## Drama                   1.461923  1        1.209100
## Family                  3.112236  1        1.764153
## `Sci-Fi`                1.226409  1        1.107434
## Western                 1.034322  1        1.017016
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$gross)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -646609.8 47890105 29563770 10409.66 13770.11
par(mfrow=c(2,2))
plot(reg.step)

all.residuals <- (valid.df$gross - reg.step.pred)/10^6
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")

data.frame("Predicted" = reg.step.pred, "Actual" = valid.df$gross,
           "Residual" = all.residuals)[0:20,]
##    Predicted    Actual  Residual
## 1  341348703 760505847 419.15714
## 2  202928599 309404152 106.47555
## 6  212255541 336530303 124.27476
## 8  270894608 458991599 188.09699
## 18 177001550 241063875  64.06232
## 20 149962221 255108370 105.14615
## 21 223505491 262030663  38.52517
## 24 128802190  70083519 -58.71867
## 25 189834993 218051260  28.21627
## 26 257703404 658672302 400.96890
## 30 228756733 304360277  75.60354
## 31 189792120 373377893 183.58577
## 32 250840534 408992272 158.15174
## 33 190501522 334185206 143.68368
## 42 136277311 116593191 -19.68412
## 43 242638182 414984497 172.34631
## 47 266079517 233914986 -32.16453
## 50 123354065 144812796  21.45873
## 52 189220019 101785482 -87.43454
## 57 253271582 223806889 -29.46469

Final Linear model with a R-square of 0.5734

Classification tree

library(rpart)
library(rpart.plot)

class.tree <- rpart(gross_catogorical ~. -gross, data = train.df, method = "class")
prp(class.tree, type = 1, extra = "auto", split.font = 1, varlen = 0)

fancyRpartPlot(class.tree)

class.tree.pred <- predict(class.tree, valid.df, type = "class")
#accuracy(class.tree, valid.df$gross)


confusionMatrix(class.tree.pred, as.factor(valid.df$gross_catogorical))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (0,7] (7,66] (66,761]
##   (0,7]      214     75        4
##   (7,66]     146    582      146
##   (66,761]     5     78      225
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6922          
##                  95% CI : (0.6679, 0.7157)
##     No Information Rate : 0.4983          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4891          
##                                           
##  Mcnemar's Test P-Value : 1.868e-09       
## 
## Statistics by Class:
## 
##                      Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity                0.5863        0.7918          0.6000
## Specificity                0.9288        0.6054          0.9245
## Pos Pred Value             0.7304        0.6659          0.7305
## Neg Pred Value             0.8723        0.7454          0.8715
## Prevalence                 0.2475        0.4983          0.2542
## Detection Rate             0.1451        0.3946          0.1525
## Detection Prevalence       0.1986        0.5925          0.2088
## Balanced Accuracy          0.7576        0.6986          0.7623

Classification tree accuracy = 0.6922

Pruned Tree

# Cross Validation
set.seed(2)
crossvalid_ct <- rpart(gross_catogorical ~. - gross, data = train.df, method = "class", cp = 0.001, minsplit = 5, xval = 5)
printcp(crossvalid_ct)
## 
## Classification tree:
## rpart(formula = gross_catogorical ~ . - gross, data = train.df, 
##     method = "class", cp = 0.001, minsplit = 5, xval = 5)
## 
## Variables actually used in tree construction:
##  [1] actor_1_facebook_likes    actor_2_facebook_likes   
##  [3] actor_3_facebook_likes    budget                   
##  [5] cast_total_facebook_likes Comedy                   
##  [7] content_rating            country                  
##  [9] Crime                     director_facebook_likes  
## [11] duration                  facenumber_in_poster     
## [13] Family                    Fantasy                  
## [15] imdb_score                movie_facebook_likes     
## [17] num_critic_for_reviews    num_user_for_reviews     
## [19] num_voted_users           Romance                  
## [21] title_year                War                      
## 
## Root node error: 1073/2211 = 0.4853
## 
## n= 2211 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.1081081      0   1.00000 1.00000 0.021902
## 2  0.0251631      3   0.67568 0.75023 0.021086
## 3  0.0195713      4   0.65051 0.71668 0.020871
## 4  0.0177074      5   0.63094 0.70177 0.020767
## 5  0.0158434      6   0.61323 0.70457 0.020787
## 6  0.0102516      7   0.59739 0.68313 0.020630
## 7  0.0083877     11   0.55638 0.67102 0.020536
## 8  0.0074557     12   0.54800 0.66449 0.020484
## 9  0.0069897     13   0.54054 0.65331 0.020392
## 10 0.0052811     16   0.51817 0.64306 0.020305
## 11 0.0046598     19   0.50233 0.63653 0.020248
## 12 0.0037279     20   0.49767 0.63653 0.020248
## 13 0.0034172     29   0.46319 0.63560 0.020240
## 14 0.0027959     35   0.43802 0.64119 0.020289
## 15 0.0023299     48   0.40168 0.64958 0.020360
## 16 0.0022367     52   0.39236 0.66542 0.020491
## 17 0.0018639     67   0.35508 0.66636 0.020499
## 18 0.0016775    106   0.28239 0.67661 0.020580
## 19 0.0015533    111   0.27400 0.68500 0.020644
## 20 0.0013979    119   0.26002 0.68220 0.020623
## 21 0.0013048    140   0.23020 0.68406 0.020637
## 22 0.0012426    146   0.22181 0.68966 0.020679
## 23 0.0010000    168   0.19012 0.70643 0.020800
# prune tree with the smallest tree within 1 xstd of min. error (13 split is the best)
pruned_ct <- prune(crossvalid_ct, 
                   cp = 0.0069897)
length(pruned_ct$frame$var[pruned_ct$frame$var == "<leaf>"])
## [1] 17
prp(pruned_ct)

fancyRpartPlot(pruned_ct)

# Apply prune tree model with prediction

pruned_ct_pred <- predict(pruned_ct, train.df, type = "class")
confusionMatrix(pruned_ct_pred, train.df$gross_catogorical)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (0,7] (7,66] (66,761]
##   (0,7]      355     98        5
##   (7,66]     154    889      138
##   (66,761]    10    151      411
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7485          
##                  95% CI : (0.7299, 0.7665)
##     No Information Rate : 0.5147          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5889          
##                                           
##  Mcnemar's Test P-Value : 0.002096        
## 
## Statistics by Class:
## 
##                      Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity                0.6840        0.7812          0.7419
## Specificity                0.9391        0.7279          0.9028
## Pos Pred Value             0.7751        0.7528          0.7185
## Neg Pred Value             0.9064        0.7583          0.9128
## Prevalence                 0.2347        0.5147          0.2506
## Detection Rate             0.1606        0.4021          0.1859
## Detection Prevalence       0.2071        0.5341          0.2587
## Balanced Accuracy          0.8116        0.7545          0.8224
pruned_ct_pred <- predict(pruned_ct, valid.df, type = "class")
confusionMatrix(pruned_ct_pred, valid.df$gross_catogorical)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (0,7] (7,66] (66,761]
##   (0,7]      232     83        5
##   (7,66]     122    546      123
##   (66,761]    11    106      247
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6949          
##                  95% CI : (0.6707, 0.7183)
##     No Information Rate : 0.4983          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.505           
##                                           
##  Mcnemar's Test P-Value : 0.0121          
## 
## Statistics by Class:
## 
##                      Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity                0.6356        0.7429          0.6587
## Specificity                0.9207        0.6689          0.8936
## Pos Pred Value             0.7250        0.6903          0.6786
## Neg Pred Value             0.8848        0.7237          0.8848
## Prevalence                 0.2475        0.4983          0.2542
## Detection Rate             0.1573        0.3702          0.1675
## Detection Prevalence       0.2169        0.5363          0.2468
## Balanced Accuracy          0.7782        0.7059          0.7762

Accuracy for testing dataset is 0.7752 Accuracy for validation dataset is 0.6949

K-Nearest Neighbors

normalize our data

library(FNN)

# initialize normalized training, validation, test data, complete data frames to originals
train.norm <- train.df
valid.norm <- valid.df


# use preProcess() from the caret package to normalize predictors.
norm.values <- preProcess(train.df[, -39-6], method=c("center", "scale"))
train.norm[, -39-6] <- predict(norm.values, train.df[, -39-6])
valid.norm[, -39-6] <- predict(norm.values, valid.df[, -39-6])

remove column ‘country’, ‘country rating’, ‘gross’ (becuase KNN can only process numeric data, while ‘country’ and ‘country rating’ are factor variables.)

valid.norm <- valid.norm[ -c(11,12,6) ]
train.norm <- train.norm[ -c(11,12,6) ]

#Find the best k

# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 36, 1), accuracy = rep(0, 36))
# compute knn for different k on validation data.
for(i in 1:36) {
  knn.pred <- knn(train.norm[, -36], valid.norm[, -36],
                  cl = train.norm[, 36], k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm[, 36])$overall[1]
}
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
accuracy.df
##     k  accuracy
## 1   1 0.5518644
## 2   2 0.5254237
## 3   3 0.5708475
## 4   4 0.5844068
## 5   5 0.5803390
## 6   6 0.5911864
## 7   7 0.5945763
## 8   8 0.5925424
## 9   9 0.5884746
## 10 10 0.5871186
## 11 11 0.5877966
## 12 12 0.5966102
## 13 13 0.6061017
## 14 14 0.6122034
## 15 15 0.6020339
## 16 16 0.6108475
## 17 17 0.6033898
## 18 18 0.6027119
## 19 19 0.5979661
## 20 20 0.6033898
## 21 21 0.6013559
## 22 22 0.5993220
## 23 23 0.6054237
## 24 24 0.6020339
## 25 25 0.6013559
## 26 26 0.6040678
## 27 27 0.6013559
## 28 28 0.6040678
## 29 29 0.5986441
## 30 30 0.6006780
## 31 31 0.5932203
## 32 32 0.5945763
## 33 33 0.5945763
## 34 34 0.5925424
## 35 35 0.5938983
## 36 36 0.5972881

#plot accuracy for each K

ggplot(accuracy.df, aes(y = accuracy, x = k)) + geom_point() + geom_line()

knn.pred <- knn(train.norm[, -36], valid.norm[, -36],
                  cl = train.norm[, 36], k = 14)

confusionMatrix(knn.pred, valid.norm[, 36])
## Warning in confusionMatrix.default(knn.pred, valid.norm[, 36]): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (0,7] (7,66] (66,761]
##   (0,7]      142     90       11
##   (7,66]     212    585      188
##   (66,761]    11     60      176
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6122          
##                  95% CI : (0.5868, 0.6372)
##     No Information Rate : 0.4983          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3358          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity               0.38904        0.7959          0.4693
## Specificity               0.90901        0.4595          0.9355
## Pos Pred Value            0.58436        0.5939          0.7126
## Neg Pred Value            0.81899        0.6939          0.8379
## Prevalence                0.24746        0.4983          0.2542
## Detection Rate            0.09627        0.3966          0.1193
## Detection Prevalence      0.16475        0.6678          0.1675
## Balanced Accuracy         0.64903        0.6277          0.7024
#From the accuracy result, we found that the best k is 5. However, the highest accuracy is only 0.438

From the accuracy result, we found that the best k is 14. However, the highest accuracy is only 0.6122

library(ggthemes)
# Change point shapes and line types by groups

ggplot(accuracy.df, aes(y = accuracy, x = k))+ 
    geom_point(color="yellow")+
    labs(title="Plot Accuracy for each K Value",x="K Value", y = "Accuracy")+
    theme_economist() + 
    scale_color_economist()+
    geom_step(color="yellow")

random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
set.seed(5)
# I removed 'Sci-Fi' column because the system said it could not find this column.
rf <- randomForest(gross_catogorical ~ . -gross -`Sci-Fi`- Documentary - Western - Musical - History - Sport - War - Biography - Mystery - Animation - Fantasy, data = train.df, mtry = 10, ntree = 1500)
# Show model error (We can see 500 tree is sufficient enough)
plot(rf)
legend('topright', colnames(rf$err.rate), col=1:4, fill=1:4)

#tuning mtry ( Number of variables randomly sampled as candidates at each split)
tune_rf <- tuneRF(train.df[,-c(39,6)], train.df[,39], mtrystart = 5, ntreeTry = 500, stepFactor=1.5, trace = TRUE, plot = TRUE, dobest = TRUE)
## mtry = 6  OOB error = 25.37% 
## Searching left ...
## mtry = 4     OOB error = 25.64% 
## -0.01069519 0.05 
## Searching right ...
## mtry = 9     OOB error = 25.19% 
## 0.007130125 0.05

#install packages for further steps

library(ggthemes)

#relative variable importance by plotting the mean decrease in Gini calculated across all trees

# Get importance
importance <- importance(rf)
# ??? what is MeanDcreaseGini???
varImportance <- data.frame(Variables = row.names(importance), 
                            Importance = round(importance[ ,'MeanDecreaseGini'],2))

# Create a rank variable based on importance
rankImportance <- varImportance %>%
  mutate(Rank = paste0('#',dense_rank(desc(Importance))))

# Use ggplot2 to visualize the relative importance of variables
ggplot(rankImportance, aes(x = reorder(Variables, Importance), 
                           y = Importance, fill = Importance)) +
  geom_bar(stat='identity') + 
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust=0, vjust=0.55, size = 4, colour = 'red') +
  labs(x = 'Variables') +
  coord_flip() + 
  theme_few()

# Apply Model

set.seed(632)
# apply model on validation set
rf.pred.valid <- predict(rf, valid.df)
# generate confusion matrix for validation data
confusionMatrix(rf.pred.valid, valid.df$gross_catogorical)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (0,7] (7,66] (66,761]
##   (0,7]      249     57        2
##   (7,66]     113    613      127
##   (66,761]     3     65      246
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7512          
##                  95% CI : (0.7283, 0.7731)
##     No Information Rate : 0.4983          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5894          
##                                           
##  Mcnemar's Test P-Value : 2.041e-08       
## 
## Statistics by Class:
## 
##                      Class: (0,7] Class: (7,66] Class: (66,761]
## Sensitivity                0.6822        0.8340          0.6560
## Specificity                0.9468        0.6757          0.9382
## Pos Pred Value             0.8084        0.7186          0.7834
## Neg Pred Value             0.9006        0.8039          0.8889
## Prevalence                 0.2475        0.4983          0.2542
## Detection Rate             0.1688        0.4156          0.1668
## Detection Prevalence       0.2088        0.5783          0.2129
## Balanced Accuracy          0.8145        0.7548          0.7971

Random Forest accuracy = 0.7512

Try predict profitability with GLM model

# Create data frame with profit

final_df_profit <- final_df
final_df_profit$profitable <- as.factor(ifelse(final_df$gross-final_df$budget > 0, 1 ,0))

# partition data
set.seed(3)  # set seed for reproducing the partition
train.index <- sample(c(1:3686), 3686*0.6)  

#Create and set aside the remaining 40% of the data, to be used after omitting unhelpful data points and unnecessary variables.
train.df.logistic <- final_df_profit[train.index,]
valid.df.logistic <- final_df_profit[-train.index,]


# Try using logistic model to predict wether a movie will profit or not (Profit = 1, Did not profit = 0)
logistic_reg <- glm(profitable~. - gross - gross_catogorical, data = train.df.logistic, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
## 
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical, family = "binomial", 
##     data = train.df.logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2825  -0.9054   0.1644   0.9050   2.3584  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                1.222e+02  1.932e+01   6.327 2.50e-10 ***
## num_critic_for_reviews     2.561e-03  9.408e-04   2.723  0.00648 ** 
## duration                  -4.634e-03  3.549e-03  -1.306  0.19161    
## director_facebook_likes   -6.474e-05  2.038e-05  -3.177  0.00149 ** 
## actor_3_facebook_likes    -9.209e-05  8.556e-05  -1.076  0.28179    
## actor_1_facebook_likes    -1.212e-04  5.521e-05  -2.195  0.02817 *  
## num_voted_users            1.111e-05  1.334e-06   8.329  < 2e-16 ***
## cast_total_facebook_likes  1.162e-04  5.518e-05   2.105  0.03526 *  
## facenumber_in_poster       4.203e-02  2.845e-02   1.477  0.13958    
## num_user_for_reviews       3.613e-04  2.989e-04   1.209  0.22680    
## countryUK                 -3.157e-02  2.419e-01  -0.130  0.89618    
## countryUSA                 1.013e+00  1.743e-01   5.812 6.17e-09 ***
## content_ratingNC-17       -7.967e-01  9.670e-01  -0.824  0.40998    
## content_ratingPG           1.824e-01  4.047e-01   0.451  0.65230    
## content_ratingPG-13       -2.681e-01  4.575e-01  -0.586  0.55790    
## content_ratingR           -9.660e-01  4.582e-01  -2.109  0.03499 *  
## budget                    -1.883e-08  2.192e-09  -8.590  < 2e-16 ***
## title_year                -6.189e-02  9.629e-03  -6.428 1.29e-10 ***
## actor_2_facebook_likes    -1.230e-04  5.939e-05  -2.071  0.03837 *  
## imdb_score                 2.075e-01  7.229e-02   2.870  0.00410 ** 
## aspect_ratio              -3.657e-02  1.079e-01  -0.339  0.73458    
## movie_facebook_likes       1.472e-06  4.390e-06   0.335  0.73744    
## Action                     3.988e-02  1.534e-01   0.260  0.79487    
## Adventure                 -1.843e-01  1.691e-01  -1.090  0.27559    
## Animation                  6.898e-02  3.086e-01   0.224  0.82314    
## Biography                  6.661e-02  2.430e-01   0.274  0.78398    
## Comedy                     3.599e-01  1.368e-01   2.630  0.00853 ** 
## Crime                     -2.366e-01  1.500e-01  -1.578  0.11463    
## Documentary                9.126e-01  4.445e-01   2.053  0.04004 *  
## Drama                     -1.602e-01  1.360e-01  -1.178  0.23887    
## Family                     7.028e-01  2.863e-01   2.455  0.01409 *  
## Fantasy                   -3.485e-01  1.769e-01  -1.970  0.04880 *  
## History                   -3.397e-04  3.026e-01  -0.001  0.99910    
## Horror                     1.047e+00  2.160e-01   4.849 1.24e-06 ***
## Musical                   -4.638e-01  3.575e-01  -1.297  0.19453    
## Mystery                   -6.504e-02  1.792e-01  -0.363  0.71662    
## Romance                    2.818e-01  1.298e-01   2.171  0.02992 *  
## `Sci-Fi`                  -8.684e-01  1.802e-01  -4.819 1.44e-06 ***
## Sport                     -2.819e-01  2.716e-01  -1.038  0.29920    
## Thriller                   1.193e-01  1.453e-01   0.821  0.41166    
## War                       -2.531e-01  3.303e-01  -0.766  0.44350    
## Western                   -6.909e-01  4.646e-01  -1.487  0.13702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3053.4  on 2210  degrees of freedom
## Residual deviance: 2354.0  on 2169  degrees of freedom
## AIC: 2438
## 
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
##                                 GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews      4.336826  1        2.082505
## duration                    1.910592  1        1.382242
## director_facebook_likes     1.115568  1        1.056205
## actor_3_facebook_likes      7.081803  1        2.661166
## actor_1_facebook_likes    423.623912  1       20.582126
## num_voted_users             4.358102  1        2.087607
## cast_total_facebook_likes 535.683721  1       23.144842
## facenumber_in_poster        1.152229  1        1.073419
## num_user_for_reviews        2.795533  1        1.671985
## country                     1.188991  2        1.044226
## content_rating              4.102662  4        1.192980
## budget                      3.093648  1        1.758877
## title_year                  2.064589  1        1.436868
## actor_2_facebook_likes     19.762006  1        4.445448
## imdb_score                  1.957286  1        1.399030
## aspect_ratio                1.095206  1        1.046521
## movie_facebook_likes        2.006638  1        1.416558
## Action                      1.738799  1        1.318635
## Adventure                   1.703976  1        1.305364
## Animation                   1.719364  1        1.311245
## Biography                   1.327803  1        1.152303
## Comedy                      1.812076  1        1.346134
## Crime                       1.420812  1        1.191978
## Documentary                 1.171711  1        1.082456
## Drama                       1.843044  1        1.357587
## Family                      3.297168  1        1.815811
## Fantasy                     1.331065  1        1.153718
## History                     1.347422  1        1.160785
## Horror                      1.507041  1        1.227616
## Musical                     1.110534  1        1.053819
## Mystery                     1.175515  1        1.084212
## Romance                     1.223927  1        1.106312
## `Sci-Fi`                    1.283822  1        1.133059
## Sport                       1.126163  1        1.061208
## Thriller                    1.787914  1        1.337129
## War                         1.238787  1        1.113008
## Western                     1.047854  1        1.023647
logistic_reg <- glm(profitable ~. - gross - gross_catogorical - facenumber_in_poster - content_rating - aspect_ratio -movie_facebook_likes - Action -Adventure -Animation -Biography -History - Mystery - Sport - Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - duration - Musical, data = train.df.logistic, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
## 
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical - facenumber_in_poster - 
##     content_rating - aspect_ratio - movie_facebook_likes - Action - 
##     Adventure - Animation - Biography - History - Mystery - Sport - 
##     Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - 
##     cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - 
##     duration - Musical, family = "binomial", data = train.df.logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2428  -0.9385   0.1826   0.9414   2.1974  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.108e+02  1.777e+01   6.238 4.42e-10 ***
## num_critic_for_reviews   2.478e-03  8.204e-04   3.020 0.002525 ** 
## director_facebook_likes -6.588e-05  1.987e-05  -3.315 0.000917 ***
## num_voted_users          1.154e-05  1.126e-06  10.248  < 2e-16 ***
## countryUK                1.201e-01  2.332e-01   0.515 0.606468    
## countryUSA               1.076e+00  1.690e-01   6.367 1.92e-10 ***
## budget                  -1.661e-08  1.789e-09  -9.287  < 2e-16 ***
## title_year              -5.642e-02  8.859e-03  -6.369 1.90e-10 ***
## imdb_score               1.101e-01  6.573e-02   1.676 0.093830 .  
## Comedy                   3.787e-01  1.191e-01   3.180 0.001473 ** 
## Crime                   -3.399e-01  1.321e-01  -2.572 0.010106 *  
## Documentary              9.266e-01  4.265e-01   2.173 0.029813 *  
## Drama                   -2.435e-01  1.259e-01  -1.935 0.052977 .  
## Family                   1.189e+00  1.839e-01   6.467 9.97e-11 ***
## Fantasy                 -3.031e-01  1.705e-01  -1.778 0.075447 .  
## Horror                   8.879e-01  2.014e-01   4.407 1.05e-05 ***
## Romance                  3.304e-01  1.233e-01   2.680 0.007364 ** 
## `Sci-Fi`                -7.666e-01  1.707e-01  -4.490 7.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3053.4  on 2210  degrees of freedom
## Residual deviance: 2421.2  on 2193  degrees of freedom
## AIC: 2457.2
## 
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
##                             GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews  3.430066  1        1.852044
## director_facebook_likes 1.095915  1        1.046859
## num_voted_users         3.208795  1        1.791311
## country                 1.112021  2        1.026900
## budget                  2.192059  1        1.480560
## title_year              1.805832  1        1.343812
## imdb_score              1.661448  1        1.288972
## Comedy                  1.417410  1        1.190550
## Crime                   1.140841  1        1.068101
## Documentary             1.123695  1        1.060045
## Drama                   1.633249  1        1.277986
## Family                  1.414124  1        1.189170
## Fantasy                 1.280177  1        1.131449
## Horror                  1.349041  1        1.161482
## Romance                 1.148672  1        1.071761
## `Sci-Fi`                1.198800  1        1.094897
#Apply logistics regression model

logistic_reg_pred <- predict(logistic_reg, valid.df.logistic[, - 40], type = "response")
confusionMatrix(as.factor(ifelse(logistic_reg_pred > 0.5 ,1, 0)), as.factor(valid.df.logistic$profitable))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 505 204
##          1 212 554
##                                           
##                Accuracy : 0.718           
##                  95% CI : (0.6942, 0.7408)
##     No Information Rate : 0.5139          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4353          
##                                           
##  Mcnemar's Test P-Value : 0.7314          
##                                           
##             Sensitivity : 0.7043          
##             Specificity : 0.7309          
##          Pos Pred Value : 0.7123          
##          Neg Pred Value : 0.7232          
##              Prevalence : 0.4861          
##          Detection Rate : 0.3424          
##    Detection Prevalence : 0.4807          
##       Balanced Accuracy : 0.7176          
##                                           
##        'Positive' Class : 0               
## 

Logistic regression Accuracy = 0.718

logistic_reg <- glm(profitable ~. - gross - gross_catogorical - facenumber_in_poster - content_rating - aspect_ratio -movie_facebook_likes - Action -Adventure -Animation -Biography -History - Mystery - Sport - Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - duration - Musical, data = train.df.logistic[-c(647,2817),], family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_reg)
## 
## Call:
## glm(formula = profitable ~ . - gross - gross_catogorical - facenumber_in_poster - 
##     content_rating - aspect_ratio - movie_facebook_likes - Action - 
##     Adventure - Animation - Biography - History - Mystery - Sport - 
##     Thriller - War - Western - actor_1_facebook_likes - actor_2_facebook_likes - 
##     cast_total_facebook_likes - actor_3_facebook_likes - num_user_for_reviews - 
##     duration - Musical, family = "binomial", data = train.df.logistic[-c(647, 
##     2817), ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2411  -0.9376   0.1828   0.9416   2.1976  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.111e+02  1.777e+01   6.250 4.11e-10 ***
## num_critic_for_reviews   2.473e-03  8.203e-04   3.014 0.002578 ** 
## director_facebook_likes -6.606e-05  1.987e-05  -3.324 0.000886 ***
## num_voted_users          1.153e-05  1.126e-06  10.242  < 2e-16 ***
## countryUK                1.200e-01  2.332e-01   0.515 0.606834    
## countryUSA               1.078e+00  1.690e-01   6.377 1.81e-10 ***
## budget                  -1.660e-08  1.789e-09  -9.282  < 2e-16 ***
## title_year              -5.654e-02  8.861e-03  -6.381 1.76e-10 ***
## imdb_score               1.107e-01  6.573e-02   1.684 0.092225 .  
## Comedy                   3.768e-01  1.191e-01   3.164 0.001557 ** 
## Crime                   -3.424e-01  1.322e-01  -2.591 0.009581 ** 
## Documentary              9.237e-01  4.265e-01   2.166 0.030337 *  
## Drama                   -2.426e-01  1.259e-01  -1.928 0.053884 .  
## Family                   1.188e+00  1.839e-01   6.461 1.04e-10 ***
## Fantasy                 -3.040e-01  1.705e-01  -1.783 0.074551 .  
## Horror                   8.861e-01  2.015e-01   4.398 1.09e-05 ***
## Romance                  3.281e-01  1.233e-01   2.661 0.007792 ** 
## `Sci-Fi`                -7.678e-01  1.707e-01  -4.497 6.88e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3051.8  on 2209  degrees of freedom
## Residual deviance: 2420.1  on 2192  degrees of freedom
## AIC: 2456.1
## 
## Number of Fisher Scoring iterations: 6
vif(logistic_reg)
##                             GVIF Df GVIF^(1/(2*Df))
## num_critic_for_reviews  3.429202  1        1.851810
## director_facebook_likes 1.096059  1        1.046929
## num_voted_users         3.208691  1        1.791282
## country                 1.112211  2        1.026944
## budget                  2.191782  1        1.480467
## title_year              1.804937  1        1.343479
## imdb_score              1.661278  1        1.288906
## Comedy                  1.417260  1        1.190487
## Crime                   1.141222  1        1.068280
## Documentary             1.123681  1        1.060038
## Drama                   1.632540  1        1.277709
## Family                  1.414042  1        1.189135
## Fantasy                 1.280181  1        1.131451
## Horror                  1.348977  1        1.161455
## Romance                 1.148907  1        1.071871
## `Sci-Fi`                1.198888  1        1.094937

#Apply logistics regression model

logistic_reg_pred <- predict(logistic_reg, valid.df.logistic[, - 40], type = "response")
confusionMatrix(as.factor(ifelse(logistic_reg_pred > 0.5 ,1, 0)), as.factor(valid.df.logistic$profitable))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 505 204
##          1 212 554
##                                           
##                Accuracy : 0.718           
##                  95% CI : (0.6942, 0.7408)
##     No Information Rate : 0.5139          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4353          
##                                           
##  Mcnemar's Test P-Value : 0.7314          
##                                           
##             Sensitivity : 0.7043          
##             Specificity : 0.7309          
##          Pos Pred Value : 0.7123          
##          Neg Pred Value : 0.7232          
##              Prevalence : 0.4861          
##          Detection Rate : 0.3424          
##    Detection Prevalence : 0.4807          
##       Balanced Accuracy : 0.7176          
##                                           
##        'Positive' Class : 0               
##