Bike share Case Study

Differnt bike use tendency between casual riders and members

1. Install and load the packages for the analysis

#install and load packages
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/User/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\User\AppData\Local\Temp\RtmpeiVez0\downloaded_packages
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
install.packages("lubridate", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/User/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'lubridate' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lubridate'
## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\User\AppData\Local\R\win-
## library\4.2\00LOCK\lubridate\libs\x64\lubridate.dll to C:
## \Users\User\AppData\Local\R\win-library\4.2\lubridate\libs\x64\lubridate.dll:
## Permission denied
## Warning: restored 'lubridate'
## 
## The downloaded binary packages are in
##  C:\Users\User\AppData\Local\Temp\RtmpeiVez0\downloaded_packages
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)

2. Import the 12 months datasets and bind them into a complete dataset

#Import datasets from Sep 2021 to Aug 2022 and bind them all
filepath <- "data/202109-divvy-tripdata.csv"
new_dataset <- read.csv(filepath)
repeat{
 dateofdata <- as.integer(substr(filepath, 6, 11))+1
 substr(filepath, 6, 11) <- as.character(dateofdata)
 second_import <- read.csv(filepath)
 new_dataset <- rbind(new_dataset, second_import)
 if(dateofdata == 202112){
   filepath <- "data/202201-divvy-tripdata.csv"
   second_import <- read.csv(filepath)
   new_dataset <- rbind(new_dataset, second_import)
 }else if(dateofdata == 202208){
   break
 }
}

3. Split the dataset into two datasets(time, location)

#Prepare a dataset to inspect its datetime data
dt_dataset <- select(new_dataset, 'ride_id', 'started_at', 'ended_at', 'member_casual')

#Prepare a dataset to inspect geographic data
lc_dataset <- select(new_dataset, 'ride_id', 'start_station_name', 'end_station_name', 'member_casual')

4. Prepare the dataframes

  • Convert the date columns from character to date type.

  • Extract the month column and wday column from the date column.

  • Reorder the weekdays, because it’s not in the right order.

  • Convert the started_at, ended_at columns to time type.

  • Calculate the trip duration using the converted time columns.

  • Remove bad data which contains negative time difference from trip duration column.

  • Deal with missing values.

  • Concatenate start and end station names to observe trip routes.

#Data cleaning for dt_dataset

#Convert Character to Date and format to Month and Weekday
dt_dataset$trip_date <- as.Date(dt_dataset$started_at)
dt_dataset$trip_month <- format(as.Date(dt_dataset$trip_date), "%m")
dt_dataset$trip_wday <- format(as.Date(dt_dataset$trip_date), "%a")

#Reorder the weekdays
dt_dataset$trip_wday <- factor(dt_dataset$trip_wday, levels= c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))

#Convert Datetime to Time datatype, remove rows with negative trip_duration
dt_dataset$trip_stime <- as.POSIXct(dt_dataset$started_at, tz='UTC')
dt_dataset$trip_etime <- as.POSIXct(dt_dataset$ended_at, tz='UTC')

dt_dataset$trip_duration <- difftime(dt_dataset$trip_etime, dt_dataset$trip_stime)
dt_dataset2 <- subset(dt_dataset, trip_duration>0)


#Data Cleaning for lc_dataset

#Deal with missing Values
lc_dataset <- lc_dataset %>% 
  filter(start_station_name!=""&end_station_name!="")

#Create trip_route column by concatenating the start and end station names
lc_dataset$trip_route <- paste(lc_dataset$start_station_name, lc_dataset$end_station_name, sep=" - ")

#Create round trip column by inspecting whether start station name equals to end station name or not
lc_dataset$round_trip[lc_dataset$start_station_name==lc_dataset$end_station_name] <- "YES"
lc_dataset$round_trip[lc_dataset$start_station_name!=lc_dataset$end_station_name] <- "NO"

5. Explore and analyze the data

  • Take a look at total ride trends on daily basis by user types.
#Comparison by the number of daily ride between user types
dt_dataset2 %>%
  group_by(trip_date, member_casual) %>% 
  summarize(daily_ride_count=n_distinct(ride_id)) %>% 
  ggplot(aes(x = trip_date, y = daily_ride_count, color = member_casual)) +
  geom_point() +
  geom_line(aes(col=member_casual), size=1, alpha=0.4) +
  facet_wrap(~member_casual) +
  labs(title="Daily ride trends by member users and casual users(Ride counts)") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="none")
## `summarise()` has grouped output by 'trip_date'. You can override using the
## `.groups` argument.

  • Check out the difference between the user types by each weekday
#Compare the number of ride by each user type every weekdays
dt_dataset2 %>% 
  group_by(trip_wday, member_casual) %>% 
  summarise(ride_count_by_weekdays=n_distinct(ride_id)) %>% 
  ggplot(aes(x=trip_wday, y=ride_count_by_weekdays, group=member_casual)) +
  geom_line(aes(color=member_casual), size=2) +
  labs(title="Ride trends on each weekday by member users and casual users") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="bottom", legend.title = element_blank())
## `summarise()` has grouped output by 'trip_wday'. You can override using the
## `.groups` argument.

  • Ride comparison between the user types by each month
#Compare the number of ride by each user type every months
dt_dataset2 %>% 
  group_by(trip_month, member_casual) %>% 
  summarise(ride_count_by_months=n_distinct(ride_id)) %>% 
  ggplot(aes(x=trip_month, y=ride_count_by_months, group=member_casual)) +
  geom_line(aes(color=member_casual), size=2) +
  labs(title="Ride trends in each month by member users and casual users") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="bottom", legend.title = element_blank())
## `summarise()` has grouped output by 'trip_month'. You can override using the
## `.groups` argument.

  • Compare the average trip duration
#Compare the average trip duration between the two user types
dt_dataset2 %>% 
  group_by(member_casual) %>% 
  summarise(trip_duration_average=mean(as.integer(trip_duration)/60)) %>% 
  ggplot(aes(x=member_casual, y=trip_duration_average, fill=member_casual)) + 
  geom_bar(stat = "identity", width=0.2) +
  labs(title="Trip duration average by member users and casual users (min)") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="none")

  • Round trip proportion by user type
#Create the pie chart to see the proportion of each user type from all the round trips
lc_dataset %>% 
  filter(round_trip=="YES") %>% 
  group_by(member_casual) %>% 
  summarize(count=n_distinct(ride_id)) %>% 
  mutate(percent=count/sum(count)*100) %>% 
  ggplot(aes(x = 2, y = percent, fill = member_casual)) +
  geom_bar(stat = "identity", color = "white") +
  coord_polar(theta = "y", start = 0) +
  labs(title="Round trip percentage by user type(%)") +
  geom_text(aes(label = round(percent,2)), position = position_stack(vjust = 0.5), color = "black") +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        plot.background = element_blank(),
        legend.position="bottom",
        legend.title = element_blank())

  • Check out the 10 most popular trip routes for each user type
#Check the 10 most popular trip routes
lc_dataset %>% 
  group_by(member_casual, trip_route) %>% 
  summarise(count=n_distinct(ride_id)) %>% 
  arrange(desc(count), trip_route) %>% 
  head(n=10) %>% 
  ggplot(aes(x=count, y=reorder(trip_route, count), fill=member_casual)) + 
  geom_bar(stat = "identity", width=0.4) +
  labs(title="The 10 most popular trip routes") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="bottom", legend.title = element_blank())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

  • The 10 most popular bike pickup(start) station
#Create the bar chart to see the top 10 popular station to start the trip
lc_dataset %>% 
  group_by(start_station_name, member_casual) %>% 
  summarize(ride_count=n_distinct(ride_id)) %>%
  arrange(desc(ride_count)) %>% 
  head(n=10) %>% 
  ggplot(aes(x=ride_count, y=reorder(start_station_name, ride_count), fill=member_casual)) + 
  geom_bar(stat = "identity", width=0.4) +
  labs(title="The 10 most popular bike pickup station") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(), legend.position="bottom", legend.title = element_blank())
## `summarise()` has grouped output by 'start_station_name'. You can override
## using the `.groups` argument.