 
								Data Source & Purpose of Data Analysis
Data Source: Divvy Bikes license
Purpose of Data Analysis
- How Does a Bike-Share Navigate Speedy Success?
- How can we convert casual riders into members?
- How do annual members and casual riders differ? (When and where do they ride?)
If you want to see this project as an HTML document from R Markdown, please click this Link. If you want to see this project as a PDF document from R Markdown, please click this Link.
1. Prepare Data
1.1 Install Packages
# Install any package that is not yet available, then attach it.
# Checking first avoids re-downloading and reinstalling all three packages
# from CRAN every time the script or document is run.
# lubridate and ggplot2 are listed explicitly so the script does not rely on
# which packages the installed tidyverse version attaches.
required_packages <- c("tidyverse", "lubridate", "ggplot2")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "http://cran.us.r-project.org")
  }
  library(pkg, character.only = TRUE)
}
1.2 Import Data
# Import the twelve monthly trip files from Sep 2021 to Aug 2022 and bind them
# into a single data frame, new_dataset.
# The YYYYMM stamps come from a monthly date sequence, so the Dec -> Jan year
# rollover needs no special case. All files are read first and bound once,
# instead of growing the data frame with rbind() on every loop iteration.
month_starts <- seq(as.Date("2021-09-01"), as.Date("2022-08-01"), by = "month")
filepaths <- file.path(
  "data",
  paste0(format(month_starts, "%Y%m"), "-divvy-tripdata.csv")
)
new_dataset <- do.call(rbind, lapply(filepaths, read.csv))
2. Process Data
2.1 Create subsets
# Subset used to inspect the datetime columns
dt_dataset <- new_dataset %>%
  select(ride_id, started_at, ended_at, member_casual)
# Subset used to inspect the station (geographic) columns
lc_dataset <- new_dataset %>%
  select(ride_id, start_station_name, end_station_name, member_casual)
2.2 Organize and Clean Data
- Convert the date columns from character to date type.
- Extract the month column and wday column from the date column.
- Reorder the weekdays, because they are not in the right order by default.
- Convert the started_at, ended_at columns to time type.
- Calculate the trip duration using the converted time columns.
- Remove bad rows whose trip duration is zero or negative.
- Deal with missing values.
- Concatenate start and end station names to observe trip routes.
# Data cleaning for dt_dataset
# Derive the trip date and the two-digit month ("01".."12") from started_at
dt_dataset$trip_date <- as.Date(dt_dataset$started_at)
dt_dataset$trip_month <- format(dt_dataset$trip_date, "%m")
# Build the weekday from the numeric day of week (0 = Sunday) instead of
# format(..., "%a"): "%a" output depends on the session locale, so on a
# non-English system none of the values would match the English levels below
# and the whole column would become NA.
weekday_labels <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
dt_dataset$trip_wday <- factor(
  weekday_labels[as.POSIXlt(dt_dataset$trip_date)$wday + 1L],
  levels = weekday_labels
)
# Parse the start/end timestamps and compute the trip duration.
# units = "secs" is pinned because difftime() otherwise chooses the unit
# automatically from the data, while the average-duration step divides the
# value by 60 on the assumption that it is in seconds.
dt_dataset$trip_stime <- as.POSIXct(dt_dataset$started_at, tz = "UTC")
dt_dataset$trip_etime <- as.POSIXct(dt_dataset$ended_at, tz = "UTC")
dt_dataset$trip_duration <- difftime(
  dt_dataset$trip_etime,
  dt_dataset$trip_stime,
  units = "secs"
)
# Keep only rows with a strictly positive duration (drops zero/negative trips)
dt_dataset2 <- subset(dt_dataset, trip_duration > 0)
# Data cleaning for lc_dataset
# Keep only rides where both station names are non-empty, then derive:
#   trip_route - "<start station> - <end station>"
#   round_trip - "YES" when the ride starts and ends at the same station,
#                "NO" otherwise
lc_dataset <- lc_dataset %>%
  filter(start_station_name != "", end_station_name != "") %>%
  mutate(
    trip_route = paste(start_station_name, end_station_name, sep = " - "),
    round_trip = ifelse(start_station_name == end_station_name, "YES", "NO")
  )
3. Analyze Data
3.1 Daily Bike Ride Trends
# Daily number of rides for each user type, drawn as one panel per type
dt_dataset2 %>%
  group_by(trip_date, member_casual) %>%
  summarise(daily_ride_count = n_distinct(ride_id)) %>%
  ggplot(aes(x = trip_date, y = daily_ride_count, colour = member_casual)) +
  geom_point() +
  # The line inherits its colour mapping from the plot-level aes()
  geom_line(size = 1, alpha = 0.4) +
  facet_wrap(vars(member_casual)) +
  ggtitle("Daily ride trends by member users and casual users(Ride counts)") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none"
  )
								
								
								3.2 Bike Ride Tendency by each weekday
# Number of rides on each weekday, one line per user type
dt_dataset2 %>%
  group_by(trip_wday, member_casual) %>%
  summarize(ride_count_by_weekdays = n_distinct(ride_id)) %>%
  ggplot(
    aes(
      x = trip_wday,
      y = ride_count_by_weekdays,
      group = member_casual,
      colour = member_casual
    )
  ) +
  geom_line(size = 2) +
  ggtitle("Ride trends on each weekday by member users and casual users") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )
								
								
								3.3 Bike Ride Tendency by each month
# Number of rides in each calendar month ("01".."12"), one line per user type
dt_dataset2 %>%
  group_by(trip_month, member_casual) %>%
  summarize(ride_count_by_months = n_distinct(ride_id)) %>%
  ggplot(
    aes(
      x = trip_month,
      y = ride_count_by_months,
      group = member_casual,
      colour = member_casual
    )
  ) +
  geom_line(size = 2) +
  ggtitle("Ride trends in each month by member users and casual users") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )
		
								
								
								3.4 Average Trip Duration
# Average trip duration in minutes for each user type.
# as.numeric(..., units = "mins") converts the difftime column to minutes
# whatever unit it is stored in, instead of assuming it holds seconds and
# dividing by 60.
dt_dataset2 %>%
  group_by(member_casual) %>%
  summarise(
    trip_duration_average = mean(as.numeric(trip_duration, units = "mins"))
  ) %>%
  ggplot(aes(x = member_casual, y = trip_duration_average, fill = member_casual)) +
  geom_bar(stat = "identity", width = 0.2) +
  labs(title = "Trip duration average by member users and casual users (min)") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none"
  )
		
								
								
								3.5 Round Trip User Proportion
# Pie chart: share of round trips (same start and end station) by user type.
# A single stacked bar is drawn and then bent into a circle with polar
# coordinates; each slice is labelled with its percentage (2 decimals).
lc_dataset %>%
  filter(round_trip == "YES") %>%
  group_by(member_casual) %>%
  summarise(count = n_distinct(ride_id)) %>%
  mutate(percent = count / sum(count) * 100) %>%
  ggplot(aes(x = 2, y = percent, fill = member_casual)) +
  geom_col(colour = "white") +
  coord_polar(theta = "y") +
  ggtitle("Round trip percentage by user type(%)") +
  geom_text(
    aes(label = round(percent, digits = 2)),
    position = position_stack(vjust = 0.5),
    colour = "black"
  ) +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    plot.background = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )
								
								
								3.6 The 10 Most Popular Trip Routes
# Count rides per (user type, route) pair, sort by count (ties broken by
# route name), and chart the first ten rows as horizontal bars.
lc_dataset %>%
  group_by(member_casual, trip_route) %>%
  summarize(count = n_distinct(ride_id)) %>%
  arrange(desc(count), trip_route) %>%
  head(n = 10) %>%
  ggplot(
    aes(x = count, y = reorder(trip_route, count), fill = member_casual)
  ) +
  geom_col(width = 0.4) +
  ggtitle("The 10 most popular trip routes") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )
								
								
								3.7 The 10 Most Popular Pickup Stations
# Count rides per (start station, user type) pair, sort by count, and chart
# the first ten rows as horizontal bars.
lc_dataset %>%
  group_by(start_station_name, member_casual) %>%
  summarise(ride_count = n_distinct(ride_id)) %>%
  arrange(desc(ride_count)) %>%
  head(n = 10) %>%
  ggplot(
    aes(
      x = ride_count,
      y = reorder(start_station_name, ride_count),
      fill = member_casual
    )
  ) +
  geom_col(width = 0.4) +
  ggtitle("The 10 most popular bike pickup station") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )