Project 4: YouTube Web Scraper - Yoga with Adriene

Data Source & Purpose of Data Analysis

Data Source: YouTube (Channel: Yoga with Adriene)

Purpose of Data Analysis: Pull channel and video data from a YouTube channel through the YouTube Data API, then analyze it

To view this project on GitHub, please click this link

1. Prepare the environment for Web scraping

1.1 Import Libraries

# import libraries
from googleapiclient.discovery import build
import pandas as pd
import seaborn as sns

1.2 Prepare API key and channel ID

# prepare api key and channel id
import os

# SECURITY: an API key committed to source is readable by anyone with access
# to this file. Supply the key through the YOUTUBE_API_KEY environment
# variable instead; the literal below is kept only as a backward-compatible
# fallback and should be revoked and removed.
api_key = os.environ.get('YOUTUBE_API_KEY') or 'AIzaSyC7pCgKz77npiTz7cVQlD7V0-S_5Ih8ybM'
channel_id = 'UCFKE7WVJfvaHW5q283SxchA'  # Youtube Channel 'Yoga with Adriene'

# API client for the 'youtube' service, version 'v3', using the developer key.
youtube = build('youtube', 'v3', developerKey=api_key)

2. Function to get channel statistics

def get_channel_stats(youtube, channel_id):
    """Fetch headline statistics for a single YouTube channel.

    Args:
        youtube: API client exposing ``channels().list(...)``.
        channel_id: ID of the channel to look up.

    Returns:
        A dict with the keys ``title``, ``country``, ``subscribers``,
        ``views``, ``videos`` and ``playlist`` (the ID of the channel's
        uploads playlist). Values are passed through exactly as found in the
        response; ``country`` is ``None`` when the response carries no
        country for the channel.
    """
    request = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=channel_id,
    )
    response = request.execute()

    # Only the first returned item is used; look it up once instead of
    # re-indexing the response for every field.
    channel = response['items'][0]
    snippet = channel['snippet']
    statistics = channel['statistics']

    return dict(
        title=snippet['title'],
        # 'country' is optional in the channel snippet, so a plain index
        # would raise KeyError for channels that never set one.
        country=snippet.get('country'),
        subscribers=statistics['subscriberCount'],
        views=statistics['viewCount'],
        videos=statistics['videoCount'],
        playlist=channel['contentDetails']['relatedPlaylists']['uploads'],
    )

# Fetch the channel-level statistics for the configured channel.
channel_data = get_channel_stats(youtube=youtube, channel_id=channel_id)

3. Function to get video ids

def get_video_ids(youtube, playlist_id):
    """Collect the video IDs of every item in a playlist.

    Pages through ``playlistItems().list`` (50 items per request) until the
    response no longer carries a ``nextPageToken``.

    Args:
        youtube: API client exposing ``playlistItems().list(...)``.
        playlist_id: ID of the playlist to walk.

    Returns:
        A list of video IDs, in the order the pages and their items were
        returned.
    """
    video_ids = []
    next_page_token = None

    while True:
        params = dict(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
        )
        # The first request carries no page token; later requests pass the
        # token handed back by the previous page.
        if next_page_token is not None:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()
        video_ids.extend(
            item['contentDetails']['videoId'] for item in response['items']
        )

        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            return video_ids

# Gather the ID of every video listed in the channel's uploads playlist.
video_ids = get_video_ids(youtube=youtube, playlist_id=channel_data['playlist'])

4. Function to get video details

def get_video_details(youtube, video_ids):
    """Fetch title, publish date and engagement counters for each video.

    The IDs are queried in batches of 50 per ``videos().list`` request.

    Args:
        youtube: API client exposing ``videos().list(...)``.
        video_ids: Sequence of video ID strings.

    Returns:
        A list with one dict per returned video, with the keys ``Title``,
        ``Published_date``, ``Views``, ``Likes`` and ``Comments``. Values are
        passed through as found in the response; a counter that is missing
        from a video's statistics is reported as ``None``.
    """
    all_video_stats = []

    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(batch),
        )
        response = request.execute()

        for video in response['items']:
            snippet = video['snippet']
            statistics = video['statistics']
            all_video_stats.append(dict(
                Title=snippet['title'],
                Published_date=snippet['publishedAt'],
                # Counters can be absent from 'statistics' (for example when
                # likes are hidden or comments are disabled); .get keeps one
                # such video from aborting the whole fetch with a KeyError.
                Views=statistics.get('viewCount'),
                Likes=statistics.get('likeCount'),
                Comments=statistics.get('commentCount'),
            ))

    return all_video_stats

5. Create a dataframe with video details

# Fetch the per-video records and load them into a DataFrame.
video_details = get_video_details(youtube=youtube, video_ids=video_ids)
video_data = pd.DataFrame(data=video_details)
video_data.head()

6. Clean data

# Reduce the publication timestamp to its calendar date.
video_data['Published_date'] = pd.to_datetime(video_data['Published_date']).dt.date

# Convert each counter column to a numeric dtype.
for column in ('Views', 'Likes', 'Comments'):
    video_data[column] = pd.to_numeric(video_data[column])

video_data.head()

7. Process and Analyze the data

7.1 Top 10 Videos

# Order the videos by view count, highest first, and keep the first ten rows.
top10_videos = video_data.sort_values('Views', ascending=False).iloc[:10]
top10_videos

# Horizontal bar chart: one bar per title, bar length is the view count.
ax1 = sns.barplot(data=top10_videos, x='Views', y='Title')

7.2 Sort the data by month

# Calendar order for the month labels produced by strftime('%b') below.
sort_order = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()

# Label each video with the abbreviated name of its publication month and
# count the videos per label; the counts land in a column named 'size'.
video_data['Month'] = pd.to_datetime(video_data['Published_date']).dt.strftime('%b')
videos_per_month = video_data.groupby(by='Month', as_index=False).size()

# Re-index by an ordered categorical of the month labels so that sorting the
# index yields calendar order rather than alphabetical order.
videos_per_month.index = pd.CategoricalIndex(
    videos_per_month['Month'],
    categories=sort_order,
    ordered=True,
)
videos_per_month = videos_per_month.sort_index()

# New video release count per month
ax2 = sns.barplot(data=videos_per_month, x='Month', y='size')

7.3 Total views for each year

# Tag every video with its four-digit publication year, stored as a string.
video_data['Year'] = pd.to_datetime(video_data['Published_date']).dt.strftime('%Y')

# Add up the view counts of all videos published in the same year.
views_per_year = video_data.groupby(by='Year', as_index=False)['Views'].sum()
views_per_year

# Total views per year
ax3 = sns.barplot(data=views_per_year, x='Year', y='Views')

7.4 Total videos released each year

# Count the titles per publication year; the count is stored in the 'Title'
# column of the result. Requires the 'Year' column on video_data.
videos_per_year = video_data.groupby(by='Year', as_index=False)['Title'].count()
videos_per_year

# Published videos per year
ax4 = sns.barplot(data=videos_per_year, x='Year', y='Title')

7.5 Correlation between View counts and Likes

# Scatter plot of likes against views, one point per video, to show how the
# two counts relate.
ax5 = sns.scatterplot(data=video_data, x='Views', y='Likes')