import time
import json
import math
import requests
import youtube_dl
from bs4 import BeautifulSoup
# URL template for a single video page, keyed by its public view key.
BASE_URL_VIDEO = 'https://www.pornhub.com/view_video.php?viewkey={key}'
# Listing pages (o=ht: hottest, o=mv: most viewed), country-filtered to cc=dk.
BASE_URL_HOT = 'https://www.pornhub.com/video?o=ht&cc=dk&page={page}'
BASE_URL_VIEWED = 'https://www.pornhub.com/video?o=mv&cc=dk&page={page}'
class DownloadLogger(object):
    """Logger object handed to youtube-dl's 'logger' option.

    Debug and warning output is suppressed entirely; only error
    messages are surfaced on stdout.
    """

    def debug(self, msg):
        # Silence routine debug chatter from the downloader.
        pass

    def warning(self, msg):
        # Warnings are intentionally dropped as well.
        pass

    def error(self, msg):
        # Errors are the only messages worth showing the user.
        print(msg)
class PornhubVideo(object):
    """Plain record describing one video listing entry.

    All attributes are public and read directly by callers; 'fixed_title'
    is a filesystem-friendly variant of the title, and 'link' is the full
    watch-page URL derived from the view key.
    """

    def __init__(self, key, title, uploader, views, rating, added):
        self.key = key
        self.title = title
        # Replace spaces so the title can be used as an output filename.
        self.fixed_title = title.replace(' ', '_')
        self.uploader = uploader
        self.views = views
        self.rating = rating
        self.added = added
        self.link = BASE_URL_VIDEO.format(key=key)

    def to_json(self):
        """Return a plain dict of the record, suitable for json.dumps()."""
        field_names = ('key', 'title', 'uploader', 'views', 'rating',
                       'added', 'link')
        return {name: getattr(self, name) for name in field_names}
def get_page_count(total_videos):
    """Return how many listing pages hold 'total_videos' results.

    The first page shows 32 videos; every subsequent page shows 44.
    """
    overflow = total_videos - 32
    return 1 + math.ceil(overflow / 44)
def get_page_details(videos):
    """Extract (title, video_count, page_amount) from a sectionWrapper div.

    'videos' is the BeautifulSoup tag returned by get_videos(); the count
    is parsed from the trailing number of the 'showingCounter' text.
    """
    section_title = videos.find('div', class_='sectionTitle').find('h1').text.strip()
    counter_text = videos.find('div', class_='showingCounter').text.strip()
    total_videos = int(counter_text.split(' ')[-1])
    return section_title, total_videos, get_page_count(total_videos)
def parse_most_viewed_video_page(page_number):
    """Fetch one page of the most-viewed listing and parse it.

    Returns the BeautifulSoup document for that page.
    """
    url = BASE_URL_VIEWED.format(page=str(page_number))
    response = requests.get(url)
    return BeautifulSoup(response.text, features='html.parser')
def extract_video_information(video_div):
    """Parse one <li> listing item into a PornhubVideo.

    Returns None when the item cannot be parsed (e.g. premium/removed
    videos whose markup lacks the expected fields).
    """
    try:
        key = str(video_div['data-video-vkey'])
        info = video_div.find('div', class_='wrap').find('div', class_='thumbnail-info-wrapper')
        title = str(info.find('span', class_='title').find('a')['title'])
        uploader = str(info.find('div', class_='videoUploaderBlock').find('div', class_='usernameWrap').find('a').text.strip())
        details = info.find('div', class_='videoDetailsBlock')
        views = str(details.find('span', class_='views').find('var').text)
        rating = str(details.find('div', class_='rating-container').find('div', class_='value').text)
        added = str(details.find('var', class_='added').text)
        return PornhubVideo(key, title, uploader, views, rating, added)
    # Only swallow the failures a malformed listing item can produce
    # (missing tag -> AttributeError/TypeError, missing attr -> KeyError);
    # a bare except would also hide KeyboardInterrupt/SystemExit.
    except (AttributeError, KeyError, TypeError):
        return None
def get_videos(page):
    """Drill down from a parsed listing page to its 'sectionWrapper' div."""
    section = (page
               .find('div', class_='wrapper')
               .find('div', class_='container')
               .find('div', class_='gridWrapper')
               .find('div', class_='nf-videos')
               .find('div', class_='sectionWrapper'))
    return section
def download_video(key, path, title='%(title)s'):
    """Download one video by view key into 'path' using youtube-dl.

    title: output filename stem; defaults to youtube-dl's '%(title)s'
    template. The result is converted to mp4 by the FFmpeg postprocessor.
    """
    print('')
    print(f'Downloading video {title}')
    start_time = time.time()
    outtmpl = f'{path}/{title}.%(ext)s'
    ydl_opts_start = {
        'format': 'best',
        # BUG FIX: the key was misspelled 'playliststart:' (trailing colon),
        # so youtube-dl silently ignored the option.
        'playliststart': 1,
        'playlistend': 4,
        'outtmpl': outtmpl,
        'nooverwrites': True,
        'no_warnings': False,
        'ignoreerrors': True,
        'logger': DownloadLogger(),
        'postprocessors': [{
            'key': 'FFmpegVideoConvertor',
            'preferedformat': 'mp4'
        }],
        'quiet': True
    }
    with youtube_dl.YoutubeDL(ydl_opts_start) as ydl:
        ydl.download([BASE_URL_VIDEO.format(key=key)])
    print(f'Finished download in {str(time.time() - start_time)} seconds')
def generate_videos_index(title, video_count, page_amount, videos):
    """Build a JSON-serializable index dict for a set of videos.

    videos: a single video object or a list/tuple of them; each must
    provide a to_json() method. Counts are stringified to keep the
    existing index.json format.
    """
    # Normalize the single-video case; the original also had a dead
    # 'videos = videos' branch and shadowed the builtin 'json' module
    # with its local result name — both fixed here.
    if not isinstance(videos, (list, tuple)):
        videos = [videos]
    return {
        'title': str(title),
        'video_count': str(video_count),
        'page_amount': str(page_amount),
        'videos': [video.to_json() for video in videos],
    }
def get_most_viewed(all_data=False):
    """Scrape every page of the most-viewed listing.

    Returns the list of PornhubVideo objects, or the tuple
    (title, video_count, page_amount, videos) when all_data is True.
    Inaccessible listing items are skipped.
    """
    first_response = requests.get(BASE_URL_VIEWED.format(page='1'))
    first_page = BeautifulSoup(first_response.text, features='html.parser')
    section = get_videos(first_page)
    title, video_count, page_amount = get_page_details(section)
    collected = []
    for page_number in range(1, page_amount + 1):
        print(f'Extracting videos from page {str(page_number)}')
        listing = get_videos(parse_most_viewed_video_page(page_number)).find(
            'ul', id='videoCategory', class_='videos')
        for item in listing.find_all('li', class_='pcVideoListItem'):
            parsed = extract_video_information(item)
            # None means the video is not accessible — skip it.
            if parsed is not None:
                collected.append(parsed)
    if all_data:
        return title, video_count, page_amount, collected
    return collected
def download_most_viewed(path):
    """Scrape the most-viewed listing, write an index.json, download all videos.

    path: existing directory the index file and videos are written into.
    """
    title, video_count, page_amount, videos = get_most_viewed(all_data=True)
    # BUG FIX: was opened with 'a+', which appended a second JSON document
    # on every rerun and corrupted index.json; 'w' replaces it instead.
    with open(f'{path}/index.json', 'w') as index_file:
        index_file.write(json.dumps(generate_videos_index(title, video_count, page_amount, videos)))
    # (Also fixed the 'genereated' typo in the status message.)
    print(f'index.json successfully generated in {path}/index.json')
    print('Downloading videos')
    start_time = time.time()
    for video in videos:
        download_video(video.key, path, video.fixed_title)
    print(f'Downloaded all videos in {time.time() - start_time} seconds')
if __name__ == '__main__':
    # Script entry point: scrape the most-viewed listing and download
    # everything into ./out (directory must exist).
    download_most_viewed('./out')