import time
import json
import math
import requests
import youtube_dl
from bs4 import BeautifulSoup
# URL template for a single video page, keyed by its public view key.
BASE_URL_VIDEO = 'https://www.pornhub.com/view_video.php?viewkey={key}'
# Listing pages (o=ht: hottest, o=mv: most viewed), country-filtered to cc=dk.
BASE_URL_HOT = 'https://www.pornhub.com/video?o=ht&cc=dk&page={page}'
BASE_URL_VIEWED = 'https://www.pornhub.com/video?o=mv&cc=dk&page={page}'
class DownloadLogger(object):
    """Logger object handed to youtube-dl's 'logger' option.

    Debug and warning output is suppressed entirely; only error
    messages are surfaced on stdout.
    """

    def debug(self, msg):
        # Silence routine debug chatter from the downloader.
        pass

    def warning(self, msg):
        # Warnings are intentionally dropped as well.
        pass

    def error(self, msg):
        # Errors are the only messages worth showing the user.
        print(msg)
class PornhubVideo(object):
    """Plain record describing one video listing entry.

    All attributes are public and read directly by callers; 'fixed_title'
    is a filesystem-friendly variant of the title, and 'link' is the full
    watch-page URL derived from the view key.
    """

    def __init__(self, key, title, uploader, views, rating, added):
        self.key = key
        self.title = title
        # Replace spaces so the title can be used as an output filename.
        self.fixed_title = title.replace(' ', '_')
        self.uploader = uploader
        self.views = views
        self.rating = rating
        self.added = added
        self.link = BASE_URL_VIDEO.format(key=key)

    def to_json(self):
        """Return a plain dict of the record, suitable for json.dumps()."""
        field_names = ('key', 'title', 'uploader', 'views', 'rating',
                       'added', 'link')
        return {name: getattr(self, name) for name in field_names}
def get_page_count(total_videos):
    """Return how many listing pages hold 'total_videos' results.

    The first page shows 32 videos; every subsequent page shows 44.
    """
    overflow = total_videos - 32
    return 1 + math.ceil(overflow / 44)
def get_page_details(videos):
    """Extract (title, video_count, page_amount) from a sectionWrapper div.

    'videos' is the BeautifulSoup tag returned by get_videos(); the count
    is parsed from the trailing number of the 'showingCounter' text.
    """
    section_title = videos.find('div', class_='sectionTitle').find('h1').text.strip()
    counter_text = videos.find('div', class_='showingCounter').text.strip()
    total_videos = int(counter_text.split(' ')[-1])
    return section_title, total_videos, get_page_count(total_videos)
def parse_most_viewed_video_page(page_number):
    """Fetch one page of the most-viewed listing and parse it.

    Returns the BeautifulSoup document for that page.
    """
    url = BASE_URL_VIEWED.format(page=str(page_number))
    response = requests.get(url)
    return BeautifulSoup(response.text, features='html.parser')
def extract_video_information(video_div):
    """Parse one <li> listing item into a PornhubVideo.

    Returns None when the item cannot be parsed (e.g. premium/removed
    videos whose markup lacks the expected fields).
    """
    try:
        key = str(video_div['data-video-vkey'])
        info = video_div.find('div', class_='wrap').find('div', class_='thumbnail-info-wrapper')
        title = str(info.find('span', class_='title').find('a')['title'])
        uploader = str(info.find('div', class_='videoUploaderBlock').find('div', class_='usernameWrap').find('a').text.strip())
        details = info.find('div', class_='videoDetailsBlock')
        views = str(details.find('span', class_='views').find('var').text)
        rating = str(details.find('div', class_='rating-container').find('div', class_='value').text)
        added = str(details.find('var', class_='added').text)
        return PornhubVideo(key, title, uploader, views, rating, added)
    # Only swallow the failures a malformed listing item can produce
    # (missing tag -> AttributeError/TypeError, missing attr -> KeyError);
    # a bare except would also hide KeyboardInterrupt/SystemExit.
    except (AttributeError, KeyError, TypeError):
        return None
def get_videos(page):
    """Drill down from a parsed listing page to its 'sectionWrapper' div."""
    section = (page
               .find('div', class_='wrapper')
               .find('div', class_='container')
               .find('div', class_='gridWrapper')
               .find('div', class_='nf-videos')
               .find('div', class_='sectionWrapper'))
    return section
def download_video(key, path, title='%(title)s'):
    """Download one video by view key into 'path' using youtube-dl.

    title: output filename stem; defaults to youtube-dl's '%(title)s'
    template. The result is converted to mp4 by the FFmpeg postprocessor.
    """
    print('')
    print(f'Downloading video {title}')
    start_time = time.time()
    outtmpl = f'{path}/{title}.%(ext)s'
    ydl_opts_start = {
        'format': 'best',
        # BUG FIX: the key was misspelled 'playliststart:' (trailing colon),
        # so youtube-dl silently ignored the option.
        'playliststart': 1,
        'playlistend': 4,
        'outtmpl': outtmpl,
        'nooverwrites': True,
        'no_warnings': False,
        'ignoreerrors': True,
        'logger': DownloadLogger(),
        'postprocessors': [{
            'key': 'FFmpegVideoConvertor',
            'preferedformat': 'mp4'
        }],
        'quiet': True
    }
    with youtube_dl.YoutubeDL(ydl_opts_start) as ydl:
        ydl.download([BASE_URL_VIDEO.format(key=key)])
    print(f'Finished download in {str(time.time() - start_time)} seconds')
def generate_videos_index(title, video_count, page_amount, videos):
    """Build a JSON-serializable index dict for a set of videos.

    videos: a single video object or a list/tuple of them; each must
    provide a to_json() method. Counts are stringified to keep the
    existing index.json format.
    """
    # Normalize the single-video case; the original also had a dead
    # 'videos = videos' branch and shadowed the builtin 'json' module
    # with its local result name — both fixed here.
    if not isinstance(videos, (list, tuple)):
        videos = [videos]
    return {
        'title': str(title),
        'video_count': str(video_count),
        'page_amount': str(page_amount),
        'videos': [video.to_json() for video in videos],
    }
def get_most_viewed(all_data=False):
    """Scrape every page of the most-viewed listing.

    Returns the list of PornhubVideo objects, or the tuple
    (title, video_count, page_amount, videos) when all_data is True.
    Inaccessible listing items are skipped.
    """
    first_response = requests.get(BASE_URL_VIEWED.format(page='1'))
    first_page = BeautifulSoup(first_response.text, features='html.parser')
    section = get_videos(first_page)
    title, video_count, page_amount = get_page_details(section)
    collected = []
    for page_number in range(1, page_amount + 1):
        print(f'Extracting videos from page {str(page_number)}')
        listing = get_videos(parse_most_viewed_video_page(page_number)).find(
            'ul', id='videoCategory', class_='videos')
        for item in listing.find_all('li', class_='pcVideoListItem'):
            parsed = extract_video_information(item)
            # None means the video is not accessible — skip it.
            if parsed is not None:
                collected.append(parsed)
    if all_data:
        return title, video_count, page_amount, collected
    return collected
def download_most_viewed(path):
    """Scrape the most-viewed listing, write an index.json, download all videos.

    path: existing directory the index file and videos are written into.
    """
    title, video_count, page_amount, videos = get_most_viewed(all_data=True)
    # BUG FIX: was opened with 'a+', which appended a second JSON document
    # on every rerun and corrupted index.json; 'w' replaces it instead.
    with open(f'{path}/index.json', 'w') as index_file:
        index_file.write(json.dumps(generate_videos_index(title, video_count, page_amount, videos)))
    # (Also fixed the 'genereated' typo in the status message.)
    print(f'index.json successfully generated in {path}/index.json')
    print('Downloading videos')
    start_time = time.time()
    for video in videos:
        download_video(video.key, path, video.fixed_title)
    print(f'Downloaded all videos in {time.time() - start_time} seconds')
if __name__ == '__main__':
    # Script entry point: scrape the most-viewed listing and download
    # everything into ./out (directory must exist).
    download_most_viewed('./out')