# scripts/movie_list/index.py

#!/usr/bin/env python

"""
1. Import a movie_list txt file
2. Query IMDb for each entry, retrieving actual movie name, rating and genres
3. Generate an HTML table from the IMDb data
4. Store the HTML in index.html
"""

import contextlib
import os
import sys
import time
import threading
from pathlib import Path
from http.client import IncompleteRead

import progressbar
from imdb import IMDb


class MovieList:
    """Build a sortable HTML table of movies enriched with IMDb search data.

    Typical use is ``MovieList(src, dst).gen()`` followed by ``write()``.
    Rows found in a previous output file are reused, so only titles that are
    not already listed there trigger an IMDb query.
    """

    # Upper bound on IMDb search attempts after a truncated HTTP response.
    _SEARCH_RETRIES = 5
    # Opening tag of the centred cells; the last one in a row holds the status.
    _CENTER_CELL = '<td align="center">'
    # Marker that tells generated data rows apart from the rest of the page.
    _ROW_MARKER = '<tr><td>'

    def __init__(self, src=None, dst=None):
        """Set up paths and load any previously generated output.

        src: path of the movie list text file; when falsy, ``gen()`` uses
            ``movie_list`` next to the running script.
        dst: path of the HTML output; when falsy, ``index.html`` next to the
            running script is used.
        """
        self.prev_html = []
        self.html = """<html>
    <head>
        <title>My Movie List</title>
        <link rel="stylesheet" type="text/css" href="style.css">
        <script src="../jquery-3.1.0.min.js"></script>
        <script src="jquery.dataTables.min.js"></script>
        <script>
        $(document).ready(function(){
                $('#sortable').DataTable({
                    "pageLength": -1,
                    "bPaginate": false
                });
        });
        </script>
    </head>
    <body>
        <header>
            <div class="scroll-indicator" />
        </header>
    <base target="_parent" />
    <table id="sortable" class="sortable">
        <thead>
            <tr>
                <th> Index </th><th> Title </th><th> Year </th><th> IMDb Rating </th><th> Genre </th><th> Status </th>
            </tr>
        </thead>
        <tbody>"""
        self.src = src
        self.dst = Path(dst) if dst else Path(os.path.dirname(sys.argv[0])) / 'index.html'
        self.movie_list = []
        self.threads = []
        self.read_prev_output()
        self.html_table = None

    @staticmethod
    def _between(text, opener, closer):
        """Return the text between *opener* and the next *closer*, or ''."""
        start = text.find(opener)
        if start < 0:
            return ''
        end = text.find(closer, start + 1)
        return text[start + 1:end] if end >= 0 else ''

    @classmethod
    def _parse_line(cls, raw_line):
        """Split one input line into its title, kind, year and status fields.

        The title is everything before the first '('; the year is read from
        '(...)', the status from '[...]' and the kind from '<...>'. A missing
        field yields '' (or 'movie' for the kind) rather than a stray slice
        of the line.
        """
        return {
            'title': raw_line.split('(', 1)[0].strip(),
            'kind': cls._between(raw_line, '<', '>') or 'movie',
            'year': cls._between(raw_line, '(', ')'),
            'status': cls._between(raw_line, '[', ']'),
        }

    @classmethod
    def _refresh_status(cls, row, status):
        """Replace the '*' placeholder in the status cell of *row*.

        Only the last centred cell is rewritten, so an asterisk that is part
        of the title is left alone. If that cell cannot be located the whole
        row is processed.
        """
        cut = row.rfind(cls._CENTER_CELL)
        if cut < 0:
            return row.replace('*', status)
        return row[:cut] + row[cut:].replace('*', status)

    def _find_cached_row(self, title):
        """Return the first previous data row containing *title*, or None.

        NOTE: this is a plain substring test, so a short title can match a
        row that belongs to a longer one.
        """
        if not title:
            return None
        for row in self.prev_html:
            if self._ROW_MARKER in row and title in row:
                return row
        return None

    def _lookup(self, arg):
        """Search IMDb for *arg* and return the first suitable result or None.

        A truncated HTTP response (IncompleteRead) is retried a bounded
        number of times; when every attempt fails None is returned.
        """
        imdb = IMDb()
        for _ in range(self._SEARCH_RETRIES):
            try:
                results = imdb.search_movie(f'{arg["title"]} {arg["year"]}')
                break
            except IncompleteRead:
                continue
        else:
            sys.stderr.write(f'warning: IMDb search kept failing for {arg["title"]!r}\n')
            return None

        wanted_kind = arg['kind'].lower()
        for entry in results:
            # Membership is tested on keys() on purpose: the result objects
            # come from the imdb package and are not plain dicts.
            keys = entry.keys()
            if any(key not in keys for key in ('kind', 'year', 'title')):
                continue
            # Try to eliminate episode results
            if any('episode' in key.lower() for key in keys) or \
                    'episode' in entry['title'].lower():
                continue
            if entry['kind'].lower() == wanted_kind:
                return entry
        return None

    @staticmethod
    def _render_row(index, arg, movie):
        """Return the HTML row for position *index*.

        When *movie* is falsy the row is built from the input entry *arg*
        without a hyperlink; otherwise the IMDb title and year are used and
        the title links to the IMDb page.
        """
        if movie:
            keys = movie.keys()
            title = movie['title']
            year = movie['year']
            # NOTE(review): plain search results may not carry these two keys,
            # in which case the table shows N/A - confirm whether a follow-up
            # detail query was intended.
            genres = movie['genres'] if 'genres' in keys else ['N/A']
            rating = movie['rating'] if 'rating' in keys else 'N/A'
            title_cell = (f'<a href="https://www.imdb.com/title/tt{movie.movieID}" '
                          f'target="_blank">{title}</a>')
        else:
            title = arg['title']
            year = arg['year']
            genres = ['N/A']
            rating = 'N/A'
            title_cell = title
        return (
            f'\n{" "*8}<tr><td>{index + 1}</td>'
            f'<td><p hidden>{title}</p>{title_cell}</td>'
            f'<td>{year}</td><td align="center">{rating}</td>'
            f'<td>{", ".join(genres)}</td>'
            f'<td align="center">{arg["status"]}</td></tr>'
        )

    def _worker(self, arg, index):
        """Thread target: look up one movie and store its row at *index*.

        Any lookup failure is reported on stderr and replaced by a row built
        from the input entry, so the slot in ``html_table`` is always filled.
        """
        try:
            movie = self._lookup(arg)
        except Exception as err:  # pylint: disable=broad-except
            # Thread boundary: an unset slot would break the final join.
            sys.stderr.write(f'warning: IMDb lookup failed for {arg["title"]!r}: {err}\n')
            movie = None
        self.html_table[index] = self._render_row(index, arg, movie)

    def gen(self):
        """Generate the HTML rows for every entry of the input list.

        Entries found in the previous output are reused; the others are
        looked up on IMDb, one thread per entry, with pauses between thread
        starts. Returns False when the input file does not exist and True
        once all rows have been appended to ``self.html``.
        """
        if self.src:
            self.src = Path(self.src)
        else:
            self.src = Path(os.path.dirname(sys.argv[0])) / 'movie_list'
        if not self.src.exists():
            sys.stderr.write(f'error: input does not exist - {self.src}\n')
            return False

        # Read the movie list & split the columns, ignoring blank lines
        with open(self.src, 'r', encoding='utf-8') as fp_handle:
            entries = [self._parse_line(line)
                       for line in fp_handle.read().splitlines() if line.strip()]
        self.movie_list = dict(enumerate(entries))
        self.html_table = [None] * len(self.movie_list)

        pbar = progressbar.ProgressBar(max_value=len(self.movie_list))
        # Silence stdout once, from this thread, for the whole lookup phase:
        # sys.stdout is process-wide, so workers must not rebind it themselves.
        with open(os.devnull, 'w', encoding='utf-8') as devnull, \
                contextlib.redirect_stdout(devnull):
            for idx, movie in self.movie_list.items():
                cached = self._find_cached_row(movie['title'])
                if cached is not None:
                    # Update movies as DONE in case of change
                    cached = self._refresh_status(cached, movie['status'])
                    # Reuse the old row, replacing only its index cell
                    tail = cached[cached.find('</td>') + 5:]
                    self.html_table[idx] = f'\n{" "*8}<tr><td>{idx + 1}</td>{tail}'
                    pbar.update(idx + 1)
                    continue
                thread = threading.Thread(target=self._worker, args=(movie, idx))
                self.threads.append(thread)
                thread.start()
                pbar.update(idx + 1)
                # Pace the requests: short pause per thread, long one per 16
                time.sleep(0.2)
                if len(self.threads) % 16 == 0:
                    time.sleep(6)

            for thread in self.threads:
                thread.join()
        self.html += ''.join(self.html_table)
        return True

    def write(self, dst=None):
        """Close the HTML document and write it to *dst* (default: self.dst).

        The footer is appended to ``self.html`` itself, so this is meant to
        be called once per generated table.
        """
        out_path = dst if dst else self.dst
        # Just a fancy scrollbar for the html
        scroll = '<script type="text/javascript" src="scroll-indicator.js"></script>'
        self.html += ('\n\t</tbody>\n</table>\n' +
                      '\nGenerated on: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()) +
                      ' by ' + sys.argv[0] + scroll + '</body>\n</html>')
        with open(out_path, 'wb') as fp_handle:
            fp_handle.write(self.html.encode('utf8'))

    def read_prev_output(self):
        """Load the lines of a previously written output file, if present."""
        if self.dst.exists():
            self.prev_html = self.dst.read_bytes().decode('utf8').split('\n')


def main():
    """Run the generator from the command line.

    Accepts up to two optional positional arguments: the input list path and
    the output HTML path. Exits with status 1 when too many arguments are
    given or when generation fails, so callers can detect the failure.
    """
    args = sys.argv[1:]
    if len(args) > 2:
        sys.stderr.write(f'error: max 2 variables, {len(args)} given!\n')
        sys.exit(1)

    src = args[0] if args else None
    dst = args[1] if len(args) == 2 else None

    mlist = MovieList(src=src, dst=dst)
    if not mlist.gen():
        # gen() has already reported the problem on stderr
        sys.exit(1)
    mlist.write(dst=dst)


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()