add movies and update the script

Signed-off-by: Bogomil Vasilev <smirky@smirky.net>
2026-02-13 14:00:30 +02:00
parent b2b8089320
commit 6da4cb891b
4 changed files with 17 additions and 66 deletions
--- a/movie_list/gen_movie_list.py
+++ b/movie_list/gen_movie_list.py
@@ -0,0 +1,546 @@
+#!/usr/bin/env python
+
+"""
+1. Import a movie_list txt file
+2. Query IMDb for each entry, retrieving actual movie name, rating and genres
+3. Generate an HTML table from the IMDb data
+4. Store the HTML in index.html
+"""
+
+import os
+import sys
+import time
+import threading
+from pathlib import Path
+import progressbar
+from imdb import IMDb
+from imdb._exceptions import IMDbParserError, IMDbDataAccessError
+
+
+class MovieList:
+    """ Class to generate a movie list HTML table
+
+    Reads a plain-text movie list, reuses rows found in a previously
+    generated page, queries IMDb for the remaining titles and renders
+    everything as a single HTML document.
+
+    Every non-empty input line is parsed as
+    ``Title <kind> (year) [status]``; ``<kind>`` is optional and defaults
+    to ``movie``.
+    """
+
+    # Upper bound on IMDb worker threads running at the same time.
+    MAX_THREADS = 10
+    # Search attempts per title before a placeholder row is emitted.
+    MAX_RETRIES = 5
+    # Seconds to wait between two search attempts for the same title.
+    RETRY_DELAY = 20
+    # Seconds to wait after each thread start and between scheduler passes.
+    START_DELAY = 2
+
+    # Static part of the page: everything up to and including <tbody>.
+    HTML_HEAD = """<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>My Movie List</title>
+    <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.24/css/jquery.dataTables.min.css">
+    <script src="../jquery-3.7.1.min.js"></script>
+    <script src="https://cdn.datatables.net/1.10.24/js/jquery.dataTables.min.js"></script>
+    <style>
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+            background: transparent;
+            height: 100vh;
+            padding: 0.5rem;
+            color: #fff;
+            overflow: hidden;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .container {
+            background: rgba(0, 0, 0, 0.6);
+            backdrop-filter: blur(10px);
+            border-radius: 8px;
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            padding: 0.75rem;
+            height: 100%;
+            overflow: hidden;
+            display: flex;
+            flex-direction: column;
+        }
+
+        h1 {
+            color: #fff;
+            margin-bottom: 0.75rem;
+            font-size: 1.5rem;
+            font-weight: 600;
+            text-align: center;
+        }
+
+        .dataTables_wrapper {
+            flex: 1;
+            overflow: hidden;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .dataTables_filter {
+            margin-bottom: 0.5rem;
+            text-align: right;
+        }
+
+        .dataTables_filter label {
+            display: flex;
+            align-items: center;
+            justify-content: flex-end;
+            gap: 0.5rem;
+            font-size: 0.9rem;
+            color: rgba(255, 255, 255, 0.9);
+        }
+
+        .dataTables_filter input {
+            padding: 0.4rem 0.75rem;
+            border: 1px solid rgba(255, 255, 255, 0.2);
+            border-radius: 6px;
+            background: rgba(0, 0, 0, 0.3);
+            color: #fff;
+            font-size: 0.9rem;
+            width: 250px;
+        }
+
+        .dataTables_filter input::placeholder {
+            color: rgba(255, 255, 255, 0.5);
+        }
+
+        .dataTables_filter input:focus {
+            outline: none;
+            border-color: rgba(255, 255, 255, 0.4);
+            background: rgba(0, 0, 0, 0.4);
+        }
+
+        .dataTables_info {
+            padding: 0.5rem 0;
+            color: rgba(255, 255, 255, 0.6);
+            font-size: 0.85rem;
+        }
+
+        .dataTables_scroll {
+            flex: 1;
+            overflow: hidden;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .dataTables_scrollHead {
+            flex-shrink: 0;
+        }
+
+        .dataTables_scrollBody {
+            flex: 1;
+            overflow-y: auto !important;
+        }
+
+        .dataTables_scrollBody::-webkit-scrollbar {
+            width: 6px;
+        }
+
+        .dataTables_scrollBody::-webkit-scrollbar-track {
+            background: rgba(255, 255, 255, 0.05);
+        }
+
+        .dataTables_scrollBody::-webkit-scrollbar-thumb {
+            background: rgba(255, 255, 255, 0.2);
+            border-radius: 3px;
+        }
+
+        table.dataTable.stripe tbody tr.odd, 
+        table.dataTable.display tbody tr.odd,
+        table.dataTable tbody tr {
+            background: transparent !important;
+        }
+
+        table.dataTable.hover tbody tr:hover,
+        table.dataTable.display tbody tr:hover {
+            background: rgba(255, 255, 255, 0.1) !important;
+        }
+
+        table.dataTable tbody td {
+            background: transparent !important;
+        }
+
+        #sortable {
+            width: 100%;
+            border-collapse: collapse;
+            background: transparent;
+        }
+
+        #sortable thead {
+            background: rgba(0, 0, 0, 0.4);
+        }
+
+        #sortable thead th {
+            padding: 0.5rem 0.75rem;
+            text-align: left;
+            font-weight: 600;
+            font-size: 0.85rem;
+            color: rgba(255, 255, 255, 0.9);
+            cursor: pointer;
+            border-bottom: 1px solid rgba(255, 255, 255, 0.1);
+            white-space: nowrap;
+        }
+
+        #sortable thead th:hover {
+            background: rgba(255, 255, 255, 0.05);
+        }
+
+        #sortable thead th.sorting,
+        #sortable thead th.sorting_asc,
+        #sortable thead th.sorting_desc {
+            padding-right: 1.5rem;
+            position: relative;
+        }
+
+        #sortable thead th.sorting:after {
+            content: '⇅';
+            position: absolute;
+            right: 0.5rem;
+            top: 50%;
+            transform: translateY(-50%);
+            font-size: 0.75rem;
+            color: rgba(255, 255, 255, 0.5);
+        }
+
+        #sortable thead th.sorting_asc:after {
+            content: '↑';
+            position: absolute;
+            right: 0.5rem;
+            top: 50%;
+            transform: translateY(-50%);
+            font-size: 0.75rem;
+            color: rgba(255, 255, 255, 0.9);
+        }
+
+        #sortable thead th.sorting_desc:after {
+            content: '↓';
+            position: absolute;
+            right: 0.5rem;
+            top: 50%;
+            transform: translateY(-50%);
+            font-size: 0.75rem;
+            color: rgba(255, 255, 255, 0.9);
+        }
+
+        #sortable tbody tr {
+            border-bottom: 1px solid rgba(255, 255, 255, 0.05);
+            transition: background 0.15s ease;
+            background: transparent !important;
+        }
+
+        #sortable tbody tr:hover {
+            background: rgba(255, 255, 255, 0.1) !important;
+        }
+
+        #sortable tbody td {
+            padding: 0.5rem 0.75rem;
+            font-size: 0.9rem;
+            color: rgba(255, 255, 255, 0.9);
+            background: transparent !important;
+        }
+
+        #sortable tbody td:first-child {
+            color: rgba(255, 255, 255, 0.5);
+            font-size: 0.85rem;
+            width: 50px;
+        }
+
+        #sortable tbody td a {
+            color: rgba(135, 206, 250, 0.9);
+            text-decoration: none;
+        }
+
+        #sortable tbody td a:hover {
+            color: rgba(135, 206, 250, 1);
+            text-decoration: underline;
+        }
+
+        #sortable tbody td p[hidden] {
+            display: none;
+        }
+
+        #sortable tbody td:nth-child(4) {
+            color: rgba(255, 193, 7, 0.9);
+            font-weight: 500;
+        }
+
+        .timestamp {
+            text-align: center;
+            margin-top: 0.5rem;
+            padding-top: 0.5rem;
+            border-top: 1px solid rgba(255, 255, 255, 0.1);
+            color: rgba(255, 255, 255, 0.5);
+            font-size: 0.8rem;
+        }
+    </style>
+    <script>
+    $(document).ready(function(){
+        $('#sortable').DataTable({
+            "paging": false,
+            "info": true,
+            "searching": true,
+            "ordering": true,
+            "order": [[0, "asc"]],
+            "scrollY": "calc(100vh - 200px)",
+            "scrollCollapse": true,
+            "language": {
+                "search": "Search:",
+                "info": "Showing _TOTAL_ movies",
+                "infoEmpty": "No movies",
+                "infoFiltered": "(filtered from _MAX_)"
+            }
+        });
+    });
+    </script>
+</head>
+<body>
+    <base target="_parent" />
+    <div class="container">
+        <h1>🎬 My Movie Collection</h1>
+        <table id="sortable" class="sortable">
+            <thead>
+                <tr>
+                    <th>#</th>
+                    <th>Title</th>
+                    <th>Year</th>
+                    <th>Rating</th>
+                    <th>Genre</th>
+                    <th>Status</th>
+                </tr>
+            </thead>
+            <tbody>"""
+
+    def __init__(self, src=None, dst=None):
+        """ Set up paths and load the previously generated page, if any
+
+        src -- path of the movie list; when empty, gen() falls back to
+               'movie_list' next to the script
+        dst -- path of the HTML page; when empty, 'index.html' next to
+               the script is used
+        """
+        self.src = src
+        self.dst = Path(dst) if dst else Path(os.path.dirname(sys.argv[0])) / 'index.html'
+        self.html = self.HTML_HEAD
+        # Lines of the previous output; used by gen() as a row cache.
+        self.prev_html = []
+        # index -> {'title', 'kind', 'year', 'status'}, filled by gen().
+        self.movie_list = {}
+        self.threads = []
+        # One rendered <tr> per movie_list entry, filled by gen().
+        self.html_table = None
+        self.read_prev_output()
+
+    def _search(self, imdb, arg):
+        # Query IMDb for one list entry. Transient failures are retried at
+        # most MAX_RETRIES times so a broken entry cannot block the run
+        # forever. Returns the result list, or [] when nothing usable
+        # could be retrieved.
+        term = f'{arg["title"]} {arg["year"]}'
+        for attempt in range(self.MAX_RETRIES):
+            if attempt:
+                time.sleep(self.RETRY_DELAY)
+            try:
+                return imdb.search_movie(term)
+            except IMDbParserError:
+                # Parser errors are not retried; fall back to a placeholder.
+                return []
+            except IMDbDataAccessError as imdb_data_exc:
+                exc = str(imdb_data_exc)
+                if '503' in exc:
+                    sys.stderr.write('503 - Service Unavailable, retrying...\n')
+                elif '403' in exc:
+                    sys.stderr.write('403 - Forbidden, retrying...\n')
+                else:
+                    sys.stderr.write(f'IMDb access error for "{term}", retrying...\n')
+            except Exception as exc:  # pylint: disable=broad-except
+                sys.stderr.write(f'unexpected error for "{term}": {exc}\n')
+        sys.stderr.write(f'giving up on "{term}" after {self.MAX_RETRIES} attempts\n')
+        return []
+
+    @staticmethod
+    def _pick_match(imdb, query, arg):
+        # Return the first search result that fits the list entry, or None.
+        for entry in query:
+            try:
+                imdb.update(entry)
+            except Exception as exc:  # pylint: disable=broad-except
+                # Best effort: keep whatever data the search already gave.
+                sys.stderr.write(f'update err: {exc}\n')
+            # NOTE(review): membership is tested on .keys() exactly as the
+            # original code did; confirm how the imdb result type implements
+            # `in` before simplifying this.
+            keys = entry.keys()
+            # in case any of these keys is missing in the query, continue
+            if not all(key in keys for key in ('kind', 'year', 'title')):
+                continue
+            if arg['status'] == 'DONE' and 'rating' not in keys:
+                continue
+            # Try to eliminate episode results
+            if any('episode' in key.lower() for key in keys):
+                continue
+            if 'episode' in entry['title'].lower() and \
+                    'episode' not in arg['title'].lower():
+                continue
+            if entry['kind'].lower() == arg['kind'].lower():
+                return entry
+        return None
+
+    def _worker(self, arg, index):
+        # Scan IMDb for a given movie and store its row in html_table[index]
+        # This collects rating, genres, official name and a hyperlink
+        imdb = IMDb()
+        movie = self._pick_match(imdb, self._search(imdb, arg), arg)
+        found = movie is not None
+        if not found:
+            # Nothing suitable on IMDb: render the raw list data instead.
+            movie = {
+                'title': arg['title'],
+                'kind': arg['kind'],
+                'year': arg['year'],
+            }
+        if 'genres' not in movie.keys():
+            movie['genres'] = ['N/A']
+        if 'rating' not in movie.keys():
+            movie['rating'] = 'N/A'
+
+        if found:
+            html_title_td = (
+                f'<a href="https://www.imdb.com/title/tt{movie.movieID}" '
+                f'target="_blank">{movie["title"]}</a>'
+            )
+        else:
+            html_title_td = movie['title']
+
+        # The hidden <p> carries the title as written in the list; gen()
+        # and deduplicate_html() use it as the lookup key for this row.
+        self.html_table[index] = (
+            f'\n                <tr>'
+            f'<td data-label="#">{index + 1}</td>'
+            f'<td data-label="Title"><p hidden>{arg["title"]}</p>{html_title_td}</td>'
+            f'<td data-label="Year">{movie["year"]}</td>'
+            f'<td data-label="Rating" align="center">{movie["rating"]}</td>'
+            f'<td data-label="Genre">{", ".join(movie["genres"])}</td>'
+            f'<td data-label="Status" align="center">{arg["status"]}</td>'
+            f'</tr>'
+        )
+
+    @staticmethod
+    def _parse_line(raw_line):
+        # Split one list line into title / kind / year / status. Missing
+        # parts yield '' (kind: 'movie') instead of raising or slicing
+        # with find()'s -1.
+        def between(opener, closer):
+            start = raw_line.find(opener)
+            if start == -1:
+                return ''
+            end = raw_line.find(closer, start + 1)
+            return raw_line[start + 1:end] if end != -1 else ''
+
+        # The title ends where the first <kind> or (year) marker begins;
+        # without either, fall back to the [status] marker or the line end.
+        cuts = [pos for pos in (raw_line.find('<'), raw_line.find('(')) if pos != -1]
+        if cuts:
+            cut = min(cuts)
+        else:
+            bracket = raw_line.find('[')
+            cut = bracket if bracket != -1 else len(raw_line)
+        return {
+            'title': raw_line[:cut].rstrip(),
+            'kind': between('<', '>').strip() or 'movie',
+            'year': between('(', ')'),
+            'status': between('[', ']'),
+        }
+
+    @staticmethod
+    def _with_status(row, status):
+        # Return the cached row with its status refreshed. Only the Status
+        # cell is rewritten, so titles containing '*' or 'DONE' stay intact.
+        opener = '<td data-label="Status" align="center">'
+        start = row.rfind(opener)
+        if start == -1:
+            # Row without a labelled status cell: keep the old behaviour.
+            return row.replace('*', status).replace('DONE', status)
+        start += len(opener)
+        end = row.find('</td>', start)
+        if end == -1:
+            return row
+        return row[:start] + status + row[end:]
+
+    def _cached_row(self, movie, idx):
+        # Look the movie up in the previous output. Returns the row with
+        # refreshed index and status, or None when there is no usable row.
+        marker = f'<p hidden>{movie["title"]}</p>'
+        for html_row in self.prev_html:
+            # Rows with missing data ('N/A') are looked up again.
+            if marker in html_row and 'N/A' not in html_row:
+                break
+        else:
+            return None
+
+        row = self._with_status(html_row, movie['status'])
+        if '<td data-label="#">' in row:
+            # Drop the old index cell and prepend the current one.
+            after_index = row.split('</td>', 1)[1] if '</td>' in row else row
+            return f'\n                <tr><td data-label="#">{idx + 1}</td>{after_index}'
+        # Keep every row on its own line so the next run can find it again.
+        return '\n' + row
+
+    def gen(self):
+        """ Generate an HTML list based on input, using a threaded worker
+
+        Returns True on success and False when the input file is missing.
+        """
+        if self.src:
+            self.src = Path(self.src)
+        else:
+            self.src = Path(os.path.dirname(sys.argv[0])) / 'movie_list'
+        if not self.src.exists():
+            sys.stderr.write(f'error: input does not exist - {self.src}\n')
+            return False
+
+        # Open the movie list & split the columns
+        self.movie_list = {}
+        with open(self.src, 'r', encoding='utf-8') as fp_handle:
+            for raw_line in fp_handle.read().splitlines():
+                # In case the line is empty
+                if not raw_line.strip():
+                    continue
+                self.movie_list[len(self.movie_list)] = self._parse_line(raw_line)
+        self.html_table = [None] * len(self.movie_list)
+
+        # Progress bar
+        pbar = progressbar.ProgressBar(max_value=len(self.movie_list))
+        for idx, movie in self.movie_list.items():
+            cached = self._cached_row(movie, idx)
+            if cached is not None:
+                self.html_table[idx] = cached
+                pbar.increment()
+            else:
+                thread = threading.Thread(target=self._worker, args=(movie, idx))
+                self.threads.append(thread)
+
+        while self.threads:
+            started = self.get_alive_threads()
+            pending = [thread for thread in self.threads if thread not in started]
+            running = sum(1 for thread in started if thread.is_alive())
+            # Only fill the free slots so MAX_THREADS is a real upper bound.
+            for thread in pending[:max(self.MAX_THREADS - running, 0)]:
+                thread.start()
+                pbar.increment()
+                time.sleep(self.START_DELAY)
+            time.sleep(self.START_DELAY)
+            self.delete_finished_threads()
+
+        # Deduplicate first: this also drops slots a failed worker left
+        # empty, which would otherwise break the join below.
+        num_entries = self.deduplicate_html()
+        print(f"\nDeduplicated to {num_entries} unique entries")
+
+        # Rebuild from the static head so the rows are emitted exactly once,
+        # even when gen() is called again.
+        self.html = self.HTML_HEAD + ''.join(self.html_table)
+
+        return True
+
+    def delete_finished_threads(self):
+        """ Join and forget every thread that was started and has ended """
+        remaining = []
+        for thread in self.threads:
+            # ident stays None until start() has been called.
+            if thread.ident is not None and not thread.is_alive():
+                thread.join()
+            else:
+                remaining.append(thread)
+        self.threads = remaining
+
+    def get_alive_threads(self):
+        """ Return the threads that were started, running or finished """
+        return [thread for thread in self.threads if thread.ident is not None]
+
+    def write(self, dst=None):
+        """ Write the HTML list to index.html
+
+        dst -- optional output path overriding the one given at creation.
+        The footer is added to the written document only, so calling
+        write() more than once does not duplicate it.
+        """
+        out_path = dst if dst else self.dst
+        timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
+        footer = f'''
+            </tbody>
+        </table>
+    </div>
+    <div class="timestamp">Generated {timestamp} UTC</div>
+</body>
+</html>'''
+        with open(out_path, 'wb') as fp_handle:
+            fp_handle.write((self.html + footer).encode('utf8'))
+
+    def read_prev_output(self):
+        """ Import a previous HTML table """
+        if self.dst.exists():
+            # Read as bytes and split on '\n' only, mirroring how write()
+            # produces the file.
+            self.prev_html = self.dst.read_bytes().decode('utf8').split('\n')
+
+    def deduplicate_html(self):
+        """ Remove duplicate entries from html_table based on movie titles
+
+        The key is the text of the hidden <p> element of a row. Empty
+        slots are dropped, rows without a key are always kept. Returns
+        the number of remaining rows.
+        """
+        opener = '<p hidden>'
+        seen_titles = set()
+        unique_rows = []
+
+        for row in self.html_table:
+            if row is None:
+                continue
+
+            start = row.find(opener)
+            end = row.find('</p>', start + len(opener)) if start != -1 else -1
+            if end == -1:
+                # If we can't find the hidden title, keep the row anyway
+                unique_rows.append(row)
+                continue
+
+            title = row[start + len(opener):end]
+            if title in seen_titles:
+                continue
+            seen_titles.add(title)
+            unique_rows.append(row)
+
+        self.html_table = unique_rows
+        return len(self.html_table)
+
+
+def main():
+    """ Default run
+
+    Usage: gen_movie_list.py [SRC [DST]]
+    Exits with status 1 on a usage error or when generation fails.
+    """
+    args = sys.argv[1:]
+    if len(args) > 2:
+        sys.stderr.write(f'error: max 2 variables, {len(args)} given!\n')
+        sys.exit(1)
+
+    src = args[0] if args else None
+    dst = args[1] if len(args) == 2 else None
+
+    mlist = MovieList(src=src, dst=dst)
+    if not mlist.gen():
+        # gen() already reported the problem; signal it to the caller too.
+        sys.exit(1)
+    mlist.write(dst=dst)
+
+
+# Run the generator only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()