add movies and update the script

Signed-off-by: Bogomil Vasilev <smirky@smirky.net>
This commit is contained in:
2026-02-13 14:00:30 +02:00
parent b2b8089320
commit 6da4cb891b
4 changed files with 17 additions and 66 deletions

546
movie_list/gen_movie_list.py Executable file
View File

@@ -0,0 +1,546 @@
#!/usr/bin/env python
"""
1. Import a movie_list txt file
2. Query IMDb for each entry, retrieving actual movie name, rating and genres
3. Generate an HTML table from the IMDb data
4. Store the HTML in index.html
"""
import os
import sys
import time
import threading
from html import escape as html_escape
from pathlib import Path

import progressbar
from imdb import IMDb
from imdb._exceptions import IMDbParserError, IMDbDataAccessError
class MovieList:
    """Generate a sortable HTML movie table from a plain-text movie list.

    Each non-blank input line is parsed as ``Title <kind> (year) [status]``
    (``<kind>`` is optional and defaults to ``movie``).  Every entry is looked
    up on IMDb in a worker thread unless a usable row for the same title is
    found in the previously generated output file, in which case that row is
    reused with a refreshed index and status.
    """

    # Upper bound on the number of IMDb lookups running at the same time.
    MAX_THREADS = 10
    # Seconds to wait after starting a lookup and between scheduler polls.
    START_DELAY = 2
    # Seconds to wait before retrying a failed IMDb search.
    RETRY_DELAY = 10
    # Search attempts per title before falling back to an "N/A" row.
    MAX_SEARCH_ATTEMPTS = 10

    # Static part of the page, up to and including the opening <tbody>.
    HTML_HEAD = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>My Movie List</title>
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.24/css/jquery.dataTables.min.css">
<script src="../jquery-3.7.1.min.js"></script>
<script src="https://cdn.datatables.net/1.10.24/js/jquery.dataTables.min.js"></script>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
background: transparent;
height: 100vh;
padding: 0.5rem;
color: #fff;
overflow: hidden;
display: flex;
flex-direction: column;
}
.container {
background: rgba(0, 0, 0, 0.6);
backdrop-filter: blur(10px);
border-radius: 8px;
border: 1px solid rgba(255, 255, 255, 0.1);
padding: 0.75rem;
height: 100%;
overflow: hidden;
display: flex;
flex-direction: column;
}
h1 {
color: #fff;
margin-bottom: 0.75rem;
font-size: 1.5rem;
font-weight: 600;
text-align: center;
}
.dataTables_wrapper {
flex: 1;
overflow: hidden;
display: flex;
flex-direction: column;
}
.dataTables_filter {
margin-bottom: 0.5rem;
text-align: right;
}
.dataTables_filter label {
display: flex;
align-items: center;
justify-content: flex-end;
gap: 0.5rem;
font-size: 0.9rem;
color: rgba(255, 255, 255, 0.9);
}
.dataTables_filter input {
padding: 0.4rem 0.75rem;
border: 1px solid rgba(255, 255, 255, 0.2);
border-radius: 6px;
background: rgba(0, 0, 0, 0.3);
color: #fff;
font-size: 0.9rem;
width: 250px;
}
.dataTables_filter input::placeholder {
color: rgba(255, 255, 255, 0.5);
}
.dataTables_filter input:focus {
outline: none;
border-color: rgba(255, 255, 255, 0.4);
background: rgba(0, 0, 0, 0.4);
}
.dataTables_info {
padding: 0.5rem 0;
color: rgba(255, 255, 255, 0.6);
font-size: 0.85rem;
}
.dataTables_scroll {
flex: 1;
overflow: hidden;
display: flex;
flex-direction: column;
}
.dataTables_scrollHead {
flex-shrink: 0;
}
.dataTables_scrollBody {
flex: 1;
overflow-y: auto !important;
}
.dataTables_scrollBody::-webkit-scrollbar {
width: 6px;
}
.dataTables_scrollBody::-webkit-scrollbar-track {
background: rgba(255, 255, 255, 0.05);
}
.dataTables_scrollBody::-webkit-scrollbar-thumb {
background: rgba(255, 255, 255, 0.2);
border-radius: 3px;
}
table.dataTable.stripe tbody tr.odd,
table.dataTable.display tbody tr.odd,
table.dataTable tbody tr {
background: transparent !important;
}
table.dataTable.hover tbody tr:hover,
table.dataTable.display tbody tr:hover {
background: rgba(255, 255, 255, 0.1) !important;
}
table.dataTable tbody td {
background: transparent !important;
}
#sortable {
width: 100%;
border-collapse: collapse;
background: transparent;
}
#sortable thead {
background: rgba(0, 0, 0, 0.4);
}
#sortable thead th {
padding: 0.5rem 0.75rem;
text-align: left;
font-weight: 600;
font-size: 0.85rem;
color: rgba(255, 255, 255, 0.9);
cursor: pointer;
border-bottom: 1px solid rgba(255, 255, 255, 0.1);
white-space: nowrap;
}
#sortable thead th:hover {
background: rgba(255, 255, 255, 0.05);
}
#sortable thead th.sorting,
#sortable thead th.sorting_asc,
#sortable thead th.sorting_desc {
padding-right: 1.5rem;
position: relative;
}
#sortable thead th.sorting:after {
content: '';
position: absolute;
right: 0.5rem;
top: 50%;
transform: translateY(-50%);
font-size: 0.75rem;
color: rgba(255, 255, 255, 0.5);
}
#sortable thead th.sorting_asc:after {
content: '';
position: absolute;
right: 0.5rem;
top: 50%;
transform: translateY(-50%);
font-size: 0.75rem;
color: rgba(255, 255, 255, 0.9);
}
#sortable thead th.sorting_desc:after {
content: '';
position: absolute;
right: 0.5rem;
top: 50%;
transform: translateY(-50%);
font-size: 0.75rem;
color: rgba(255, 255, 255, 0.9);
}
#sortable tbody tr {
border-bottom: 1px solid rgba(255, 255, 255, 0.05);
transition: background 0.15s ease;
background: transparent !important;
}
#sortable tbody tr:hover {
background: rgba(255, 255, 255, 0.1) !important;
}
#sortable tbody td {
padding: 0.5rem 0.75rem;
font-size: 0.9rem;
color: rgba(255, 255, 255, 0.9);
background: transparent !important;
}
#sortable tbody td:first-child {
color: rgba(255, 255, 255, 0.5);
font-size: 0.85rem;
width: 50px;
}
#sortable tbody td a {
color: rgba(135, 206, 250, 0.9);
text-decoration: none;
}
#sortable tbody td a:hover {
color: rgba(135, 206, 250, 1);
text-decoration: underline;
}
#sortable tbody td p[hidden] {
display: none;
}
#sortable tbody td:nth-child(4) {
color: rgba(255, 193, 7, 0.9);
font-weight: 500;
}
.timestamp {
text-align: center;
margin-top: 0.5rem;
padding-top: 0.5rem;
border-top: 1px solid rgba(255, 255, 255, 0.1);
color: rgba(255, 255, 255, 0.5);
font-size: 0.8rem;
}
</style>
<script>
$(document).ready(function(){
$('#sortable').DataTable({
"paging": false,
"info": true,
"searching": true,
"ordering": true,
"order": [[0, "asc"]],
"scrollY": "calc(100vh - 200px)",
"scrollCollapse": true,
"language": {
"search": "Search:",
"info": "Showing _TOTAL_ movies",
"infoEmpty": "No movies",
"infoFiltered": "(filtered from _MAX_)"
}
});
});
</script>
</head>
<body>
<base target="_parent" />
<div class="container">
<h1>🎬 My Movie Collection</h1>
<table id="sortable" class="sortable">
<thead>
<tr>
<th>#</th>
<th>Title</th>
<th>Year</th>
<th>Rating</th>
<th>Genre</th>
<th>Status</th>
</tr>
</thead>
<tbody>"""

    def __init__(self, src=None, dst=None):
        """Set up the generator and load the previous output, if any.

        Args:
            src: Path of the movie list file; when falsy, ``gen`` falls back
                to ``movie_list`` next to the running script.
            dst: Path of the HTML output; when falsy, ``index.html`` next to
                the running script is used.
        """
        self.prev_html = []
        self.html = self.HTML_HEAD
        self.src = src
        self.dst = Path(dst) if dst else Path(os.path.dirname(sys.argv[0])) / 'index.html'
        self.movie_list = {}
        self.threads = []
        self.html_table = None
        # Lines of the previous output act as a cache of already resolved rows.
        self.read_prev_output()

    # ------------------------------------------------------------------
    # IMDb lookup
    # ------------------------------------------------------------------
    def _search(self, imdb, arg):
        """Search IMDb for ``arg`` and return the result list.

        The search is retried up to ``MAX_SEARCH_ATTEMPTS`` times with
        ``RETRY_DELAY`` seconds in between.  A parser error is not retried.
        An empty list is returned when no result could be obtained, so the
        caller can fall back to an "N/A" row instead of hanging forever.
        """
        term = f'{arg["title"]} {arg["year"]}'
        for attempt in range(1, self.MAX_SEARCH_ATTEMPTS + 1):
            try:
                return imdb.search_movie(term)
            except IMDbDataAccessError as imdb_data_exc:
                exc = str(imdb_data_exc)
                if '503' in exc:
                    sys.stderr.write('503 - Service Unavailable, retrying...\n')
                elif '403' in exc:
                    sys.stderr.write('403 - Forbidden, retrying...\n')
                else:
                    sys.stderr.write(f'IMDb data access error, retrying: {exc}\n')
            except IMDbParserError:
                return []
            except Exception as exc:  # best effort: one bad lookup must not abort the run
                sys.stderr.write(f'search failed for {arg["title"]!r}: {exc}\n')
            if attempt < self.MAX_SEARCH_ATTEMPTS:
                time.sleep(self.RETRY_DELAY)
        sys.stderr.write(f'giving up on {arg["title"]!r} after '
                         f'{self.MAX_SEARCH_ATTEMPTS} attempts\n')
        return []

    @staticmethod
    def _pick_match(imdb, query, arg):
        """Return the first search result that fits ``arg``, or ``None``.

        A result is accepted when it has ``kind``, ``year`` and ``title``,
        has a rating if the entry is marked ``DONE``, does not look like an
        episode, and its kind equals the requested kind (case-insensitive).
        """
        for entry in query:
            try:
                imdb.update(entry)
            except Exception as exc:  # best effort: keep whatever the search returned
                sys.stderr.write(f'update err: {exc}\n')
            keys = entry.keys()
            # in case any of these keys is missing in the query, continue
            if not all(key in keys for key in ('kind', 'year', 'title')):
                continue
            if arg['status'] == 'DONE' and 'rating' not in keys:
                continue
            # Try to eliminate episode results
            if any('episode' in key.lower() for key in keys):
                continue
            if ('episode' in entry['title'].lower()
                    and 'episode' not in arg['title'].lower()):
                continue
            if entry['kind'].lower() == arg['kind'].lower():
                return entry
        return None

    @staticmethod
    def _render_row(index, arg, movie):
        """Render one ``<tr>`` for ``movie`` (an IMDb result or a dummy dict).

        The hidden ``<p>`` keeps the title exactly as written in the input
        list because it is the key used for caching and de-duplication.  The
        visible title and the genres are HTML-escaped since they may come
        from IMDb.
        """
        keys = movie.keys()
        genres = movie['genres'] if 'genres' in keys else ['N/A']
        rating = movie['rating'] if 'rating' in keys else 'N/A'
        title_text = html_escape(str(movie['title']), quote=False)
        if 'dummy' in keys:
            # No IMDb match: plain text, there is nothing to link to.
            title_cell = title_text
        else:
            title_cell = (f'<a href="https://www.imdb.com/title/tt{movie.movieID}" '
                          f'target="_blank">{title_text}</a>')
        genre_text = html_escape(', '.join(str(genre) for genre in genres), quote=False)
        return (
            f'\n <tr>'
            f'<td data-label="#">{index + 1}</td>'
            f'<td data-label="Title"><p hidden>{arg["title"]}</p>{title_cell}</td>'
            f'<td data-label="Year">{movie["year"]}</td>'
            f'<td data-label="Rating" align="center">{rating}</td>'
            f'<td data-label="Genre">{genre_text}</td>'
            f'<td data-label="Status" align="center">{arg["status"]}</td>'
            f'</tr>'
        )

    def _worker(self, arg, index):
        """Thread target: look ``arg`` up on IMDb and store its row at ``index``.

        This collects rating, genres, official name and a hyperlink.  When no
        suitable result is found a placeholder built from the input line is
        rendered instead.
        """
        imdb = IMDb()
        movie = self._pick_match(imdb, self._search(imdb, arg), arg)
        if not movie:
            movie = {
                'title': arg['title'],
                'kind': arg['kind'],
                'year': arg['year'],
                'dummy': None
            }
        self.html_table[index] = self._render_row(index, arg, movie)

    # ------------------------------------------------------------------
    # Input parsing
    # ------------------------------------------------------------------
    @staticmethod
    def _between(text, opener, closer):
        """Return the text between the first ``opener`` and the next ``closer``.

        Returns an empty string when either delimiter is missing.
        """
        start = text.find(opener)
        if start == -1:
            return ''
        end = text.find(closer, start + 1)
        if end == -1:
            return ''
        return text[start + 1:end]

    @staticmethod
    def _parse_line(raw_line):
        """Split one input line into title, kind, year and status.

        The title is everything before the first ``<`` or ``(`` (the whole
        line when neither is present).  Missing fields yield empty strings,
        except ``kind`` which defaults to ``movie``.
        """
        cut = next((pos for pos, char in enumerate(raw_line) if char in '<('),
                   len(raw_line))
        kind = MovieList._between(raw_line, '<', '>').strip('<>').strip()
        return {
            'title': raw_line[:cut].strip(),
            'kind': kind or 'movie',
            'year': MovieList._between(raw_line, '(', ')').strip(),
            'status': MovieList._between(raw_line, '[', ']').strip(),
        }

    def _load_movie_list(self):
        """Read ``self.src`` and return ``{position: parsed entry}``."""
        entries = {}
        with open(self.src, 'r', encoding='utf-8') as fp_handle:
            for raw_line in fp_handle.read().splitlines():
                # Skip empty and whitespace-only lines
                if not raw_line.strip():
                    continue
                entries[len(entries)] = self._parse_line(raw_line)
        return entries

    # ------------------------------------------------------------------
    # Cache handling
    # ------------------------------------------------------------------
    @staticmethod
    def _set_status(row, status):
        """Return ``row`` with the text of its last ``<td>`` set to ``status``.

        Only that cell is touched, so a ``*`` or ``DONE`` inside the title is
        left alone.  NOTE(review): this relies on Status being the last
        column, which holds for rows produced by ``_render_row``.
        """
        head, closing, tail = row.rpartition('</td>')
        if not closing:
            return row
        cell_start = head.rfind('>') + 1
        return head[:cell_start] + status + closing + tail

    def _cached_row(self, movie, idx):
        """Return a refreshed row from the previous output, or ``None``.

        A previous row is reused when it carries the exact hidden title and
        contains no ``N/A`` placeholder (incomplete rows are looked up again).
        """
        needle = f'<p hidden>{movie["title"]}</p>'
        row = next((line for line in self.prev_html
                    if needle in line and 'N/A' not in line), None)
        if row is None:
            return None
        row = self._set_status(row, movie['status'])
        index_cell = '<td data-label="#">'
        if index_cell in row:
            # Everything after the first cell is kept; the index is rewritten.
            after_index = row.split('</td>', 1)[1]
            return f'\n <tr>{index_cell}{idx + 1}</td>{after_index}'
        # Row without an index cell: keep as is, but on its own line so the
        # line-based cache can still find it on the next run.
        return row if row.startswith('\n') else '\n' + row

    # ------------------------------------------------------------------
    # Thread scheduling
    # ------------------------------------------------------------------
    def _run_threads(self, pbar):
        """Start the queued lookups and wait until all of them have finished.

        At most ``MAX_THREADS`` lookups run at once and ``START_DELAY``
        seconds pass between two starts.  ``pbar.increment()`` is called once
        per started lookup.
        """
        while self.threads:
            started = set(self.get_alive_threads())
            running = sum(1 for thread in started if thread.is_alive())
            pending = [thread for thread in self.threads if thread not in started]
            free_slots = max(self.MAX_THREADS - running, 0)
            for thread in pending[:free_slots]:
                thread.start()
                pbar.increment()
                time.sleep(self.START_DELAY)
            time.sleep(self.START_DELAY)
            self.delete_finished_threads()

    def gen(self):
        """Generate the HTML table from the input list.

        Returns:
            ``False`` when the input file does not exist (an error is written
            to stderr), ``True`` once ``self.html`` holds the page head plus
            all de-duplicated rows.
        """
        if not self.src:
            self.src = Path(os.path.dirname(sys.argv[0])) / 'movie_list'
        else:
            self.src = Path(self.src)
        if not self.src.exists():
            sys.stderr.write(f'error: input does not exist - {self.src}\n')
            return False
        self.movie_list = self._load_movie_list()
        self.html_table = [None] * len(self.movie_list)
        # Progress bar
        pbar = progressbar.ProgressBar(max_value=len(self.movie_list))
        for idx, movie in self.movie_list.items():
            cached = self._cached_row(movie, idx)
            if cached is not None:
                self.html_table[idx] = cached
                pbar.increment()
            else:
                thread = threading.Thread(target=self._worker, args=(movie, idx))
                self.threads.append(thread)
        self._run_threads(pbar)
        # Deduplicate entries before writing
        num_entries = self.deduplicate_html()
        print(f"\nDeduplicated to {num_entries} unique entries")
        # Rebuild from the opening <tbody> so the rows are appended exactly
        # once, even when gen() is called again on the same instance.
        head, tbody, _ = self.html.partition('<tbody>')
        self.html = head + tbody + ''.join(self.html_table)
        return True

    def delete_finished_threads(self):
        """Join and drop every thread that was started and has finished."""
        remaining = []
        for thread in self.threads:
            if thread.ident is not None and not thread.is_alive():
                thread.join()
            else:
                remaining.append(thread)
        self.threads = remaining

    def get_alive_threads(self):
        """Return the threads that have been started (running or finished).

        ``Thread.ident`` is ``None`` until a thread has been started, which
        makes it the public replacement for the private ``_started`` event.
        """
        return [thread for thread in self.threads if thread.ident is not None]

    # ------------------------------------------------------------------
    # Output
    # ------------------------------------------------------------------
    def write(self, dst=None):
        """Write the HTML list to ``dst`` (default: ``self.dst``).

        The closing markup and a UTC timestamp are added to the written
        document only; ``self.html`` is left untouched so calling ``write``
        again does not duplicate the footer.
        """
        out_path = Path(dst) if dst else self.dst
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
        document = self.html + f'''
</tbody>
</table>
</div>
<div class="timestamp">Generated {timestamp} UTC</div>
</body>
</html>'''
        # Binary write keeps '\n' line endings on every platform.
        out_path.write_bytes(document.encode('utf8'))

    def read_prev_output(self):
        """Load the lines of a previously generated page into ``prev_html``."""
        if self.dst.exists():
            self.prev_html = self.dst.read_bytes().decode('utf8').split('\n')

    def deduplicate_html(self):
        """Drop rows whose hidden title was already seen; return the row count.

        ``None`` slots (lookups that produced no row) are removed as well.
        Rows without a hidden title are always kept.
        """
        opening, closing = '<p hidden>', '</p>'
        seen_titles = set()
        deduplicated = []
        for row in self.html_table or []:
            if row is None:
                continue
            start = row.find(opening)
            end = row.find(closing, start + len(opening)) if start != -1 else -1
            if end != -1:
                title = row[start + len(opening):end]
                if title in seen_titles:
                    # Skip duplicate
                    continue
                seen_titles.add(title)
            deduplicated.append(row)
        self.html_table = deduplicated
        return len(self.html_table)
def main():
    """Default run: ``gen_movie_list.py [movie_list [output.html]]``.

    Exits with status 1 when more than two arguments are given or when the
    list could not be generated (for example, the input file is missing), so
    callers and shell scripts can detect the failure.
    """
    args = sys.argv[1:]
    if len(args) > 2:
        sys.stderr.write(f'error: max 2 variables, {len(args)} given!\n')
        sys.exit(1)
    src = args[0] if args else None
    dst = args[1] if len(args) == 2 else None
    mlist = MovieList(src=src, dst=dst)
    if not mlist.gen():
        # gen() has already reported the reason on stderr.
        sys.exit(1)
    mlist.write(dst=dst)


if __name__ == "__main__":
    main()