Improve movie list generator, add more movies

Signed-off-by: Bogomil Vasilev <smirky@smirky.net>
2022-12-07 23:36:36 +02:00
parent 4bfc45c100
commit 3e7dafb4b2
2 changed files with 62 additions and 36 deletions
--- a/movie_list/index.py
+++ b/movie_list/index.py
@@ -12,9 +12,9 @@ import sys
 import time
 import threading
 from pathlib import Path
-from http.client import IncompleteRead
 import progressbar
 from imdb import IMDb
+from imdb._exceptions import IMDbParserError


 class MovieList:
@@ -59,27 +59,35 @@ class MovieList:
        # Scan IMDb for a given movie and append it to the html
        # This collects rating, genres, official name and a hyperlink
        imdb = IMDb()
-        save_stdout = sys.stdout
-        with open(os.devnull, 'wb') as sys.stdout:
-            while True:
-                try:
-                    query = imdb.search_movie(f'{arg["title"]} {arg["year"]}')
-                    break
-                except IncompleteRead:
-                    pass
-        sys.stdout = save_stdout
+        while True:
+            try:
+                query = imdb.search_movie(f'{arg["title"]} {arg["year"]}')
+                break
+            except IMDbParserError as exc:
+                query = []
+                #print(exc)
+                break
+            except Exception as exc:
+                #print(f'error: {exc.__class__.__name__}: {arg["title"]}')
+                time.sleep(10)

        movie = None
        for entry in query:
-            has_minimum_keys = True
-            for key in ['kind', 'year', 'title']:
-                if key not in entry.keys():
-                    has_minimum_keys = False
-            if not has_minimum_keys:
+            #print(entry)
+            imdb.update(entry)
+            # in case any of these keys is missing in the query, continue
+            if not all(key in entry.keys() for key in ['kind', 'year', 'title']):
+                #print(f'missing key {entry.keys()}')
+                continue
+            if arg['status'] == 'DONE' and 'rating' not in entry.keys():
                continue
            # Try to eliminate episode results
-            if [i for i in entry.keys() if 'episode' in i.lower()] or \
-                    'episode' in entry['title'].lower():
+            # Must not have "episode" in the object keys
+            # Must not have "episode" in the query title key,
+            # unless "episode" is in the query search string
+            if [i for i in entry.keys() if 'episode' in i.lower()] or (
+                    'episode' in entry['title'].lower() and \
+                    'episode' not in arg['title'].lower()):
                continue
            if entry['kind'].lower() == arg['kind'].lower():
                movie = entry
@@ -91,7 +99,6 @@ class MovieList:
                'year': arg['year'],
                'dummy': None
            }
-
        if 'genres' not in movie.keys():
            movie['genres'] = ['N/A']
        if 'rating' not in movie.keys():
@@ -101,7 +108,7 @@ class MovieList:
                f'<a href="https://www.imdb.com/title/tt{movie.movieID}" target="_blank">{movie["title"]}</a>'
        self.html_table[index] = (
            f'\n{" "*8}<tr><td>{index + 1}</td>'
-            f'<td><p hidden>{movie["title"]}</p>{html_title_td}</td>'
+            f'<td><p hidden>{arg["title"]}</p>{html_title_td}</td>'
            f'<td>{movie["year"]}</td><td align="center">{movie["rating"]}</td>'
            f'<td>{", ".join(movie["genres"])}</td>'
            f'<td align="center">{arg["status"]}</td></tr>'
@@ -124,7 +131,7 @@ class MovieList:
            for raw_line in mlist_raw.splitlines():
                self.movie_list.update({
                    len(self.movie_list): {
-                        'title': raw_line.split('(', 1)[0].strip(),
+                        'title': raw_line[0:next((i for i, ch in enumerate(raw_line) if ch in {'<', '('}), None) - 1],
                        'kind': raw_line[raw_line.find('<')+1:raw_line.rfind('>')+1].strip('<>') or 'movie',
                        'year': raw_line[raw_line.find('(')+1:raw_line.find(')')],
                        'status': raw_line[raw_line.find('[')+1:raw_line.find(']')],
@@ -135,28 +142,46 @@ class MovieList:
        # Progress bar. Enough said
        pbar = progressbar.ProgressBar(max_value=len(self.movie_list))
        for idx, movie in self.movie_list.items():
-            match = [html_row for html_row in self.prev_html if movie['title'] in html_row]
+            match = [html_row for html_row in self.prev_html if movie['title'] in html_row and 'N/A' not in html_row]
            if match:
                # Update movies as DONE in case of change
                match = match[0].replace('*', movie['status'])
                # Directly insert the current HTML line from the older output
                self.html_table[idx] = \
                        f'\n{" "*8}<tr><td>{idx + 1}</td>{match[match.find("</td>") + 5:]}'
-                pbar.update(idx + 1)
+                pbar.increment()
            else:
                thread = threading.Thread(target=self._worker, args=(movie, idx))
                self.threads.append(thread)
-                thread.start()
-                pbar.update(idx+1)
-                time.sleep(0.2)
-                if len(self.threads) % 16 == 0:
-                    time.sleep(6)

-        for thread in self.threads:
-            thread.join()
+        max_threads = 10
+        while self.threads:
+            threads_alive = self.get_alive_threads()
+            threads_to_be_started = [i for i in self.threads if i not in threads_alive]
+            for idx in range(max_threads if max_threads < len(threads_to_be_started) else len(threads_to_be_started)):
+                threads_to_be_started[idx].start()
+                pbar.increment()
+                time.sleep(1)
+            time.sleep(1)
+            self.delete_finished_threads()
+
        self.html += ''.join(self.html_table)
        return True

+    def delete_finished_threads(self):
+        """ Join started threads that have finished and drop them from self.threads """
+        finished = {t for t in self.threads if t.ident is not None and not t.is_alive()}  # ident is None until start()
+        for thread in finished:
+            thread.join()
+        self.threads = [t for t in self.threads if t not in finished]
+
+    def get_alive_threads(self):
+        """ Return the threads in self.threads that have already been started
+
+        Despite the name, started threads that have already finished are included
+        """
+        return [t for t in self.threads if t.ident is not None]  # ident is None until start()
+
    def write(self, dst=None):
        """ Write the HTML list to index.html """
        out_path = dst if dst else self.dst