Spring-2023-Data-Mining-Project/data_cleaning_functions.py at main · j-derrico/Spring-2023-Data-Mining-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import csv
import pandas as pd
from imdb import Cinemagoer, IMDbError
from tmdbv3api import TMDb
from tmdbv3api import Movie
from tmdbv3api import Find
import csv


def get_language_from_imdb(outfile, infile):
    '''appends movies that have languages found on imdb to the outfile'''
    out = open(outfile, "a")

    # create an instance of the Cinemagoer class
    ia = Cinemagoer()

    with open(infile, mode ='r') as file:

        # reading the CSV file
        movies = csv.reader(file, delimiter='\t')

        print('Languages:')
        for movie in movies:
            film = ia.get_movie(int(movie[0]))
            if (film.get('languages') is not None):
                print(movie[0],film.get('languages')[0])
                result = movie[0]+'\t'+film.get('languages')[0]+'\t'+movie[1]+'\t'+movie[2]+'\t'+movie[3]+'\t'+movie[4]+'\t'+movie[5]
                result = result + '\t'+movie[6]+'\t'+movie[7]+'\t'+movie[8]+'\t'+movie[9]+'\t'+movie[10]+'\t'+movie[11]+'\n'
            else:
                print(movie[0],"no language")
                result=movie[0]+'\tnolanguage\t'+movie[1]+'\t'+movie[2]+'\t'+movie[3]+'\t'+movie[4]+'\t'+movie[5]
                result = aaaa + '\t'+movie[6]+'\t'+movie[7]+'\t'+movie[8]+'\t'+movie[9]+'\t'+movie[10]+'\t'+movie[11]+'\n'

            out.write(aaaa)
    out.close()

def add_ratings_to_basics(input_file, out_file):
    '''returns outfile that adds star rating and number of ratings to input values'''
    basicsMoviesNoadult = pd.read_csv(input_file, sep='\t')

    ratings = pd.read_csv('title.ratings.tsv', sep='\t')

    my_ratings = ratings['tconst'].unique().tolist()
    new_basic = basicsMoviesNoadult[basicsMoviesNoadult['tconst'].isin(my_ratings)]
    new_basic.to_csv(out_file, sep='\t')

def get_language_from_tmdb(apiKey, input_file):
    '''appends movies that have languages found on tmdb to the outfile'''
    tmdb = TMDb()
    tmdb.api_key = apiKey
    find = Find()

    with open(input_file, mode ='r') as file:
        outfile = open("outfile.tsv", "a")
        # reading the CSV file
        movies = csv.reader(file, delimiter='\t')

        for movie in movies:
            print(movie[2])
            results = find.find_by_imdb_id(movie[2])
            for r in results["movie_results"]:
                result = movie[2]+'\t'+r.original_language+'\t'+movie[3]+'\t'+movie[4]+'\t'+movie[5]+'\t'+movie[6]+'\t'+movie[7]+'\t'
                result = result + movie[8]+'\t'+movie[9]+'\t'+movie[10]+'\t'+movie[11]+'\t'+movie[12]+'\n'
                outfile.write(result)

def missing_values(file1, file2):
    '''returns which films were not found on tmdb'''
    original = pd.read_csv(file1, sep='\t')
    output = pd.read_csv(file2, sep='\t')
    original = original[~original['tconst'].isin(output['tconst'])]
    original.to_csv('output.tsv', sep='\t')

def find_doubles(updated_file, tracking_file):
    '''finds films that have been entered more than once'''
    with open(updated_file, 'r') as file:
        lst = set()
        doubles = []
        movies = csv.reader(file)
        for movie in movies:
            if movie[0] not in lst:
                lst.add(movie[0])
            else:
                doubles.append(movie[0])

    with open(tracking_file, 'a') as fl:
        for item in doubles:
            entry = item + '\n'
            fl.write(entry)