# EIGER/prepare_data.py at master · nedavid/EIGER · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# prepare_data.py

import os
import pandas as pd
from docx import Document

class Preparation:
    """Read tables from a DOCX document and export them as JSON files.

    The document is opened once in ``__init__``; ``read_data`` fills
    ``self.dataframes`` (a mapping of output filename -> DataFrame) and
    ``write_output`` serializes every entry of that mapping.
    """

    def __init__(self, doc_path):
        """Initialize with the path to the DOCX file."""
        self.doc = Document(doc_path)
        self.dataframes = {}

    def read_docx_table(self, table_num=1, nheader=1):
        """Read a table from the DOCX file and return it as a DataFrame.

        Args:
            table_num: Zero-based position of the table in ``self.doc.tables``.
            nheader: Number of header rows at the top of the table.
                ``1``: the first row becomes the column labels, the first
                column becomes the index and all remaining cells are
                converted with ``pd.to_numeric`` (non-numeric text raises).
                ``2``: the first row is discarded, the second row becomes the
                column labels and the first column becomes the index; cells
                are left as text.
                Any other value returns the raw cell text with default
                integer labels.

        Returns:
            The table as a ``pandas.DataFrame``.
        """
        table = self.doc.tables[table_num]
        data = [[cell.text for cell in row.cells] for row in table.rows]
        df = pd.DataFrame(data)

        if nheader == 1:
            df = df.rename(columns=df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
            df = df.set_index(df.columns[0])
            df = df.apply(pd.to_numeric)
        elif nheader == 2:
            # Throw away the top header row, then promote the next row to column labels.
            df = df.drop(df.index[0]).reset_index(drop=True)
            df = df.rename(columns=df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
            df = df.set_index(df.columns[0])

        return df

    def read_data(self):
        """Load all tables of interest into ``self.dataframes``.

        Tables 0-10 are read as single-header numeric tables. Tables 19 and 20
        are read as two-header text tables, selected columns are converted to
        numbers, and a composite ``com_key`` index is built for each.
        """
        # Output filenames in document order: position i holds table i.
        coefficient_files = [
            'alpha.json',
            'beta_20.json',
            'beta_10.json',
            'beta_5.json',
            'chi_20.json',
            'chi_15.json',
            'chi_10.json',
            'chi_5.json',
            'delta_15.json',
            'delta_10.json',
            'delta_5.json',
        ]
        self.dataframes = {
            filename: self.read_docx_table(table_num)
            for table_num, filename in enumerate(coefficient_files)
        }

        # Process lit_conv_df and lit_egs_df separately.
        # The last row of table 19 is dropped; .copy() makes the slice an
        # independent frame so the column assignments below are not chained
        # assignments on a view (avoids SettingWithCopyWarning).
        lit_conv_df = self.read_docx_table(19, 2).iloc[:-1].copy()
        lit_egs_df = self.read_docx_table(20, 2)

        lit_conv_df['Operational CO2 emissions [g/kWh]'] = pd.to_numeric(
            lit_conv_df['Operational CO2 emissions [g/kWh]'], errors='coerce'
        )
        # Unparseable CH4 entries are treated as zero.
        lit_conv_df['Operational CH4 emissions [g/kWh]'] = pd.to_numeric(
            lit_conv_df['Operational CH4 emissions [g/kWh]'], errors='coerce'
        ).fillna(0)

        # GJ/m -> MJ/m; the source column is removed afterwards.
        lit_egs_df['Diesel consumption (MJ/m)'] = 1000 * pd.to_numeric(
            lit_egs_df['Diesel consumption (GJ/m)'], errors='coerce'
        )
        lit_egs_df = lit_egs_df.drop(columns=['Diesel consumption (GJ/m)'])
        lit_egs_df['Installed capacity (MW)'] = pd.to_numeric(
            lit_egs_df['Installed capacity (MW)'], errors='coerce'
        )
        # The source headers contain a line break; numeric copies are stored
        # under clean names.
        # NOTE(review): the original '\n'-bearing text columns are kept next to
        # the new ones (unlike the diesel column) - confirm this is intended.
        lit_egs_df['Depth of wells [m]'] = pd.to_numeric(
            lit_egs_df['Depth of wells \n[m]'], errors='coerce'
        )
        lit_egs_df['Success rate [%]'] = pd.to_numeric(
            lit_egs_df['Success rate \n[%]'], errors='coerce'
        )

        # Composite keys built from the current index plus descriptive columns.
        lit_conv_df['com_key'] = lit_conv_df.index.astype(str) + '_' + lit_conv_df['Scenario'] + '_' + lit_conv_df['Technology']
        lit_conv_df = lit_conv_df.set_index('com_key')
        lit_egs_df['com_key'] = lit_egs_df.index.astype(str) + '_' + lit_egs_df['Scenario']
        lit_egs_df = lit_egs_df.set_index('com_key')

        # Add lit_conv_df and lit_egs_df to dataframes
        self.dataframes['lit_conv.json'] = lit_conv_df
        self.dataframes['lit_egs.json'] = lit_egs_df

    def write_output(self, output_dir="data/"):
        """Save every DataFrame in ``self.dataframes`` as a JSON file.

        Float cells are written as scientific-notation strings with six
        decimals (e.g. ``1.500000e+00``); all other cells are written as-is.

        Args:
            output_dir: Directory receiving the files. It is created
                (including parents) when it does not exist yet.
        """
        # DataFrame.to_json does not create missing directories itself.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; fall back to applymap on older pandas versions.
        use_map = hasattr(pd.DataFrame, "map")

        for filename, df in self.dataframes.items():
            elementwise = df.map if use_map else df.applymap
            # Convert floating-point numbers to scientific notation strings
            formatted = elementwise(lambda x: f"{x:.6e}" if isinstance(x, float) else x)
            formatted.to_json(os.path.join(output_dir, filename))

def main():
    """Convert the tables of ``data/simplified_SI.docx`` into JSON files under ``data/``."""
    # Supplementary information from Paulillo et al. (2022), https://doi.org/10.1016/j.cesys.2022.100086
    preparation = Preparation("data/simplified_SI.docx")

    # Parse the tables, then dump every resulting DataFrame as JSON.
    preparation.read_data()
    preparation.write_output("data/")

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()