Skip to content

Reference

This file documents the key functions in the Spacewalks tool. It is provided as a reference manual.

add_crew_size_column(df)

Add crew_size column to the dataset containing the value of the crew size

Parameters:

Name Type Description Default
df DataFrame

The input data frame.

required

Returns:

Name Type Description
df_copy DataFrame

A copy of the dataframe df with the new crew_size variable added

Source code in eva_data_analysis.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def add_crew_size_column(df):
    """
    Return a copy of the dataset extended with a crew_size column.

    Each value of the new column is produced by applying
    calculate_crew_size to the matching entry of the "crew" column.
    The dataframe passed in is not modified.

    Args:
        df (pd.DataFrame): The input data frame; it must have a "crew" column.

    Returns:
        result (pd.DataFrame): A copy of df with the new crew_size variable added
    """
    print('Adding crew size variable (crew_size) to dataset')

    # Operate on a copy so the caller's dataframe stays unchanged
    result = df.copy()
    crew_sizes = result["crew"].apply(calculate_crew_size)
    result["crew_size"] = crew_sizes
    return result

add_duration_hours(df)

Add duration in hours (duration_hours) variable to the dataset

Parameters:

Name Type Description Default
df DataFrame

The input dataframe.

required

Returns:

Name Type Description
df_copy DataFrame

A copy of df with the new duration_hours variable added

Source code in eva_data_analysis.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def add_duration_hours(df):
    """
    Return a copy of the dataset extended with a duration_hours column.

    Each value of the new column is produced by applying text_to_duration
    to the matching entry of the "duration" column. The dataframe passed
    in is not modified.

    Args:
        df (pd.DataFrame): The input dataframe; it must have a "duration" column.

    Returns:
        result (pd.DataFrame): A copy of df with the new duration_hours variable added
    """
    # Operate on a copy so the caller's dataframe stays unchanged
    result = df.copy()
    hours = result["duration"].apply(text_to_duration)
    result["duration_hours"] = hours
    return result

calculate_crew_size(crew)

Calculate the size of the crew for a single crew entry

Parameters:

Name Type Description Default
crew str

The text entry in the crew column containing a list of crew member names

required

Returns:

Type Description
int

The crew size

Source code in eva_data_analysis.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def calculate_crew_size(crew):
    """
    Calculate the size of the crew for a single crew entry

    Crew member names are separated by semicolons (a trailing semicolon is
    allowed). Fragments that are empty or contain only whitespace are not
    counted as crew members.

    Args:
        crew (str): The text entry in the crew column containing a list of crew member names

    Returns:
        (int): The crew size, or None when the entry is missing
            (not a string) or blank
    """
    # A missing value (e.g. None/NaN in a dataframe column) or a blank
    # string carries no crew information, so report "unknown" rather than 0
    if not isinstance(crew, str) or not crew.strip():
        return None

    # Skip empty and whitespace-only fragments such as the one that
    # follows a trailing "; "
    crew_members = [name for name in crew.split(';') if name.strip()]
    return len(crew_members)

plot_cumulative_time_in_space(df, graph_file)

Plot the cumulative time spent in space over years.

Convert the duration column from strings to a number of hours, calculate the cumulative sum of the durations, then generate a plot of cumulative time spent in space over years and save it to the specified location.

Parameters:

Name Type Description Default
df DataFrame

The input dataframe.

required
graph_file file or str

The file object or path to the output graph file.

required

Returns:

Type Description

None

Source code in eva_data_analysis.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def plot_cumulative_time_in_space(df, graph_file):
    """
    Plot the cumulative time spent in space over years.

    Steps performed:
      1. Derive a duration_hours column via add_duration_hours
      2. Compute the running (cumulative) sum of duration_hours in row order
      3. Plot the running sum against the 'date' column, save the figure
         to graph_file and then call plt.show()

    Args:
        df (pd.DataFrame): The input dataframe.
        graph_file (file or str): The file object or path to the output graph file.

    Returns:
        None
    """
    print(f'Plotting cumulative spacewalk duration and saving to {graph_file}')

    # add_duration_hours returns a copy, so the columns added here never
    # reach the caller's dataframe
    plot_data = add_duration_hours(df)
    plot_data['cumulative_time'] = plot_data['duration_hours'].cumsum()

    dates = plot_data['date']
    running_total = plot_data['cumulative_time']
    plt.plot(dates, running_total, 'ko-')
    plt.xlabel('Year')
    plt.ylabel('Total time spent in space to date (hours)')
    plt.tight_layout()

    # Save before showing the figure
    plt.savefig(graph_file)
    plt.show()

read_json_to_dataframe(input_file)

Read the data from a JSON file into a Pandas dataframe. Clean the data by removing any rows where the 'duration' or 'date' value is missing.

Parameters:

Name Type Description Default
input_file file or str

The file object or path to the JSON file.

required

Returns:

Name Type Description
eva_df DataFrame

The cleaned data as a dataframe structure

Source code in eva_data_analysis.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def read_json_to_dataframe(input_file):
    """
    Load the data from a JSON file into a Pandas dataframe.

    The 'date' column is parsed as dates and the 'eva' column is cast to
    float. The data is then cleaned by removing any rows where the
    'duration' or 'date' value is missing.

    Args:
        input_file (file or str): The file object or path to the JSON file.

    Returns:
         spacewalks (pd.DataFrame): The cleaned data as a dataframe structure
    """
    print(f'Reading JSON file {input_file}')

    # Load the JSON records, parsing the 'date' column as dates
    spacewalks = pd.read_json(
        input_file,
        convert_dates=['date'],
        encoding='ascii',
    )
    spacewalks['eva'] = spacewalks['eva'].astype(float)

    # Discard rows lacking either of the columns needed downstream
    required_columns = ['duration', 'date']
    spacewalks.dropna(axis=0, subset=required_columns, inplace=True)

    return spacewalks

text_to_duration(duration)

Convert a text format duration "HH:MM" to duration in hours

Parameters:

Name Type Description Default
duration str

The text format duration

required

Returns:

Name Type Description
duration_hours float

The duration in hours

Source code in eva_data_analysis.py
137
138
139
140
141
142
143
144
145
146
147
148
149
def text_to_duration(duration):
    """
    Convert a text format duration "HH:MM" to duration in hours

    Args:
        duration (str): The text format duration, with exactly one ":"
            separating whole hours from minutes

    Returns:
        duration_hours (float): The duration in hours
    """
    parts = duration.split(":")
    hours_text, minutes_text = parts

    whole_hours = int(hours_text)
    # 60 minutes per hour: express the minutes as a fraction of an hour
    fractional_hours = int(minutes_text) / 60
    return whole_hours + fractional_hours

write_dataframe_to_csv(df, output_file)

Write the dataframe to a CSV file.

Parameters:

Name Type Description Default
df DataFrame

The input dataframe.

required
output_file file or str

The file object or path to the output CSV file.

required

Returns:

Type Description

None

Source code in eva_data_analysis.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def write_dataframe_to_csv(df, output_file):
    """
    Save the dataframe as a CSV file.

    The dataframe index is not written and the output is encoded as UTF-8.

    Args:
        df (pd.DataFrame): The input dataframe.
        output_file (file or str): The file object or path to the output CSV file.

    Returns:
        None
    """
    message = f'Saving to CSV file {output_file}'
    print(message)

    # Persist the data for later analysis; the row index is omitted
    csv_options = {'index': False, 'encoding': 'utf-8'}
    df.to_csv(output_file, **csv_options)