timeline

get_timeline_df

get_timeline_df(
    json_path: Optional[Path],
    timeline_data: dict,
    repo: str,
) -> pd.DataFrame

Create a timeline DataFrame from timeline data stored in a JSON file.

If the global variable read_cached_df is True and a .feather file with cached data exists, the DataFrame is read from that file. If the global variable save_cached_df is True and no such .feather cache file exists, the resulting DataFrame is saved to that file.
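The cache file is the JSON file's sibling with its suffix swapped to .feather, as in the source below; a minimal sketch of the naming rule (the path is hypothetical):

    from pathlib import Path

    json_path = Path('data/timeline.hellogitworld.json')  # hypothetical path
    cache_file = json_path.with_suffix('.feather')
    print(cache_file)  # data/timeline.hellogitworld.feather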

Parameters:

    json_path (Optional[Path], required)
        used to find cached data, if present, and possibly for error
        and debug messages (when logging)

    timeline_data (dict, required)
        per-repo data to convert to pd.DataFrame and process; usually
        there is only a single repo (a single key) in the timeline_data dict

    repo (str, required)
        name of the repo whose data to extract from timeline_data

Returns:

    pd.DataFrame
        augmented DataFrame, for example with an 'n_commits' column added
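A minimal usage sketch, assuming a timeline_data dict with the per-commit fields the function filters on (n_parents, author.timestamp, committer.timestamp); the repo name and record values are hypothetical:

    from diffinsights_web.datastore.timeline import get_timeline_df

    timeline_data = {
        'hellogitworld': [  # hypothetical repo name and records
            {'n_parents': 1, 'author.timestamp': 1700000000, 'committer.timestamp': 1700000100},
            {'n_parents': 2, 'author.timestamp': 1700001000, 'committer.timestamp': 1700001100},  # merge commit, dropped
        ],
    }
    # passing json_path=None skips both reading and writing the .feather cache
    df = get_timeline_df(None, timeline_data, repo='hellogitworld')
    print(df[['n_commits', 'author_date', 'committer_date']])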

Source code in src/diffinsights_web/datastore/timeline.py
def get_timeline_df(json_path: Optional[Path], timeline_data: dict, repo: str) -> pd.DataFrame:
    """Create timeline DataFrame from timeline data in JSON file

    If global variable `read_cached_df` is True, and *.feather file with cached
    data exists, read DataFrame from that file.  If global variable `save_cached_df`
    is True, and *.feather file with cached data does not exist, save DataFrame
    to that file.

    :param json_path: used to find cached data, if present, and possibly
        for error and debug messages (when logging)
    :param timeline_data: per-repo data to convert to pd.DataFrame and process;
        usually there is only a single repo (single key) in `timeline_data` dict
    :param repo: data from which repo to extract from `timeline_data`
    :return: augmented dataframe, for example with 'n_commits' column added
    """
    if json_path is not None:
        # NOTE: json_path can be 'str', not 'Path'
        cache_file = Path(json_path).with_suffix('.feather')
        if read_cached_df and cache_file.is_file():
            # read cached data
            try:
                return pd.read_feather(cache_file)
            except ModuleNotFoundError:
                # reading feather files requires the 'pyarrow' module
                # TODO: log warning for this problem
                print("get_timeline_df -> ModuleNotFoundError")

    # TODO: remove after test_app_contributors_performance.py gets fixed
    try:
        init_df = pd.DataFrame.from_records(timeline_data[repo])
    except KeyError:
        # workaround: use first (and oftentimes only) repo
        init_df = pd.DataFrame.from_records(timeline_data[next(iter(timeline_data))])

    # no merges, no roots; add 'n_commits' column; drop rows with N/A for timestamps
    df = init_df[init_df['n_parents'] == 1]\
        .dropna(subset=['author.timestamp', 'committer.timestamp'], how='any')\
        .assign(
            n_commits =  1,
            author_date    = lambda x: pd.to_datetime(x['author.timestamp'],    unit='s', utc=True),
            committer_date = lambda x: pd.to_datetime(x['committer.timestamp'], unit='s', utc=True),
        )

    if save_cached_df and json_path is not None:
        # TODO: add logging
        cache_file = Path(json_path).with_suffix('.feather')
        # TODO: check if json_path is newer
        if not cache_file.is_file():
            df.to_feather(cache_file)

    return df
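
Note that reading and writing the .feather cache goes through pandas' feather support, which requires the pyarrow module; when that module is missing, the function falls back to rebuilding the DataFrame from the JSON data instead of failing.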