timeline

get_timeline_df

get_timeline_df(
    json_path: Optional[Path],
    timeline_data: dict,
    repo: str,
) -> pd.DataFrame

Create a timeline DataFrame from timeline data stored in a JSON file.

If the global variable read_cached_df is True and a .feather file with cached data exists, the DataFrame is read from that file. If the global variable save_cached_df is True and no such .feather cache file exists, the resulting DataFrame is saved to that file.
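The cache file is the JSON file's sibling with its suffix swapped to .feather, as in the source below; a minimal sketch of the naming rule (the path is hypothetical):

    from pathlib import Path

    json_path = Path('data/timeline.hellogitworld.json')  # hypothetical path
    cache_file = json_path.with_suffix('.feather')
    print(cache_file)  # data/timeline.hellogitworld.feather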

Parameters:

    json_path (Optional[Path], required)
        used to find cached data, if present, and possibly for error
        and debug messages (when logging)

    timeline_data (dict, required)
        per-repo data to convert to pd.DataFrame and process; usually
        there is only a single repo (a single key) in the timeline_data dict

    repo (str, required)
        name of the repo whose data to extract from timeline_data

Returns:

    pd.DataFrame
        augmented DataFrame, for example with an 'n_commits' column added
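A minimal usage sketch, assuming a timeline_data dict with the per-commit fields the function filters on (n_parents, author.timestamp, committer.timestamp); the repo name and record values are hypothetical:

    from diffinsights_web.datastore.timeline import get_timeline_df

    timeline_data = {
        'hellogitworld': [  # hypothetical repo name and records
            {'n_parents': 1, 'author.timestamp': 1700000000, 'committer.timestamp': 1700000100},
            {'n_parents': 2, 'author.timestamp': 1700001000, 'committer.timestamp': 1700001100},  # merge commit, dropped
        ],
    }
    # passing json_path=None skips both reading and writing the .feather cache
    df = get_timeline_df(None, timeline_data, repo='hellogitworld')
    print(df[['n_commits', 'author_date', 'committer_date']])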

Source code in src/diffinsights_web/datastore/timeline.py
def get_timeline_df(json_path: Optional[Path], timeline_data: dict, repo: str) -> pd.DataFrame:
    """Create timeline DataFrame from timeline data in JSON file

    If global variable `read_cached_df` is True, and *.feather file with cached
    data exists, read DataFrame from that file.  If global variable `save_cached_df`
    is True, and *.feather file with cached data does not exist, save DataFrame
    to that file.

    :param json_path: used to find cached data, if present, and possibly
        for error and debug messages (when logging)
    :param timeline_data: per-repo data to convert to pd.DataFrame and process;
        usually there is only a single repo (single key) in `timeline_data` dict
    :param repo: data from which repo to extract from `timeline_data`
    :return: augmented dataframe, for example with 'n_commits' column added
    """
    if json_path is not None:
        # NOTE: json_path can be 'str', not 'Path'
        cache_file = Path(json_path).with_suffix('.feather')
        if read_cached_df and cache_file.is_file():
            # read cached data
            try:
                return pd.read_feather(cache_file)
            except ModuleNotFoundError:
                # reading feather files requires the 'pyarrow' module
                # TODO: log warning for this problem
                print("get_timeline_df -> ModuleNotFoundError")

    # TODO: remove after test_app_contributors_performance.py gets fixed
    try:
        init_df = pd.DataFrame.from_records(timeline_data[repo])
    except KeyError:
        # workaround: use first (and oftentimes only) repo
        init_df = pd.DataFrame.from_records(timeline_data[next(iter(timeline_data))])

    # no merges, no roots; add 'n_commits' column; drop rows with N/A for timestamps
    df = init_df[init_df['n_parents'] == 1]\
        .dropna(subset=['author.timestamp', 'committer.timestamp'], how='any')\
        .assign(
            n_commits =  1,
            author_date    = lambda x: pd.to_datetime(x['author.timestamp'],    unit='s', utc=True),
            committer_date = lambda x: pd.to_datetime(x['committer.timestamp'], unit='s', utc=True),
        )

    if save_cached_df and json_path is not None:
        # TODO: add logging
        cache_file = Path(json_path).with_suffix('.feather')
        # TODO: check if json_path is newer
        if not cache_file.is_file():
            df.to_feather(cache_file)

    return df
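
Note that reading and writing the .feather cache goes through pandas' feather support, which requires the pyarrow module; when that module is missing, the function falls back to rebuilding the DataFrame from the JSON data instead of failing.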