Skip to content

Modules

align_horizontally(canopus_file, gnps_file, gnps_mn_file, isdb_file, sirius_file, output=None)

CLI tool to align and merge data from CANOPUS, GNPS, Sirius, and ISDB horizontally.

Parameters:

Name Type Description Default
canopus_file Optional[str]

Path to CANOPUS output file.

required
gnps_file Optional[str]

Path to GNPS output file.

required
gnps_mn_file Optional[str]

Path to GNPS MN output file.

required
sirius_file Optional[str]

Path to Sirius output file.

required
isdb_file Optional[str]

Path to ISDB output file.

required
output Optional[str]

Output file to save the merged data.

None

Returns:

Type Description
None

None. The aligned dataframe is saved to a file when the output option is used; otherwise it is printed to stdout.

Source code in met_annot_unifier/cli.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@cli.command()
@click.option("--canopus-file", type=click.Path(exists=True), default=None, help="Path to CANOPUS output file.")
@click.option("--gnps-file", type=click.Path(exists=True), default=None, help="Path to GNPS output file.")
@click.option("--gnps-mn-file", type=click.Path(exists=True), default=None, help="Path to GNPS MN output file.")
@click.option("--isdb-file", type=click.Path(exists=True), default=None, help="Path to ISDB output file.")
@click.option("--sirius-file", type=click.Path(exists=True), default=None, help="Path to Sirius output file.")
@click.option("--output", "-o", type=click.Path(), help="Output file to save the merged data.")
def align_horizontally(
    canopus_file: Optional[str],
    gnps_file: Optional[str],
    gnps_mn_file: Optional[str],
    isdb_file: Optional[str],
    sirius_file: Optional[str],
    output: Optional[str] = None,
) -> None:
    """CLI tool to align and merge data from CANOPUS, GNPS, Sirius, and ISDB horizontally.

    Args:
        canopus_file (Optional[str]): Path to CANOPUS output file.
        gnps_file (Optional[str]): Path to GNPS output file.
        gnps_mn_file (Optional[str]): Path to GNPS MN output file.
        sirius_file (Optional[str]): Path to Sirius output file.
        isdb_file (Optional[str]): Path to ISDB output file.
        output (Optional[str]): Output file to save the merged data. Defaults to None.

    Returns:
        None. The aligned dataframe is written to ``output`` as TSV when given,
        otherwise it is echoed to stdout.
    """
    aligned_data = align_data_horizontally(
        canopus_file=canopus_file,
        gnps_file=gnps_file,
        gnps_mn_file=gnps_mn_file,
        isdb_file=isdb_file,
        sirius_file=sirius_file,
    )

    # Persist as tab-separated values when an output path is provided;
    # otherwise echo the dataframe so the result is still visible.
    if output:
        aligned_data.to_csv(output, index=False, sep="\t")
        click.echo(f"Aligned data saved to {output}")
    else:
        click.echo(aligned_data)

align_vertically(gnps_file, sirius_file, isdb_file, output=None)

CLI tool to align and merge data from GNPS, Sirius, and ISDB.

Parameters:

Name Type Description Default
gnps_file str

Path to GNPS output file.

required
isdb_file str

Path to ISDB output file.

required
sirius_file str

Path to Sirius output file.

required
output str

Output file to save the merged data. Defaults to None.

None

Returns:

Type Description
None

A dataframe with the aligned data (if the output option is used, the dataframe is saved to a file)

Source code in met_annot_unifier/cli.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@cli.command()
@click.option("--gnps-file", type=click.Path(exists=True), help="Path to GNPS output file.")
@click.option("--isdb-file", type=click.Path(exists=True), help="Path to ISDB output file.")
@click.option("--sirius-file", type=click.Path(exists=True), help="Path to Sirius output file.")
@click.option("--output", "-o", type=click.Path(), help="Output file to save the merged data.")
def align_vertically(gnps_file: str, sirius_file: str, isdb_file: str, output: Optional[str] = None) -> None:
    """CLI tool to align and merge data from GNPS, Sirius, and ISDB.

    Args:
        gnps_file (str): Path to GNPS output file.
        isdb_file (str): Path to ISDB output file.
        sirius_file (str): Path to Sirius output file.
        output (str, optional): Output file to save the merged data. Defaults to None.

    Returns:
        None. The merged dataframe is written to ``output`` as TSV when given,
        otherwise it is echoed to stdout.
    """
    merged = align_data_vertically(gnps_file=gnps_file, isdb_file=isdb_file, sirius_file=sirius_file)

    # Without an output path, just show the result.
    if not output:
        click.echo(merged)
        return

    merged.to_csv(output, index=False, sep="\t")
    click.echo(f"Aligned data saved to {output}")

cli()

Description for your CLI tool.

Source code in met_annot_unifier/cli.py
10
11
12
13
14
@click.group()
def cli() -> None:
    """Entry point grouping the met-annot-unifier subcommands."""
    # NOTE: a leftover debug print ("CLI is running") was removed here — it ran on
    # every invocation (including --help) and polluted stdout when piping results.

prune_table(input_file, list_columns, remove, output=None)

CLI tool to remove columns from a DataFrame.

Parameters:

Name Type Description Default
input_file str

Path to the input file.

required
list_columns str

Key in the JSON configuration that names the list of columns to process.

required
remove bool

If True, removes only the specified columns; otherwise, keeps them.

required
output str

Output file to save the pruned data. Defaults to None.

None

Returns:

Type Description
None

A dataframe with the pruned data (if the output option is used, the dataframe is saved to a file)

Source code in met_annot_unifier/cli.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
@cli.command()
@click.option("--input-file", type=click.Path(exists=True), required=True, help="Path to the input file.")
@click.option("--remove", is_flag=True, help="Removes the specified columns instead of keeping them.")
@click.option(
    "--list-columns",
    required=True,
    help="Key in the JSON configuration for the list of columns to be processed.",
)
@click.option("--output", "-o", type=click.Path(), help="Output file to save the pruned data.")
def prune_table(input_file: str, list_columns: str, remove: bool, output: Optional[str] = None) -> None:
    """CLI tool to keep or remove a configured set of columns from a TSV table.

    Args:
        input_file (str): Path to the input file (TSV).
        list_columns (str): Key in the JSON configuration ("column_config.json")
            that names the list of columns to process.
        remove (bool): Passed through to ``table_pruner``; per the --remove flag help,
            True removes the listed columns instead of keeping them.
        output (str, optional): Output file to save the pruned data. Defaults to None.

    Returns:
        None. The pruned dataframe is written to ``output`` as TSV when given,
        otherwise printed to stdout.
    """
    # Resolve the column list from the packaged JSON configuration.
    column_config = load_configuration("column_config.json")
    columns_to_process = column_config[list_columns]

    # Input tables are tab-separated.
    df = pd.read_csv(input_file, sep="\t")

    pruned_data = table_pruner(df, columns_to_process, remove=remove)

    # Save or print the result
    if output:
        pruned_data.to_csv(output, index=False, sep="\t")
        click.echo(f"Pruned data saved to {output}")
    else:
        click.echo(pruned_data.to_string())

align_data_horizontally(canopus_file=None, gnps_file=None, gnps_mn_file=None, isdb_file=None, sirius_file=None)

Aligns and merges data from GNPS, Sirius, ISDB and CANOPUS datasets, if provided. This function merges the data horizontally, keeping the data in a wide format. The function standardizes column names, prefixes them to indicate their source, and merges the data based on 'feature_id'.

Args: canopus_file (Optional[str]): File path for the CANOPUS data in TSV format. gnps_file (Optional[str]): File path for the GNPS data in TSV format. gnps_mn_file (Optional[str]): File path for the GNPS molecular-network data in TSV format. isdb_file (Optional[str]): File path for the ISDB data in TSV format. sirius_file (Optional[str]): File path for the Sirius data in TSV format.

Returns: pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

Source code in met_annot_unifier/aligner/aligner.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def align_data_horizontally(
    canopus_file: Optional[str] = None,
    gnps_file: Optional[str] = None,
    gnps_mn_file: Optional[str] = None,
    isdb_file: Optional[str] = None,
    sirius_file: Optional[str] = None,
) -> pd.DataFrame:
    """Align and merge annotation tables horizontally (wide format) on 'feature_id'.

    Merges data from the CANOPUS, GNPS, GNPS MN, ISDB and Sirius outputs, if provided.
    Column names are standardized, prefixed to indicate their source tool, and the
    tables are outer-joined on 'feature_id'. Consolidated ``sources_*`` columns
    describing which tools contributed each annotation are placed at the front.

    Args:
        canopus_file (Optional[str]): File path for the CANOPUS data in TSV format.
        gnps_file (Optional[str]): File path for the GNPS data in TSV format.
        gnps_mn_file (Optional[str]): File path for the GNPS molecular-network data in TSV format.
        isdb_file (Optional[str]): File path for the ISDB data in TSV format.
        sirius_file (Optional[str]): File path for the Sirius data in TSV format.

    Raises:
        DataFileError: If none of the annotation files is provided.

    Returns:
        pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.
    """

    data_frames = []

    if canopus_file:
        canopus_data = pd.read_csv(canopus_file, sep="\t")
        # Rename the raw feature-id column before prefixing so it survives as "canopus_feature_id".
        canopus_data = standardize_column_names(canopus_data, "mappingFeatureId", "feature_id")
        canopus_data = prefix_columns(canopus_data, "canopus_", exclude_columns=[])
        # the CANOPUS NPC classifier columns names are standardized
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#pathway", "canopus_npc_pathway")
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#superclass", "canopus_npc_superclass")
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#class", "canopus_npc_class")
        # Extract a clean numeric/plain id, then expose it under the merge key name.
        canopus_data = extract_feature_id(canopus_data, "canopus_feature_id")
        canopus_data = standardize_column_names(canopus_data, "canopus_feature_id", "feature_id")
        data_frames.append(canopus_data)

    if gnps_file:
        gnps_data = pd.read_csv(gnps_file, sep="\t")
        # Harmonize GNPS-specific column names (planar InChIKey, scan number, SMILES casing).
        gnps_data = standardize_column_names(gnps_data, "InChIKey-Planar", "IK2D")
        gnps_data = standardize_column_names(gnps_data, "#Scan#", "feature_id")
        gnps_data = standardize_column_names(gnps_data, "Smiles", "SMILES")
        gnps_data = prefix_columns(gnps_data, "gnps_", exclude_columns=[])
        # the GNPS NPC classifier columns names are standardized
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_pathway", "gnps_npc_pathway")
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_superclass", "gnps_npc_superclass")
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_class", "gnps_npc_class")
        gnps_data = standardize_column_names(gnps_data, "gnps_feature_id", "feature_id")
        data_frames.append(gnps_data)

    if isdb_file:
        isdb_data = pd.read_csv(isdb_file, sep="\t")
        isdb_data = standardize_column_names(isdb_data, "short_inchikey", "IK2D")
        # NOTE(review): renaming "feature_id" to itself looks like a no-op — confirm intent.
        isdb_data = standardize_column_names(isdb_data, "feature_id", "feature_id")
        isdb_data = standardize_column_names(isdb_data, "structure_smiles", "SMILES")
        isdb_data = prefix_columns(isdb_data, "isdb_", exclude_columns=[])
        # the ISDB NPC classifier columns names are standardized
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_01pathway", "isdb_npc_pathway"
        )
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_02superclass", "isdb_npc_superclass"
        )
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_03class", "isdb_npc_class"
        )
        isdb_data = standardize_column_names(isdb_data, "isdb_feature_id", "feature_id")
        data_frames.append(isdb_data)

    if sirius_file:
        # Read and process Sirius data
        sirius_data = pd.read_csv(sirius_file, sep="\t")
        sirius_data = standardize_column_names(sirius_data, "InChIkey2D", "IK2D")
        sirius_data = standardize_column_names(sirius_data, "mappingFeatureId", "feature_id")
        sirius_data = standardize_column_names(sirius_data, "smiles", "SMILES")
        sirius_data = prefix_columns(sirius_data, "sirius_", exclude_columns=[])
        sirius_data = extract_feature_id(sirius_data, "sirius_feature_id")
        sirius_data = standardize_column_names(sirius_data, "sirius_feature_id", "feature_id")
        data_frames.append(sirius_data)

    # At least one annotation file must have been supplied.
    if not data_frames:
        raise DataFileError()

    # Merge the dataframes horizontally on 'feature_id' (outer join keeps features
    # present in only a subset of the tools).
    merged_data = reduce(lambda left, right: pd.merge(left, right, on="feature_id", how="outer"), data_frames)

    # The sources of the annotations are processed and combined
    # Create the 'Sources' column. Fill it according the content of the tool_IK2D columns.
    # E.g. if sirius_IK2D is not null and matches isdb_IK2D, then the source is 'SIRIUS, ISDB'

    merged_data = process_IK2D_sources(merged_data)
    merged_data = process_npc_pathway_sources(merged_data)
    merged_data = process_npc_superclass_sources(merged_data)
    merged_data = process_npc_class_sources(merged_data)

    # Number of tools agreeing on the planar InChIKey for each feature.
    merged_data["sources_number_IK2D"] = merged_data["sources_IK2D"].apply(count_sources)

    # Load GNPS MN data if provided
    # (merged after the source columns are computed, so MN columns do not affect them)

    if gnps_mn_file:
        gnps_mn_data = pd.read_csv(gnps_mn_file, sep="\t")
        gnps_mn_data = standardize_column_names(gnps_mn_data, "cluster index", "feature_id")
        gnps_mn_data = prefix_columns(gnps_mn_data, "gnps_mn_", exclude_columns=[])
        gnps_mn_data = standardize_column_names(gnps_mn_data, "gnps_mn_feature_id", "feature_id")
        merged_data = pd.merge(merged_data, gnps_mn_data, on="feature_id", how="outer")

    # Select columns

    selected_columns = [
        "feature_id",
        "sources_IK2D",
        "sources_number_IK2D",
        "sources_npc_pathway",
        "sources_npc_superclass",
        "sources_npc_class",
    ]

    # Place the selected columns at the front of the dataframe

    merged_data = merged_data[
        selected_columns + [column for column in merged_data.columns if column not in selected_columns]
    ]

    return merged_data

align_data_vertically(gnps_file=None, isdb_file=None, sirius_file=None)

Aligns and merges data from GNPS, Sirius, and ISDB datasets optionally. Files can be provided for any subset of these datasets. The function standardizes column names, prefixes them to indicate their source, merges the data based on 'feature_id' and 'IK2D', and then creates consolidated 'Sources' and 'SMILES' columns.

Parameters:

Name Type Description Default
gnps_file str

File path for the GNPS data in TSV format.

None
sirius_file str

File path for the Sirius data in TSV format.

None
isdb_file str

File path for the ISDB data in TSV format.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

Example

gnps_file = 'path/to/gnps_data.tsv' sirius_file = 'path/to/sirius_data.tsv' aligned_data = align_data_vertically(gnps_file=gnps_file, sirius_file=sirius_file) print(aligned_data.columns) Index(['feature_id', 'IK2D', 'Sources', 'SMILES', ...], dtype='object')

Source code in met_annot_unifier/aligner/aligner.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def align_data_vertically(
    gnps_file: Optional[str] = None,
    isdb_file: Optional[str] = None,
    sirius_file: Optional[str] = None,
) -> pd.DataFrame:
    """Align and merge GNPS, ISDB and Sirius annotations vertically (long format).

    Any subset of the three inputs may be supplied. Each file is loaded through its
    dedicated processor, the rows are stacked, grouped on 'feature_id' and 'IK2D',
    and consolidated 'Sources' and 'SMILES' columns are derived.

    Args:
        gnps_file (str, optional): File path for the GNPS data in TSV format.
        isdb_file (str, optional): File path for the ISDB data in TSV format.
        sirius_file (str, optional): File path for the Sirius data in TSV format.

    Raises:
        DataFileError: If no input file is provided.

    Returns:
        pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

    Example:
        >>> gnps_file = 'path/to/gnps_data.tsv'
        >>> sirius_file = 'path/to/sirius_data.tsv'
        >>> aligned_data = align_data_vertically(gnps_file=gnps_file, sirius_file=sirius_file)
        >>> print(aligned_data.columns)
        Index(['feature_id', 'IK2D', 'Sources', 'SMILES', ...], dtype='object')
    """

    # Load whichever inputs were supplied, preserving the GNPS -> ISDB -> Sirius order.
    frames = []
    for path, loader in (
        (gnps_file, process_gnps_data),
        (isdb_file, process_isdb_data),
        (sirius_file, process_sirius_data),
    ):
        if path:
            frames.append(loader(path))

    # At least one input is mandatory.
    if not frames:
        raise DataFileError()

    # Stack all non-empty frames, then collapse duplicates of a (feature, structure)
    # pair by joining the distinct annotation values as comma-separated strings.
    stacked = pd.concat([frame for frame in frames if not frame.empty], axis=0, ignore_index=True)
    merged = stacked.groupby(["feature_id", "IK2D"], as_index=False).agg(
        lambda values: ", ".join(values.dropna().astype(str).unique())
    )

    # Collapse the per-tool *annotation_source columns into a single 'Sources' column.
    annotation_cols = [name for name in merged.columns if name.endswith("annotation_source")]
    merged["Sources"] = merged.apply(
        lambda row: "|".join(sorted(filter(None, (row.get(name) for name in annotation_cols)))), axis=1
    )
    merged.drop(columns=annotation_cols, inplace=True)

    # Pick one SMILES per row by explicit priority: Sirius first, then ISDB, then GNPS.
    # Only consider the columns that actually exist in the merged frame.
    priority = [name for name in ("sirius_SMILES", "isdb_SMILES", "gnps_SMILES") if name in merged.columns]
    merged["SMILES"] = merged.apply(
        lambda row: next((row[name] for name in priority if row[name]), None), axis=1
    )

    # Put the consolidated columns first, keeping everything else behind them.
    front = ["feature_id", "IK2D", "Sources", "SMILES"]
    merged = merged[front + [name for name in merged.columns if name not in front]]

    return merged