Skip to content

Modules

align_horizontally(canopus_file, gnps_file, gnps_mn_file, isdb_file, sirius_file, output=None)

CLI tool to align and merge data from CANOPUS, GNPS, Sirius, and ISDB horizontally.

Parameters:

Name Type Description Default
canopus_file Optional[str]

Path to CANOPUS output file.

required
gnps_file Optional[str]

Path to GNPS output file.

required
gnps_mn_file Optional[str]

Path to GNPS MN output file.

required
sirius_file Optional[str]

Path to Sirius output file.

required
isdb_file Optional[str]

Path to ISDB output file.

required
output Optional[str]

Output file to save the merged data.

None

Returns:

Type Description
None

None. The aligned dataframe is saved to a file when the output option is used; otherwise it is printed to stdout.

Source code in met_annot_unifier/cli.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@cli.command()
@click.option("--canopus-file", type=click.Path(exists=True), default=None, help="Path to CANOPUS output file.")
@click.option("--gnps-file", type=click.Path(exists=True), default=None, help="Path to GNPS output file.")
@click.option("--gnps-mn-file", type=click.Path(exists=True), default=None, help="Path to GNPS MN output file.")
@click.option("--isdb-file", type=click.Path(exists=True), default=None, help="Path to ISDB output file.")
@click.option("--sirius-file", type=click.Path(exists=True), default=None, help="Path to Sirius output file.")
@click.option("--output", "-o", type=click.Path(), help="Output file to save the merged data.")
def align_horizontally(
    canopus_file: Optional[str],
    gnps_file: Optional[str],
    gnps_mn_file: Optional[str],
    isdb_file: Optional[str],
    sirius_file: Optional[str],
    output: Optional[str] = None,
) -> None:
    """CLI tool to align and merge data from CANOPUS, GNPS, Sirius, and ISDB horizontally.

    Args:
        canopus_file (Optional[str]): Path to CANOPUS output file.
        gnps_file (Optional[str]): Path to GNPS output file.
        gnps_mn_file (Optional[str]): Path to GNPS MN output file.
        sirius_file (Optional[str]): Path to Sirius output file.
        isdb_file (Optional[str]): Path to ISDB output file.
        output (Optional[str]): Output file to save the merged data. Defaults to None.

    Returns:
        None. The aligned dataframe is written to ``output`` as TSV when given,
        otherwise it is echoed to stdout.
    """
    aligned_data = align_data_horizontally(
        canopus_file=canopus_file,
        gnps_file=gnps_file,
        gnps_mn_file=gnps_mn_file,
        isdb_file=isdb_file,
        sirius_file=sirius_file,
    )

    # Persist as tab-separated values when an output path is provided;
    # otherwise echo the dataframe so the result is still visible.
    if output:
        aligned_data.to_csv(output, index=False, sep="\t")
        click.echo(f"Aligned data saved to {output}")
    else:
        click.echo(aligned_data)

align_vertically(gnps_file, sirius_file, isdb_file, output=None)

CLI tool to align and merge data from GNPS, Sirius, and ISDB.

Parameters:

Name Type Description Default
gnps_file str

Path to GNPS output file.

required
isdb_file str

Path to ISDB output file.

required
sirius_file str

Path to Sirius output file.

required
output str

Output file to save the merged data. Defaults to None.

None

Returns:

Type Description
None

A dataframe with the aligned data (if the output option is used, the dataframe is saved to a file)

Source code in met_annot_unifier/cli.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@cli.command()
@click.option("--gnps-file", type=click.Path(exists=True), help="Path to GNPS output file.")
@click.option("--isdb-file", type=click.Path(exists=True), help="Path to ISDB output file.")
@click.option("--sirius-file", type=click.Path(exists=True), help="Path to Sirius output file.")
@click.option("--output", "-o", type=click.Path(), help="Output file to save the merged data.")
def align_vertically(gnps_file: str, sirius_file: str, isdb_file: str, output: Optional[str] = None) -> None:
    """CLI tool to align and merge data from GNPS, Sirius, and ISDB.

    Args:
        gnps_file (str): Path to GNPS output file.
        isdb_file (str): Path to ISDB output file.
        sirius_file (str): Path to Sirius output file.
        output (str, optional): Output file to save the merged data. Defaults to None.

    Returns:
        None. The merged dataframe is written to ``output`` as TSV when given,
        otherwise it is echoed to stdout.
    """
    merged = align_data_vertically(gnps_file=gnps_file, isdb_file=isdb_file, sirius_file=sirius_file)

    # Without an output path, just show the result.
    if not output:
        click.echo(merged)
        return

    merged.to_csv(output, index=False, sep="\t")
    click.echo(f"Aligned data saved to {output}")

cli()

Description for your CLI tool.

Source code in met_annot_unifier/cli.py
10
11
12
13
14
@click.group()
def cli() -> None:
    """Entry point grouping the met-annot-unifier subcommands."""
    # NOTE: a leftover debug print ("CLI is running") was removed here — it ran on
    # every invocation (including --help) and polluted stdout when piping results.

prune_table(input_file, list_columns, remove, output=None)

CLI tool to remove columns from a DataFrame.

Parameters:

Name Type Description Default
input_file str

Path to the input file.

required
list_columns str

Key in the JSON configuration that names the list of columns to process.

required
remove bool

If True, removes only the specified columns; otherwise, keeps them.

required
output str

Output file to save the pruned data. Defaults to None.

None

Returns:

Type Description
None

A dataframe with the pruned data (if the output option is used, the dataframe is saved to a file)

Source code in met_annot_unifier/cli.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
@cli.command()
@click.option("--input-file", type=click.Path(exists=True), required=True, help="Path to the input file.")
@click.option("--remove", is_flag=True, help="Removes the specified columns instead of keeping them.")
@click.option(
    "--list-columns",
    required=True,
    help="Key in the JSON configuration for the list of columns to be processed.",
)
@click.option("--output", "-o", type=click.Path(), help="Output file to save the pruned data.")
def prune_table(input_file: str, list_columns: str, remove: bool, output: Optional[str] = None) -> None:
    """CLI tool to keep or remove a configured set of columns from a TSV table.

    Args:
        input_file (str): Path to the input file (TSV).
        list_columns (str): Key in the JSON configuration ("column_config.json")
            that names the list of columns to process.
        remove (bool): Passed through to ``table_pruner``; per the --remove flag help,
            True removes the listed columns instead of keeping them.
        output (str, optional): Output file to save the pruned data. Defaults to None.

    Returns:
        None. The pruned dataframe is written to ``output`` as TSV when given,
        otherwise printed to stdout.
    """
    # Resolve the column list from the packaged JSON configuration.
    column_config = load_configuration("column_config.json")
    columns_to_process = column_config[list_columns]

    # Input tables are tab-separated.
    df = pd.read_csv(input_file, sep="\t")

    pruned_data = table_pruner(df, columns_to_process, remove=remove)

    # Save or print the result
    if output:
        pruned_data.to_csv(output, index=False, sep="\t")
        click.echo(f"Pruned data saved to {output}")
    else:
        click.echo(pruned_data.to_string())

align_data_horizontally(canopus_file=None, gnps_file=None, gnps_mn_file=None, isdb_file=None, sirius_file=None)

Aligns and merges data from GNPS, Sirius, ISDB and CANOPUS datasets, if provided. This function merges the data horizontally, keeping the data in a wide format. The function standardizes column names, prefixes them to indicate their source, and merges the data based on 'feature_id'.

Args: canopus_file (Optional[str]): File path for the CANOPUS data in TSV format. gnps_file (Optional[str]): File path for the GNPS data in TSV format. gnps_mn_file (Optional[str]): File path for the GNPS molecular-network data in TSV format. isdb_file (Optional[str]): File path for the ISDB data in TSV format. sirius_file (Optional[str]): File path for the Sirius data in TSV format.

Returns: pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

Source code in met_annot_unifier/aligner/aligner.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def align_data_horizontally(
    canopus_file: Optional[str] = None,
    gnps_file: Optional[str] = None,
    gnps_mn_file: Optional[str] = None,
    isdb_file: Optional[str] = None,
    sirius_file: Optional[str] = None,
) -> pd.DataFrame:
    """Align and merge annotation tables horizontally (wide format) on 'feature_id'.

    Merges data from the CANOPUS, GNPS, GNPS MN, ISDB and Sirius outputs, if provided.
    Column names are standardized, prefixed to indicate their source tool, and the
    tables are outer-joined on 'feature_id'. Consolidated ``sources_*`` columns
    describing which tools contributed each annotation are placed at the front.

    Args:
        canopus_file (Optional[str]): File path for the CANOPUS data in TSV format.
        gnps_file (Optional[str]): File path for the GNPS data in TSV format.
        gnps_mn_file (Optional[str]): File path for the GNPS molecular-network data in TSV format.
        isdb_file (Optional[str]): File path for the ISDB data in TSV format.
        sirius_file (Optional[str]): File path for the Sirius data in TSV format.

    Raises:
        DataFileError: If none of the annotation files is provided.

    Returns:
        pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.
    """

    data_frames = []

    if canopus_file:
        canopus_data = pd.read_csv(canopus_file, sep="\t")
        # Rename the raw feature-id column before prefixing so it survives as "canopus_feature_id".
        canopus_data = standardize_column_names(canopus_data, "mappingFeatureId", "feature_id")
        canopus_data = prefix_columns(canopus_data, "canopus_", exclude_columns=[])
        # the CANOPUS NPC classifier columns names are standardized
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#pathway", "canopus_npc_pathway")
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#superclass", "canopus_npc_superclass")
        canopus_data = standardize_column_names(canopus_data, "canopus_NPC#class", "canopus_npc_class")
        # Extract a clean numeric/plain id, then expose it under the merge key name.
        canopus_data = extract_feature_id(canopus_data, "canopus_feature_id")
        canopus_data = standardize_column_names(canopus_data, "canopus_feature_id", "feature_id")
        data_frames.append(canopus_data)

    if gnps_file:
        gnps_data = pd.read_csv(gnps_file, sep="\t")
        # Harmonize GNPS-specific column names (planar InChIKey, scan number, SMILES casing).
        gnps_data = standardize_column_names(gnps_data, "InChIKey-Planar", "IK2D")
        gnps_data = standardize_column_names(gnps_data, "#Scan#", "feature_id")
        gnps_data = standardize_column_names(gnps_data, "Smiles", "SMILES")
        gnps_data = prefix_columns(gnps_data, "gnps_", exclude_columns=[])
        # the GNPS NPC classifier columns names are standardized
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_pathway", "gnps_npc_pathway")
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_superclass", "gnps_npc_superclass")
        gnps_data = standardize_column_names(gnps_data, "gnps_npclassifier_class", "gnps_npc_class")
        gnps_data = standardize_column_names(gnps_data, "gnps_feature_id", "feature_id")
        data_frames.append(gnps_data)

    if isdb_file:
        isdb_data = pd.read_csv(isdb_file, sep="\t")
        isdb_data = standardize_column_names(isdb_data, "short_inchikey", "IK2D")
        # NOTE(review): renaming "feature_id" to itself looks like a no-op — confirm intent.
        isdb_data = standardize_column_names(isdb_data, "feature_id", "feature_id")
        isdb_data = standardize_column_names(isdb_data, "structure_smiles", "SMILES")
        isdb_data = prefix_columns(isdb_data, "isdb_", exclude_columns=[])
        # the ISDB NPC classifier columns names are standardized
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_01pathway", "isdb_npc_pathway"
        )
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_02superclass", "isdb_npc_superclass"
        )
        isdb_data = standardize_column_names(
            isdb_data, "isdb_structure_taxonomy_npclassifier_03class", "isdb_npc_class"
        )
        isdb_data = standardize_column_names(isdb_data, "isdb_feature_id", "feature_id")
        data_frames.append(isdb_data)

    if sirius_file:
        # Read and process Sirius data
        sirius_data = pd.read_csv(sirius_file, sep="\t")
        sirius_data = standardize_column_names(sirius_data, "InChIkey2D", "IK2D")
        sirius_data = standardize_column_names(sirius_data, "mappingFeatureId", "feature_id")
        sirius_data = standardize_column_names(sirius_data, "smiles", "SMILES")
        sirius_data = prefix_columns(sirius_data, "sirius_", exclude_columns=[])
        sirius_data = extract_feature_id(sirius_data, "sirius_feature_id")
        sirius_data = standardize_column_names(sirius_data, "sirius_feature_id", "feature_id")
        data_frames.append(sirius_data)

    # At least one annotation file must have been supplied.
    if not data_frames:
        raise DataFileError()

    # Merge the dataframes horizontally on 'feature_id' (outer join keeps features
    # present in only a subset of the tools).
    merged_data = reduce(lambda left, right: pd.merge(left, right, on="feature_id", how="outer"), data_frames)

    # The sources of the annotations are processed and combined
    # Create the 'Sources' column. Fill it according the content of the tool_IK2D columns.
    # E.g. if sirius_IK2D is not null and matches isdb_IK2D, then the source is 'SIRIUS, ISDB'

    merged_data = process_IK2D_sources(merged_data)
    merged_data = process_npc_pathway_sources(merged_data)
    merged_data = process_npc_superclass_sources(merged_data)
    merged_data = process_npc_class_sources(merged_data)

    # Number of tools agreeing on the planar InChIKey for each feature.
    merged_data["sources_number_IK2D"] = merged_data["sources_IK2D"].apply(count_sources)

    # Load GNPS MN data if provided
    # (merged after the source columns are computed, so MN columns do not affect them)

    if gnps_mn_file:
        gnps_mn_data = pd.read_csv(gnps_mn_file, sep="\t")
        gnps_mn_data = standardize_column_names(gnps_mn_data, "cluster index", "feature_id")
        gnps_mn_data = prefix_columns(gnps_mn_data, "gnps_mn_", exclude_columns=[])
        gnps_mn_data = standardize_column_names(gnps_mn_data, "gnps_mn_feature_id", "feature_id")
        merged_data = pd.merge(merged_data, gnps_mn_data, on="feature_id", how="outer")

    # Select columns

    selected_columns = [
        "feature_id",
        "sources_IK2D",
        "sources_number_IK2D",
        "sources_npc_pathway",
        "sources_npc_superclass",
        "sources_npc_class",
    ]

    # Place the selected columns at the front of the dataframe

    merged_data = merged_data[
        selected_columns + [column for column in merged_data.columns if column not in selected_columns]
    ]

    return merged_data

align_data_vertically(gnps_file=None, isdb_file=None, sirius_file=None)

Aligns and merges data from GNPS, Sirius, and ISDB datasets optionally. Files can be provided for any subset of these datasets. The function standardizes column names, prefixes them to indicate their source, merges the data based on 'feature_id' and 'IK2D', and then creates consolidated 'Sources' and 'SMILES' columns.

Parameters:

Name Type Description Default
gnps_file str

File path for the GNPS data in TSV format.

None
sirius_file str

File path for the Sirius data in TSV format.

None
isdb_file str

File path for the ISDB data in TSV format.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

Example

gnps_file = 'path/to/gnps_data.tsv' sirius_file = 'path/to/sirius_data.tsv' aligned_data = align_data_vertically(gnps_file=gnps_file, sirius_file=sirius_file) print(aligned_data.columns) Index(['feature_id', 'IK2D', 'Sources', 'SMILES', ...], dtype='object')

Source code in met_annot_unifier/aligner/aligner.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def align_data_vertically(
    gnps_file: Optional[str] = None,
    isdb_file: Optional[str] = None,
    sirius_file: Optional[str] = None,
) -> pd.DataFrame:
    """Align and merge GNPS, ISDB and Sirius annotations vertically (long format).

    Any subset of the three inputs may be supplied. Each file is loaded through its
    dedicated processor, the rows are stacked, grouped on 'feature_id' and 'IK2D',
    and consolidated 'Sources' and 'SMILES' columns are derived.

    Args:
        gnps_file (str, optional): File path for the GNPS data in TSV format.
        isdb_file (str, optional): File path for the ISDB data in TSV format.
        sirius_file (str, optional): File path for the Sirius data in TSV format.

    Raises:
        DataFileError: If no input file is provided.

    Returns:
        pd.DataFrame: A DataFrame with aligned and merged data from the provided sources.

    Example:
        >>> gnps_file = 'path/to/gnps_data.tsv'
        >>> sirius_file = 'path/to/sirius_data.tsv'
        >>> aligned_data = align_data_vertically(gnps_file=gnps_file, sirius_file=sirius_file)
        >>> print(aligned_data.columns)
        Index(['feature_id', 'IK2D', 'Sources', 'SMILES', ...], dtype='object')
    """

    # Load whichever inputs were supplied, preserving the GNPS -> ISDB -> Sirius order.
    frames = []
    for path, loader in (
        (gnps_file, process_gnps_data),
        (isdb_file, process_isdb_data),
        (sirius_file, process_sirius_data),
    ):
        if path:
            frames.append(loader(path))

    # At least one input is mandatory.
    if not frames:
        raise DataFileError()

    # Stack all non-empty frames, then collapse duplicates of a (feature, structure)
    # pair by joining the distinct annotation values as comma-separated strings.
    stacked = pd.concat([frame for frame in frames if not frame.empty], axis=0, ignore_index=True)
    merged = stacked.groupby(["feature_id", "IK2D"], as_index=False).agg(
        lambda values: ", ".join(values.dropna().astype(str).unique())
    )

    # Collapse the per-tool *annotation_source columns into a single 'Sources' column.
    annotation_cols = [name for name in merged.columns if name.endswith("annotation_source")]
    merged["Sources"] = merged.apply(
        lambda row: "|".join(sorted(filter(None, (row.get(name) for name in annotation_cols)))), axis=1
    )
    merged.drop(columns=annotation_cols, inplace=True)

    # Pick one SMILES per row by explicit priority: Sirius first, then ISDB, then GNPS.
    # Only consider the columns that actually exist in the merged frame.
    priority = [name for name in ("sirius_SMILES", "isdb_SMILES", "gnps_SMILES") if name in merged.columns]
    merged["SMILES"] = merged.apply(
        lambda row: next((row[name] for name in priority if row[name]), None), axis=1
    )

    # Put the consolidated columns first, keeping everything else behind them.
    front = ["feature_id", "IK2D", "Sources", "SMILES"]
    merged = merged[front + [name for name in merged.columns if name not in front]]

    return merged