Source code for cinemol.parsers

# -*- coding: utf-8 -*-

"""This module contains functions for parsing molecular file formats."""

import typing as ty

from cinemol.api import Atom, Bond



[docs]
def parse_sdf(src: str, include_hs: bool = True) -> ty.Tuple[ty.List[Atom], ty.List[Bond]]:
    """Parse first molecule from SDF file format.

    Only the counts line (fourth line), the atom block, and the bond block of
    the first record are read, using fixed-width columns. Everything after the
    bond block is ignored. Atoms are numbered from 1 in the order they appear.

    :param src: SDF file content. LF, CRLF, and bare-CR line endings are accepted.
    :type src: str
    :param include_hs: Include hydrogens. When ``False``, atoms whose symbol is
        ``"H"`` and every bond that touches such an atom are dropped.
    :type include_hs: bool
    :return: Atoms and bonds.
    :rtype: ty.Tuple[ty.List[Atom], ty.List[Bond]]
    :raises IndexError: If ``src`` has fewer than four lines (no counts line).
    :raises ValueError: If a count, coordinate, or bond field is not numeric, or
        if the atom or bond block is shorter than the counts line declares.
    :raises KeyError: If ``include_hs`` is ``False`` and a bond refers to an atom
        index that is not present in the atom block.
    """
    # Normalise CRLF and bare-CR line endings to LF before splitting; a CR-only
    # file would otherwise be seen as a single line and the counts line lookup
    # below would fail.
    lines = src.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    counts_line = lines[3]  # Counts line of the first molecule in the SDF file.

    atom_count = int(counts_line[0:3])
    bond_count = int(counts_line[3:6])

    atom_start = 4  # The atom block starts right after the counts line.
    bond_start = atom_start + atom_count
    atom_lines = lines[atom_start:bond_start]
    bond_lines = lines[bond_start : bond_start + bond_count]

    # Slicing never fails on a short list, so check explicitly that the file
    # really contains as many lines as the counts line promised instead of
    # silently returning an incomplete molecule.
    if len(atom_lines) != atom_count or len(bond_lines) != bond_count:
        raise ValueError(
            f"SDF record is truncated: counts line declares {atom_count} atoms and "
            f"{bond_count} bonds, but found {len(atom_lines)} atom lines and "
            f"{len(bond_lines)} bond lines."
        )

    # Parse atom lines: three 10-character coordinate fields, then the symbol.
    atoms = []
    for atom_index, atom_line in enumerate(atom_lines, start=1):
        x = float(atom_line[0:10])
        y = float(atom_line[10:20])
        z = float(atom_line[20:30])
        atom_symbol = atom_line[31:34].strip()

        atoms.append(Atom(atom_index, atom_symbol, (x, y, z)))

    # Parse bond lines: three 3-character integer fields (start, stop, order).
    bonds = [
        Bond(int(bond_line[0:3]), int(bond_line[3:6]), int(bond_line[6:9]))
        for bond_line in bond_lines
    ]

    if not include_hs:
        # Build the lookup from the unfiltered atom list so that bonds can still
        # resolve the hydrogens that are about to be removed.
        atom_map = {atom.index: atom for atom in atoms}
        atoms = [atom for atom in atoms if atom.symbol != "H"]
        bonds = [
            bond
            for bond in bonds
            if (atom_map[bond.start_index].symbol != "H" and atom_map[bond.end_index].symbol != "H")
        ]

    return atoms, bonds