Source code for airsspy.analysis.collect

"""
Data collection and analysis utilities for AIRSS search results.

Provides functions for collecting RES file data into DataFrames,
reading `ca` command output, combining similar structures via `cryan`,
and computing minsep ranges from ensembles.
"""

import subprocess
from typing import Optional, Union

import numpy as np
import pandas as pd
from pymatgen.entries.computed_entries import ComputedEntry

from ..restools import RESFile, read_res_atoms



[docs]
def read_stream(stream) -> tuple[list, list]:
    """
    Split a stream of concatenated RES files into parsed records.

    Lines are buffered until a line starting with ``END`` is met. The
    buffered lines (the ``END`` line itself is not included) are then
    passed to ``read_res_atoms`` and the buffer is reset. Anything left
    in the buffer after the last ``END`` marker is not parsed.

    Args:
        stream: Iterable of lines from concatenated RES files.

    Returns:
        Tuple of (titl_list, atoms_list) holding, in stream order, the two
        values returned by ``read_res_atoms`` for each record.
    """
    titles = []
    structures = []
    pending = []
    for raw in stream:
        if not raw.startswith("END"):
            pending.append(raw)
            continue
        # An END marker closes the current record: parse what was buffered
        title, atoms = read_res_atoms(pending)
        titles.append(title)
        structures.append(atoms)
        pending = []
    return titles, structures




[docs]
def read_ca(lines: list[str]) -> pd.DataFrame:
    """
    Read results from the ``ca`` command into a DataFrame.

    Two whitespace-separated layouts are understood:

    * 10 tokens: label, press, volume, H, spin, aspin, nform, formula,
      symm, nseen
    * otherwise: label, press, volume, H, nform, formula, symm, nseen

    The layout is taken from the first line that has at least eight
    tokens, so leading blank lines or short header/warning lines do not
    get mistaken for a record. Every line whose token count differs from
    the detected one is skipped.

    Args:
        lines: String lines as returned by the ``ca`` command.

    Returns:
        A DataFrame with one row per parsed line, or an empty DataFrame
        when no parsable line is found. From the second row onwards the
        ``H`` value is shifted by the ``H`` of the first row, turning the
        relative values of the output into absolute ones.

    Raises:
        ValueError: If a numeric field of a matching line cannot be
            converted.
    """
    if not lines:
        return pd.DataFrame()

    # Both layouts index up to eight fields; shorter lines cannot be records
    min_tokens = 8
    ntokens = None
    for line in lines:
        count = len(line.split())
        if count >= min_tokens:
            ntokens = count
            break
    if ntokens is None:
        return pd.DataFrame()

    has_spin = ntokens == 10
    records = []
    for line in lines:
        tokens = line.split()
        if len(tokens) != ntokens:
            continue
        record = {
            "label": tokens[0],
            "press": float(tokens[1]),
            "volume": float(tokens[2]),
            "H": float(tokens[3]),
        }
        tail = tokens[4:]
        if has_spin:
            # The two spin columns sit between H and nform
            record["spin"] = float(tail[0])
            record["aspin"] = float(tail[1])
            tail = tail[2:]
        record["nform"] = int(tail[0])
        record["formula"] = tail[1]
        record["symm"] = tail[2]
        record["nseen"] = int(tail[3])
        records.append(record)

    dataframe = pd.DataFrame.from_records(records)

    # Fix the H field: cryan outputs relative enthalpy after the first entry
    if len(dataframe) > 1:
        h_col = dataframe.columns.get_loc("H")
        dataframe.iloc[1:, h_col] += dataframe.iloc[0, h_col]
    return dataframe




[docs]
def collect_res_in_df(
    res_collection: list[RESFile],
    norm_mode: str = "per_atom",
) -> pd.DataFrame:
    """
    Gather RESFile objects into a DataFrame sorted by normalised enthalpy.

    Each row starts from the object's ``data`` and is extended with the
    ``formula``, ``reduced_formula``, ``nform`` (number of formula units),
    ``res`` (the object itself) and ``chemsys`` fields. Normalised columns
    ``H`` (from ``enthalpy``) and ``V`` (from ``volume``) are then added.

    Args:
        res_collection: A collection of RESFile objects.
        norm_mode: Normalisation mode for energy and volume.
            ``"per_atom"`` (default) divides by ``natoms``; any other
            value divides by ``nform`` (``"per_formula_unit"``).

    Returns:
        A DataFrame with one row per RESFile, sorted by ``H`` in ascending
        order. An empty DataFrame is returned as is for an empty input.
    """
    rows = []
    for res in res_collection:
        row = dict(res.data)
        row.update(
            formula=res.formula,
            reduced_formula=res.reduced_formula,
            nform=res.n_formula_units,
            res=res,
            chemsys=res.composition.chemical_system if res.composition else None,
        )
        rows.append(row)

    table = pd.DataFrame(rows)
    if table.empty:
        return table

    # Pick the column that the energy and volume are divided by
    if norm_mode == "per_atom":
        divisor = table["natoms"]
    else:
        # A missing number of formula units is treated as one
        if table["nform"].isna().any():
            table["nform"] = table["nform"].fillna(1)
        divisor = table["nform"]

    table["H"] = table["enthalpy"] / divisor
    table["V"] = table["volume"] / divisor
    table.sort_values("H", inplace=True)

    return table




[docs]
def combine_res_cryan(
    dframe: pd.DataFrame,
    thres: float = 0.1,
    ntop: int = 30,
) -> pd.DataFrame:
    """
    Reduce similar structures using the ``cryan`` command.

    The RES content of every row is concatenated and piped to
    ``cryan -u <thres> -r -t <ntop> -l``; its standard output is parsed
    with ``read_ca``.

    Args:
        dframe: DataFrame with a ``res`` column containing RESFile objects.
        thres: Threshold for combining structures.
        ntop: The number of top structures to be returned.

    Returns:
        A DataFrame of output from the ``cryan`` command. An empty
        DataFrame is returned, without running ``cryan``, when there is
        no RES content to process.

    Raises:
        subprocess.CalledProcessError: If ``cryan`` exits with a non-zero
            status.
    """
    lines = []
    for _, row in dframe.iterrows():
        res = row["res"]
        # Prefer the stored lines; regenerate them only when there are none
        if res.lines:
            lines.extend(res.lines)
        else:
            lines.extend(res.to_res_lines())

    # Nothing to analyse: avoid indexing an empty list and a useless call
    if not lines:
        return pd.DataFrame()

    # Lines that already carry their newline must not get a second one
    join_base = "" if lines[0].endswith("\n") else "\n"
    inpd = join_base.join(lines)
    cryan_out = subprocess.check_output(
        ["cryan", "-u", str(thres), "-r", "-t", str(ntop), "-l"],
        text=True,
        input=inpd,
    ).split("\n")
    return read_ca(cryan_out)




[docs]
def get_minsep_range(
    minseps: list[dict[str, float]],
    cap: Optional[tuple[float, float]] = None,
) -> dict[str, list[float]]:
    """
    Create ranged minseps from an ensemble of minsep entries.

    For every species pair the smallest and largest distance found across
    all entries is recorded. When ``cap`` is given, each distance is first
    clamped into ``[cap[0], cap[1]]``, so every returned bound lies within
    the cap and ``min <= max`` always holds.

    Args:
        minseps: A list of minsep dictionaries (species pair -> distance).
        cap: Optional (min, max) cap for distances.

    Returns:
        A dictionary mapping species pairs to [min, max] ranges, with keys
        in order of first appearance. Empty when ``minseps`` is empty.
    """
    ranges: dict[str, list[float]] = {}
    for minsep in minseps:
        for key, value in minsep.items():
            if cap:
                # Clamp before comparing so that no bound can escape the cap
                value = min(max(value, cap[0]), cap[1])
            bounds = ranges.get(key)
            if bounds is None:
                ranges[key] = [value, value]
            elif value < bounds[0]:
                bounds[0] = value
            elif value > bounds[1]:
                bounds[1] = value
    return ranges




[docs]
def get_entry(
    dataframe: pd.DataFrame,
    pmg_col: str = "pmg_struct",
    label_col: str = "label",
    uuid_col: str = "uuid",
    umap_col: str = "umap",
    xc_col: str = "functional",
    eng_col: str = "energy",
) -> list[ComputedEntry]:
    """
    Build one ComputedEntry per row of a DataFrame of structures.

    The entry uses the composition of the object stored in ``pmg_col`` and
    the energy stored in ``eng_col``. The remaining information is attached
    as the entry ``parameters``: ``struct_name``, ``entry_type`` (``"MP"``
    when the label contains ``mp``, ``"AIRSS"`` otherwise),
    ``structure_uuid``, ``calc_u``, ``functional``, ``volume`` and
    ``dataframe_idx`` (the row index).

    Args:
        dataframe: DataFrame with structure and energy data.
        pmg_col: Column name for pymatgen Structure objects.
        label_col: Column name for structure labels.
        uuid_col: Column name for UUIDs (optional column).
        umap_col: Column name for Hubbard U mapping (optional column).
        xc_col: Column name for functional labels (optional column).
        eng_col: Column name for energy values.

    Returns:
        A list of ComputedEntry objects, in row order.
    """
    entries = []
    for index, record in dataframe.iterrows():
        structure = record[pmg_col]
        composition = structure.composition
        label = record[label_col]
        # Labels containing "mp" are tagged as MP entries
        origin = "MP" if "mp" in str(record.get(label_col, "")) else "AIRSS"
        parameters = {
            "struct_name": label,
            "entry_type": origin,
            "structure_uuid": record.get(uuid_col),
            "calc_u": record.get(umap_col),
            "functional": record.get(xc_col),
            "volume": structure.volume,
            "dataframe_idx": index,
        }
        entry = ComputedEntry(composition, energy=record[eng_col], parameters=parameters)
        entries.append(entry)
    return entries




[docs]
def export_dataframe_as_res(
    dataframe: pd.DataFrame,
    comment: str = "VASP export",
    extra_comments: Optional[list] = None,
    stress_key: Optional[str] = None,
) -> None:
    """
    Write all structures in a DataFrame into RES format for export.

    Creates an ``exports/`` directory in the current working directory and
    writes one ``<label>.res`` file per row.

    Args:
        dataframe: DataFrame with structure and energy data.
            Must have ``pmg_struct_relaxed``, ``energy_per_atom`` and
            ``label`` columns. The cell volume is computed as
            ``volume_per_fu * nform_refine`` (``nform_refine`` defaults
            to 1); when ``volume_per_fu`` is missing, the volume of the
            relaxed structure is used unchanged.
        comment: Comment to include in REM lines.
        extra_comments: Additional REM comment strings.
        stress_key: Column name whose value is written as the pressure
            (optional). The pressure is 0.0 when it is not given or the
            column is absent.
            NOTE(review): the value is passed through unchanged - confirm
            the column already holds a scalar pressure.
    """
    from pathlib import Path

    comments = [comment]
    if extra_comments:
        comments.extend(extra_comments)

    export_dir = Path("exports")
    export_dir.mkdir(exist_ok=True)

    for _, row in dataframe.iterrows():
        relaxed = row["pmg_struct_relaxed"]

        if "volume_per_fu" in row.index:
            volume = row["volume_per_fu"] * row.get("nform_refine", 1)
        else:
            # The structure volume is already that of the whole cell and
            # must not be scaled by the number of formula units
            volume = relaxed.volume

        if stress_key is None or stress_key not in row.index:
            pressure = 0.0
        else:
            pressure = row[stress_key]

        # Record every non-structure column as a REM line; str() keeps the
        # filter working for non-string column labels
        rem = comments + [
            f"{key} = {row[key]}" for key in row.index if "struct" not in str(key)
        ]

        res = RESFile(
            relaxed,
            {
                "enthalpy": row["energy_per_atom"] * len(relaxed.sites),
                "volume": volume,
                "pressure": pressure,
                "label": row["label"],
                "rem": rem,
            },
        )
        content = "\n".join(res.to_res_lines())
        (export_dir / f"{row['label']}.res").write_text(content)




[docs]
def get_pressure_gpa(stress: Union[list, np.ndarray]) -> float:
    """
    Return the isostatic pressure, in GPa, of a stress tensor.

    The pressure is one third of the trace of the tensor, converted from
    kBar to GPa.

    Args:
        stress: A 3x3 stress tensor (in kBar units).

    Returns:
        The isostatic pressure in GPa.
    """
    tensor = np.asarray(stress)
    trace = np.trace(tensor)
    # 1 kBar = 0.1 GPa
    pressure = trace * 0.1 / 3.0
    return float(pressure)