#!/usr/bin/env python3
'''
.. code-block::
!---------------------------------------------------------------------------!
! orca: interface to the ORCA program !
! Implementations by: Yuxinxin Chen !
! NEVPT2 and TD-DFT implemented by MikoĊaj Martyka, Li Wang and Xinyu Tong !
!---------------------------------------------------------------------------!
'''
import os
import re
import numpy as np
import math
import tempfile
import subprocess
from .. import constants, data
from ..model_cls import model, method_model
from ..decorators import doc_inherit
[docs]
class orca_methods(method_model):
'''
ORCA interface
Arguments:
method (str): Method specification. Use ``'B3LYP/6-31G*'`` for ground state DFT,
``'TD-wb97x-d3bj/def2-SVP'`` for TD-DFT, ``'TDA-wb97x-d3bj/def2-SVP'`` for TDA,
``'qd-nevpt2/cc-pVDZ'`` for QD-NEVPT2, or ``'CCSD(T)*/CBS'`` for composite method.
save_files_in_current_directory (bool): Keep input/output files, default ``True``
working_directory (str): Directory for output files, default ``None``
nthreads (int): Number of parallel processes (``%pal nprocs``)
maxcore (int): Memory per core in MB (``%maxcore``)
solvent (str): Solvent name for implicit solvation
solvation_model (str): ``'SMD'`` or ``'CPCM'``, default ``'SMD'``
additional_keywords (list): Extra keywords for ORCA input line
Example setup:
.. code-block:: python
td_dft = ml.models.methods(method='TD-B3LYP/def2-SVP', program='orca',
working_directory='./orca_tddft',
nthreads=1,
maxcore=1000,
)
qd_nevpt2 = ml.models.methods(method='QD-NEVPT2/def2-SVP', program='orca',
nthreads=1,
maxcore=1000,
working_directory='./orca_NEVPT2',
)
'''
bin_env_name = 'orcabin'
def __init__(self,
method='wb97x/6-31G*',
save_files_in_current_directory: bool = True,
working_directory: str = None,
nthreads: int = 1,
maxcore: int = 1000,
solvent: str = None,
solvation_model: str = "SMD",
additional_keywords: list = None,
input_file: str = '',
output_keywords: list = None,
**kwargs):
self.orcabin = self.get_bin_env_var()
if self.orcabin is None:
raise ValueError('Cannot find the Orca program, please set the environment variable: export orcabin=...')
self.method = method
self.nthreads = nthreads
self.maxcore = maxcore
self.solvent = solvent
self.solvation_model = solvation_model if solvent else None
self.save_files_in_current_directory = save_files_in_current_directory
self.working_directory = working_directory
self.additional_keywords = additional_keywords
self.input_file = input_file
self.output_keywords = output_keywords
# For CCSD(T)*/CBS composite method
if self.method == 'CCSD(T)*/CBS':
self.cc_method = ccsdtstarcbs(**kwargs)
[docs]
@doc_inherit
def predict(self,
molecular_database: data.molecular_database = None,
molecule: data.molecule = None,
calculate_energy: bool = True,
calculate_energy_gradients = False,
calculate_hessian: bool = False,
nstates: int = 1,
current_state: int = 0,
active_space: tuple = None,
casscf_kwargs: dict = None):
'''
Run ORCA calculation.
Arguments:
molecular_database: collection of molecules stored in ml.data.molecular_database object to make predictions for.
molecule: molecule to predict for.
calculate_energy: bool, whether to update the energy of the molecule.
calculate_energy_gradients: bool or list of bool.
If list, e.g. [False, True, False] requests gradient for S1 only.
Only one state gradient can be requested at a time.
current_state: 0-indexed state (0 = S0, 1 = S1, etc.), for which the energy and gradient will be used.
active_space: tuple (nel, norb) for CASSCF/QD-NEVPT2, e.g. (6, 6)
casscf_kwargs: dict of additional CASSCF keywords (maxiter, actorbs, etc.)
nstates: number of electronic states to compute
calculate_hessian: whether to calculate hessian matrix.
Example usage:
.. code-block:: python
td_dft.predict(
molecule=mol,
nstates=2,
current_state=0,
calculate_energy=True,
calculate_energy_gradients=True,
)
qd_nevpt2.predict(
molecule=mol,
nstates=2,
current_state=1,
active_space = (2,2)
)
'''
molDB = super().predict(molecular_database=molecular_database, molecule=molecule)
# Parse TD-/TDA- prefix from method string
method_upper = self.method.upper()
if method_upper.startswith('TDA-'):
self._tda = True
self._method_cleaned = self.method[4:] # Strip 'TDA-'
is_tddft = True
elif (method_upper.startswith('TD-') or (nstates>1 and not ("PT2" in method_upper)) ): #This is disgustingly ugly, handwritten code that works
self._tda = False
self._method_cleaned = self.method[3:] # Strip 'TD-'
is_tddft = True
else:
self._tda = False
self._method_cleaned = self.method
is_tddft = False
# Validate nstates for TD-DFT
if is_tddft and nstates == 1:
raise ValueError("TD-DFT requires nstates > 1. Use nstates=N to calculate N-1 excited states.")
# Handle calculate_energy_gradients as bool or list
if isinstance(calculate_energy_gradients, list):
# Validate: only one True allowed
true_count = sum(calculate_energy_gradients)
if true_count > 1:
raise ValueError("Only one state gradient can be calculated at a time. "
f"Got {true_count} states requested: {calculate_energy_gradients}")
if true_count == 1:
# Find which state has gradient requested
self.gradient_state = calculate_energy_gradients.index(True)
self.calculate_energy_gradients = True
else:
self.gradient_state = None
self.calculate_energy_gradients = False
else:
self.calculate_energy_gradients = calculate_energy_gradients
self.gradient_state = current_state if calculate_energy_gradients else None
if self.gradient_state ==0 and is_tddft:
raise ValueError("ORCA TD-DFT does not support ground-state gradient calculations. Please create a composite TD-DFT/DFT model in MLatom.")
# Store calculation parameters
self.calculate_energy = calculate_energy
self.calculate_hessian = calculate_hessian
self.nstates = nstates
self.current_state = current_state
self.active_space = active_space
self.casscf_kwargs = casscf_kwargs if casscf_kwargs else {}
if self.method == 'CCSD(T)*/CBS':
self.cc_method.predict(molecular_database=molDB)
return
# Setup working directory
tmpdirname = self._setup_working_directory()
for imol, imolecule in enumerate(molDB.molecules):
self.inpfile = os.path.join(tmpdirname, f'molecule{imol}_{self.input_file}')
# Generate input and run ORCA
self._generate_input(imolecule)
self._run_orca(tmpdirname)
out_file = f'{self.inpfile}.out'
if not os.path.exists(out_file):
raise FileNotFoundError(f"Output file not found: {out_file}")
with open(out_file, 'r') as f:
lines = f.readlines()
# Check last 20 lines for normal termination
tail = lines[-20:] if len(lines) >= 20 else lines
if not any('ORCA TERMINATED NORMALLY' in line for line in tail):
print(f"ORCA did not terminate normally. Check output: {out_file}. The program will try to continue.")
import re
version_match = re.search(r'Program Version (\d+)\.(\d+)\.(\d+)', ''.join(lines))
if version_match:
major_version = int(version_match.group(1))
self._orca_version = major_version
# Parse output based on method type
if self._method_cleaned.lower().startswith('qd_nevpt2') or self._method_cleaned.lower().startswith('qd-nevpt2'):
self._parse_qd_nevpt2_output(imolecule)
elif not self.calculate_energy and self.output_keywords:
# Custom keyword parsing (used by CCSD(T)*/CBS components)
self._parse_output_keywords(imolecule)
else:
self._parse_tddft_output(imolecule)
imolecule.orca_version = self._orca_version
[docs]
def _parse_output_keywords(self, molecule):
'''Parse custom keywords from property.txt file.
Used by CCSD(T)*/CBS and similar methods to extract specific values.
'''
prop_file = f'{self.inpfile}_property.txt'
if not os.path.exists(prop_file):
raise FileNotFoundError(f"Property file not found: {prop_file}")
with open(prop_file, 'r') as f:
lines = f.readlines()
for keyword in self.output_keywords:
keyword_name = self.input_file + '_' + keyword.lower().replace(' ', '_')
for line in lines:
if keyword in line:
molecule.__dict__[keyword_name] = float(line.split()[-1])
break
[docs]
def _setup_working_directory(self) -> str:
'''Setup and return the working directory path.'''
if self.working_directory is not None:
tmpdirname = self.working_directory
if not os.path.exists(tmpdirname):
os.makedirs(tmpdirname)
elif self.save_files_in_current_directory:
tmpdirname = '.'
else:
# Create temporary directory (caller should handle cleanup)
tmpdirname = tempfile.mkdtemp()
return os.path.abspath(tmpdirname)
[docs]
def _run_orca(self, workdir: str):
'''Execute ORCA calculation.'''
cmd = f'{self.orcabin} {self.inpfile}.inp > {self.inpfile}.out 2>&1'
proc = subprocess.Popen(cmd, shell=True, cwd=workdir,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc.communicate()
[docs]
def _parse_qd_nevpt2_output(self, molecule):
'''Parse QD-NEVPT2 output from .out file.'''
out_file = f'{self.inpfile}.out'
if not os.path.exists(out_file):
raise FileNotFoundError(f"Output file not found: {out_file}")
with open(out_file, 'r') as f:
lines = f.readlines()
# Extract QD-NEVPT2 and CASSCF energies from "QD-NEVPT2 Results" block
pt2_energies, casscf_energies = self._extract_qd_energies_from_out(lines)
if not pt2_energies:
raise ValueError(f"No QD-NEVPT2 Results block found in {out_file}")
# Build electronic states
molecule.electronic_states = []
for root_idx in sorted(pt2_energies.keys()):
state_mol = data.molecule()
state_mol.atoms = [atom.copy() for atom in molecule.atoms]
state_mol.energy = pt2_energies[root_idx]
if root_idx in casscf_energies:
state_mol.casscf_energy = casscf_energies[root_idx]
# Initialize gradients as NaN (not available for QD-NEVPT2)
for atom in state_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
molecule.electronic_states.append(state_mol)
# Set molecule energy to current state
if self.current_state < len(molecule.electronic_states):
molecule.energy = molecule.electronic_states[self.current_state].energy
# QD-NEVPT2 gradients not implemented
if self.calculate_energy_gradients:
raise NotImplementedError("Analytical gradients are not available for QD-NEVPT2")
# Parse Hessian if requested
if self.calculate_hessian:
self._parse_hessian(molecule)
# Parse oscillator strengths from .out file
self._parse_qd_absorption_spectrum(molecule, out_file)
[docs]
def _parse_qd_absorption_spectrum(self, molecule, out_file: str):
'''Parse oscillator strengths from QD-NEVPT2 absorption spectrum.'''
if not os.path.exists(out_file):
return
with open(out_file, 'r') as f:
lines = f.readlines()
# Find last occurrence of absorption spectrum
last_idx = None
for i, line in enumerate(lines):
if 'ABSORPTION SPECTRUM' in line:
last_idx = i
if last_idx is None:
return
excitation_energies = []
oscillator_strengths = []
# Parse spectrum table (skip header lines)
for line in lines[last_idx + 5:]:
if '-' * 20 in line or line.strip() == '':
break
parts = line.split()
if len(parts) >= 8:
# Column 5 is energy in cm^-1, column 7 is oscillator strength
exc_energy_cm = float(parts[5])
exc_energy_hartree = exc_energy_cm * 4.556335e-6
excitation_energies.append(exc_energy_hartree)
oscillator_strengths.append(float(parts[7]))
if oscillator_strengths:
molecule.oscillator_strengths = oscillator_strengths
[docs]
def _parse_tddft_output(self, molecule):
'''Parse TD-DFT/ground-state DFT output.'''
if self._orca_version != 6:
prop_file = f'{self.inpfile}_property.txt'
else:
prop_file = f'{self.inpfile}.property.txt'
out_file = f'{self.inpfile}.out'
# Get ground state energy
gs_energy = None
energy_from_prop_file = False
if os.path.exists(prop_file) and self._orca_version != 6:
with open(prop_file, 'r') as f:
for line in f:
if 'SCF Energy:' in line or 'TOTAL ENERGY' in line:
gs_energy = float(line.split()[-1])
energy_from_prop_file = True
break
elif os.path.exists(prop_file):
with open(prop_file, 'r') as f:
in_scf_block = False
for line in f:
if '$SCF_Energy' in line:
in_scf_block = True
continue
if in_scf_block:
if '&SCF_ENERGY' in line:
# Extract the float value at the end
gs_energy = float(line.split()[-1])
energy_from_prop_file = True
break
elif line.startswith('$'):
# New block started, SCF_Energy block ended
break
if gs_energy is None and os.path.exists(out_file):
with open(out_file, 'r') as f:
for line in f:
if 'FINAL SINGLE POINT ENERGY' in line:
gs_energy = float(line.split()[-1])
break
if gs_energy is None:
raise ValueError("Could not parse ground state energy")
# Parse dispersion correction from .out file if energy came from property.txt
# (FINAL SINGLE POINT ENERGY already includes dispersion, but SCF Energy does not)
dispersion_correction = 0.0
if energy_from_prop_file and os.path.exists(out_file):
with open(out_file, 'r') as f:
for line in f:
if 'Dispersion correction' in line:
try:
dispersion_correction = float(line.split()[-1])
except ValueError:
pass
break
gs_energy += dispersion_correction
# Build electronic states
molecule.electronic_states = []
natoms = len(molecule.atoms)
# Ground state (S0)
gs_mol = data.molecule()
gs_mol.atoms = [atom.copy() for atom in molecule.atoms]
gs_mol.energy = gs_energy
# Initialize gradients as NaN
for atom in gs_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
molecule.electronic_states.append(gs_mol)
# Parse excited states if TD-DFT was run
if self.nstates > 1 and os.path.exists(out_file):
excitation_energies, oscillator_strengths = self._parse_tddft_spectrum(out_file)
for i, exc_e in enumerate(excitation_energies):
state_mol = data.molecule()
state_mol.atoms = [atom.copy() for atom in molecule.atoms]
state_mol.energy = gs_energy + exc_e
# Initialize gradients as NaN
for atom in state_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
molecule.electronic_states.append(state_mol)
if oscillator_strengths:
molecule.oscillator_strengths = oscillator_strengths
# Set molecule energy to current state
# For TD-DFT: current_state=0 means S0, current_state=1 means S1, etc.
state_idx = self.current_state if self.current_state < len(molecule.electronic_states) else 0
molecule.energy = molecule.electronic_states[state_idx].energy
# Parse gradients for the requested gradient state
if self.calculate_energy_gradients and self.gradient_state is not None:
self._parse_gradients(molecule)
# Store gradients in the appropriate electronic state using add_xyz_derivative_property
grad_idx = self.gradient_state
if grad_idx < len(molecule.electronic_states):
# Get gradient array from molecule.atoms
grad_array = np.array([atom.energy_gradients for atom in molecule.atoms])
molecule.electronic_states[grad_idx].add_xyz_derivative_property(
grad_array, 'energy', 'energy_gradients'
)
# Parse Hessian
if self.calculate_hessian:
self._parse_hessian(molecule)
[docs]
def _parse_tddft_spectrum(self, out_file: str) -> tuple:
'''Parse TD-DFT absorption spectrum from output file.'''
with open(out_file, 'r') as f:
lines = f.readlines()
# Find absorption spectrum section
last_idx = None
for i, line in enumerate(lines):
if 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in line:
last_idx = i
excitation_energies = []
oscillator_strengths = []
if last_idx is not None:
for line in lines[last_idx + 5:]:
if line.strip() == '':
break
parts = line.split()
if len(parts) >= 4:
# Column 1 is energy in cm^-1, column 3 is oscillator strength
if self._orca_version == 6:
exc_energy_cm = float(parts[4])
else:
exc_energy_cm = float(parts[1])
exc_energy_hartree = exc_energy_cm * 4.556335e-6
excitation_energies.append(exc_energy_hartree)
if self._orca_version == 6:
oscillator_strengths.append(float(parts[6]))
else:
oscillator_strengths.append(float(parts[3]))
return excitation_energies, oscillator_strengths
[docs]
def _parse_gradients(self, molecule):
'''Parse energy gradients from .engrad file.'''
engrad_file = f'{self.inpfile}.engrad'
if not os.path.exists(engrad_file):
raise FileNotFoundError(f"Gradient file not found: {engrad_file}. ORCA may have failed to compute gradients.")
with open(engrad_file, 'r') as f:
lines = f.readlines()
gradients = []
in_gradient_section = False
for i, line in enumerate(lines):
if 'The current gradient in Eh/bohr' in line:
in_gradient_section = True
continue
if in_gradient_section:
line_stripped = line.strip()
# Skip comment lines starting with #
if line_stripped.startswith('#'):
# Check if this is the end marker (# followed by new section)
# Look ahead to see if next non-empty line is a new section header
if i + 1 < len(lines) and 'atomic' in lines[i + 1].lower():
break
continue
if line_stripped:
try:
# Convert from Eh/bohr to Eh/Angstrom
gradients.append(float(line_stripped) * constants.Angstrom2Bohr)
except ValueError:
# Not a number, likely end of section
break
if gradients:
grad_array = np.array(gradients).reshape(-1, 3)
for i, atom in enumerate(molecule.atoms):
atom.energy_gradients = grad_array[i].tolist()
else:
raise ValueError(f"No gradients found in {engrad_file}")
[docs]
def _parse_hessian(self, molecule):
'''Parse Hessian matrix from .hess file.'''
hess_file = f'{self.inpfile}.hess'
if not os.path.exists(hess_file):
return
with open(hess_file, 'r') as f:
lines = f.readlines()
# Find $hessian section
start_line = None
for i, line in enumerate(lines):
if '$hessian' in line:
start_line = i
break
if start_line is None:
return
matrix_size = int(lines[start_line + 1].strip())
hessian = np.zeros((matrix_size, matrix_size))
current_line = start_line + 2
current_col = 0
while current_col < matrix_size and current_line < len(lines):
# Read column header
header = lines[current_line].strip()
if not header:
current_line += 1
continue
cols = [int(x) for x in header.split()]
num_cols = len(cols)
current_line += 1
# Read data rows
for row in range(matrix_size):
if current_line >= len(lines):
break
data_line = lines[current_line].strip()
if not data_line:
current_line += 1
continue
parts = data_line.split()
# First element is row index, rest are values
for col_idx, col_num in enumerate(cols):
if col_idx + 1 < len(parts):
hessian[row, col_num] = float(parts[col_idx + 1])
current_line += 1
current_col += num_cols
# Convert from Eh/bohr^2 to Eh/Angstrom^2
molecule.hessian = hessian / (constants.Bohr2Angstrom ** 2)
# Also parse frequencies if available
self._parse_frequencies(molecule, lines)
[docs]
def _parse_frequencies(self, molecule, hess_lines: list):
'''Parse vibrational frequencies from .hess file content.'''
frequencies = []
ir_intensities = []
for i, line in enumerate(hess_lines):
if '$vibrational_frequencies' in line:
for freq_line in hess_lines[i + 2:]:
if freq_line.strip() == '' or '$' in freq_line:
break
parts = freq_line.split()
if len(parts) >= 2:
frequencies.append(float(parts[-1]))
if '$ir_spectrum' in line:
for ir_line in hess_lines[i + 2:]:
if ir_line.strip() == '' or '$' in ir_line:
break
parts = ir_line.split()
if len(parts) >= 3:
ir_intensities.append(float(parts[2]))
# Remove zero frequencies (translations/rotations)
if frequencies:
zero_count = sum(1 for f in frequencies if f == 0)
molecule.frequencies = frequencies[zero_count:]
if ir_intensities:
molecule.infrared_intensities = ir_intensities[zero_count:]
[docs]
class ccsdtstarcbs(model):
def __init__(self, **kwargs):
if 'nthreads_list' in kwargs:
self.nthreads_list = kwargs['nthreads_list']
if len(self.nthreads_list) != 5:
print('Number of values to asign to each methods should be 5. Default nthreads will be used')
self.nthreads_list = None
else:
self.nthreads_list = None
self.successful = True
if self.nthreads_list:
kwargs.update({'nthreads':self.nthreads_list[0], 'input_file':'mp2_tz', 'output_keywords':['SCF Energy', 'Correlation Energy']})
else:
kwargs.update({'input_file':'mp2_tz', 'output_keywords':['SCF Energy', 'Correlation Energy']})
self.mp2_tz = orca_methods(method='''RIMP2 RIJK cc-pVTZ cc-pVTZ/JK cc-pVTZ/C\n! tightscf noautostart scfconvforced miniprint nopop''',
**kwargs)
if self.nthreads_list:
kwargs.update({'nthreads':self.nthreads_list[1], 'input_file':'mp2_qz', 'output_keywords':['SCF Energy', 'Correlation Energy']})
else:
kwargs.update({'input_file':'mp2_qz', 'output_keywords':['SCF Energy', 'Correlation Energy']})
self.mp2_qz = orca_methods(method='''RIMP2 RIJK cc-pVQZ cc-pVQZ/JK cc-pVQZ/C\n! tightscf noautostart scfconvforced miniprint nopop''',
**kwargs)
if self.nthreads_list:
kwargs.update({'nthreads':self.nthreads_list[2],'input_file':'dlpno_normal_dz', 'output_keywords':['Total Correlation Energy']})
else:
kwargs.update({'input_file':'dlpno_normal_dz', 'output_keywords':['Total Correlation Energy']})
self.npno_dz = orca_methods(method='''DLPNO-CCSD(T) normalPNO RIJK cc-pVDZ cc-pVDZ/C cc-pvTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''',
**kwargs)
if self.nthreads_list:
kwargs.update({'nthreads':self.nthreads_list[3], 'input_file':'dlpno_normal_tz','output_keywords':['Total Correlation Energy']})
else:
kwargs.update({'input_file':'dlpno_normal_tz','output_keywords':['Total Correlation Energy']})
self.npno_tz = orca_methods(method='''DLPNO-CCSD(T) normalPNO RIJK cc-pVTZ cc-pVTZ/C cc-pVTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''',
**kwargs)
if self.nthreads_list:
kwargs.update({'nthreads':self.nthreads_list[4], 'input_file':'dlpno_tight_dz','output_keywords':['Total Correlation Energy']})
else:
kwargs.update({'input_file':'dlpno_tight_dz','output_keywords':['Total Correlation Energy']})
self.tpno_dz = orca_methods(method='''DLPNO-CCSD(T) tightPNO RIJK cc-pVDZ cc-pVDZ/C cc-pVTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''',
**kwargs)
self.combination = [self.mp2_tz, self.mp2_qz, self.npno_dz, self.npno_tz, self.tpno_dz]
[docs]
def predict(self, molecular_database, **kwargs):
for method in self.combination:
method.predict(molecular_database=molecular_database, calculate_energy=False)
for mol in molecular_database:
alpha = 5.46; beta = 2.51
hf_tz = mol.mp2_tz_scf_energy
hf_qz = mol.mp2_qz_scf_energy
mp2_tz = mol.mp2_tz_correlation_energy
mp2_qz = mol.mp2_qz_correlation_energy
npno_dz = mol.dlpno_normal_dz_total_correlation_energy
npno_tz = mol.dlpno_normal_tz_total_correlation_energy
tpno_dz = mol.dlpno_tight_dz_total_correlation_energy
E_hf_xtrap = (math.exp(-alpha * 4**0.5) * hf_tz - math.exp(-alpha * 3**0.5) * hf_qz) \
/ (math.exp(-alpha * 4**0.5) - math.exp(-alpha * 3**0.5))
E_mp2_xtrap = (4**beta * mp2_qz - 3**beta * mp2_tz) / (4**beta - 3**beta)
energy = E_hf_xtrap + E_mp2_xtrap - mp2_tz + npno_tz + tpno_dz - npno_dz
mol.energy = energy
[docs]
def parse_orca_output(filename, molecule=None):
"""
Parse ORCA output file and return/populate a molecule object.
Automatically detects calculation type (QD-NEVPT2, TD-DFT, or ground state DFT).
Also looks for .engrad and .hess files in the same directory.
Args:
filename: Path to ORCA output file (.out)
molecule: Optional molecule object to update. If None, returns new molecule.
Returns:
molecule object if molecule is None, otherwise updates provided molecule
Example:
>>> mol = parse_orca_output('calculation.out')
>>> print(mol.energy)
>>> print(mol.electronic_states[1].energy) # S1 energy
"""
if not os.path.exists(filename):
raise FileNotFoundError(f"ORCA output file not found: {filename}")
with open(filename, 'r') as f:
content = f.read()
lines = content.split('\n')
# Build new molecule from scratch
mol = data.molecule()
# Parse geometry from output
_parse_geometry_from_content(mol, lines)
# Auto-detect calculation type
is_qd_nevpt2 = 'QD-NEVPT2 Results' in content
is_tddft = 'TD-DFT/TDA EXCITED STATES' in content or 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in content
# Get base path for auxiliary files (.engrad, .hess)
base_path = filename.rsplit('.', 1)[0]
current_state = 0
try:
for line in lines:
if 'State of interest' in line:
current_state = int(line.split()[-1])
break
except:
current_state = 0
if is_qd_nevpt2:
_parse_qd_nevpt2_from_content(mol, lines, current_state)
elif is_tddft:
_parse_tddft_from_content(mol, lines, current_state)
else:
_parse_ground_state_from_content(mol, lines)
# Try to parse gradients if .engrad file exists
engrad_file = f'{base_path}.engrad'
if os.path.exists(engrad_file):
# Default gradient_state to current_state if not specified
_parse_gradients_from_file(mol, engrad_file, current_state)
# Try to parse Hessian if .hess file exists
hess_file = f'{base_path}.hess'
if os.path.exists(hess_file):
_parse_hessian_from_file(mol, hess_file)
if molecule is None:
return mol
else:
molecule.update_from(mol)
[docs]
def _parse_geometry_from_content(mol, lines):
"""Parse molecular geometry from ORCA output."""
# Look for "CARTESIAN COORDINATES (ANGSTROEM)" section
in_coords = False
atoms_data = []
for i, line in enumerate(lines):
if 'CARTESIAN COORDINATES (ANGSTROEM)' in line:
in_coords = True
continue
if in_coords:
if line.strip() == '' or '---' in line:
if atoms_data:
break
continue
parts = line.split()
if len(parts) >= 4:
try:
element = parts[0]
x = float(parts[1])
y = float(parts[2])
z = float(parts[3])
atoms_data.append((element, x, y, z))
except ValueError:
continue
# Build atoms
for element, x, y, z in atoms_data:
atom = data.atom(element_symbol=element)
atom.xyz_coordinates = np.array([x, y, z])
mol.atoms.append(atom)
# Parse charge and multiplicity from input section
for line in lines:
if '* xyz' in line.lower() or '*xyz' in line.lower():
parts = line.replace('*', '').split()
if 'xyz' in parts[0].lower():
parts = parts[1:]
if len(parts) >= 2:
try:
mol.charge = int(parts[0])
mol.multiplicity = int(parts[1])
except ValueError:
pass
break
[docs]
def _parse_qd_nevpt2_from_content(mol, lines, current_state):
"""Parse QD-NEVPT2 energies from output content."""
pt2_energies = {}
casscf_energies = {}
in_block = False
current_root = None
for line in lines:
if 'QD-NEVPT2 Results' in line:
in_block = True
continue
if in_block:
line_stripped = line.strip()
if line_stripped.startswith('ROOT ='):
parts = line_stripped.split('=')
if len(parts) >= 2:
try:
current_root = int(parts[1].strip())
except ValueError:
current_root = None
elif 'Zero Order Energy' in line and current_root is not None:
parts = line.split('=')
if len(parts) >= 2:
try:
casscf_energies[current_root] = float(parts[-1].strip())
except ValueError:
pass
elif 'Total Energy (E0+dE)' in line and current_root is not None:
parts = line.split('=')
if len(parts) >= 2:
try:
pt2_energies[current_root] = float(parts[-1].strip())
except ValueError:
pass
elif '======' in line_stripped and pt2_energies:
break
if not pt2_energies:
raise ValueError("No QD-NEVPT2 Results block found in output")
# Build electronic states
mol.electronic_states = []
for root_idx in sorted(pt2_energies.keys()):
state_mol = data.molecule()
state_mol.atoms = [atom.copy() for atom in mol.atoms]
state_mol.energy = pt2_energies[root_idx]
if root_idx in casscf_energies:
state_mol.casscf_energy = casscf_energies[root_idx]
# Initialize gradients as NaN
for atom in state_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
mol.electronic_states.append(state_mol)
# Set molecule energy to current state
if current_state < len(mol.electronic_states):
mol.energy = mol.electronic_states[current_state].energy
# Parse oscillator strengths
_parse_absorption_spectrum(mol, lines, is_qd_nevpt2=True)
[docs]
def _parse_tddft_from_content(mol, lines, current_state):
"""Parse TD-DFT energies from output content."""
# Get ground state energy
gs_energy = None
for line in lines:
if 'FINAL SINGLE POINT ENERGY' in line:
gs_energy = float(line.split()[-1])
break
if gs_energy is None:
raise ValueError("Could not parse ground state energy")
# Build electronic states
mol.electronic_states = []
# Ground state (S0)
gs_mol = data.molecule()
gs_mol.atoms = [atom.copy() for atom in mol.atoms]
gs_mol.energy = gs_energy
for atom in gs_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
mol.electronic_states.append(gs_mol)
# Parse excited states
excitation_energies, oscillator_strengths = _parse_tddft_spectrum_from_lines(lines)
for exc_e in excitation_energies:
state_mol = data.molecule()
state_mol.atoms = [atom.copy() for atom in mol.atoms]
state_mol.energy = gs_energy + exc_e
for atom in state_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
mol.electronic_states.append(state_mol)
if oscillator_strengths:
mol.oscillator_strengths = oscillator_strengths
# Set molecule energy to current state
state_idx = current_state if current_state < len(mol.electronic_states) else 0
mol.energy = mol.electronic_states[state_idx].energy
[docs]
def _parse_ground_state_from_content(mol, lines):
"""Parse ground state DFT/HF energy from output content."""
gs_energy = None
for line in lines:
if 'FINAL SINGLE POINT ENERGY' in line:
gs_energy = float(line.split()[-1])
break
if gs_energy is None:
raise ValueError("Could not parse ground state energy")
mol.energy = gs_energy
# Create single electronic state
mol.electronic_states = []
gs_mol = data.molecule()
gs_mol.atoms = [atom.copy() for atom in mol.atoms]
gs_mol.energy = gs_energy
for atom in gs_mol.atoms:
atom.energy_gradients = np.full(3, np.nan).tolist()
mol.electronic_states.append(gs_mol)
[docs]
def _parse_tddft_spectrum_from_lines(lines):
"""Parse TD-DFT absorption spectrum from output lines."""
last_idx = None
for i, line in enumerate(lines):
if 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in line:
last_idx = i
excitation_energies = []
oscillator_strengths = []
if last_idx is not None:
for line in lines[last_idx + 5:]:
if line.strip() == '':
break
parts = line.split()
if len(parts) >= 4:
exc_energy_cm = float(parts[1])
exc_energy_hartree = exc_energy_cm * 4.556335e-6
excitation_energies.append(exc_energy_hartree)
oscillator_strengths.append(float(parts[3]))
return excitation_energies, oscillator_strengths
[docs]
def _parse_absorption_spectrum(mol, lines, is_qd_nevpt2=False):
"""Parse oscillator strengths from absorption spectrum."""
last_idx = None
for i, line in enumerate(lines):
if 'ABSORPTION SPECTRUM' in line:
last_idx = i
if last_idx is None:
return
oscillator_strengths = []
for line in lines[last_idx + 5:]:
if '-' * 20 in line or line.strip() == '':
break
parts = line.split()
if is_qd_nevpt2 and len(parts) >= 8:
oscillator_strengths.append(float(parts[7]))
elif not is_qd_nevpt2 and len(parts) >= 4:
oscillator_strengths.append(float(parts[3]))
if oscillator_strengths:
mol.oscillator_strengths = oscillator_strengths
[docs]
def _parse_gradients_from_file(mol, engrad_file, current_state=0):
"""Parse gradients from .engrad file and add to electronic state."""
with open(engrad_file, 'r') as f:
lines = f.readlines()
gradients = []
in_gradient_section = False
for i, line in enumerate(lines):
if 'The current gradient in Eh/bohr' in line:
in_gradient_section = True
continue
if in_gradient_section:
line_stripped = line.strip()
if line_stripped.startswith('#'):
if i + 1 < len(lines) and 'atomic' in lines[i + 1].lower():
break
continue
if line_stripped:
try:
gradients.append(float(line_stripped) * constants.Angstrom2Bohr)
except ValueError:
break
if gradients and mol.atoms:
grad_array = np.array(gradients).reshape(-1, 3)
# Add to top-level molecule atoms
for i, atom in enumerate(mol.atoms):
if i < len(grad_array):
atom.energy_gradients = grad_array[i].tolist()
# Add to the appropriate electronic state using add_xyz_derivative_property
if hasattr(mol, 'electronic_states') and current_state < len(mol.electronic_states):
mol.electronic_states[current_state].add_xyz_derivative_property(
grad_array, 'energy', 'energy_gradients'
)
[docs]
def _parse_hessian_from_file(mol, hess_file):
"""Parse Hessian matrix from .hess file."""
with open(hess_file, 'r') as f:
lines = f.readlines()
# Find $hessian section
start_line = None
for i, line in enumerate(lines):
if '$hessian' in line:
start_line = i
break
if start_line is None:
return
matrix_size = int(lines[start_line + 1].strip())
hessian = np.zeros((matrix_size, matrix_size))
current_line = start_line + 2
current_col = 0
while current_col < matrix_size and current_line < len(lines):
header = lines[current_line].strip()
if not header:
current_line += 1
continue
cols = [int(x) for x in header.split()]
num_cols = len(cols)
current_line += 1
for row in range(matrix_size):
if current_line >= len(lines):
break
data_line = lines[current_line].strip()
if not data_line:
current_line += 1
continue
parts = data_line.split()
for col_idx, col_num in enumerate(cols):
if col_idx + 1 < len(parts):
hessian[row, col_num] = float(parts[col_idx + 1])
current_line += 1
current_col += num_cols
mol.hessian = hessian / (constants.Bohr2Angstrom ** 2)
# Parse frequencies
_parse_frequencies_from_lines(mol, lines)
[docs]
def _parse_frequencies_from_lines(mol, hess_lines):
"""Parse vibrational frequencies from .hess file content."""
frequencies = []
ir_intensities = []
for i, line in enumerate(hess_lines):
if '$vibrational_frequencies' in line:
for freq_line in hess_lines[i + 2:]:
if freq_line.strip() == '' or '$' in freq_line:
break
parts = freq_line.split()
if len(parts) >= 2:
frequencies.append(float(parts[-1]))
if '$ir_spectrum' in line:
for ir_line in hess_lines[i + 2:]:
if ir_line.strip() == '' or '$' in ir_line:
break
parts = ir_line.split()
if len(parts) >= 3:
ir_intensities.append(float(parts[2]))
if frequencies:
zero_count = sum(1 for f in frequencies if f == 0)
mol.frequencies = frequencies[zero_count:]
if ir_intensities:
mol.infrared_intensities = ir_intensities[zero_count:]