Source code for mlatom.interfaces.orca_interface

#!/usr/bin/env python3
'''
.. code-block::

  !---------------------------------------------------------------------------!
  ! orca: interface to the ORCA program                                       !
  ! Implementations by: Yuxinxin Chen                                         !
  ! NEVPT2 and TD-DFT implemented by MikoĊ‚aj Martyka, Li Wang and Xinyu Tong  !
  !---------------------------------------------------------------------------!
'''

import os
import re
import numpy as np
import math
import tempfile
import subprocess
from .. import constants, data
from ..model_cls import model, method_model
from ..decorators import doc_inherit


[docs] class orca_methods(method_model): ''' ORCA interface Arguments: method (str): Method specification. Use ``'B3LYP/6-31G*'`` for ground state DFT, ``'TD-wb97x-d3bj/def2-SVP'`` for TD-DFT, ``'TDA-wb97x-d3bj/def2-SVP'`` for TDA, ``'qd-nevpt2/cc-pVDZ'`` for QD-NEVPT2, or ``'CCSD(T)*/CBS'`` for composite method. save_files_in_current_directory (bool): Keep input/output files, default ``True`` working_directory (str): Directory for output files, default ``None`` nthreads (int): Number of parallel processes (``%pal nprocs``) maxcore (int): Memory per core in MB (``%maxcore``) solvent (str): Solvent name for implicit solvation solvation_model (str): ``'SMD'`` or ``'CPCM'``, default ``'SMD'`` additional_keywords (list): Extra keywords for ORCA input line Example setup: .. code-block:: python td_dft = ml.models.methods(method='TD-B3LYP/def2-SVP', program='orca', working_directory='./orca_tddft', nthreads=1, maxcore=1000, ) qd_nevpt2 = ml.models.methods(method='QD-NEVPT2/def2-SVP', program='orca', nthreads=1, maxcore=1000, working_directory='./orca_NEVPT2', ) ''' bin_env_name = 'orcabin' def __init__(self, method='wb97x/6-31G*', save_files_in_current_directory: bool = True, working_directory: str = None, nthreads: int = 1, maxcore: int = 1000, solvent: str = None, solvation_model: str = "SMD", additional_keywords: list = None, input_file: str = '', output_keywords: list = None, **kwargs): self.orcabin = self.get_bin_env_var() if self.orcabin is None: raise ValueError('Cannot find the Orca program, please set the environment variable: export orcabin=...') self.method = method self.nthreads = nthreads self.maxcore = maxcore self.solvent = solvent self.solvation_model = solvation_model if solvent else None self.save_files_in_current_directory = save_files_in_current_directory self.working_directory = working_directory self.additional_keywords = additional_keywords self.input_file = input_file self.output_keywords = output_keywords # For CCSD(T)*/CBS composite method if self.method == 'CCSD(T)*/CBS': self.cc_method = ccsdtstarcbs(**kwargs)
[docs] @doc_inherit def predict(self, molecular_database: data.molecular_database = None, molecule: data.molecule = None, calculate_energy: bool = True, calculate_energy_gradients = False, calculate_hessian: bool = False, nstates: int = 1, current_state: int = 0, active_space: tuple = None, casscf_kwargs: dict = None): ''' Run ORCA calculation. Arguments: molecular_database: collection of molecules stored in ml.data.molecular_database object to make predictions for. molecule: molecule to predict for. calculate_energy: bool, whether to update the energy of the molecule. calculate_energy_gradients: bool or list of bool. If list, e.g. [False, True, False] requests gradient for S1 only. Only one state gradient can be requested at a time. current_state: 0-indexed state (0 = S0, 1 = S1, etc.), for which the energy and gradient will be used. active_space: tuple (nel, norb) for CASSCF/QD-NEVPT2, e.g. (6, 6) casscf_kwargs: dict of additional CASSCF keywords (maxiter, actorbs, etc.) nstates: number of electronic states to compute calculate_hessian: whether to calculate hessian matrix. Example usage: .. code-block:: python td_dft.predict( molecule=mol, nstates=2, current_state=0, calculate_energy=True, calculate_energy_gradients=True, ) qd_nevpt2.predict( molecule=mol, nstates=2, current_state=1, active_space = (2,2) ) ''' molDB = super().predict(molecular_database=molecular_database, molecule=molecule) # Parse TD-/TDA- prefix from method string method_upper = self.method.upper() if method_upper.startswith('TDA-'): self._tda = True self._method_cleaned = self.method[4:] # Strip 'TDA-' is_tddft = True elif (method_upper.startswith('TD-') or (nstates>1 and not ("PT2" in method_upper)) ): #This is disgustingly ugly, handwritten code that works self._tda = False self._method_cleaned = self.method[3:] # Strip 'TD-' is_tddft = True else: self._tda = False self._method_cleaned = self.method is_tddft = False # Validate nstates for TD-DFT if is_tddft and nstates == 1: raise ValueError("TD-DFT requires nstates > 1. Use nstates=N to calculate N-1 excited states.") # Handle calculate_energy_gradients as bool or list if isinstance(calculate_energy_gradients, list): # Validate: only one True allowed true_count = sum(calculate_energy_gradients) if true_count > 1: raise ValueError("Only one state gradient can be calculated at a time. " f"Got {true_count} states requested: {calculate_energy_gradients}") if true_count == 1: # Find which state has gradient requested self.gradient_state = calculate_energy_gradients.index(True) self.calculate_energy_gradients = True else: self.gradient_state = None self.calculate_energy_gradients = False else: self.calculate_energy_gradients = calculate_energy_gradients self.gradient_state = current_state if calculate_energy_gradients else None if self.gradient_state ==0 and is_tddft: raise ValueError("ORCA TD-DFT does not support ground-state gradient calculations. Please create a composite TD-DFT/DFT model in MLatom.") # Store calculation parameters self.calculate_energy = calculate_energy self.calculate_hessian = calculate_hessian self.nstates = nstates self.current_state = current_state self.active_space = active_space self.casscf_kwargs = casscf_kwargs if casscf_kwargs else {} if self.method == 'CCSD(T)*/CBS': self.cc_method.predict(molecular_database=molDB) return # Setup working directory tmpdirname = self._setup_working_directory() for imol, imolecule in enumerate(molDB.molecules): self.inpfile = os.path.join(tmpdirname, f'molecule{imol}_{self.input_file}') # Generate input and run ORCA self._generate_input(imolecule) self._run_orca(tmpdirname) out_file = f'{self.inpfile}.out' if not os.path.exists(out_file): raise FileNotFoundError(f"Output file not found: {out_file}") with open(out_file, 'r') as f: lines = f.readlines() # Check last 20 lines for normal termination tail = lines[-20:] if len(lines) >= 20 else lines if not any('ORCA TERMINATED NORMALLY' in line for line in tail): print(f"ORCA did not terminate normally. Check output: {out_file}. The program will try to continue.") import re version_match = re.search(r'Program Version (\d+)\.(\d+)\.(\d+)', ''.join(lines)) if version_match: major_version = int(version_match.group(1)) self._orca_version = major_version # Parse output based on method type if self._method_cleaned.lower().startswith('qd_nevpt2') or self._method_cleaned.lower().startswith('qd-nevpt2'): self._parse_qd_nevpt2_output(imolecule) elif not self.calculate_energy and self.output_keywords: # Custom keyword parsing (used by CCSD(T)*/CBS components) self._parse_output_keywords(imolecule) else: self._parse_tddft_output(imolecule) imolecule.orca_version = self._orca_version
[docs] def _parse_output_keywords(self, molecule): '''Parse custom keywords from property.txt file. Used by CCSD(T)*/CBS and similar methods to extract specific values. ''' prop_file = f'{self.inpfile}_property.txt' if not os.path.exists(prop_file): raise FileNotFoundError(f"Property file not found: {prop_file}") with open(prop_file, 'r') as f: lines = f.readlines() for keyword in self.output_keywords: keyword_name = self.input_file + '_' + keyword.lower().replace(' ', '_') for line in lines: if keyword in line: molecule.__dict__[keyword_name] = float(line.split()[-1]) break
[docs] def _setup_working_directory(self) -> str: '''Setup and return the working directory path.''' if self.working_directory is not None: tmpdirname = self.working_directory if not os.path.exists(tmpdirname): os.makedirs(tmpdirname) elif self.save_files_in_current_directory: tmpdirname = '.' else: # Create temporary directory (caller should handle cleanup) tmpdirname = tempfile.mkdtemp() return os.path.abspath(tmpdirname)
[docs] def _generate_input(self, molecule): '''Generate ORCA input file.''' is_qd_nevpt2 = self._method_cleaned.lower().startswith('qd_nevpt2') or self._method_cleaned.lower().startswith('qd-nevpt2') with open(f'{self.inpfile}.inp', 'w') as f: # Method/keywords line if is_qd_nevpt2: # QD-NEVPT2: parse basis from method string (e.g., qd_nevpt2/cc-pVDZ) if '/' in self._method_cleaned: basis = self._method_cleaned.split('/')[1] else: basis = 'def2-SVP' # default method_line = f'{basis} RIJK AUTOAUX LargePrint' elif '/' in self._method_cleaned and len(self._method_cleaned.split('/')) == 2: method_line = ' '.join(self._method_cleaned.split('/')) else: method_line = self._method_cleaned if self.additional_keywords: method_line += ' ' + ' '.join(self.additional_keywords) f.write(f'! {method_line}\n') # Calculation type keywords (separate line) - NOT for QD-NEVPT2 or custom keyword extraction (e.g., CCSD(T)*/CBS) if not is_qd_nevpt2 and not (not self.calculate_energy and self.output_keywords): calc_keywords = [] if self.calculate_energy: calc_keywords.append('SP') if self.calculate_energy_gradients: calc_keywords.append('ENGRAD') if self.calculate_hessian: calc_keywords.append('FREQ') f.write(f'! {" ".join(calc_keywords)}\n') # Parallel and memory settings f.write(f'%pal nprocs {self.nthreads} end\n') f.write(f'%maxcore {self.maxcore}\n') # TD-DFT block (only for non-CASSCF excited states) if self.nstates > 1 and not is_qd_nevpt2: f.write('%tddft\n') f.write(f' nroots {self.nstates - 1}\n') # iroot specifies which state to compute gradient for # Use gradient_state if gradients requested, otherwise current_state iroot_state = self.gradient_state if self.calculate_energy_gradients else self.current_state if iroot_state is not None and iroot_state > 0: f.write(f' iroot {iroot_state}\n') f.write(f' TDA {str(self._tda).lower()}\n') f.write('end\n') # CASSCF block for QD-NEVPT2 if is_qd_nevpt2 and self.active_space: # SCF settings - need Conventional mode for RIJK f.write('%scf\n') f.write(' SCFMode Conventional\n') f.write('end\n') nel, norb = self.active_space f.write('%casscf\n') f.write(f' nel {nel}\n') f.write(f' norb {norb}\n') f.write(f' mult {molecule.multiplicity}\n') f.write(f' nroots {self.nstates}\n') # Write any additional casscf_kwargs for key, value in self.casscf_kwargs.items(): f.write(f' {key} {value}\n') f.write(' trafostep ri\n') f.write(' PTMethod SC_NEVPT2\n') f.write(' PTSettings\n') f.write(' QDType 1\n') f.write(' end\n') f.write('end\n') # Solvation if self.solvent: if self.solvation_model and self.solvation_model.upper() == 'SMD': f.write('%cpcm\n') f.write(' smd true\n') f.write(f' smdsolvent "{self.solvent}"\n') f.write('end\n') elif self.solvation_model and self.solvation_model.upper() == 'CPCM': f.write(f'! CPCM({self.solvent})\n') # Geometry f.write(f'* xyz {molecule.charge} {molecule.multiplicity}\n') for atom in molecule.atoms: f.write(f'{atom.element_symbol:2s} {atom.xyz_coordinates[0]:14.8f} ' f'{atom.xyz_coordinates[1]:14.8f} {atom.xyz_coordinates[2]:14.8f}\n') f.write('*\n')
[docs] def _run_orca(self, workdir: str): '''Execute ORCA calculation.''' cmd = f'{self.orcabin} {self.inpfile}.inp > {self.inpfile}.out 2>&1' proc = subprocess.Popen(cmd, shell=True, cwd=workdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate()
[docs] def _parse_qd_nevpt2_output(self, molecule): '''Parse QD-NEVPT2 output from .out file.''' out_file = f'{self.inpfile}.out' if not os.path.exists(out_file): raise FileNotFoundError(f"Output file not found: {out_file}") with open(out_file, 'r') as f: lines = f.readlines() # Extract QD-NEVPT2 and CASSCF energies from "QD-NEVPT2 Results" block pt2_energies, casscf_energies = self._extract_qd_energies_from_out(lines) if not pt2_energies: raise ValueError(f"No QD-NEVPT2 Results block found in {out_file}") # Build electronic states molecule.electronic_states = [] for root_idx in sorted(pt2_energies.keys()): state_mol = data.molecule() state_mol.atoms = [atom.copy() for atom in molecule.atoms] state_mol.energy = pt2_energies[root_idx] if root_idx in casscf_energies: state_mol.casscf_energy = casscf_energies[root_idx] # Initialize gradients as NaN (not available for QD-NEVPT2) for atom in state_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() molecule.electronic_states.append(state_mol) # Set molecule energy to current state if self.current_state < len(molecule.electronic_states): molecule.energy = molecule.electronic_states[self.current_state].energy # QD-NEVPT2 gradients not implemented if self.calculate_energy_gradients: raise NotImplementedError("Analytical gradients are not available for QD-NEVPT2") # Parse Hessian if requested if self.calculate_hessian: self._parse_hessian(molecule) # Parse oscillator strengths from .out file self._parse_qd_absorption_spectrum(molecule, out_file)
[docs] def _extract_qd_energies_from_out(self, lines: list) -> tuple: '''Extract QD-NEVPT2 and CASSCF energies from .out file. Parses "QD-NEVPT2 Results" block: ROOT = 0 ... Zero Order Energy : E0 = -230.38678408697643 (CASSCF) Total Energy (E0+dE) : E = -231.11493305070951 (QD-NEVPT2) Returns: tuple: (pt2_energies dict, casscf_energies dict) ''' pt2_energies = {} casscf_energies = {} in_block = False current_root = None for line in lines: if 'QD-NEVPT2 Results' in line: in_block = True continue if in_block: line_stripped = line.strip() # Match "ROOT = 0" if line_stripped.startswith('ROOT ='): parts = line_stripped.split('=') if len(parts) >= 2: try: current_root = int(parts[1].strip()) except ValueError: current_root = None # Match "Zero Order Energy : E0 = -230.386..." elif 'Zero Order Energy' in line and current_root is not None: parts = line.split('=') if len(parts) >= 2: try: casscf_energies[current_root] = float(parts[-1].strip()) except ValueError: pass # Match "Total Energy (E0+dE) : E = -231.114..." elif 'Total Energy (E0+dE)' in line and current_root is not None: parts = line.split('=') if len(parts) >= 2: try: pt2_energies[current_root] = float(parts[-1].strip()) except ValueError: pass # End of block elif '======' in line_stripped and pt2_energies: break return pt2_energies, casscf_energies
[docs] def _parse_qd_absorption_spectrum(self, molecule, out_file: str): '''Parse oscillator strengths from QD-NEVPT2 absorption spectrum.''' if not os.path.exists(out_file): return with open(out_file, 'r') as f: lines = f.readlines() # Find last occurrence of absorption spectrum last_idx = None for i, line in enumerate(lines): if 'ABSORPTION SPECTRUM' in line: last_idx = i if last_idx is None: return excitation_energies = [] oscillator_strengths = [] # Parse spectrum table (skip header lines) for line in lines[last_idx + 5:]: if '-' * 20 in line or line.strip() == '': break parts = line.split() if len(parts) >= 8: # Column 5 is energy in cm^-1, column 7 is oscillator strength exc_energy_cm = float(parts[5]) exc_energy_hartree = exc_energy_cm * 4.556335e-6 excitation_energies.append(exc_energy_hartree) oscillator_strengths.append(float(parts[7])) if oscillator_strengths: molecule.oscillator_strengths = oscillator_strengths
[docs] def _parse_tddft_output(self, molecule): '''Parse TD-DFT/ground-state DFT output.''' if self._orca_version != 6: prop_file = f'{self.inpfile}_property.txt' else: prop_file = f'{self.inpfile}.property.txt' out_file = f'{self.inpfile}.out' # Get ground state energy gs_energy = None energy_from_prop_file = False if os.path.exists(prop_file) and self._orca_version != 6: with open(prop_file, 'r') as f: for line in f: if 'SCF Energy:' in line or 'TOTAL ENERGY' in line: gs_energy = float(line.split()[-1]) energy_from_prop_file = True break elif os.path.exists(prop_file): with open(prop_file, 'r') as f: in_scf_block = False for line in f: if '$SCF_Energy' in line: in_scf_block = True continue if in_scf_block: if '&SCF_ENERGY' in line: # Extract the float value at the end gs_energy = float(line.split()[-1]) energy_from_prop_file = True break elif line.startswith('$'): # New block started, SCF_Energy block ended break if gs_energy is None and os.path.exists(out_file): with open(out_file, 'r') as f: for line in f: if 'FINAL SINGLE POINT ENERGY' in line: gs_energy = float(line.split()[-1]) break if gs_energy is None: raise ValueError("Could not parse ground state energy") # Parse dispersion correction from .out file if energy came from property.txt # (FINAL SINGLE POINT ENERGY already includes dispersion, but SCF Energy does not) dispersion_correction = 0.0 if energy_from_prop_file and os.path.exists(out_file): with open(out_file, 'r') as f: for line in f: if 'Dispersion correction' in line: try: dispersion_correction = float(line.split()[-1]) except ValueError: pass break gs_energy += dispersion_correction # Build electronic states molecule.electronic_states = [] natoms = len(molecule.atoms) # Ground state (S0) gs_mol = data.molecule() gs_mol.atoms = [atom.copy() for atom in molecule.atoms] gs_mol.energy = gs_energy # Initialize gradients as NaN for atom in gs_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() molecule.electronic_states.append(gs_mol) # Parse excited states if TD-DFT was run if self.nstates > 1 and os.path.exists(out_file): excitation_energies, oscillator_strengths = self._parse_tddft_spectrum(out_file) for i, exc_e in enumerate(excitation_energies): state_mol = data.molecule() state_mol.atoms = [atom.copy() for atom in molecule.atoms] state_mol.energy = gs_energy + exc_e # Initialize gradients as NaN for atom in state_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() molecule.electronic_states.append(state_mol) if oscillator_strengths: molecule.oscillator_strengths = oscillator_strengths # Set molecule energy to current state # For TD-DFT: current_state=0 means S0, current_state=1 means S1, etc. state_idx = self.current_state if self.current_state < len(molecule.electronic_states) else 0 molecule.energy = molecule.electronic_states[state_idx].energy # Parse gradients for the requested gradient state if self.calculate_energy_gradients and self.gradient_state is not None: self._parse_gradients(molecule) # Store gradients in the appropriate electronic state using add_xyz_derivative_property grad_idx = self.gradient_state if grad_idx < len(molecule.electronic_states): # Get gradient array from molecule.atoms grad_array = np.array([atom.energy_gradients for atom in molecule.atoms]) molecule.electronic_states[grad_idx].add_xyz_derivative_property( grad_array, 'energy', 'energy_gradients' ) # Parse Hessian if self.calculate_hessian: self._parse_hessian(molecule)
[docs] def _parse_tddft_spectrum(self, out_file: str) -> tuple: '''Parse TD-DFT absorption spectrum from output file.''' with open(out_file, 'r') as f: lines = f.readlines() # Find absorption spectrum section last_idx = None for i, line in enumerate(lines): if 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in line: last_idx = i excitation_energies = [] oscillator_strengths = [] if last_idx is not None: for line in lines[last_idx + 5:]: if line.strip() == '': break parts = line.split() if len(parts) >= 4: # Column 1 is energy in cm^-1, column 3 is oscillator strength if self._orca_version == 6: exc_energy_cm = float(parts[4]) else: exc_energy_cm = float(parts[1]) exc_energy_hartree = exc_energy_cm * 4.556335e-6 excitation_energies.append(exc_energy_hartree) if self._orca_version == 6: oscillator_strengths.append(float(parts[6])) else: oscillator_strengths.append(float(parts[3])) return excitation_energies, oscillator_strengths
[docs] def _parse_gradients(self, molecule): '''Parse energy gradients from .engrad file.''' engrad_file = f'{self.inpfile}.engrad' if not os.path.exists(engrad_file): raise FileNotFoundError(f"Gradient file not found: {engrad_file}. ORCA may have failed to compute gradients.") with open(engrad_file, 'r') as f: lines = f.readlines() gradients = [] in_gradient_section = False for i, line in enumerate(lines): if 'The current gradient in Eh/bohr' in line: in_gradient_section = True continue if in_gradient_section: line_stripped = line.strip() # Skip comment lines starting with # if line_stripped.startswith('#'): # Check if this is the end marker (# followed by new section) # Look ahead to see if next non-empty line is a new section header if i + 1 < len(lines) and 'atomic' in lines[i + 1].lower(): break continue if line_stripped: try: # Convert from Eh/bohr to Eh/Angstrom gradients.append(float(line_stripped) * constants.Angstrom2Bohr) except ValueError: # Not a number, likely end of section break if gradients: grad_array = np.array(gradients).reshape(-1, 3) for i, atom in enumerate(molecule.atoms): atom.energy_gradients = grad_array[i].tolist() else: raise ValueError(f"No gradients found in {engrad_file}")
[docs] def _parse_hessian(self, molecule): '''Parse Hessian matrix from .hess file.''' hess_file = f'{self.inpfile}.hess' if not os.path.exists(hess_file): return with open(hess_file, 'r') as f: lines = f.readlines() # Find $hessian section start_line = None for i, line in enumerate(lines): if '$hessian' in line: start_line = i break if start_line is None: return matrix_size = int(lines[start_line + 1].strip()) hessian = np.zeros((matrix_size, matrix_size)) current_line = start_line + 2 current_col = 0 while current_col < matrix_size and current_line < len(lines): # Read column header header = lines[current_line].strip() if not header: current_line += 1 continue cols = [int(x) for x in header.split()] num_cols = len(cols) current_line += 1 # Read data rows for row in range(matrix_size): if current_line >= len(lines): break data_line = lines[current_line].strip() if not data_line: current_line += 1 continue parts = data_line.split() # First element is row index, rest are values for col_idx, col_num in enumerate(cols): if col_idx + 1 < len(parts): hessian[row, col_num] = float(parts[col_idx + 1]) current_line += 1 current_col += num_cols # Convert from Eh/bohr^2 to Eh/Angstrom^2 molecule.hessian = hessian / (constants.Bohr2Angstrom ** 2) # Also parse frequencies if available self._parse_frequencies(molecule, lines)
[docs] def _parse_frequencies(self, molecule, hess_lines: list): '''Parse vibrational frequencies from .hess file content.''' frequencies = [] ir_intensities = [] for i, line in enumerate(hess_lines): if '$vibrational_frequencies' in line: for freq_line in hess_lines[i + 2:]: if freq_line.strip() == '' or '$' in freq_line: break parts = freq_line.split() if len(parts) >= 2: frequencies.append(float(parts[-1])) if '$ir_spectrum' in line: for ir_line in hess_lines[i + 2:]: if ir_line.strip() == '' or '$' in ir_line: break parts = ir_line.split() if len(parts) >= 3: ir_intensities.append(float(parts[2])) # Remove zero frequencies (translations/rotations) if frequencies: zero_count = sum(1 for f in frequencies if f == 0) molecule.frequencies = frequencies[zero_count:] if ir_intensities: molecule.infrared_intensities = ir_intensities[zero_count:]
[docs] class ccsdtstarcbs(model): def __init__(self, **kwargs): if 'nthreads_list' in kwargs: self.nthreads_list = kwargs['nthreads_list'] if len(self.nthreads_list) != 5: print('Number of values to asign to each methods should be 5. Default nthreads will be used') self.nthreads_list = None else: self.nthreads_list = None self.successful = True if self.nthreads_list: kwargs.update({'nthreads':self.nthreads_list[0], 'input_file':'mp2_tz', 'output_keywords':['SCF Energy', 'Correlation Energy']}) else: kwargs.update({'input_file':'mp2_tz', 'output_keywords':['SCF Energy', 'Correlation Energy']}) self.mp2_tz = orca_methods(method='''RIMP2 RIJK cc-pVTZ cc-pVTZ/JK cc-pVTZ/C\n! tightscf noautostart scfconvforced miniprint nopop''', **kwargs) if self.nthreads_list: kwargs.update({'nthreads':self.nthreads_list[1], 'input_file':'mp2_qz', 'output_keywords':['SCF Energy', 'Correlation Energy']}) else: kwargs.update({'input_file':'mp2_qz', 'output_keywords':['SCF Energy', 'Correlation Energy']}) self.mp2_qz = orca_methods(method='''RIMP2 RIJK cc-pVQZ cc-pVQZ/JK cc-pVQZ/C\n! tightscf noautostart scfconvforced miniprint nopop''', **kwargs) if self.nthreads_list: kwargs.update({'nthreads':self.nthreads_list[2],'input_file':'dlpno_normal_dz', 'output_keywords':['Total Correlation Energy']}) else: kwargs.update({'input_file':'dlpno_normal_dz', 'output_keywords':['Total Correlation Energy']}) self.npno_dz = orca_methods(method='''DLPNO-CCSD(T) normalPNO RIJK cc-pVDZ cc-pVDZ/C cc-pvTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''', **kwargs) if self.nthreads_list: kwargs.update({'nthreads':self.nthreads_list[3], 'input_file':'dlpno_normal_tz','output_keywords':['Total Correlation Energy']}) else: kwargs.update({'input_file':'dlpno_normal_tz','output_keywords':['Total Correlation Energy']}) self.npno_tz = orca_methods(method='''DLPNO-CCSD(T) normalPNO RIJK cc-pVTZ cc-pVTZ/C cc-pVTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''', **kwargs) if self.nthreads_list: kwargs.update({'nthreads':self.nthreads_list[4], 'input_file':'dlpno_tight_dz','output_keywords':['Total Correlation Energy']}) else: kwargs.update({'input_file':'dlpno_tight_dz','output_keywords':['Total Correlation Energy']}) self.tpno_dz = orca_methods(method='''DLPNO-CCSD(T) tightPNO RIJK cc-pVDZ cc-pVDZ/C cc-pVTZ/JK\n! tightscf noautostart scfconvforced miniprint nopop''', **kwargs) self.combination = [self.mp2_tz, self.mp2_qz, self.npno_dz, self.npno_tz, self.tpno_dz]
[docs] def predict(self, molecular_database, **kwargs): for method in self.combination: method.predict(molecular_database=molecular_database, calculate_energy=False) for mol in molecular_database: alpha = 5.46; beta = 2.51 hf_tz = mol.mp2_tz_scf_energy hf_qz = mol.mp2_qz_scf_energy mp2_tz = mol.mp2_tz_correlation_energy mp2_qz = mol.mp2_qz_correlation_energy npno_dz = mol.dlpno_normal_dz_total_correlation_energy npno_tz = mol.dlpno_normal_tz_total_correlation_energy tpno_dz = mol.dlpno_tight_dz_total_correlation_energy E_hf_xtrap = (math.exp(-alpha * 4**0.5) * hf_tz - math.exp(-alpha * 3**0.5) * hf_qz) \ / (math.exp(-alpha * 4**0.5) - math.exp(-alpha * 3**0.5)) E_mp2_xtrap = (4**beta * mp2_qz - 3**beta * mp2_tz) / (4**beta - 3**beta) energy = E_hf_xtrap + E_mp2_xtrap - mp2_tz + npno_tz + tpno_dz - npno_dz mol.energy = energy
[docs] def parse_orca_output(filename, molecule=None): """ Parse ORCA output file and return/populate a molecule object. Automatically detects calculation type (QD-NEVPT2, TD-DFT, or ground state DFT). Also looks for .engrad and .hess files in the same directory. Args: filename: Path to ORCA output file (.out) molecule: Optional molecule object to update. If None, returns new molecule. Returns: molecule object if molecule is None, otherwise updates provided molecule Example: >>> mol = parse_orca_output('calculation.out') >>> print(mol.energy) >>> print(mol.electronic_states[1].energy) # S1 energy """ if not os.path.exists(filename): raise FileNotFoundError(f"ORCA output file not found: {filename}") with open(filename, 'r') as f: content = f.read() lines = content.split('\n') # Build new molecule from scratch mol = data.molecule() # Parse geometry from output _parse_geometry_from_content(mol, lines) # Auto-detect calculation type is_qd_nevpt2 = 'QD-NEVPT2 Results' in content is_tddft = 'TD-DFT/TDA EXCITED STATES' in content or 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in content # Get base path for auxiliary files (.engrad, .hess) base_path = filename.rsplit('.', 1)[0] current_state = 0 try: for line in lines: if 'State of interest' in line: current_state = int(line.split()[-1]) break except: current_state = 0 if is_qd_nevpt2: _parse_qd_nevpt2_from_content(mol, lines, current_state) elif is_tddft: _parse_tddft_from_content(mol, lines, current_state) else: _parse_ground_state_from_content(mol, lines) # Try to parse gradients if .engrad file exists engrad_file = f'{base_path}.engrad' if os.path.exists(engrad_file): # Default gradient_state to current_state if not specified _parse_gradients_from_file(mol, engrad_file, current_state) # Try to parse Hessian if .hess file exists hess_file = f'{base_path}.hess' if os.path.exists(hess_file): _parse_hessian_from_file(mol, hess_file) if molecule is None: return mol else: molecule.update_from(mol)
[docs] def _parse_geometry_from_content(mol, lines): """Parse molecular geometry from ORCA output.""" # Look for "CARTESIAN COORDINATES (ANGSTROEM)" section in_coords = False atoms_data = [] for i, line in enumerate(lines): if 'CARTESIAN COORDINATES (ANGSTROEM)' in line: in_coords = True continue if in_coords: if line.strip() == '' or '---' in line: if atoms_data: break continue parts = line.split() if len(parts) >= 4: try: element = parts[0] x = float(parts[1]) y = float(parts[2]) z = float(parts[3]) atoms_data.append((element, x, y, z)) except ValueError: continue # Build atoms for element, x, y, z in atoms_data: atom = data.atom(element_symbol=element) atom.xyz_coordinates = np.array([x, y, z]) mol.atoms.append(atom) # Parse charge and multiplicity from input section for line in lines: if '* xyz' in line.lower() or '*xyz' in line.lower(): parts = line.replace('*', '').split() if 'xyz' in parts[0].lower(): parts = parts[1:] if len(parts) >= 2: try: mol.charge = int(parts[0]) mol.multiplicity = int(parts[1]) except ValueError: pass break
[docs] def _parse_qd_nevpt2_from_content(mol, lines, current_state): """Parse QD-NEVPT2 energies from output content.""" pt2_energies = {} casscf_energies = {} in_block = False current_root = None for line in lines: if 'QD-NEVPT2 Results' in line: in_block = True continue if in_block: line_stripped = line.strip() if line_stripped.startswith('ROOT ='): parts = line_stripped.split('=') if len(parts) >= 2: try: current_root = int(parts[1].strip()) except ValueError: current_root = None elif 'Zero Order Energy' in line and current_root is not None: parts = line.split('=') if len(parts) >= 2: try: casscf_energies[current_root] = float(parts[-1].strip()) except ValueError: pass elif 'Total Energy (E0+dE)' in line and current_root is not None: parts = line.split('=') if len(parts) >= 2: try: pt2_energies[current_root] = float(parts[-1].strip()) except ValueError: pass elif '======' in line_stripped and pt2_energies: break if not pt2_energies: raise ValueError("No QD-NEVPT2 Results block found in output") # Build electronic states mol.electronic_states = [] for root_idx in sorted(pt2_energies.keys()): state_mol = data.molecule() state_mol.atoms = [atom.copy() for atom in mol.atoms] state_mol.energy = pt2_energies[root_idx] if root_idx in casscf_energies: state_mol.casscf_energy = casscf_energies[root_idx] # Initialize gradients as NaN for atom in state_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() mol.electronic_states.append(state_mol) # Set molecule energy to current state if current_state < len(mol.electronic_states): mol.energy = mol.electronic_states[current_state].energy # Parse oscillator strengths _parse_absorption_spectrum(mol, lines, is_qd_nevpt2=True)
[docs] def _parse_tddft_from_content(mol, lines, current_state): """Parse TD-DFT energies from output content.""" # Get ground state energy gs_energy = None for line in lines: if 'FINAL SINGLE POINT ENERGY' in line: gs_energy = float(line.split()[-1]) break if gs_energy is None: raise ValueError("Could not parse ground state energy") # Build electronic states mol.electronic_states = [] # Ground state (S0) gs_mol = data.molecule() gs_mol.atoms = [atom.copy() for atom in mol.atoms] gs_mol.energy = gs_energy for atom in gs_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() mol.electronic_states.append(gs_mol) # Parse excited states excitation_energies, oscillator_strengths = _parse_tddft_spectrum_from_lines(lines) for exc_e in excitation_energies: state_mol = data.molecule() state_mol.atoms = [atom.copy() for atom in mol.atoms] state_mol.energy = gs_energy + exc_e for atom in state_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() mol.electronic_states.append(state_mol) if oscillator_strengths: mol.oscillator_strengths = oscillator_strengths # Set molecule energy to current state state_idx = current_state if current_state < len(mol.electronic_states) else 0 mol.energy = mol.electronic_states[state_idx].energy
[docs] def _parse_ground_state_from_content(mol, lines): """Parse ground state DFT/HF energy from output content.""" gs_energy = None for line in lines: if 'FINAL SINGLE POINT ENERGY' in line: gs_energy = float(line.split()[-1]) break if gs_energy is None: raise ValueError("Could not parse ground state energy") mol.energy = gs_energy # Create single electronic state mol.electronic_states = [] gs_mol = data.molecule() gs_mol.atoms = [atom.copy() for atom in mol.atoms] gs_mol.energy = gs_energy for atom in gs_mol.atoms: atom.energy_gradients = np.full(3, np.nan).tolist() mol.electronic_states.append(gs_mol)
[docs] def _parse_tddft_spectrum_from_lines(lines): """Parse TD-DFT absorption spectrum from output lines.""" last_idx = None for i, line in enumerate(lines): if 'ABSORPTION SPECTRUM VIA TRANSITION ELECTRIC DIPOLE MOMENTS' in line: last_idx = i excitation_energies = [] oscillator_strengths = [] if last_idx is not None: for line in lines[last_idx + 5:]: if line.strip() == '': break parts = line.split() if len(parts) >= 4: exc_energy_cm = float(parts[1]) exc_energy_hartree = exc_energy_cm * 4.556335e-6 excitation_energies.append(exc_energy_hartree) oscillator_strengths.append(float(parts[3])) return excitation_energies, oscillator_strengths
[docs] def _parse_absorption_spectrum(mol, lines, is_qd_nevpt2=False): """Parse oscillator strengths from absorption spectrum.""" last_idx = None for i, line in enumerate(lines): if 'ABSORPTION SPECTRUM' in line: last_idx = i if last_idx is None: return oscillator_strengths = [] for line in lines[last_idx + 5:]: if '-' * 20 in line or line.strip() == '': break parts = line.split() if is_qd_nevpt2 and len(parts) >= 8: oscillator_strengths.append(float(parts[7])) elif not is_qd_nevpt2 and len(parts) >= 4: oscillator_strengths.append(float(parts[3])) if oscillator_strengths: mol.oscillator_strengths = oscillator_strengths
[docs] def _parse_gradients_from_file(mol, engrad_file, current_state=0): """Parse gradients from .engrad file and add to electronic state.""" with open(engrad_file, 'r') as f: lines = f.readlines() gradients = [] in_gradient_section = False for i, line in enumerate(lines): if 'The current gradient in Eh/bohr' in line: in_gradient_section = True continue if in_gradient_section: line_stripped = line.strip() if line_stripped.startswith('#'): if i + 1 < len(lines) and 'atomic' in lines[i + 1].lower(): break continue if line_stripped: try: gradients.append(float(line_stripped) * constants.Angstrom2Bohr) except ValueError: break if gradients and mol.atoms: grad_array = np.array(gradients).reshape(-1, 3) # Add to top-level molecule atoms for i, atom in enumerate(mol.atoms): if i < len(grad_array): atom.energy_gradients = grad_array[i].tolist() # Add to the appropriate electronic state using add_xyz_derivative_property if hasattr(mol, 'electronic_states') and current_state < len(mol.electronic_states): mol.electronic_states[current_state].add_xyz_derivative_property( grad_array, 'energy', 'energy_gradients' )
[docs] def _parse_hessian_from_file(mol, hess_file): """Parse Hessian matrix from .hess file.""" with open(hess_file, 'r') as f: lines = f.readlines() # Find $hessian section start_line = None for i, line in enumerate(lines): if '$hessian' in line: start_line = i break if start_line is None: return matrix_size = int(lines[start_line + 1].strip()) hessian = np.zeros((matrix_size, matrix_size)) current_line = start_line + 2 current_col = 0 while current_col < matrix_size and current_line < len(lines): header = lines[current_line].strip() if not header: current_line += 1 continue cols = [int(x) for x in header.split()] num_cols = len(cols) current_line += 1 for row in range(matrix_size): if current_line >= len(lines): break data_line = lines[current_line].strip() if not data_line: current_line += 1 continue parts = data_line.split() for col_idx, col_num in enumerate(cols): if col_idx + 1 < len(parts): hessian[row, col_num] = float(parts[col_idx + 1]) current_line += 1 current_col += num_cols mol.hessian = hessian / (constants.Bohr2Angstrom ** 2) # Parse frequencies _parse_frequencies_from_lines(mol, lines)
[docs] def _parse_frequencies_from_lines(mol, hess_lines): """Parse vibrational frequencies from .hess file content.""" frequencies = [] ir_intensities = [] for i, line in enumerate(hess_lines): if '$vibrational_frequencies' in line: for freq_line in hess_lines[i + 2:]: if freq_line.strip() == '' or '$' in freq_line: break parts = freq_line.split() if len(parts) >= 2: frequencies.append(float(parts[-1])) if '$ir_spectrum' in line: for ir_line in hess_lines[i + 2:]: if ir_line.strip() == '' or '$' in ir_line: break parts = ir_line.split() if len(parts) >= 3: ir_intensities.append(float(parts[2])) if frequencies: zero_count = sum(1 for f in frequencies if f == 0) mol.frequencies = frequencies[zero_count:] if ir_intensities: mol.infrared_intensities = ir_intensities[zero_count:]