Source code for pygcc.read_db

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 17 16:02:22 2021

@author: adedapo.awolayo and Ben Tutolo, University of Calgary

Copyright (c) 2020 - 2021, Adedapo Awolayo and Ben Tutolo, University of Calgary

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""

import re, os
import textwrap
J_to_cal = 4.184

# from sys import platform

[docs]def findcodecs(filename):
    """Function to find the name of the encoding used to decode or encode any file    """
    data = open(filename, "rb").read()
    # data = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), filename), 'rb').read()
    all_codecs = ['ascii', 'latin_1', 'utf_8']
    f = [0]*len(all_codecs)
    for j, i in enumerate(all_codecs):
        try:
            decoded = data.decode(i)
        except UnicodeDecodeError:
            f[j] = False
        else:
            for ch in decoded:
                if i == 'utf_8' and 0xD800 <= ord(ch) <= 0xDFFF:
                    f[j] = False
            f[j] = True
    if all(f) == True:
        return None
    else:
        return all_codecs[1]
        # if platform == "darwin": # OS X
        #     return all_codecs[1]
        # elif platform in ["linux", "linux2", "win32"]:   # linux and # Windows...
        #     return None


[docs]class db_reader:
    """Class to read direct-access and source thermodynamic database

    Parameters
    ----------
        dbaccess : string
            filename and location of the direct-access/sequential-access database, optional, default is speq21
        dbBerman_dir : string
            filename and location of the Berman mineral database, optional
        dbHP_dir : string
            filename and location of the supcrtbl mineral and gas database, optional
        sourcedb : string
            filename of the source database, optional
        sourcedb : string
            filename of the source database, optional
        sourceformat : string
            specify the source database format, either 'GWB' or 'EQ36', optional
        dbaccessformat : string, optional
            specify the direct-access/sequential-access database format, either 'speq' or 'supcrtbl', default is 'speq'
        sourcedb_codecs : string
            specify the name of the encoding used to decode or encode the sourcedb file, optional
        dbaccess_codecs : string
            specify the name of the encoding used to decode or encode the dbaccess file, optional

    Returns
    -------
        dbaccessdic : dict
            dictionary of minerals, gases, redox and aqueous species     \n
        dbaccess : string
            direct-access database file name     \n
        sourcedic : dict
            dictionary of reaction coefficients and species   \n
        specielist : list
            list of species segmented into the different categories [element, basis, redox, aqueous, minerals, gases, oxides]   \n
        speciecat : list
            list of species categories listed in 'specielist'   \n
        chargedic : dict
            dictionary of charges of species   \n
        MWdic : dict
            dictionary of MW of species   \n
        Mineraltype : dict
            mineral type for minerals   \n
        fugacity_info : dict
            fugacity information as stored in new tdat database for chi and critical ppts   \n
        Sptype : dict
            specie types and eq3/6 and revised date info   \n
        Elemlist : dict
            dictionary of elements and coefficients   \n

    Examples
    --------
    >>> ps = db_reader(sourcedb = './default_db/thermo.com.dat',
                       sourceformat = 'gwb',
                       dbaccess = './default_db/speq21.dat')
    >>> ps.sourcedic, ps.dbaccessdic, ps.specielist


    """
    kwargs = {"dbaccess": None,
              "dbBerman_dir": None,
              "dbHP_dir": None,
              "sourcedb": None,
              "sourceformat": None,
              "dbaccessformat": 'speq',
              "sourcedb_codecs": None,
              "dbaccess_codecs": None}

    def __init__(self, **kwargs):
        self.kwargs = db_reader.kwargs.copy()
        self.__calc__(**kwargs)

[docs]    def __calc__(self, **kwargs):
        self.kwargs.update(kwargs) 
        if self.kwargs["dbaccess"] == 'speq23':
    	    self.dbaccess_dir = './default_db/speq23.dat'
    	    self.dbaccess_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.dbaccess_dir)
        elif self.kwargs["dbaccess"] == 'speq23_dimer':
    	    self.dbaccess_dir = './default_db/speq23_dimer.dat'
    	    self.dbaccess_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.dbaccess_dir)
        elif self.kwargs["dbaccess"] == 'speq21_dimer':
    	    self.dbaccess_dir = './default_db/speq21_dimer.dat'
    	    self.dbaccess_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.dbaccess_dir)
        elif self.kwargs['dbaccess'] is None:
            self.dbaccess_dir = './default_db/speq21.dat'
            self.dbaccess_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.dbaccess_dir)
        else:
            self.dbaccess_dir = self.kwargs["dbaccess"]
        self.dbaccessformat = self.kwargs["dbaccessformat"]
        self.dbBerman_dir = self.kwargs["dbBerman_dir"]
        self.dbHP_dir = self.kwargs["dbHP_dir"]
        self.sourcedb_dir = None if self.kwargs["sourcedb"] is None else self.kwargs["sourcedb"]

        if self.kwargs["sourceformat"] is not None:
            if self.kwargs["sourceformat"].lower() == 'gwb': # options for all default database included with PyGCC
                if self.kwargs["sourcedb"] == 'thermo.com':
                    self.sourcedb = './default_db/thermo.com.dat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                elif self.kwargs["sourcedb"] == 'thermo.2021':
                    self.sourcedb = './default_db/thermo.2021.dat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                elif self.kwargs["sourcedb"] == 'thermo_latest':
                    self.sourcedb = './default_db/thermo_latest.tdat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                elif self.kwargs["sourcedb"] == 'thermo_cemdata_mar':
                    self.sourcedb = './default_db/thermo_cemdata_mar.tdat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                elif self.kwargs["sourcedb"] is None:
                    self.sourcedb = './default_db/thermo.com.tdat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                else:
                    self.sourcedb_dir = self.kwargs["sourcedb"]
            elif self.kwargs["sourceformat"].lower() == 'eq36':
                if self.kwargs["sourcedb"] == 'data0' or self.kwargs["sourcedb"] is None:
                    self.sourcedb = './default_db/data0.dat'
                    self.sourcedb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.sourcedb)
                else:
                    self.sourcedb_dir = self.kwargs["sourcedb"]

        if self.kwargs['dbaccess_codecs'] is None:
            self.dbaccess_codecs = findcodecs(self.dbaccess_dir)
        else:
            self.dbaccess_codecs = self.kwargs['dbaccess_codecs']
        if self.kwargs["sourcedb"] is not None and self.kwargs['sourcedb_codecs'] is None:
            self.sourcedb_codecs = findcodecs(self.sourcedb_dir)
        else:
            self.sourcedb_codecs = self.kwargs['sourcedb_codecs']
        if self.dbaccess_dir is not None:
            self.dbaccess = self.dbaccess_dir.split('/')[-1]
        if self.dbaccess_dir is not None:
            self.readAqspecdb() #
        if self.sourcedb_dir is not None:
            if self.kwargs["sourceformat"].lower() == 'gwb':
                self.readSourceGWBdb()
            elif self.kwargs["sourceformat"].lower() == 'eq36':
                self.readSourceEQ36db()

[docs]    def readAqspecdb(self):
        """
        This function reads direct access thermodynamic database and can add other database sources
        at the bottom returns all constants of Maier-Kelley power function  for minerals and gases
        (dG [cal/mol], dH [cal/mol], S [cal/mol-K], V [cm3/mol] a [cal/mol-K], b [*10**3 cal/mol/K^2],
         c [*10^-5 cal/mol/K], Ttrans [K], Htr [cal/mol], Vtr [cm³/mol], dPdTtr [bar/K] ) and
        aqueous species (dG [cal/mol], dH [cal/mol], S [cal/mol-K], V [cm3/mol], a1 [*10 cal/mol/bar],
        a2 [*10**-2 cal/mol], a3 [cal-K/mol/bar], a4 [*10**-4 cal-K/mol], c1 [cal/mol/K], c2 [*10**-4 cal-K/mol],
        ω [*10**-5 cal/mol] ) packed into a dbacess dictionary. In addition, the function can read Berman's
        mineral properties such as (dG [J/mol], dH [J/mol], S [J/mol-K], V [cm³/mol], k0, k1, k2, k3,
        v1 [*10^5 K^-1], v2 [*10^5 K^-2], v3 [*10^5 bar^-1], v4 [*10^8 bar^-2], dTdP [K/bar], Tlambda [K],
        Tref [K], l1 [(J/mol)^0.5/K], l2 [(J/mol)^0.5/K^2], DtH, d0 [J/mol], d1 [J/mol], d2 [J/mol],
        d3 [J/mol], d4 [J/mol], d5 [J/mol], Tmin [K], Tmax [K]). In addition, the function can read supcrtbl's
        mineral and gas properties such as (dG [kJ/mol], dH [kJ/mol], S [J/mol-K], V [J/bar], a [kJ/mol-K],
        b [*10^5 kJ/mol/K^2], c [kJ-mol-K], d [kJ/mol/K^0.5], alpha [*10^5 K^-1], kappa0 [kbar],
        kappa0_d [kbar], kappa0_dd [kbar], n_atom [-], Tc0 [K], Smax [J/mol-K], Vmax [J/bar], dH [KJ/mol],
        dV [J/bar], W [kJ/mol], Wv [J/bar], n [-], SF [-]) \n
        Parameters
        ----------
            dbaccess        filename and location of the direct-access database     \n
            dbBerman_dir    filename and location of the Berman mineral database     \n
            dbHP_dir        filename and location of the supcrtbl (Holland and Powell) mineral and gas database     \n
        Returns
        ----------
            dbaccessdic      dictionary of minerals, gases, redox and aqueous species     \n
            dbaccess        dat file name     \n
        Usage:
        ----------
        [dbaccessdic, dbname] = readAqspecdb(dbaccess)
        """
        # check if its a single liner data
        codecs = self.dbaccess_codecs
        with open(self.dbaccess_dir, encoding = codecs) as g:
            Rd = g.readlines()
        header_counter = [p for p, k in enumerate(Rd) if k.startswith('*************')]

        def multiline_reader(Rd, counter, dbaccess_dir):
            db_dic = {}; last_gas = ''
            for i in range(len(Rd)): #
                s1 = Rd[i].rstrip('\n').strip()
                if (not s1.lstrip().startswith(('*', '!'))) and (s1.lstrip('0123456789.- \t') != ""):
                    if (s1[:3] != 'ref') and (s1[:3] != 'REF') and (s1.split()[0] not in ['minerals', 'gases', 'gas', 'aqueous', 'abandoned']):
                        name = s1.strip().split()[0]
                        name = name.replace('+4', '++++') if name.endswith('+4') else name.replace('+3', '+++') if name.endswith('+3') else name.replace('+2', '++') if name.endswith('+2') else name.replace('-4', '----') if name.endswith('-4') else name.replace('-3', '---') if name.endswith('-3') else name.replace('-2', '--') if name.endswith('-2') else name
                        if self.dbaccessformat.lower() == 'supcrtbl':
                            name = name.title() if not name.endswith(('+', '-', ",aq", "(S)", "(am)",
                                                                      'dis', 'ord')) else name.capitalize() if 'ACID' in name else name
                            name = name.replace(",aq", "(aq)").replace(",G", "(g)").replace("(S)", "(s)").replace("(Am)", "(am)").replace("(Alpha)", "(alpha)").replace("(Beta)", "(beta)")
                            name = name.replace("-acid", "_acid").replace("(High)", "_high").replace("(Low)", "_low").replace(" anhyd", "_anhyd").replace(" hydr", "_hydr")
                            name = name.replace("-Lo", "_low").replace("(-Hi)", "_high").replace("(oh)", "(OH)").replace("(Oh)", "(OH)").replace("(G)", "(g)").replace("(Ordered)", "-ord")

                        if len(s1.split()) > 1:
                            formula = s1.split()[1]
                        else:
                            formula = ''
                        s2 = Rd[i + 1].strip()
                        dates = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
                        if (s2[:3] != 'ref') and (s2[:3] != 'REF') and (s2[0].lstrip('0123456789.,- ') != '') and any(k in s2.split()[-1].lower() for k in dates) == False:
                            s3 = Rd[i+2]; s4 = Rd[i+3]; s5 = Rd[i+4];
                            if i >= len(Rd) - 5:
                                s6 = 'Null'; s7 = 'Null'; s8 = 'Null';
                                s9 = 'Null'; s10 = 'Null'; s11 = 'Null';
                            elif i >= len(Rd) - 6:
                                s6  = Rd[i + 5]; s7 = 'Null'; s8 = 'Null';
                                s9 = 'Null'; s10 = 'Null'; s11 = 'Null';
                            else:
                                s6  = Rd[i + 5]; s7 = Rd[i + 6]; s8 = Rd[i + 7];
                                s9 = Rd[i + 8]; s10 = Rd[i + 9]; s11 = Rd[i + 10];

                            if s3.strip()[:3] == 'ref':
                                ref = s3#.split()[0][4:]
                            else:
                                ref = s3#.split()[0]
                            if (s6.lower().islower() == True) | s6.startswith('*', 0):
                                params = s4.split() + s5.split()
                            elif (s7.lower().islower() == True) | s7.startswith('*', 0):
                                params = s4.split() + s5.split() + s6.split()
                            elif (s8.lower().islower() == True) | s8.startswith('*', 0):
                                params = s4.split() + s5.split() + s6.split() + s7.split()
                            elif (s9.lower().islower() == True) | s9.startswith('*', 0):
                                params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split()
                            elif (s10.lower().islower() == True) | s10.startswith('*', 0):
                                params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                    s9.split()
                            elif (s11.lower().islower() == True) | s11.startswith('*', 0):
                                params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                    s9.split() + s10.split()
                            else:
                                params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                    s9.split() + s10.split() + s11.split()
                            params = [float(i) if float(i) != 999999 else 0 for i in params]
                            counter += 1
                            if name in db_dic.keys():
                                print('Duplicate found for species "%s" in %s' % (name, dbaccess_dir.split('/')[-1]))
                                continue
                            else:
                                db_dic[name] = [formula, ref] + params

                            if len(s5.split()) != 0 and len(s6.split()) and (s5.strip() == '' and s6.strip() == ''):
                                break
                            elif len(s7.split()) != 0 and len(s6.split()) and (s6.strip() == '' and s7.strip() == ''):
                                break
                            elif len(s7.split()) != 0 and len(s8.split()) and (s7.strip() == '' and s8.strip() == ''):
                                break
                            elif len(s8.split()) != 0 and len(s9.split()) and (s8.strip() == '' and s9.strip() == ''):
                                break
                            elif len(s9.split()) != 0 and len(s10.split()) and (s9.strip() == '' and s10.strip() == ''):
                                break
                            elif len(s6.split()) != 0 and (s6.split()[-1] in ['(nmin1)', '(nmin2)', '(nmin3)', '(nmin4)', '(ngas)', '(naqs)']):
                                break
                            elif len(s7.split()) != 0 and (s7.split()[-1] in ['(nmin1)', '(nmin2)', '(nmin3)', '(nmin4)', '(ngas)', '(naqs)']):
                                break
                            elif len(s8.split()) != 0 and (s8.split()[-1] in ['(nmin1)', '(nmin2)', '(nmin3)', '(nmin4)', '(ngas)', '(naqs)']) :
                                break
                            elif len(s9.split()) != 0 and (s9.split()[-1] in ['(nmin1)', '(nmin2)', '(nmin3)', '(nmin4)', '(ngas)', '(naqs)']) :
                                break
                            elif len(s10.split()) != 0 and (s10.split()[-1] in ['(nmin1)', '(nmin2)', '(nmin3)', '(nmin4)', '(ngas)', '(naqs)']):
                                break
                            elif s9.strip('*').strip() == '' and s10.strip('*').strip() == '' and s11.strip('*').strip() == '':
                                break
                if s1.split()[0] in ['gases', 'gas']:
                    last_mineral = list(db_dic.keys())[-1] # aqueous
                if s1.lstrip().split()[0] in ['aqueous']:
                    last_gas = list(db_dic.keys())[-1]

            # last_gas = '' if last_gas is None else last_gas
            return db_dic, last_mineral, last_gas


        self.dbaccessdic = {}; counter = 0
        # if it is single line data like for dpeq20
        if len(Rd) <= 1:
            width = re.search('                     3 ', Rd[0]).start()
            Rd = textwrap.wrap(Rd[0], width=width)

            for i in range(len(Rd)): #
                s1 = Rd[i]
                if not s1.startswith(('*', ' '),0) | (s1.rstrip('\n').lstrip('0123456789.- ') == "") | (s1[0] == "-") :
                    if (s1.strip()[:3] != 'ref') and (s1.strip()[:3] != 'REF'):
                        name = s1.strip().split()[0]
                        if len(s1.strip().split()) > 1:
                            formula = s1.strip().split()[1]
                        else:
                            formula = ''
                        s2 = Rd[i + 1]
                        if (s2.strip()[:3] != 'ref') and (s2.strip()[:3] != 'REF') and (s2.split()[0].lstrip('0123456789.,- ') != ''):
                            s3 = Rd[i+2]; s4 = Rd[i+3]; s5 = Rd[i+4];
                            if s3.split()[0].lstrip('0123456789.,- ') != '':
                                if i >= len(Rd) - 5:
                                    s6 = 'Null'; s7 = 'Null'; s8 = 'Null';
                                    s9 = 'Null'; s10 = 'Null'; s11 = 'Null';
                                elif i >= len(Rd) - 6:
                                    s6  = Rd[i + 5]; s7 = 'Null'; s8 = 'Null';
                                    s9 = 'Null'; s10 = 'Null'; s11 = 'Null';
                                else:
                                    s6  = Rd[i + 5]; s7 = Rd[i + 6]; s8 = Rd[i + 7];
                                    s9 = Rd[i + 8]; s10 = Rd[i + 9]; s11 = Rd[i + 10];

                                if s3.strip()[:3] == 'ref':
                                    ref = s3#.split()[0][4:]
                                else:
                                    ref = s3#.split()[0]
                                if (s6.lower().islower() == True) | (s6.strip().split()[0] == name):
                                    params = s4.split() + s5.split()
                                elif (s7.lower().islower() == True) | (s7.strip().split()[0] == name):
                                    params = s4.split() + s5.split() + s6.split()
                                elif (s8.lower().islower() == True) | (s8.strip().split()[0] == name):
                                    params = s4.split() + s5.split() + s6.split() + s7.split()
                                elif (s9.lower().islower() == True) | (s9.strip().split()[0] == name):
                                    params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split()
                                elif (s10.lower().islower() == True) | (s10.strip().split()[0] == name):
                                    params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                        s9.split()
                                elif (s11.lower().islower() == True) | (s11.strip().split()[0] == name):
                                    params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                        s9.split() + s10.split()
                                else:
                                    params = s4.split() + s5.split() + s6.split() + s7.split() + s8.split() + \
                                        s9.split() + s10.split() + s11.split()
                                params = [float(i) if float(i) != 999999 else 0 for i in params]
                                counter += 1
                                if name in self.dbaccessdic.keys():
                                    print('Duplicate found for species "%s" in %s' % (name, self.dbaccess_dir.split('/')[-1]))
                                    continue
                                else:
                                    self.dbaccessdic[name] = [formula, ref] + params
        else:
            # for multi line data like for speq21
            self.dbaccessdic, last_mineral, last_gas = multiline_reader(Rd, counter, self.dbaccess_dir)

        if self.dbBerman_dir is not None:
            codecs = findcodecs(self.dbBerman_dir)
            mineral_list = list(self.dbaccessdic.keys())[:list(self.dbaccessdic.keys()).index(last_mineral)+1]
            self.dbaccessdic = {k: v for k, v in self.dbaccessdic.items() if k not in mineral_list}
            specie_name = []
            with open(self.dbBerman_dir, encoding = codecs) as g:
                for i, line in enumerate(g, 1):
                    if line.strip('*').startswith('COMMENTS'):
                        break
                    if (not line.lstrip().startswith(('!', 'ST', 'C1', 'C2', 'C3', 'D1', 'D2',
                                                      'T1', 'T2', 'V1', '*'))) and (line.lstrip('0123456789.- \t\n') != ""):
                        specie_name.append(line.strip().split()[0])

            gid = open(self.dbBerman_dir, 'r', encoding = codecs)
            for i in range(5000):
                s1 = gid.readline()
                if s1.strip('*').startswith('MINERAL DATA'):
                    break

            s4 = '0'; s5 = '0'; s6 = '0'
            for i in range(5000):
                s1 = s4 if s4.strip().split()[0] in specie_name else s5 if s5.strip().split()[0] in specie_name else s6 if s6.strip().split()[0] in specie_name else gid.readline()
                if s1.strip('*').startswith('COMMENTS')|s6.strip('*').startswith('COMMENTS')|s5.strip('*').startswith('COMMENTS'):
                    break
                if (not s1.lstrip().startswith(('!', 'ST', 'C1', 'C2', 'C3', 'D1', 'D2', 'T1', 'T2', 'V1'))) and (s1.lstrip('0123456789.- \t\n') != ""):
                    name = s1.strip().split()[0]
                    if len(s1.split()) > 1:
                        formula = s1.split()[1]
                    else:
                        formula = ''
                    s2 = gid.readline(); s2 = gid.readline() if s2.lstrip().startswith('!') else s2;
                    s3 = gid.readline()
                    s4 = gid.readline() if (s3.strip().split()[0] not in specie_name) else '0'
                    s4 = s4 + '0' if s4.strip() == '' else s4
                    s5 = gid.readline() if s4.strip().split()[0] not in specie_name else '0'
                    s5 = s5 + '0' if s5.strip() == '' else s5
                    s6 = gid.readline() if s5.strip().split()[0] not in specie_name else '0'
                    s6 = s6 + '0' if s6.strip() == '' else s6
                    params = s2.split()[1:-1] if s2.lstrip().startswith('ST') else s3.split()[1:-1] if s3.lstrip().startswith('ST') else s4.split()[1:-1] if s4.lstrip().startswith('ST') else s5.split()[1:-1] if s5.lstrip().startswith('ST') else [0]*4
                    params += s3.split()[1:-1] if s3.lstrip().startswith('C1') else s4.split()[1:-1] if s4.lstrip().startswith('C1') else s5.split()[1:-1] if s5.lstrip().startswith('C1') else s6.split()[1:-1] if s6.lstrip().startswith('C1') else [0]*4
                    # params += s3.split()[1:-2] if s3.lstrip().startswith('C2') else s4.split()[1:-2] if s4.lstrip().startswith('C2') else s5.split()[1:-2] if s5.lstrip().startswith('C2') else s6.split()[1:-2] if s6.lstrip().startswith('C2') else [0]*3
                    params += s3.split()[1:-1] if s3.lstrip().startswith('V1') else s4.split()[1:-1] if s4.lstrip().startswith('V1') else s5.split()[1:-1] if s5.lstrip().startswith('V1') else s6.split()[1:-1] if s6.lstrip().startswith('V1') else [0]*4
                    params += s3.split()[1:2] if s3.lstrip().startswith('T2') else s4.split()[1:2] if s4.lstrip().startswith('T2') else s5.split()[1:2] if s5.lstrip().startswith('T2') else s6.split()[1:2] if s6.lstrip().startswith('T2') else [0]
                    params += s3.split()[1:] if s3.lstrip().startswith('T1') else s4.split()[1:] if s4.lstrip().startswith('T1') else s5.split()[1:] if s5.lstrip().startswith('T1') else s6.split()[1:] if s6.lstrip().startswith('T1') else [0]*5
                    params += s3.split()[1:-1] if s3.lstrip().startswith('D1') else s4.split()[1:-1] if s4.lstrip().startswith('D1') else s5.split()[1:-1] if s5.lstrip().startswith('D1') else s6.split()[1:-1] if s6.lstrip().startswith('D1') else []
                    params += s3.split()[1:-1] if s3.lstrip().startswith('D2') else s4.split()[1:-1] if s4.lstrip().startswith('D2') else s5.split()[1:-1] if s5.lstrip().startswith('D2') else s6.split()[1:-1] if s6.lstrip().startswith('D2') else []
                    ref = 'Berman_1988'
                    params = [float(i) if float(i) != 999999 else 0 for i in params]
                    # print(name, params)
                    if name in self.dbaccessdic.keys():
                        print('Duplicate found for species "%s" in %s' % (name, self.dbBerman_dir.split('/')[-1]))
                        continue
                    else:
                        self.dbaccessdic[name] = [formula, ref] + params
            gid.close()

        elif self.dbHP_dir is not None:
            codecs = findcodecs(self.dbHP_dir)
            mineralgas_list = list(self.dbaccessdic.keys())[:list(self.dbaccessdic.keys()).index(last_gas)+1]
            self.dbaccessdic = {k: v for k, v in self.dbaccessdic.items() if k not in mineralgas_list}
            with open(self.dbHP_dir, encoding = codecs) as g:
                Rd = g.readlines()
            self.dbaccessdic.update(multiline_reader(Rd, 0, self.dbHP_dir)[0])

        # read in the header reference list
        if len(header_counter) != 0:
            header_lines = Rd[header_counter[0]+1:header_counter[1]]
            if len(header_lines) > 1:
                checker = [i for i, x in enumerate(header_lines) if ' ... ' in x.lstrip()]; ref_list = []
                for i in range(len(checker)-1):
                    p = ''
                    for k in range(checker[i],checker[i+1]):
                        p = p + header_lines[k].lstrip('*').lstrip().strip('\n')
                    ref_list.append(p)
                self.header_ref = dict(zip(*[iter([item.strip() for sublist in [k.split(' ... ') for k in ref_list] for item in sublist])]*2))

        #%% other sources aside speq20 for solid solution calculation
        #dG dH S V a1 a2 a3 a4 a5
        # dG, dH, S from Arnorsson 1999, V and Cp from Robie and Hemingway #1995
        _Anorthite_ = ['Ca(Al2Si2)O8', 'R&H95, Stef2001',-4002095, -4227830, 199.30, 100.790*J_to_cal,
                       5.168e2, -9.249e-2, -1.408e6, -4.589e3, 4.188e-5]
        self.dbaccessdic['ss_Anorthite'] = [x/J_to_cal if type(x)!=str else x for x in _Anorthite_ ]

        self.dbaccessdic['ss_Albite_high'] = ['NaAlSi3O8', 'R&H95, Stef2001', -887368.32+8413/J_to_cal,
                                        -940769.52, 224.14/J_to_cal, 100.07, 139.56, -0.0221916826,
                                        401051.6, -1535.37, 5.430210e-06]

        _K_feldspar_ = ['KAlSi3O8', 'R&H95, A&S99', -3745958, -3965730, 232.90, 108.960*J_to_cal,
                        6.934e2, -1.717e-1, 3.462e6, -8.305e3, 4.919e-5]
        self.dbaccessdic['ss_K-feldspar'] = [x/J_to_cal if type(x)!=str else x for x in _K_feldspar_ ]

        _Ferrosilite_ = ['FeSiO3', 'R&H95, Stef2001', -1118000, -1195200, 94.6, 33.0*J_to_cal,
                       1.243e2, 1.454e-2, -3.378e6, 0, 0]
        self.dbaccessdic['ss_Ferrosilite'] = [x/J_to_cal if type(x)!=str else x for x in _Ferrosilite_]

        _Enstatite_=['MgSiO3', 'R&H95, Stef2001', -1458300, -1545600,  66.3, 31.31*J_to_cal,
                     3.507e2, -1.472e-1, 1.769e6, -4.296e3, 5.826e-5]
        self.dbaccessdic['ss_Enstatite'] = [x/J_to_cal if type(x)!=str else x for x in _Enstatite_]

        _Clinoenstatite_=['MgSiO3', 'R&H95, Stef2001', -1458100, -1545000,  67.9, 31.28*J_to_cal,
                     2.056e2, -1.280e-2, 1.193e6, -2.298e3, 0]
        self.dbaccessdic['ss_Clinoenstatite'] = [x/J_to_cal if type(x)!=str else x for x in _Clinoenstatite_]

        _Hedenbergite_=['CaFeSi2O6', 'R&H95, Stef2001', -2676300, -2839900,  174.2, 67.950*J_to_cal,
                     3.1046e2, 1.257e-2, -1.846e6, -2.040e3, 0]
        self.dbaccessdic['ss_Hedenbergite'] = [x/J_to_cal if type(x)!=str else x for x in _Hedenbergite_]

        _Diopside_=['CaMgSi2O6', 'R&H95, Stef2001', -3026800, -3201500,  142.7 , 66.090*J_to_cal,
                     4.7025e2, -9.864e-2, 0.2454e6, -4.823e3, 2.813e-5]
        self.dbaccessdic['ss_Diopside'] = [x/J_to_cal if type(x)!=str else x for x in _Diopside_]

        #dG dH S V a1 a2 a3 a4 a5
        _Forsterite_ = ['Mg2SiO4', 'R&H95, Stef2001', -2053600, -2171850, 94.1, 43.65*J_to_cal,
                      8.736e1, 8.717e-2, -3.699e6, 8.436e2, -2.237e-5]
        self.dbaccessdic['ss_Forsterite'] = [x/J_to_cal if type(x)!=str else x for x in _Forsterite_]

        _Fayalite_ = ['Fe2SiO4', 'R&H95, Stef2001', -1379100, -1478200, 151, 46.31*J_to_cal, 1.7602e2,
                    -8.808e-3, -3.889e6, 0, 2.471e-5]
        self.dbaccessdic['ss_Fayalite'] = [x/J_to_cal if type(x)!=str else x for x in _Fayalite_]

        _Fluorapatite_ = ['Ca5(PO4)3F', 'R&H95', -6489700, -6872000, 387.9, 157.56*J_to_cal,
                      7.543e2, -3.026e-2, -0.9084e6, -6.201e3, 0]
        if 'Fluorapatite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Fluorapatite'] = [x/J_to_cal if type(x)!=str else x for x in _Fluorapatite_]

        _Hydroxyapatite_ = ['Ca5(OH)(PO4)3', 'R&H95', -6337100, -6738500, 390.4, 159.6*J_to_cal,
                            3.878e2, 11.186e-2, -12.70e6, 1.811e3, 0]
        if 'Hydroxyapatite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Hydroxyapatite'] = [x/J_to_cal if type(x)!=str else x for x in _Hydroxyapatite_]

        if 'Ankerite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Ankerite'] = ['CaFe(CO3)2', 'H&P2011               31.DEC.11\n', -434945.7,
                                            -471178.3, 45.043, 66.060,  81.500956, -0.277486, 0, -730.114720, 0]

        if 'Acmite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Acmite'] = ['NaFeSi2O6', 'H&P2011               31.DEC.11\n', -577476.5,
                                          -617454.6, 40.774, 64.590,  1.994e2, 6.197e-2, -4.267e6, 0, 0]

        _Annite_ = ['KFe3AlSi3O10(OH)2', 'R&H95', -4798300, -5149300, 415.0, 154.3*J_to_cal,
                    6.366e2, 8.208e-2, -4.860e6, -3.731e3, 0]
        self.dbaccessdic['ss_Annite'] = [x/J_to_cal if type(x)!=str else x for x in _Annite_]

        _Phlogopite_ = ['KMg3AlSi3O10(OH)2', 'R&H95', -5860500, -6246000, 315.9, 149.65*J_to_cal,
                        8.639e2, -7.6076e-2, 3.5206e5, -8.470e3, 0]
        self.dbaccessdic['ss_Phlogopite'] = [x/J_to_cal if type(x)!=str else x for x in _Phlogopite_]
                                        #dG dH S V a1 a2 a3 a4 a5
        _Molybdenite_ = ['MoS2', 'R&H95', -262800, -271800, 62.6, 32.02*J_to_cal,
                        1.045e2, -4.812e-3, -6.291e3, -6.817e2, 0]
        if 'Molybdenite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Molybdenite'] = [x/J_to_cal if type(x)!=str else x for x in _Molybdenite_]

        _Molybdite_ = ['MoO3', 'R&H95', -668100, -745200, 77.7, 30.56*J_to_cal,
                        6.433e0, 6.278e-2, -2.46e6, 1.337e3, 0]
        if 'Molybdite' not in self.dbaccessdic.keys():
            self.dbaccessdic['Molybdite'] = [x/J_to_cal if type(x)!=str else x for x in _Molybdite_]

        return

[docs]    def readSourceGWBdb(self):
        """
        This function reads source GWB thermodynamic database and reaction coefficients of 'eh'
        and 'e-' has been added at the bottom returns all reaction coefficients and species,
        group species into redox, minerals, gases, oxides and aqueous species
        Parameters
        ----------
            sourcedb      :     filename of the source database
        Returns
        ----------
            sourcedic     :     dictionary of reaction coefficients and species
            specielist    :     list of species segmented into the different categories
                                [element, basis, redox, aqueous, minerals, gases, oxides]
            speciecat     :     list of species categories listed in 'specielist'
            chargedic     :     dictionary of charges of species
            MWdic         :     dictionary of MW of species
            Mineraltype   :     mineral type for minerals
            fugacity_info :     fugacity information as stored in new tdat database for chi and critical ppts
        Usage:
        ----------
        [sourcedic, specielist, chargedic, MWdic, Mineraltype, fugacity_info, activity_model] = readSourceGWBdb()
        """
        codecs = self.sourcedb_codecs
        with open(self.sourcedb_dir, encoding = codecs) as g:
            for line in g:
                if line.startswith('activity model'):
                    break

        with open(self.sourcedb_dir, encoding = codecs) as g:
            Rd = g.readlines()
        activity_model = line.strip('\n').split()[-1]
        data_fmt = [x for x in Rd if 'dataset format' in x][0].strip('\n').split(':')[-1].strip()
        fugacity_model = [x for x in Rd if 'fugacity model' in x][0].strip('\n').split(': ')[-1] if data_fmt != 'oct94' else ''

        unwanted = ['elements', 'basis species', 'redox couples', 'aqueous species',
                    'free electron', 'minerals', 'solid solutions', 'gases', 'oxides', 'stop.' ]
        #capture line numbers with line break
        d=[]; previousline = ''
        with open(self.sourcedb_dir, encoding = codecs) as fid:
            for idx, line in enumerate(fid, 1):
                if line.strip().rstrip('\n').lstrip('0123456789.- ') in unwanted:
                    x=idx
                    d.append(x-1)
                #elif line.strip(' \n*').startswith(('virial coefficients', 'Virial coefficients', 'SIT epsilon coefficients', 'Pitzer parameters')):
                elif previousline.startswith('-end-') and line.startswith('*'):
                    x=idx; #print(x)
                    d.append(x+1)
                    break
                previousline = line

        if activity_model == 'h-m-w':
            if all([i.startswith('*') for i in Rd[d[-1]:d[-1]+30]]) == False:
                d_act = [i for i, x in enumerate(Rd[d[-1]:]) if x.strip('\n') ==''][0]
            else:
                d_act = [i for i, x in enumerate(Rd[d[-2]:]) if x.strip('\n') ==''][0]
        f = open(self.sourcedb_dir, 'r', encoding = codecs)
        #skip first 11 lines of database  .lstrip('0123456789.- ')
        for i in range(d[1]+2):
          s1 = f.readline()

        self.sourcedic = {} # initialize dictionary
        for i in range(d[-1]-d[1]):   #
            s1 = f.readline()
            if s1.strip(' \n*').startswith(("references", 'virial coefficients', 'Virial coefficients', 'SIT epsilon coefficients', 'Pitzer parameters')) :
                break
            if not s1.startswith((' ', '*'), 0) | (s1.rstrip('\n') == "") | (len(s1) !=0 and s1[0] == "-") :
                if s1.rstrip('\n').lstrip('0123456789.- ') in unwanted:
                    continue
                elif s1.rstrip('\n').lstrip('0123456789.- ') == '':
                    continue
                else:
                    specie_name = s1.strip().split()[0]
                    s2 = f.readline(); s3 = f.readline() if s2.rstrip('\n') != '' else ''
                    s4 = f.readline() if s3.rstrip('\n') != '' else ''
                    s5 = f.readline() if s4.rstrip('\n') != '' else ''
                    if data_fmt == 'mar21' and not s3.split()[0].isdigit() and'mole' not in [s3.split()[0], s2.split()[0]] :
                        if (s2.rstrip('\n') != "" and s3.rstrip('\n') == ""):
                            ss_details = [s1, s2, s3]
                        elif (s3.rstrip('\n') != "" and s4.rstrip('\n') == ""):
                            ss_details = [s1, s2, s3, s4]
                        elif (s4.rstrip('\n') != "" and s5.rstrip('\n') == ""):
                            ss_details = [s1, s2, s3, s4, s5]
                        elif (s5.rstrip('\n') != ""):
                            s6 = f.readline()
                            ss_details = [s1, s2, s3, s4, s5, s6]
                            if (s6.rstrip('\n') != ""):
                                s7 = f.readline()
                                ss_details = [s1, s2, s3, s4, s5, s6, s7]
                                if (s7.rstrip('\n') != ""):
                                    s8 = f.readline()
                                    ss_details = [s1, s2, s3, s4, s5, s6, s7, s8]
                                    if (s8.rstrip('\n') != ""):
                                        s9 = f.readline()
                                        ss_details = [s1, s2, s3, s4, s5, s6, s7, s8, s9]
                        specie_formula = []; species_num = []; reactant = []
                    else:
                        if (s5.rstrip('\n') != ""):
                            s6 = f.readline()
                            if (s6.rstrip('\n') != ""):
                                s7 = f.readline()
                                if not s2.startswith('*',0):
                                    if (len(s2.split()) > 1) and (s2.split()[0] != 'formula='):
                                        if len(s1.split('formula=')) <= 1:
                                            specie_formula = ""
                                        else:
                                            specie_formula = s1.rstrip('\n').split('formula=')[1]
                                        if not s3.lstrip().startswith(('chi', 'Pcrit'), 0):
                                            species_num = int(s3.split()[0])
                                            if species_num <= 3:
                                                reactant = s4.split()
                                            elif species_num <= 6:
                                                reactant = s4.split() + s5.split()
                                            else:
                                                reactant = s4.split() + s5.split() + s6.split()
                                        else:
                                            if not s4.lstrip().startswith(('chi','Pcrit'),0):
                                                species_num = int(s4.split()[0])
                                                if species_num <= 3:
                                                    reactant = s5.split()
                                                elif species_num <= 6:
                                                    reactant = s5.split() + s6.split()
                                                else:
                                                    reactant = s5.split() + s6.split() + s7.split()
                                            else:
                                                species_num = int(s5.split()[0])
                                                if species_num <= 3:
                                                    reactant = s6.split()
                                                else:
                                                    reactant = s6.split() + s7.split()
                                    else:
                                        if len(s2.split('formula=')) <= 1:
                                            specie_formula = ""
                                        else:
                                            specie_formula = s2.rstrip('\n').split('formula=')[1]
                                        species_num = int(s4.split()[0])
                                        if species_num <= 3:
                                            reactant = s5.split()
                                        elif species_num <= 6:
                                            reactant = s5.split() + s6.split()
                                        else:
                                            reactant = s5.split() + s6.split() + s7.split()
                                else:
                                    specie_formula = s2.split()[2]
                                    species_num = int(s4.split()[0])
                                    if species_num <= 3:
                                        reactant = s5.split()
                                    elif species_num <= 6:
                                        reactant = s5.split() + s6.split()
                                    else:
                                        reactant = s5.split() + s6.split() + s7.split()
                            else:
                                specie_formula = ""
                                species_num = int(s3.split()[0])
                                if species_num <= 3:
                                    reactant = s4.split()
                                elif species_num <= 6:
                                    reactant = s4.split() + s5.split()
                                else:
                                    reactant = s4.split() + s5.split() + s6.split()
                        else:
                            specie_formula = ""
                            species_num = int(s3.split()[0])
                            if species_num <= 3:
                                reactant = s4.split()
                            elif species_num <= 6:
                                reactant = s4.split() + s5.split()
                            else:
                                reactant = s4.split() + s5.split() + s6.split()
                        ss_details = []

            dt = [specie_formula, species_num] + reactant + ss_details
            self.sourcedic[specie_name] = dt[2:] if any(isinstance(el, list) for el in dt) else dt
        self.sourcedic['eh'] = ['eh', 3, '-2.0000', 'H2O', '1.0000', 'O2(g)', '4.0000', 'H+']
        self.sourcedic['e-'] = ['e-', 3, '0.50000', 'H2O', '-0.2500', 'O2(g)', '-1.0000', 'H+']

        f.close()

        element = []; basis = []; redox = []; aqueous = []; minerals = []; gases = []; oxides = [];
        solidsolutions = []
        charge = []; MW = []; electron = []; self.Mineraltype = {}; fugacity_chi = {}; fugacity_Pcrit = {}
        with open(self.sourcedb_dir, encoding = codecs) as fid:
            for i, line in enumerate(fid, 1):
                previousline = line
                if (line.strip(' \n*').startswith(("references", 'Pitzer parameters', 'virial coefficients', 'Virial coefficients', 'SIT epsilon coefficients'))):
                    break
                # elif previousline.startswith('-end-') and line.startswith('*'):
                #     break
                if not line.startswith((' ','*'),0) | (line.rstrip('\n') == "") | (line[0] == "-") :
                    if not line.split()[0].replace('.','',1).isnumeric():
                        if not line.startswith(('charge', 'mole', 'formula'), 0):
                            if d[0] < i < d[1]:
                                element.append(line.split()[0])
                            elif d[1] < i < d[2]:
                                basis.append(line.split()[0])
                            elif d[2] < i < d[3]:
                                redox.append(line.split()[0])
                            elif d[3] < i < d[4]:
                                aqueous.append(line.split()[0])
                            elif d[4] < i < d[5]:
                                if data_fmt == 'oct94':
                                    minerals.append(line.split()[0])
                                elif data_fmt in ['jul17', 'jan19', 'apr20', 'mar21'] :
                                    electron.append(line.split()[0])
                            elif d[5] < i < d[6]:
                                if data_fmt == 'oct94':
                                    gases.append(line.split()[0])
                                elif data_fmt in ['jul17', 'jan19', 'apr20', 'mar21'] :
                                    minerals.append(line.split()[0])
                            elif i > d[6]:
                                if data_fmt == 'oct94':
                                    oxides.append(line.split()[0])
                                elif d[6] < i < d[7]:
                                    if data_fmt in ['jul17', 'jan19', 'apr20'] :
                                        gases.append(line.split()[0])
                                    elif data_fmt == 'mar21':
                                        solidsolutions.append(line.split()[0])
                                elif i > d[7]:
                                    if data_fmt in ['jul17', 'jan19', 'apr20'] :
                                        oxides.append(line.split()[0])
                                    elif data_fmt == 'mar21':
                                        if d[7] < i < d[8]:
                                            gases.append(line.split()[0])
                                        elif i > d[8]:
                                            oxides.append(line.split()[0])

                if (re.compile(r"charge").search(line) != None) and not line.startswith('*'):
                    charge.append(line)
                if (re.compile(r"mole wt.=").search(line) != None) and not line.startswith('*'):
                    MW.append(re.sub('[^0123456789\.]', '', line.strip('\n').split('wt.=')[1]))
                if (re.compile(r"type=").search(line) != None) and not line.startswith('*'):
                    if len(line.split()) <= 2:
                        self.Mineraltype[line.split()[0]] = ''
                    else:
                        self.Mineraltype[line.split()[0]] = line.split()[2]
                if (re.compile(r"chi=").search(line) != None) and not line.startswith('*'):
                    fugacity_chi[gases[-1]] = line
                if (re.compile(r"Pcrit=").search(line) != None) and not line.startswith('*'):
                    fugacity_Pcrit[gases[-1]] = line

        act_list = []; #previousline = ''
        if activity_model == 'h-m-w':
            with open(self.sourcedb_dir, encoding = codecs) as fid:
                for i, line in enumerate(fid, 1):
                    num = d[-1] + d_act if all([i.startswith('*') for i in Rd[d[-1]:d[-1]+30]]) == False else d[-2] + d_act
                    if i > num: #d[-1] + d_act:
                        if not line.startswith(('  ', '\n', '-end-', '*')):
                            act_list.append(line)

        self.act_param = {'activity_model': activity_model, 'act_list': act_list, 'dataset_format' : data_fmt}
        self.fugacity_info = {'fugacity_model': fugacity_model, 'fugacity_chi': fugacity_chi,'fugacity_Pcrit': fugacity_Pcrit}
        res = basis + redox + aqueous + electron
        self.chargedic = {res[i]: charge[i].rstrip('\n') for i in range(len(charge))}
        res = element + basis + redox + aqueous + electron + minerals + gases + oxides
        self.MWdic = {res[i]: float(MW[i]) for i in range(len(MW))}
        if data_fmt != 'mar21':
            self.specielist = [element, basis, redox, aqueous, electron, minerals, gases, oxides]
            self.speciecat = ['element', 'basis', 'redox', 'aqueous', 'electron', 'minerals', 'gases', 'oxides']
        else:
            self.specielist = [element, basis, redox, aqueous, electron, minerals, solidsolutions, gases, oxides]
            self.speciecat = ['element', 'basis', 'redox', 'aqueous', 'electron', 'minerals', 'solidsolutions', 'gases', 'oxides']

        return

[docs]    def readSourceEQ36db(self):
        """
        This function reads source EQ3/6 thermodynamic database and reaction coefficients of 'eh'
        and 'e-' has been added at the bottom returns all reaction coefficients and species,
        group species into basis, auxiliary basis, minerals, gases, liquids and aqueous species
        Parameters
        ----------
            sourcedb      :     filename of the source database
        Returns
        ----------
            sourcedic     :     dictionary of reaction coefficients and species
            specielist    :     list of species segmented into the different categories
                                [element, basis, redox, aqueous, minerals, gases, oxides]
            speciecat     :     list of species categories listed in 'specielist'
            chargedic     :     dictionary of charges of species and DHazero info
            MWdic         :     dictionary of MW of species
            Sptype        :     specie types and eq3/6 and revised date info
            Elemlist      :     dictionary of elements and coefficients
        Usage:
        ----------
        [sourcedic, specielist, chargedic, MWdic, block_info, Elemlist, act_param] = readSourceEQ36db(sourcedb)
        """
        codecs = self.sourcedb_codecs
        with open(self.sourcedb_dir, encoding = codecs) as g:
            for line in g:
                if line.startswith(('bdot parameters', 'single-salt parameters', 'ca combinations')):
                    break

        if line.startswith('bdot parameters'):
            activity_model = 'debye-huckel'
        elif line.startswith(('single-salt parameters', 'ca combinations')):
            activity_model = 'h-m-w'

        subheading = ['elements', 'basis species', 'auxiliary basis species', 'aqueous species',
                    'solids', 'liquids', 'gases', 'solid solutions', 'references', 'stop.']
        d = []
        with open(self.sourcedb_dir, encoding = codecs) as f:
            for idx, line in enumerate(f, 1):
                if line.strip().rstrip('\n').lstrip('0123456789.- ') in subheading:
                    x=idx
                    d.append(x-1)
                if line.startswith(('bdot parameters', 'single-salt parameters', 'ca combinations')):
                    x=idx
                    d.append(x-1)

        element = []; basis = []; auxiliary = []; aqueous = []; minerals = []; MW = []; #DH = []
        gases = []; liquids = []; charge = []; solid_solutions = []; act_params = []

        with open(self.sourcedb_dir, encoding = codecs) as f:
            for i, line in enumerate(f, 1):
                if (line.rstrip('\n') == "references"):
                    break
                if not line.startswith('*',0) | line.startswith(' ',0) | (line.rstrip('\n') == "") | (line[0] == "-") :
                    if not line.split()[0].replace('.','',1).isnumeric() and not line.startswith('+',0):
                        if line.strip().rstrip('\n').lstrip('0123456789.- ') not in subheading:
                            line = line.strip().rstrip('\n')
                            if d[1] < i < d[2]:
                                element.append(line.split()[0])
                                MW.append(float(line.split()[1]))
                            elif d[2] < i < d[3]:
                                basis.append(line.split('  ')[0])
                            elif d[3] < i < d[4]:
                                if any(re.findall(r'|'.join(('acid', 'high', 'low')), line.split('  ')[0], re.IGNORECASE)):
                                    auxiliary.append(line.split('  ')[0].replace(' ', '_'))
                                else:
                                    auxiliary.append(line.split('  ')[0])
                            elif d[4] < i < d[5]:
                                if any(re.findall(r'|'.join(('acid', 'high', 'low')), line.split('  ')[0], re.IGNORECASE)):
                                    aqueous.append(line.split('  ')[0].replace(' ', '_'))
                                else:
                                    aqueous.append(line.split('  ')[0])
                            elif d[5] < i < d[6]:
                                if any(re.findall(r'|'.join(('acid', 'high', 'low', 'anhyd', 'hydr')), line.split('  ')[0], re.IGNORECASE)):
                                    minerals.append(line.split('  ')[0].replace(' ', '_'))
                                else:
                                    minerals.append(line.split('  ')[0])
                            elif d[6] < i < d[7]:
                                liquids.append(line.split('  ')[0])
                            elif d[7] < i < d[8]:
                                gases.append(line.split('  ')[0])
                            elif i > d[8]:
                                solid_solutions.append(line.split('  ')[0])
                if re.compile(r"charge").search(line) != None:
                    charge.append(line)
                if d[0] < i < d[1]:
                    act_params.append(line)

        self.MWdic = {element[i]: float(MW[i]) for i in range(len(MW))}
        res = basis + auxiliary + aqueous + minerals + liquids + gases + solid_solutions
        lstname = []; self.block_info = {}; self.Elemlist = {}; self.sourcedic = {}
        self.act_param = {'activity_model': activity_model, 'act_list': None, 'alpha_beta' : {},
                          'theta':{}, 'lambda':{}, 'psi':{},'zeta':{},'mu':{}, 'beta0' : {},
                          'beta1' : {}, 'beta2' : {}, 'alpha1' : {}, 'alpha2' : {}, 'cphi' : {}}
        keywords = [["+" + "-"*30,   "+" + "-"*30]  ]
        for num, k in enumerate(keywords):
            lst = []; counter = 0
            f = open(self.sourcedb_dir, 'r', encoding = codecs)

            for i in range(d[1] + len(element)):#
                s1 = f.readline()
                if (s1.rstrip('\n') == "elements") :
                    break
                act_list = [act_params[i +1] for i,x in enumerate(act_params) if x.rstrip('\n').startswith(k[0])]
                act_list = [x for x in act_list if not x.startswith(('*', 'nn', 'nc', 'cc', 'mixture'))]
                if (activity_model == 'h-m-w') and any(s1.lstrip().rstrip('\n').startswith(x) for x in act_list):
                    lst = []; lst.append(s1)
                    for j in range(50):
                        s = f.readline()
                        if s.lstrip().rstrip('\n').startswith(k[1]):
                            break
                        elif not s.lstrip().startswith(('****', '*',  'single-salt parameters', 'ca combinations')) | (s.rstrip('\n').strip('0123456789.- ') in ["", 'Miscellaneous parameters']+subheading):
                            lst.append(s)
                    if len(lst) > 2:
                        if any(['mu' in x for x in lst]):
                            checker = [x.strip(' \n').split()[-1] for i, x in enumerate(lst) if x.lstrip().startswith('mu')]
                            if re.sub('[^0123456789\.]', '', checker[0]) == '':
                                first_a = [i for i, x in enumerate(lst) if x.lstrip().startswith('a')][0]
                                mu = float(lst[first_a].split()[-1])
                            else:
                                mu = float(re.sub('[^0123456789\.]', '', checker[0]))
                            self.act_param['mu'][lst[0].rstrip('\n')] = mu
                        if any(['zeta' in x for x in lst]):
                            checker = [x.strip(' \n').split()[-1] for i, x in enumerate(lst) if x.lstrip().startswith('zeta')]
                            if re.sub('[^0123456789\.]', '', checker[0]) == '':
                                first_a = [i for i, x in enumerate(lst) if x.lstrip().startswith('a')][0]
                                zeta = float(lst[first_a].split()[-1])
                            else:
                                zeta = float(re.sub('[^0123456789\.]', '', checker[0]))
                            self.act_param['zeta'][lst[0].rstrip('\n')] = zeta
                        if any([('alpha' or 'beta') in x for x in lst]):
                            self.act_param['alpha_beta'][lst[0].rstrip('\n')] = lst
                            lster = ['beta0', 'beta1', 'beta2', 'alpha1', 'alpha2', 'cphi']
                            for pos in lster:
                                checker = [x.strip(' \n') for i, x in enumerate(lst) if pos in re.sub('[()]', '', x).lstrip().lower()]
                                if all([x in checker[0] for x in lster[:3]]):
                                    checker = checker[0].replace("=", "").split()
                                    par = float(checker[checker.index(pos) + 1])
                                elif all([x in checker[0] for x in lster[3:5]]):
                                    checker = checker[0].replace("=", "").split()
                                    par = float(checker[checker.index(pos) + 1])
                                else:
                                    # checker = checker.split()[-1]
                                    if re.sub('[^0123456789\.]', '', re.sub('[(012)]', '', checker[0])) == '':
                                        first_a = [i for i, x in enumerate(lst) if pos in re.sub('[()]', '', x).lstrip().lower() ][0] + 1
                                        par = float(lst[first_a].split()[-1])
                                    else:
                                        par = float(re.sub('[^0123456789\.]', '', re.sub('[()]', '', checker[0])))
                                self.act_param[pos][lst[0].rstrip('\n')] = par
                        if any(['psi' in x for x in lst]):
                            checker = [x.strip(' \n') for i, x in enumerate(lst) if 'psi' in x.lstrip() ]
                            if all([x in checker[0] for x in [' psi', 'theta']]):
                                checker = checker[0].replace("=", "").split()
                                psi = float(checker[checker.index('psi') + 1])
                            else:
                                if re.sub('[^0123456789\.]', '', checker[0]) == '':
                                    first_a = [i for i, x in enumerate(lst) if x.lstrip().startswith('a')][0]
                                    psi = float(lst[first_a].split()[-1])
                                else:
                                    psi = float(re.sub('[^0123456789\.]', '', checker[0]))
                            self.act_param['psi'][lst[0].rstrip('\n')] = psi
                        if any(['theta' in x for x in lst]):
                            checker = [x.strip(' \n') for i, x in enumerate(lst) if 'theta' in x.lstrip() ]
                            if all([x in checker[0] for x in ['theta', ' psi']]):
                                checker = checker[0].replace("=", "").split()
                                theta = float(checker[checker.index('theta') + 1])
                            else:
                                if re.sub('[^0123456789\.]', '', checker[0]) == '':
                                    first_a = [i for i, x in enumerate(lst) if x.lstrip().startswith('a')][0]
                                    theta = float(lst[first_a].split()[-1])
                                else:
                                    theta = float(re.sub('[^0123456789\.]', '', checker[0]))
                            self.act_param['theta'][lst[0].rstrip('\n')] = theta
                        if any(['lambda' in x for x in lst]):
                            checker = [x.strip(' \n').split()[-1] for i, x in enumerate(lst) if 'lambda' in x.lstrip()]
                            if re.sub('[^0123456789\.]', '', checker[0]) == '':
                                first_a = [i for i, x in enumerate(lst) if x.lstrip().startswith('a')][0]
                                lambdaa = float(lst[first_a].split()[-1])
                            else:
                                lambdaa = float(re.sub('[^0123456789\.]', '', checker[0]))
                            self.act_param['lambda'][lst[0].rstrip('\n')] = lambdaa

            for i in range(d[-1]) :  #
                s = f.readline()
                if (s.rstrip('\n') == "references") :
                    break
                s = s.replace(" acid", "_acid").replace(" high", "_high").replace(" low", "_low").replace(" anhyd", "_anhyd").replace(" hydr", "_hydr")

                if any(s.lstrip().rstrip('\n').startswith(x) for x in res):
                    lst = []; lst.append(s); counter += 1
                    for j in range(50):
                        s = f.readline()
                        if s.lstrip().rstrip('\n').startswith(k[1]):
                            break
                        elif not s.lstrip().startswith('****'):
                            lst.append(s)
                    lst[0] = lst[0].rstrip('\n')
                    if lst[0] not in subheading:
                        if lst[0].split('  ')[0] in res or lst[0].split()[0].replace(' ', '_') in res:
                            if any(re.findall(r'|'.join(('acid', 'high', 'low', 'anhyd', 'hydr')), lst[0].split('  ')[0], re.IGNORECASE)):
                                specie_name = lst[0].split('  ')[0].replace(' ', '_')
                            else:
                                specie_name = lst[0].split('  ')[0]
                            lstname.append(specie_name)
                            if specie_name not in solid_solutions:
                                indx, elem_numb =[(i,int(re.sub('[^0123456789\.]', '', x))) for i,x in enumerate(lst)
                                                  if x.strip('0123456789.,-: ').startswith('element(s)')][0]
                                elem_rows = list(range(indx + 1, indx + 2)) if elem_numb <= 3 else list(range(indx + 1, indx + 3)) if elem_numb <= 6 else list(range(indx + 1, indx + 4)) if elem_numb <= 9 else list(range(indx + 1, indx + 5))
                                reactant = [lst[x].rstrip('\n').split('  ') for x in elem_rows]
                                reactant = [item for sublist in reactant for item in sublist] # convert list of list to list
                                reactant = [y for x in reactant for y in x.split() if x != '' and x != '****']
                                self.Elemlist[specie_name] = reactant
                            else:
                                elem_rows = [20]
                            if specie_name not in basis[:-1] + solid_solutions and counter > len(basis):
                                indx, species_num =[(i,int(re.sub('[^0123456789\.]', '', x))) for i,x in enumerate(lst)
                                                    if x.strip('0123456789.,-: ').startswith('species in')][0]
                                spec_rows = list(range(indx + 1, indx + 2)) if species_num < 3 else list(range(indx + 1, indx + 3)) if species_num < 5 else list(range(indx + 1, indx + 4)) if species_num < 7 else list(range(indx + 1, indx + 5)) if species_num < 9 else  list(range(indx + 1, indx + 6))
                                reactants = [lst[x].rstrip('\n').split('  ') for x in spec_rows]
                                reactants = [item for sublist in reactants for item in sublist] # convert list of list to list
                                reactants = [y for x in reactants for y in x.split() if x != '' and x != '****']
                                reactants = [j.replace(' ', '_') if any(re.findall(r'|'.join(('acid', 'high', 'low', 'anhyd', 'hydr')),
                                                                                    j, re.IGNORECASE)) else j for j in reactants]
                            else:
                                spec_rows = [20]
                            if specie_name in solid_solutions:
                                indx, species_num =[(i,int(re.sub('[^0123456789\.]', '', x))) for i,x in enumerate(lst)
                                                    if x.strip('0123456789.,-: ').startswith(('components', 'end members'))][0]
                                spec_rows = list(range(indx + 1, indx + 2)) if species_num < 3 else list(range(indx + 1, indx + 3)) if species_num < 5 else list(range(indx + 1, indx + 4)) if species_num < 7 else list(range(indx + 1, indx + 5)) if species_num < 9 else  list(range(indx + 1, indx + 6))
                                reactants = [lst[x].rstrip('\n').split('  ') for x in spec_rows]
                                reactants = [item for sublist in reactants for item in sublist] # convert list of list to list
                                reactants = [y for x in reactants for y in x.split() if x != '' and x != '****']
                            mw = [x for x in lst if x.strip('*    ').startswith('mol.wt.')]
                            mw = float(re.sub('[^0123456789\.]', '', mw[0].split()[-2])) if len(mw) != 0 else []
                            if len(lst[0].split()) <= 1:
                                specie_formula = ""
                            elif len(lst[0].split()) > 2:
                                specie_formula = lst[0].split()[2]
                            else:
                                specie_formula = lst[0].split()[1]
                            if (specie_name == 'O2(g)') and counter <= len(basis):
                                self.block_info['%s_b' % specie_name] = lst[1:min(elem_rows,spec_rows)[0] - 1]
                            elif specie_name in basis[:-1] + auxiliary + aqueous + minerals + liquids + gases:
                                self.block_info[specie_name] = lst[1:min(elem_rows,spec_rows)[0] - 1]
                            elif specie_name in solid_solutions:
                                self.block_info[specie_name] = [lst[1:min(spec_rows) - 1], lst[(max(spec_rows) + 1):]]
                            if specie_name in basis and counter <= len(basis):
                                self.sourcedic[specie_name] = [specie_formula, elem_numb] + reactant
                            elif specie_name == 'O2(g)':
                                self.sourcedic[specie_name] = [specie_formula, species_num] + reactants
                            else:
                                self.sourcedic[specie_name] = [specie_formula, species_num] + reactants
                            self.MWdic[specie_name] = mw

        self.sourcedic['eh'] = ['eh', 4, '-1.0000', 'eh', '-2.0000', 'H2O', '1.0000', 'O2(g)', '4.0000', 'H+']
        self.sourcedic['e-'] = ['e-', 4, '-1.0000', 'e-', '0.50000', 'H2O', '-0.2500', 'O2(g)', '-1.0000', 'H+']
        self.specielist = [element, basis, auxiliary, aqueous, minerals, liquids, gases, solid_solutions]
        self.speciecat = ['element', 'basis', 'redox', 'aqueous', 'minerals', 'liquids', 'gases', 'solid_solutions']
        res = basis + auxiliary + aqueous
        self.chargedic = {res[i]: charge[i].rstrip('\n') for i in range(len(charge))}
        self.act_param['act_list'] = act_list if activity_model == 'h-m-w' else ''
        self.act_param['activity_model'] = activity_model

        f.close()
        return