Showing posts with label python. Show all posts
Showing posts with label python. Show all posts

16 April 2014

572. autorotate/superimpose python script

If you want to calculate reaction coordinates between two structures you need to make sure that the structures haven't been rotated or translated, something which easily happens if you allow symmetry in gaussian and (it seems) z-matrix in nwchem.

I've written a script that lets you take two structures and align and superimpose them so that only the atoms that take part in the reaction move.

It works by you defining a minimum of four atoms that aren't supposed to move /relative to each other/ (i.e. they can be translated/rotated --- just not relative to each other) between the two structures. Four atoms far from each other are ideal. You need to make sure that they also don't lie in the same plane, but form a three-dimensional space.

I've tried this on real molecules too and it works better than I'd ever dared to hope for. The more 'stationary' atoms that you can use to make up the transformation matrix, the better.


Example:
In this example atom F is in a different location in structures a and b. The structures have also been rotated relative to each other.

a.xyz
6 A A 0.00000 0.00000 1.00000 B 0.00000 1.00000 0.00000 C 1.00000 0.00000 0.00000 D -1.00000 0.00000 0.00000 E 0.00000 0.00000 -1.00000 F 0.00000 0.500000 0.00000
b.xyz
6 B A 1 0 0 B 0 1 0 C 0 0 -1 D 0 0 1 E -1 0 0 F 0 -1 0

./autorotate.py a.xyz b.xyz '1,2,3,4'
Selected atoms in molecules 1 and 2 ['A', 0.0, 0.0, 1.0] ['A', 1.0, 0.0, 0.0] ['B', 0.0, 1.0, 0.0] ['B', 0.0, 1.0, 0.0] ['C', 1.0, 0.0, 0.0] ['C', 0.0, 0.0, -1.0] ['D', -1.0, 0.0, 0.0] ['D', 0.0, 0.0, 1.0] Transformation max error: 3.33066907388e-16 Writing to a.rot.xyz

a.rot.xyz
6 A A 1.00000 0.00000 0.00000 B -0.00000 1.00000 -0.00000 C -0.00000 0.00000 -1.00000 D -0.00000 0.00000 1.00000 E -1.00000 -0.00000 -0.00000 F -0.00000 0.50000 -0.00000
This is how it looks (note that the axis aren't aligned with (1,0,0; 0,1,0; 0,0,1) but seem to go through the centre of the molecule):
a.xyz
a.rot.xyz


b.xyz





Code:
#!/usr/bin/python
import sys
import numpy as np

#autorotate input_1.xyz input_2.xyz '1,2,3,4'
# need to pick at least four atoms that are not in the same plane
# input_1.xyz will be rotated to align with input_2.xyz
# you pick at least four atoms that should have the same positions
# relative to one another (i.e. distance and relative geometry). These 
# are then used to calculate an affine transform matrix which is used 
# to rotate and translate structure input_1.xyz to overlap with 
# structure 2

def formatinput(argument):
 infile1=sys.argv[1]
 atoms=sys.argv[3]
 atoms=atoms.split(',')
 coord_sys=[]

 for n in atoms:
  coord_sys+=[int(n)-1]
 try:
  infile2=sys.argv[2]
 except:
  infile2=''
 infile=[infile1,infile2]
 return infile,coord_sys
 
def getrawdata(infile):
 f=open(infile,'r')
 
 n=0
 preamble=[]
 
 struct=[]
 
 for line in f:
  if n<2: data-blogger-escaped-if="" data-blogger-escaped-line.rstrip="" data-blogger-escaped-n="" data-blogger-escaped-preamble="">1:
   line=line.rstrip()
   struct+=[line]
  n+=1
 xyz=[struct]
 
 return xyz, preamble

def getcoords(rawdata,preamble,atoms):
 
 n=0
 cartesian=[]
 
 for structure in rawdata:
  n=n+1
  num="%03d" % (n,)
  for item in structure:
   
   coordx=filter(None,item.split(' '))
   coordy=filter(None,item.split('\t'))
   if len(coordx)>len(coordy):
    coords=coordx
   else:
    coords=coordy
      
   coordinates=[float(coords[1]),float(coords[2]),float(coords[3])]
   element=coords[0]
   cartesian+=[[element,float(coords[1]),float(coords[2]),float(coords[3])]]
     
 return cartesian

def getstructures(rawdata,preamble):
 
 n=0
 cartesian=[]
 
 for structure in rawdata:
  n=n+1
  num="%03d" % (n,)
  for item in structure:
   
   coordx=filter(None,item.split(' '))
   coordy=filter(None,item.split('\t'))
   if len(coordx)>len(coordy):
    coords=coordx
   else:
    coords=coordy
      
   coordinates=[float(coords[1]),float(coords[2]),float(coords[3])]
   element=coords[0]
   cartesian+=[coordinates]
     
 return cartesian

def affine_transform(atoms,structures):
# from http://stackoverflow.com/questions/20546182/how-to-perform-coordinates-affine-transformation-using-python-part-2
 primaryatomcoords=[]
 for n in atoms:
  primaryatomcoords+=[structures[0][n]]

 secondaryatomcoords=[]
 for n in atoms:
  secondaryatomcoords+=[structures[1][n]]

 primary = np.array(primaryatomcoords)
 secondary = np.array(secondaryatomcoords)
 primaryfull = np.array(structures[0])

 # Pad the data with ones, so that our transformation can do translations too
 n = primary.shape[0]
 pad = lambda x: np.hstack([x, np.ones((x.shape[0], 1))])
 unpad = lambda x: x[:,:-1]
 X = pad(primary)
 Y = pad(secondary)
 Xp= pad(primaryfull)

 # Solve the least squares problem X * A = Y
 # to find our transformation matrix A
 A, res, rank, s = np.linalg.lstsq(X, Y)

 transform = lambda x: unpad(np.dot(pad(x), A))

# print "Max error should be as small as possible if the rotation is successful"
# print "If max error is large you may have selected a bad set of atoms"
 print "Transformation max error:", np.abs(secondary - transform(primary)).max()
 secondaryfull=transform(primaryfull)
 return secondaryfull

def transform_xyz(tmatrix,newxyz):
 final_xyz=[]
 for n in newxyz:
  coord=np.mat(str(n[0])+';'+str(n[1])+';'+str(n[2]))
  newcoord=np.dot(tmatrix,coord)
  newcoord=np.matrix.tolist(newcoord)
  final_xyz+=[[ newcoord[0][0],newcoord[1][0],newcoord[2][0]]]
 return final_xyz

def genxyzstring(coords,elementnumber):
 x_str='%10.5f'% coords[0]
 y_str='%10.5f'% coords[1]
 z_str='%10.5f'% coords[2]
 element=elementnumber
 xyz_string=element+(3-len(element))*' '+10*' '+\
 (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
 
 return xyz_string

def write_aligned(aligned_structure,atom_coords,preamble,outfile):
 outfile=outfile.replace('.xyz','.rot.xyz')
 print "Writing to ",outfile
 g=open(outfile,'w')
 g.write(str(preamble[0])+'\n'+str(preamble[1])+'\n')
 
 for n in range(0,len(aligned_structure)):
  xyzstring=genxyzstring(aligned_structure[n],atom_coords[n][0])
  g.write(xyzstring)
 g.close()
 return 0
 
if __name__ == "__main__":
 infile,atoms=formatinput(sys.argv)
 
 xyz=['','']
 preamble=['','']
 
 #get raw data
 xyz[0],preamble[0]=getrawdata(infile[0])
 xyz[1],preamble[1]=getrawdata(infile[1])

 atom_coords=[getcoords(xyz[0],preamble[0],atoms)]
 atom_coords+=[getcoords(xyz[1],preamble[1],atoms)]
 
 #collect structures from raw data
 structures=[getstructures(xyz[0],preamble[0])]
 structures+=[getstructures(xyz[1],preamble[1])]
 
 print "Selected atoms in molecules 1 and 2"
 for n in atoms:
  print atom_coords[0][n],atom_coords[1][n]
  
 #transform structure
 aligned_structure=affine_transform(atoms,structures)
 
 write_aligned(aligned_structure,atom_coords[0],preamble[0],str(infile[0]))
 

04 November 2013

525. Briefly: rotating molecule during optimisation in Gaussian

Most people who have used gaussian will be familiar with molecules that are rotated multiple times during optimisation. Occasionally,  the molecule is rotated repeatedly without any sign of convergence, and without any changes in atom positions.

Uploading the video to blogspot has lead to some distortion (i.e. the molecule is translated -- look at the x,y,z axes in the bottom left) but the general behaviour should still be clear:


The energies aren't really changing:


Solution:
Turning off rotation using 'nosymm' might yield faster convergence:
#P rB3LYP/GEN 5D Opt=()  Freq=() SCF=(MaxCycle=128 ) NoSymm  Punch=(MO)

In ECCE, just uncheck 'Use available symmetry'.

Note that NoSymm doesn't turn off symmetry completely -- it just prevent reorientation:

From http://www.gaussian.com/g_tech/g_ur/k_symmetry.htm
The NoSymmetry keyword prevents molecule reorientation and causes all computations to be performed in the input orientation (although the program still attempts to identify the appropriate point group). Symmetry use can be completely disabled by Symmetry=None; use this option if NoSymm generates an error when identifying the point group.



How the figures were made:

First I used this script to extract the .xyz coordinates of the geometry steps:
#!/usr/bin/python
import sys 

def getrawdata(infile):
    f=open(infile,'r')
    opt=0
    geo=0
    struct=[]
    structure=[]
    energies=[]
    energy=[]
    for line in f:
    
        if opt==1 and geo==1 and not ("---" in line):
            structure+=[line.rstrip()]
    
        if 'Coordinates (Angstroms)' in line:
            if opt==0:
                opt=1
                structure=[]
    
        if opt==1 and "--------------------------" in line:
            if geo==0:
                geo=1
            elif geo==1:
                geo=0
                opt=0
        if 'SCF Done' in line:
            energy=filter(None,line.rstrip('\n').split(' ')) 
#       if  'Optimization completed' in line and (opt==0 and geo==0):
            energies+=[float(energy[4])]
            opt=0
            geo=0
            struct+=[structure]
            structure=[]
    
    return struct, energies

def periodictable(elementnumber):
    ptable={1:'H',2:'He',\
    3:'Li', 4:'Be',5:'B',6:'C',7:'N',8:'O',9:'F',10:'Ne',\
    11:'Na',12:'Mg',13:'Al',14:'Si',15:'P',16:'S',17:'Cl',18:'Ar',\
    19:'K',20:'Ca',\
    21:'Sc',22:'Ti',23:'V',24:'Cr',25:'Mn',26:'Fe',27:'Co',28:'Ni',29:'Cu',30:'Zn',\
    31:'Ga',32:'Ge',33:'As',34:'Se',35:'Br',36:'Kr',\
    37:'Rb',38:'Sr',\
    39:'Y',40:'Zr',41:'Nb',42:'Mo',43:'Tc',44:'Ru',45:'Rh',46:'Pd',47:'Ag',48:'Cd',\
    49:'In',50:'Sn',51:'Sb',52:'Te',53:'I',54:'Xe',\
    55:'Cs',56:'Ba',\
    57:'La',58:'Ce',59:'Pr',60:'Nd',61:'Pm',62:'Sm',63:'Eu',64:'Gd',65:'Tb',66:'Dy',67:'Ho',68:'Er',69:'Tm',70:'Yb',71:'Lu',\
    72:'Hf', 73:'Ta', 74:'W',75:'Re', 76:'Os', 77:'Ir',78:'Pt', 79:'Au', 80:'Hg',\
    81:'Tl', 82:'Pb', 83:'Bi',84:'Po',85:'At',86:'Rn',\
    87:'Fr',88:'Ra',\
    89:'Ac',90:'Th',91:'Pa',92:'U',93:'Np',94:'Pu',95:'Am',96:'Cm',97:'Bk',98:'Cf',99:'Es',100:'Fm',101:'Md',102:'No',\
    103:'Lr',104:'Rf',105:'Db',106:'Sg',107:'Bh',108:'Hs',109:'Mt',110:'Ds',111:'Rg',112:'Cn',\
    113:'Uut',114:'Fl',115:'Uup',116:'Lv',117:'Uus',118:'Uuo'}
    element=ptable[elementnumber]
    return element
def genxyzstring(coords,elementnumber):
    x_str='%10.5f'% coords[0]
    y_str='%10.5f'% coords[1]
    z_str='%10.5f'% coords[2]
    element=periodictable(int(elementnumber))
    xyz_string=element+(3-len(element))*' '+10*' '+\
    (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'

    return xyz_string

def getstructures(rawdata):

    n=0
    for structure in rawdata:

        n=n+1
        num="%03d" % (n,)
        g=open('structure_'+num+'.xyz','w')
        itson=False
        cartesian=[]

        for item in structure:

            coords=filter(None,item.split(' '))
            coordinates=[float(coords[3]),float(coords[4]),float(coords[5])]
            element=coords[1]
            cartesian+=[genxyzstring(coordinates,element)]
        g.write(str(len(cartesian))+'\n')
        g.write('Structure '+str(n)+'\n')
        for line in cartesian:
            g.write(line)
        g.close()
        cartesian=[]
    return 0

if __name__ == "__main__":
    infile=sys.argv[1]
    rawdata,energies=getrawdata(infile)
    structures=getstructures(rawdata)
    g=open('energies.dat','w')
    for n in range(0,len(energies)):
        g.write(str(n)+'\t'+str(energies[n])+'\n')
    g.close()

Then I turned all the .xyz files into one .xyz file:
cat *.xyz > structures.xyz

Which I then opened in vmd with a view to make a video. Select New Molecule and load structure.xyz. Then go to Extensions, Visualization/Movie Maker. Making a movie using VMD didn't yield to anything of sufficient quality. Instead, I selected Tachyon as the renderer. Under Movie Settings I unchecked Delete Image Files.

Once I had the ppm files I did
ffmpeg -r 2 -i symm.%05d.ppm -vcodec mpeg4 -b 5000k video.avi

The key to getting a decent quality video was picking a high enough bit rate, which is probably obvious to most people who know what bit rate means, but not to a happy amateur like myself (well, now it is).

I made the energy plot using gnuplot and the energies.dat file which the python script above generates.

23 October 2013

524. Generating bonds, angles and dihedrals for a molecule for GROMACS

Generating bond, angle and dihedral parameters for GROMACS molecular dynamics simulations is a real PITA when it comes to reasonably large and complex molecules.

Since I am currently looking at the solvation dynamics of a series of isomers of a metal oxide cluster I quickly got tired of working out the bonds and bond constraints for my 101 atom clusters by hand.

So here's a python (2.7.x) script that generates parameters suitable for a GROMACS .top or .itp file.

Let's call the script genparam. You can call it as follows:
genparam molecule.xyz 0.21 3

where molecule.xyz is the molecule in xyz coordinates. 0.21 (nm) is used to generate bonds -- atoms that are 0.21 nm or closer to each other are bonded. The final number should be 1, 2 or 3. If it is 1, then only bonds (and constraints for bonds that are 0.098 nm or shorter -- such as O-H bonds. It's specific for my use, so you can edit the code.) are printed. If it is 2, then bonds and angles are printed. If it is 3, then bonds, angles and dihedrals are printed.

A simple and understandable example using methanol looks like this:
./genparams.py methanol.xyz 0.15 3
[bonds] [bonds] 1 2 6 0.143 50000.00 ;C OH 1 3 6 0.108 50000.00 ;C H 1 4 6 0.108 50000.00 ;C H 1 5 6 0.108 50000.00 ;C H [constraints] 2 6 2 0.096 ;OH HO [ angles ] 2 1 3 1 109.487 50000.00 ;OH C H 2 1 4 1 109.487 50000.00 ;OH C H 2 1 5 1 109.466 50000.00 ;OH C H 1 2 6 1 109.578 50000.00 ;C OH HO 3 1 4 1 109.542 50000.00 ;H C H 3 1 5 1 109.534 50000.00 ;H C H 4 1 5 1 109.534 50000.00 ;H C H [ dihedrals ] 6 2 1 3 1 60.01 50000.00 2 ;HO OH C H 6 2 1 4 1 60.01 50000.00 2 ;HO OH C H 6 2 1 5 1 180.00 50000.00 2 ;HO OH C H

where methanol.xyz looks like this:
6 Methanol C -0.000000010000 0.138569980000 0.355570700000 OH -0.000000010000 0.187935770000 -1.074466460000 H 0.882876920000 -0.383123830000 0.697839450000 H -0.882876940000 -0.383123830000 0.697839450000 H -0.000000010000 1.145042790000 0.750208830000 HO -0.000000010000 -0.705300580000 -1.426986340000

Note that the bond length value may need to be tuned for each molecule -- e.g. 0.21 nm is fine for my metal oxides which have long M-O bonds, while 0.15 nm is better for my methanol.xyz. In the end, you'll probably have to do some manual editing to remove excess bonds, angles and dihedrals.

Also note that there's a switch (prevent_hhbonds) which prevent the formation of bonds between atoms that start with H -- this is to prevent bonds between e.g. H in CH3 units (177 pm). You can change this in the code in the __main__ section.

Finally, note that the function types, multiplicities and forces may need to be edited. I suggest not doing that in the code, but in the output as it will depend on each molecule.

This script won't do science for you, but it'll take care of some of the drudgery of providing geometric descriptors.

Anyway, having said all that, here's the code:


#!/usr/bin/python
import sys
from math import sqrt, pi
from itertools import permutations
from math import acos,atan2

# see table 5.5 (p 132, v 4.6.3) in the gromacs manual
# for the different function types


#from
#http://stackoverflow.com/questions/1984799/cross-product-of-2-different-\
#vectors-in-python
def crossproduct(a, b):
 c = [a[1]*b[2] - a[2]*b[1],
  a[2]*b[0] - a[0]*b[2],
  a[0]*b[1] - a[1]*b[0]]
 return c
#end of code snippet

# mostly from 
# http://www.emoticode.net/python/calculate-angle-between-two-vectors.html
def dotproduct(a,b):
 return sum([a[i]*b[i] for i in range(len(a))])

def veclength(a):
 length=sqrt(a[0]**2+a[1]**2+a[2]**2)
 return length

def ange(a,b,la,lb,angle_unit):
 dp=dotproduct(a,b)
 costheta=dp/(la*lb)
 angle=acos(costheta)
 if angle_unit=='deg':
  angle=angle*180/pi
 return angle
# end of code snippet

def diheder(a,b,c,angle_unit):
 dihedral=atan2(veclength(crossproduct(crossproduct(a,b),\
 crossproduct(b,c))), dotproduct(crossproduct(a,b),crossproduct(b,c)))
 if angle_unit=='deg':
  dihedral=dihedral*180/pi
 return dihedral

def readatoms(infile):
 positions=[]
 f=open(infile,'r')
 atomno=-2
 for line in f:
  atomno+=1
  if atomno >=1:
   position=filter(None,line.rstrip('\n').split(' '))
   if len(position)>3:
    positions+=[[position[0],int(atomno),\
    float(position[1]),float(position[2]),\
    float(position[3])]]
 return positions

def makebonds(positions,rcutoff,prevent_hhbonds):
 
 bonds=[]
 
 for firstatom in positions:
  for secondatom in positions:
   distance=round(sqrt((firstatom[2]-secondatom[2])**2\
+(firstatom[3]-secondatom[3])**2+(firstatom[4]-secondatom[4])**2)/10.0,3)
   xyz=[[firstatom[2],firstatom[3],firstatom[4]],\
[secondatom[2],secondatom[3],secondatom[4]]]
   
   if distance<=rcutoff and firstatom[1]!=secondatom[1]:
    if prevent_hhbonds and (firstatom[0][0:1]=='H' and\
     secondatom[0][0:1]=='H'):
      pass
    elif firstatom[1]<secondatom[1]:
     bonds+=[[firstatom[1],secondatom[1],\
distance,firstatom[0],secondatom[0],xyz[0],xyz[1]]]
    else:
     bonds+=[[secondatom[1],firstatom[1],\
distance,firstatom[0],secondatom[0],xyz[1],xyz[0]]]
 return bonds

def dedupe_bonds(bonds):
 
 newbonds=[]
 for olditem in bonds:
  dupe=False
  for newitem in newbonds:
   if newitem[0]==olditem[0] and newitem[1]==olditem[1]:
    dupe=True
    break;
  if dupe==False:
   newbonds+=[olditem]
 return(newbonds)

def genvec(a,b):
 vec=[b[0]-a[0],b[1]-a[1],b[2]-a[2]]
 return vec

def findangles(bonds,angle_unit):
 # for atoms 1,2,3 we can have the following situations
 # 1-2, 1-3
 # 1-2, 2-3
 # 1-3, 2-3
 # The indices are sorted so that the lower number is always first
 
 angles=[]
 for firstbond in bonds:
    
  for secondbond in bonds:
   if firstbond[0]==secondbond[0] and not (firstbond[1]==\
secondbond[1]): # 1-2, 1-3
    vec=[genvec(firstbond[6],firstbond[5])]
    vec+=[genvec(secondbond[6],secondbond[5])]
    angle=ange(vec[0],vec[1],firstbond[2]*10,\
secondbond[2]*10,angle_unit)
    angles+=[[firstbond[1],firstbond[0],\
secondbond[1],angle,firstbond[4],firstbond[3],secondbond[4],firstbond[6],\
firstbond[5],secondbond[6]]]
   
   if firstbond[0]==secondbond[1] and not (firstbond[1]==\
secondbond[1]): # 1-2, 3-1
    #this should never be relevant since we've sorted the atom numbers
    pass
   
   if firstbond[1]==secondbond[0] and not (firstbond[0]==\
secondbond[1]): # 1-2, 2-3
    vec=[genvec(firstbond[5],firstbond[6])]
    vec+=[genvec(secondbond[6],secondbond[5])]
    angle=ange(vec[0],vec[1],firstbond[2]*10,\
secondbond[2]*10,angle_unit)
    angles+=[[firstbond[0],firstbond[1],\
secondbond[1],angle,firstbond[3],firstbond[4],secondbond[4],firstbond[5],\
firstbond[6],secondbond[6]]]
   
   if firstbond[1]==secondbond[1] and not (firstbond[0]==\
secondbond[0]): # 1-3, 2-3
    vec=[genvec(firstbond[6],firstbond[5])]
    vec+=[genvec(secondbond[6],secondbond[5])]
    angle=ange(vec[0],vec[1],firstbond[2]*10,\
secondbond[2]*10,angle_unit)
    angles+=[[firstbond[0],firstbond[1],\
secondbond[0],angle,firstbond[3],firstbond[4],secondbond[3],firstbond[5],\
firstbond[6],secondbond[5]]]    
 return angles

def dedupe_angles(angles):
 dupe=False
 newangles=[]
 for item in angles:
  dupe=False
  for anotheritem in newangles:
   if item[0]==anotheritem[2] and (item[2]==anotheritem[0]\
 and item[1]==anotheritem[1]):
    dupe=True
    break
  if dupe==False:
   newangles+=[item]
 
 newerangles=[]
 dupe=False
 
 for item in newangles:
  dupe=False
  for anotheritem in newerangles:
   if item[2]==anotheritem[2] and (item[0]==anotheritem[1]\
 and item[1]==anotheritem[0]):
    dupe_item=anotheritem
    dupe=True
    break
  
  if dupe==False:
   newerangles+=[item]
  elif dupe==True:
   if dupe_item[3]>item[3]:
    pass;
   else:
    newerangles[len(newerangles)-1]=item

 newestangles=[]
 dupe=False
 
 for item in newerangles:
  dupe=False
  for anotheritem in newestangles:
   if (sorted(item[0:3]) == sorted (anotheritem[0:3])):
    dupe_item=anotheritem
    dupe=True
    break
  
  if dupe==False:
   newestangles+=[item]
  elif dupe==True:
   if dupe_item[3]>item[3]:
    pass;
   else:
    newestangles[len(newestangles)-1]=item
 return newestangles

def finddihedrals(angles,bonds,angle_unit):
 dihedrals=[]
 for item in angles:
  for anotheritem in bonds:
   if item[2]==anotheritem[0]:
    vec=[genvec(item[7],item[8])]
    vec+=[genvec(item[8],item[9])]
    vec+=[genvec(item[9],anotheritem[6])]
    dihedral=diheder(vec[0],vec[1],vec[2],\
angle_unit)
    dihedrals+=[ [ item[0],item[1],item[2],\
anotheritem[1], dihedral, item[4],item[5],item[6], anotheritem[4] ] ]

   if item[0]==anotheritem[0] and not item[1]==anotheritem[1]:
    vec=[genvec(anotheritem[6],item[7])]
    vec+=[genvec(item[7],item[8])]
    vec+=[genvec(item[8],item[9])]
    dihedral=diheder(vec[0],vec[1],vec[2],\
angle_unit)
    dihedrals+=[ [ anotheritem[1],item[0],item[1],\
item[2], dihedral, anotheritem[4],item[4],item[5], item[6] ] ]
 return dihedrals

def dedup_dihedrals(dihedrals):
 newdihedrals=[]
 
 for item in dihedrals:
  dupe=False
  for anotheritem in newdihedrals:
   rev=anotheritem[0:4]
   rev.reverse()
   if item[0:4] == rev:
    dupe=True
  if not dupe:
   newdihedrals+=[item]
 return newdihedrals

def print_bonds(bonds,func):
 constraints=""
 funcconstr='2'
 print '[bonds]'
 
 for item in bonds:
  if item[2]<=0.098:
   constraints+=(str(item[0])+'\t'+str(item[1])+'\t'+\
funcconstr+'\t'+str("%1.3f"% item[2])+'\t;'+str(item[3])+'\t'+str(item[4])+'\n')
  else: 
   print str(item[0])+'\t'+str(item[1])+'\t'+func+'\t'+\
str("%1.3f"% item[2])+'\t'+'50000.00'+'\t;'+str(item[3])+'\t'+str(item[4])
 print '[constraints]'
 print constraints
 return 0

def print_angles(angles,func):
 print '[ angles ]'
 for item in angles:
  print str(item[0])+'\t'+str(item[1])+'\t'+str(item[2])+'\t'+\
func+'\t'+str("%3.3f" % item[3])+'\t'+'50000.00'+'\t;'\
  +str(item[4])+'\t'+str(item[5])+'\t'+str(item[6])
 return 0

def print_dihedrals(dihedrals,func):
 print "[ dihedrals ]"
 force='50000.00'
 mult='2'
 for item in dihedrals:
  print str(item[0])+'\t'+str(item[1])+'\t'+str(item[2])+'\t'+\
str(item[3])+\
  '\t'+func+'\t'+str("%3.2f" % item[4])+'\t'+force+'\t'+mult+\
  '\t;'+str(item[5])+'\t'+str(item[6])+\
  '\t'+str(item[7])+'\t'+str(item[8])
 return 0

if __name__ == "__main__":
 infile=sys.argv[1]
 rcutoff=float(sys.argv[2]) # in nm
 itemstoprint=int(sys.argv[3])
 
 angle_unit='deg' #{'rad'|'deg'}
 prevent_hhbonds=True # False is safer -- it prevents bonds between
 # atoms whose names start with H

 positions=readatoms(infile)

 bonds=makebonds(positions,rcutoff,prevent_hhbonds)
 bonds=dedupe_bonds(bonds) 
 
 print_bonds(bonds,'6')
 
 angles=findangles(bonds,angle_unit)
 angles=dedupe_angles(angles)
 
 if itemstoprint>=2:
  print_angles(angles,'1')

 dihedrals=finddihedrals(angles,bonds,angle_unit)
 dihedrals=dedup_dihedrals(dihedrals)
 if itemstoprint>=3:
  print_dihedrals(dihedrals,'1') 

11 October 2013

519. Formatting an XYZ molecular geometry file using python

This is a silly little script -- ECCE, which I use to manage all my computations, is very particular about what XYZ files it can and cannot read -- if the number of spaces between the coordinates are wrong, it will fail to read the structure. So here's a python script, based on parts I've cannibalised from other scripts that I've written in the past (i.e. it could be made more elegant, but I had the parts ready and just wanted something that works) which takes a somewhat ugly XYZ file and turns it into something beautiful. At least in the eyes of ECCE.

The impetus for this was that someone gave me an XYZ file the had generated by copying columns from Excel. On Mac. mac2unix took care of the line endings, but there were tabs (\t) all over the place.

Usage:
polish_xyz ugly.xyz pretty.xyz

Script:
#!/usr/bin/python
import sys 

def getrawdata(infile):
    f=open(infile,'r')
    n=0 
    preamble=[]
    struct=[]
    
    for line in f:
        if n<2: data-blogger-escaped-if="" data-blogger-escaped-line.rstrip="" data-blogger-escaped-n="" data-blogger-escaped-preamble="">3:
            line=line.rstrip()
            struct+=[line]
        n+=1
    xyz=[struct]
    return xyz, preamble

def genxyzstring(coords,elementnumber):
    x_str='%10.5f'% coords[0]
    y_str='%10.5f'% coords[1]
    z_str='%10.5f'% coords[2]
    element=elementnumber
    xyz_string=element+(3-len(element))*' '+10*' '+\ 
    (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
    
    return xyz_string

def getstructures(rawdata,preamble,outfile):
    n=0 
    for structure in rawdata:
        n=n+1
        num="%03d" % (n,)
        g=open(outfile,'w')
        itson=False
        cartesian=[]
    
        for item in structure:
            coordx=filter(None,item.split(' ')) 
            coordy=filter(None,item.split('\t'))
            if len(coordx)>len(coordy):
                coords=coordx
            else:
                coords=coordy
            coordinates=[float(coords[1]),float(coords[2]),float(coords[3])]
            element=coords[0]
            cartesian+=[genxyzstring(coordinates,element)]
        g.write(str(preamble[0])+'\n')
        g.write(str(preamble[1])+'\n')
        for line in cartesian:
            g.write(line)
        g.close()
        cartesian=[]
    return 0

if __name__ == "__main__":
    infile=sys.argv[1]
    outfile=sys.argv[2]
    xyz,preamble=getrawdata(infile)
    structures=getstructures(xyz,preamble,outfile)

518. Generating enantiomers of molecular structures given in XYZ coordinates using python.

What I'm showing here is probably overkill -- there may be better ways of doing this with sed/awk*. However, since I had most of the code below ready as it was part of another script, doing this is python was quick and easy. Plus it's portable-ish.

*[ECCE, which I use for managing my calculations, is very, very particular about the format of the XYZ file, including the number of chars between coordinates. So simply doing e.g.

cat molecule.xyz|gawk '{print $1,$2,$3,-$4}'

won't work. Not all pieces of software are that picky when it comes to xyz coordinates though -- jmol, for example, is very forgiving.]

Anyway, the script below flips a molecular structure by taking the geometry given as a (properly formatted) XYZ file, and changing the sign in front of the Z coordinates. It's that simple.
Save it, call it flip_xyz, make it executable and call it using e.g.

flip_xyz molecule.xyz flipped_molecule.xyz

Script:
#!/usr/bin/python
import sys 

def getrawdata(infile):
    f=open(infile,'r')
    n=0 
    preamble=[]
    struct=[]
  
    for line in f:
        if n<2: data-blogger-escaped-if="" data-blogger-escaped-line.rstrip="" data-blogger-escaped-n="" data-blogger-escaped-preamble="">1:
            line=line.rstrip()
            struct+=[line]
        n+=1
    xyz=[struct]
    
    return xyz, preamble

def genxyzstring(coords,elementnumber):
    x_str='%10.5f'% coords[0]
    y_str='%10.5f'% coords[1]
    z_str='%10.5f'% -coords[2]
    element=elementnumber
    xyz_string=element+(3-len(element))*' '+10*' '+\ 
    (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
    
    return xyz_string

def getstructures(rawdata,preamble,outfile):
    n=0 
    for structure in rawdata:
        n=n+1
        num="%03d" % (n,)
        g=open(outfile,'w')
        cartesian=[]
    
        for item in structure:
            coordx=filter(None,item.split(' ')) 
            coordy=filter(None,item.split('\t'))
            if len(coordx)>len(coordy):
                coords=coordx
            else:
                coords=coordy
    
            coordinates=[float(coords[1]),float(coords[2]),float(coords[3])]
            element=coords[0]
            cartesian+=[genxyzstring(coordinates,element)]
    
        g.write(str(preamble[0])+'\n')
        g.write(str(preamble[1])+'\n')
        for line in cartesian:
            g.write(line)
        g.close()
        cartesian=[]
    return 0

if __name__ == "__main__":
    infile=sys.argv[1]
    outfile=sys.argv[2]
    xyz,preamble=getrawdata(infile)
    structures=getstructures(xyz,preamble,outfile)


05 September 2013

513. Extracting data from a PES scan with gaussian

There are a few reasons to like gaussian, and many reasons not to. Gaussian is fast, and their whitepapers are great resources for learning computational techniques.

Without going into discussions about the commercial behaviour of Wavefunction inc., the things I don't like about gaussian is the clunky input format (nwchem has a much more readable syntax), the inscrutable error messages, and the unreadable output. Well, it's not unreadable in a literal sense, but it could certainly be clearer. On the other hand, I've having issues with running some of my PES scans in nwchem -- and I can't find a solution (more about that in a later post)

Anyway, here's a python script for extracting optimized structures and energies from a relaxed PES scan in Gaussian 09.

First, an example of a simple scan:
%nprocshared=2 %Chk=methanol.chk #P rB3LYP/6-31g 6D 10F Opt=(modredundant) NoSymm Punch=(MO) Pop=() methanol 0 1 ! charge and multiplicity C 0.0351714 0.00548884 0.0351714 H -0.617781 -0.634073 0.667983 H 0.667983 -0.634073 -0.617781 H -0.605139 0.646470 -0.605139 O 0.839603 0.818768 0.839603 H 1.38912 0.201564 1.38912 1 5 S 10 0.1
And here's the script, pes_parse_g09:
#!/usr/bin/python
import sys

def getrawdata(infile):
        f=open(infile,'r')
        opt=0
        geo=0
        struct=[]
        structure=[]
        energies=[]
        energy=[]
        for line in f:
                
                if opt==1 and geo==1 and not ("---" in line):
                        structure+=[line.rstrip()]
                
                if 'Coordinates (Angstroms)' in line:
                        if opt==0:
                                opt=1
                                structure=[]
                        
                if opt==1 and "--------------------------" in line:
                        if geo==0:
                                geo=1
                        elif geo==1:
                                geo=0
                                opt=0
                if 'SCF Done' in line:
                        energy=filter(None,line.rstrip('\n').split(' '))
                if      'Optimization completed' in line and (opt==0 and geo==0):
                        energies+=[float(energy[4])]
                        opt=0
                        geo=0
                        struct+=[structure]
                        structure=[]
        
        return struct, energies

def periodictable(elementnumber):
        ptable={1:'H',2:'He',\
        3:'Li', 4:'Be',5:'B',6:'C',7:'N',8:'O',9:'F',10:'Ne',\
        11:'Na',12:'Mg',13:'Al',14:'Si',15:'P',16:'S',17:'Cl',18:'Ar',\
        19:'K',20:'Ca',\
        21:'Sc',22:'Ti',23:'V',24:'Cr',25:'Mn',26:'Fe',27:'Co',28:'Ni',29:'Cu',30:'Zn',\
        31:'Ga',32:'Ge',33:'As',34:'Se',35:'Br',36:'Kr',\
        37:'Rb',38:'Sr',\
        39:'Y',40:'Zr',41:'Nb',42:'Mo',43:'Tc',44:'Ru',45:'Rh',46:'Pd',47:'Ag',48:'Cd',\
        49:'In',50:'Sn',51:'Sb',52:'Te',53:'I',54:'Xe',\
        55:'Cs',56:'Ba',\
        57:'La',58:'Ce',59:'Pr',60:'Nd',61:'Pm',62:'Sm',63:'Eu',64:'Gd',65:'Tb',66:'Dy',67:'Ho',68:'Er',69:'Tm',70:'Yb',71:'Lu',\
        72:'Hf', 73:'Ta', 74:'W',75:'Re', 76:'Os', 77:'Ir',78:'Pt', 79:'Au', 80:'Hg',\
        81:'Tl', 82:'Pb', 83:'Bi',84:'Po',85:'At',86:'Rn',\
        87:'Fr',88:'Ra',\
        89:'Ac',90:'Th',91:'Pa',92:'U',93:'Np',94:'Pu',95:'Am',96:'Cm',97:'Bk',98:'Cf',99:'Es',100:'Fm',101:'Md',102:'No',\
        103:'Lr',104:'Rf',105:'Db',106:'Sg',107:'Bh',108:'Hs',109:'Mt',110:'Ds',111:'Rg',112:'Cn',\
        113:'Uut',114:'Fl',115:'Uup',116:'Lv',117:'Uus',118:'Uuo'}
        element=ptable[elementnumber]
        return element

def genxyzstring(coords,elementnumber):
        x_str='%10.5f'% coords[0]
        y_str='%10.5f'% coords[1]
        z_str='%10.5f'% coords[2]
        element=periodictable(int(elementnumber))
        xyz_string=element+(3-len(element))*' '+10*' '+\
        (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
 
        return xyz_string

def getstructures(rawdata):
        
        n=0
        for structure in rawdata:
                
                n=n+1
                num="%03d" % (n,)
                g=open('structure_'+num+'.xyz','w')
                itson=False
                cartesian=[]
                        
                for item in structure:
                        
                        coords=filter(None,item.split(' '))
                        coordinates=[float(coords[3]),float(coords[4]),float(coords[5])]
                        element=coords[1]
                        cartesian+=[genxyzstring(coordinates,element)]
                g.write(str(len(cartesian))+'\n')
                g.write('Structure '+str(n)+'\n')
                for line in cartesian:
                        g.write(line)
                g.close()
                cartesian=[]
        return 0
        
if __name__ == "__main__":
        infile=sys.argv[1]
        rawdata,energies=getrawdata(infile)
        structures=getstructures(rawdata)
        g=open('energies.dat','w')
        for n in range(0,len(energies)):
                g.write(str(n)+'\t'+str(energies[n])+'\n')
        g.close()

And here's what we get from the output:
g09 methanol.in |tee methanol.out
pes_parse_g09 methanol.log
cat structure* > meoh_traj.xyz



And here's a plot of energies.dat:

30 August 2013

506. Extracting optimized structures from a potential energy scan in nwchem

Another update:
It now dumps the energies in a file, energies.dat, as well.

Update:
some programmes, like ecce, are more picky about the xyz format than others (e.g. jmol, vmd). I've updated the code to output xyz files that ecce too can read.

Original post:
When you use scan_input() in nwchem to do a PES scan (see e.g. here: http://verahill.blogspot.com.au/2013/08/503-relaxed-pes-scanning-in-nwchem.html) you get the energies and the gradients for the optimized structures returned as the results. However, for a casual user the atomic actual coordinates is more informative.

Here's a very simple parser written in python (2.7) which extracts the optimized structures from the output file:

#!/usr/bin/python
import sys

def getrawdata(infile):
        f=open(infile,'r')
        opt=0
        geo=0
        energy=[]
        energies=[]
        struct=[]
        structure=[]
        for line in f:
                if "Total DFT" in line:
                        line=filter(None,line.rstrip('\n').split(' '))
                        energy=float(line[4])
                if 'Optimization converged' in line:
                        opt=1
                if opt==1 and 'Geometry' in line:
                        geo=1
                if      'Atomic Mass' in line and (opt==1 and geo==1):
                        opt=0
                        geo=0
                        struct+=[structure]
                        energies+=[energy]
                        structure=[]
                if opt==1 and geo==1:
                        structure+=[line.rstrip()]
        return struct,energies

def genxyzstring(coords,element):
        x_str='%10.5f'% coords[0]
        y_str='%10.5f'% coords[1]
        z_str='%10.5f'% coords[2]
 
        xyz_string=element+(3-len(element))*' '+10*' '+\
        (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
 
        return xyz_string

def getstructures(rawdata):
        
        n=0
        for structure in rawdata:
                
                n=n+1
                num="%03d" % (n,)
                g=open('structure_'+num+'.xyz','w')
                itson=False
                cartesian=[]
                        
                for item in structure:
                        
                        if itson and not(item==""):
                                coords=filter(None,item.split(' '))
                                coordinates=[float(coords[3]),float(coords[4]),float(coords[5])]
                                element=coords[1]
                                cartesian+=[genxyzstring(coordinates,element)]
                                #cartesian+=[coords[1]+'\t'+coords[3]+'\t'+coords[4]+'\t'+coords[5]+'\n']
                
                        if "---" in item:
                                itson=True
                        if item=="" and itson==True:
                                itson=False
                                if not(len(cartesian)==0):
                                        g.write(str(len(cartesian))+'\n')
                                        g.write('Structure '+str(n)+'\n')
                                        for line in cartesian:
                                                g.write(line)
                                        g.close()
                                cartesian=[]
        return 0
        
if __name__ == "__main__":
        infile=sys.argv[1]
        rawdata,energies=getrawdata(infile)
        structures=getstructures(rawdata)

        g=open('energies.dat','w')
        for n in range(0,len(energies)):
                g.write(str(n)+'\t'+str(energies[n])+'\n')
        g.close()


Presuming that you've saved it as pes_parse.py you can then generate a series of xyz files with the structures, catenate them into a trajectory file, and open it in e.g. jmol. I'm using the output from example 1 in http://verahill.blogspot.com.au/2013/08/503-relaxed-pes-scanning-in-nwchem.html as the example:

chmod +x pes_parse.py
./pes_parse.py nwch.nwout
ls
nwch.nwout structure_001.xyz structure_003.xyz structure_005.xyz structure_007.xyz structure_009.xyz structure_011.xyz structure_013.xyz structure_015.xyz structure_017.xyz structure_019.xyz pes_parse.py structure_002.xyz structure_004.xyz structure_006.xyz structure_008.xyz structure_010.xyz structure_012.xyz structure_014.xyz structure_016.xyz structure_018.xyz
cat structure_*.xyz >> trajectory.xyz jmol trajectory.xyz

You can go through the structures by clicking on the arrows indicated by the white arrow:

Finally, using VMD it's easy to make videos -- note that they for some reason look awful here (seems like a lot of frames are removed, in particular from the beginning of the run):

And here's the SN2 reaction from post 503:


28 August 2013

503. (relaxed) PES scanning in Nwchem revisited.

Update 2: The coordinates are actually gradients, and so aren't terribly informative to a casual user like myself. See this post for how to extract the geometries properly: http://verahill.blogspot.com.au/2013/08/506-extracting-optimized-structures.html


Update:
Please note that the coordinates in square brackets ([]) in the python output are not raw coordinates for the atoms in the molecule -- I haven't quite figured out how they scale, but it's not a simple matter of just multiplying. The energies are good though, and you can always extract the coordinates the slow and painful way by manually going through the output.

Another issue which should be stressed is that scan_input(geom,[1.398],[3.398],19,'dft',task_optimize) does not do the end points -- i.e. you won't get the energy for a bond length of 1.398, and you won't get the energy for a bond length of 3.398. Instead you'll get 19 data points in between these. It's a bit...awkward.

Original post:
A long time ago I made a post on doing potential energy surface (PES) scans in nwchem using python.

This is a post giving PES another look. The impetus for the post is that I'm tired of Gaussian failing and being opaque about the whole procedure.

The following page was of great help: http://www.fqt.izt.uam.mx/html/software_fqt/user/node34.html

NOTE: you'll need to compile nwchem with python support. See e.g. http://verahill.blogspot.com.au/2013/06/449-nwchem-63-updated-sources-compiling.html (the post is a bit messy, but persevere -- it's not that difficult)

On Debian the key is to change
    EXTRA_LIBS +=    -lnwcutil  -lpthread -lutil -ldl
to
    EXTRA_LIBS +=    -lnwcutil  -lpthread -lutil -ldl -lssl -lz

in config/makefile.h before compiling. It's not necessary on RHEL clones.

Below I'll show three examples:
* a simple bond dissociation reaction. I also discuss the use of 'constant', and task_energy vs task_optimize.
* an SN2 reaction (CH3Br + I-)
* a 2D/parallel PES scan of ethane ( C-C bond length, H-C-C angle). I also show constant vs free variables.


Example 1.
Breaking the C-O bond in methanol

I set this up in ecce (see e.g. next example), but you don't have to. The input file I used was the following:
scratch_dir /scratch Title "meoh_pes" Start meoh_pes echo charge 0 geometry autosym units angstrom C 0.0351714 0.00548884 0.0351714 H -0.617781 -0.634073 0.667983 H 0.667983 -0.634073 -0.617781 H -0.605139 0.646470 -0.605139 O 0.839603 0.818768 0.839603 H 1.38912 0.201564 1.38912 end ecce_print ecce.out basis "ao basis" cartesian print H library "3-21G" O library "3-21G" C library "3-21G" END dft mult 1 direct XC b3lyp grid fine iterations 99 mulliken end driver default maxiter 888 end python from nwgeom import * geom = ''' geometry adjust zcoord bond 1 5 %f cccc constant end end ''' results=scan_input(geom,[1.398],[3.398],19,'dft',task_optimize) for i in range(0,len(results)): print results[i][0][0],results[i][1] end task python
The PES bit is highlighted in blue. Note the 'constant' keyword -- if you omit that the bond length will initially be set to whatever you define it to in your scan, but it can relax back to the optimum length. If you DO set 'constant' everything BUT that bond will be relaxed. Most likely this is what you will want to do.

Also note that a constrained (i.e. not relaxed) PES scan can be done by doing task_energy instead of task_optimize.

ECCE can't quite handle the textual output (alt+O) since there are lines that are too long. The output is properly written though -- you'll just have to look in the Output folder of the job. The ecce.out file works fine though.

The job takes 90-100 seconds on an old 3-core node (AMD Athlon II X3).


The very end of the output file has all the results, but in a non-obvious way:
1.498 (-115.07289914310056, [-0.00130778291169336, 0.01798903956433226, 0.0, -4.009155466250247e-05, 1.693340302064139e-05, -6.637550254401381e-06, -4.009155466250247e-05, 1.693340302064139e-05, 6.637550254401381e-06, 2. 4514244186701895e-05, -1.5885649893555842e-05, 0.0, 0.0012636893525275195, -0.018041103298149008, 0.0, 9.97624242821682e-05, 3.4082577691996185e-05, 0.0]) (-114.8737952986994, [-4.7287277448850376e-05, 0.030029200359777717, 0.0, -1.3711175166353229e-06, -8.452926738775068e-08, 9.941241931599176e-07, -1.3711175166353229e-06, -8.452926738775068e-08, -9.941241931599176e-07, 8. 167348279908282e-07, -2.5820569179275075e-06, 0.0, 4.871429991895604e-05, -0.030027845123621805, 0.0, 4.984777179639632e-07, 1.3958792967685985e-06, 0.0]) 1.498 (-115.07289914310056, [-0.00130778291169336, 0.01798903956433226, 0.0, -4.009155466250247e-05, 1.693340302064139e-05, -6.637550254401381e-06, -4.009155466250247e-05, 1.693340302064139e-05, 6.637550254401381e-06, 2. 4514244186701895e-05, -1.5885649893555842e-05, 0.0, 0.0012636893525275195, -0.018041103298149008, 0.0, [..] 3.198 (-114.87977711993531, [-0.00018979360652668711, 0.033296276783081655, 0.0, -2.3787379704320877e-06, 1.7510009376556918e-06, 1.3530564600128248e-06, -2.3787379704320877e-06, 1.7510009376556918e-06, -1.3530564600128248e-06, 8. 24207064487048e-06, -8.055936327900498e-07, 0.0, 0.00018027576986845428, -0.03329589479259992, 0.0, 6.033241931824307e-06, -3.0783987173960137e-06, 0.0]) 3.298 (-114.8737952986994, [-4.7287277448850376e-05, 0.030029200359777717, 0.0, -1.3711175166353229e-06, -8.452926738775068e-08, 9.941241931599176e-07, -1.3711175166353229e-06, -8.452926738775068e-08, -9.941241931599176e-07, 8. 167348279908282e-07, -2.5820569179275075e-06, 0.0, 4.871429991895604e-05, -0.030027845123621805, 0.0, 4.984777179639632e-07, 1.3958792967685985e-06, 0.0])
All in all, there are 58 lines for 19 steps. I think that there are three things happening -- firstly, the line in blue is the output from the 19th step, and that somehow gets mixed in with the output from all the calculations. Secondly, the structure and energy of each step is reported twice at a time. Thirdly, the optimised structures/energies are reported one more time by injecting them into the output, like this:
A
S
A
B
B
C
C
D
D
A
E
E
B

where A is the first step, S is the 19th step etc. This way you get 19x3+1=58 lines. This is clearly idiotic.

Instead, you can look through the output and search for 'Scanning NWChem input - results from step' to see all the output for the optimised structures one by one:
Scanning NWChem input - results from step 2 (-115.06618436935011, [-0.0038228970733096973, 0.050051062094932305, 0.0, 2.9196769046224702e-05, -6.928661348853948e-06, 4.746536668570611e-06, 2.9196769046224702e-05, -6.928661348853948e-06, -4.746536668570611e-06, -1.0103262985700079e-05, 1.6491089715894858e-05, 0.0, 0.003767244388907326, -0.05005618579508188, 0.0, 7.362409274846993e-06, 2.489933151654522e-06, 0.0])
In this particular case I can grep my way through by doing
cat nwch.nwout |grep '^(-'|cat -n
1 (-115.07289914310056, [-0.00130778291169336, 0.01798903956433226, 0.0, -4.009155466250247e-05, 1.693340302064139e-05, -6.637550254401381e-06, -4.009155466250247e-05, 1.693340302064139e-05, 6.637550254401381e-06, 2.4514244186701895e-05, -1.5885649893555842e-05, 0.0, 0.0012636893525275195, -0.018041103298149008, 0.0, 9.97624242821682e-05, 3.4082577691996185e-05, 0.0]) 2 (-115.06618436935011, [-0.0038228970733096973, 0.050051062094932305, 0.0, 2.9196769046224702e-05, -6.928661348853948e-06, 4.746536668570611e-06, 2.9196769046224702e-05, -6.928661348853948e-06, -4.746536668570611e-06, -1.0103262985700079e-05, 1.6491089715894858e-05, 0.0, 0.003767244388907326, -0.05005618579508188, 0.0, 7.362409274846993e-06, 2.489933151654522e-06, 0.0]) 3 (-115.05478103866017, [-0.005033784212299788, 0.06848598587431667, 0.0, -1.3396548676491982e-06, -2.5875637174599397e-08, -5.261746410523127e-07, -1.3396548676491982e-06, -2.5875637174599397e-08, 5.261746410523127e-07, 1.4459720645843e-07, -2.8328952926398587e-06, 0.0, 0.005034455335082233, -0.0684825786855032, 0.0, 1.8635897582608418e-06, -5.225422206114883e-07, 0.0]) 4 (-115.04079235517, [-0.005485543277166251, 0.07798880362126945, 0.0, 4.745460307237215e-06, -5.597510268573469e-06, 5.645418744981701e-07, 4.745460307237215e-06, -5.597510268573469e-06, -5.645418744981701e-07, -6.651712157745848e-07, 6.750842351778419e-06, 0.0, 0.00548062073181968, -0.07798086728839469, 0.0, -3.903204054994669e-06, -3.4921546817404114e-06, 0.0]) 5 (-115.02560006674966, [-0.0054233976595857575, 0.08166232318137269, 0.0, -1.659239761503395e-06, -4.376603580866223e-07, 4.4580035316599265e-06, -1.659239761503395e-06, -4.376603580866223e-07, -4.4580035316599265e-06, 3.034808945895362e-06, -6.726118036586015e-06, 0.0, 0.005436665955901393, -0.08164730868562775, 0.0, -1.2984625724410392e-05, -7.4130570159938736e-06, 0.0]) [..] 16 (-114.89364787840326, [-0.0005591249462735259, 0.04018795560035916, 0.0, -5.34666220519675e-07, 1.1370871814235517e-06, 4.809133242467123e-07, -5.34666220519675e-07, 1.1370871814235517e-06, -4.809133242467123e-07, -6.9140095421138525e-06, -3.095664552260277e-06, 0.0, 0.0005695756951453745, -0.040185884820554796, 0.0, -2.467406898132296e-06, -1.2492896190128416e-06, 0.0]) 17 (-114.8863872514371, [-0.00036666056940981573, 0.03667976502852128, 0.0, 2.9101399354747315e-06, -2.094045026924257e-06, -4.933288234976185e-06, 2.9101399354747315e-06, -2.094045026924257e-06, 4.933288234976185e-06, 1.6531622304416516e-07, 1.511517903679191e-07, 0.0, 0.00036162347288279384, -0.03668602744257765, 0.0, -9.484995716624312e-07, 1.0299352320775057e-05, 0.0]) 18 (-114.87977711993531, [-0.00018979360652668711, 0.033296276783081655, 0.0, -2.3787379704320877e-06, 1.7510009376556918e-06, 1.3530564600128248e-06, -2.3787379704320877e-06, 1.7510009376556918e-06, -1.3530564600128248e-06, 8.24207064487048e-06, -8.055936327900498e-07, 0.0, 0.00018027576986845428, -0.03329589479259992, 0.0, 6.033241931824307e-06, -3.0783987173960137e-06, 0.0]) 19 (-114.8737952986994, [-4.7287277448850376e-05, 0.030029200359777717, 0.0, -1.3711175166353229e-06, -8.452926738775068e-08, 9.941241931599176e-07, -1.3711175166353229e-06, -8.452926738775068e-08, -9.941241931599176e-07, 8.167348279908282e-07, -2.5820569179275075e-06, 0.0, 4.871429991895604e-05, -0.030027845123621805, 0.0, 4.984777179639632e-07, 1.3958792967685985e-06, 0.0])
Not pretty, but manageable.
cat nwch.nwout |grep '^(-'|sed 's/\,/\t/g;s/(\([^)]*\))/\1/g'|cat -n|gawk '{print $1,$2}' > profile.dat

and then plot it:


Example 2.
SN2 reaction between iodide and bromomethane

You can set up your calc however you want, but ECCE is easier than anything else.

Draw bromomethane, then throw in an iodine atom. Adjust the angle across Br-C-I to 180 degrees, and set the C to I distance to 3 Ã….


Set up the calculation -- in this case I used b3lyp/def2-svp
Edit the input and add
python from nwgeom import * geom = ''' geometry adjust zcoord bond 1 6 %f cccc constant end end ''' results=scan_input(geom,[3.00],[1.5],20,'dft',task_optimize) for i in range(0,len(results)): print results[i][0][0],results[i][1] end task python

(Delete 'task dft optimize')

You'll now have the following input file:
scratch_dir /scratch
Title "sn2_br"

Start  sn2_br

echo

charge -1

geometry noautosym units angstrom
 C     0.00000     0.00000     0.00000
 H     -0.675500     -0.675500     0.675500
 H     0.675500     -0.675500     -0.675500
 H     -0.675500     0.675500     -0.675500
 Br     1.10274     1.10274     1.10274
 I     -1.73205     -1.73205     -1.73205
end

ecce_print ecce.out

basis "ao basis" spherical print
  H library "def2-svpd"
  Br library "def2-svpd"
  C library "def2-svpd"
  I library "def2-svpd"
END
ECP
  I library "def2-ecp"
END

dft
  mult 1
  direct
  XC b3lyp
  grid fine
  iterations 99
  mulliken
end

driver
  default
  maxiter 99
end


python
from nwgeom import *
geom = '''
    geometry adjust
        zcoord
            bond 1 6 %f cccc constant
        end
    end
'''
results=scan_input(geom,[3.00],[1.5],20,'dft',task_optimize)
for i in range(0,len(results)):
    print results[i][0][0],results[i][1]
end


task python
Launch it and wait...eventually (2h 30 min on a slow three-core node) you'll get an output like the one below. Note that I didn't pre-optimise the bromomethane, so there's a bit of a drop in energy at the beginning. Likewise, I let the C-I distance get so short that the energy is rising rapidly at the end
Structure at the beginning

Transition-state-ish structure

Product


Example 3:
Two-dimensional PES scan

I'll keep this brief. First we do a scan where we use 'constant' for the angle, but not the bond length:
scratch_dir //scratch Title "2d_pes-1" Start 2d_pes-1 echo charge 0 geometry noautosym units angstrom C -2.51242e-66 1.67495e-66 -0.767732 H -0.722530 0.722530 -1.16548 H -0.264464 -0.986995 -1.16548 H 0.986995 0.264464 -1.16548 C 2.51242e-66 -2.51242e-66 0.767732 H 0.264464 0.986995 1.16548 H -0.986995 -0.264464 1.16548 H 0.722530 -0.722530 1.16548 end ecce_print ecce.out basis "ao basis" cartesian print H library "6-31G" C library "6-31G" END dft mult 1 direct XC b3lyp grid fine iterations 99 mulliken end driver default end python from pes_scan import pes_scan geom = ''' geometry noprint adjust zcoord bond 1 5 %f cc angle 2 1 5 %f hcc constant end end ''' results = pes_scan(geom, \ [1.535, 111.269], [1.800, 90], 5, 'dft', task_optimize) end task python

And the output:
What's happening is that the bond length ends up being the same no matter what we initially set it to

If we instead set constant for the bond as well:
python from pes_scan import pes_scan geom = ''' geometry noprint adjust zcoord bond 1 5 %f cc constant angle 2 1 5 %f hcc constant end end ''' results = pes_scan(geom, \ [1.535, 111.269], [1.800, 90], 5, 'dft', task_optimize) end task python

And we get:

10 July 2013

473. Programming a Metrohm Titrino -- not a how-to, just a ramble

Many, many years ago I learned basic programming using BASIC (the version that came with PC DOS 5, I think). I even wrote the odd game, but it was all pretty awful. A few years later I learned Turbo Pascal, which was a fantastic experience compared to Basic. It felt all sciency and grown up, and it was my first experience with a real IDE. I even ended up buying a TP book, and became somewhat proficient. This must've been when I was around 18-19.I then stopped programming completely.

At around 30 years of age I decided it was time to get serious about programming again -- I was doing mass spectrometry and needed a simple program that could generate a series of solutions to the identity of a mass/charge ratio given a range of elements. I probably had a quick look at C and C++, but ended up getting a Python book and have been happy Python programmer ever since.

The problem is that I've never been a /good/ python programmer -- and in all these years I've never fully understood the use for (or, in all fairness, use OF) OOP. And at the moment it seems to be holding me back -- all the examples that I find of the use the threading module as well as writing GUIs (using e.g. wxPython) involve using classes. And I just don't understand them well enough to sort out what I need done.

Anyway, long story short: I've written a basic program for communicating with a Metrohm Titrino 736 GP via RS 232. It's found here: https://sourceforge.net/projects/pytitrino/

Currently:
* the code is a mess (see above)
* it works fine for doing monotonic and dynamic end point titrations (MET and DET)
* it saves data to a file, but does so silently (i.e. when you run you won't get any feedback that things are working properly...)
* it uses the thread (not threading) module
* I've managed to pass parameters back and forth between the thread and the main loop using Queue

There are probably much better solutions. One day I hope to be able to stick a GUI on top of it, but the more I look at it I get the impression that one writes the GUI first, then the engine...not that I'd know.

Anyway. That's what I've been up to. Anyone with a bit of programming experience, whom is in possession of an old-school Titrino (i.e. using RS 232) and wants to save $1.5k in software licenses may be interested in taking the sources and turning them into something useful.


28 June 2013

466. morph xyz -- python script to morph .xyz files

Rather naively I was hoping that by comparing two  molecule .xyz files and generating an average of them I would be able to conveniently generate a half-decent transition state guess.

Turns out that it's not quite as simple. However, I've written the software, so I might as well share it.

Note that it's written in python 2.7 (i.e. not python 3)

Run the script without arguments for help. General usage is
morphxyz -i 1.xyz 2.xyz -o morph.xyz


So here it is:

morphxyz:
#!/usr/bin/python

import sys

def getvars(arguments):
 switches={}

 version='0.1'
 
 try:
  if "-i" in arguments:
   switches['in_one']=arguments[arguments.index('-i')+1]
   switches['in_two']=arguments[arguments.index('-i')+2]
   print 'Input: %s and %s'% (switches['in_one'],switches['in_two'])
  else:
   arguments="--help";
 except:
  arguments="--help";
  
 try:
  if "-o" in arguments:
   switches['o']=arguments[arguments.index('-o')+1].lower()
   print 'Output: %s'% switches['o']
  else:
   arguments="--help";
 except:
  arguments="--help";

 try:
  if "-w" in arguments:
   switches['w']=float(arguments[arguments.index('-w')+1])
   print 'Weighting: %i'% switches['w']
  else:
   print 'Assuming no weighting'
   switches['w']=1.0;
 except:
  switches['w']=1.0;

 doexit=0
 try:
  if ("-h" in arguments) or ("--help" in arguments):
   print '\t\t bytes2words version %s' % version
   print '\t-i\t two xyz files to morph'
   print '\t-o\t output file'
   print '\t-w\t weight one structure vs the other (1=average; 0=start; 2=end)'
   print 'Exiting'
   doexit=1
 except:
  a=0 # do nothing
 if doexit==1:
  sys.exit(0)

 return switches

def getcmpds(switches):
 
 cmpds={}
 
 g=open(switches['in_one'],'r') 
 n=0
 xyz=[]
 atoms=[]
 
 for line in g:
  n+=1
  line=line.rstrip('\n')
  if n==1:
   cmpds['atoms_one']=int(line)
  elif n==2:
   cmpds['title_one']=line
  else:
   line=line.split(' ')
   line=filter(None,line)
   xyz+=[[float(line[1]),float(line[2]),float(line[3])]]
   atoms+=[line[0].capitalize()]
 cmpds['coords_one']=xyz
 cmpds['elements_one']=atoms
 
 g.close
 
 g=open(switches['in_two'],'r') 
 n=0
 xyz=[]
 atoms=[]
 
 for line in g:
  n+=1
  line=line.rstrip('\n')
  if n==1:
   cmpds['atoms_two']=int(line)
  elif n==2:
   cmpds['title_two']=line
  else:
   line=line.split(' ')
   line=filter(None,line)
   xyz+=[[float(line[1]),float(line[2]),float(line[3])]]
   atoms+=[line[0].capitalize()]
 cmpds['coords_two']=xyz
 cmpds['elements_two']=atoms
 g.close
 
 cmpds['w']=switches['w']
 
 return cmpds

def morph(cmpds):
 coords_one=cmpds['coords_one']
 coords_two=cmpds['coords_two']
 
 coords_morph=[]
 coords_diff=[]
 for n in range(0,cmpds['atoms_one']):
  morph_x=coords_one[n][0]+cmpds['w']*(coords_two[n][0]-coords_one[n][0])/2.0
  morph_y=coords_one[n][1]+cmpds['w']*(coords_two[n][1]-coords_one[n][1])/2.0
  morph_z=coords_one[n][2]+cmpds['w']*(coords_two[n][2]-coords_one[n][2])/2.0
  diff_x=coords_two[n][0]-coords_one[n][0]
  diff_y=coords_two[n][1]-coords_one[n][1]
  diff_z=coords_two[n][2]-coords_one[n][2]
  coords_morph+=[[morph_x,morph_y,morph_z]]
  coords_diff+=[[diff_x,diff_y,diff_z]]
 cmpds['coords_morph']=coords_morph
 cmpds['coords_diff']=coords_diff
 return cmpds

def genxyzstring(coords,element):
 x_str='%10.5f'% coords[0]
 y_str='%10.5f'% coords[1]
 z_str='%10.5f'% coords[2]
 
 xyz_string=element+(3-len(element))*' '+10*' '+\
 (8-len(x_str))*' '+x_str+10*' '+(8-len(y_str))*' '+y_str+10*' '+(8-len(z_str))*' '+z_str+'\n'
 
 return xyz_string

def writemorph(cmpds,outfile):
 g=open(outfile,'w') 
 h=open('diff.xyz','w')
 g.write(str(cmpds['atoms_one'])+'\n'+'\n')
 h.write(str(cmpds['atoms_one'])+'\n'+'\n')
 
 for n in range(0,cmpds['atoms_one']):
  coords=cmpds['coords_morph'][n]
  diffcoords=cmpds['coords_diff'][n]
  
  g.write(genxyzstring(coords, cmpds['elements_one'][n]))
  h.write(genxyzstring(diffcoords, cmpds['elements_one'][n]))
    
 g.close
 h.close
 return 0

if __name__=="__main__":
 arguments=sys.argv[1:len(sys.argv)]
 switches=getvars(arguments)
 cmpds=getcmpds(switches)
 
 if cmpds['atoms_one']!=cmpds['atoms_two']:
  print 'The number of atoms differ. Exiting'
  sys.exit(1)
 elif cmpds['elements_one']!=cmpds['elements_two']:
  print 'The types of atoms differ. Exiting'
  sys.exit(1)
  
 cmpds=morph(cmpds)
 success=writemorph(cmpds,switches['o'])
 if success==0:
  print 'Conversion seems successful'
 


26 June 2013

464. bytes2words -- python script

This script does something I could easily do myself in e.g. bc, and so is a bit of a waste of time. However, because I enjoy writing short python scripts, I did it anyway.

Usage:
/bytes2words -i 8gib -o MW
Input: 8gib Output: mw Assuming 64 bit word size Result: 8 gib is 1073 mw
bytes2words:
#!/usr/bin/python
# Converts bytes to words
# The impetus comes from the use of MW as the default memory unit in many computational pieces of software.

import sys

def split_text(s): # from http://stackoverflow.com/questions/12409894
    from itertools import groupby
    for k,g in groupby(s, str.isalpha):
        yield ''.join(list(g))

def getargs(arguments):
 switches={}

 version='0.1'
 
 try:
  if "-i" in arguments:
   switches['i']=arguments[arguments.index('-i')+1]
   print 'Input: %s'% switches['i']

   inputted=list(split_text(switches['i']))
   
   if len(inputted)>1:
    switches['i']=inputted[0]
    switches['unit']=inputted[len(inputted)-1].lower()
    
    if switches['unit'] == 'kib':
     switches['if']=1024
    elif switches['unit'] == 'kb':
     switches['if']=1000
    elif switches['unit'] == 'mb':
     switches['if']=1000*1000
    elif switches['unit'] == 'mib':
     switches['if']=1024*1024
    elif switches['unit'] == 'gb':
     switches['if']=1000*1000*1000
    elif switches['unit'] == 'gib':
     switches['if']=1024*1024*1024
   else:
    switches['if']=1
   
   if len(inputted)>2:
    print 'Illegal input: %s'% inputted
    arguments="--help" 
  else:
   arguments="--help";
 except:
  arguments="--help";
  
 try:
  if "-o" in arguments:
   switches['o']=arguments[arguments.index('-o')+1].lower()
   print 'Output: %s'% switches['o']
   if switches['o'] == 'w':
    switches['of']=1
   elif switches['o'] == 'kiw':
    switches['of']=1024
   elif switches['o'] == 'kw':
    switches['of']=1000
   elif switches['o'] == 'mw':
    switches['of']=1000*1000
   elif switches['o'] == 'miw':
    switches['of']=1024*1024
   elif switches['o'] == 'gw':
    switches['of']=1000*1000*1000
   elif switches['o'] == 'giw':
    switches['of']=1024*1024*1024
   else:
    print 'illegal output argument' % switches['o']
  else:
   arguments="--help";
 except:
  arguments="--help";
 version='0.1'

 try:
  if "-b" in arguments:
   switches['b']=int(arguments[arguments.index('-b')+1])
   print 'Word size: %i bits'% switches['b']
  else:
   print 'Assuming 64 bit word size'
   switches['b']=64;
 except:
  switches['b']=64;

 doexit=0
 try:
  if ("-h" in arguments) or ("--help" in arguments):
   print '\t\t bytes2words version %s' % version
   print ' \t-i\t input in words with units, e.g. 200kb'
   print ' \t-o\t output unit (w,kw,mw,gw)'
   print ' \t-b\t word size (32 or 64 (bit))'
   print 'Exiting'
   doexit=1
 except:
  a=0 # do nothing
 if doexit==1:
  sys.exit(0)
 return switches
 
if __name__=="__main__":
 bitsperbyte=8
 arguments=sys.argv[1:len(sys.argv)]
 switches=getargs(arguments)
 
 print 'Result: %s %s  is %s %s' % ( (switches['i']),switches['unit'],int(switches['if']*(float(switches['i'])/float(switches['b']/bitsperbyte))/float(switches['of'])),switches['o'])
 


17 June 2013

456. Adding NWChem basis sets to ECCE. Part 2. A solution: nwchem2ecce.py

UPDATED!

I've moved the finished scripts to here:
https://sourceforge.net/projects/nwbas2ecce/

They work! I've also added a number of converted basis sets to the sourceforge repo under 'examples'. You'll also find example ecp and ECPOrbital files.

Phew...

Here's the README:
The programmes are not 'intelligent' -- they won't check that you are doing something reasonable. Bad input = bad output. __Installation__: Download eccepag and nwbas2ecce They are both python (2.7) programmes, so you will need to install python to run them. On linux, this is normally very easy. E.g. on debian, run 'sudo apt-get install python2.7' and you are done. If you want, you can put the files in /usr/local/bin and do 'sudo chmod +x /usr/local/bin/eccepage' 'sudo chmod +x /usr/local/bin/nwbas2ecce' and you will be able to call the scripts from any directory. __Usage__ nwbas2ecce can turn a full basis set, or a, ECP basis set, into an ECCE compatible set of basis set files. Typically, an nwchem basis set consists of a single file, e.g. 3-21g. It can also be divided into several files, e.g. def2-svp and def-ecp, where the effective core potentials (ecps) are in def2-ecp. Other basis set files, like lanl2dz_ecp, contains both the orbital and the contraction parts. Typically, a ECCE basis set suite consists of: basis.BAS basis.BAS.meta basis.POT (for ECP) basis.POT.meta (for ECP) Sometimes polarization and diffuse functions are separated from the main .BAS file. E.g. 3-21++G* consists of 3-21G.BAS 3-21GS.BAS POPLDIFF.BAS , in addition to the meta files. The meta files are just markup-language type files with e.g. references. Note that you don't HAVE to break up the basis set components like this. Since the basis set data can be broken up into smaller files, the overall basis set is defined as an entry in a category file. For example, 3-21G is defined in the category file 'pople', and points to 3-21G.BAS. 3-21G* is also defined in pople, but point to both 3-21G.BAS and 3-21GS.BAS. ECP works in a similar way, by combining a .BAS and a .POT file. Note that the .POT files look different from the .BAS files. nwbas2ecce generates .BAS and .POT files based on whether there are basis/end or ecp/end sections in the nwchem basis set file. If there are both, both POT and BAS files are generated. All these files are contained in server/data/Ecce/system/GaussianBasisSetLibrary Finally, you need to generate .pag and .dir files that go into the server/data/Ecce/system/GaussianBasisSetLibrary/.DAV directory. The .dir file is always empty, while the .pag file is unfortunately a binary file. eccepag can, however, generate it with the right input. See e.g. http://verahill.blogspot.com.au/2013/06/455-adding-nwchem-basis-sets-to-ecce.html for more detailed information __Example__ We'll use def2-svp as an example. The nwchem basis set file def2-svp contains the basis set, while def2-ecp contains the core potentials. Use def2-svp to generate DEF2_SVP.BAS, DEF2_SVP.BAS.meta. Use def2-ecp to generate DEF2_ECP.POT, DEF2_ECP.POT.meta. As part of the generation, .descriptor files are also generated. These contain information that should go into the category file(s). Then generate the .pag files for both the POT and the BAS files, and touch the .dir files into existence. Do like this: nwbas2ecce -i def2-svp -o DEF2_SVP.BAS -n 'def2-svp' nwbas2ecce -i def2-ecp -p DEF2_ECP.POT -n 'def2-ecp' eccepag -n def2-svp -t ECPOrbital -c ORBITAL -y Segmented -s Y -o DEF2_SVP.BAS.pag eccepag -n def2-ecp -t ecp -c AUXILIARY -o DEF2_ECP.POT.pag NOTE: I don't actually know if def2-svp is segmented, and spherical. I don't think it matters for the .pag file generation. Also note that most inputs are case sensitive. Look at a similar .pag file for hints. You now have the following files: DEF2_ECP.POT DEF2_ECP.POT.descriptor DEF2_ECP.POT.meta DEF2_ECP.POT.pag DEF2_SVP.BAS DEF2_SVP.BAS.descriptor DEF2_SVP.BAS.meta DEF2_SVP.BAS.pag Copy the files. Note that you need to select the correct target directory, and that will vary with where you installed ECCE. I'll assume it's in /opt/ecce cp DEF2* /opt/ecce/server/data/Ecce/system/GaussianBasisSetLibrary cd /opt/ecce/server/data/Ecce/system/GaussianBasisSetLibrary mv *.pag .DAV/ touch .DAV/DEF2_SVP.BAS.dir .DAV/DEF2_ECP.POT.dir cat DEF2_SVP.BAS.descriptor >> ECPOrbital cat DEF2_ECP.POT.descriptor >> ECPOrbital cat DEF2_ECP.POT.descriptor >> ecp Edit ECPOrbital so that it reads: name= def2-svp files= DEF2_SVP.BAS DEF2_ECP.POT atoms= H He Li Be B C N O F Ne Na Mg Al Si P S Cl Ar K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe Cs Ba La Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn atoms= Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe Cs Ba La Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn
/pre>

08 October 2012

252. Molecular weight calculator in python

Here's the molecular weight part of the isotopic pattern calculator in a previous post.

Most people won't need a full molecular weight calculator with plotting of isotopic composition, so I'm publishing the molecular weight part as a separate program.

The actual algorithm is fairly simple and is more or less contained in the formulaExpander function below. It looks messy because of the definition of the PeriodicTable dictionary at the beginning, but it's simple.

Copy the code, past it into a file (call it e.g. molcalc), put it in e.g. /usr/local/bin and chmod +x it.

Usage:
molcalc 'Mg2(PO4)3'
returns
The mass of Mg2P3O12 is 333.524247 and the calculated charge is -5.
The charge is based on my default oxidation states -- depending on what kind of chemistry you do the oxidation states you encounter are likely to differ.

#!/usr/bin/python2.7
#########################################################################
# Principal author of current version: Me
# Isotopic abundances and masses were copied from Wsearch32.
#
#
# Dependencies:
# To be honest I'm not quite certain. At a minimum you will need python2.7,
# python-numpy
#
#########################################################################

import re #for regular expressions
import sys
from numpy import matrix,transpose # for molw calc
try:
 molecules=sys.argv[1]
except:
 quit()

#slowly changed to IUPAC 1997 isotopic compositions and IUPAC 2007 masses
# see http://pac.iupac.org/publications/pac/pdf/1998/pdf/7001x0217.pdf for
# natural variations in isotopic composition
PeriodicTable ={
   'H':[1,1,[1.0078250321,2.0141017780],[0.999885,0.0001157]], # iupac '97 in water
   'He':[2,0,[3.0160293097,4.0026032497],[0.00000137,0.99999863]], # iupac iso '97
   'Li':[3,1,[6.0151233,7.0160040],[0.0759,0.9241]], # iupac '97
   'Be':[4,2,[9.0121821],[1.0]], # iupac '97
   'B':[5,3,[10.0129370,11.0093055],[0.199,0.801]], # iupac' 97
                        'C':[6,4,[12.0,13.0033548378],[0.9893,0.0107]], # iupac '97
                        'N':[7,5,[14.0030740052,15.0001088984],[0.99632,0.00368]], # iupac '97
                        'O':[8,-2,[15.9949146221,16.99913150,17.9991604],[0.99757,0.00038,0.00205]], # iupac '97
                        'F':[9,-1,[18.99840320],[1.0]], # iupac '97
                        'Ne':[10,0,[19.9924401759,20.99384674,21.99138551],[0.9048,0.0027,0.0925]], # iupac '97 in air
                        'Na':[11,1,[22.98976967],[1.0]], #iupac '97
                        'Mg':[12,2,[23.98504190,24.98583702,25.98259304],[0.7899,0.10,0.1101]], #iupac '97
                        'Al':[13,3,[26.98153844],[1.0]], #iupac '97
                        'Si':[14,4,[27.9769265327,28.97649472,29.97377022],[0.92297,0.046832,0.030872]],#iupac '97
                        'P':[15,5,[30.97376151],[1.0]], #iupac '97
                        'S':[16,-2,[31.97207069,32.97145850,33.96786683,35.96708088],[0.9493,0.0076,0.0429,0.0002]], #iupac '97
                        'Cl':[17,-1,[34.96885271,36.96590260],[0.7578,0.2422]], #iupac '97
                        'Ar':[18,0,[35.96754628,37.9627322,39.962383123],[0.003365,0.000632,0.996003]],#iupac '97 in air
                        'K':[19,1,[38.9637069,39.96399867,40.96182597],[0.932581,0.000117,0.067302]], #iupac '97
                        'Ca':[20,2,[39.9625912,41.9586183,42.9587668,43.9554811,45.9536928,47.952534],[0.96941,0.00647,0.00135,0.02086,0.00004,0.00187]], #iupac '97
                        'Sc':[21,3,[44.9559102],[1.0]], #iupac '97
                        'Ti':[22,4,[45.9526295,46.9517638,47.9479471,48.9478708,49.9447921],[0.0825,0.0744,0.7372,0.0541,0.0518]], #iupac '97
                        'V':[23,5,[49.9471628,50.9439637],[0.00250,0.99750]], #iupac '97
                        'Cr':[24,2,[49.9460496,51.9405119,52.9406538,53.9388849],[0.04345,0.83789,0.09501,0.02365]], #iupac '97
                        'Mn':[25,2,[54.9380496],[1.0]], #iupac '97
                        'Fe':[26,3,[53.9396148,55.9349421,56.9353987,57.9332805],[0.05845,0.91754,0.02119,0.00282]], #iupac '97
                        'Ni':[27,3,[57.9353479,59.9307906,60.9310604,61.9283488,63.9279696],[0.680769,0.262231,0.011399,0.036345,0.009256]], #iupac '97
                        'Co':[28,2,[58.933195],[1.0]], #iupac '97
                        'Cu':[29,2,[62.9296011,64.9277937],[0.6917,0.3083]], #iupac '97
                        'Zn':[30,2,[63.9291466,65.9260368,66.9271309,67.9248476,69.925325],[0.4863,0.2790,0.0410,0.1875,0.0062]], #iupac '97
                        'Ga':[31,3,[68.925581,70.9247050],[0.60108,0.39892]], #iupac '97
                        'Ge':[32,2,[69.9242504,71.9220762,72.9234594,73.9211782,75.9214027],[0.2084,0.2754,0.0773,0.3628,0.0761]], #iupac '97
                        'As':[33,3,[74.9215964],[1.0]], #iupac '97
                        'Se':[34,4,[73.9224766,75.9192141,76.9199146,77.9173095,79.9165218,81.9167000],[0.0089,0.0937,0.0763,0.2377,0.4961,0.0873]], #iupac '97
                        'Br':[35,-1,[78.9183376,80.916291],[0.5069,0.4931]],#iupac '97
                        'Kr':[36,0,[77.920386,79.916378,81.9134846,82.914136,83.911507,85.9106103],[0.0035,0.0228,0.1158,0.1149,0.5700,0.1730]], #iupac '97 in air
                        'Rb':[37,1,[84.9117893,86.9091835],[0.7217,0.2783]], #iupac '97
                        'Sr':[38,2,[83.913425,85.9092624,86.9088793,87.9056143],[0.0056,0.0986,0.0700,0.8258]], #iupac '97
                        'Y': [39,3,[88.9058479],[1.0]], #iupac '97
                        'Zr': [40,4,[89.9047037,90.9056450,91.9050401,93.9063158,95.908276],[0.5145,0.1122,0.1715,0.1738,0.0280]],#iupac '97
                        'Nb':[41,5,[92.9063775],[1.0]], #iupac '97
                        'Mo':[42,6,[91.906810,93.9050876,94.9058415,95.9046789,96.9060210,97.9054078,99.907477],[0.1484,0.0925,0.1592,0.1668,0.0955,0.2413,0.0963]], #checked, iupac '97
                        'Tc': [43,2,[96.906365,97.907216,98.9062546],[1.0]], #no natural abundance
                        'Ru': [44,3,[95.907598,97.905287,98.9059393,99.9042197,100.9055822,101.9043495,103.905430],[0.0554,0.0187,0.1276,0.1260,0.1706,0.3155,0.1862]], #iupac '97
                        'Rh':[45,2,[102.905504],[1.0]], #iupac '97
                        'Pd':[46,2,[101.905608,103.904035,104.905084,105.903483,107.903894,109.905152],[0.0102,0.1114,0.2233,0.2733,0.2646,0.1172]], #iupac '97
                        'Ag':[47,1,[106.905093,108.904756],[0.51839,0.48161]], #iupac '97
                        'Cd':[48,2,[105.906458,107.904183,109.903006,110.904182,111.9027572,112.9044009,113.9033581,115.904755],[0.0125,0.0089,0.1249,0.1280,0.2413,0.1222,0.2873,0.0749]],#iupac '97
                        'In':[49,3,[112.904061,114.903878],[0.0429,0.9571]], #iupac '97
                        'Sn':[50,4,[111.904821,113.902782,114.903346,115.901744,116.902954,117.901606,118.903309,119.9021966,121.9034401,123.9052746],[0.0097,0.0066,0.0034,0.1454,0.0768,0.2422,0.0859,0.3258,0.0463,0.0579]], #iupac '97
                        'Sb':[51,3,[120.9038180,122.9042157],[0.5721,0.4279]], #iupac '97
                        'Te':[52,4,[119.904020,121.9030471,122.9042730,123.9028195,124.9044247,125.9033055,127.9044614,129.9062228],[0.0009,0.0255,0.0089,0.0474,0.0707,0.1884,0.3174,0.3408]],#iupac '97
                        'I':[53,-1,[126.904468],[1.0]], #iupac '97
                        'Xe':[54,0,[123.9058958,125.904269,127.9035304,128.9047795,129.9035079,130.9050819,131.9041545,133.9053945,135.907220],[0.0009,0.0009,0.0192,0.2644,0.0408,0.2118,0.2689,0.1044,0.0887]], #iupac '97
                        'Cs':[55,1,[132.905447],[1.0]], #iupac '97
   'Ba':[56,2,[129.906310,131.905056,133.904503,134.905683,135.904570,136.905821,137.905241],[0.00106,0.00101,0.02417,0.06592,0.07854,0.11232,0.71698]], #iupac '97
   'La':[57,3,[137.907107,138.906348],[0.00090,0.99910]],#iupac '97
   'Ce':[58,3,[135.907140,137.905986,139.905434,141.909240],[0.00185,0.00251,0.88450,0.11114]],#iupac '97
                        'Pr':[59,3,[140.907648],[1.0]], #iupac '97
   'Nd':[60,3,[141.907719,142.909810,143.910083,144.912569,145.913112,147.916889,149.920887],[0.272,0.122,0.238,0.083,0.172,0.057,0.056]],#iupac '97
   'Pm':[61,3,[144.91270],[1.0]], #no natural occurence
   'Sm':[62,3,[143.911995,146.914893,147.914818,148.917180,149.917271,151.919728,153.922205],[0.0307,0.1499,0.1124,0.1382,0.0738,0.2675,0.2275]], #iupac '97
   'Eu':[63,3,[150.919846,152.921226],[0.4781,0.5219]], #iupac '97
   'Gd':[64,3,[151.919788,153.920862,154.922619,155.922120,156.923957,157.924101,159.927051],[0.0020,0.0218,0.1480,0.2047,0.1565,0.2484,0.2186]],#iupac '97
                        'Tb':[65,4,[158.925343],[1.0]], #iupac '97
   'Dy':[66,3,[155.924278,157.924405,159.925194,160.926930,161.926795,162.928728,163.929171],[0.0006,0.0010,0.0234,0.1891,0.2551,0.2490,0.2818]], #iupac '97
   'Ho':[67,3,[164.930319],[1.0]], #iupac '97
   'Er':[68,3,[161.928775,163.929197,165.930290,166.932045,167.932368,169.935460],[0.0014,0.0161,0.3361,0.2293,0.2678,0.1493]], #iupac '97
   'Tm':[69,3,[168.934211],[1.0]], #iupac '97
                        'Yb':[70,3,[167.933894,169.934759,170.936322,171.9363777,172.9382068,173.9388581,175.942568],[0.0013,0.0304,0.1428,0.2183,0.1613,0.3183,0.1276]], #iupac '97
   'Lu':[71,3,[174.9407679,175.9426824],[0.9741,0.0259]],#iupac '97
   'Hf':[72,4,[173.940040,175.9414018,176.9432200,177.9436977,178.9458151,179.9465488],[0.0016,0.0526,0.1860,0.2728,0.1362,0.3508]], #iupac '97
   'Ta':[73,5,[179.947466,180.947996],[0.00012,0.99988]], #iupac '97
   'W':[74,6,[179.946704,181.9482042,182.9502230,183.9509312,185.9543641],[0.0012,0.2650,0.1431,0.3064,0.2843]], #iupac  '97
                        'Re':[75,2,[184.9529557,186.9557508],[0.3740,0.6260]],#iupac '97
   'Os':[76,4,[183.952491,185.953838,186.9557479,187.9558360,188.9581449,189.958445,191.961479],[0.0002,0.0159,0.0196,0.1324,0.1615,0.2626,0.4078]],#iupac '97
   'Ir':[77,4,[190.960591,192.962924],[0.373,0.627]], #iupac '97
   'Pt':[78,4,[189.959930,191.961035,193.962664,194.964774,195.964935,197.967876],[0.00014,0.00782,0.32967,0.33832,0.25242,0.07163]],#iupac '97
   'Au':[79,3,[196.966552],[1.0]], #iupac '97
                        'Hg':[80,2,[195.965815,197.966752,198.968262,199.968309,200.970285,201.970626,203.973476],[0.0015,0.0997,0.1687,0.2310,0.1318,0.2986,0.0687]], #iupac '97
   'Tl':[81,1,[202.972329,204.974412],[0.29524,0.70476]], #iupac '97
   'Pb':[82,2,[203.973029,205.974449,206.975881,207.976636],[0.014,0.241,0.221,0.524]],#
   'Bi':[83,3,[208.980383],[1.0]], #iupac '97
   'Po':[84,4,[209.0],[1.0]],
   'At':[85,7,[210.0],[1.0]],
                        'Rn':[86,0,[220.0],[1.0]],
   'Fr':[87,1,[223.0],[1.0]],
   'Ra':[88,2,[226.0],[1.0]],
   'Ac':[89,3,[227.0],[1.0]],
   'Th':[90,4,[232.0380504],[1.0]], #iupac '97
   'Pa':[91,4,[231.03588],[1.0]],
                        'U':[92,6,[234.0409456,235.0439231,236.0455619,238.0507826],[0.000055,0.007200,0.0,0.992745]], #iupac '97
   'Np':[93,5,[237.0],[1.0]],
   'Pu':[94,3,[244.0],[1.0]],
   'Am':[95,2,[243.0],[1.0]],
   'Cm':[96,3,[247.0],[1.0]],
   'Bk':[97,3,[247.0],[1.0]],
   'Cf':[98,0,[251.0],[1.0]],
                        'Es':[99,0,[252,.0],[1.0]],
   'Fm':[100,0,[257.0],[1.0]],
   'Md':[101,0,[258.0],[1.0]],
   'No':[102,0,[259.0],[1.0]],
   'Lr':[103, 0,[262.0],[1.0]],
   'Rf':[104, 0,[261.0],[1.0]],
   'Db':[105, 0,[262.0],[1.0]],
   'Sg':[106, 0,[266.0],[1.0]]
}

#######################################
# Collect properties
#######################################
def getMass(x):
 atom=re.findall('[A-Z][a-z]*',x)
 number=re.findall('[0-9]+', x)
 if len(number) == 0:
  multiplier = 1
 else:
  multiplier = float(number[0])
 atomic_mass=float(matrix(PeriodicTable[atom[0]][2])*transpose(matrix(PeriodicTable[atom[0]][3])))
# That's right -- the molecular weight is based on the isotopes and ratios
 return (atomic_mass*multiplier)

def getCharge(x):
 atom=re.findall('[A-Z][a-z]*',x)
 number=re.findall('[0-9]+', x)
 if len(number) == 0:
  multiplier = 1
 else:
  multiplier = float(number[0])
 atomic_charge=float(PeriodicTable[atom[0]][1])
 return (atomic_charge*multiplier)


#####################################################
# Iterate over expanded formula to collect property
#####################################################
def molmass(formula):
 mass=0
 while (len(formula)>0):
  segments = re.findall('[A-Z][a-z]*[0-9]*',formula)
  for i in range(0, len(segments)):
   mass+=getMass(segments[i])
  formula=re.sub(formula, '', formula)
 return mass

def molcharge(formula):
 charge=0
 while (len(formula)>0):
  segments = re.findall('[A-Z][a-z]*[0-9]*',formula)
  for i in range(0, len(segments)):
   charge+=getCharge(segments[i])  
  formula=re.sub(formula, '', formula)
 return charge


################################################################################
#expands ((((M)N)O)P)Q to M*N*O*P*Q
################################################################################

def formulaExpander(formula):
 while len(re.findall('\(\w*\)',formula))>0:
  parenthetical=re.findall('\(\w*\)[0-9]+',formula)
  for i in parenthetical:
   p=re.findall('[0-9]+',str(re.findall('\)[0-9]+',i)))
   j=re.findall('[A-Z][a-z]*[0-9]*',i)
   oldj=j
   for n in range(0,len(j)):
    numero=re.findall('[0-9]+',j[n])
    if len(numero)!=0:
     for k in numero:
      nu=re.sub(k,str(int(int(k)*int(p[0]))),j[n])
    else:
     nu=re.sub(j[n],j[n]+p[0],j[n])
    j[n]=nu
   newphrase=""
   for m in j:
    newphrase+=str(m)
   formula=formula.replace(i,newphrase)
  if (len((re.findall('\(\w*\)[0-9]+',formula)))==0) and (len(re.findall('\(\w*\)',formula))!=0):
   formula=formula.replace('(','')
   formula=formula.replace(')','')
 return formula


#######
# main #
########
if __name__ == '__main__':
 molecules=molecules.split(',')
 for element in molecules:
  element=formulaExpander(element)
  print ('The mass of %(substance)s is %(Mass)f and the calculated charge is %(Charge)i.' % {'substance': \
   element, 'Mass': molmass(element), 'Charge': molcharge(element)})