Skip to content

Commit

Permalink
New scripts to download metadata for the entire DDB set
Browse files Browse the repository at this point in the history
  • Loading branch information
JarlPed committed Apr 1, 2020
1 parent 2be11b5 commit f6bade1
Show file tree
Hide file tree
Showing 4 changed files with 510 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,4 @@ ENV/

# Database files (generated by code)
DB/
DDBST_Meta.db
225 changes: 225 additions & 0 deletions Download_metaThermoData.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 08:37:26 2020
@author: jarl.robert.pedersen
"""

import requests
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import sqlite3
import os
import time
import warnings

from fake_useragent import UserAgent



def responce_manager(url, headers, timeout=30, retry_delay=10, max_retries=None):
    """Fetch *url* with an HTTP GET, retrying when the request fails.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before an attempt counts as
            failed. Without it a stalled connection would block forever and
            the retry logic would never run.
        retry_delay: Seconds to sleep between attempts.
        max_retries: Number of retries allowed after the first attempt;
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        The ``requests.Response`` of the first attempt that completes. The
        HTTP status code is not inspected.

    Raises:
        requests.exceptions.RequestException: If ``max_retries`` is set and
            every attempt failed.
    """
    retries = 0
    # Loop instead of recursing so that a long outage cannot exhaust the
    # interpreter's recursion limit.
    while True:
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors propagate to the caller.
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            warnings.warn(
                "Connection problem, waiting %s seconds to retry.." % retry_delay
            )
            time.sleep(retry_delay)



# Building blocks of the DDBST online-search URL. A request address is
# assembled as BASE + <submit type> + COMPLIST + <component number>.
BASE = "http://ddbonline.ddbst.com/DDBSearch/onlineddboverview.exe?submit="
TYPE_OVERVIEW = "Overview&"
TYPE_DETAILS = "Details&"
COMPLIST = "systemcomplist="

Name = ""

# Send a Chrome User-Agent string (taken from fake_useragent) with each request.
ua = UserAgent()
header = {"User-Agent": str(ua.chrome)}

# Component number the scan starts from.
# NOTE(review): the original carried a trailing "#1" here, which suggests the
# full scan starts at 1 and 32 is a resume/debug value -- confirm.
DDBSTIndexNum = 32

# Scratch containers: tempDict holds the entry currently being processed;
# dataDict and mixData are only referenced by the disabled JSON export.
dataDict, mixData, tempDict = {}, {}, {}

# Progress bar; the total is a hard-coded value (presumably an estimate of the
# number of entries -- verify).
pbar = tqdm(total=75000)



# Record whether the database file was already present in the working
# directory. This has to be evaluated before connecting, because the file may
# be created as a consequence of opening the connection.
isDBHere = os.path.exists('./DDBST_Meta.db')

SQLcon = sqlite3.connect('./DDBST_Meta.db')
SQLCursor = SQLcon.cursor()

# Create the schema idempotently instead of relying on the file check above:
# an existing but incomplete database (e.g. an empty file left behind by an
# aborted run) still gets its tables, and an initialised one is left untouched.
# The table name "Mixure_Property_Metadata" (sic) is kept as-is so that
# databases written by earlier runs stay compatible.
SQLCursor.execute('CREATE TABLE IF NOT EXISTS Component_Registry (DDB_RN int, Name text, CAS_RN text, Formula text)')
SQLCursor.execute('CREATE TABLE IF NOT EXISTS Pure_Property_Metadata (DDB_RN int, Property text, Points text, Sets text, Temperature_Range text, States_Set text)')
SQLCursor.execute('CREATE TABLE IF NOT EXISTS Mixure_Property_Metadata (DDB_RN_Sequence text, Databank text, Sets text, Points text, Temperature_Range text, Pressure_Range text)')
SQLcon.commit()



# Walk the component numbers upward from DDBSTIndexNum. For each component the
# overview page, the pure-component detail page and the detail pages of its
# mixtures are scraped and written to the SQLite database. The scan stops at
# the first entry whose name is empty or "Reserved Entry".
#
# All INSERTs use "?" placeholders: the values are scraped text (chemical
# names frequently contain apostrophes), which must never be spliced into the
# SQL statement itself.
try:
    while True:
        responce = responce_manager(BASE + TYPE_OVERVIEW + COMPLIST + str(DDBSTIndexNum), headers=header)
        soup = BeautifulSoup(responce.text, 'html.parser')
        overviewRows = soup.findAll('tr')
        valueCells = overviewRows[4].findAll('td')
        chemEntryName = valueCells[1].text
        if chemEntryName == "" or chemEntryName == "Reserved Entry":
            break  # end of the scan

        # The entry is taken to be valid: store name, CAS number and formula
        # under the column titles found in the page's header row. The keys
        # 'CAS-RN' and 'Formula' read further down rely on those titles.
        headerCells = overviewRows[3].findAll('th')
        tempDict.update({headerCells[1].text: chemEntryName})
        tempDict.update({headerCells[2].text: valueCells[2].text})
        tempDict.update({headerCells[3].text: valueCells[3].text})

        # Detail page: unless it reports that no data is available, collect the
        # pure-component property rows.
        responce = responce_manager(BASE + TYPE_DETAILS + COMPLIST + str(DDBSTIndexNum), headers=header)
        soupPurePropDetails = BeautifulSoup(responce.text, 'html.parser')

        tempPPdict = {}
        if re.search('No data available for this component.', soupPurePropDetails.text) is None:
            PropertyType = ''
            propDict = {}
            statesetDict = {}
            for entry in soupPurePropDetails.find_all('tr')[6:-3]:
                entryItems = entry.findAll('td')

                # A non-empty first cell starts a new property; rows with an
                # empty first cell add further state sets to the current one.
                if entryItems[0].text != '':
                    if PropertyType != '':
                        propDict.update({'State Sets': statesetDict})
                        tempPPdict.update({PropertyType: propDict})
                        propDict = {}
                        statesetDict = {}
                    PropertyType = entryItems[0].text

                if entryItems[1].text != '':
                    propDict.update({'Points': entryItems[1].text})
                if entryItems[2].text != '':
                    propDict.update({'Sets': entryItems[2].text})
                if entryItems[3].text != '':
                    propDict.update({'Temperature Range': entryItems[3].text})
                if entryItems[4].text != '':
                    statesetDict.update({entryItems[4].text: entryItems[5].text})

            # Flush the property that was still being assembled.
            propDict.update({'State Sets': statesetDict})
            tempPPdict.update({PropertyType: propDict})

            tempDict.update({'Pure Component Data': tempPPdict})

            # Push the pure-component data to the SQLite database.
            for key, props in tempPPdict.items():
                # State sets are flattened to "state_sets;state_sets;...".
                StatesOfPurePropString = ';'.join(
                    state + '_' + sets for state, sets in props['State Sets'].items()
                )
                SQLCursor.execute(
                    'INSERT INTO Pure_Property_Metadata VALUES (?, ?, ?, ?, ?, ?)',
                    (
                        DDBSTIndexNum,
                        key,
                        props.get('Points', ''),
                        props.get('Sets', ''),
                        props.get('Temperature Range', '').strip('(').strip(')'),
                        StatesOfPurePropString,
                    ),
                )

        # Register the component itself: number, name, CAS number and formula.
        SQLCursor.execute(
            'INSERT INTO Component_Registry VALUES (?, ?, ?, ?)',
            (DDBSTIndexNum, chemEntryName, tempDict['CAS-RN'], tempDict['Formula']),
        )
        SQLcon.commit()

        ### Section for mixture data ###
        # A mixture row is processed only when its first listed component is
        # the current one (presumably so each mixture is stored once -- verify).
        for TableEntries in overviewRows[7:-4]:
            LineInformation = TableEntries.findAll('td')
            if not LineInformation:
                continue
            LineSplitted = str(LineInformation[1]).split('<br/>')
            FirstComp = BeautifulSoup(LineSplitted[0], 'html.parser').text
            if FirstComp != str(DDBSTIndexNum):
                continue

            # The last cell of the row links to the mixture's detail page.
            detailUrl = LineInformation[-1].find('a').get_attribute_list('href')[0]
            responceDetailPage = responce_manager(detailUrl, headers=header)
            DetailSoup = BeautifulSoup(responceDetailPage.text, 'html.parser')
            Detail_tables = DetailSoup.findAll('table')[3:5]

            # Comma-separated first cells of the first table; stored in the
            # DDB_RN_Sequence column.
            dataSeriesName = ','.join(
                line.find('td').text for line in Detail_tables[0].findAll('tr')[1:]
            )

            # Second table: one row per data bank. Keep only six-cell rows
            # whose label contains at least two consecutive letters and is not
            # the "Total" row.
            tempDetailDict = {}
            for line in Detail_tables[1].findAll('tr')[1:-1]:
                lineEntries = line.findAll('td')
                if len(lineEntries) != 6:
                    continue
                databank = lineEntries[0].text.strip('\r').strip('\n')
                if databank == "Total" or re.search('[a-zA-Z]{2}', databank) is None:
                    continue
                tempDetailDict[databank] = {
                    "Sets": lineEntries[2].text,
                    "Points": lineEntries[3].text,
                    "Temperature Range": lineEntries[4].text,
                    "Pressure Range": lineEntries[5].text,
                }

            # Push the mixture data to the SQLite database.
            for key, detail in tempDetailDict.items():
                SQLCursor.execute(
                    'INSERT INTO Mixure_Property_Metadata VALUES (?, ?, ?, ?, ?, ?)',
                    (
                        dataSeriesName,
                        key,
                        detail['Sets'],
                        detail['Points'],
                        detail['Temperature Range'],
                        detail['Pressure Range'],
                    ),
                )
            SQLcon.commit()

        # Prepare for the next entry.
        tempDict = {}
        DDBSTIndexNum += 1
        pbar.update(1)
finally:
    # Release the database connection and the progress bar even when the scan
    # aborts with an exception.
    SQLcon.close()
    pbar.close()

# NOTE: a JSON export of the collected metadata (dataDict / mixData written to
# ./MetaData.json) was sketched here but is disabled; the SQLite database is
# the only output of this script.
Loading

0 comments on commit f6bade1

Please sign in to comment.