Saturday, March 26, 2016

Correlating my Foobot measurements and official PM 2.5 measurements

Since I recently bought a Foobot device, I was curious how the values measured in my apartment correlate with the official values from the www.wien.at website.
The first figure shows the daily maximum and average values from www.wien.at; there is only one value per day for each. The red line shows my actual measurements from the Foobot device, one measurement per hour. The green shape shows the difference between the wien.at averages and my measurements. We can see that the measurements in the apartment are lower than the www.wien.at measurements.

However, the two series are correlated, as shown in the second figure.

Foobot measurements (Python code)

First I downloaded the measurements from the Foobot server, which is possible since they provide an API for this.
import requests
import json
import sys
import datetime

#startdate="2016-02-07"
#enddate="2016-02-15"

#get foobot measurements between startdate and enddate
startdate=sys.argv[1]
enddate=sys.argv[2]
   
#open output file   
myfile = open(startdate+'_'+enddate+'.txt', "w")


#login
myurl ='https://api.foobot.io/v2/user/YOURUSERNAME/login/'
mypasswd='YOURPASSWORD'
myparams = {'password': mypasswd}
r = requests.post(myurl, json=myparams)
output=r.json()
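#the auth token comes back in the x-auth-token response header and is sent with every following request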
mytoken=r.headers['x-auth-token']

#get devices
mykey = 'YOURKEY'
myparams = {'x-auth-token': mytoken}
r = requests.get(myurl, headers=myparams)
output=r.json()
print output

#get data interval
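#sampling interval in seconds: 3600 = one datapoint per hour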
sampling=3600
myurl ='https://api.foobot.io/v2/device/'+output[0]['uuid']+'/datapoint/'+startdate+'T00:00:00/'+enddate+'T24:00:00/'+str(sampling)+'/'
print myurl
myparams = {'x-auth-token': mytoken}
r = requests.get(myurl, headers=myparams)
output=r.json()
print output
#print output['sensors']
#print output['thresholds']

#save datapoints
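#each datapoint is a list of values (one per entry in output['sensors']); write them space-separated, one line per datapoint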
for x in output['datapoints']:
 for y in x:
  myfile.write(str(y)+" ")
 myfile.write('\n')

myfile.close()
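
The script expects the start and end date as command line arguments (see the commented example dates at the top). Assuming it is saved as foobot_download.py (the file name is just an example), a call looks like this:
python foobot_download.py 2016-02-15 2016-03-23
It writes the hourly datapoints to 2016-02-15_2016-03-23.txt, which is later used as input for the plotting script.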

Wien.at measurements (Python code)

Then I downloaded the web pages with the measurements from the www.wien.at website.
wget --recursive --level=1 https://www.wien.gv.at/ma22-lgb/lufttb.htm
Then I grepped the relevant PM 2.5 values from the downloaded pages.
import sys
import datetime
import os

#startdate="2016-02-07"
#enddate="2016-02-15"

#parse PM 2.5 values from downloaded wien.at sites
startdate=sys.argv[1]
enddate=sys.argv[2]
dirname="www.wien.gv.at/ma22-lgb/tb/"

d1l=startdate.split("-")
d2l=enddate.split("-")
d1 = datetime.date(int(d1l[0]),int(d1l[1]),int(d1l[2]))
print d1
d2 = datetime.date(int(d2l[0]),int(d2l[1]),int(d2l[2]))
print d2
#get difference between dates
delta=d2-d1

#open output file
myfile = open(startdate+'_'+enddate+'_wienat.txt', "w")

#find file for each day and grep PM 2.5 value
for j in range(0,delta.days+1):
 print d1
 fn=dirname+"tb"+str(d1).replace("-","")+".htm"

 grepcmdmax='grep "WIEN - MAXIMUM" '+fn+' | cut -f6 -d"|" > max.txt'
 grepcmdmean='grep "WIEN - MITTEL" '+fn+' | cut -f6 -d"|" > mean.txt'

 os.system(grepcmdmax)
 os.system(grepcmdmean)

 maxfile = open('max.txt', 'r')
 meanfile = open('mean.txt', 'r')
 maxval=maxfile.read().strip().replace(" ","")
 meanval=meanfile.read().strip().replace(" ","")
 print maxval
 print meanval

 if j==delta.days:
  for i in range(0,23):
   myfile.write(maxval+" "+meanval+"\n")
 else:
  for i in range(0,24):
   myfile.write(maxval+" "+meanval+"\n")

 #increase date
 d1 += datetime.timedelta(days=1)

myfile.close()
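
The script is called the same way, e.g. python parse_wienat.py 2016-02-15 2016-03-23 (parse_wienat.py is just an example name), and writes the daily maximum and mean, repeated for every hour of the day, to 2016-02-15_2016-03-23_wienat.txt. The extraction itself shells out to grep and cut, so it only runs where those tools are available. A rough pure-Python sketch of the same step, relying on the same assumptions as the grep/cut pipeline (the relevant rows contain "WIEN - MAXIMUM" or "WIEN - MITTEL" and the PM 2.5 value is the sixth "|"-separated field), could look like this:
#pure-Python replacement for the grep/cut calls above (sketch, same format assumptions)
def extract_value(filename, marker):
 #return the sixth "|"-separated field of the first line containing marker
 with open(filename) as f:
  for line in f:
   if marker in line:
    fields = line.split("|")
    if len(fields) > 5:
     return fields[5].strip().replace(" ", "")
 return ""

#maxval = extract_value(fn, "WIEN - MAXIMUM")
#meanval = extract_value(fn, "WIEN - MITTEL")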

Analysing the data and plotting (Python code)

For the analysis I computed the differences between the two data sets and the correlation between them.
import sys
import datetime
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy.stats import pearsonr

#startdate="2016-02-07"
#enddate="2016-02-15"
#python plot_results.py 2016-02-15_2016-03-23.txt 2016-02-15_2016-03-23_wienat.txt "1020 Vienna"
#generate two plots with measurements and wien.at values, and correlation and differences

foobotfile=sys.argv[1]
wienatfile=sys.argv[2]
#for filename and graphs
location=sys.argv[3]

#grep start and enddate from filename
startdate=foobotfile.replace(".txt","").split("_")[0]
enddate=foobotfile.replace(".txt","").split("_")[1]
d1l=startdate.split("-")
d2l=enddate.split("-")
d1 = datetime.date(int(d1l[0]),int(d1l[1]),int(d1l[2]))
print d1
d2 = datetime.date(int(d2l[0]),int(d2l[1]),int(d2l[2]))
print d2
#difference between date
delta=d2-d1

#generate label file
labels=[]
for j in range(0,delta.days+1):
 print d1

 if j==delta.days:
  for i in range(0,23):
   #add short date day.month at first hour of day in labels
   if i==0:
    newd=d1 + datetime.timedelta(days=j)
    labels.append(str(newd.day)+"."+str(newd.month))
   else:
    labels.append("")
 else:
  for i in range(0,24):
   #add short date day.month at first hour of day in labels
   if i==0:
    newd=d1 + datetime.timedelta(days=j)
    labels.append(str(newd.day)+"."+str(newd.month))
   else:
    labels.append("")  
print labels
print len(labels)

#read wien.at file into data string
with open(wienatfile) as f:
    data = f.read()
#split data string at newline
data = data.split('\n')
#get first row of each line split by empty string
wmax=[]
for row in data:
 #a valid line has both values; this also skips the empty trailing line
 if len(row.split(' '))>1:
  wmax.append(row.split(' ')[0])
#get second row of each line split by empty string
wmean=[]
for row in data:
 if len(row.split(' '))>1:
  wmean.append(row.split(' ')[1])
#store loaded date into numpy array as float
wmaxn=np.array(wmax).astype(np.float)
wmeann=np.array(wmean).astype(np.float)

#read foobot file into data string
with open(foobotfile) as f:
 data = f.read()
#split data string at newline
data = data.split('\n')
#get second row of each line split by empty string
foobot=[]
for row in data:
 if len(row.split(' '))>1:
  foobot.append(row.split(' ')[1])
#store loaded date into numpy array as float
foobotn=np.array(foobot).astype(np.float)

#create array for x axis
xax=[]
for i in range(0,len(wmaxn)):
 xax.append(i)
 
#create figure
fig = plt.figure()
fig.suptitle("Analysis for "+location, fontsize=14, fontweight='bold')
#plot wmaxn, wmeann, foobotn
plt.subplot(2,1,1)
plt.plot(xax,wmaxn,label="Max value from wien.at")
plt.plot(wmeann,label="Average from wien.at")
plt.plot(foobotn,label="Measurements from "+location)
#fill space between meann and foobotn
plt.fill_between(xax, wmeann, foobotn, where=wmaxn>=foobotn, interpolate=True, color='green')
plt.ylabel('PM 2.5')
plt.title('PM 2.5 Measurements')
plt.legend(loc=2,prop={'size':10})
plt.xticks(xax, labels, rotation='vertical')
plt.subplots_adjust(bottom=0.15)

#compute correlations
corr_maxmeasure=pearsonr(wmaxn, foobotn)
corr_meanmeasure=pearsonr(wmeann, foobotn)
print corr_maxmeasure
print corr_meanmeasure
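#pearsonr returns a (correlation coefficient, p-value) tuple; only the coefficient (index 0) is shown in the plot below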

#compute differences absolute and in percent
maxdiffhr=np.sum(wmaxn-foobotn)/len(wmaxn)
meandiffhr=np.sum(wmeann-foobotn)/len(wmaxn)
maxdiffhrpc=100*(np.sum(wmaxn-foobotn)/np.sum(wmaxn))
meandiffhrpc=100*(np.sum(wmeann-foobotn)/np.sum(wmeann))

#add subplot 2
ax = fig.add_subplot(212)
ax.set_title('Correlation')
ax.set_xlabel('Average from wien.at')
ax.set_ylabel("Measurements from "+location)
txtstr="Maxcorr:              "+str(round(corr_maxmeasure[0],3))+"\nMeancorr:            "+str(round(corr_meanmeasure[0],3))+"\nMaxdiff/hour:   "+str(round(maxdiffhr,3))+","+str(round(maxdiffhrpc,3))+"%\nMeandiff/hour: "+str(round(meandiffhr,3))+","+str(round(meandiffhrpc,3))+"%"
ax.text(3, 18, txtstr, style='italic',bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})
ax.plot(wmeann, foobotn)
plt.show()

#save figure
fig.set_size_inches(10,10)
fig.savefig(location.replace(" ","")+".pdf")
fig.savefig(location.replace(" ","")+".png")
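
Putting it all together, and assuming the scripts are saved as foobot_download.py, parse_wienat.py and plot_results.py (only plot_results.py appears in the comment above, the other two names are examples), the whole analysis for one location comes down to:
python foobot_download.py 2016-02-15 2016-03-23
wget --recursive --level=1 https://www.wien.gv.at/ma22-lgb/lufttb.htm
python parse_wienat.py 2016-02-15 2016-03-23
python plot_results.py 2016-02-15_2016-03-23.txt 2016-02-15_2016-03-23_wienat.txt "1020 Vienna"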