User Tools

Site Tools


code:shell:fetchpm25

This is an old revision of the document!


Fetch the PM2.5 readings for all cities in China

#!/bin/sh
#
# fetchpm25 - download the PM2.5 reading of every city linked from the
# front page of www.soupm25.com.
#
# Files written, all relative to the directory the script is started in:
#   index_html                        raw front page
#   all_city.txt                      one "<url> <trailing text>" line per city link
#   all_city/<name>/<name>.html       downloaded page of each city
#   all_city/pm25_all_latest.txt      one "<name>_<trailing text> <value>" line per city
#   all_city/pm25_all.txt             copy of the result of this run
#   history/<output of date>.txt      timestamped copy of the result
#
# Requires: wget, dos2unix, grep, sed, awk, cut, paste.

# Abort on use of an unset variable. "set -e" is deliberately not used:
# a single city failing must not stop the whole run.
set -u

RAW_HTML="index_html"
CITY="all_city.txt"
TMP="all_city"
HISTORY="history"
OUTPUT="${TMP}/pm25_all.txt"
OUTPUT_TMP="${TMP}/pm25_all_latest.txt"

# Print a message on stderr and terminate with a failure status.
die() {
    printf 'fetchpm25: %s\n' "$*" >&2
    exit 1
}

# Without the index page there is nothing to process, so stop right away.
wget http://www.soupm25.com/ -O "${RAW_HTML}" \
    || die "cannot download the index page"

# Strip the anchor markup from every line mentioning ".html" and rebuild it
# as "<absolute url> <text that followed the url>".
ALL_URLS=$(grep ".html" "${RAW_HTML}" \
    | sed -e 's/<a href=//g' -e 's/"//g' -e 's/\/\///g' -e 's/>//g' -e 's/<\/a//g' \
    | cut -d "=" -f4 \
    | sed 's/<\/li//g' \
    | grep -v -E '^$|</html|DOCTYPE' \
    | awk -F "www" '{print "http://www"$2}' \
    | awk -F "html" '{print $1"html "$2}' \
    | sed 's/  / /g' \
    | dos2unix)

# Only the links containing "city" are per-city pages.
ALL_CITY=$(printf '%s\n' "${ALL_URLS}" | grep "city")
[ -n "${ALL_CITY}" ] || die "no city links found in ${RAW_HTML}"
printf '%s\n' "${ALL_CITY}" > "${CITY}"

# -p keeps re-runs from failing on the already existing directories.
mkdir -p "${HISTORY}" "${TMP}" || die "cannot create the output directories"

# Start this run with an empty result file.
: > "${OUTPUT_TMP}" || die "cannot write ${OUTPUT_TMP}"

# All paths below are spelled out from the start directory instead of
# cd-ing in and out of each city directory: a failed or empty "cd" can then
# no longer redirect the rest of the run to an unrelated directory.
while read -r line
do
    url=${line%% *}
    city=$(printf '%s\n' "${line}" | awk -F "city/" '{print $2}' | sed 's/.html /_/g')
    pingyin=${city%%_*}

    # The name comes from remote HTML and is used as a directory name:
    # refuse empty names and anything that could leave ${TMP}.
    case "${pingyin}" in
        '' | */* | .*)
            printf 'fetchpm25: skipping unusable entry: %s\n' "${line}" >&2
            continue
            ;;
    esac

    page="${TMP}/${pingyin}/${pingyin}.html"
    if ! mkdir -p "${TMP}/${pingyin}"; then
        printf 'fetchpm25: cannot create directory for %s\n' "${pingyin}" >&2
        continue
    fi

    echo wget "${url}" -O "${page}"
    # A failed download is reported but the city is still recorded (with an
    # empty value) so the result keeps one line per city.
    wget "${url}" -O "${page}" \
        || printf 'fetchpm25: download failed: %s\n' "${url}" >&2

    # Take the text of the element(s) carrying "cityid" (ignoring "config"
    # lines); several matches are joined with single spaces on one line.
    PM25=$(grep cityid "${page}" \
        | grep -v "config" \
        | awk -F "cityid" '{print $2}' \
        | cut -d "<" -f1 \
        | cut -d ">" -f2 \
        | sed 's/ //g' \
        | grep -v '^$' \
        | paste -s -d ' ' -)

    printf '%s %s\n' "${city}" "${PM25}" >> "${OUTPUT_TMP}"
done < "${CITY}"

cp "${OUTPUT_TMP}" "${OUTPUT}" || die "cannot update ${OUTPUT}"
# The history file keeps the plain output of date(1) as its name.
cp "${OUTPUT}" "${HISTORY}/$(date).txt" || die "cannot archive ${OUTPUT}"

/var/www/dokuwiki/wiki/data/attic/code/shell/fetchpm25.1421790007.txt.gz · Last modified: 2016/05/05 13:06 (external edit)