# Parse the USGBC LEED building list
# 2/27/2014 by Michael Minn

import os
import csv
import string
import urllib2

def find_span(span_tag, html):
	"""Return the stripped text between span_tag and the next "</span>".

	span_tag is the literal opening tag to look for (for example
	'<span itemprop="postalCode">') and html is the text to search.
	Returns "" when the opening tag, or a "</span>" after it, is missing.
	"""
	start = html.find(span_tag)
	if start < 0:
		return ""

	# Look for the closing tag only after the opening tag has ended.
	content_start = start + len(span_tag)
	end = html.find("</span>", content_start)
	if end < 0:
		return ""

	return html[content_start:end].strip()

def find_project_page(url):
	"""Return the HTML of a project page, preferring a local copy.

	A file name is derived from url and looked up in the local
	"project-pages/leed-locations-N" directories. If no local copy
	exists the page is downloaded from url instead.

	Returns the page text, or "" when the download raises
	urllib2.URLError (the error is printed).
	"""
	page_directories = ["project-pages/leed-locations-1", "project-pages/leed-locations-2",
		"project-pages/leed-locations-3", "project-pages/leed-locations-4"]

	# Drop the first 11 characters of the URL (presumably the
	# "http://www." prefix - confirm) and flatten the remaining path
	# into a single file name.
	filename = url[11:].replace("/", "-") + ".html"

	# When several directories hold the file, the last one listed wins.
	path = None
	for directory in page_directories:
		candidate = directory + "/" + filename
		if os.path.isfile(candidate):
			path = candidate

	if path:
		# "with" guarantees the handle is released even if read() fails.
		with open(path, "r") as infile:
			return infile.read()

	try:
		request = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
		response = urllib2.urlopen(request)
		try:
			html = response.read()
		finally:
			# urllib2 responses are not context managers; close explicitly.
			response.close()

	except urllib2.URLError as error:
		print(url + ": " + unicode(error))
		return ""

	print("Downloaded: " + url)
	return html

def find_xml_text(xml, tag):
	"""Return the text inside the first <tag>...</tag> pair found in xml.

	Only the exact opening form "<tag>" (no attributes) is recognised.
	Returns "" when either the opening or the closing tag is missing.
	"""
	# Split once on the opening tag, then once more on the closing tag.
	_, found_open, remainder = xml.partition("<" + tag + ">")
	if not found_open:
		return ""

	text, found_close, _ = remainder.partition("</" + tag + ">")
	if not found_close:
		return ""

	return text

def find_xml_attribute(xml, name):
	"""Return the value of the first name='...' attribute found in xml.

	Only single-quoted values are recognised. Returns "" when the
	attribute or its closing quote cannot be found.
	"""
	marker = name + "='"
	try:
		value_start = xml.index(marker) + len(marker)
		value_end = xml.index("'", value_start)
	except ValueError:
		# index() raises where find() would have returned -1.
		return ""
	return xml[value_start:value_end]
		

def find_brownfield(html):
	"""Classify the "Brownfield redevelopment" entry of a project page.

	Returns one of:
		"NF"  - html is empty or None
		"NR"  - the text "Brownfield redevelopment" does not occur
		"NA"  - no "<span" follows that text, the page ends before the
		        value, or the value starts with neither '0' nor '1'
		"no"  - the value starts with '0'
		"yes" - the value starts with '1'
	"""
	if not html:
		return "NF"

	label = "Brownfield redevelopment"
	start = html.find(label)
	if start < 0:
		return "NR"

	start = html.find("<span", start + len(label))
	if start < 0:
		return "NA"

	# Skip over the opening tag. NOTE(review): assumes the tag is exactly
	# 18 characters long (e.g. '<span class="num">') - confirm.
	start = start + 18
	end = html.find("</span>", start)

	# Skip whitespace and control characters before the value.
	while (start < end) and (html[start] <= ' '):
		start = start + 1

	# Slice instead of index: a truncated page yields "" here rather
	# than raising IndexError.
	first = html[start:start + 1]
	if first == '0':
		return "no"
	elif first == '1':
		return "yes"
	else:
		return "NA"

def find_google_lat_long(query):
	"""Geocode query through the Google Maps geocoding XML endpoint.

	query is appended to the request URL as-is. Returns a
	(latitude, longitude, url) tuple, where url is the request URL.
	Latitude and longitude are the text of the first <lat> and <lng>
	elements of the response, or "" when an element is missing. If the
	request raises urllib2.URLError the error is printed and both
	coordinates are "".
	"""
	url = "http://maps.googleapis.com/maps/api/geocode/xml?sensor=false&address=" + query
	browser_headers = {'User-Agent' : "Magic Browser"}

	try:
		xml = urllib2.urlopen(urllib2.Request(url, headers=browser_headers)).read()
	except urllib2.URLError as error:
		print(url + ": " + unicode(error))
		return "", "", url

	latitude = find_xml_text(xml, "lat")
	longitude = find_xml_text(xml, "lng")
	return latitude, longitude, url

def find_osm_lat_long(query):
	"""Geocode query through the OpenStreetMap Nominatim XML search.

	query is appended to the request URL as-is. Returns a
	(latitude, longitude, url) tuple, where url is the request URL.
	Latitude and longitude are the first lat='...' and lon='...'
	attribute values of the response, or "" when an attribute is
	missing. If the request raises urllib2.URLError the error is printed
	and both coordinates are "".
	"""
	url = "http://nominatim.openstreetmap.org/search?format=xml&q=" + query

	try:
		request = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
		xml = urllib2.urlopen(request).read()
	except urllib2.URLError as error:
		print(url + ": " + unicode(error))
		coordinates = ("", "")
	else:
		coordinates = (find_xml_attribute(xml, "lat"), find_xml_attribute(xml, "lon"))

	return coordinates[0], coordinates[1], url

def find_web_lat_long(html):
	"""Geocode the address in the page's second Google Maps link.

	Returns a (latitude, longitude, url) tuple, where url is the
	geocoder request that produced the result. Returns ("", "", "")
	when the page has no second "http://maps.google.com" link followed
	by the itemprop="maps" marker.
	"""
	link_prefix = "http://maps.google.com"

	# The first link on the page is a commented-out lat/long link whose
	# coordinates are usually off, so start from the second one.
	first_link = html.find(link_prefix)
	link_start = html.find(link_prefix, first_link + 1)
	link_end = html.find("\" itemprop=\"maps\"", link_start)

	if min(link_start, link_end) < 0:
		return "", "", ""

	query = html[link_start:link_end].replace("http://maps.google.com/maps?q=", "")

	# Ask OpenStreetMap first to stay under Google's usage restrictions;
	# fall back to Google only when OSM returns no latitude.
	latitude, longitude, url = find_osm_lat_long(query)
	if latitude:
		return latitude, longitude, url
	return find_google_lat_long(query)

def find_gmap_link_lat_long(html):
	"""Extract latitude and longitude from the page's Google Maps link.

	Looks for the first href="http://maps.google.com..." attribute and
	parses a query of the form "q=<lat>,+<lon>+..." out of it.

	Returns a (latitude, longitude, map_url) tuple of strings. map_url
	is "" when no link is found. Latitude is "" when the link has no
	"q=" followed by a comma. Longitude is "" whenever latitude could
	not be located, or when no "+" ends the longitude.
	"""
	# Find hidden google maps lat/long link
	map_url = ""
	start = html.find("href=\"http://maps.google.com")
	if start >= 0:
		url_start = start + 6  # skip the 6 characters of 'href="'
		end = html.find('"', url_start)
		if end >= 0:
			map_url = html[url_start:end]

	latitude = ""
	longitude = ""

	query_start = map_url.find("q=")
	comma = -1
	if query_start >= 0:
		comma = map_url.find(",", query_start)

	# Only look for a longitude once the comma ending the latitude is
	# known; otherwise the slice would start at an arbitrary offset and
	# return a piece of the URL instead of a coordinate.
	if comma >= 0:
		latitude = map_url[(query_start + 2):comma]

		# Skip the two characters separating the values (",+" in the
		# expected format - confirm against real pages).
		lon_start = comma + 2
		lon_end = map_url.find("+", lon_start)
		if lon_end >= 0:
			longitude = map_url[lon_start:lon_end]

	return latitude, longitude, map_url

def extract_project_page_info():
	"""Add address, brownfield and coordinate columns to the project list.

	Reads "leed_projects-2014-03-28.csv", fetches the project page whose
	URL is in column 1 of each row, and writes every processed row to
	"temp.csv" with nine extra columns: address, locality, region,
	postal_code, country, brownfield, latitude, longitude, map_url.
	"""
	# Rows numbered below skip are ignored; processing stops after row
	# number maximum (rows are counted from 1, header excluded).
	skip = 0
	maximum = 100000

	with open("leed_projects-2014-03-28.csv", 'r') as infile:
		dialect = csv.Sniffer().sniff(infile.read(2048))
		infile.seek(0)
		reader = csv.reader(infile, dialect)

		header = next(reader)
		header.extend(["address", "locality", "region", "postal_code",
			"country", "brownfield", "latitude", "longitude", "map_url"])

		# Set after the reader was created; the writer below is the
		# only consumer constructed with these settings.
		dialect.escapechar = '\\'
		dialect.quoting = csv.QUOTE_NONNUMERIC

		# "with" closes (and therefore flushes) the output file even
		# when a row fails part-way through.
		with open("temp.csv", 'w') as outfile:
			writer = csv.writer(outfile, dialect)
			writer.writerow(header)

			for count, row in enumerate(reader, 1):
				if count < skip:
					continue
				if count > maximum:
					break

				html = find_project_page(row[1])

				# Find address information
				address = find_span("<span itemprop=\"streetAddress\">", html)
				address = address.replace("<br/>", "")
				locality = find_span("<span itemprop=\"addressLocality\">", html)
				region = find_span("<span itemprop=\"addressRegion\">", html)
				postal_code = find_span("<span itemprop=\"postalCode\">", html)
				country = find_span("<span itemprop=\"addressCountry\">", html)

				brownfield = find_brownfield(html)

				# Coordinates come from the page's own map link;
				# find_web_lat_long(html) is the geocoding alternative.
				latitude, longitude, map_url = find_gmap_link_lat_long(html)

				row.extend([address, locality, region, postal_code, country,
					brownfield, latitude, longitude, map_url])
				writer.writerow(row)

def find_brownfields():
	"""Recompute the brownfield column of the locations file.

	Reads "leed-locations.csv", fetches the project page whose URL is in
	column 1 of each row, stores the find_brownfield() result in column
	14, prints it, and writes every processed row to "temp.csv".
	"""
	# Rows numbered below skip are ignored; processing stops after row
	# number maximum (rows are counted from 1, header excluded).
	skip = 0
	maximum = 100000

	with open("leed-locations.csv", "r") as infile:
		dialect = csv.Sniffer().sniff(infile.read(2048))
		infile.seek(0)
		reader = csv.reader(infile, dialect)
		header = next(reader)

		# Set after the reader was created; the writer below is the
		# only consumer constructed with these settings.
		dialect.escapechar = '\\'
		dialect.quoting = csv.QUOTE_NONNUMERIC

		# "with" closes (and therefore flushes) the output file even
		# when a row fails part-way through.
		with open("temp.csv", 'w') as outfile:
			writer = csv.writer(outfile, dialect)
			writer.writerow(header)

			for count, row in enumerate(reader, 1):
				if count < skip:
					continue
				if count > maximum:
					break

				html = find_project_page(row[1])

				brownfield = find_brownfield(html)
				row[14] = brownfield

				print(row[1] + ": " + brownfield)

				writer.writerow(row)

def find_osm_where_available():
	"""Replace Google-link coordinates with OpenStreetMap ones where possible.

	Reads "leed-locations.csv" and writes every row to "temp.csv". For
	rows whose column 17 holds a maps.google.com URL with a non-empty,
	non-"0.0" query, columns 9-12 are joined into a search query for
	find_osm_lat_long(). When that returns a latitude, columns 15, 16
	and 17 are overwritten with the OSM latitude, longitude and request
	URL. Columns 15-17 of every row are printed as progress output.
	"""
	with open("leed-locations.csv", "r") as infile:
		dialect = csv.Sniffer().sniff(infile.read(2048))
		infile.seek(0)
		reader = csv.reader(infile, dialect)
		header = next(reader)

		# Set after the reader was created; the writer below is the
		# only consumer constructed with these settings.
		dialect.escapechar = '\\'
		dialect.quoting = csv.QUOTE_NONNUMERIC

		# "with" closes (and therefore flushes) the output file even
		# when a row fails part-way through.
		with open("temp.csv", 'w') as outfile:
			writer = csv.writer(outfile, dialect)
			writer.writerow(header)

			for count, row in enumerate(reader, 1):
				url = row[17]

				# Skip links with an empty ("q=,") or zero ("q=0.0") query.
				if (url.find("maps.google.com") > 0) and (url.find("maps?q=,") < 0) and (url.find("maps?q=0.0") < 0):
					query = "+".join(row[9:13])
					query = query.replace(" ", "+")
					lat, lon, url = find_osm_lat_long(query)
					if lat:
						row[15] = lat
						row[16] = lon
						row[17] = url

				print(unicode(count) + ")" + " ".join(row[15:18]))

				writer.writerow(row)

# Script entry point. This call sits at module level, so it runs whenever the
# file is executed or imported.
find_osm_where_available()

# Alternative pass, currently disabled: extract_project_page_info()

