User:Statsrick/GQL code
Appearance
Creating and using a UDF (note: uses Standard SQL Dialect ... in "Show Options" in the console, uncheck the "Use Legacy SQL" box before running) |
---|
CREATE TEMPORARY FUNCTION classy(s STRING)
RETURNS STRING
LANGUAGE js AS """
if (s) {
try {
var myRegexp = /^(?:https?:\\/\\/)?(?:www\\.)?((?:(?!www\\.|\\.).)+\\.[a-zA-Z0-9.]+)/g;
var udom = myRegexp.exec(s)[1];
if (udom.indexOf('25ans.') > -1 && udom.indexOf('.jp') > -1) {
var d = 'JP~HMI~2024~25ans.jp';
} else if (udom.indexOf('4029tv.') > -1 && udom.indexOf('.com') > -1) {
var d = 'US~HTV~10041~4029tv.com';
} else if (udom.indexOf('allaboutsoap.') > -1 && udom.indexOf('.uk') > -1) {
var d = 'UK~HMI~1915~allaboutsoap.co.uk';
} else {
var d = 'XX~XXX~unknown~XXXXX~unknown';
};
} catch (err) {
var d = 'XX~XXX~unknown~XXXXX~unknown';
}
var myArr = d.split('~');
var output = {
country:myArr[0],
business_unit:myArr[1],
ic_site_id:myArr[2],
domain:myArr[3]
};
return d;
}
""";
SELECT split(classy(page_url),'~')[offset(3)] as domain,page_url
FROM omniture.hmg_web_20160723
limit 100;
|
Fun with REGEX examples |
---|
SELECT
REPLACE(REGEXP_REPLACE(ct_pageurl,r'(/a\d+\.\w+|/g\d+\.\w+)', '/test'),'/test',SUBSTR(regexp_extract(ct_pageurl,r'(/a\d+\.\w+|/g\d+\.\w+)'),1,instr(regexp_extract(ct_pageurl,r'(/a\d+\.\w+|/g\d+\.\w+)'),'.')-1)) as url
,regexp_extract(CustomTargeting,r'buzzing=([^;]*)') as buzzing
,regexp_extract(CustomTargeting,r'page_type2=([^;]*)') as page_type2
,regexp_extract(CustomTargeting,r'oxb=([^;]*)') as oxb
,regexp_extract(CustomTargeting,r'position=([^;]*)') as pos
,ct_pageurl
,nth(2,split(ct_pageurl,'/')) as dom
,ct_div_id
,ct_urlhash
,ct_src
,CustomTargeting
,Lower(nvl(regexp_extract(parentpath,r'(?:.*?\'adUnitCode\':\s){3}([^\,]+)') ,'novalue')) as au2
,Lower(nvl(regexp_extract(parentpath,r'(?:.*?\'adUnitCode\':\s){4}([^\,]+)') ,'novalue')) as au3
,Lower(nvl(regexp_extract(parentpath,r'(?:.*?\'adUnitCode\':\s){5}([^\,]+)') ,'novalue')) as au4
,Lower(replace(adunitcode,'/novalue','')) as au5
,case when substr(adunitcode,1,1)!='-' then substr(regexp_extract(adunitcode,r'(?:.*?-){0}([^-]+)'),1,1) end as au51
,regexp_extract(adunitcode,r'(?:.*?-){1}([^-]+)') as au52
,regexp_extract(adunitcode,r'(?:.*?-){2}([^-]+)') as au53
FROM [hds_operational_tables.dfp_impressions_today]
WHERE INSTR(CustomTargeting,'sfgate')>0 and length(regexp_extract(url, r'(/a\d+|/g\d+)'))>2
LIMIT 100
|
Use TABLE_QUERY to union and query a bunch of tables that end in dates (e.g. table_20160401, table_20160402, etc.) |
---|
SELECT * FROM (TABLE_QUERY(dataset_name, 'REGEXP_MATCH(table_id, r"^table_[\d]{8}")'))
|
Use SPLIT to pivot a table with a delimited variable and make it tall and skinny |
---|
SELECT dt,lotame_id,split(behaviors,',') as behavior
FROM [groovy-age-90013:hearstbehaviors.behavior]
|
Core Python Script |
---|
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from bigquery import get_client
from subprocess import Popen, PIPE
import argparse
import json
import os
#https://github.com/tylertreat/BigQuery-Python
from bigquery.errors import (
JobInsertException, JobExecutingException,
BigQueryTimeoutException
)
def execjob(job):
try:
job_resource = client.wait_for_job(job, timeout=6000)
#return job_resource
return str(job_resource.get('status').get('state')) + str(job_resource.get('configuration').get('load').get('destinationTable').get('tableId') if job_resource.get('configuration').get('load') is not None else job_resource.get('configuration').get('query').get('destinationTable').get('tableId'))
except BigQueryTimeoutException:
return "Timeout"
def dele(database,table_name):
table_exists = client.check_table(database, table_name)
if table_exists:
# Delete an existing table.
deleted = client.delete_table(database, table_name)
def runquery(database,table_name,sqlfile):
dele(database,table_name)
with open(sqlfile, 'r') as qfile:
query=qfile.read()
job = client.write_to_table(query,database,table_name,allow_large_results=True)
print execjob(job)
def system(cmd):
p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
out, err = p.communicate()
return "Return code: ", p.returncode, out.rstrip(), err.rstrip()
def xport(gcdir,database,table_name):
job = client.export_data_to_uris([gcdir+table_name+'.tsv'],
database,
table_name,
field_delimiter='\t')
try:
job_resource = client.wait_for_job(job, timeout=6000)
print job_resource
except BigQueryTimeoutException:
print "Timeout"
if __name__ == "__main__":
##GLOBALS
PROJECT_ID = 'your projectid'
service_account = 'your service account'
##Set up bigquery client
json_key = 'BigQueryKey.json'
client = get_client(PROJECT_ID, json_key_file=json_key, readonly=False)
##make tmp_adunitstring
BQ_DATABASE='your data base name'
BQ_TABLENAME='tmp_adunitstring'
SQL='program.sql'
runquery(BQ_DATABASE,BQ_TABLENAME,SQL)
##Export table to gc
GC_LOCATION='gs://bucket/'
xport(GC_LOCATION,BQ_DATABASE,BQ_TABLENAME)
##Copy from gc to local
FILE_NAME=BQ_TABLENAME+'.tsv'
system("gsutil -m cp -r "+GC_LOCATION+FILE_NAME+" "+FILE_NAME)
|
Back to Rick McFarland's Library