Merge pull request #4 from johndavi/master · diffbot/diffbot-python@a7d7182 · GitHub
Skip to content

Commit a7d7182

Browse files
committed
Merge pull request #4 from johndavi/master
Adding crawl update, crawl download, other fixes
2 parents 53c1f41 + 7c6df80 commit a7d7182

3 files changed

Lines changed: 48 additions & 8 deletions

File tree

README.md

Lines changed: 19 additions & 2 deletions

client.py

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ def format_version_string(version_number):
 
 class DiffbotJob(DiffbotClient):
     """
-    Various calls for managing a Diffbot Crawlbot or Bulk API job.
+    Various calls for managing a Crawlbot or Bulk API job.
     """
 
     def request(self,params):
@@ -51,6 +51,12 @@ def status(self):
         response = self.request(self.params)
         return response
 
+    def update(self,**kwargs):
+        temp_params = self.params
+        temp_params.update(kwargs)
+        response = self.request(self.params)
+        return response
+
     def delete(self):
         temp_params = self.params
         temp_params['delete'] = 1
@@ -63,19 +69,36 @@ def restart(self):
         response = self.request(temp_params)
         return response
 
+    def download(self,data_format="json"):
+        """
+        downloads the JSON output of a crawl or bulk job
+        """
+
+        download_url = '{}/v3/{}/download/{}-{}_data.{}'.format(
+            self.base_url,self.jobType,self.params['token'],self.params['name'],data_format
+        )
+        download = requests.get(download_url)
+        download.raise_for_status()
+        if data_format == "csv":
+            return download.content
+        else:
+            return download.json()
+
 class DiffbotCrawl(DiffbotJob):
     """
-    Initializes a new Diffbot crawl. Pass additional arguments as necessary.
+    Initializes a Diffbot crawl. Pass additional arguments as necessary.
     """
 
-    def __init__(self,token,name,seeds,api,apiVersion=3,**kwargs):
+    def __init__(self,token,name,seeds=None,api=None,apiVersion=3,**kwargs):
         self.params = {
             "token": token,
             "name": name,
         }
         startParams = dict(self.params)
-        startParams['seeds'] = seeds
-        startParams['apiUrl'] = self.compose_url(api,apiVersion)
+        if seeds:
+            startParams['seeds'] = seeds
+        if api:
+            startParams['apiUrl'] = self.compose_url(api,apiVersion)
         startParams.update(kwargs)
         self.jobType = "crawl"
         self.start(startParams)

example.py

Lines changed: 1 addition & 1 deletion

0 commit comments

Comments
 (0)