-
Notifications
You must be signed in to change notification settings - Fork 2
/
solr.py
210 lines (181 loc) · 6.99 KB
/
solr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# $Id$
# A simple Solr client for python.
# This is prototype level code and subject to change.
#
# quick examples on use:
#
# from solr import *
# c = SolrConnection(host='localhost:8983', persistent=True)
# c.add(id='500',name='python test doc')
# c.delete('123')
# c.commit()
# print c.search(q='id:[* TO *]', wt='python', rows='10',indent='on')
# data = c.search(q='id:500', wt='python')
# print 'first match=', eval(data)['response']['docs'][0]
import httplib
import socket
from xml.dom.minidom import parseString
import codecs
import urllib
from base64 import encodestring
class SolrException(Exception):
    """An exception thrown by solr connections.

    Attributes:
        httpcode: the HTTP status code associated with the failure.
        reason: a short reason string, or None.
        body: the response body, or None.  Callers may assign it after
            construction.
    """

    def __init__(self, httpcode, reason=None, body=None):
        # Pass the values on to Exception so that ``args`` is populated;
        # the standard copy/pickle machinery rebuilds an exception from
        # ``args`` and would otherwise call the constructor with no
        # arguments.
        Exception.__init__(self, httpcode, reason, body)
        self.httpcode = httpcode
        self.reason = reason
        self.body = body

    def __repr__(self):
        """Return the long form, including the response body."""
        return 'HTTP code=%s, Reason=%s, body=%s' % (
            self.httpcode, self.reason, self.body)

    def __str__(self):
        """Return the short form (status code and reason only)."""
        return 'HTTP code=%s, reason=%s' % (self.httpcode, self.reason)
class SolrConnection:
    """A small HTTP client for a Solr server.

    XML update messages are POSTed to ``<solrBase>/update`` and
    form-encoded queries are POSTed to ``<solrBase>/select``.  All request
    text is encoded as UTF-8.
    """

    def __init__(self, host='localhost:8080', solrBase='/solr', username=None,
                 password=None, persistent=True, postHeaders=None):
        """Prepare the connection object; no socket is opened yet.

        host: 'hostname:port' string handed to httplib.HTTPConnection.
        solrBase: URL path prefix of the Solr application.
        username, password: when both are set, a Basic Authorization
            header is added to the XML (update) headers.
        persistent: when false, 'Connection: close' is sent and the
            connection is closed after every request.
        postHeaders: optional dict of extra headers merged into the XML
            (update) headers.
        """
        self.host = host
        self.solrBase = solrBase
        self.persistent = persistent
        self.reconnects = 0
        self.username = username
        self.password = password
        self.encoder = codecs.getencoder('utf-8')
        # responses from Solr will always be in UTF-8
        self.decoder = codecs.getdecoder('utf-8')
        # a real connection to the server is not opened at this point.
        self.conn = httplib.HTTPConnection(self.host)
        # self.conn.set_debuglevel(1000000)  # uncomment to trace HTTP traffic

        self.xmlheaders = {'Content-Type': 'text/xml; charset=utf-8'}
        if self.username and self.password:
            auth = self.username + ':' + self.password
            # encodestring() breaks its output into lines; a header value
            # must not contain newlines, so remove all of them.
            auth = encodestring(urllib.unquote(auth)).replace('\n', '').strip()
            self.xmlheaders['Authorization'] = 'Basic ' + auth
        if postHeaders:
            self.xmlheaders.update(postHeaders)
        if not self.persistent:
            self.xmlheaders['Connection'] = 'close'

        # NOTE(review): neither the Authorization header nor postHeaders is
        # applied to the form headers used by search() -- confirm that this
        # is intended.
        self.formheaders = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8'}
        if not self.persistent:
            self.formheaders['Connection'] = 'close'

    def __str__(self):
        return 'SolrConnection{host=%s, solrBase=%s, persistent=%s, postHeaders=%s, reconnects=%s}' % \
            (self.host, self.solrBase, self.persistent, self.xmlheaders, self.reconnects)

    def __reconnect(self):
        """Close and re-open the underlying connection, counting the attempt."""
        self.reconnects += 1
        self.conn.close()
        self.conn.connect()

    def __errcheck(self, rsp):
        """Return rsp if its status is 200, otherwise raise SolrException."""
        if rsp.status != 200:
            ex = SolrException(rsp.status, rsp.reason)
            try:
                ex.body = rsp.read()
            except Exception:
                # Reading the body is best effort; the status and reason
                # are reported even when the body cannot be read.
                pass
            raise ex
        return rsp

    def doPost(self, url, body, headers):
        """POST body to url and return the response object.

        The request is retried once after a reconnect when the first
        attempt fails with socket.error or httplib.CannotSendRequest.
        Raises SolrException when the response status is not 200.
        """
        try:
            self.conn.request('POST', url, body, headers)
            return self.__errcheck(self.conn.getresponse())
        except (socket.error, httplib.CannotSendRequest):
            # Reconnect in case the connection was broken from the server
            # going down, the server timing out our persistent connection,
            # or another network failure.  Also catch
            # httplib.CannotSendRequest because the HTTPConnection object
            # can get in a bad state.
            self.__reconnect()
            self.conn.request('POST', url, body, headers)
            return self.__errcheck(self.conn.getresponse())

    def doUpdateXML(self, request):
        """POST an XML update message and return the raw response body.

        Raises SolrException on a non-200 response, or on an old-style
        error response (HTTP 200 with a non-zero ``status`` attribute on
        the ``<result>`` element).
        """
        try:
            rsp = self.doPost(self.solrBase + '/update', request, self.xmlheaders)
            data = rsp.read()
        finally:
            if not self.persistent:
                self.conn.close()
        # detect old-style error response (HTTP response code of
        # 200 with a non-zero status).
        if data.startswith('<result status="') and not data.startswith('<result status="0"'):
            data = self.decoder(data)[0]
            parsed = parseString(data)
            # getAttribute() returns a string, so compare with '0', not 0.
            status = parsed.documentElement.getAttribute('status')
            if status != '0':
                first = parsed.documentElement.firstChild
                # An empty <result/> element has no child to take text from.
                reason = first.nodeValue if first is not None else None
                raise SolrException(rsp.status, reason)
        return data

    def escapeVal(self, val):
        """Escape text for use as XML element content; return UTF-8 bytes."""
        val = val.replace("&", "&amp;")
        val = val.replace("<", "&lt;")
        val = val.replace("]]>", "]]&gt;")
        # NOTE(review): a backslash is inserted before every '/'; this is
        # not an XML escape -- confirm that callers rely on it.
        val = val.replace("/", "\\/")
        return self.encoder(val)[0]  # to utf8

    def escapeKey(self, key):
        """Escape text for use in a double-quoted XML attribute; return UTF-8 bytes."""
        key = key.replace("&", "&amp;")
        key = key.replace('"', "&quot;")
        return self.encoder(key)[0]  # to utf8

    def delete(self, id):
        """Send a delete-by-id message for the given id."""
        xstr = '<delete><id>' + self.escapeVal(unicode(id)) + '</id></delete>'
        return self.doUpdateXML(xstr)

    def deleteByQuery(self, query):
        """Send a delete-by-query message for the given query string."""
        xstr = '<delete><query>' + self.escapeVal(query) + '</query></delete>'
        return self.doUpdateXML(xstr)

    # Backward-compatible alias: the method was originally published under
    # this misspelt name.
    deleteByQyery = deleteByQuery

    def __add(self, lst, fields):
        """Append the <doc> markup for a field-name -> value mapping to lst.

        Names and values are converted with str() before escaping.
        """
        lst.append('<doc>')
        for f, v in fields.items():
            lst.append('<field name="')
            lst.append(self.escapeKey(str(f)))
            lst.append('">')
            lst.append(self.escapeVal(str(v)))
            lst.append('</field>')
        lst.append('</doc>')

    def add(self, **fields):
        """Add one document whose fields are given as keyword arguments."""
        lst = ['<add>']
        self.__add(lst, fields)
        lst.append('</add>')
        xstr = ''.join(lst)
        return self.doUpdateXML(xstr)

    def __add_list(self, lst, list_of_field_value_tuples):
        """Append the <doc> markup for (name, value) pairs to lst.

        Unlike __add, names and values are escaped as given, without a
        str() conversion, and a name may appear more than once.
        """
        lst.append('<doc>')
        for f, v in list_of_field_value_tuples:
            lst.append('<field name="')
            lst.append(self.escapeKey(f))
            lst.append('">')
            lst.append(self.escapeVal(v))
            lst.append('</field>')
        lst.append('</doc>')

    def add_list(self, list_of_field_value_tuples):
        """Add one document given as a sequence of (name, value) pairs."""
        lst = ['<add>']
        self.__add_list(lst, list_of_field_value_tuples)
        lst.append('</add>')
        xstr = ''.join(lst)
        return self.doUpdateXML(xstr)

    def addMany(self, arrOfMap):
        """Add several documents, each a field-name -> value mapping, in one request."""
        # Plain (byte) string literals, as in add(): the escaped pieces are
        # UTF-8 byte strings, and joining them with unicode literals would
        # force an implicit ASCII decode that fails on non-ASCII content.
        lst = ['<add>']
        for doc in arrOfMap:
            self.__add(lst, doc)
        lst.append('</add>')
        xstr = ''.join(lst)
        return self.doUpdateXML(xstr)

    def commit(self, waitFlush=True, waitSearcher=True, optimize=False):
        """Send a <commit/> message, or <optimize/> when optimize is true.

        Only deviations from the defaults are written as attributes.
        waitFlush=False has an effect only together with waitSearcher=False.
        """
        xstr = '<commit'
        if optimize:
            xstr = '<optimize'
        if not waitSearcher:  # just handle deviations from the default
            if not waitFlush:
                xstr += ' waitFlush="false" waitSearcher="false"'
            else:
                xstr += ' waitSearcher="false"'
        xstr += '/>'
        return self.doUpdateXML(xstr)

    def search(self, **params):
        """POST the keyword arguments as a query to /select; return the raw body.

        Sequence values are expanded into repeated parameters (doseq=True).
        Raises SolrException when the response status is not 200.
        """
        request = urllib.urlencode(params, doseq=True)
        try:
            rsp = self.doPost(self.solrBase + '/select', request, self.formheaders)
            data = rsp.read()
        finally:
            if not self.persistent:
                self.conn.close()
        return data