Package parsedatetime :: Module parsedatetime
[hide private]
[frames] | no frames]

Source Code for Module parsedatetime.parsedatetime

   1  #!/usr/bin/env python
 
   2  
 
   3  """
 
   4  Parse human-readable date/time text.
 
   5  """ 
   6  
 
   7  __license__ = """
 
   8  Copyright (c) 2004-2008 Mike Taylor
 
   9  Copyright (c) 2006-2008 Darshana Chhajed
 
  10  All rights reserved.
 
  11  
 
  12  Licensed under the Apache License, Version 2.0 (the "License");
 
  13  you may not use this file except in compliance with the License.
 
  14  You may obtain a copy of the License at
 
  15  
 
  16     http://www.apache.org/licenses/LICENSE-2.0
 
  17  
 
  18  Unless required by applicable law or agreed to in writing, software
 
  19  distributed under the License is distributed on an "AS IS" BASIS,
 
  20  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
  21  See the License for the specific language governing permissions and
 
  22  limitations under the License.
 
  23  """ 
  24  
 
  25  _debug = False 
  26  
 
  27  
 
  28  import re 
  29  import time 
  30  import datetime 
  31  import rfc822 
  32  import parsedatetime_consts 
  33  
 
  34  
 
  35  # Copied from feedparser.py
 
  36  # Universal Feedparser
 
  37  # Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
 
  38  # Originally a def inside of _parse_date_w3dtf()
 
39 -def _extract_date(m):
40 year = int(m.group('year')) 41 if year < 100: 42 year = 100 * int(time.gmtime()[0] / 100) + int(year) 43 if year < 1000: 44 return 0, 0, 0 45 julian = m.group('julian') 46 if julian: 47 julian = int(julian) 48 month = julian / 30 + 1 49 day = julian % 30 + 1 50 jday = None 51 while jday != julian: 52 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) 53 jday = time.gmtime(t)[-2] 54 diff = abs(jday - julian) 55 if jday > julian: 56 if diff < day: 57 day = day - diff 58 else: 59 month = month - 1 60 day = 31 61 elif jday < julian: 62 if day + diff < 28: 63 day = day + diff 64 else: 65 month = month + 1 66 return year, month, day 67 month = m.group('month') 68 day = 1 69 if month is None: 70 month = 1 71 else: 72 month = int(month) 73 day = m.group('day') 74 if day: 75 day = int(day) 76 else: 77 day = 1 78 return year, month, day
79 80 # Copied from feedparser.py 81 # Universal Feedparser 82 # Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. 83 # Originally a def inside of _parse_date_w3dtf()
84 -def _extract_time(m):
85 if not m: 86 return 0, 0, 0 87 hours = m.group('hours') 88 if not hours: 89 return 0, 0, 0 90 hours = int(hours) 91 minutes = int(m.group('minutes')) 92 seconds = m.group('seconds') 93 if seconds: 94 seconds = int(seconds) 95 else: 96 seconds = 0 97 return hours, minutes, seconds
98 99 100 # Copied from feedparser.py 101 # Universal Feedparser 102 # Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. 103 # Modified to return a tuple instead of mktime 104 # 105 # Original comment: 106 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by 107 # Drake and licensed under the Python license. Removed all range checking 108 # for month, day, hour, minute, and second, since mktime will normalize 109 # these later
def _parse_date_w3dtf(dateString):
    """
    Parse a W3DTF-style date/time string into a 9-item time tuple.

    The time zone designator is matched by the pattern but is not applied
    to the result, and the last three items of the tuple are always zero.

    @type  dateString: string
    @param dateString: text to parse, e.g. C{'2006-05-28T10:20:30Z'}

    @rtype:  tuple
    @return: C{(year, month, day, hours, minutes, seconds, 0, 0, 0)}, or
             C{None} when the whole of dateString does not match
    """
    # the __extract_date and __extract_time methods were
    # copied-out so they could be used by my code --bear
    date_re = (r'(?P<year>\d\d\d\d)'
               r'(?:(?P<dsep>-|)'
               r'(?:(?P<julian>\d\d\d)'
               r'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    tzd_re = r'(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    time_re = (r'(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
               r'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
               + tzd_re)
    datetime_rx = re.compile('%s(?:T%s)?' % (date_re, time_re))

    m = datetime_rx.match(dateString)
    # reject partial matches: the pattern must consume the entire string
    if (m is None) or (m.group() != dateString):
        return None
    return _extract_date(m) + _extract_time(m) + (0, 0, 0)


# Copied from feedparser.py
# Universal Feedparser
# Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
# Modified to return a tuple instead of mktime
#
154 -def _parse_date_rfc822(dateString):
155 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' 156 data = dateString.split() 157 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: 158 del data[0] 159 if len(data) == 4: 160 s = data[3] 161 i = s.find('+') 162 if i > 0: 163 data[3:] = [s[:i], s[i+1:]] 164 else: 165 data.append('') 166 dateString = " ".join(data) 167 if len(data) < 5: 168 dateString += ' 00:00:00 GMT' 169 return rfc822.parsedate_tz(dateString)

# Extra zone abbreviations on top of the ones rfc822.py already knows.
# Each generic name gets the same offset as its standard-time zone,
# e.g. 'ET' is equivalent to 'EST'.
_additional_timezones = {
    'AT': -400,
    'ET': -500,
    'CT': -600,
    'MT': -700,
    'PT': -800,
}
rfc822._timezones.update(_additional_timezones)


179 -class Calendar:
180 """ 181 A collection of routines to input, parse and manipulate date and times. 182 The text can either be 'normal' date values or it can be human readable. 183 """ 184
185 - def __init__(self, constants=None):
186 """ 187 Default constructor for the L{Calendar} class. 188 189 @type constants: object 190 @param constants: Instance of the class L{parsedatetime_consts.Constants} 191 192 @rtype: object 193 @return: L{Calendar} instance 194 """ 195 # if a constants reference is not included, use default 196 if constants is None: 197 self.ptc = parsedatetime_consts.Constants() 198 else: 199 self.ptc = constants 200 201 self.weekdyFlag = False # monday/tuesday/... 202 self.dateStdFlag = False # 07/21/06 203 self.dateStrFlag = False # July 21st, 2006 204 self.timeStdFlag = False # 5:50 205 self.meridianFlag = False # am/pm 206 self.dayStrFlag = False # tomorrow/yesterday/today/.. 207 self.timeStrFlag = False # lunch/noon/breakfast/... 208 self.modifierFlag = False # after/before/prev/next/.. 209 self.modifier2Flag = False # after/before/prev/next/.. 210 self.unitsFlag = False # hrs/weeks/yrs/min/.. 211 self.qunitsFlag = False # h/m/t/d.. 212 213 self.timeFlag = 0 214 self.dateFlag = 0
215 216
217 - def _convertUnitAsWords(self, unitText):
218 """ 219 Converts text units into their number value 220 221 Five = 5 222 Twenty Five = 25 223 Two hundred twenty five = 225 224 Two thousand and twenty five = 2025 225 Two thousand twenty five = 2025 226 227 @type unitText: string 228 @param unitText: number text to convert 229 230 @rtype: integer 231 @return: numerical value of unitText 232 """ 233 # TODO: implement this 234 pass
235 236
237 - def _buildTime(self, source, quantity, modifier, units):
238 """ 239 Take C{quantity}, C{modifier} and C{unit} strings and convert them into values. 240 After converting, calcuate the time and return the adjusted sourceTime. 241 242 @type source: time 243 @param source: time to use as the base (or source) 244 @type quantity: string 245 @param quantity: quantity string 246 @type modifier: string 247 @param modifier: how quantity and units modify the source time 248 @type units: string 249 @param units: unit of the quantity (i.e. hours, days, months, etc) 250 251 @rtype: struct_time 252 @return: C{struct_time} of the calculated time 253 """ 254 if _debug: 255 print '_buildTime: [%s][%s][%s]' % (quantity, modifier, units) 256 257 if source is None: 258 source = time.localtime() 259 260 if quantity is None: 261 quantity = '' 262 else: 263 quantity = quantity.strip() 264 265 if len(quantity) == 0: 266 qty = 1 267 else: 268 try: 269 qty = int(quantity) 270 except ValueError: 271 qty = 0 272 273 if modifier in self.ptc.Modifiers: 274 qty = qty * self.ptc.Modifiers[modifier] 275 276 if units is None or units == '': 277 units = 'dy' 278 279 # plurals are handled by regex's (could be a bug tho) 280 281 (yr, mth, dy, hr, mn, sec, _, _, _) = source 282 283 start = datetime.datetime(yr, mth, dy, hr, mn, sec) 284 target = start 285 286 if units.startswith('y'): 287 target = self.inc(start, year=qty) 288 self.dateFlag = 1 289 elif units.endswith('th') or units.endswith('ths'): 290 target = self.inc(start, month=qty) 291 self.dateFlag = 1 292 else: 293 if units.startswith('d'): 294 target = start + datetime.timedelta(days=qty) 295 self.dateFlag = 1 296 elif units.startswith('h'): 297 target = start + datetime.timedelta(hours=qty) 298 self.timeFlag = 2 299 elif units.startswith('m'): 300 target = start + datetime.timedelta(minutes=qty) 301 self.timeFlag = 2 302 elif units.startswith('s'): 303 target = start + datetime.timedelta(seconds=qty) 304 self.timeFlag = 2 305 elif units.startswith('w'): 306 target = start + 
datetime.timedelta(weeks=qty) 307 self.dateFlag = 1 308 309 return target.timetuple()
310 311
312 - def parseDate(self, dateString):
313 """ 314 Parse short-form date strings:: 315 316 '05/28/2006' or '04.21' 317 318 @type dateString: string 319 @param dateString: text to convert to a C{datetime} 320 321 @rtype: struct_time 322 @return: calculated C{struct_time} value of dateString 323 """ 324 yr, mth, dy, hr, mn, sec, wd, yd, isdst = time.localtime() 325 326 # values pulled from regex's will be stored here and later 327 # assigned to mth, dy, yr based on information from the locale 328 # -1 is used as the marker value because we want zero values 329 # to be passed thru so they can be flagged as errors later 330 v1 = -1 331 v2 = -1 332 v3 = -1 333 334 s = dateString 335 m = self.ptc.CRE_DATE2.search(s) 336 if m is not None: 337 index = m.start() 338 v1 = int(s[:index]) 339 s = s[index + 1:] 340 341 m = self.ptc.CRE_DATE2.search(s) 342 if m is not None: 343 index = m.start() 344 v2 = int(s[:index]) 345 v3 = int(s[index + 1:]) 346 else: 347 v2 = int(s.strip()) 348 349 v = [ v1, v2, v3 ] 350 d = { 'm': mth, 'd': dy, 'y': yr } 351 352 for i in range(0, 3): 353 n = v[i] 354 c = self.ptc.dp_order[i] 355 if n >= 0: 356 d[c] = n 357 358 # if the year is not specified and the date has already 359 # passed, increment the year 360 if v3 == -1 and ((mth > d['m']) or (mth == d['m'] and dy > d['d'])): 361 yr = d['y'] + 1 362 else: 363 yr = d['y'] 364 365 mth = d['m'] 366 dy = d['d'] 367 368 # birthday epoch constraint 369 if yr < self.ptc.BirthdayEpoch: 370 yr += 2000 371 elif yr < 100: 372 yr += 1900 373 374 if _debug: 375 print 'parseDate: ', yr, mth, dy, self.ptc.daysInMonth(mth, yr) 376 377 if (mth > 0 and mth <= 12) and \ 378 (dy > 0 and dy <= self.ptc.daysInMonth(mth, yr)): 379 sourceTime = (yr, mth, dy, hr, mn, sec, wd, yd, isdst) 380 else: 381 self.dateFlag = 0 382 self.timeFlag = 0 383 sourceTime = time.localtime() # return current time if date 384 # string is invalid 385 386 return sourceTime
387 388
389 - def parseDateText(self, dateString):
390 """ 391 Parse long-form date strings:: 392 393 'May 31st, 2006' 394 'Jan 1st' 395 'July 2006' 396 397 @type dateString: string 398 @param dateString: text to convert to a datetime 399 400 @rtype: struct_time 401 @return: calculated C{struct_time} value of dateString 402 """ 403 yr, mth, dy, hr, mn, sec, wd, yd, isdst = time.localtime() 404 405 currentMth = mth 406 currentDy = dy 407 408 s = dateString.lower() 409 m = self.ptc.CRE_DATE3.search(s) 410 mth = m.group('mthname') 411 mth = self.ptc.MonthOffsets[mth] 412 413 if m.group('day') != None: 414 dy = int(m.group('day')) 415 else: 416 dy = 1 417 418 if m.group('year') != None: 419 yr = int(m.group('year')) 420 421 # birthday epoch constraint 422 if yr < self.ptc.BirthdayEpoch: 423 yr += 2000 424 elif yr < 100: 425 yr += 1900 426 427 elif (mth < currentMth) or (mth == currentMth and dy < currentDy): 428 # if that day and month have already passed in this year, 429 # then increment the year by 1 430 yr += 1 431 432 if dy > 0 and dy <= self.ptc.daysInMonth(mth, yr): 433 sourceTime = (yr, mth, dy, hr, mn, sec, wd, yd, isdst) 434 else: 435 # Return current time if date string is invalid 436 self.dateFlag = 0 437 self.timeFlag = 0 438 sourceTime = time.localtime() 439 440 return sourceTime
441 442
443 - def evalRanges(self, datetimeString, sourceTime=None):
444 """ 445 Evaluate the C{datetimeString} text and determine if 446 it represents a date or time range. 447 448 @type datetimeString: string 449 @param datetimeString: datetime text to evaluate 450 @type sourceTime: struct_time 451 @param sourceTime: C{struct_time} value to use as the base 452 453 @rtype: tuple 454 @return: tuple of: start datetime, end datetime and the invalid flag 455 """ 456 startTime = '' 457 endTime = '' 458 startDate = '' 459 endDate = '' 460 rangeFlag = 0 461 462 s = datetimeString.strip().lower() 463 464 if self.ptc.rangeSep in s: 465 s = s.replace(self.ptc.rangeSep, ' %s ' % self.ptc.rangeSep) 466 s = s.replace(' ', ' ') 467 468 m = self.ptc.CRE_TIMERNG1.search(s) 469 if m is not None: 470 rangeFlag = 1 471 else: 472 m = self.ptc.CRE_TIMERNG2.search(s) 473 if m is not None: 474 rangeFlag = 2 475 else: 476 m = self.ptc.CRE_TIMERNG4.search(s) 477 if m is not None: 478 rangeFlag = 7 479 else: 480 m = self.ptc.CRE_TIMERNG3.search(s) 481 if m is not None: 482 rangeFlag = 3 483 else: 484 m = self.ptc.CRE_DATERNG1.search(s) 485 if m is not None: 486 rangeFlag = 4 487 else: 488 m = self.ptc.CRE_DATERNG2.search(s) 489 if m is not None: 490 rangeFlag = 5 491 else: 492 m = self.ptc.CRE_DATERNG3.search(s) 493 if m is not None: 494 rangeFlag = 6 495 496 if _debug: 497 print 'evalRanges: rangeFlag =', rangeFlag, '[%s]' % s 498 499 if m is not None: 500 if (m.group() != s): 501 # capture remaining string 502 parseStr = m.group() 503 chunk1 = s[:m.start()] 504 chunk2 = s[m.end():] 505 s = '%s %s' % (chunk1, chunk2) 506 flag = 1 507 508 sourceTime, flag = self.parse(s, sourceTime) 509 510 if flag == 0: 511 sourceTime = None 512 else: 513 parseStr = s 514 515 if rangeFlag == 1: 516 m = re.search(self.ptc.rangeSep, parseStr) 517 startTime, sflag = self.parse((parseStr[:m.start