diff --git a/.gitattributes b/.gitattributes index 715654c63704b78684770ed922cbac4124da76a0..e8d158d0ea9bb065719cb7307723571c34bbb7ff 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2694,6 +2694,44 @@ LCU/checkhardware/rtsm.py -text LCU/checkhardware/showBadSpectra.py -text LCU/checkhardware/showTestResult.py -text LCU/checkhardware/updatePVSS.py -text +LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/PKG-INFO -text +LTA/LTAIngest/ClientForm-0.1.17/PKG-INFO -text +LTA/LTAIngest/SOAPpy-0.12.0/LICENSE -text +LTA/LTAIngest/SOAPpy-0.12.0/PKG-INFO -text +LTA/LTAIngest/SOAPpy-0.12.0/RELEASE_INFO -text +LTA/LTAIngest/SOAPpy-0.12.0/TODO -text +LTA/LTAIngest/SOAPpy-0.12.0/tests/TemperatureService.wsdl -text +LTA/LTAIngest/SOAPpy-0.12.0/validate/server.pem -text +LTA/LTAIngest/SOAPpy-0.12.0/validate/silab.servers -text +LTA/LTAIngest/dav/.serverrc -text +LTA/LTAIngest/doc/LTA-SIP.xsd -text +LTA/LTAIngest/example.job -text +LTA/LTAIngest/fpconst-0.7.0/PKG-INFO -text +LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/PKG-INFO -text +LTA/LTAIngest/md5adler/a32 -text +LTA/LTAIngest/md5adler/foo -text +LTA/LTAIngest/md5adler/md5a32 -text +LTA/LTAIngest/mechanize-0.2.5/PKG-INFO -text +LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.dat -text +LTA/LTAIngest/mechanize-0.2.5/examples/forms/echo.cgi -text +LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/PKG-INFO -text +LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/zip-safe -text +LTA/LTAIngest/mechanize-0.2.5/test-tools/cookietest.cgi -text +LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_example/output -text +LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_simple/output -text +LTA/LTAIngest/mechanize-0.2.5/test/test_browser.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_forms.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_history.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_html.doctest -text 
+LTA/LTAIngest/mechanize-0.2.5/test/test_opener.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_password_manager.special_doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_request.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_response.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_rfc3986.doctest -text +LTA/LTAIngest/mechanize-0.2.5/test/test_robotfileparser.doctest -text +LTA/LTAIngest/ssh_check.sh eol=lf +LTA/LTAIngest/test.xmlrpc -text +LTA/LTAIngest/user_ingest_example -text MAC/APL/APLCommon/include/APL/APLCommon/AntennaField.h -text MAC/APL/APLCommon/src/AntennaField.cc -text MAC/APL/APLCommon/src/StartDaemon_Protocol.prot -text svneol=native#application/octet-stream diff --git a/CMake/LofarPackageList.cmake b/CMake/LofarPackageList.cmake index ac8adee38dfc442b83206f22d1c994c5aa3cb6b6..070c8b421bd61b8e1006ac92e5ba99038a8e8491 100644 --- a/CMake/LofarPackageList.cmake +++ b/CMake/LofarPackageList.cmake @@ -145,4 +145,5 @@ if(NOT DEFINED LOFAR_PACKAGE_LIST_INCLUDED) set(SAS_OTDB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/SubSystems/SAS_OTDB) set(PVSS_DB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/SubSystems/PVSS_DB) set(LAPS_CEP_SOURCE_DIR ${CMAKE_SOURCE_DIR}/SubSystems/LAPS_CEP) + set(LTAIngest_SOURCE_DIR ${CMAKE_SOURCE_DIR}/LTA/LTAIngest) endif(NOT DEFINED LOFAR_PACKAGE_LIST_INCLUDED) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d00590eb6d2f8a8a0d415971cc3f74d1918cd9..ccb788fe33c4bb7b2458a73210da07d583a68bda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ if(NOT DEFINED BUILD_PACKAGES) lofar_add_package(SAS) lofar_add_package(MAC) lofar_add_package(LCU) + lofar_add_package(LTA) lofar_add_package(SubSystems) else(NOT DEFINED BUILD_PACKAGES) separate_arguments(BUILD_PACKAGES) diff --git a/LTA/CMakeLists.txt b/LTA/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b09c313b57e8ae270d06944035fddc7666024658 --- /dev/null +++ b/LTA/CMakeLists.txt @@ -0,0 +1,3 @@ +# $Id$ + +lofar_add_package(LTAIngest) 
diff --git a/LTA/LTAIngest/CMakeLists.txt b/LTA/LTAIngest/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cb994c0c0c09854369e30f0510f6a0ecb281804 --- /dev/null +++ b/LTA/LTAIngest/CMakeLists.txt @@ -0,0 +1,244 @@ +# $Id$ + +lofar_package(LTAIngest 0.1) # DEPENDS Common) + +include(PythonInstall) + +python_install( + ingestpipeline.py + ClientForm-0.1.17/ClientForm-0.1.17/test.py + ClientForm-0.1.17/ClientForm-0.1.17/ClientForm.py + ClientForm-0.1.17/ClientForm-0.1.17/setup.py + ClientForm-0.1.17/test.py + ClientForm-0.1.17/ClientForm.py + ClientForm-0.1.17/setup.py + .ropeproject/config.py + ingestpipeline_test.py + mechanize-0.2.5/examples/hack21.py + mechanize-0.2.5/examples/pypi.py + mechanize-0.2.5/examples/forms/simple.py + mechanize-0.2.5/examples/forms/example.py + mechanize-0.2.5/mechanize/_debug.py + mechanize-0.2.5/mechanize/_util.py + mechanize-0.2.5/mechanize/_lwpcookiejar.py + mechanize-0.2.5/mechanize/_mechanize.py + mechanize-0.2.5/mechanize/_sockettimeout.py + mechanize-0.2.5/mechanize/_sgmllib_copy.py + mechanize-0.2.5/mechanize/_firefox3cookiejar.py + mechanize-0.2.5/mechanize/_markupbase.py + mechanize-0.2.5/mechanize/_urllib2_fork.py + mechanize-0.2.5/mechanize/_beautifulsoup.py + mechanize-0.2.5/mechanize/_http.py + mechanize-0.2.5/mechanize/_version.py + mechanize-0.2.5/mechanize/_request.py + mechanize-0.2.5/mechanize/_pullparser.py + mechanize-0.2.5/mechanize/__init__.py + mechanize-0.2.5/mechanize/_mozillacookiejar.py + mechanize-0.2.5/mechanize/_useragent.py + mechanize-0.2.5/mechanize/_opener.py + mechanize-0.2.5/mechanize/_rfc3986.py + mechanize-0.2.5/mechanize/_form.py + mechanize-0.2.5/mechanize/_auth.py + mechanize-0.2.5/mechanize/_response.py + mechanize-0.2.5/mechanize/_clientcookie.py + mechanize-0.2.5/mechanize/_urllib2.py + mechanize-0.2.5/mechanize/_gzip.py + mechanize-0.2.5/mechanize/_msiecookiejar.py + mechanize-0.2.5/mechanize/_html.py + mechanize-0.2.5/mechanize/_testcase.py + 
mechanize-0.2.5/mechanize/_headersutil.py + mechanize-0.2.5/test.py + mechanize-0.2.5/release.py + mechanize-0.2.5/test/test_headers.py + mechanize-0.2.5/test/test_browser.py + mechanize-0.2.5/test/test_html.py + mechanize-0.2.5/test/test_useragent.py + mechanize-0.2.5/test/test_pickle.py + mechanize-0.2.5/test/test_date.py + mechanize-0.2.5/test/test_performance.py + mechanize-0.2.5/test/test_api.py + mechanize-0.2.5/test/test_import.py + mechanize-0.2.5/test/test_functional.py + mechanize-0.2.5/test/test_response.py + mechanize-0.2.5/test/__init__.py + mechanize-0.2.5/test/test_form.py + mechanize-0.2.5/test/test_cookies.py + mechanize-0.2.5/test/test_opener.py + mechanize-0.2.5/test/test_urllib2.py + mechanize-0.2.5/test/test_unittest.py + mechanize-0.2.5/test/test_form_mutation.py + mechanize-0.2.5/test/test_urllib2_localnet.py + mechanize-0.2.5/test/test_cookie.py + mechanize-0.2.5/test/test_pullparser.py + mechanize-0.2.5/test-tools/doctest.py + mechanize-0.2.5/test-tools/linecache_copy.py + mechanize-0.2.5/test-tools/twisted-ftpserver.py + mechanize-0.2.5/test-tools/testprogram.py + mechanize-0.2.5/test-tools/functools_copy.py + mechanize-0.2.5/test-tools/unittest/case.py + mechanize-0.2.5/test-tools/unittest/loader.py + mechanize-0.2.5/test-tools/unittest/__main__.py + mechanize-0.2.5/test-tools/unittest/result.py + mechanize-0.2.5/test-tools/unittest/suite.py + mechanize-0.2.5/test-tools/unittest/__init__.py + mechanize-0.2.5/test-tools/unittest/runner.py + mechanize-0.2.5/test-tools/unittest/main.py + mechanize-0.2.5/test-tools/unittest/util.py + mechanize-0.2.5/test-tools/twisted-localserver.py + mechanize-0.2.5/ez_setup.py + mechanize-0.2.5/setup.py + mechanize/_debug.py + mechanize/_util.py + mechanize/_lwpcookiejar.py + mechanize/_mechanize.py + mechanize/_sockettimeout.py + mechanize/_sgmllib_copy.py + mechanize/_firefox3cookiejar.py + mechanize/_markupbase.py + mechanize/_urllib2_fork.py + mechanize/_beautifulsoup.py + mechanize/_http.py + 
mechanize/_version.py + mechanize/_request.py + mechanize/_pullparser.py + mechanize/__init__.py + mechanize/_mozillacookiejar.py + mechanize/_useragent.py + mechanize/_opener.py + mechanize/_rfc3986.py + mechanize/_form.py + mechanize/_auth.py + mechanize/_response.py + mechanize/_clientcookie.py + mechanize/_urllib2.py + mechanize/_gzip.py + mechanize/_msiecookiejar.py + mechanize/_html.py + mechanize/_testcase.py + mechanize/_headersutil.py + fpconst-0.7.0/fpconst-0.7.0/fpconst.py + fpconst-0.7.0/fpconst-0.7.0/setup.py + fpconst-0.7.0/fpconst.py + fpconst-0.7.0/setup.py + test/test_ingest_logging.py + test/__init__.py + unspecifiedSIP.py + h5_check.py + mom_http.py + dav/davlib_orig.py + dav/davlib.py + dav/httplib.py + dav/webdav/WebdavRequests.py + dav/webdav/WebdavClient.py + dav/webdav/Constants.py + dav/webdav/Utils.py + dav/webdav/logger.py + dav/webdav/__init__.py + dav/webdav/Condition.py + dav/webdav/Connection.py + dav/webdav/VersionHandler.py + dav/webdav/NameCheck.py + dav/webdav/WebdavResponse.py + dav/webdav/acp/GrantDeny.py + dav/webdav/acp/Ace.py + dav/webdav/acp/Acl.py + dav/webdav/acp/Privilege.py + dav/webdav/acp/__init__.py + dav/webdav/acp/AceHandler.py + dav/webdav/acp/Principal.py + dav/wsrt_webdavlib.py + dav/davhttplib.py + dav/qp_xml.py + job_parser.py + find_files.py + ingest_config_test.py + multiprocessing/reduction.py + multiprocessing/process.py + multiprocessing/queues.py + multiprocessing/connection.py + multiprocessing/dummy/connection.py + multiprocessing/dummy/__init__.py + multiprocessing/heap.py + multiprocessing/sharedctypes.py + multiprocessing/pool.py + multiprocessing/__init__.py + multiprocessing/forking.py + multiprocessing/synchronize.py + multiprocessing/managers.py + multiprocessing/util.py + slave.py + sitecustomize.py + SOAPpy-0.12.0/tools/interop2html.py + SOAPpy-0.12.0/tests/xmethods.py + SOAPpy-0.12.0/tests/echoClient.py + SOAPpy-0.12.0/tests/excelTest.py + SOAPpy-0.12.0/tests/translateTest.py + 
SOAPpy-0.12.0/tests/simpleWSDL.py + SOAPpy-0.12.0/tests/echoServer.py + SOAPpy-0.12.0/tests/quoteTest.py + SOAPpy-0.12.0/tests/echoHeader.py + SOAPpy-0.12.0/tests/TCtest.py + SOAPpy-0.12.0/tests/weatherTest.py + SOAPpy-0.12.0/tests/esj_test_client.py + SOAPpy-0.12.0/tests/BabelfishWSDLTest.py + SOAPpy-0.12.0/tests/testWSDL.py + SOAPpy-0.12.0/tests/Bug1001646.py + SOAPpy-0.12.0/tests/ZeroLengthArray.py + SOAPpy-0.12.0/tests/testleak.py + SOAPpy-0.12.0/tests/ComplexTypes.py + SOAPpy-0.12.0/tests/esj_test_server.py + SOAPpy-0.12.0/tests/newsTest.py + SOAPpy-0.12.0/tests/SOAPtest.py + SOAPpy-0.12.0/tests/largeDataTest.py + SOAPpy-0.12.0/tests/Bug916265.py + SOAPpy-0.12.0/tests/cardServer.py + SOAPpy-0.12.0/tests/storageTest.py + SOAPpy-0.12.0/tests/alanbushTest.py + SOAPpy-0.12.0/tests/whoisTest.py + SOAPpy-0.12.0/tests/GoogleTest.py + SOAPpy-0.12.0/tests/speedTest.py + SOAPpy-0.12.0/tests/testClient1.py + SOAPpy-0.12.0/tests/Bug918216.py + SOAPpy-0.12.0/tests/cardClient.py + SOAPpy-0.12.0/bid/inventoryClient.py + SOAPpy-0.12.0/bid/monitorClient.py + SOAPpy-0.12.0/bid/inventoryServer.py + SOAPpy-0.12.0/SOAPpy/Errors.py + SOAPpy-0.12.0/SOAPpy/Server.py + SOAPpy-0.12.0/SOAPpy/NS.py + SOAPpy-0.12.0/SOAPpy/SOAPBuilder.py + SOAPpy-0.12.0/SOAPpy/Utilities.py + SOAPpy-0.12.0/SOAPpy/version.py + SOAPpy-0.12.0/SOAPpy/wstools/Namespaces.py + SOAPpy-0.12.0/SOAPpy/wstools/Utility.py + SOAPpy-0.12.0/SOAPpy/wstools/WSDLTools.py + SOAPpy-0.12.0/SOAPpy/wstools/TimeoutSocket.py + SOAPpy-0.12.0/SOAPpy/wstools/XMLSchema.py + SOAPpy-0.12.0/SOAPpy/wstools/logging.py + SOAPpy-0.12.0/SOAPpy/wstools/UserTuple.py + SOAPpy-0.12.0/SOAPpy/wstools/__init__.py + SOAPpy-0.12.0/SOAPpy/wstools/c14n.py + SOAPpy-0.12.0/SOAPpy/wstools/XMLname.py + SOAPpy-0.12.0/SOAPpy/GSIServer.py + SOAPpy-0.12.0/SOAPpy/__init__.py + SOAPpy-0.12.0/SOAPpy/Client.py + SOAPpy-0.12.0/SOAPpy/URLopener.py + SOAPpy-0.12.0/SOAPpy/Types.py + SOAPpy-0.12.0/SOAPpy/WSDL.py + SOAPpy-0.12.0/SOAPpy/Parser.py + 
SOAPpy-0.12.0/SOAPpy/Config.py + SOAPpy-0.12.0/SOAPpy/SOAP.py + SOAPpy-0.12.0/contrib/soap_cli.py + SOAPpy-0.12.0/contrib/soap_handler.py + SOAPpy-0.12.0/setup.py + SOAPpy-0.12.0/validate/silabclient.py + SOAPpy-0.12.0/validate/soapware.py + SOAPpy-0.12.0/validate/silabserver.py + job_group.py + master.py + ingest_config.py + user_ingest.py + do_ltacp.py + simple_server.py + __init__.py + DESTINATION LTAIngest) diff --git a/LTA/LTAIngest/ClientForm-0.1.17/COPYING b/LTA/LTAIngest/ClientForm-0.1.17/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..f54e8c49c6926f132a395667cfdf0230fc8372db --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/COPYING @@ -0,0 +1,31 @@ +Copyright (c) 2002-2004 John J. Lee <jjl@pobox.com> +Copyright (c) 1998-2000 Gisle Aas + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +Neither the name of the contributors nor the names of their employers +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ChangeLog b/LTA/LTAIngest/ClientForm-0.1.17/ChangeLog new file mode 100644 index 0000000000000000000000000000000000000000..3573a2ebd78602b9c449dfe81c3294337bfffac4 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ChangeLog @@ -0,0 +1,578 @@ +This isn't really in proper GNU ChangeLog format, it just happens to +look that way. + +2005-01-30 John J Lee <jjl@pobox.com> + * Fix failure to raise ParseError (!) + * Workaround for failure of sgmllib to unescape attributes (bug + report from Titus Brown). + * Released 0.1.17. + +2005-01-17 John J Lee <jjl@pobox.com> + * Fixed case where FORM action contains a '?' or '#' (again). + * Allow user to supply own Request class (Tobias). + * Fix ISINDEX action URL (bug only showed up in Python 2.4). + * Fix image control in case where value is present. + * Hack choose_boundary not to fail on socket.gaierror. + +2004-05-15 John J Lee <jjl@pobox.com> + * Released 0.1.16 and 0.0.16. + + * 0.1.x and 0.0.x: + * Fixed case where FORM action contains a '?' (bug report from + Moof). + + * 0.1.x only: + * Look for BASE element attribute 'href', not 'uri'! (patch from + Jochen Knuth) + * Applied workaround for file upload for AOLServer (patch from + Andrei Mitran). + * Added optional form_parser_class arguments to allow choice + between htmllib and HTMLParser modules. + * Added a SelectControl._delete_items() method, useful for quick- + hack JS simulation. 
Not yet a stable interface, hence the + initial underscore. + + * 0.0.x only: + * SubmitControls with no initial value in HTML are now successful + (default value is '', not None) + +2004-01-22 John J Lee <jjl@pobox.com> + * 0.1.x only: + * Cleaned up docs a bit, and removed references to toggle methods. + +2004-01-05 John J Lee <jjl@pobox.com> + * 0.1.x only: + * Take note of base element. Thanks to Phillip J. Eby for bug + report. + * All form attributes are now available in HTMLForm.attrs + (previously, name, action, method and enctype were not present). + * Released 0.1.15. + +2004-01-01 John J Lee <jjl@pobox.com> + * 0.1.x only: Disovered ignore_errors was ignored by + ParseResponse! It seems nobody uses it from ParseResponse, and + it's probably worthless anyway. Also, I just now realise that + FormParser.error() is actually overriding a base class method + without my noticing it! The arguments are still there, but + they're now ignored. Thanks to Per Cederqvist. + +2003-12-24 John J Lee <jjl@pobox.com> + * Modified setup.py so can easily register with PyPI. + +2003-12-06 John J Lee <jjl@pobox.com> + * Fixed bug where ClientForm.urlencode choked on Unicode. + * Released 0.1.14 and 0.0.15. + +2003-11-14 John J Lee <jjl@pobox.com> + * A few doc fixes in HTMLForm.__doc__. + * Minor code clean-up. + +2003-11-12 John J Lee <jjl@pobox.com> + * Fixed bug where empty OPTION caused KeyError. Thanks to Doug + Henderson. + * Released 0.1.13 and 0.0.14. + +2003-11-11 John J Lee <jjl@pobox.com> + * Fixed bugs where TEXTAREA or OPTION containing entity reference + would result in truncated element contents. Thanks to Michael + Howitz again! + * Applied fixes to 0.0.x for ImageControl integer coordinates, + TEXTAREA content .strip()ping and entity references in TEXTAREA + and OPTION. + * Released 0.1.12 and 0.0.13. + +2003-11-07 John J Lee <jjl@pobox.com> + * TEXTAREA contents are no longer .strip()ped on form parsing. + * Released 0.1.11. 
+ +2003-11-03 John J Lee <jjl@pobox.com> + * Fixed ImageControl.pairs(): return value contained integer + coordinates instead of strings. Thanks to Michael Howitz. + +2003-10-31 John J Lee <jjl@pobox.com> + * XHTML support for Pythons >= 2.2. Thanks to Michael Howitz. + * Released 0.1.10. + +2003-10-02 John J Lee <jjl@pobox.com> + * Bugfix: selection of default control to click on is supposed to + only happen if no control is explictly requested, but id wasn't + included in that. Now, it is. + +2003-09-28 John J Lee <jjl@pobox.com> + * Fixed HTMLForm.attrs. Thanks to Scott Chapman. + * Released ClientForm 0.0.12 and 0.1.9 (first stable release of + 0.1.x). + +2003-09-21 John J Lee <jjl@pobox.com> + * Interface change (sorry): id is now supported. This means + Controls have an id attribute, and appropriate HTMLForm methods + have an id argument. This will only affect people using + positional arguments after the 'kind' argument. + * Interface change: BUTTON/BUTTON now has type "buttonbutton" (was + "button") to prevent clash with type of INPUT/BUTTON (was and is + "button"). Both types of control are ignored anyway (ie. + represented by IgnoreControl), so it's unlikely any code is + affected. + * SubmitControl value now defaults to "", so it is successful even + when no value is given in the HTML. + * Extraneous "\r\n\r\n" at start of multipart/form-data POST data + removed. + * Multiple file upload now emits multipart/mixed, rather than + multipart/multipart/mixed, as content-type! + * Content-disposition header now comes before content-type, in + case that matters... + * Slight tweak to SelectControl.fixup, to fix case where multiple + SELECT is empty. + * Released 0.1.8b. + +2003-07-12 John J Lee <jjl@pobox.com> + * Added indication to ListControl.__str__ of disabled items -- + they have parentheses around them: item 1, (item 2), item 3 + means "item 2" is disabled. + * Released 0.1.7b. 
+ +2003-07-10 John J Lee <jjl@pobox.com> + * Removed assertion that self.value is None in + IgnoreControl.__init__. Now sets value to None instead. Thanks + to Martijn Faasen for bug report. Same for FileControl. + +2003-07-23 John J Lee <jjl@pobox.com> + * 0.1.6a changes: + * After some thought about Law of Demeter, realised that there was + no justification for deprecating most use of find_control, nor + for all of the new methods on HTMLForm. Use of find_control + is now officially OK again. set_/get_readonly, + set_/get_disabled, set_/get_item_disabled and + set_all_items disabled, have been removed from HTMLForm. + * Added HTMLForm.set_all_readonly method. This one is actually + useful! + * All methods on Controls that used to be separate _by_label + methods are now by_label arguments, now I see that labels can be + defined for all controls' items. The exceptions are + set_value_by_label and get_value_by_label, since there is no + method to add an argument to in those cases. The lack of + implementation of by_label for CHECKBOX and RADIO is considered + a bug, so NotImplementedError is raised. LabelNotSupportedError + has gone. + * Released 0.1.6a. + +2003-07-22 John J Lee <jjl@pobox.com> + * 0.1.6a changes: + * Added some tests for new HTMLForm methods. + * ListControl.readonly now exists and works. + * Corrected error message for predicate arg of + HTMLForm.find_control. + * Fixed HTMLForm.get_readonly. + * Fixed ListControl.get_item_disabled. + * Fixed exception raised by ListControl: was TypeError, now + LabelNotSupportedError. + * Fixed ListControl.get_value, .set_value and .possible_items + exception messages. + * Enforced restriction on new HTMLForm methods that at least one + find_control argument must be supplied. + +2003-07-14 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Removed listcontrol arg from HTMLForm methods, added kind and + predicate arguments to .find_control. kind argument is + available in most HTMLForm methods. 
+ * set methods take selected argument set(selected, "itemname"), + and the clear methods are gone. + +2003-07-13 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * FileControl now unsuccessful when disabled attribute is true. + * Moved most method definitions in Control into ScalarControl. + * ListControls now always take sequence values, never string-like + values. + * listcontrol argument on appropriate HTMLForm methods, in + addition to name, type and nr. This allows you to ask for a + ListControl without specifying the exact type. + * All controls now have the readonly attribute. + * Renamed get_value_as_label --> get_value_by_label. + * Renamed possible_values --> possible_items. + * Renamed possible_labels --> possible_item_labels. + * SelectControl.set_by_label, .clear_by_label, and + .toggle_by_label have now gone, to be replaced by by_label + arguments to .get, .set and .toggle. + * Added files now have their MIME content type guessed unless the + content type is explicitly specified in content_type argument to + add_file. At the moment, it's always guessed to be + application/octet-stream. + +2003-07-12 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * get_item_attrs now raises IndexError instead of returning None + when the item is not found. + * Realised that exceptions raised are a mess (IndexError should + never have been raised at all, for a start). Rethought it all + and thoroughly overhauled it. + +2003-07-08 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Added toggle_single, set_single, clear_single methods to + HTMLForm and ListControl. This is useful when you have a + single-item list control (usually a single checkbox that you + want to check), and you want to select that item without having + to know what the item's name is (it's usually something + meaningless like "1" or "on"). + * FileControl no longer derives from TextControl. + * Moved most documentation from Control objects into HTMLForm. 
+ The class docstring for HTMLForm now contains most of what you + need to know. + +2003-07-07 John J Lee <jjl@pobox.com> + * 0.1.5a: Empty SelectControl can now be constructed. + +2003-07-06 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Interface change: the HTMLForm.set, .clear and .toggle + methods now take value as *first* argument, with the other + arguments reflecting those of find_control (ie. name, type, nr). + * find_control and find_item now behave as documented with regard + to need to supply all arguments (nr now defaults to None, not + 0). + * Renamed find_item --> get_item_attrs. + * Added ListControl.get_item_disabled and .set_item_disabled + methods, and support for OPTGROUP (disabled OPTGROUPs make their + OPTIONs disabled). No longer need to mess with attrs dictionary + to set disabled state of items. + * Renamed items --> pairs. + * Renamed click_items --> click_pairs. + * HTML attribute dictionaries now contain *all* original HTML + attributes, including those that are exposed elsewhere in the + ClientForm API (such as name, type, multiple, selected). + * HTMLForm.find_control now raises IndexError instead of returning + None when no control is found. set_disabled, set_readonly, + click, click_request_data, click_pairs, set, clear, toggle, + possible_values all now raise IndexError instead of ValueError + when no control is found. + * HTMLForm.set_disabled, .set_readonly now take boolean arg as + first argument, and take type and nr args. + * HTMLForm.set_readonly now raises AttributeError, not ValueError, + when invoked for control with no readonly attribute. + * Fixed minor, latent 1.5.2-compatibility bug in MapBase. + * HTMLForm.set, .get, .toggle now raise AttributeError, not + TypeError, on being invoked for non-list controls. + * Removed nr argument from all methods related to find_item_attrs. + Not needed AFAICS! + * Lots of new delegating methods on HTMLForm. + * ListControl.multiple is now enforced to be readonly. 
+ * Controls now take extra name argument (to enable creating empty + ListControls). + * Some code cleanup. + +2003-07-04 John J Lee <jjl@pobox.com> + * 0.1.5a: Added HTMLForm.set_disabled and .set_readonly methods. + +2003-06-29 John J Lee <jjl@pobox.com> + * Noticed that I was wrong about browser behaviour with default + selection for RADIO with no explictly selected items in HTML. + In fact, browsers don't select any items in that case, in + contradiction to HTML 4.01 (and RFC 1866, FWIW). Default is now + for RadioControl to follow this behaviour, and the various + select_default arguments now make RADIO follow the HTML 4.01 + standard. + * RadioControls now no longer have to have exactly one item + selected. + * 0.1.4a: set / clear / toggle methods on HTMLForm and ListControl + now work with single-selection controls. + * Released 0.0.11 and 0.1.4a. + +2003-06-28 John J Lee <jjl@pobox.com> + * 0.1.4a: + * Removed all asserts from tests. Now uses TestCase.assert_ + method. + * All raise statements now raise Exception objects, not classes, + and use the raise FooError('msg') syntax. + * Simplified implementation of HTMLForm set / clear / toggle + methods. + * Corrected exception message for ListControl set / clear / toggle + methods: was giving item name instead of control name for + single-selection lists. + * Moved ListControl._single_set_value method from ListControl into + RadioControl. + * disabled attribute is now handled differently: if any item has + the disabled HTML-attribute, the control's value can't be set, + but ListControl.set, .clear and .toggle (or the methods on + HTMLForm with the same names) can still be used. Using those + methods, individual items can't be set if they're disabled. + ATM, to un-disable an item, you have to del the dictionary key: + + del form.find_control("cheeses").find_item("cheddar")["disabled"] + + which will have to change, I think. 
+ +2003-06-25 John J Lee <jjl@pobox.com> + * Changed license to BSD, to make it easier to use other code. + The only difference is the addition of a non-endorsement clause. + * Default value for single-selection SELECT controls was wrong, + and at odds with my own comments! Before, nothing was selected + if select_default was False (the default). Now, the first item + is selected. Thanks to Chris Curvey. + * CHECKBOX and multiple SELECT controls now allow + control.value = None. + * Better isstringlike function, after Alex Martelli. + * RadioControl now has default value "on" -- same as for + CheckboxControl. Both IE5 and Mozilla Firebird 0.6 do this. + * Fixed toggle_by_label & co. exceptions: before, raised + KeyError, now raises ValueError. + * Released 0.0.10 and 0.1.3a. + +2003-06-13 John J Lee <jjl@pobox.com> + * Parse errors may now be ignored, thanks to ignore_errors + argument to ParseFile and ParseResponse. + * 0.1.3a: Added HTMLForm.set and HTMLForm.clear methods (and + corresponding control methods). + +2003-06-12 John J Lee <jjl@pobox.com> + * HTMLForm.__getitem__ and .__setitem__ now raise IndexError when + they should. + * 0.1.3a: Change all HTTP headers to use initial caps in first + word only (Content-type, not Content-Type), for 2.3 + compatibility when checking private Request.headers dict in + tests. + +2003-06-09 John J Lee <jjl@pobox.com> + * Released 0.0.9 and 0.1.2a. + +2003-06-07 John J Lee <jjl@pobox.com> + * Improved output of __str__ methods. Every control type now has + its own class. + * Added nr argument to click* methods. + * 0.1.2a: Fixed bug in _request_data: POST with + "application/x-www-form-urlencoded" failed due to incorrect + return value. + +2003-06-03 John J Lee <jjl@pobox.com> + * Released 0.0.8 and 0.1.1a. + +2003-05-28 John J Lee <jjl@pobox.com> + * Fixed HTMLForm.__str__, which was calling repr on its Controls + rather than str, which was rather unhelpful. 
+ * Added a bit in README.html explaining single-checkbox-with- + missing-value-attribute case. + +2003-04-30 John J Lee <jjl@pobox.com> + * Released 0.1.0a. + +2003-04-05 John J Lee <jjl@pobox.com> + * In 0.1.0a: Added file upload capability for INPUT TYPE=FILE + controls (for single files only). + * In 0.1.0a: Removed items argument to HTMLForm.click method, and + added click_items and click_request_data methods. Removed items + and make_request methods from HTMLForm. Made SubmitControl.click + method private -- is now named _click, and is only called by + HTMLForm. + * In 0.1.0a: IsindexControl is now clickable, and isindex_url has + been removed, since it was essentially pointless. + * In 0.1.0a: Changed SelectControl so it has an attrs dict of HTML + attributes. SELECT and OPTION HTML attributes are now separate. + +2003-03-23 John J Lee <jjl@pobox.com> + * Released 0.0.7. + +2003-03-08 John J Lee <jjl@pobox.com> + * In 0.1.0a: FormParser no longer deletes type HTML attribute from + the dictionary of HTML attributes it provides -- is now the + control's responsibility. + +2003-03-05 John J Lee <jjl@pobox.com> + * Allow INPUT TYPE=FILE in form (file upload still not + implemented -- this is just to allow parsing forms containing + file upload controls). + +2003-02-14 John J Lee <jjl@pobox.com> + * Fixed empty TEXTAREA case. Thanks to Khalid Zuberi for the bug + report and fix. + * Released 0.0.6. + +2003-02-05 John J Lee <jjl@pobox.com> + * Released 0.0.5 (first stable release). + +2003-01-05 John J Lee <jjl@pobox.com> + * Parser now no longer reads entire file before starting to work + on data. + +2002-12-13 John J Lee <jjl@pobox.com> + * Implemented ISINDEX submission, and updated documentation (see + IsindexControl.__doc__). + * Changed type attributes of BUTTON TYPE=SUBMIT and + BUTTON TYPE=RESET to "submitbutton" and "resetbutton" + respectively. 
Previously, they were "submit" and "reset" + respectively, which made it impossible to tell whether they came + from a BUTTON or an INPUT control. + * Improved README.html. + +2002-11-19 John J Lee <jjl@pobox.com> + * Released 0.0.4b. + +2002-11-17 John J Lee <jjl@pobox.com> + * Changed license to MIT (from Perl Artistic). Thanks, Gisle. + * Removed README, created README.html and INSTALL. README mostly + just restated what was in the web page, so README.html is now + just a copy of the web page. + +2002-11-16 John J Lee <jjl@pobox.com> + * Tested label methods of SelectControl. + * Removed undocumented munging of SELECT's value HTML attribute to + the key "select_value" in the HTML attributes dict returned by + SelectControl.items(). The purpose of this, in the original + Perl, was presumably to avoid clobbering SELECT's value HTML + attribute (since OPTION and SELECT HTML attributes are merged to + generate this dictionary). The only trouble is, SELECT *has* no + value HTML attribute! Either some buggy HTML contains SELECT + controls with value attributes, or Gisle was not paying + attention when he wrote this, or both! + +2002-11-14 John J Lee <jjl@pobox.com> + * Fixed select_default for single-selection SELECT controls. + +2002-11-13 John J Lee <jjl@pobox.com> + * Replaced __repr__ methods with __str__ methods. Very unlikely + to break anyone's code. repr(obj) now gives something more + useful, str(obj) still gives the same result. + * Fixed ParseResponse, which was ignoring the select_default + argument. + * Cleaned up constructors of ScalarControl and ListControl. + Control is now more clearly an abstract base class (not meant to + be instantiated). + * ListControl is now an abstract base class, with subclasses + RadioControl, CheckboxControl and SelectControl. + * Rather than using the values of the OPTION elements to set + SelectControl values, SelectControl items can also be specified + by the labels of the OPTION elements. 
For example, if you have + a SELECT control like so: + + <SELECT> + <OPTION value="br">Brie</OPTION> + <OPTION value="ched">Cheddar</OPTION> + <OPTION value="grgnz" label="Gorgonzola">Special offer on + Gorgonzola!</OPTION> + </SELECT> + + instead of setting its value like this: + + control.value = ["br", "ched", "grgnz"] + + you can now optionally use the more readable (and, possibly, + more maintainable): + + control.set_value_by_label(["Brie", "Cheddar", "Gorgonzola"]) + + Note that the label HTML attribute defaults to the content of + the OPTION element (as does the value HTML attribute). + * Improved documentation and comments. + +2002-11-04 John J Lee <jjl@pobox.com> + * Fixed TextControl default value to be empty string rather than + None. This has the effect that text controls are successful + even when empty. + * Stopped Content-Type from being emitted twice. + +2002-10-25 John J Lee <jjl@pobox.com> + * Released 0.0.3b + +2002-10-24 John J Lee <jjl@pobox.com> + * Changed handling of SELECT/multiple ListControls: select_default + argument to various functions and methods now indicates whether + or not should follow RFC 1866 or Netscape / IE behaviour in + setting default selection if no 'selected' HTML attribute was + given. + * Changed type of SELECT/OPTION controls to "select" from + "option". This is more appropriate, since SELECT is the element + that represents the control, whereas the OPTION element + represents the list items inside the control. + * Removed readonly attribute from ListControl -- reading W3C + HTML 4 specification carefully and testing with Netscape / IE + reveals that this isn't intended to work with INPUT elements + other than those of type TEXT and PASSWORD. + * Fixed Control.__setattr__ to make value of disabled controls + read-only. + * Improved tests and documentation. + +2002-10-20 John J Lee <jjl@pobox.com> + * Some testing on a site having a fairly complicated sequence of + forms. No problems came to light. 
+ * Made name and type attributes of Control readonly. + * Improved documentation. + +2002-10-15 John J Lee <jjl@pobox.com> + * Fixed make_request to pass urlencode(data) instead of data for + POST. + * Thanks to Conrad Schneiker for help with HTTPS on Windows and a + bug report. + +2002-10-11 John J Lee <jjl@pobox.com> + * Fixed silly Python 2.3 forwards-compatibility bug (True / False + constants were defined, overwriting the new builtin versions in + 2.3). + * Fixed treatment of form method -- was incorrectly treated as + case-sentitive. + * Fixed enctype default in FormParser. + +2002-10-07 John J Lee <jjl@pobox.com> + * Added TEXTAREA. + * Added HTMLForm.attrs attribute, which is a dictionary mapping + HTML attributes to their values. + * Added more tests. + * Back-ported to Python 1.5.2. + +2002-10-06 John J Lee <jjl@pobox.com> + * Renamed 'input' to 'control' everywhere (HTML 4.0 terminology, + and more accurate, because one Control may represent more than + one INPUT or OPTION, in the case of ListControl). + * Changed interface of HTMLForm.find_control and + HTMLForm.possible_values, so that nr argument begins indexing at + 0 rather than 1. + * Added name attribute to HTMLForm. + * Fixed case where HTMLForm.find_control is passed only nr + argument. + * Fixed find_control to return None rather than raise an + exception. + * Renamed HTMLForm.push_control to new_control. + * Replaced HTMLForm.controls method with attribute. + * Fixed ListControl.set_value method in single-selection case. + * Replaced all type, name, value and set_value methods with + attributes and __getattr__ / __setattr__. + * Added multiple attribute, indicating whether or not ListControl + can have more than one value selected at a time. + * Added ScalarControl base class, which has attrs attribute which + is a dictionary mapping HTML attributes to their values. + * Added find_item method to ListControl, which allows access to + HTML attributes of items in the sequence. 
+ * Removed controls argument of HTMLForm.__init__. + * Altered handling of disabled and readonly -- now are attributes + on Control instances, and may be set or cleared to change + Control's behaviour. + * Added toggle methods to ListControl and Form. + * Fixed ParseFile (hence ParseResponse) to set default form action + correctly when there is none given in HTML. + * Fixed many tests. + * Improved documentation. + +2002-09-29 John J Lee <jjl@pobox.com> + * Edited down large test file to save space. + +2002-09-22 John J Lee <jjl@pobox.com> + * Added HTMLForm.possible_values method. + * First use on internet -- seems to work. + * Announced on comp.lang.python.announce. + * Released 0.0.2a + +2002-09-20 John J Lee <jjl@pobox.com> + * Uploaded 0.0.1a + +2002-09-14 John J Lee <jjl@pobox.com> + * Ported form tests from my old classes. + * Added input.merge_input() so that ListInputs can be created easily + without an HTMLForm. + +2002-08-23 John J Lee <jjl@pobox.com> + * General clean-up. + * Added tests for input classes and debugged: tests now pass. + * Things should more-or-less work now. + +2002-08-19 John J Lee <jjl@pobox.com> + * Finished port. + * Tests from LWP pass. diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/COPYING b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..f54e8c49c6926f132a395667cfdf0230fc8372db --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/COPYING @@ -0,0 +1,31 @@ +Copyright (c) 2002-2004 John J. Lee <jjl@pobox.com> +Copyright (c) 1998-2000 Gisle Aas + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+ +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +Neither the name of the contributors nor the names of their employers +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ChangeLog b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ChangeLog new file mode 100644 index 0000000000000000000000000000000000000000..3573a2ebd78602b9c449dfe81c3294337bfffac4 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ChangeLog @@ -0,0 +1,578 @@ +This isn't really in proper GNU ChangeLog format, it just happens to +look that way. + +2005-01-30 John J Lee <jjl@pobox.com> + * Fix failure to raise ParseError (!) + * Workaround for failure of sgmllib to unescape attributes (bug + report from Titus Brown). + * Released 0.1.17. + +2005-01-17 John J Lee <jjl@pobox.com> + * Fixed case where FORM action contains a '?' or '#' (again). + * Allow user to supply own Request class (Tobias). 
+ * Fix ISINDEX action URL (bug only showed up in Python 2.4). + * Fix image control in case where value is present. + * Hack choose_boundary not to fail on socket.gaierror. + +2004-05-15 John J Lee <jjl@pobox.com> + * Released 0.1.16 and 0.0.16. + + * 0.1.x and 0.0.x: + * Fixed case where FORM action contains a '?' (bug report from + Moof). + + * 0.1.x only: + * Look for BASE element attribute 'href', not 'uri'! (patch from + Jochen Knuth) + * Applied workaround for file upload for AOLServer (patch from + Andrei Mitran). + * Added optional form_parser_class arguments to allow choice + between htmllib and HTMLParser modules. + * Added a SelectControl._delete_items() method, useful for quick- + hack JS simulation. Not yet a stable interface, hence the + initial underscore. + + * 0.0.x only: + * SubmitControls with no initial value in HTML are now successful + (default value is '', not None) + +2004-01-22 John J Lee <jjl@pobox.com> + * 0.1.x only: + * Cleaned up docs a bit, and removed references to toggle methods. + +2004-01-05 John J Lee <jjl@pobox.com> + * 0.1.x only: + * Take note of base element. Thanks to Phillip J. Eby for bug + report. + * All form attributes are now available in HTMLForm.attrs + (previously, name, action, method and enctype were not present). + * Released 0.1.15. + +2004-01-01 John J Lee <jjl@pobox.com> + * 0.1.x only: Disovered ignore_errors was ignored by + ParseResponse! It seems nobody uses it from ParseResponse, and + it's probably worthless anyway. Also, I just now realise that + FormParser.error() is actually overriding a base class method + without my noticing it! The arguments are still there, but + they're now ignored. Thanks to Per Cederqvist. + +2003-12-24 John J Lee <jjl@pobox.com> + * Modified setup.py so can easily register with PyPI. + +2003-12-06 John J Lee <jjl@pobox.com> + * Fixed bug where ClientForm.urlencode choked on Unicode. + * Released 0.1.14 and 0.0.15. 
+ +2003-11-14 John J Lee <jjl@pobox.com> + * A few doc fixes in HTMLForm.__doc__. + * Minor code clean-up. + +2003-11-12 John J Lee <jjl@pobox.com> + * Fixed bug where empty OPTION caused KeyError. Thanks to Doug + Henderson. + * Released 0.1.13 and 0.0.14. + +2003-11-11 John J Lee <jjl@pobox.com> + * Fixed bugs where TEXTAREA or OPTION containing entity reference + would result in truncated element contents. Thanks to Michael + Howitz again! + * Applied fixes to 0.0.x for ImageControl integer coordinates, + TEXTAREA content .strip()ping and entity references in TEXTAREA + and OPTION. + * Released 0.1.12 and 0.0.13. + +2003-11-07 John J Lee <jjl@pobox.com> + * TEXTAREA contents are no longer .strip()ped on form parsing. + * Released 0.1.11. + +2003-11-03 John J Lee <jjl@pobox.com> + * Fixed ImageControl.pairs(): return value contained integer + coordinates instead of strings. Thanks to Michael Howitz. + +2003-10-31 John J Lee <jjl@pobox.com> + * XHTML support for Pythons >= 2.2. Thanks to Michael Howitz. + * Released 0.1.10. + +2003-10-02 John J Lee <jjl@pobox.com> + * Bugfix: selection of default control to click on is supposed to + only happen if no control is explictly requested, but id wasn't + included in that. Now, it is. + +2003-09-28 John J Lee <jjl@pobox.com> + * Fixed HTMLForm.attrs. Thanks to Scott Chapman. + * Released ClientForm 0.0.12 and 0.1.9 (first stable release of + 0.1.x). + +2003-09-21 John J Lee <jjl@pobox.com> + * Interface change (sorry): id is now supported. This means + Controls have an id attribute, and appropriate HTMLForm methods + have an id argument. This will only affect people using + positional arguments after the 'kind' argument. + * Interface change: BUTTON/BUTTON now has type "buttonbutton" (was + "button") to prevent clash with type of INPUT/BUTTON (was and is + "button"). Both types of control are ignored anyway (ie. + represented by IgnoreControl), so it's unlikely any code is + affected. 
+ * SubmitControl value now defaults to "", so it is successful even + when no value is given in the HTML. + * Extraneous "\r\n\r\n" at start of multipart/form-data POST data + removed. + * Multiple file upload now emits multipart/mixed, rather than + multipart/multipart/mixed, as content-type! + * Content-disposition header now comes before content-type, in + case that matters... + * Slight tweak to SelectControl.fixup, to fix case where multiple + SELECT is empty. + * Released 0.1.8b. + +2003-07-12 John J Lee <jjl@pobox.com> + * Added indication to ListControl.__str__ of disabled items -- + they have parentheses around them: item 1, (item 2), item 3 + means "item 2" is disabled. + * Released 0.1.7b. + +2003-07-10 John J Lee <jjl@pobox.com> + * Removed assertion that self.value is None in + IgnoreControl.__init__. Now sets value to None instead. Thanks + to Martijn Faasen for bug report. Same for FileControl. + +2003-07-23 John J Lee <jjl@pobox.com> + * 0.1.6a changes: + * After some thought about Law of Demeter, realised that there was + no justification for deprecating most use of find_control, nor + for all of the new methods on HTMLForm. Use of find_control + is now officially OK again. set_/get_readonly, + set_/get_disabled, set_/get_item_disabled and + set_all_items disabled, have been removed from HTMLForm. + * Added HTMLForm.set_all_readonly method. This one is actually + useful! + * All methods on Controls that used to be separate _by_label + methods are now by_label arguments, now I see that labels can be + defined for all controls' items. The exceptions are + set_value_by_label and get_value_by_label, since there is no + method to add an argument to in those cases. The lack of + implementation of by_label for CHECKBOX and RADIO is considered + a bug, so NotImplementedError is raised. LabelNotSupportedError + has gone. + * Released 0.1.6a. + +2003-07-22 John J Lee <jjl@pobox.com> + * 0.1.6a changes: + * Added some tests for new HTMLForm methods. 
+ * ListControl.readonly now exists and works. + * Corrected error message for predicate arg of + HTMLForm.find_control. + * Fixed HTMLForm.get_readonly. + * Fixed ListControl.get_item_disabled. + * Fixed exception raised by ListControl: was TypeError, now + LabelNotSupportedError. + * Fixed ListControl.get_value, .set_value and .possible_items + exception messages. + * Enforced restriction on new HTMLForm methods that at least one + find_control argument must be supplied. + +2003-07-14 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Removed listcontrol arg from HTMLForm methods, added kind and + predicate arguments to .find_control. kind argument is + available in most HTMLForm methods. + * set methods take selected argument set(selected, "itemname"), + and the clear methods are gone. + +2003-07-13 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * FileControl now unsuccessful when disabled attribute is true. + * Moved most method definitions in Control into ScalarControl. + * ListControls now always take sequence values, never string-like + values. + * listcontrol argument on appropriate HTMLForm methods, in + addition to name, type and nr. This allows you to ask for a + ListControl without specifying the exact type. + * All controls now have the readonly attribute. + * Renamed get_value_as_label --> get_value_by_label. + * Renamed possible_values --> possible_items. + * Renamed possible_labels --> possible_item_labels. + * SelectControl.set_by_label, .clear_by_label, and + .toggle_by_label have now gone, to be replaced by by_label + arguments to .get, .set and .toggle. + * Added files now have their MIME content type guessed unless the + content type is explicitly specified in content_type argument to + add_file. At the moment, it's always guessed to be + application/octet-stream. + +2003-07-12 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * get_item_attrs now raises IndexError instead of returning None + when the item is not found. 
+ * Realised that exceptions raised are a mess (IndexError should + never have been raised at all, for a start). Rethought it all + and thoroughly overhauled it. + +2003-07-08 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Added toggle_single, set_single, clear_single methods to + HTMLForm and ListControl. This is useful when you have a + single-item list control (usually a single checkbox that you + want to check), and you want to select that item without having + to know what the item's name is (it's usually something + meaningless like "1" or "on"). + * FileControl no longer derives from TextControl. + * Moved most documentation from Control objects into HTMLForm. + The class docstring for HTMLForm now contains most of what you + need to know. + +2003-07-07 John J Lee <jjl@pobox.com> + * 0.1.5a: Empty SelectControl can now be constructed. + +2003-07-06 John J Lee <jjl@pobox.com> + * 0.1.5a changes: + * Interface change: the HTMLForm.set, .clear and .toggle + methods now take value as *first* argument, with the other + arguments reflecting those of find_control (ie. name, type, nr). + * find_control and find_item now behave as documented with regard + to need to supply all arguments (nr now defaults to None, not + 0). + * Renamed find_item --> get_item_attrs. + * Added ListControl.get_item_disabled and .set_item_disabled + methods, and support for OPTGROUP (disabled OPTGROUPs make their + OPTIONs disabled). No longer need to mess with attrs dictionary + to set disabled state of items. + * Renamed items --> pairs. + * Renamed click_items --> click_pairs. + * HTML attribute dictionaries now contain *all* original HTML + attributes, including those that are exposed elsewhere in the + ClientForm API (such as name, type, multiple, selected). + * HTMLForm.find_control now raises IndexError instead of returning + None when no control is found. 
set_disabled, set_readonly, + click, click_request_data, click_pairs, set, clear, toggle, + possible_values all now raise IndexError instead of ValueError + when no control is found. + * HTMLForm.set_disabled, .set_readonly now take boolean arg as + first argument, and take type and nr args. + * HTMLForm.set_readonly now raises AttributeError, not ValueError, + when invoked for control with no readonly attribute. + * Fixed minor, latent 1.5.2-compatibility bug in MapBase. + * HTMLForm.set, .get, .toggle now raise AttributeError, not + TypeError, on being invoked for non-list controls. + * Removed nr argument from all methods related to find_item_attrs. + Not needed AFAICS! + * Lots of new delegating methods on HTMLForm. + * ListControl.multiple is now enforced to be readonly. + * Controls now take extra name argument (to enable creating empty + ListControls). + * Some code cleanup. + +2003-07-04 John J Lee <jjl@pobox.com> + * 0.1.5a: Added HTMLForm.set_disabled and .set_readonly methods. + +2003-06-29 John J Lee <jjl@pobox.com> + * Noticed that I was wrong about browser behaviour with default + selection for RADIO with no explictly selected items in HTML. + In fact, browsers don't select any items in that case, in + contradiction to HTML 4.01 (and RFC 1866, FWIW). Default is now + for RadioControl to follow this behaviour, and the various + select_default arguments now make RADIO follow the HTML 4.01 + standard. + * RadioControls now no longer have to have exactly one item + selected. + * 0.1.4a: set / clear / toggle methods on HTMLForm and ListControl + now work with single-selection controls. + * Released 0.0.11 and 0.1.4a. + +2003-06-28 John J Lee <jjl@pobox.com> + * 0.1.4a: + * Removed all asserts from tests. Now uses TestCase.assert_ + method. + * All raise statements now raise Exception objects, not classes, + and use the raise FooError('msg') syntax. + * Simplified implementation of HTMLForm set / clear / toggle + methods. 
+ * Corrected exception message for ListControl set / clear / toggle + methods: was giving item name instead of control name for + single-selection lists. + * Moved ListControl._single_set_value method from ListControl into + RadioControl. + * disabled attribute is now handled differently: if any item has + the disabled HTML-attribute, the control's value can't be set, + but ListControl.set, .clear and .toggle (or the methods on + HTMLForm with the same names) can still be used. Using those + methods, individual items can't be set if they're disabled. + ATM, to un-disable an item, you have to del the dictionary key: + + del form.find_control("cheeses").find_item("cheddar")["disabled"] + + which will have to change, I think. + +2003-06-25 John J Lee <jjl@pobox.com> + * Changed license to BSD, to make it easier to use other code. + The only difference is the addition of a non-endorsement clause. + * Default value for single-selection SELECT controls was wrong, + and at odds with my own comments! Before, nothing was selected + if select_default was False (the default). Now, the first item + is selected. Thanks to Chris Curvey. + * CHECKBOX and multiple SELECT controls now allow + control.value = None. + * Better isstringlike function, after Alex Martelli. + * RadioControl now has default value "on" -- same as for + CheckboxControl. Both IE5 and Mozilla Firebird 0.6 do this. + * Fixed toggle_by_label & co. exceptions: before, raised + KeyError, now raises ValueError. + * Released 0.0.10 and 0.1.3a. + +2003-06-13 John J Lee <jjl@pobox.com> + * Parse errors may now be ignored, thanks to ignore_errors + argument to ParseFile and ParseResponse. + * 0.1.3a: Added HTMLForm.set and HTMLForm.clear methods (and + corresponding control methods). + +2003-06-12 John J Lee <jjl@pobox.com> + * HTMLForm.__getitem__ and .__setitem__ now raise IndexError when + they should. 
+ * 0.1.3a: Change all HTTP headers to use initial caps in first + word only (Content-type, not Content-Type), for 2.3 + compatibility when checking private Request.headers dict in + tests. + +2003-06-09 John J Lee <jjl@pobox.com> + * Released 0.0.9 and 0.1.2a. + +2003-06-07 John J Lee <jjl@pobox.com> + * Improved output of __str__ methods. Every control type now has + its own class. + * Added nr argument to click* methods. + * 0.1.2a: Fixed bug in _request_data: POST with + "application/x-www-form-urlencoded" failed due to incorrect + return value. + +2003-06-03 John J Lee <jjl@pobox.com> + * Released 0.0.8 and 0.1.1a. + +2003-05-28 John J Lee <jjl@pobox.com> + * Fixed HTMLForm.__str__, which was calling repr on its Controls + rather than str, which was rather unhelpful. + * Added a bit in README.html explaining single-checkbox-with- + missing-value-attribute case. + +2003-04-30 John J Lee <jjl@pobox.com> + * Released 0.1.0a. + +2003-04-05 John J Lee <jjl@pobox.com> + * In 0.1.0a: Added file upload capability for INPUT TYPE=FILE + controls (for single files only). + * In 0.1.0a: Removed items argument to HTMLForm.click method, and + added click_items and click_request_data methods. Removed items + and make_request methods from HTMLForm. Made SubmitControl.click + method private -- is now named _click, and is only called by + HTMLForm. + * In 0.1.0a: IsindexControl is now clickable, and isindex_url has + been removed, since it was essentially pointless. + * In 0.1.0a: Changed SelectControl so it has an attrs dict of HTML + attributes. SELECT and OPTION HTML attributes are now separate. + +2003-03-23 John J Lee <jjl@pobox.com> + * Released 0.0.7. + +2003-03-08 John J Lee <jjl@pobox.com> + * In 0.1.0a: FormParser no longer deletes type HTML attribute from + the dictionary of HTML attributes it provides -- is now the + control's responsibility. 
+ +2003-03-05 John J Lee <jjl@pobox.com> + * Allow INPUT TYPE=FILE in form (file upload still not + implemented -- this is just to allow parsing forms containing + file upload controls). + +2003-02-14 John J Lee <jjl@pobox.com> + * Fixed empty TEXTAREA case. Thanks to Khalid Zuberi for the bug + report and fix. + * Released 0.0.6. + +2003-02-05 John J Lee <jjl@pobox.com> + * Released 0.0.5 (first stable release). + +2003-01-05 John J Lee <jjl@pobox.com> + * Parser now no longer reads entire file before starting to work + on data. + +2002-12-13 John J Lee <jjl@pobox.com> + * Implemented ISINDEX submission, and updated documentation (see + IsindexControl.__doc__). + * Changed type attributes of BUTTON TYPE=SUBMIT and + BUTTON TYPE=RESET to "submitbutton" and "resetbutton" + respectively. Previously, they were "submit" and "reset" + respectively, which made it impossible to tell whether they came + from a BUTTON or an INPUT control. + * Improved README.html. + +2002-11-19 John J Lee <jjl@pobox.com> + * Released 0.0.4b. + +2002-11-17 John J Lee <jjl@pobox.com> + * Changed license to MIT (from Perl Artistic). Thanks, Gisle. + * Removed README, created README.html and INSTALL. README mostly + just restated what was in the web page, so README.html is now + just a copy of the web page. + +2002-11-16 John J Lee <jjl@pobox.com> + * Tested label methods of SelectControl. + * Removed undocumented munging of SELECT's value HTML attribute to + the key "select_value" in the HTML attributes dict returned by + SelectControl.items(). The purpose of this, in the original + Perl, was presumably to avoid clobbering SELECT's value HTML + attribute (since OPTION and SELECT HTML attributes are merged to + generate this dictionary). The only trouble is, SELECT *has* no + value HTML attribute! Either some buggy HTML contains SELECT + controls with value attributes, or Gisle was not paying + attention when he wrote this, or both! 
+ +2002-11-14 John J Lee <jjl@pobox.com> + * Fixed select_default for single-selection SELECT controls. + +2002-11-13 John J Lee <jjl@pobox.com> + * Replaced __repr__ methods with __str__ methods. Very unlikely + to break anyone's code. repr(obj) now gives something more + useful, str(obj) still gives the same result. + * Fixed ParseResponse, which was ignoring the select_default + argument. + * Cleaned up constructors of ScalarControl and ListControl. + Control is now more clearly an abstract base class (not meant to + be instantiated). + * ListControl is now an abstract base class, with subclasses + RadioControl, CheckboxControl and SelectControl. + * Rather than using the values of the OPTION elements to set + SelectControl values, SelectControl items can also be specified + by the labels of the OPTION elements. For example, if you have + a SELECT control like so: + + <SELECT> + <OPTION value="br">Brie</OPTION> + <OPTION value="ched">Cheddar</OPTION> + <OPTION value="grgnz" label="Gorgonzola">Special offer on + Gorgonzola!</OPTION> + </SELECT> + + instead of setting its value like this: + + control.value = ["br", "ched", "grgnz"] + + you can now optionally use the more readable (and, possibly, + more maintainable): + + control.set_value_by_label(["Brie", "Cheddar", "Gorgonzola"]) + + Note that the label HTML attribute defaults to the content of + the OPTION element (as does the value HTML attribute). + * Improved documentation and comments. + +2002-11-04 John J Lee <jjl@pobox.com> + * Fixed TextControl default value to be empty string rather than + None. This has the effect that text controls are successful + even when empty. + * Stopped Content-Type from being emitted twice. 
+ +2002-10-25 John J Lee <jjl@pobox.com> + * Released 0.0.3b + +2002-10-24 John J Lee <jjl@pobox.com> + * Changed handling of SELECT/multiple ListControls: select_default + argument to various functions and methods now indicates whether + or not should follow RFC 1866 or Netscape / IE behaviour in + setting default selection if no 'selected' HTML attribute was + given. + * Changed type of SELECT/OPTION controls to "select" from + "option". This is more appropriate, since SELECT is the element + that represents the control, whereas the OPTION element + represents the list items inside the control. + * Removed readonly attribute from ListControl -- reading W3C + HTML 4 specification carefully and testing with Netscape / IE + reveals that this isn't intended to work with INPUT elements + other than those of type TEXT and PASSWORD. + * Fixed Control.__setattr__ to make value of disabled controls + read-only. + * Improved tests and documentation. + +2002-10-20 John J Lee <jjl@pobox.com> + * Some testing on a site having a fairly complicated sequence of + forms. No problems came to light. + * Made name and type attributes of Control readonly. + * Improved documentation. + +2002-10-15 John J Lee <jjl@pobox.com> + * Fixed make_request to pass urlencode(data) instead of data for + POST. + * Thanks to Conrad Schneiker for help with HTTPS on Windows and a + bug report. + +2002-10-11 John J Lee <jjl@pobox.com> + * Fixed silly Python 2.3 forwards-compatibility bug (True / False + constants were defined, overwriting the new builtin versions in + 2.3). + * Fixed treatment of form method -- was incorrectly treated as + case-sentitive. + * Fixed enctype default in FormParser. + +2002-10-07 John J Lee <jjl@pobox.com> + * Added TEXTAREA. + * Added HTMLForm.attrs attribute, which is a dictionary mapping + HTML attributes to their values. + * Added more tests. + * Back-ported to Python 1.5.2. 
+ +2002-10-06 John J Lee <jjl@pobox.com> + * Renamed 'input' to 'control' everywhere (HTML 4.0 terminology, + and more accurate, because one Control may represent more than + one INPUT or OPTION, in the case of ListControl). + * Changed interface of HTMLForm.find_control and + HTMLForm.possible_values, so that nr argument begins indexing at + 0 rather than 1. + * Added name attribute to HTMLForm. + * Fixed case where HTMLForm.find_control is passed only nr + argument. + * Fixed find_control to return None rather than raise an + exception. + * Renamed HTMLForm.push_control to new_control. + * Replaced HTMLForm.controls method with attribute. + * Fixed ListControl.set_value method in single-selection case. + * Replaced all type, name, value and set_value methods with + attributes and __getattr__ / __setattr__. + * Added multiple attribute, indicating whether or not ListControl + can have more than one value selected at a time. + * Added ScalarControl base class, which has attrs attribute which + is a dictionary mapping HTML attributes to their values. + * Added find_item method to ListControl, which allows access to + HTML attributes of items in the sequence. + * Removed controls argument of HTMLForm.__init__. + * Altered handling of disabled and readonly -- now are attributes + on Control instances, and may be set or cleared to change + Control's behaviour. + * Added toggle methods to ListControl and Form. + * Fixed ParseFile (hence ParseResponse) to set default form action + correctly when there is none given in HTML. + * Fixed many tests. + * Improved documentation. + +2002-09-29 John J Lee <jjl@pobox.com> + * Edited down large test file to save space. + +2002-09-22 John J Lee <jjl@pobox.com> + * Added HTMLForm.possible_values method. + * First use on internet -- seems to work. + * Announced on comp.lang.python.announce. 
+ * Released 0.0.2a + +2002-09-20 John J Lee <jjl@pobox.com> + * Uploaded 0.0.1a + +2002-09-14 John J Lee <jjl@pobox.com> + * Ported form tests from my old classes. + * Added input.merge_input() so that ListInputs can be created easily + without an HTMLForm. + +2002-08-23 John J Lee <jjl@pobox.com> + * General clean-up. + * Added tests for input classes and debugged: tests now pass. + * Things should more-or-less work now. + +2002-08-19 John J Lee <jjl@pobox.com> + * Finished port. + * Tests from LWP pass. diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ClientForm.py b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ClientForm.py new file mode 100644 index 0000000000000000000000000000000000000000..dec49815aa549d1a6b9973cf7f26012aeb04afc6 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/ClientForm.py @@ -0,0 +1,2854 @@ +"""HTML form handling for web clients. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It has developed from a port of Gisle +Aas' Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2005 John J. Lee <jjl@pobox.com> +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included with +the distribution). + +""" + +# XXX +# Fix or work around attibute quoting bug. +# Add .clear() method. +# Add some functional tests +# Especially single and multiple file upload on the internet. 
+# Does file upload work when name is missing? Sourceforge tracker form +# doesn't like it. Check standards, and test with Apache. Test +# binary upload with Apache. +# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py. +# Controls can have name=None (eg. forms constructed partly with +# JavaScript), but find_control can't be told to find a control +# with that name, because None there means 'unspecified'. Can still +# get at by nr, but would be nice to be able to specify something +# equivalent to name=None, too. +# Support for list item ids. How to handle missing ids? (How do I deal +# with duplicate OPTION labels ATM? Can't remember...) +# Deal with character sets properly. Not sure what the issues are here. +# Do URL encodings need any attention? +# I don't *think* any encoding of control names, filenames or data is +# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6 +# doesn't seem to do it. +# Add charset parameter to Content-type headers? How to find value?? +# Add label support for CHECKBOX and RADIO. Actually, I may not bother +# to fix this, since a discussion with Gisle on libwww-perl list seemed +# to show that it wouldn't be very useful. +# I'm not going to fix this unless somebody tells me what real servers +# that want this encoding actually expect: If enctype is +# application/x-www-form-urlencoded and there's a FILE control present. +# Strictly, it should be 'name=data' (see HTML 4.01 spec., section +# 17.13.2), but I send "name=" ATM. What about multiple file upload?? +# Get rid of MapBase, AList and MimeWriter. +# Should really use sgmllib, not htmllib. +# Remove single-selection code: can be special case of multi-selection, +# with a few variations, I think. +# Factor out multiple-selection list code? May not be easy. 
Maybe like +# this: + +# ListControl +# ^ +# | MultipleListControlMixin +# | ^ +# SelectControl / +# ^ / +# \ / +# MultiSelectControl + + +# Plan +# ---- +# Maybe a 0.2.x, cleaned up a bit and with id support for list items? +# Not sure it's worth it... +# Unify single / multiple selection code. +# action should probably be an absolute URI, like DOMForm. +# Remove toggle methods. +# Replace by_label with choice between value / id / label / +# element contents (see discussion with Gisle about labels on +# libwww-perl list). +# ...what else? +# Work on DOMForm. +# XForms? Don't know if there's a need here. + + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +import sys, urllib, urllib2, types, string, mimetools, copy, urlparse, \ + htmlentitydefs +from urlparse import urljoin +from cStringIO import StringIO +try: + from types import UnicodeType +except ImportError: + UNICODE = False +else: + UNICODE = True + +VERSION = "0.1.17" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +# This version of urlencode is from my Python 1.5.2 back-port of the +# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence +# of pairs instead of a mapping -- the 2.0 version only accepts a mapping. +def urlencode(query,doseq=False,): + """Encode a sequence of two-element tuples or dictionary into a URL query \ +string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... 
+ try: + # non-sequence items should not work with len() + x = len(query) + # non-empty strings will fail this + if len(query) and type(query[0]) != types.TupleType: + raise TypeError() + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping " + "object", tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = urllib.quote_plus(str(k)) + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = urllib.quote_plus(str(k)) + if type(v) == types.StringType: + v = urllib.quote_plus(v) + l.append(k + '=' + v) + elif UNICODE and type(v) == types.UnicodeType: + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = urllib.quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? + x = len(v) + except TypeError: + # not a sequence + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + urllib.quote_plus(str(elt))) + return string.join(l, '&') + +# Grabbed from 2.4 xml.sax.saxutils. Modification: accept None. +def __dict_replace(s, d): + """Replace substrings of a string using a dictionary.""" + for key, value in d.items(): + s = string.replace(s, key, value) + return s +def unescape(data, entities={}): + """Unescape &, <, and > in a string of data. + + You can unescape other strings of data by passing a dictionary as + the optional entities parameter. The keys and values must all be + strings; each key will be replaced with its corresponding value. 
+ """ + if data is None: + return None + data = string.replace(data, "<", "<") + data = string.replace(data, ">", ">") + if entities: + data = __dict_replace(data, entities) + # must do ampersand last + return string.replace(data, "&", "&") + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +# XXX don't really want to drag this along (MapBase, AList, MimeWriter, +# _choose_boundary) + +# This is essentially the same as UserDict.DictMixin. I wrote this before +# that, and DictMixin isn't available in 1.5.2 anyway. +class MapBase: + """Mapping designed to be easily derived from. + + Subclass it and override __init__, __setitem__, __getitem__, __delitem__ + and keys. Nothing else should need to be overridden, unlike UserDict. + This significantly simplifies dictionary-like classes. + + Also different from UserDict in that it has a redonly flag, and can be + updated (and initialised) with a sequence of pairs (key, value). 
+ + """ + def __init__(self, init=None): + self._data = {} + self.readonly = False + if init is not None: self.update(init) + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, item): + if not self.readonly: + self._data[key] = item + else: + raise TypeError("object doesn't support item assignment") + + def __delitem__(self, key): + if not self.readonly: + del self._data[key] + else: + raise TypeError("object doesn't support item deletion") + + def keys(self): + return self._data.keys() + + # now the internal workings, there should be no need to override these: + + def clear(self): + for k in self.keys(): + del self[k] + + def __repr__(self): + rep = [] + for k, v in self.items(): + rep.append("%s: %s" % (repr(k), repr(v))) + return self.__class__.__name__+"{"+(string.join(rep, ", "))+"}" + + def copy(self): + return copy.copy(self) + + def __cmp__(self, dict): + # note: return value is *not* boolean + for k, v in self.items(): + if not (dict.has_key(k) and dict[k] == v): + return 1 # different + return 0 # the same + + def __len__(self): + return len(self.keys()) + + def values(self): + r = [] + for k in self.keys(): + r.append(self[k]) + return r + + def items(self): + keys = self.keys() + vals = self.values() + r = [] + for i in len(self): + r.append((keys[i], vals[i])) + return r + + def has_key(self, key): + return key in self.keys() + + def update(self, map): + if issequence(map) and not isstringlike(map): + items = map + else: + items = map.items() + for tup in items: + if not isinstance(tup, TupleType): + raise TypeError( + "MapBase.update requires a map or a sequence of pairs") + k, v = tup + self[k] = v + + def get(self, key, failobj=None): + if key in self.keys(): + return self[key] + else: + return failobj + + def setdefault(self, key, failobj=None): + if not self.has_key(key): + self[key] = failobj + return self[key] + + +class AList(MapBase): + """Read-only ordered mapping.""" + def __init__(self, seq=[]): + 
self.readonly = True + self._inverted = False + self._data = list(seq[:]) + self._keys = [] + self._values = [] + for key, value in seq: + self._keys.append(key) + self._values.append(value) + + def set_inverted(self, inverted): + if (inverted and not self._inverted) or ( + not inverted and self._inverted): + self._keys, self._values = self._values, self._keys + if inverted: self._inverted = True + else: self._inverted = False + + def __getitem__(self, key): + try: + i = self._keys.index(key) + except ValueError: + raise KeyError(key) + return self._values[i] + + def __delitem__(self, key): + try: + i = self._keys.index[key] + except ValueError: + raise KeyError(key) + del self._values[i] + + def keys(self): return list(self._keys[:]) + def values(self): return list(self._values[:]) + def items(self): + data = self._data[:] + if not self._inverted: + return data + else: + newdata = [] + for k, v in data: + newdata.append((v, k)) + return newdata + +# -------------------------------------------------------------------- +# grabbed from Python standard library mimetools module and tweaked to +# avoid socket.gaierror +try: + import thread + _thread = thread; del thread +except ImportError: + import dummy_thread + _thread = dummy_thread; del dummy_thread +_counter_lock = _thread.allocate_lock() +del _thread + +_counter = 0 +def _get_next_counter(): + global _counter + _counter_lock.acquire() + _counter = _counter + 1 + result = _counter + _counter_lock.release() + return result + +_prefix = None + +def _choose_boundary(): + """Return a string usable as a multipart boundary. + + The string chosen is unique within a single program run, and + incorporates the user id (if available), process id (if available), + and current time. So it's very unlikely the returned string appears + in message text, but there's no guarantee. 
+ + The boundary contains dots so you have to quote it in the header.""" + + global _prefix + import time + import os + import socket + if _prefix is None: + try: + socket.gaierror + except AttributeError: + exc = socket.error + else: + exc = socket.gaierror + + try: + hostid = socket.gethostbyname(socket.gethostname()) + except exc: + hostid = 'localhost' + try: + uid = repr(os.getuid()) + except AttributeError: + uid = '1' + try: + pid = repr(os.getpid()) + except AttributeError: + pid = '1' + _prefix = hostid + '.' + uid + '.' + pid + return "%s.%.3f.%d" % (_prefix, time.time(), _get_next_counter()) + +# end of code from mimetools module +# -------------------------------------------------------------------- + +def choose_boundary(): + b = _choose_boundary() + string.replace(b, ".", "") + return b + +# This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. 
This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! + + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + lines = string.split(value, "\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = string.join(lines, "") + self._http_hdrs.append((key, value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + string.strip(lines[i]) + value = string.join(lines, "\r\n") + "\r\n" + line = key + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class ControlNotFoundError(ValueError): pass +class ItemNotFoundError(ValueError): pass +class ItemCountError(ValueError): pass + +class ParseError(Exception): pass + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances 
on completion.""" + # pinched (and modified) from Moshe Zadka + def __init__(self, entitydefs=None): + if entitydefs is not None: + self.entitydefs = entitydefs + self.base = None + self.forms = [] + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + def do_base(self, attrs): + for key, value in attrs: + if key == "href": + self.base = value + + def start_form(self, attrs): + if self._current_form is not None: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = value + elif key == "action": + action = value + elif key == "method": + method = string.upper(value) + elif key == "enctype": + enctype = string.lower(value) + d[key] = value + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + if self._current_form is None: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = None + + def start_select(self, attrs): + if self._current_form is None: + raise ParseError("start of SELECT before start of FORM") + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = val + + self._select = d + + self._append_select_control({"__select": d}) + + def end_select(self): + if self._current_form is None: + raise ParseError("end of SELECT before start of FORM") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = val + + self._optgroup = d + + def end_optgroup(self): + if 
self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = val + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = string.strip(self._option.get("contents", "")) + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + if self._current_form is None: + raise ParseError("start of TEXTAREA before start of FORM") + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = val + + self._textarea = d + + def end_textarea(self): + if self._current_form is None: + raise ParseError("end of TEXTAREA before start of FORM") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def handle_data(self, data): + if self._option is not None: + # self._option is a 
dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + else: + return + + if not map.has_key(key): + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + if self._current_form is None: + raise ParseError("start of BUTTON before start of FORM") + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # eg. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + controls.append((type, name, d)) + + def do_input(self, attrs): + if self._current_form is None: + raise ParseError("start of INPUT before start of FORM") + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + controls.append((type, name, d)) + + def do_isindex(self, attrs): + if self._current_form is None: + raise ParseError("start of ISINDEX before start of FORM") + d = {} + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + +# HTMLParser.HTMLParser is recent, so live without it if it's not available +# (also, htmllib.HTMLParser is much more tolerant of bad HTML) +try: + import HTMLParser +except ImportError: + class XHTMLCompatibleFormParser: + def __init__(self, entitydefs=None): + raise ValueError("HTMLParser could not be imported") +else: + class XHTMLCompatibleFormParser(_AbstractFormParser, 
HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! + def __init__(self, entitydefs=None): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + # handle_charref, handle_entityref and default entitydefs are taken + # from sgmllib + def handle_charref(self, name): + try: + n = int(name) + except ValueError: + self.unknown_charref(name) + return + if not 0 <= n <= 255: + self.unknown_charref(name) + return + self.handle_data(chr(n)) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + def handle_entityref(self, name): + table = self.entitydefs + if name in table: + self.handle_data(table[name]) + else: + self.unknown_entityref(name) + return + + # These methods would have passed through the ref intact if I'd thought + # of it earlier, but since the old parser silently swallows unknown + # refs, so does this new parser. 
+ def unknown_entityref(self, ref): pass + def unknown_charref(self, ref): pass + +import htmllib, formatter +class FormParser(_AbstractFormParser, htmllib.HTMLParser): + """Good for tolerance of incorrect HTML, bad for XHTML.""" + def __init__(self, entitydefs=None): + htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) + _AbstractFormParser.__init__(self, entitydefs) + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + +#FormParser = XHTMLCompatibleFormParser # testing hack + +def get_entitydefs(): + entitydefs = {} + for name, char in htmlentitydefs.entitydefs.items(): + entitydefs["&%s;" % name] = char + # unescape already does these three + del entitydefs["<"] + del entitydefs[">"] + del entitydefs["&"] + return entitydefs + +def unescape_attrs(attrs, entitydefs): + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = unescape(val, entitydefs) + else: + # eg. "__select" -- yuck! + escaped_attrs[key] = unescape_attrs(val, entitydefs) + return escaped_attrs + +def ParseResponse(response, select_default=False, + ignore_errors=False, # ignored! + form_parser_class=FormParser, + request_class=urllib2.Request): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of urllib2.urlopen can be conveniently passed to this + function as the response parameter. + + ClientForm.ParseError is raised on parse errors. 
+ + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses + htmllib.HTMLParser) (the default) works best for ordinary grubby HTML. + Note that HTMLParser is only available in Python 2.2 and later. You can + pass your own class in here as a hack to work around bad HTML, but at your + own risk: there is no well-defined interface. + + """ + return ParseFile(response, response.geturl(), select_default, + False, + form_parser_class, + request_class) + +def ParseFile(file, base_uri, select_default=False, + ignore_errors=False, # ignored! + form_parser_class=FormParser, + request_class=urllib2.Request): + """Parse HTML and return a list of HTMLForm instances. + + ClientForm.ParseError is raised on parse errors. 
+ + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. + + """ + use_htmllib = True + fp = form_parser_class() + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + forms = [] + entitydefs = get_entitydefs() + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = urljoin(base_uri, action) + form = HTMLForm(action, method, enctype, name, attrs, request_class) + for type, name, attr in controls: + attr = unescape_attrs(attr, entitydefs) + name = unescape(name, entitydefs) + form.new_control(type, name, attr, select_default=select_default) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. HTMLForm delegates lots of + things to Control objects, and most of Control's methods are, in effect, + documented by the HTMLForm docstrings. + + The Controls in an HTMLForm can be got at via the HTMLForm.find_control + method or the HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions, so you can probably ignore the rest of this + paragraph. A Control is only properly initialised after the fixup method + has been called. In fact, this is only strictly necessary for ListControl + instances. 
This is necessary because ListControls are built up from + ListControls each containing only a single item, and their initial value(s) + can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by `greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + `successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. + + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. 
+ + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + raise NotImplementedError() + + def _write_mime_data(self, mw): + """Write data for this control to a MimeWriter.""" + # called by HTMLForm + for name, value in self.pairs(): + mw2 = mw.nextpart() + mw2.addheader("Content-disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + +#--------------------------------------------------- +class ScalarControl(Control): + """Control whose value is not restricted to one of a prescribed set. + + Some ScalarControls don't accept any value attribute. Otherwise, takes a + single value, which must be string-like. 
+ + Additional read-only public attribute: + + attrs: dictionary mapping the names of original HTML attributes of the + control to their values + + """ + def __init__(self, type, name, attrs): + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = attrs.has_key("disabled") + self.readonly = attrs.has_key("readonly") + self.id = attrs.get("id") + + self.attrs = attrs.copy() + + self._clicked = False + + def __getattr__(self, name): + if name == "value": + return self.__dict__["_value"] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if not isstringlike(value): + raise TypeError("must assign a string") + elif self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + elif self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + self.__dict__["_value"] = value + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def pairs(self): + name = self.name + value = self.value + if name is None or value is None or self.disabled: + return [] + return [(name, value)] + + def __str__(self): + name = self.name + value = self.value + if name is None: name = "<None>" + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", ") + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class TextControl(ScalarControl): + """Textual input control. 
+ + Covers: + + INPUT/TEXT + INPUT/PASSWORD + INPUT/FILE + INPUT/HIDDEN + TEXTAREA + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + if self.type == "hidden": self.readonly = True + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind == "text" + +#--------------------------------------------------- +class FileControl(ScalarControl): + """File upload with INPUT TYPE=FILE. + + The value attribute of a FileControl is always None. Use add_file instead. + + Additional public method: add_file + + """ + + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + self._value = None + self._upload_data = [] + + def is_of_kind(self, kind): return kind == "file" + + def __setattr__(self, name, value): + if name in ("value", "name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def add_file(self, file_object, content_type=None, filename=None): + if not hasattr(file_object, "read"): + raise TypeError("file-like object must have read method") + if content_type is not None and not isstringlike(content_type): + raise TypeError("content type must be None or string-like") + if filename is not None and not isstringlike(filename): + raise TypeError("filename must be None or string-like") + if content_type is None: + content_type = "application/octet-stream" + self._upload_data.append((file_object, content_type, filename)) + + def pairs(self): + # XXX should it be successful even if unnamed? 
+ if self.name is None or self.disabled: + return [] + return [(self.name, "")] + + def _write_mime_data(self, mw): + # called by HTMLForm + if len(self._upload_data) == 1: + # single file + file_object, content_type, filename = self._upload_data[0] + mw2 = mw.nextpart() + fn_part = filename and ('; filename="%s"' % filename) or '' + disp = 'form-data; name="%s"%s' % (self.name, fn_part) + mw2.addheader("Content-disposition", disp, prefix=1) + fh = mw2.startbody(content_type, prefix=0) + fh.write(file_object.read()) + elif len(self._upload_data) != 0: + # multiple files + mw2 = mw.nextpart() + disp = 'form-data; name="%s"' % self.name + mw2.addheader("Content-disposition", disp, prefix=1) + fh = mw2.startmultipartbody("mixed", prefix=0) + for file_object, content_type, filename in self._upload_data: + mw3 = mw2.nextpart() + fn_part = filename and ('; filename="%s"' % filename) or '' + disp = 'file%s' % fn_part + mw3.addheader("Content-disposition", disp, prefix=1) + fh2 = mw3.startbody(content_type, prefix=0) + fh2.write(file_object.read()) + mw2.lastpart() + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + if not self._upload_data: + value = "<No files added>" + else: + value = [] + for file, ctype, filename in self._upload_data: + if filename is None: + value.append("<Unnamed file>") + else: + value.append(filename) + value = string.join(value, ", ") + + info = [] + if self.disabled: info.append("disabled") + if self.readonly: info.append("readonly") + info = string.join(info, ", ") + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class IsindexControl(ScalarControl): + """ISINDEX control. + + ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really + part of regular HTML forms at all, and predates it. You're only allowed + one ISINDEX per HTML document. 
ISINDEX and regular form submission are + mutually exclusive -- either submit a form, or the ISINDEX. + + Having said this, since ISINDEX controls may appear in forms (which is + probably bad HTML), ParseFile / ParseResponse will include them in the + HTMLForm instances it returns. You can set the ISINDEX's value, as with + any other control (but note that ISINDEX controls have no name, so you'll + need to use the type argument of set_value!). When you submit the form, + the ISINDEX will not be successful (ie., no data will get returned to the + server as a result of its presence), unless you click on the ISINDEX + control, in which case the ISINDEX gets submitted instead of the form: + + form.set_value("my isindex value", type="isindex") + urllib2.urlopen(form.click(type="isindex")) + + ISINDEX elements outside of FORMs are ignored. If you want to submit one + by hand, do it like so: + + url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value")) + result = urllib2.urlopen(url) + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind in ["text", "clickable"] + + def pairs(self): + return [] + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + # Relative URL for ISINDEX submission: instead of "foo=bar+baz", + # want "bar+baz". + # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is + # deprecated in 4.01, but it should still say how to submit it). + # Submission of ISINDEX is explained in the HTML 3.2 spec, though. 
+ parts = urlparse.urlparse(form.action) + rest, (query, frag) = parts[:-2], parts[-2:] + parts = rest + (urllib.quote_plus(self.value), "") + url = urlparse.urlunparse(parts) + req_data = url, None, [] + + if return_type == "pairs": + return [] + elif return_type == "request_data": + return req_data + else: + return request_class(url) + + def __str__(self): + value = self.value + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", ") + if info: info = " (%s)" % info + + return "<%s(%s)%s>" % (self.__class__.__name__, value, info) + + +#--------------------------------------------------- +class IgnoreControl(ScalarControl): + """Control that we're not interested in. + + Covers: + + INPUT/RESET + BUTTON/RESET + INPUT/BUTTON + BUTTON/BUTTON + + These controls are always unsuccessful, in the terminology of HTML 4 (ie. + they never require any information to be returned to the server). + + BUTTON/BUTTON is used to generate events for script embedded in HTML. + + The value attribute of IgnoreControl is always None. + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + self._value = None + + def is_of_kind(self, kind): return False + + def __setattr__(self, name, value): + if name == "value": + raise AttributeError( + "control '%s' is ignored, hence read-only" % self.name) + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + +#--------------------------------------------------- +class ListControl(Control): + """Control representing a sequence of items. + + The value attribute of a ListControl represents the selected list items in + the control. + + ListControl implements both list controls that take a single value and + those that take multiple values. + + ListControls accept sequence values only. 
Some controls only accept + sequences of length 0 or 1 (RADIO, and single-selection SELECT). + In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. + + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the by_label argument, and the set_value_by_label, get_value_by_label + methods. + + XXX RadioControl and CheckboxControl don't implement by_label yet. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + """ + + # ListControls are built up by the parser from their component items by + # creating one ListControl per item, consolidating them into a single + # master ListControl held by the HTMLForm: + + # -User calls form.new_control(...) + # -Form creates Control, and calls control.add_to_form(self). 
+ # -Control looks for a Control with the same name and type in the form, + # and if it finds one, merges itself with that control by calling + # control.merge_control(self). The first Control added to the form, of + # a particular name and type, is the only one that survives in the + # form. + # -Form calls control.fixup for all its controls. ListControls in the + # form know they can now safely pick their default values. + + # To create a ListControl without an HTMLForm, use: + + # control.merge_control(new_control) + + # (actually, it's much easier just to use ParseFile) + + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + + self._attrs = attrs.copy() + # As Controls are merged in with .merge_control(), self._attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + if attrs: + self._attrs_list = [self._attrs] # extended by .merge_control() + self._disabled_list = [self._attrs.has_key("disabled")] # ditto + else: + self._attrs_list = [] # extended by .merge_control() + self._disabled_list = [] # ditto + + self._select_default = select_default + self._clicked = False + # Some list controls can have their default set only after all items + # are known. If so, self._value_is_set is false, and the self.fixup + # method, called after all items have been added, sets the default. 
+ self._value_is_set = False + + def is_of_kind(self, kind): + if kind == "list": + return True + elif kind == "multilist": + return bool(self.multiple) + elif kind == "singlelist": + return not self.multiple + else: + return False + + def _value_from_label(self, label): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + + def toggle(self, name, by_label=False): + return self._set_selected_state(name, 2, by_label) + def set(self, selected, name, by_label=False): + action = int(bool(selected)) + return self._set_selected_state(name, action, by_label) + + def _set_selected_state(self, name, action, by_label): + """ + name: item name + action: + 0: clear + 1: set + 2: toggle + + """ + if not isstringlike(name): + raise TypeError("item name must be string-like") + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError("no item named '%s'" % name) + + if self.multiple: + if action == 2: + action = not self._selected[i] + if action and self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % name) + self._selected[i] = bool(action) + else: + if action == 2: + if self._selected == name: + action = 0 + else: + action = 1 + if action == 0 and self._selected == name: + self._selected = None + elif action == 1: + if self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % name) + self._selected = name + + def toggle_single(self, by_label=False): + self._set_single_selected_state(2, by_label) + def set_single(self, selected, by_label=False): + action = int(bool(selected)) + self._set_single_selected_state(action, by_label) + + def _set_single_selected_state(self, action, by_label): + if len(self._menu) != 1: + raise ItemCountError("'%s' is not a single-item 
control" % + self.name) + + name = self._menu[0] + if by_label: + name = self._value_from_label(name) + self._set_selected_state(name, action, by_label) + + def get_item_disabled(self, name, by_label=False): + """Get disabled state of named list item in a ListControl.""" + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + return self._disabled_list[i] + + def set_item_disabled(self, disabled, name, by_label=False): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + self._disabled_list[i] = bool(disabled) + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for i in range(len(self._disabled_list)): + self._disabled_list[i] = bool(disabled) + + def get_item_attrs(self, name, by_label=False): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. + + Note that for SELECT controls, the returned dictionary contains a + special key "contents" -- see SelectControl.__doc__. 
+ + """ + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + return self._attrs_list[i] + + def add_to_form(self, form): + try: + control = form.find_control(self.name, self.type) + except ControlNotFoundError: + Control.add_to_form(self, form) + else: + control.merge_control(self) + + def merge_control(self, control): + assert bool(control.multiple) == bool(self.multiple) + assert isinstance(control, self.__class__) + self._menu.extend(control._menu) + self._attrs_list.extend(control._attrs_list) + self._disabled_list.extend(control._disabled_list) + if control.multiple: + self._selected.extend(control._selected) + else: + if control._value_is_set: + self._selected = control._selected + if control._value_is_set: + self._value_is_set = True + + def fixup(self): + """ + ListControls are built up from component list items (which are also + ListControls) during parsing. This method should be called after all + items have been added. See ListControl.__doc__ for the reason this is + required. + + """ + # Need to set default selection where no item was indicated as being + # selected by the HTML: + + # CHECKBOX: + # Nothing should be selected. + # SELECT/single, SELECT/multiple and RADIO: + # RFC 1866 (HTML 2.0): says first item should be selected. + # W3C HTML 4.01 Specification: says that client behaviour is + # undefined in this case. For RADIO, exactly one must be selected, + # though which one is undefined. + # Both Netscape and Microsoft Internet Explorer (IE) choose first + # item for SELECT/single. However, both IE5 and Mozilla (both 1.0 + # and Firebird 0.6) leave all items unselected for RADIO and + # SELECT/multiple. + + # Since both Netscape and IE all choose the first item for + # SELECT/single, we do the same. 
OTOH, both Netscape and IE + # leave SELECT/multiple with nothing selected, in violation of RFC 1866 + # (but not in violation of the W3C HTML 4 standard); the same is true + # of RADIO (which *is* in violation of the HTML 4 standard). We follow + # RFC 1866 if the select_default attribute is set, and Netscape and IE + # otherwise. RFC 1866 and HTML 4 are always violated insofar as you + # can deselect all items in a RadioControl. + + raise NotImplementedError() + + def __getattr__(self, name): + if name == "value": + menu = self._menu + if self.multiple: + values = [] + for i in range(len(menu)): + if self._selected[i]: values.append(menu[i]) + return values + else: + if self._selected is None: return [] + else: return [self._selected] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._set_value(value) + elif name in ("name", "type", "multiple"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _set_value(self, value): + if self.multiple: + self._multiple_set_value(value) + else: + self._single_set_value(value) + + def _single_set_value(self, value): + if value is None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + nr = len(value) + if not (0 <= nr <= 1): + raise ItemCountError("single selection list, must set sequence of " + "length 0 or 1") + + if nr == 0: + self._selected = None + else: + value = value[0] + try: + i = self._menu.index(value) + except ValueError: + raise ItemNotFoundError("no item named '%s'" % + repr(value)) + if self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % value) + self._selected = value + + def _multiple_set_value(self, value): + if value is 
None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + + selected = [False]*len(self._selected) + menu = self._menu + disabled_list = self._disabled_list + + for v in value: + found = False + for i in range(len(menu)): + item_name = menu[i] + if v == item_name: + if disabled_list[i]: + raise AttributeError("item '%s' is disabled" % value) + selected[i] = True + found = True + break + if not found: + raise ItemNotFoundError("no item named '%s'" % repr(v)) + self._selected = selected + + def set_value_by_label(self, value): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + def get_value_by_label(self): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + + def possible_items(self, by_label=False): + if by_label: + raise NotImplementedError( + "control '%s' does not yet support by_label" % self.name) + return copy.copy(self._menu) + + def pairs(self): + if self.disabled: + return [] + + if not self.multiple: + name = self.name + value = self._selected + if name is None or value is None: + return [] + return [(name, value)] + else: + control_name = self.name # usually the name HTML attribute + pairs = [] + for i in range(len(self._menu)): + item_name = self._menu[i] # usually the value HTML attribute + if self._selected[i]: + pairs.append((control_name, item_name)) + return pairs + + def _item_str(self, i): + item_name = self._menu[i] + if self.multiple: + if self._selected[i]: + item_name = "*"+item_name + else: + if self._selected == item_name: + item_name = "*"+item_name + if self._disabled_list[i]: + item_name = "(%s)" % item_name + return item_name + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + display = [] + for i in range(len(self._menu)): + s = self._item_str(i) + display.append(s) + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", 
") + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, string.join(display, ", "), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = False + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + if checked: + self._value_is_set = True + self._selected = value + else: + self._selected = None + + def fixup(self): + if not self._value_is_set: + # no item explicitly selected + assert self._selected is None + if self._select_default: + self._selected = self._menu[0] + self._value_is_set = True + + +class CheckboxControl(ListControl): + """ + Covers: + + INPUT/CHECKBOX + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = True + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + self._selected = [checked] + self._value_is_set = True + + def fixup(self): + # If no items were explicitly checked in HTML, that's how we must + # leave it, so we have nothing to do here. + assert self._value_is_set + + +class SelectControl(ListControl): + """ + Covers: + + SELECT (and OPTION) + + SELECT control values and labels are subject to some messy defaulting + rules. For example, if the HTML repreentation of the control is: + + <SELECT name=year> + <OPTION value=0 label="2002">current year</OPTION> + <OPTION value=1>2001</OPTION> + <OPTION>2000</OPTION> + </SELECT> + + The items, in order, have labels "2002", "2001" and "2000", whereas their + values are "0", "1" and "2000" respectively. 
Note that the value of the + last OPTION in this example defaults to its contents, as specified by RFC + 1866, as do the labels of the second and third OPTIONs. + + The OPTION labels are sometimes more meaningful than the OPTION values, + which can make for more maintainable code. + + Additional read-only public attribute: attrs + + The attrs attribute is a dictionary of the original HTML attributes of the + SELECT element. Other ListControls do not have this attribute, because in + other cases the control as a whole does not correspond to any single HTML + element. The get_item_attrs method may be used as usual to get at the + HTML attributes of the HTML elements corresponding to individual list items + (for SELECT controls, these are OPTION elements). + + Another special case is that the attributes dictionaries returned by + get_item_attrs have a special key "contents" which does not correspond to + any real HTML attribute, but rather contains the contents of the OPTION + element: + + <OPTION>this bit</OPTION> + + """ + # HTML attributes here are treated slightly from other list controls: + # -The SELECT HTML attributes dictionary is stuffed into the OPTION + # HTML attributes dictionary under the "__select" key. + # -The content of each OPTION element is stored under the special + # "contents" key of the dictionary. + # After all this, the dictionary is passed to the SelectControl constructor + # as the attrs argument, as usual. However: + # -The first SelectControl constructed when building up a SELECT control + # has a constructor attrs argument containing only the __select key -- so + # this SelectControl represents an empty SELECT control. + # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and + # the __select dictionary containing the SELECT HTML-attributes. 
+ def __init__(self, type, name, attrs, select_default=False): + # fish out the SELECT HTML attributes from the OPTION HTML attributes + # dictionary + self.attrs = attrs["__select"].copy() + attrs = attrs.copy() + del attrs["__select"] + + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + + self._label_map = None + self.disabled = self.attrs.has_key("disabled") + self.id = self.attrs.get("id") + + self._menu = [] + self._selected = [] + self._value_is_set = False + if self.attrs.has_key("multiple"): + self.__dict__["multiple"] = True + self._selected = [] + else: + self.__dict__["multiple"] = False + self._selected = None + + if attrs: # OPTION item data was provided + value = attrs["value"] + self._menu.append(value) + selected = attrs.has_key("selected") + if selected: + self._value_is_set = True + if self.attrs.has_key("multiple"): + self._selected.append(selected) + elif selected: + self._selected = value + + def _build_select_label_map(self): + """Return an ordered mapping of labels to values. + + For example, if the HTML repreentation of the control is as given in + SelectControl.__doc__, this function will return a mapping like: + + {"2002": "0", "2001": "1", "2000": "2000"} + + """ + alist = [] + for val in self._menu: + attrs = self.get_item_attrs(val) + alist.append((attrs["label"], val)) + return AList(alist) + + def _value_from_label(self, label): + try: + return self._label_map[label] + except KeyError: + raise ItemNotFoundError("no item has label '%s'" % label) + + def fixup(self): + if not self._value_is_set: + # No item explicitly selected. 
+ if len(self._menu) > 0: + if self.multiple: + if self._select_default: + self._selected[0] = True + else: + assert self._selected is None + self._selected = self._menu[0] + self._value_is_set = True + self._label_map = self._build_select_label_map() + + def _delete_items(self): + # useful for simulating JavaScript code, but not a stable interface yet + self._menu = [] + self._value_is_set = False + if self.multiple: + self._selected = [] + else: + self._selected = None + + def possible_items(self, by_label=False): + if not by_label: + return copy.copy(self._menu) + else: + self._label_map.set_inverted(True) + try: + r = map(lambda v, self=self: self._label_map[v], self._menu) + finally: + self._label_map.set_inverted(False) + return r + + def set_value_by_label(self, value): + if isstringlike(value): + raise TypeError("ListControl, must set a sequence, not a string") + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + + try: + value = map(lambda v, self=self: self._label_map[v], value) + except KeyError, e: + raise ItemNotFoundError("no item has label '%s'" % e.args[0]) + self._set_value(value) + + def get_value_by_label(self): + menu = self._menu + self._label_map.set_inverted(True) + try: + if self.multiple: + values = [] + for i in range(len(menu)): + if self._selected[i]: + values.append(self._label_map[menu[i]]) + return values + else: + return [self._label_map[self._selected]] + finally: + self._label_map.set_inverted(False) + + +#--------------------------------------------------- +class SubmitControl(ScalarControl): + """ + Covers: + + INPUT/SUBMIT + BUTTON/SUBMIT + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it + # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem + # to define this. 
+ if self.value is None: self.value = "" + self.readonly = True + + def is_of_kind(self, kind): return kind == "clickable" + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + self._clicked = coord + r = form._switch_click(return_type, request_class) + self._clicked = False + return r + + def pairs(self): + if not self._clicked: + return [] + return ScalarControl.pairs(self) + + +#--------------------------------------------------- +class ImageControl(SubmitControl): + """ + Covers: + + INPUT/IMAGE + + Coordinates are specified using one of the HTMLForm.click* methods. + + """ + def __init__(self, type, name, attrs): + SubmitControl.__init__(self, type, name, attrs) + self.readonly = False + + def pairs(self): + clicked = self._clicked + if self.disabled or not clicked: + return [] + name = self.name + if name is None: return [] + pairs = [ + ("%s.x" % name, str(clicked[0])), + ("%s.y" % name, str(clicked[1])), + ] + value = self._value + if value: + pairs.append((name, value)) + return pairs + +# aliases, just to make str(control) and str(form) clearer +class PasswordControl(TextControl): pass +class HiddenControl(TextControl): pass +class TextareaControl(TextControl): pass +class SubmitButtonControl(SubmitControl): pass + + +def is_listcontrol(control): return control.is_of_kind("list") + + +class HTMLForm: + """Represents a single HTML <form> ... </form> element. + + A form consists of a sequence of controls that usually have names, and + which can take on various values. The values of the various types of + controls represent variously: text, zero-or-one-of-many or many-of-many + choices, and files to be uploaded. Some controls can be clicked on to + submit the form, and clickable controls' values sometimes include the + coordinates of the click. 
+ + Forms can be filled in with data to be returned to the server, and then + submitted, using the click method to generate a request object suitable for + passing to urllib2.urlopen (or the click_request_data or click_pairs + methods if you're not using urllib2). + + import ClientForm + forms = ClientForm.ParseFile(html, base_uri) + form = forms[0] + + form["query"] = "Python" + form.set("lots", "nr_results") + + response = urllib2.urlopen(form.click()) + + Usually, HTMLForm instances are not created directly. Instead, the + ParseFile or ParseResponse factory functions are used. If you do construct + HTMLForm objects yourself, however, note that an HTMLForm instance is only + properly initialised after the fixup method has been called (ParseFile and + ParseResponse do this for you). See ListControl.__doc__ for the reason + this is required. + + Indexing a form (form["control_name"]) returns the named Control's value + attribute. Assignment to a form index (form["control_name"] = something) + is equivalent to assignment to the named Control's value attribute. If you + need to be more specific than just supplying the control's name, use the + set_value and get_value methods. + + ListControl values are lists of item names. The list item's name is the + value of the corresponding HTML element's "value" attribute. + + Example: + + <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT> + <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT> + + defines a CHECKBOX control with name "cheeses" which has two items, named + "leicester" and "cheddar". + + Another example: + + <SELECT name="more_cheeses"> + <OPTION>1</OPTION> + <OPTION value="2" label="CHEDDAR">cheddar</OPTION> + </SELECT> + + defines a SELECT control with name "more_cheeses" which has two items, + named "1" and "2" (because the OPTION element's value HTML attribute + defaults to the element contents). + + To set, clear or toggle individual list items, use the set and toggle + methods. 
To set the whole value, do as for any other control:use indexing + or the set_/get_value methods. + + Example: + + # select *only* the item named "cheddar" + form["cheeses"] = ["cheddar"] + # select "cheddar", leave other items unaffected + form.set("cheddar", "cheeses") + + Some controls (RADIO and SELECT without the multiple attribute) can only + have zero or one items selected at a time. Some controls (CHECKBOX and + SELECT with the multiple attribute) can have multiple items selected at a + time. To set the whole value of a ListControl, assign a sequence to a form + index: + + form["cheeses"] = ["cheddar", "leicester"] + + If the ListControl is not multiple-selection, the assigned list must be of + length one. + + To check whether a control has an item, or whether an item is selected, + respectively: + + "cheddar" in form.possible_items("cheeses") + "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses")) + + Note that some list items may be disabled (see below). + + Note the following mistake: + + form[control_name] = control_value + assert form[control_name] == control_value # not necessarily true + + The reason for this is that form[control_name] always gives the list items + in the order they were listed in the HTML. + + List items (hence list values, too) can be referred to in terms of list + item labels rather than list item names. Currently, this is only possible + for SELECT controls (this is a bug). To use this feature, use the by_label + arguments to the various HTMLForm methods. Note that it is *item* names + (hence ListControl values also), not *control* names, that can be referred + to by label. + + The question of default values of OPTION contents, labels and values is + somewhat complicated: see SelectControl.__doc__ and + ListControl.get_item_attrs.__doc__ if you think you need to know. + + Controls can be disabled or readonly. 
In either case, the control's value + cannot be changed until you clear those flags (see example below). + Disabled is the state typically represented by browsers by `greying out' a + control. Disabled controls are not `successful' -- they don't cause data + to get returned to the server. Readonly controls usually appear in + browsers as read-only text boxes. Readonly controls are successful. List + items can also be disabled. Attempts to select disabled items (with + form[name] = value, or using the ListControl.set method, for example) fail. + Attempts to clear disabled items are allowed. + + If a lot of controls are readonly, it can be useful to do this: + + form.set_all_readonly(False) + + When you want to do several things with a single control, or want to do + less common things, like changing which controls and items are disabled, + you can get at a particular control: + + control = form.find_control("cheeses") + control.disabled = False + control.readonly = False + control.set_item_disabled(False, "gruyere") + control.set("gruyere") + + Most methods on HTMLForm just delegate to the contained controls, so see + the docstrings of the various Control classes for further documentation. + Most of these delegating methods take name, type, kind, id and nr arguments + to specify the control to be operated on: see + HTMLForm.find_control.__doc__. + + ControlNotFoundError (subclass of ValueError) is raised if the specified + control can't be found. This includes occasions where a non-ListControl + is found, but the method (set, for example) requires a ListControl. + ItemNotFoundError (subclass of ValueError) is raised if a list item can't + be found. ItemCountError (subclass of ValueError) is raised if an attempt + is made to select more than one item and the control doesn't allow that, or + set/get_single are called and the control contains more than one item. 
+ AttributeError is raised if a control or item is readonly or disabled and + an attempt is made to alter its value. + + XXX CheckBoxControl and RadioControl don't yet support item access by label + + Security note: Remember that any passwords you store in HTMLForm instances + will be saved to disk in the clear if you pickle them (directly or + indirectly). The simplest solution to this is to avoid pickling HTMLForm + objects. You could also pickle before filling in any password, or just set + the password to "" before pickling. + + + Public attributes: + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form (None if no name was specified) + attrs: dictionary mapping original HTML form attributes to their values + + controls: list of Control instances; do not alter this list + (instead, call form.new_control to make a Control and add it to the + form, or control.add_to_form if you already have a Control instance) + + + + Methods for form filling: + ------------------------- + + Most of the these methods have very similar arguments. See + HTMLForm.find_control.__doc__ for details of the name, type, kind and nr + arguments. See above for a description of by_label. 
+ + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None) + + get_value(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + set_value(value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set_all_readonly(readonly) + + + Methods applying only to ListControls: + + possible_items(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set(selected, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle(item_name, + name=None, type=None, id=None, nr=None, + by_label=False) + + set_single(selected, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle_single(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + + Method applying only to FileControls: + + add_file(file_object, + content_type="application/octet-stream", filename=None, + name=None, id=None, nr=None) + + + Methods applying only to clickable controls: + + click(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1)) + + """ + + type2class = { + "text": TextControl, + "password": PasswordControl, + "hidden": HiddenControl, + "textarea": TextareaControl, + + "isindex": IsindexControl, + + "file": FileControl, + + "button": IgnoreControl, + "buttonbutton": IgnoreControl, + "reset": IgnoreControl, + "resetbutton": IgnoreControl, + + "submit": SubmitControl, + "submitbutton": SubmitButtonControl, + "image": ImageControl, + + "radio": RadioControl, + "checkbox": CheckboxControl, + "select": SelectControl, + } + +#--------------------------------------------------- +# Initialisation. Use ParseResponse / ParseFile instead. 
+ + def __init__(self, action, method="GET", + enctype="application/x-www-form-urlencoded", + name=None, attrs=None, + request_class=urllib2.Request): + """ + In the usual case, use ParseResponse (or ParseFile) to create new + HTMLForm objects. + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form + attrs: dictionary mapping original HTML form attributes to their values + + """ + self.action = action + self.method = method + self.enctype = enctype + self.name = name + if attrs is not None: + self.attrs = attrs.copy() + else: + self.attrs = {} + self.controls = [] + self._request_class = request_class + + def new_control(self, type, name, attrs, + ignore_unknown=False, select_default=False): + """Adds a new control to the form. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + Note that controls representing lists of items are built up from + controls holding only a single list item. See ListControl.__doc__ for + further information. + + type: type of control (see Control.__doc__ for a list) + attrs: HTML attributes of control + ignore_unknown: if true, use a dummy Control instance for controls of + unknown type; otherwise, raise ValueError + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present (this defaulting happens when the HTMLForm.fixup method is + called) + + """ + type = string.lower(type) + klass = self.type2class.get(type) + if klass is None: + if ignore_unknown: + klass = IgnoreControl + else: + raise ValueError("Unknown control type '%s'" % type) + + a = attrs.copy() + if issubclass(klass, ListControl): + control = klass(type, name, a, select_default) + else: + control = klass(type, name, a) + control.add_to_form(self) + + def fixup(self): + """Normalise form after all controls have been added. 
+ + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + This method should only be called once, after all controls have been + added to the form. + + """ + for control in self.controls: + control.fixup() + +#--------------------------------------------------- + def __str__(self): + header = "%s %s %s" % (self.method, self.action, self.enctype) + rep = [header] + for control in self.controls: + rep.append(" %s" % str(control)) + return "<%s>" % string.join(rep, "\n") + +#--------------------------------------------------- +# Form-filling methods. + + def __getitem__(self, name): + return self.find_control(name).value + def __setitem__(self, name, value): + control = self.find_control(name) + try: + control.value = value + except AttributeError, e: + raise ValueError(str(e)) + + def get_value(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Return value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] + + """ + c = self.find_control(name, type, kind, id, nr=nr) + if by_label: + try: + meth = c.get_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + return meth() + else: + return c.value + def set_value(self, value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Set value of control. 
+ + If only name and value arguments are supplied, equivalent to + + form[name] = value + + """ + c = self.find_control(name, type, kind, id, nr=nr) + if by_label: + try: + meth = c.set_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + meth(value) + else: + c.value = value + + def set_all_readonly(self, readonly): + for control in self.controls: + control.readonly = bool(readonly) + + +#--------------------------------------------------- +# Form-filling methods applying only to ListControls. + + def possible_items(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Return a list of all values that the specified control can take.""" + c = self._find_list_control(name, type, kind, id, nr) + return c.possible_items(by_label) + + def set(self, selected, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Select / deselect named list item. + + selected: boolean selected state + + """ + self._find_list_control(name, type, kind, id, nr).set( + selected, item_name, by_label) + def toggle(self, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Toggle selected state of named list item.""" + self._find_list_control(name, type, kind, id, nr).toggle( + item_name, by_label) + + def set_single(self, selected, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Select / deselect list item in a control having only one item. + + If the control has multiple list items, ItemCountError is raised. + + This is just a convenience method, so you don't need to know the item's + name -- the item name in these single-item controls is usually + something meaningless like "1" or "on". 
+ + For example, if a checkbox has a single item named "on", the following + two calls are equivalent: + + control.toggle("on") + control.toggle_single() + + """ + self._find_list_control(name, type, kind, id, nr).set_single( + selected, by_label) + def toggle_single(self, name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Toggle selected state of list item in control having only one item. + + The rest is as for HTMLForm.set_single.__doc__. + + """ + self._find_list_control(name, type, kind, id, nr).toggle_single( + by_label) + +#--------------------------------------------------- +# Form-filling method applying only to FileControls. + + def add_file(self, file_object, content_type=None, filename=None, + name=None, id=None, nr=None): + """Add a file to be uploaded. + + file_object: file-like object (with read method) from which to read + data to upload + content_type: MIME content type of data to upload + filename: filename to pass to server + + If filename is None, no filename is sent to the server. + + If content_type is None, the content type is guessed based on the + filename and the data from read from the file object. + + XXX + At the moment, guessed content type is always application/octet-stream. + Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and + plain text. + + Note the following useful HTML attributes of file upload controls (see + HTML 4.01 spec, section 17): + + accept: comma-separated list of content types that the server will + handle correctly; you can use this to filter out non-conforming files + size: XXX IIRC, this is indicative of whether form wants multiple or + single files + maxlength: XXX hint of max content length in bytes? + + """ + self.find_control(name, "file", id=id, nr=nr).add_file( + file_object, content_type, filename) + +#--------------------------------------------------- +# Form submission methods, applying only to clickable controls. 
+ + def click(self, name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=urllib2.Request): + """Return request that would result from clicking on a control. + + The request object is a urllib2.Request instance, which you can pass to + urllib2.urlopen (or ClientCookie.urlopen). + + Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and + IMAGEs) can be clicked. + + Will click on the first clickable control, subject to the name, type + and nr arguments (as for find_control). If no name, type, id or number + is specified and there are no clickable controls, a request will be + returned for the form in its current, un-clicked, state. + + IndexError is raised if any of name, type, id or nr is specified but no + matching control is found. ValueError is raised if the HTMLForm has an + enctype attribute that is not recognised. + + You can optionally specify a coordinate to click at, which only makes a + difference if you clicked on an image. + + """ + return self._click(name, type, id, nr, coord, "request", + self._request_class) + + def click_request_data(self, + name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=urllib2.Request): + """As for click method, but return a tuple (url, data, headers). + + You can use this data to send a request to the server. This is useful + if you're using httplib or urllib rather than urllib2. Otherwise, use + the click method. + + # Untested. Have to subclass to add headers, I think -- so use urllib2 + # instead! + import urllib + url, data, hdrs = form.click_request_data() + r = urllib.urlopen(url, data) + + # Untested. I don't know of any reason to use httplib -- you can get + # just as much control with urllib2. 
+ import httplib, urlparse + url, data, hdrs = form.click_request_data() + tup = urlparse(url) + host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:]) + conn = httplib.HTTPConnection(host) + if data: + httplib.request("POST", path, data, hdrs) + else: + httplib.request("GET", path, headers=hdrs) + r = conn.getresponse() + + """ + return self._click(name, type, id, nr, coord, "request_data", + self._request_class) + + def click_pairs(self, name=None, type=None, id=None, nr=0, coord=(1,1)): + """As for click_request_data, but returns a list of (key, value) pairs. + + You can use this list as an argument to ClientForm.urlencode. This is + usually only useful if you're using httplib or urllib rather than + urllib2 or ClientCookie. It may also be useful if you want to manually + tweak the keys and/or values, but this should not be necessary. + Otherwise, use the click method. + + Note that this method is only useful for forms of MIME type + x-www-form-urlencoded. In particular, it does not return the + information required for file upload. If you need file upload and are + not using urllib2, use click_request_data. + + Also note that Python 2.0's urllib.urlencode is slightly broken: it + only accepts a mapping, not a sequence of pairs, as an argument. This + messes up any ordering in the argument. Use ClientForm.urlencode + instead. + + """ + return self._click(name, type, id, nr, coord, "pairs", + self._request_class) + +#--------------------------------------------------- + + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None): + """Locate and return some specific control within the form. + + At least one of the name, type, kind, predicate and nr arguments must + be supplied. If no matching control is found, ControlNotFoundError is + raised. + + If name is specified, then the control must have the indicated name. 
+ + If type is specified then the control must have the specified type (in + addition to the types possible for <input> HTML tags: "text", + "password", "hidden", "submit", "image", "button", "radio", "checkbox", + "file" we also have "reset", "buttonbutton", "submitbutton", + "resetbutton", "textarea", "select" and "isindex"). + + If kind is specified, then the control must fall into the specified + group, each of which satisfies a particular interface. The types are + "text", "list", "multilist", "singlelist", "clickable" and "file". + + If id is specified, then the control must have the indicated id. + + If predicate is specified, then the control must match that function. + The predicate function is passed the control as its single argument, + and should return a boolean value indicating whether the control + matched. + + nr, if supplied, is the sequence number of the control (where 0 is the + first). Note that control 0 is the first control matching all the + other arguments (if supplied); it is not necessarily the first control + in the form. + + """ + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (predicate is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + if nr is None: nr = 0 + + return self._find_control(name, type, kind, id, predicate, nr) + +#--------------------------------------------------- +# Private methods. 
+ + def _find_list_control(self, + name=None, type=None, kind=None, id=None, nr=None): + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + if nr is None: nr = 0 + + return self._find_control(name, type, kind, id, is_listcontrol, nr) + + def _find_control(self, name, type, kind, id, predicate, nr): + if (name is not None) and not isstringlike(name): + raise TypeError("control name must be string-like") + if (type is not None) and not isstringlike(type): + raise TypeError("control type must be string-like") + if (kind is not None) and not isstringlike(kind): + raise TypeError("control kind must be string-like") + if (id is not None) and not isstringlike(id): + raise TypeError("control id must be string-like") + if (predicate is not None) and not callable(predicate): + raise TypeError("control predicate must be callable") + if nr < 0: raise ValueError("control number must be a positive " + "integer") + + orig_nr = nr + + for control in self.controls: + if name is not None and name != control.name: + continue + if type is not None and type != control.type: + continue + if kind is not None and not control.is_of_kind(kind): + continue + if id is not None and id != control.id: + continue + if predicate and not predicate(control): + continue + if nr: + nr = nr - 1 + continue + return control + + description = [] + if name is not None: description.append("name '%s'" % name) + if type is not None: description.append("type '%s'" % type) + if kind is not None: description.append("kind '%s'" % kind) + if id is not None: description.append("id '%s'" % id) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr: description.append("nr %d" % orig_nr) + description = string.join(description, ", ") + raise ControlNotFoundError("no control matching "+description) + + def _click(self, name, type, id, nr, coord, return_type, 
+ request_class=urllib2.Request): + try: + control = self._find_control(name, type, "clickable", id, None, nr) + except ControlNotFoundError: + if ((name is not None) or (type is not None) or (id is not None) or + (nr != 0)): + raise + # no clickable controls, but no control was explicitly requested, + # so return state without clicking any control + return self._switch_click(return_type) + else: + return control._click(self, coord, return_type, request_class) + + def _pairs(self): + """Return sequence of (key, value) pairs suitable for urlencoding.""" + pairs = [] + for control in self.controls: + pairs.extend(control.pairs()) + return pairs + + def _request_data(self): + """Return a tuple (url, data, headers).""" + method = string.upper(self.method) + #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action) + parts = urlparse.urlparse(self.action) + rest, (query, frag) = parts[:-2], parts[-2:] + + if method == "GET": + if self.enctype != "application/x-www-form-urlencoded": + raise ValueError( + "unknown GET form encoding type '%s'" % self.enctype) + parts = rest + (urlencode(self._pairs()), "") + uri = urlparse.urlunparse(parts) + return uri, None, [] + elif method == "POST": + parts = rest + (query, "") + uri = urlparse.urlunparse(parts) + if self.enctype == "application/x-www-form-urlencoded": + return (uri, urlencode(self._pairs()), + [("Content-type", self.enctype)]) + elif self.enctype == "multipart/form-data": + data = StringIO() + http_hdrs = [] + mw = MimeWriter(data, http_hdrs) + f = mw.startmultipartbody("form-data", add_to_http_hdrs=True, + prefix=0) + for control in self.controls: + control._write_mime_data(mw) + mw.lastpart() + return uri, data.getvalue(), http_hdrs + else: + raise ValueError( + "unknown POST form encoding type '%s'" % self.enctype) + else: + raise ValueError("Unknown method '%s'" % method) + + def _switch_click(self, return_type, request_class=urllib2.Request): + # This is called by HTMLForm and clickable 
Controls to hide switching + # on return_type. + if return_type == "pairs": + return self._pairs() + elif return_type == "request_data": + return self._request_data() + else: + req_data = self._request_data() + req = request_class(req_data[0], req_data[1]) + for key, val in req_data[2]: + req.add_header(key, val) + return req diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/GeneralFAQ.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/GeneralFAQ.html new file mode 100644 index 0000000000000000000000000000000000000000..878b54f0b4fb84354d93aa230f71ae93bb6ed525 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/GeneralFAQ.html @@ -0,0 +1,139 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="FAQ,cookie,HTTP,HTML,form,table,Python,web,client,client-side,testing,sniffer,https,script,embedded"> + <title>Python web-client programming general FAQs</title> + <style type="text/css" media="screen">@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/bits/clientx.html"> +</head> +<body> + +<div id="sf"><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> +<!--<img src="../images/sflogo.png"--> + +<h1>Python web-client programming general FAQs</h1> + +<div id="Content"> +<ul> + <li>Is there any example code? + <p>There's (still!) a bit of a shortage of example code for ClientCookie + and ClientForm &co., because the stuff I've written tends to either + require access to restricted-access sites, or is proprietary code (and the + same goes for other people's code). + <li>HTTPS on Windows? 
+ <p>Use this <a href="http://pypgsql.sourceforge.net/misc/python22-win32-ssl.zip"> + _socket.pyd</a>, or use Python 2.3. + <li>I want to see what my web browser is doing, but standard network sniffers + like <a href="http://www.ethereal.com/">ethereal</a> or netcat (nc) don't + work for HTTPS. How do I sniff HTTPS traffic? + <p>Three good options: + <ul> + <li>Mozilla plugin: <a href="http://livehttpheaders.mozdev.org/"> + livehttpheaders</a>. + <li><a href="http://www.blunck.info/iehttpheaders.html">ieHTTPHeaders</a> + does the same for MSIE. + <li>Use <a href="http://lynx.browser.org/">lynx</a> <code>-trace</code>, + and filter out the junk with a script. + </ul> + <p>I'm told you can also use a proxy like <a + href="http://www.proxomitron.info/">proxomitron</a> (never tried it + myself). There's also a commercial <a href="http://www.simtec.ltd.uk/">MSIE + plugin</a>. + <li>Embedded script is messing up my web-scraping. What do I do? + <p>It is possible to embed script in HTML pages (sandwiched between + <code><SCRIPT>here</SCRIPT></code> tags, and in + <code>javascript:</code> URLs) - JavaScript / ECMAScript, VBScript, or + even Python. These scripts can do all sorts of things, including causing + cookies to be set in a browser, submitting or filling in parts of forms in + response to user actions, changing link colours as the mouse moves over a + link, etc. + + <p>If you come across this in a page you want to automate, you + have four options. Here they are, roughly in order of simplicity. + + <ul> + <li>Simply figure out what the embedded script is doing and emulate it + in your Python code: for example, by manually adding cookies to your + <code>CookieJar</code> instance, calling methods on + <code>HTMLForm</code>s, calling <code>urlopen</code>, etc. + <li>Dump ClientCookie and ClientForm and automate a browser instead + (eg. 
use MS Internet Explorer via its COM automation interfaces, using + the <a href="http://starship.python.net/crew/mhammond/">Python for + Windows extensions</a>, XXX Mozilla automation & XPCOM / PyXPCOM, + Konqueror & DCOP / KParts / PyKDE). + <li>Use Java's <a href="httpunit.sourceforge.net">httpunit</a> from + Jython, since it knows some JavaScript. + <li>Get ambitious and automatically delegate the work to an appropriate + interpreter (Mozilla's JavaScript interpreter, for instance). This + approach is the one taken by <a href="../DOMForm">DOMForm</a> (the + JavaScript support is "very alpha", though!). + </ul> + <li>Misc links + <ul> + <li>Another Java thing: <a href="http://maxq.tigris.org/">maxq</a>, + which provides a proxy to aid automatic generation of functional tests + written in Jython using the standard library unittest module (PyUnit) + and the "Jakarta Commons" HttpClient library. + <li>A useful set Zope-oriented links on <a + href="http://viii.dclxvi.org/bookmarks/tech/zope/test">tools for testing + web applications</a>. + <li>O'Reilly book: <a href="">Spidering Hacks</a>. Very Perl-oriented. + <li>Useful + <a href="http://chrispederick.myacen.com/work/firebird/webdeveloper/"> + Mozilla plugin</a> which, amongst other things, can display HTML form + information and HTML table structure(thanks to Erno Kuusela for this + link). + <li> + <a href="http://www.iopus.com/iim.htm">IOpus Internet Macros</a> Cheap + and nasty macro recording for IE. It works, just barely. Commercial + software. + <li> + <a href="http://www.opensourcetesting.org/functional.php">Open source + functional testing tools</a>. A nice list. + <li><a href="http://www.rexx.com/~dkuhlman/quixote_htmlscraping.html"> + A HOWTO on web scraping</a> from Dave Kuhlman. + </ul> + <li>Will any of this code make its way into the Python standard library? + <p>The request / response processing extensions to urllib2 from ClientCookie + have been merged into urllib2 for Python 2.4. 
The cookie processing has + been added, as module cookielib. Eventually, I'll submit patches to get + the http-equiv, refresh, and robots.txt code in there too, and maybe + <code>mechanize.UserAgent</code> too (but <em>not</em> + <code>mechanize.Browser</code>). The rest, probably not. +</ul> +</div> <!--id="Content"--> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, January 2005. + +<hr> + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie">ClientCookie</a><br> +<a href="../ClientForm">ClientForm</a><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<span class="thispage">General FAQs</span><br> +<a href="./urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="./urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/INSTALL b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/INSTALL new file mode 100644 index 0000000000000000000000000000000000000000..ac12b0f0de5f14fa137107f119f09ce078eecdd3 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/INSTALL @@ -0,0 +1,63 @@ +ClientForm installation instructions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Dependencies +~~~~~~~~~~~~ + +Python 1.5.2 or above is required, and urllib2 is recommended. + + +Installation +~~~~~~~~~~~~ + +To install the package, run the following command: + + python setup.py build + +then (with appropriate permissions) + + python setup.py install + + +Alternatively, just copy the whole ClientForm.py into your Python +path (eg. unix: /usr/local/lib/python2.2/site-packages, +Windows: C:\Python21, or C:\Python22\Lib\site-packages). That's all +that setup.py does. 
+ + +To run the tests (none of which access the network), run the following +command: + + python test.py + +This runs the tests against the source files extracted from the +package. For help on command line options: + + python test.py --help + + +If you're using a pre-2.1 version of Python, you'll need to get +unittest.py (from http://pyunit.sourceforge.net) to run the Pyunit +tests. + +Bugs and comments to jjl@pobox.com. + + +NO WARRANTY + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +Copyright Notices + + (C) 2002-2003 John J. Lee. All rights reserved. + (C) 1998-2000 Gisle Aas. All rights reserved. (Original LWP code) + +This code in this package is free software; you can redistribute it +and/or modify it under the terms of the BSD license (see the file +COPYING). + +John J. Lee <jjl@pobox.com> +June 2003 diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/MANIFEST.in b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..2393119d33520ec119f5f92368df956da77543d8 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/MANIFEST.in @@ -0,0 +1,10 @@ +include MANIFEST.in +include COPYING +include INSTALL +include GeneralFAQ.html +include README.html.in +include README.html +include README.txt +include ChangeLog +include *.py +recursive-include testdata *.html diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/PKG-INFO b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..afc3d70ec4b35bb685b3970d7820f1ea12bfcd36 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/PKG-INFO @@ -0,0 +1,37 @@ +Metadata-Version: 1.0 +Name: ClientForm +Version: 0.1.17 +Summary: Client-side HTML form handling. 
+Home-page: http://wwwsearch.sourceforge.net/ClientForm/ +Author: John J. Lee +Author-email: jjl@pobox.com +License: BSD +Download-URL: http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.1.17.tar.gz +Description: ClientForm is a Python module for handling HTML forms on the client + side, useful for parsing HTML forms, filling them in and returning the + completed forms to the server. It developed from a port of Gisle Aas' + Perl module HTML::Form, from the libwww-perl library, but the + interface is not the same. + +Platform: any +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Topic :: Internet +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Software Development :: Testing +Classifier: Topic :: Software Development :: Testing :: Traffic Generation +Classifier: Topic :: System :: Networking :: Monitoring +Classifier: Topic :: System :: Systems Administration +Classifier: Topic :: Text Processing +Classifier: Topic :: Text Processing :: Markup +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html new file mode 100644 index 0000000000000000000000000000000000000000..519b830eb1d6c68165972787458be834bde4f030 --- /dev/null +++ 
b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html @@ -0,0 +1,363 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="form,HTML,Python,web,client,client-side"> + <title>ClientForm</title> + <style type="text/css" media="screen">@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/ClientForm/"> +</head> +<body> + + + +<div id=sf><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> + +<h1>ClientForm</h1> + +<div id="Content"> + +<p>ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module <code>HTML::Form</code>, from the <a +href="http://www.linpro.no/lwp/">libwww-perl</a> library, but the +interface is not the same. 
+ +<p>Simple example: + +<pre> + <span class="pykw">from</span> urllib2 <span class="pykw">import</span> urlopen + <span class="pykw">from</span> ClientForm <span class="pykw">import</span> ParseResponse + + forms = ParseResponse(urlopen(<span class="pystr">"http://www.example.com/form.html"</span>)) + form = forms[0] + <span class="pykw">print</span> form + form[<span class="pystr">"author"</span>] = <span class="pystr">"Gisle Aas"</span> + + <span class="pycmt"># form.click() returns a urllib2.Request object +</span> <span class="pycmt"># (see HTMLForm.click.__doc__ if you don't have urllib2) +</span> response = urlopen(form.click(<span class="pystr">"Thanks"</span>))</pre> + + +<p>A more complicated example: + +<pre> + <span class="pykw">import</span> ClientForm + <span class="pykw">import</span> urllib2 + request = urllib2.Request(<span class="pystr">"http://www.example.com/form.html"</span>) + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + <span class="pykw">print</span> form <span class="pycmt"># very useful!</span> + + <span class="pycmt"># Indexing allows setting and retrieval of control values +</span> original_text = form[<span class="pystr">"comments"</span>] <span class="pycmt"># a string, NOT a Control instance</span> + form[<span class="pystr">"comments"</span>] = <span class="pystr">"Blah."</span> + + <span class="pycmt"># Controls that represent lists (checkbox, select and radio lists) are +</span> <span class="pycmt"># ListControls. Their values are sequences of list item names. 
+</span> <span class="pycmt"># They come in two flavours: single- and multiple-selection: +</span> <span class="pykw">print</span> form.possible_items(<span class="pystr">"cheeses"</span>) + form[<span class="pystr">"favorite_cheese"</span>] = [<span class="pystr">"brie"</span>] <span class="pycmt"># single</span> + form[<span class="pystr">"cheeses"</span>] = [<span class="pystr">"parmesan"</span>, <span class="pystr">"leicester"</span>, <span class="pystr">"cheddar"</span>] <span class="pycmt"># multi</span> + <span class="pycmt"># is the "parmesan" item of the "cheeses" control selected? +</span> <span class="pykw">print</span> <span class="pystr">"parmesan"</span> <span class="pykw">in</span> form[<span class="pystr">"cheeses"</span>] + <span class="pycmt"># does cheeses control have a "caerphilly" item? +</span> <span class="pykw">print</span> <span class="pystr">"caerphilly"</span> <span class="pykw">in</span> form.possible_items(<span class="pystr">"cheeses"</span>) + + <span class="pycmt"># Sometimes one wants to set or clear individual items in a list: +</span> <span class="pycmt"># select the item named "gorgonzola" in the first control named "cheeses" +</span> form.set(True, <span class="pystr">"gorgonzola"</span>, <span class="pystr">"cheeses"</span>) + <span class="pycmt"># You can be more specific: supply at least one of name, type, kind, id +</span> <span class="pycmt"># and nr (most other methods on HTMLForm take the same form of arguments): +</span> <span class="pycmt"># deselect "edam" in third CHECKBOX control +</span> form.set(False, <span class="pystr">"edam"</span>, type=<span class="pystr">"checkbox"</span>, nr=2) + + <span class="pycmt"># You can explicitly say that you're referring to a ListControl: +</span> <span class="pycmt"># set whole value (rather than just one item of) "cheeses" ListControl +</span> form.set_value([<span class="pystr">"gouda"</span>], name=<span class="pystr">"cheeses"</span>, kind=<span class="pystr">"list"</span>) 
+ <span class="pycmt"># last example is almost equivalent to following (but insists that the +</span> <span class="pycmt"># control be a ListControl -- so it will skip any non-list controls that +</span> <span class="pycmt"># come before the control we want) +</span> form[<span class="pystr">"cheeses"</span>] = [<span class="pystr">"gouda"</span>] + <span class="pycmt"># The kind argument can also take values "multilist", "singlelist", "text", +</span> <span class="pycmt"># "clickable" and "file": +</span> <span class="pycmt"># find first control that will accept text, and scribble in it +</span> form.set_value(<span class="pystr">"rhubarb rhubarb"</span>, kind=<span class="pystr">"text"</span>) + form.set_value([<span class="pystr">""</span>], kind=<span class="pystr">"singlelist"</span>) + + <span class="pycmt"># Often, a single checkbox (a CHECKBOX control with a single item) is +</span> <span class="pycmt"># present. In that case, the name of the single item isn't of much +</span> <span class="pycmt"># interest, so it's useful to be able to check and uncheck the box +</span> <span class="pycmt"># without using the item name: +</span> form.set_single(True, <span class="pystr">"smelly"</span>) <span class="pycmt"># check</span> + form.set_single(False, <span class="pystr">"smelly"</span>) <span class="pycmt"># uncheck</span> + + <span class="pycmt"># Add files to FILE controls with .add_file(). Only call this multiple +</span> <span class="pycmt"># times if the server is expecting multiple files. 
+</span> <span class="pycmt"># add a file, default value for MIME type, no filename sent to server +</span> form.add_file(open(<span class="pystr">"data.dat"</span>)) + <span class="pycmt"># add a second file, explicitly giving MIME type, and telling the server +</span> <span class="pycmt"># what the filename is +</span> form.add_file(open(<span class="pystr">"data.txt"</span>), <span class="pystr">"text/plain"</span>, <span class="pystr">"data.txt"</span>) + + <span class="pycmt"># Many methods have a by_label argument, allowing specification of list +</span> <span class="pycmt"># items by label instead of by name. At the moment, only SelectControl +</span> <span class="pycmt"># supports this argument (this will be fixed). Sometimes labels are +</span> <span class="pycmt"># easier to maintain than names, sometimes the other way around. +</span> form.set_value([<span class="pystr">"Mozzarella"</span>, <span class="pystr">"Caerphilly"</span>], <span class="pystr">"cheeses"</span>, by_label=True) + + <span class="pycmt"># It's also possible to get at the individual controls inside the form. +</span> <span class="pycmt"># This is useful for calling several methods in a row on a single control, +</span> <span class="pycmt"># and for the less common operations. 
The methods are quite similar to +</span> <span class="pycmt"># those on HTMLForm: +</span> control = form.find_control(<span class="pystr">"cheeses"</span>, type=<span class="pystr">"select"</span>) + <span class="pykw">print</span> control.value, control.name, control.type + <span class="pykw">print</span> control.possible_items() + control.value = [<span class="pystr">"mascarpone"</span>, <span class="pystr">"curd"</span>] + control.set(True, <span class="pystr">"limburger"</span>) + + <span class="pycmt"># All Controls may be disabled (equivalent of greyed-out in browser) +</span> control = form.find_control(<span class="pystr">"comments"</span>) + <span class="pykw">print</span> control.disabled + <span class="pycmt"># ...or readonly +</span> <span class="pykw">print</span> control.readonly + <span class="pycmt"># readonly and disabled attributes can be assigned to +</span> control.disabled = False + <span class="pycmt"># convenience method, used here to make all controls writable (unless +</span> <span class="pycmt"># they're disabled): +</span> form.set_all_readonly(False) + <span class="pycmt"># ListControl items may also be disabled (setting a disabled item is not +</span> <span class="pycmt"># allowed, but clearing one is allowed): +</span> <span class="pykw">print</span> control.get_item_disabled(<span class="pystr">"emmenthal"</span>) + control.set_item_disabled(True, <span class="pystr">"emmenthal"</span>) + <span class="pycmt"># enable all items in control +</span> control.set_all_items_disabled(False) + + <span class="pycmt"># HTMLForm.controls is a list of all controls in the form +</span> <span class="pykw">for</span> control <span class="pykw">in</span> form.controls: + <span class="pykw">if</span> control.value == <span class="pystr">"inquisition"</span>: sys.exit() + + request2 = form.click() <span class="pycmt"># urllib2.Request object</span> + response2 = urllib2.urlopen(request2) + + <span class="pykw">print</span> response2.geturl() + <span 
class="pykw">print</span> response2.info() <span class="pycmt"># headers</span> + <span class="pykw">print</span> response2.read() <span class="pycmt"># body</span> + response2.close()</pre> + + +<p>All of the standard control types are supported: <code>TEXT</code>, +<code>PASSWORD</code>, <code>HIDDEN</code>, <code>TEXTAREA</code>, +<code>ISINDEX</code>, <code>RESET</code>, <code>BUTTON</code> (<code>INPUT +TYPE=BUTTON</code> and the various <code>BUTTON</code> types), +<code>SUBMIT</code>, <code>IMAGE</code>, <code>RADIO</code>, +<code>CHECKBOX</code>, <code>SELECT</code>/<code>OPTION</code> and +<code>FILE</code> (for file upload). Both standard form encodings +(<code>application/x-www-form-urlencoded</code> and +<code>multipart/form-data</code>) are supported. + +<p>The module is designed for testing and automation of web +interfaces, not for implementing interactive user agents. + +<p><strong><em>Security note</em>: Remember that any passwords you store in +<code>HTMLForm</code> instances will be saved to disk in the clear if you +pickle them (directly or indirectly). The simplest solution to this is to +avoid pickling <code>HTMLForm</code> objects. You could also pickle before +filling in any password, or just set the password to <code>""</code> before +pickling.</strong> + +<p>Python 1.5.2 or above is required. To run the tests, you need the +<code>unittest</code> module (from <a href="http://pyunit.sourceforge.net/">PyUnit</a>). +<code>unittest</code> is a standard library module with Python 2.1 and +above. + +<p>For full documentation, see the docstrings in ClientForm.py. + +<p><em><strong>Note: this page describes the 0.1.x interface. See <a +href="./src/README_0_0_15.html">here</a> for the old 0.0.x interface.</strong> +</em> + + +<a name="download"></a> +<h2>Download</h2> + +<p>For installation instructions, see the INSTALL file included in the +distribution. + +<p><em>Stable release.</em>. 
There have been many interface changes since +0.0.x, so I don't recommend upgrading old code from 0.0.x unless you want the +new features. + +<p>0.1.x includes <code>FILE</code> control support for file upload, handling +of disabled list items, and a redesigned interface. +<ul> +<li><a href="./src/ClientForm-0.1.17.tar.gz">ClientForm-0.1.17.tar.gz</a> +<li><a href="./src/ClientForm-0_1_17.zip">ClientForm-0_1_17.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + +<br> + +<p><em>Old release.</em> +<ul> +<li><a href="./src/ClientForm-0.0.16.tar.gz">ClientForm-0.0.16.tar.gz</a> +<li><a href="./src/ClientForm-0_0_16.zip">ClientForm-0_0_16.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + + +<a name="faq"></a> +<h2>FAQs</h2> +<ul> + <li>Doesn't the standard Python library module, <code>cgi</code>, do this? + <p>No: the <code>cgi</code> module does the server end of the job. It + doesn't know how to parse or fill in a form or how to send it back to the + server. + <li>Which version of Python do I need? + <p>1.5.2 or above. + <li>Is <code>urllib2</code> required? + <p>No. + <li>How do I use it without <code>urllib2</code>? + <p>Use <code>.click_request_data()</code> instead of <code>.click()</code>. + <li>Which <code>urllib2</code> do I need? + <p>You don't. It's convenient, though. If you have Python 2.0, you need to + upgrade to the version from Python 2.1 (available from <a + href="http://www.python.org/">www.python.org</a>). Alternatively, use the + 1.5.2-compatible version. If you have Python 1.5.2, use this <a + href="../bits/urllib2.py"><code>urllib2</code></a> and <a + href="../bits/urllib.py"><code>urllib</code></a>. Otherwise, you're OK. + <li>Which license? + <p>The <a href="http://www.opensource.org/licenses/bsd-license.php"> BSD + license</a> (included in distribution). 
+ + <li>Is XHTML supported? + <p>Yes, since 0.1.12. + <li>How do I figure out what control names and values to use? + <p><code>print form</code> is usually all you need. + <code>HTMLForm.possible_items</code> can be useful. Note that it's + possible to use item labels instead of item names, which can be useful + — use the <code>by_label</code> arguments to the various methods, + and the <code>.get_value_by_label()</code> / + <code>.set_value_by_label()</code> methods on <code>ListControl</code>. + Only <code>SelectControl</code> currently supports item labels (which + default to <code>OPTION</code> element contents). I might not bother to + fix this, since it seems it's probably only useful for <code>SELECT</code> + anyway. + <li>What do those <code>'*'</code> characters mean in the string + representations of list controls? + <p>A <code>*</code> next to an item means that item is selected. + <li>What do those parentheses (round brackets) mean in the string + representations of list controls? + <p>Parentheses <code>(foo)</code> around an item mean that item is disabled. + <li>Why doesn't <some control> turn up in the data returned by + <code>.click*()</code> when that control has non-<code>None</code> value? + <p>Either the control is disabled, or it is not successful for some other + reason. 'Successful' (see HTML 4 specification) means that the control + will cause data to get sent to the server. + <li>Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards for + <code>RADIO</code> and multiple-selection <code>SELECT</code> controls? + <p>Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items explicitly + selected in the HTML. Use the <code>select_default</code> argument to + <code>ParseResponse</code> if you want to follow the RFC 1866 rules + instead. Note that browser behaviour violates the HTML 4.01 specification + in the case of <code>RADIO</code> controls. 
+ <li>Why does <code>.click()</code>ing on a button not work for me? + <ul> + <li>Clicking on a <code>RESET</code> button doesn't do anything, by design + - this is a library for web automation, not an interactive browser. + Even in an interactive browser, clicking on <code>RESET</code> sends + nothing to the server, so there is little point in having + <code>.click()</code> do anything special here. + <li>Clicking on a <code>BUTTON TYPE=BUTTON</code> doesn't do anything + either, also by design. This time, the reason is that that + <code>BUTTON</code> is only in the HTML standard so that one can attach + callbacks to its events. The callbacks are functions in + <code>SCRIPT</code> elements (such as Javascript) embedded in the HTML, + and their execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these callbacks, so + it can't do anything useful with a click on a <code>BUTTON</code> whose + type is <code>BUTTON</code>. + <li>Generally, embedded script may be messing things up in all kinds of + ways. See the answer to the next question. + </ul> + <li>Embedded script is messing up my form filling. What do I do? + <p>See the <a href="../bits/GeneralFAQ.html">General FAQs</a> page for + what to do about this. +<!-- XXX example here --> + <li>I'm having trouble debugging my code. + <p>The <a href="../ClientCookie/">ClientCookie</a> package makes it + easy to get <code>.seek()</code>able response objects, which is + convenient for debugging. See also <a + href="../ClientCookie/doc.html#debugging">here</a> for few + relevant tips. Also see <a href="../bits/GeneralFAQ.html"> General + FAQs</a>. + <li>I have a control containing a list of integers. How do I select the one + whose value is nearest to the one I want? 
+<p><pre> + <span class="pykw">import</span> bisect + <span class="pykw">def</span> closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + <span class="pykw">return</span> str(values[bisect.bisect(values, value) - 1]) + + form[<span class="pystr">"distance"</span>] = [closest_int_value(form, <span class="pystr">"distance"</span>, 23)]</pre> + + </li> + <li>Where can I find out more about the HTML and HTTP standards? + <ul> + <li>W3C <a href="http://www.w3.org/TR/html401/">HTML 4.01 + Specification</a>. + <li><a href="http://www.ietf.org/rfc/rfc1866.txt">RFC 1866</a> - + the HTML 2.0 standard. + <li><a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a> - + Form-based file upload. + <li><a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a> - + HTTP 1.1 Specification. + </ul> +</ul> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, January 2005. + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie/">ClientCookie</a><br> +<span class="thispage">ClientForm</span><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable/">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<a href="../bits/GeneralFAQ.html">General FAQs</a><br> +<a href="../bits/urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="../bits/urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +<a href="../#other">Other stuff</a><br> + +<br> + +<a href="./#download">Download</a><br> +<a href="./#faq">FAQs</a><br> + +</div> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html.in b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html.in new file mode 100644 index 0000000000000000000000000000000000000000..a2d1112108810556d01e7f475a9fbc9b000f751a --- /dev/null +++ 
b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.html.in @@ -0,0 +1,365 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="form,HTML,Python,web,client,client-side"> + <title>ClientForm</title> + <style type="text/css" media="screen">@@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/ClientForm/"> +</head> +<body> + +@# This file is processed by EmPy to colorize Python source code +@# http://wwwsearch.sf.net/bits/colorize.py +@{from colorize import colorize} + +<div id=sf><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> + +<h1>ClientForm</h1> + +<div id="Content"> + +<p>ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module <code>HTML::Form</code>, from the <a +href="http://www.linpro.no/lwp/">libwww-perl</a> library, but the +interface is not the same. 
+ +<p>Simple example: + +@{colorize(r""" + from urllib2 import urlopen + from ClientForm import ParseResponse + + forms = ParseResponse(urlopen("http://www.example.com/form.html")) + form = forms[0] + print form + form["author"] = "Gisle Aas" + + # form.click() returns a urllib2.Request object + # (see HTMLForm.click.__doc__ if you don't have urllib2) + response = urlopen(form.click("Thanks")) +""")} + +<p>A more complicated example: + +@{colorize(r""" + import ClientForm + import urllib2 + request = urllib2.Request("http://www.example.com/form.html") + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + print form # very useful! + + # Indexing allows setting and retrieval of control values + original_text = form["comments"] # a string, NOT a Control instance + form["comments"] = "Blah." + + # Controls that represent lists (checkbox, select and radio lists) are + # ListControls. Their values are sequences of list item names. + # They come in two flavours: single- and multiple-selection: + print form.possible_items("cheeses") + form["favorite_cheese"] = ["brie"] # single + form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi + # is the "parmesan" item of the "cheeses" control selected? + print "parmesan" in form["cheeses"] + # does cheeses control have a "caerphilly" item? 
+ print "caerphilly" in form.possible_items("cheeses") + + # Sometimes one wants to set or clear individual items in a list: + # select the item named "gorgonzola" in the first control named "cheeses" + form.set(True, "gorgonzola", "cheeses") + # You can be more specific: supply at least one of name, type, kind, id + # and nr (most other methods on HTMLForm take the same form of arguments): + # deselect "edam" in third CHECKBOX control + form.set(False, "edam", type="checkbox", nr=2) + + # You can explicitly say that you're referring to a ListControl: + # set whole value (rather than just one item of) "cheeses" ListControl + form.set_value(["gouda"], name="cheeses", kind="list") + # last example is almost equivalent to following (but insists that the + # control be a ListControl -- so it will skip any non-list controls that + # come before the control we want) + form["cheeses"] = ["gouda"] + # The kind argument can also take values "multilist", "singlelist", "text", + # "clickable" and "file": + # find first control that will accept text, and scribble in it + form.set_value("rhubarb rhubarb", kind="text") + form.set_value([""], kind="singlelist") + + # Often, a single checkbox (a CHECKBOX control with a single item) is + # present. In that case, the name of the single item isn't of much + # interest, so it's useful to be able to check and uncheck the box + # without using the item name: + form.set_single(True, "smelly") # check + form.set_single(False, "smelly") # uncheck + + # Add files to FILE controls with .add_file(). Only call this multiple + # times if the server is expecting multiple files. 
+ # add a file, default value for MIME type, no filename sent to server + form.add_file(open("data.dat")) + # add a second file, explicitly giving MIME type, and telling the server + # what the filename is + form.add_file(open("data.txt"), "text/plain", "data.txt") + + # Many methods have a by_label argument, allowing specification of list + # items by label instead of by name. At the moment, only SelectControl + # supports this argument (this will be fixed). Sometimes labels are + # easier to maintain than names, sometimes the other way around. + form.set_value(["Mozzarella", "Caerphilly"], "cheeses", by_label=True) + + # It's also possible to get at the individual controls inside the form. + # This is useful for calling several methods in a row on a single control, + # and for the less common operations. The methods are quite similar to + # those on HTMLForm: + control = form.find_control("cheeses", type="select") + print control.value, control.name, control.type + print control.possible_items() + control.value = ["mascarpone", "curd"] + control.set(True, "limburger") + + # All Controls may be disabled (equivalent of greyed-out in browser) + control = form.find_control("comments") + print control.disabled + # ...or readonly + print control.readonly + # readonly and disabled attributes can be assigned to + control.disabled = False + # convenience method, used here to make all controls writable (unless + # they're disabled): + form.set_all_readonly(False) + # ListControl items may also be disabled (setting a disabled item is not + # allowed, but clearing one is allowed): + print control.get_item_disabled("emmenthal") + control.set_item_disabled(True, "emmenthal") + # enable all items in control + control.set_all_items_disabled(False) + + # HTMLForm.controls is a list of all controls in the form + for control in form.controls: + if control.value == "inquisition": sys.exit() + + request2 = form.click() # urllib2.Request object + response2 = urllib2.urlopen(request2) 
+ + print response2.geturl() + print response2.info() # headers + print response2.read() # body + response2.close() +""")} + +<p>All of the standard control types are supported: <code>TEXT</code>, +<code>PASSWORD</code>, <code>HIDDEN</code>, <code>TEXTAREA</code>, +<code>ISINDEX</code>, <code>RESET</code>, <code>BUTTON</code> (<code>INPUT +TYPE=BUTTON</code> and the various <code>BUTTON</code> types), +<code>SUBMIT</code>, <code>IMAGE</code>, <code>RADIO</code>, +<code>CHECKBOX</code>, <code>SELECT</code>/<code>OPTION</code> and +<code>FILE</code> (for file upload). Both standard form encodings +(<code>application/x-www-form-urlencoded</code> and +<code>multipart/form-data</code>) are supported. + +<p>The module is designed for testing and automation of web +interfaces, not for implementing interactive user agents. + +<p><strong><em>Security note</em>: Remember that any passwords you store in +<code>HTMLForm</code> instances will be saved to disk in the clear if you +pickle them (directly or indirectly). The simplest solution to this is to +avoid pickling <code>HTMLForm</code> objects. You could also pickle before +filling in any password, or just set the password to <code>""</code> before +pickling.</strong> + +<p>Python 1.5.2 or above is required. To run the tests, you need the +<code>unittest</code> module (from <a href="http://pyunit.sourceforge.net/">PyUnit</a>). +<code>unittest</code> is a standard library module with Python 2.1 and +above. + +<p>For full documentation, see the docstrings in ClientForm.py. + +<p><em><strong>Note: this page describes the 0.1.x interface. See <a +href="./src/README_0_0_15.html">here</a> for the old 0.0.x interface.</strong> +</em> + + +<a name="download"></a> +<h2>Download</h2> + +<p>For installation instructions, see the INSTALL file included in the +distribution. + +<p><em>Stable release.</em>. 
There have been many interface changes since +0.0.x, so I don't recommend upgrading old code from 0.0.x unless you want the +new features. + +<p>0.1.x includes <code>FILE</code> control support for file upload, handling +of disabled list items, and a redesigned interface. +<ul> +<li><a href="./src/ClientForm-0.1.17.tar.gz">ClientForm-0.1.17.tar.gz</a> +<li><a href="./src/ClientForm-0_1_17.zip">ClientForm-0_1_17.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + +<br> + +<p><em>Old release.</em> +<ul> +<li><a href="./src/ClientForm-0.0.16.tar.gz">ClientForm-0.0.16.tar.gz</a> +<li><a href="./src/ClientForm-0_0_16.zip">ClientForm-0_0_16.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + + +<a name="faq"></a> +<h2>FAQs</h2> +<ul> + <li>Doesn't the standard Python library module, <code>cgi</code>, do this? + <p>No: the <code>cgi</code> module does the server end of the job. It + doesn't know how to parse or fill in a form or how to send it back to the + server. + <li>Which version of Python do I need? + <p>1.5.2 or above. + <li>Is <code>urllib2</code> required? + <p>No. + <li>How do I use it without <code>urllib2</code>? + <p>Use <code>.click_request_data()</code> instead of <code>.click()</code>. + <li>Which <code>urllib2</code> do I need? + <p>You don't. It's convenient, though. If you have Python 2.0, you need to + upgrade to the version from Python 2.1 (available from <a + href="http://www.python.org/">www.python.org</a>). Alternatively, use the + 1.5.2-compatible version. If you have Python 1.5.2, use this <a + href="../bits/urllib2.py"><code>urllib2</code></a> and <a + href="../bits/urllib.py"><code>urllib</code></a>. Otherwise, you're OK. + <li>Which license? + <p>The <a href="http://www.opensource.org/licenses/bsd-license.php"> BSD + license</a> (included in distribution). 
+ + <li>Is XHTML supported? + <p>Yes, since 0.1.12. + <li>How do I figure out what control names and values to use? + <p><code>print form</code> is usually all you need. + <code>HTMLForm.possible_items</code> can be useful. Note that it's + possible to use item labels instead of item names, which can be useful + — use the <code>by_label</code> arguments to the various methods, + and the <code>.get_value_by_label()</code> / + <code>.set_value_by_label()</code> methods on <code>ListControl</code>. + Only <code>SelectControl</code> currently supports item labels (which + default to <code>OPTION</code> element contents). I might not bother to + fix this, since it seems it's probably only useful for <code>SELECT</code> + anyway. + <li>What do those <code>'*'</code> characters mean in the string + representations of list controls? + <p>A <code>*</code> next to an item means that item is selected. + <li>What do those parentheses (round brackets) mean in the string + representations of list controls? + <p>Parentheses <code>(foo)</code> around an item mean that item is disabled. + <li>Why doesn't <some control> turn up in the data returned by + <code>.click*()</code> when that control has non-<code>None</code> value? + <p>Either the control is disabled, or it is not successful for some other + reason. 'Successful' (see HTML 4 specification) means that the control + will cause data to get sent to the server. + <li>Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards for + <code>RADIO</code> and multiple-selection <code>SELECT</code> controls? + <p>Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items explicitly + selected in the HTML. Use the <code>select_default</code> argument to + <code>ParseResponse</code> if you want to follow the RFC 1866 rules + instead. Note that browser behaviour violates the HTML 4.01 specification + in the case of <code>RADIO</code> controls. 
+ <li>Why does <code>.click()</code>ing on a button not work for me? + <ul> + <li>Clicking on a <code>RESET</code> button doesn't do anything, by design + - this is a library for web automation, not an interactive browser. + Even in an interactive browser, clicking on <code>RESET</code> sends + nothing to the server, so there is little point in having + <code>.click()</code> do anything special here. + <li>Clicking on a <code>BUTTON TYPE=BUTTON</code> doesn't do anything + either, also by design. This time, the reason is that that + <code>BUTTON</code> is only in the HTML standard so that one can attach + callbacks to its events. The callbacks are functions in + <code>SCRIPT</code> elements (such as Javascript) embedded in the HTML, + and their execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these callbacks, so + it can't do anything useful with a click on a <code>BUTTON</code> whose + type is <code>BUTTON</code>. + <li>Generally, embedded script may be messing things up in all kinds of + ways. See the answer to the next question. + </ul> + <li>Embedded script is messing up my form filling. What do I do? + <p>See the <a href="../bits/GeneralFAQ.html">General FAQs</a> page for + what to do about this. +<!-- XXX example here --> + <li>I'm having trouble debugging my code. + <p>The <a href="../ClientCookie/">ClientCookie</a> package makes it + easy to get <code>.seek()</code>able response objects, which is + convenient for debugging. See also <a + href="../ClientCookie/doc.html#debugging">here</a> for few + relevant tips. Also see <a href="../bits/GeneralFAQ.html"> General + FAQs</a>. + <li>I have a control containing a list of integers. How do I select the one + whose value is nearest to the one I want? 
+<p>@{colorize(r""" + import bisect + def closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + return str(values[bisect.bisect(values, value) - 1]) + + form["distance"] = [closest_int_value(form, "distance", 23)] +""")} + </li> + <li>Where can I find out more about the HTML and HTTP standards? + <ul> + <li>W3C <a href="http://www.w3.org/TR/html401/">HTML 4.01 + Specification</a>. + <li><a href="http://www.ietf.org/rfc/rfc1866.txt">RFC 1866</a> - + the HTML 2.0 standard. + <li><a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a> - + Form-based file upload. + <li><a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a> - + HTTP 1.1 Specification. + </ul> +</ul> + +<p><a href="mailto:jjl@@pobox.com">John J. Lee</a>, January 2005. + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie/">ClientCookie</a><br> +<span class="thispage">ClientForm</span><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable/">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<a href="../bits/GeneralFAQ.html">General FAQs</a><br> +<a href="../bits/urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="../bits/urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +<a href="../#other">Other stuff</a><br> + +<br> + +<a href="./#download">Download</a><br> +<a href="./#faq">FAQs</a><br> + +</div> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.txt b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..737fb3f2a919d32a0022478532043ff07892be01 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/README.txt @@ -0,0 +1,320 @@ + [1]SourceForge.net Logo + + ClientForm + + ClientForm is a Python module for handling HTML forms on 
the client + side, useful for parsing HTML forms, filling them in and returning the + completed forms to the server. It developed from a port of Gisle Aas' + Perl module HTML::Form, from the [2]libwww-perl library, but the + interface is not the same. + + Simple example: + from urllib2 import urlopen + from ClientForm import ParseResponse + + forms = ParseResponse(urlopen("http://www.example.com/form.html")) + form = forms[0] + print form + form["author"] = "Gisle Aas" + + # form.click() returns a urllib2.Request object + # (see HTMLForm.click.__doc__ if you don't have urllib2) + response = urlopen(form.click("Thanks")) + + A more complicated example: + import ClientForm + import urllib2 + request = urllib2.Request("http://www.example.com/form.html") + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + print form # very useful! + + # Indexing allows setting and retrieval of control values + original_text = form["comments"] # a string, NOT a Control instance + form["comments"] = "Blah." + + # Controls that represent lists (checkbox, select and radio lists) are + # ListControls. Their values are sequences of list item names. + # They come in two flavours: single- and multiple-selection: + print form.possible_items("cheeses") + form["favorite_cheese"] = ["brie"] # single + form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi + # is the "parmesan" item of the "cheeses" control selected? + print "parmesan" in form["cheeses"] + # does cheeses control have a "caerphilly" item? 
+ print "caerphilly" in form.possible_items("cheeses") + + # Sometimes one wants to set or clear individual items in a list: + # select the item named "gorgonzola" in the first control named "cheeses" + form.set(True, "gorgonzola", "cheeses") + # You can be more specific: supply at least one of name, type, kind, id + # and nr (most other methods on HTMLForm take the same form of arguments): + # deselect "edam" in third CHECKBOX control + form.set(False, "edam", type="checkbox", nr=2) + + # You can explicitly say that you're referring to a ListControl: + # set whole value (rather than just one item of) "cheeses" ListControl + form.set_value(["gouda"], name="cheeses", kind="list") + # last example is almost equivalent to following (but insists that the + # control be a ListControl -- so it will skip any non-list controls that + # come before the control we want) + form["cheeses"] = ["gouda"] + # The kind argument can also take values "multilist", "singlelist", "text", + # "clickable" and "file": + # find first control that will accept text, and scribble in it + form.set_value("rhubarb rhubarb", kind="text") + form.set_value([""], kind="singlelist") + + # Often, a single checkbox (a CHECKBOX control with a single item) is + # present. In that case, the name of the single item isn't of much + # interest, so it's useful to be able to check and uncheck the box + # without using the item name: + form.set_single(True, "smelly") # check + form.set_single(False, "smelly") # uncheck + + # Add files to FILE controls with .add_file(). Only call this multiple + # times if the server is expecting multiple files. 
+ # add a file, default value for MIME type, no filename sent to server + form.add_file(open("data.dat")) + # add a second file, explicitly giving MIME type, and telling the server + # what the filename is + form.add_file(open("data.txt"), "text/plain", "data.txt") + + # Many methods have a by_label argument, allowing specification of list + # items by label instead of by name. At the moment, only SelectControl + # supports this argument (this will be fixed). Sometimes labels are + # easier to maintain than names, sometimes the other way around. + form.set_value(["Mozzarella", "Caerphilly"], "cheeses", by_label=True) + + # It's also possible to get at the individual controls inside the form. + # This is useful for calling several methods in a row on a single control, + # and for the less common operations. The methods are quite similar to + # those on HTMLForm: + control = form.find_control("cheeses", type="select") + print control.value, control.name, control.type + print control.possible_items() + control.value = ["mascarpone", "curd"] + control.set(True, "limburger") + + # All Controls may be disabled (equivalent of greyed-out in browser) + control = form.find_control("comments") + print control.disabled + # ...or readonly + print control.readonly + # readonly and disabled attributes can be assigned to + control.disabled = False + # convenience method, used here to make all controls writable (unless + # they're disabled): + form.set_all_readonly(False) + # ListControl items may also be disabled (setting a disabled item is not + # allowed, but clearing one is allowed): + print control.get_item_disabled("emmenthal") + control.set_item_disabled(True, "emmenthal") + # enable all items in control + control.set_all_items_disabled(False) + + # HTMLForm.controls is a list of all controls in the form + for control in form.controls: + if control.value == "inquisition": sys.exit() + + request2 = form.click() # urllib2.Request object + response2 = urllib2.urlopen(request2) 
+ + print response2.geturl() + print response2.info() # headers + print response2.read() # body + response2.close() + + All of the standard control types are supported: TEXT, PASSWORD, + HIDDEN, TEXTAREA, ISINDEX, RESET, BUTTON (INPUT TYPE=BUTTON and the + various BUTTON types), SUBMIT, IMAGE, RADIO, CHECKBOX, SELECT/OPTION + and FILE (for file upload). Both standard form encodings + (application/x-www-form-urlencoded and multipart/form-data) are + supported. + + The module is designed for testing and automation of web interfaces, + not for implementing interactive user agents. + + Security note: Remember that any passwords you store in HTMLForm + instances will be saved to disk in the clear if you pickle them + (directly or indirectly). The simplest solution to this is to avoid + pickling HTMLForm objects. You could also pickle before filling in any + password, or just set the password to "" before pickling. + + Python 1.5.2 or above is required. To run the tests, you need the + unittest module (from [3]PyUnit). unittest is a standard library + module with Python 2.1 and above. + + For full documentation, see the docstrings in ClientForm.py. + + Note: this page describes the 0.1.x interface. See [4]here for the old + 0.0.x interface. + +Download + + For installation instructions, see the INSTALL file included in the + distribution. + + Stable release.. There have been many interface changes since 0.0.x, + so I don't recommend upgrading old code from 0.0.x unless you want the + new features. + + 0.1.x includes FILE control support for file upload, handling of + disabled list items, and a redesigned interface. + * [5]ClientForm-0.1.17.tar.gz + * [6]ClientForm-0_1_17.zip + * [7]Change Log (included in distribution) + * [8]Older versions. + + Old release. + * [9]ClientForm-0.0.16.tar.gz + * [10]ClientForm-0_0_16.zip + * [11]Change Log (included in distribution) + * [12]Older versions. + +FAQs + + * Doesn't the standard Python library module, cgi, do this? 
+ No: the cgi module does the server end of the job. It doesn't know + how to parse or fill in a form or how to send it back to the + server. + * Which version of Python do I need? + 1.5.2 or above. + * Is urllib2 required? + No. + * How do I use it without urllib2? + Use .click_request_data() instead of .click(). + * Which urllib2 do I need? + You don't. It's convenient, though. If you have Python 2.0, you + need to upgrade to the version from Python 2.1 (available from + [13]www.python.org). Alternatively, use the 1.5.2-compatible + version. If you have Python 1.5.2, use this [14]urllib2 and + [15]urllib. Otherwise, you're OK. + * Which license? + The [16]BSD license (included in distribution). + * Is XHTML supported? + Yes, since 0.1.12. + * How do I figure out what control names and values to use? + print form is usually all you need. HTMLForm.possible_items can be + useful. Note that it's possible to use item labels instead of item + names, which can be useful -- use the by_label arguments to the + various methods, and the .get_value_by_label() / + .set_value_by_label() methods on ListControl. Only SelectControl + currently supports item labels (which default to OPTION element + contents). I might not bother to fix this, since it seems it's + probably only useful for SELECT anyway. + * What do those '*' characters mean in the string representations of + list controls? + A * next to an item means that item is selected. + * What do those parentheses (round brackets) mean in the string + representations of list controls? + Parentheses (foo) around an item mean that item is disabled. + * Why doesn't <some control> turn up in the data returned by + .click*() when that control has non-None value? + Either the control is disabled, or it is not successful for some + other reason. 'Successful' (see HTML 4 specification) means that + the control will cause data to get sent to the server. 
+ * Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards + for RADIO and multiple-selection SELECT controls? + Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items + explicitly selected in the HTML. Use the select_default argument + to ParseResponse if you want to follow the RFC 1866 rules instead. + Note that browser behaviour violates the HTML 4.01 specification + in the case of RADIO controls. + * Why does .click()ing on a button not work for me? + + Clicking on a RESET button doesn't do anything, by design - + this is a library for web automation, not an interactive + browser. Even in an interactive browser, clicking on RESET + sends nothing to the server, so there is little point in + having .click() do anything special here. + + Clicking on a BUTTON TYPE=BUTTON doesn't do anything either, + also by design. This time, the reason is that that BUTTON is + only in the HTML standard so that one can attach callbacks to + its events. The callbacks are functions in SCRIPT elements + (such as Javascript) embedded in the HTML, and their + execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these + callbacks, so it can't do anything useful with a click on a + BUTTON whose type is BUTTON. + + Generally, embedded script may be messing things up in all + kinds of ways. See the answer to the next question. + * Embedded script is messing up my form filling. What do I do? + See the [17]General FAQs page for what to do about this. + * I'm having trouble debugging my code. + The [18]ClientCookie package makes it easy to get .seek()able + response objects, which is convenient for debugging. See also + [19]here for few relevant tips. Also see [20]General FAQs. + * I have a control containing a list of integers. How do I select + the one whose value is nearest to the one I want? 
+ import bisect + def closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + return str(values[bisect.bisect(values, value) - 1]) + + form["distance"] = [closest_int_value(form, "distance", 23)] + * Where can I find out more about the HTML and HTTP standards? + + W3C [21]HTML 4.01 Specification. + + [22]RFC 1866 - the HTML 2.0 standard. + + [23]RFC 1867 - Form-based file upload. + + [24]RFC 2616 - HTTP 1.1 Specification. + + [25]John J. Lee, January 2005. + + [26]Home + [27]ClientCookie + ClientForm + [28]DOMForm + [29]python-spidermonkey + [30]ClientTable + [31]mechanize + [32]pullparser + [33]General FAQs + [34]1.5.2 urllib2.py + [35]1.5.2 urllib.py + [36]Other stuff + [37]Download + [38]FAQs + +References + + 1. http://sourceforge.net/ + 2. http://www.linpro.no/lwp/ + 3. http://pyunit.sourceforge.net/ + 4. http://wwwsearch.sourceforge.net/ClientForm/src/README_0_0_15.html + 5. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.1.17.tar.gz + 6. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0_1_17.zip + 7. http://wwwsearch.sourceforge.net/ClientForm/src/ChangeLog.txt + 8. http://wwwsearch.sourceforge.net/ClientForm/src/ + 9. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.0.16.tar.gz + 10. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0_0_16.zip + 11. http://wwwsearch.sourceforge.net/ClientForm/src/ChangeLog.txt + 12. http://wwwsearch.sourceforge.net/ClientForm/src/ + 13. http://www.python.org/ + 14. http://wwwsearch.sourceforge.net/bits/urllib2.py + 15. http://wwwsearch.sourceforge.net/bits/urllib.py + 16. http://www.opensource.org/licenses/bsd-license.php + 17. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 18. http://wwwsearch.sourceforge.net/ClientCookie/ + 19. http://wwwsearch.sourceforge.net/ClientCookie/doc.html#debugging + 20. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 21. http://www.w3.org/TR/html401/ + 22. 
http://www.ietf.org/rfc/rfc1866.txt + 23. http://www.ietf.org/rfc/rfc1867.txt + 24. http://www.ietf.org/rfc/rfc2616.txt + 25. mailto:jjl@pobox.com + 26. http://wwwsearch.sourceforge.net/ + 27. http://wwwsearch.sourceforge.net/ClientCookie/ + 28. http://wwwsearch.sourceforge.net/DOMForm/ + 29. http://wwwsearch.sourceforge.net/python-spidermonkey/ + 30. http://wwwsearch.sourceforge.net/ClientTable/ + 31. http://wwwsearch.sourceforge.net/mechanize/ + 32. http://wwwsearch.sourceforge.net/pullparser/ + 33. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 34. http://wwwsearch.sourceforge.net/bits/urllib2_152.py + 35. http://wwwsearch.sourceforge.net/bits/urllib_152.py + 36. http://wwwsearch.sourceforge.net/#other + 37. http://wwwsearch.sourceforge.net/ClientForm/#download + 38. http://wwwsearch.sourceforge.net/ClientForm/#faq diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/setup.py b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..eb4101af4b5dc5ef0ac3bcf5e8c3a6c4f122c282 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/setup.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +"""Client-side HTML form handling. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. 
+""" + +from ClientForm import VERSION +NAME = "ClientForm" +PACKAGE = False +LICENSE = "BSD" +PLATFORMS = ["any"] +CLASSIFIERS = """\ +Development Status :: 5 - Production/Stable +Intended Audience :: Developers +Intended Audience :: System Administrators +License :: OSI Approved :: BSD License +Natural Language :: English +Operating System :: OS Independent +Programming Language :: Python +Topic :: Internet +Topic :: Internet :: WWW/HTTP +Topic :: Internet :: WWW/HTTP :: Site Management +Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Topic :: Software Development :: Libraries +Topic :: Software Development :: Libraries :: Python Modules +Topic :: Software Development :: Testing +Topic :: Software Development :: Testing :: Traffic Generation +Topic :: System :: Networking :: Monitoring +Topic :: System :: Systems Administration +Topic :: Text Processing +Topic :: Text Processing :: Markup +Topic :: Text Processing :: Markup :: HTML +Topic :: Text Processing :: Markup :: XML +""" + +#------------------------------------------------------- +# the rest is constant for most of my released packages: + +import sys, string +from distutils.core import setup + +_setup = setup +def setup(**kwargs): + if not hasattr(sys, "version_info") or sys.version_info < (2, 3): + # Python version compatibility + # XXX probably download_url came in earlier than 2.3 + for key in ["classifiers", "download_url"]: + if kwargs.has_key(key): + del kwargs[key] + # Only want packages keyword if this is a package, + # only want py_modules keyword if this is a single-file module, + # so get rid of packages or py_modules keyword as appropriate. 
+ if kwargs["packages"] is None: + del kwargs["packages"] + else: + del kwargs["py_modules"] + apply(_setup, (), kwargs) + +if PACKAGE: + packages = [NAME] + py_modules = None +else: + py_modules = [NAME] + packages = None + +doclines = string.split(__doc__, "\n") + +setup(name = NAME, + version = VERSION, + license = LICENSE, + platforms = PLATFORMS, + classifiers = filter(None, string.split(CLASSIFIERS, "\n")), + author = "John J. Lee", + author_email = "jjl@pobox.com", + description = doclines[0], + url = "http://wwwsearch.sourceforge.net/%s/" % NAME, + download_url = ("http://wwwsearch.sourceforge.net/%s/src/" + "%s-%s.tar.gz" % (NAME, NAME, VERSION)), + long_description = string.join(doclines[2:], "\n"), + py_modules = py_modules, + packages = packages, + ) diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/test.py b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/test.py new file mode 100755 index 0000000000000000000000000000000000000000..e88812cad16942c528984b45a28cb500764e119b --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/test.py @@ -0,0 +1,1949 @@ +#!/usr/bin/env python + +import unittest, string +from unittest import TestCase +from cStringIO import StringIO + +import ClientForm +from ClientForm import ControlNotFoundError, ItemNotFoundError, \ + ItemCountError, ParseError + +# XXX +# Base control tests on ParseFile, so can use same tests for DOMForm and +# ClientForm. That wouldn't be unit testing exactly, but saner than the +# current situation with massive duplication of tests between the two +# modules. 
+# HTMLForm.enctype +# XHTML + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +class LWPFormTests(TestCase): + """The original tests from libwww-perl 5.64.""" + def testEmptyParse(self): + forms = ClientForm.ParseFile(StringIO(""), "http://localhost") + self.assert_(len(forms) == 0) + + def _forms(self): + file = StringIO("""<form action="abc"> + + <input name="firstname" value="Gisle"> + + </form> + + """) + return ClientForm.ParseFile(file, "http://localhost/") + + def testParse(self): + forms = self._forms() + self.assert_(len(forms) == 1) + self.assert_(forms[0]["firstname"] == "Gisle") + + def testFillForm(self): + forms = self._forms() + form = forms[0] + form["firstname"] = "Gisle Aas" + req = form.click() + def request_method(req): + if req.has_data(): + return "POST" + else: + return "GET" + self.assert_(request_method(req) == "GET") + self.assert_(req.get_full_url() == "http://localhost/abc?firstname=Gisle+Aas") + + +class ParseTests(TestCase): + def test_parse_error(self): + f = StringIO( +"""<form action="abc"> +<option> +</form> +""") + base_uri = "http://localhost/" + try: + ClientForm.ParseFile(f, base_uri) + except ClientForm.ParseError, e: + self.assert_(e.base_uri == base_uri) + else: + self.assert_(0) + + def test_base_uri(self): + # BASE element takes priority over document URI + file = StringIO( +"""<base HREF="http://example.com"> +<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.action == "http://example.com/abc") + + file = StringIO( +"""<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.action == "http://localhost/abc") + + def testTextarea(self): + file = StringIO( +"""<form action="abc"> + +<input name="firstname" 
value="Gisle"> +<textarea>blah, blah, +Rhubarb. + +</textarea> + +<textarea></textarea> + +<textarea name=""ta"" id="foo&bar">Hello testers & users!</textarea> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.name is None) + control = form.find_control(type="textarea") + self.assert_(control.name is None) + self.assert_(control.value == "blah, blah,\nRhubarb.\n\n") + + empty_control = form.find_control(type="textarea", nr=1) + self.assert_(str(empty_control) == "<TextareaControl(<None>=)>") + self.assert_(empty_control.value == "") + + entity_ctl = form.find_control(type="textarea", nr=2) + self.assert_(entity_ctl.name == '"ta"') + self.assertEqual(entity_ctl.attrs["id"], "foo&bar") + + self.assert_(entity_ctl.value == "Hello testers & users!") + + def testSelect(self): + file = StringIO( +"""<form action="abc"> + +<select name="foo"> + <option>Hello testers & users!</option> + <option></option><option></option> +</select> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + self.assert_(len(forms) == 1) + form = forms[0] + + entity_ctl = form.find_control(type="select") + self.assert_(entity_ctl.name == "foo") + self.assert_(entity_ctl.value[0] == "Hello testers & users!") + opt = entity_ctl.get_item_attrs("Hello testers & users!") + self.assert_(opt["value"] == opt["label"] == opt["contents"] == + "Hello testers & users!") + + def testButton(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="text" value="cow" name="moo"> + +<button name="b">blah, blah, +Rhubarb.</button> + +<button type="reset" name="b2"></button> +<button type="button" name="b3"></button> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.name == "myform") + control = form.find_control(name="b") + self.assert_(control.type == "submitbutton") + self.assert_(control.value == "") + 
self.assert_(form.find_control("b2").type == "resetbutton") + self.assert_(form.find_control("b3").type == "buttonbutton") + pairs = form.click_pairs() + self.assert_(pairs == [("moo", "cow"), ("b", "")]) + + def testIsindex(self): + file = StringIO( +"""<form action="abc"> + +<isindex prompt=">>>"> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control = form.find_control(type="isindex") + self.assert_(control.type == "isindex") + self.assert_(control.name is None) + self.assert_(control.value == "") + control.value = "some stuff" + self.assert_(form.click_pairs() == []) + self.assert_(form.click_request_data() == + ("http://localhost/abc?some+stuff", None, [])) + self.assert_(form.click().get_full_url() == + "http://localhost/abc?some+stuff") + + def testEmptySelect(self): + file = StringIO( +"""<form action="abc"> +<select name="foo"></select> + +<select name="bar" multiple></select> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control0 = form.find_control(type="select", nr=0) + control1 = form.find_control(type="select", nr=1) + self.assert_(str(control0) == "<SelectControl(foo=[])>") + self.assert_(str(control1) == "<SelectControl(bar=[])>") + form.set_value([], "foo") + self.assertRaises(ItemNotFoundError, form.set_value, ["oops"], "foo") + self.assert_(form.click_pairs() == []) + +# XXX figure out what to do in these sorts of cases +## def badSelect(self): +## # what objects should these generate, if any? +## # what should happen on submission of these? +## # what about similar checkboxes and radios? 
+## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ +## <select name="foo"> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select name="foo" multiple> +## <option>4</option> +## <option>5</option> +## <option>6</option> +## </select> +## """ + +## """<form action="abc" name="myform"> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## def testBadCheckbox(self): +## # see comments above +## # split checkbox -- is it one control, or two? 
+ +## """ +## <html> + +## <input type=checkbox name=foo value=bar> +## <input type=checkbox name=foo value=bar> + +## <select> +## <option>1</option> +## <option>2</option> +## </select> + +## <input type=checkbox name=foo value=baz> +## <input type=checkbox name=foo value=bar> + +## </html> +## """ + + def testUnnamedControl(self): + file = StringIO(""" +<form action="./weird.html"> + +<input type="checkbox" value="foo"></input> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.controls[0].name is None) + + def testNamelessListControls(self): + # XXX SELECT + # these controls have no item names + file = StringIO("""<form action="./weird.html"> + +<input type="checkbox" name="foo"></input> + +<input type="radio" name="bar"></input> + +<!-- +<select name="baz"> + <option></option> +</select> + +<select name="baz" multiple> + <option></option> +</select> +--> + +<input type="submit" name="submit"> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.possible_items("foo") == ["on"]) + self.assert_(form.possible_items("bar") == ["on"]) + #self.assert_(form.possible_items("baz") == []) + self.assert_(form["foo"] == []) + self.assert_(form["bar"] == []) + #self.assert_(form["baz"] == []) + form["foo"] = ["on"] + form["bar"] = ["on"] + pairs = form.click_pairs() + self.assert_(pairs == [("foo", "on"), ("bar", "on"), ("submit", "")]) + + def testBadSingleSelect(self): + # HTML 4.01 section 17.6.1: single selection SELECT controls shouldn't + # have > 1 item selected, but if they do, not more than one should end + # up selected. 
+ file = StringIO("""<form action="./bad.html"> + +<select name="spam"> + <option selected>1</option> + <option selected>2</option> +</select> + +<input type="submit" name="submit"> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.possible_items("spam") == ["1", "2"]) + nr_selected = len(form.find_control("spam").pairs()) + self.assert_(nr_selected == 1) + + def testSelectDefault(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="a" multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="b"> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control = form.find_control("a") + self.assert_(control.value == []) + single_control = form.find_control("b") + self.assert_(single_control.value == ["1"]) + + file.seek(0) + forms = ClientForm.ParseFile(file, "http://localhost/", + select_default=1) + form = forms[0] + # select_default only affects *multiple* selection select controls + control = form.find_control(type="select") + self.assert_(control.value == ["1"]) + single_control = form.find_control(type="select", nr=1) + self.assert_(single_control.value == ["1"]) + + +class DisabledTests(TestCase): + def testOptgroup(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + <option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +<select name="bar"> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + 
<option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +</form>""") + + def get_control(name, file=file): + file.seek(0) + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + return form.find_control(name) + + # can't call item_disabled with no args + control = get_control("foo") + self.assertRaises(TypeError, control.get_item_disabled) + + control.set_item_disabled(True, "2") + self.assert_(str(control) == "<SelectControl(foo=" + "[1, (2), 3, 4, 5, 6, (*7), (8), 9, (10)])>") + + # list controls only allow assignment to .value if no attempt is + # made to set any disabled item... + + # ...multi selection + control = get_control("foo") + self.assert_(control.value == ["7"]) + control.value = ["1"] + control = get_control("foo") + def assign_8(control=control): control.value = ["8"] + self.assertRaises(AttributeError, assign_8) + self.assert_(control.value == ["7"]) + # even though 7 is set already, attempt to set it again fails + def assign_7(control=control): control.value = ["7"] + self.assertRaises(AttributeError, assign_7) + control.value = ["1", "3"] + control = get_control("foo") + def assign_multi(control=control): control.value = ["1", "7"] + self.assertRaises(AttributeError, assign_multi) + # enable all items + for item in control.possible_items(): + control.set_item_disabled(False, item) + assign_multi() + + control = get_control("foo") + for value in 7, 8, 10: + self.assert_(control.get_item_disabled(str(value))) + self.assertRaises(AttributeError, control.set, True, str(value)) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.set(False, str(value)) + self.assert_((str(value) not in control.value)) + self.assertRaises(AttributeError, control.toggle, str(value)) + self.assert_(str(value) not in control.value) + 
self.assertRaises(AttributeError, control.set, True, str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("foo") + for value in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(value))) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.toggle(str(value)) + self.assert_(str(value) in control.value) + control.set(True, str(value)) + self.assert_(str(value) in control.value) + control.toggle(str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("foo") + self.assert_(control.get_item_disabled("7")) + control.toggle("7") # clearing, not setting, so no problem + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(False, "7") + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + + # ...single-selection + control = get_control("bar") + self.assert_(control.value == ["7"]) + control.value = ["1"] + control = get_control("bar") + def assign_8(control=control): control.value = ["8"] + self.assertRaises(AttributeError, assign_8) + self.assert_(control.value == ["7"]) + # even though 7 is set already, attempt to set it again fails + def assign_7(control=control): control.value = ["7"] + self.assertRaises(AttributeError, assign_7) + # enable all items + for item in control.possible_items(): + control.set_item_disabled(False, item) + assign_7() + + control = get_control("bar") + for value in 7, 8, 10: + self.assert_(control.get_item_disabled(str(value))) + self.assertRaises(AttributeError, control.set, True, str(value)) + control.set(False, str(value)) + self.assert_(str(value) != control.value) + control.set(False, str(value)) + self.assert_(str(value) != control.value) + 
self.assertRaises(AttributeError, control.toggle, str(value)) + self.assert_(str(value) != control.value) + self.assertRaises(AttributeError, control.set, True, str(value)) + self.assert_(str(value) != control.value) + + control = get_control("bar") + for value in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(value))) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.toggle(str(value)) + self.assert_(str(value) == control.value[0]) + control.set(True, str(value)) + self.assert_(str(value) == control.value[0]) + control.toggle(str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("bar") + self.assert_(control.get_item_disabled("7")) + control.toggle("7") # clearing, not setting, so no problem + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(False, "7") + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + + # set_all_items_disabled + for name in "foo", "bar": + control = get_control(name) + control.set_all_items_disabled(False) + control.set(True, "7") + control.set(True, "1") + control.set_all_items_disabled(True) + self.assertRaises(AttributeError, control.set, True, "7") + self.assertRaises(AttributeError, control.set, True, "1") + +# XXX single select + def testDisabledSelect(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option label="a">1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="bar" multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +<select name="baz" disabled multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select 
name="spam" disabled multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", True, False), + ("spam", True, True)]: + control = form.find_control(name) + self.assert_(bool(control.disabled) == control_disabled) + item = control.get_item_attrs("2") + self.assert_(bool(item.has_key("disabled")) == item_disabled) + + def bad_assign(value, control=control): control.value = value + if control_disabled: + for value in "1", "2", "3": + self.assertRaises(AttributeError, control.set, True, value) + self.assertRaises(AttributeError, bad_assign, [value]) + elif item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + self.assertRaises(AttributeError, bad_assign, ["2"]) + for value in "1", "3": + control.set(True, value) + else: + control.value = ["1", "2", "3"] + + control = form.find_control("foo") + # missing disabled arg + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.assert_(not control.get_item_disabled("a", by_label=True)) + control.set_item_disabled(True, "a", by_label=True) + self.assert_(control.get_item_disabled("a", by_label=True)) + + def testDisabledCheckbox(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="checkbox" name="foo" value="1" label="a"></input> +<input type="checkbox" name="foo" value="2"></input> +<input type="checkbox" name="foo" value="3"></input> + +<input type="checkbox" name="bar" value="1"></input> +<input type="checkbox" name="bar" value="2" disabled></input> +<input type="checkbox" name="bar" value="3"></input> + +<input type="checkbox" name="baz" value="1" disabled></input> +<input type="checkbox" name="baz" value="2" disabled></input> +<input type="checkbox" name="baz" value="3" disabled></input> + +</form>""") + forms = 
ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", False, True)]: + control = form.find_control(name) + self.assert_(bool(control.disabled) == control_disabled) + item = control.get_item_attrs("2") + self.assert_(bool(item.has_key("disabled")) == item_disabled) + self.assert_(control.get_item_disabled("2") == item_disabled) + + def bad_assign(value, control=control): control.value = value + if item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + self.assertRaises(AttributeError, bad_assign, ["2"]) + if not control.get_item_disabled("1"): + control.set(True, "1") + else: + control.value = ["1", "2", "3"] + + control = form.find_control("foo") + control.set_item_disabled(False, "1") + # missing disabled arg + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.assertRaises(NotImplementedError, + control.get_item_disabled, "a", by_label=True) + self.assert_(not control.get_item_disabled("1")) + self.assertRaises(NotImplementedError, + control.set_item_disabled, True, "a", + by_label=True) + self.assert_(not control.get_item_disabled("1")) + + +class ControlTests(TestCase): + def testTextControl(self): + attrs = {"type": "this is ignored", + "name": "ath_Uname", + "value": "", + "maxlength": "20", + "id": "foo"} + c = ClientForm.TextControl("texT", "ath_Uname", attrs) + c.fixup() + self.assert_(c.type == "text") + self.assert_(c.name == "ath_Uname") + self.assert_(c.id == "foo") + self.assert_(c.value == "") + self.assert_(str(c) == "<TextControl(ath_Uname=)>") + self.assert_(c.pairs() == [("ath_Uname", "")]) + def bad_assign(c=c): c.type = "sometype" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.type == "text") + def bad_assign(c=c): c.name = "somename" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.name == "ath_Uname") + c.value = "2" + self.assert_(c.value 
== "2") + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + def bad_assign(c=c): c.value = ["foo"] + self.assertRaises(TypeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(not c.readonly) + c.readonly = True + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + c.disabled = True + self.assert_(str(c) == + "<TextControl(ath_Uname=2) (disabled, readonly)>") + c.readonly = False + self.assert_(str(c) == "<TextControl(ath_Uname=2) (disabled)>") + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + + self.assert_(c.attrs.has_key("maxlength")) + for key in "name", "type", "value": + self.assert_(c.attrs.has_key(key)) + + # initialisation of readonly and disabled attributes + attrs["readonly"] = True + c = ClientForm.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["readonly"] + attrs["disabled"] = True + c = ClientForm.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["disabled"] + c = ClientForm.TextControl("hidden", "ath_Uname", attrs) + self.assert_(c.readonly) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + + def testIsindexControl(self): + attrs = {"type": "this is ignored", + "prompt": ">>>"} + c = ClientForm.IsindexControl("isIndex", None, attrs) + c.fixup() + self.assert_(c.type == "isindex") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(str(c) == "<IsindexControl()>") + self.assert_(c.pairs() == []) + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "isindex") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, 
set_name) + def set_value(value, c=c): c.value = value + self.assertRaises(TypeError, set_value, [None]) + self.assert_(c.name is None) + c.value = "2" + self.assert_(c.value == "2") + self.assert_(str(c) == "<IsindexControl(2)>") + c.disabled = True + self.assert_(str(c) == "<IsindexControl(2) (disabled)>") + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.readonly = True + self.assert_(str(c) == "<IsindexControl(2) (disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.disabled = False + self.assert_(str(c) == "<IsindexControl(2) (readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.readonly = False + self.assert_(str(c) == "<IsindexControl(2)>") + + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs.has_key("prompt")) + self.assert_(c.attrs["prompt"] == ">>>") + for key in "name", "value": + self.assert_(not c.attrs.has_key(key)) + + c.value = "foo 1 bar 2" + class FakeForm: action = "http://localhost/" + form = FakeForm() + self.assert_(c._click(form, (1,1), "request_data") == + ("http://localhost/?foo+1+bar+2", None, [])) + + def testIgnoreControl(self): + attrs = {"type": "this is ignored"} + c = ClientForm.IgnoreControl("reset", None, attrs) + self.assert_(c.type == "reset") + self.assert_(c.value is None) + self.assert_(str(c) == "<IgnoreControl(<None>=<None>)>") + + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value is None) + + def testSubmitControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "img": "foo.gif"} + c = ClientForm.SubmitControl("submit", "name_value", attrs) + self.assert_(c.type == "submit") + self.assert_(c.name == "name_value") + self.assert_(c.value == "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) (readonly)>") + def set_value(value, c=c): c.value = value + 
self.assertRaises(TypeError, set_value, ["foo"]) + c.disabled = True + self.assertRaises(AttributeError, set_value, "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) " + "(disabled, readonly)>") + c.disabled = False + c.readonly = False + set_value("value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value)>") + c.readonly = True + + # click on button + form = ClientForm.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(c.pairs() == []) + self.assert_(pairs == [("name_value", "value_value")]) + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value=value_value") + self.assert_(data == + ("http://foo.bar.com/?name_value=value_value", None, [])) + c.disabled = True + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(pairs == []) + # XXX not sure if should have '?' on end of this URL, or if it really matters... 
+ self.assert_(request.get_full_url() == "http://foo.bar.com/") + self.assert_(data == ("http://foo.bar.com/", None, [])) + + def testImageControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "img": "foo.gif"} + c = ClientForm.ImageControl("image", "name_value", attrs) + self.assert_(c.type == "image") + self.assert_(c.name == "name_value") + self.assert_(c.value == "") + self.assert_(str(c) == "<ImageControl(name_value=)>") + + # click, at coordinate (0, 55), on image + form = ClientForm.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + request = c._click(form, (0, 55), "request") + self.assert_(c.pairs() == []) + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value.x=0&name_value.y=55") + self.assert_(c._click(form, (0,55), return_type="request_data") == + ("http://foo.bar.com/?name_value.x=0&name_value.y=55", + None, [])) + c.value = "blah" + request = c._click(form, (0, 55), "request") + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value.x=0&name_value.y=55&name_value=blah") + + c.disabled = True + self.assertEqual(c.value, "blah") + self.assert_(str(c) == "<ImageControl(name_value=blah) (disabled)>") + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.readonly = True + self.assert_(str(c) == "<ImageControl(name_value=blah) " + "(disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.disabled = c.readonly = False + self.assert_(c._click(form, (1,1), return_type="pairs") == + [("name_value.x", "1"), ("name_value.y", "1"), ('name_value', 'blah')]) + + def testCheckboxControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string"} + c = ClientForm.CheckboxControl("checkbox", "name_value", 
attrs) + c.fixup() + self.assert_(c.type == "checkbox") + self.assert_(c.name == "name_value") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "checkbox") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, set_name) + self.assert_(c.name == "name_value") + + # construct larger list from length-1 lists + c = ClientForm.CheckboxControl("checkbox", "name_value", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.CheckboxControl("checkbox", "name_value", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + attrs = c.get_item_attrs("value_value") + for key in "alt", "name", "value", "type": + self.assert_(attrs.has_key(key)) + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + + def set_value(value, c=c): c.value = value + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + # set + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, 
"value_value2") + self.assert_(c.value == ["value_value"]) + c.set(False, "value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2])>") + c.disabled = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(disabled)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.readonly = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(disabled, readonly)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(readonly)>") + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == [("name_value", "value_value")]) + c.readonly = False + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple(self): + import copy + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": "", + "alt": "alt_text"}} + # with Netscape / IE default selection... 
+ c = ClientForm.SelectControl("select", "select_name", attrs) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ... and with RFC 1866 default selection + c = ClientForm.SelectControl("select", "select_name", attrs, select_default=True) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + # get_item_attrs + attrs3 = c.get_item_attrs("value_value") + self.assert_(attrs3.has_key("alt")) + self.assert_(not attrs3.has_key("multiple")) + # HTML attributes dictionary should have been copied by ListControl + # constructor. 
+ attrs["new_attr"] = "new" + attrs2["new_attr2"] = "new2" + for key in ("new_attr", "new_attr2"): + self.assert_(not attrs3.has_key(key)) + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assert_(c.value == ["value_value"]) + # test ordering of items + c.value = ["value_value2", "value_value"] + self.assert_(c.value == ["value_value", "value_value2"]) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple_label(self): + import ClientForm +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current 
year", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + c = ClientForm.SelectControl("select", "select_name", attrs) + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c3 = ClientForm.SelectControl("select", "select_name", attrs3) + c.merge_control(c2) + c.merge_control(c3) + c.fixup() + + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + + self.assert_(c.value == []) + c.toggle("2002", by_label=True) + self.assert_(c.value == ["0"]) + c.toggle("0") + self.assert_(c.value == []) + c.toggle("0") + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.toggle("2002", by_label=True) + self.assertRaises(ItemNotFoundError, c.toggle, "blah", by_label=True) + self.assert_(c.value == []) + c.toggle("2000") + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["2000"]) + c.value = ["0"] + self.assert_(c.value == ["0"]) + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + 
self.assert_(c.get_value_by_label() == ["2000"]) + c.set_value_by_label(["2000", "2002"]) + self.assert_(c.value == ["0", "2000"]) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(True, "2002", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + self.assert_(c.value == ["0", "2000"]) + c.set(False, "2000", by_label=True) + self.assert_(c.get_value_by_label() == ["2002"]) + self.assert_(c.value == ["0"]) + c.set(True, "2001", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2001"]) + self.assert_(c.value == ["0", "1"]) + self.assertRaises(ItemNotFoundError, c.set, True, "blah", + by_label=True) + self.assertRaises(ItemNotFoundError, c.set, + False, "blah", by_label=True) + + def testSelectControlSingle_label(self): + import ClientForm +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current year", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name"}} + c = ClientForm.SelectControl("select", "select_name", attrs) + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c3 = ClientForm.SelectControl("select", "select_name", attrs3) + c.merge_control(c2) + 
c.merge_control(c3) + c.fixup() + + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["0"]) + c.value = [] + self.assert_(c.value == []) + c.value = ["0"] + self.assert_(c.value == ["0"]) + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + self.assertRaises(ItemNotFoundError, c.set_value_by_label, ["foo"]) + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + def testSelectControlSingle(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "alt": "alt_text"}} + # Netscape and IE behaviour... + c = ClientForm.SelectControl("select", "select_name", attrs) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == ["value_value"]) + self.assert_(c.possible_items() == ["value_value"]) + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ...and RFC 1866 behaviour are identical (unlike multiple SELECT). 
+ c = ClientForm.SelectControl("select", "select_name", attrs, + select_default=1) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + c.value = [] + self.assert_(c.value == []) + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + ["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assert_(c.value == ["value_value"]) + c.toggle("value_value") + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assertRaises(TypeError, c.toggle, ["oops"]) + self.assert_(c.value == []) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assert_(c.value == []) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assert_(c.value 
== []) + c.set(False, "value_value2") + self.assert_(c.value == []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + def testRadioControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "id": "blah"} + # Netscape and IE behaviour... + c = ClientForm.RadioControl("radio", "name_value", attrs) + c.fixup() + self.assert_(c.type == "radio") + self.assert_(c.name == "name_value") + self.assert_(c.id == "blah") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + # ...and RFC 1866 behaviour + c = ClientForm.RadioControl("radio", "name_value", attrs, + select_default=True) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.RadioControl("radio", "name_value", attrs, + select_default=True) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.RadioControl("radio", "name_value", attrs2, + select_default=True) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<RadioControl(" + "name_value=[*value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + ["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assert_(c.value == ["value_value"]) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assert_(c.value == ["value_value"]) + c.toggle("value_value") + self.assert_(c.value == []) + c.toggle("value_value") + self.assert_(c.value == ["value_value"]) + 
self.assertRaises(TypeError, c.toggle, ["value_value"]) + self.assert_(c.value == ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assert_(c.value == []) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assert_(c.value == []) + c.set(False, "value_value2") + self.assert_(c.value == []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + +class FormTests(TestCase): + base_uri = "http://auth.athensams.net/" + def test_click(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="submit" name="foo"></input> +<input type="submit" name="bar"></input> +</form> +""") + form = ClientForm.ParseFile(file, "http://blah/")[0] + self.assertRaises(ControlNotFoundError, form.click, nr=2) + self.assert_(form.click().get_full_url() == "http://blah/abc?foo=") + self.assert_(form.click(name="bar").get_full_url() == "http://blah/abc?bar=") + + # XXX POST, ?, and # + for method in ["GET", "POST"]: + file = StringIO( +"""<form method="%s" action="abc?bang=whizz#doh" name="myform"> + +<input type="submit" name="foo"></input> +</form> +""" % method) + # " (this line is here for emacs) + form = ClientForm.ParseFile(file, "http://blah/")[0] + if method == "GET": + url = "http://blah/abc?foo=" + else: + url = "http://blah/abc?bang=whizz" + self.assert_(form.click().get_full_url() == url) + + def testAuth(self): + file = open("./testdata/Auth.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.action == 
+ "http://auth.athensams.net/" + "?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi" + "%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS") + + self.assertRaises(ControlNotFoundError, + lambda form=form: form.toggle("d'oh", "oops")) + self.assertRaises(ControlNotFoundError, lambda form=form: form["oops"]) + def bad_assign(form=form): form["oops"] = ["d'oh"] + self.assertRaises(ControlNotFoundError, bad_assign) + + self.assertRaises(ValueError, form.find_control) + + keys = ["ath_uname", "ath_passwd"] + values = ["", ""] + types = ["text", "password"] + for i in range(len(keys)): + key = keys[i] + c = form.find_control(key) + self.assert_(c.value == values[i]) + self.assert_(c.type == types[i]) + c = form.find_control(type="image") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(c.type == "image") + + form["ath_uname"] = "jbloggs" + form["ath_passwd"] = "foobar" + + self.assert_(form.click_pairs() == + [("ath_uname", "jbloggs"), + ("ath_passwd", "foobar")]) + + def testSearchType(self): + file = open("./testdata/SearchType.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", "Full Search", "Easy Search", + "New Session", "Log off", "Form", "JavaScript"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", + "", "", "", "", "", "Welcome", "No"] + types = ["hidden", "hidden", "image", "image", "image", "image", + "hidden", "hidden"] + for i in range(len(keys)): + key = keys[i] + self.assert_(form.find_control(key).value == values[i]) + self.assert_(form.find_control(key).type == types[i]) + + pairs = form.click_pairs("Full Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), ("Full Search.x", "1"), ("Full Search.y", "1"), + ("Form", "Welcome"), ("JavaScript", "No")]) + + def testFullSearch(self): + pass # XXX + + def testGeneralSearch(self): + file = 
open("./testdata/GeneralSearch.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", + "Home", "Date & Database Limits", "Cited Ref Search", + "Log off", "Search", + "topic", "titleonly", "author", "journal", "address", + "Search", "Save query", "Clear", + "languagetype", "doctype", "Sort", + "Form", "Func"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", "", + "", "", "", "", "", + "", [], "", "", "", + "", "", "", + ["All languages"], ["All document types"], ["Latest date"], + "General", "Search"] + types = ["hidden", "hidden", + "image", "image", "image", "image", "image", + "text", "checkbox", "text", "text", "text", + "image", "image", "image", + "select", "select", "select", + "hidden", "hidden"] + fc = form.find_control + for i in range(len(keys)): + name = keys[i] + type = types[i] + self.assert_(fc(name).value == form.get_value(name) == values[i]) + self.assert_(fc(name).type == type) + self.assert_(fc(name, type).name == name) + self.assert_(fc(type="hidden").name == "SID") + self.assert_(fc(type="image").name == "Home") + self.assert_(fc(nr=6).name == "Search") + self.assertRaises(ControlNotFoundError, fc, nr=50) + self.assertRaises(ValueError, fc, nr=-1) + self.assert_(fc("Search", "image").name == "Search") + self.assertRaises(ControlNotFoundError, fc, "Search", "hidden") + s0 = fc("Search", "image", nr=0) + s0b = fc("Search", "image", nr=0) + s1 = fc("Search", "image", nr=1) + self.assert_(s0.name == s1.name == "Search") + self.assert_(s0 is s0b) + self.assert_(s0 is not s1) + self.assertRaises(ControlNotFoundError, fc, "Search", "image", nr=2) + self.assert_(fc(type="text", nr=2).name == "journal") + self.assert_(fc("Search", nr=0) is not fc("Search", nr=1)) + + form["topic"] = "foo" + self.assert_(form["topic"] == "foo") + form["author"] = "bar" + form["journal"] = "" + form["address"] = "baz" + form["languagetype"] = ["English", 
"Catalan"] + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form["titleonly"] = ["on"] + self.assert_(form["titleonly"] == ["on"]) + pairs = form.click_pairs("Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), + ("Search.x", "1"), ("Search.y", "1"), + ("topic", "foo"), + ("titleonly", "on"), + ("author", "bar"), + ("journal", ""), ("address", "baz"), + ("languagetype", "English"), ("languagetype", "Catalan"), + ("doctype", "All document types"), ("Sort", "Latest date"), + ("Form", "General"), ("Func", "Search")]) + + pvs = form.possible_items("languagetype") + self.assert_(pvs[0] == "All languages") + self.assert_(len(pvs) == 47) + + self.assertRaises( + ItemNotFoundError, + lambda form=form: form.toggle("d'oh", "languagetype")) + form.toggle("English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + self.assertRaises(TypeError, form.toggle, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.toggle, "Catalan", ["languagetype"]) + + # XXX type, nr, by_label args + + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "SID") + + # multiple select + form["languagetype"] = [] + self.assert_(form["languagetype"] == []) + form.set(True, "Catalan", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(True, "English", "languagetype") + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form.set(False, "English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(False, "Catalan", "languagetype") + self.assert_(form["languagetype"] == []) + self.assertRaises(ItemNotFoundError, form.set, True, "doh", "languagetype") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "languagetype") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Catalan"], "languagetype") + self.assertRaises(TypeError, 
form.set, False, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.set, True, "Catalan", ["languagetype"]) + self.assertRaises(TypeError, form.set, False, "Catalan", ["languagetype"]) + + def setitem(name, value, form=form): form[name] = value + form["languagetype"] = ["Catalan"] + self.assert_(form["languagetype"] == ["Catalan"]) + self.assertRaises(ItemNotFoundError, + setitem, "languagetype", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["languagetype"], "Catalan") + + # single select + form["Sort"] = [] + self.assert_(form["Sort"] == []) + form.set(True, "Relevance", "Sort") + self.assert_(form["Sort"] == ["Relevance"]) + form.set(True, "Times Cited", "Sort") + self.assert_(form["Sort"] == ["Times Cited"]) + form.set(False, "Times Cited", "Sort") + self.assert_(form["Sort"] == []) + self.assertRaises(ItemNotFoundError, form.set, True, "doh", "Sort") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "Sort") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, False, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, True, "Relevance", ["Sort"]) + self.assertRaises(TypeError, form.set, False, "Relevance", ["Sort"]) + + form["Sort"] = ["Relevance"] + self.assert_(form["Sort"] == ["Relevance"]) + self.assertRaises(ItemNotFoundError, + setitem, "Sort", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["Sort"], ["Relevance"]) + + def testResults(self): + file = open("./testdata/Results.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + pvs = form.possible_items("marked_list_candidates") + self.assert_(pvs == [ + "000174872000059/1", "000174858300003/2", "000174827900006/3"]) + def 
bad_setitem(form=form): + form["marked_list_candidates"] = ["blah"] + self.assertRaises(ItemNotFoundError, bad_setitem) + form["marked_list_candidates"] = [pvs[0]] + + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + keys = ["Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list", + "marked_list_candidates", + "Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list" + ] + types = ["image", "image", "image", + "checkbox", + "image", "image", "image"] + values = ["", "", "", + [pvs[0]], + "", "", "", + ] + + for i in range(len(keys)): + key = keys[i] + control = form.find_control(key) + self.assert_(control.value == values[i]) + self.assert_(control.type == types[i]) + + pairs = form.click_pairs("Add all records retrieved to list") + self.assert_(pairs == [ + ("Add all records retrieved to list.x", "1"), + ("Add all records retrieved to list.y", "1"), + ("marked_list_candidates", pvs[0])]) + + def testMarkedResults(self): + file = open("./testdata/MarkedResults.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + pairs = form.click_pairs() + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + self.assert_(pairs == [ + ("Add marked records to list.x", "1"), + ("Add marked records to list.y", "1"), + ("marked_list_candidates", "000174872000059/1"), + ("marked_list_candidates", "000174858300003/2"), + ("marked_list_candidates", "000174827900006/3") + ]) + + def testMarkedRecords(self): + pass # XXX + + +class MoreFormTests(TestCase): + def make_form(self): + f = StringIO("""\ +<form blah="nonsense" name="formname"> + <input type="checkbox" name="a" value="1" id="1a" blah="spam"></input> + <input type="checkbox" name="a" value="2" blah="eggs"></input> + <input type="checkbox" name="a" value="3" id="3a"></input> + + <input type="radio" 
name="b" value="1"></input> + <input type="radio" name="b" value="2" id="2"></input> + <input type="radio" name="b" value="3" id="3"></input> + + <select name="c" id="cselect" blah="foo"> + <option id="coption1" blah="bar">1</option> + <option selected blah="baz">2</option> + <option id="coption3">3</option> + </select> + + <select name="d" multiple> + <option value="v1">l1</option> + <option value="v2">l2</option> + <option blah="fee" rhubarb="fi" value="v3">l3</option> + </select> + + <input type="checkbox" name="e" value="1"></input> +</form> +""") + return ClientForm.ParseFile(f, "http://blah/")[0] + + def test_value(self): + form = self.make_form() + + form.set_value(["v3"], type="select", kind="multilist") + self.assert_(form.get_value("d") == ["v3"]) + form.set_value(["l2"], type="select", kind="multilist", by_label=True) + self.assert_(form.get_value("d", by_label=True) == ["l2"]) + + self.assert_(form.get_value( + "b", "radio", "singlelist", None, 0, False) == []) + self.assertRaises(NotImplementedError, + form.set_value, ["1"], "b", by_label=True) + + def test_id(self): + form = self.make_form() + + self.assert_(form.find_control("c").id == "cselect") + self.assert_(form.find_control("a").id == "1a") + self.assert_(form.find_control("b").id is None) + + self.assert_(form.find_control(id="cselect").id == "cselect") + self.assertRaises(ControlNotFoundError, form.find_control, + id="coption1") + self.assert_(form.find_control(id="1a").id == "1a") + self.assertRaises(ControlNotFoundError, form.find_control, id="1") + + def test_single(self): + form = self.make_form() + + self.assertRaises(ItemCountError, form.set_single, True, "d") + + self.assertRaises(NotImplementedError, + form.set_single, True, "e", by_label=True) + form.toggle_single("e", "checkbox", "list", nr=0) + self.assert_("1" in form.get_value("e")) + form.set_single(False, "e", "checkbox", "list", nr=0) + self.assert_("1" not in form.get_value("e")) + form.set_single(True, "e", "checkbox", 
"list", nr=0) + self.assert_("1" in form.get_value("e")) + + def test_possible_items(self): + form = self.make_form() + + self.assert_(form.possible_items("c") == ["1", "2", "3"]) + self.assert_(form.possible_items("d", by_label=True) == + ["l1", "l2", "l3"]) + + self.assert_(form.possible_items("a") == ["1", "2", "3"]) + self.assertRaises(NotImplementedError, + form.possible_items, "a", by_label=True) + + def test_set_all_readonly(self): + form = self.make_form() + + form.set_all_readonly(True) + for c in form.controls: + self.assert_(c.readonly) + form.set_all_readonly(False) + for c in form.controls: + self.assert_(not c.readonly) + + def test_attrs(self): + form = self.make_form() + + self.assert_(form.attrs["blah"] == "nonsense") + self.assert_(form.attrs["name"] == "formname") + + a = form.find_control("a") + self.assert_(not hasattr(a, "attrs")) + self.assert_(a.get_item_attrs("1")["blah"] == "spam") + self.assert_(a.get_item_attrs("2")["blah"] == "eggs") + self.assert_(not a.get_item_attrs("3").has_key("blah")) + + c = form.find_control("c") + self.assert_(c.attrs["blah"] == "foo") + self.assert_(c.get_item_attrs("1")["blah"] == "bar") + self.assert_(c.get_item_attrs("2")["blah"] == "baz") + self.assert_(not c.get_item_attrs("3").has_key("blah")) + + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +class CaseInsensitiveDict: + def __init__(self, dict): + self._dict = {} + for key, val in dict.items(): + self._dict[string.lower(key)] = val + + def __getitem__(self, key): return self._dict[key] + + def __getattr__(self, name): return getattr(self._dict, name) + + +class UploadTests(TestCase): + def make_form(self): + html = """\ +<form action="/cgi-bin/upload.cgi" method="POST" enctype="multipart/form-data"> +<input type="file" name="data"> +<input type="text" name="user" value="nobody"> +<br> +<input type="submit"> +</form> +""" + + return ClientForm.ParseFile(StringIO(html), + 
"http://localhost/cgi-bin/upload.cgi")[0] + + def test_file_request(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data)) + #print "data_control._upload_data", data_control._upload_data + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assert_(fs["data"].filename is None) + + def test_file_request_with_filename(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="afilename") + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assert_(fs["data"].filename == "afilename") + + def test_multipart_file_request(self): + import cgi + + # fill in a file upload form... 
+ form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="filenamea") + more_data = "rhubarb\nrhubarb\n" + data_control.add_file(StringIO(more_data), filename="filenameb") + yet_more_data = "rheum\nrhaponicum\n" + data_control.add_file(StringIO(yet_more_data), filename="filenamec") + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + + fss = fs["data"][None] + filenames = "filenamea", "filenameb", "filenamec" + datas = data, more_data, yet_more_data + for i in range(len(fss)): + fs = fss[i] + filename = filenames[i] + data = datas[i] + self.assert_(fs.filename == filename) + self.assert_(fs.value == data) + + def test_upload_data(self): + form = self.make_form() + data = form.click().get_data() + self.assert_(startswith(data, "--")) + + def test_empty_upload(self): + # no controls except for INPUT/SUBMIT + forms = ClientForm.ParseFile(StringIO("""<html> +<form method="POST" action="./weird.html" enctype="multipart/form-data"> +<input type="submit" name="submit"></input> +</form></html>"""), ".") + form = forms[0] + data = form.click().get_data() + lines = string.split(data, "\r\n") + self.assert_(startswith(lines[0], "--")) + self.assert_(lines[1] == + 'Content-disposition: form-data; name="submit"') + self.assert_(lines[2] == lines[3] == "") + self.assert_(startswith(lines[4], "--")) + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Auth.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Auth.html new file mode 100644 index 
0000000000000000000000000000000000000000..9c931ba9b26904b7d06a9e121ab9eae4c421f8cb --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Auth.html @@ -0,0 +1,79 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + +<HTML> +<HEAD> +<TITLE>Athens Authentication Point</TITLE> +<META http-equiv="Content-Type" content="text/html;charset=iso-8859-1"> +</HEAD> + +<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#000000" VLINK="#000000"> + +<TABLE BORDER="0" CELLSPACING="0" CELLPADDING="0" WIDTH=609> + <TR> + <TD ALIGN="RIGHT"> + <IMG SRC="http://wos.mimas.ac.uk/isicgi/Images/main.jpg" ALT="ISI Web of Science" BORDER="0" WIDTH=470 HEIGHT=150> + </TD> + </TR> + <TR> + <TD> + <IMG SRC="http://auth.athensams.net/images/auth_point.gif" ALT="Athens Authentication Point"> + </TD> + </TR> + <TR> + <TD> + <P> <P> + </TD> + </TR> + <TR> + <TD ALIGN="CENTER"> + <FORM METHOD=POST ACTION="/?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS"> + <TABLE ALIGN=CENTER BORDER=0 CELLPADDING=0 CELLSPACING=10 WIDTH="75%"> + <TR> + <TD ALIGN=RIGHT WIDTH="40%"> + <FONT COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Username:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=TEXT NAME="ath_uname" VALUE="" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=RIGHT> + <FONT COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Password:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=PASSWORD NAME="ath_passwd" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=CENTER COLSPAN=2> + <INPUT TYPE=IMAGE SRC="http://auth.athensams.net/images/login.gif" BORDER=0 ALT="Login" ALIGN=MIDDLE><BR> + </TD> + </TR> + </TABLE> + </FORM> + </TD> + </TR> +</TABLE> + +<TABLE 
WIDTH="609" BORDER="0"> + <TR> + <TD> + <FONT FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva" SIZE=1> + Athens is a service of <a href=http://www.eduserv.ac.uk>EduServ</a> + </FONT> + <BR> + <FONT FACE="Verdana, Arial, Helvetica" SIZE=1>(c) <A HREF="http://www.athensams.net/copyright.html">Copyright</a>, EduServ. All rights reserved. February 2002</FONT> + </TD> + <TD> + <A HREF="http://www.mimas.ac.uk"><img align="right" +BORDER="0" SRC="http://wos.mimas.ac.uk/images/small_mimas2.gif" alt="MIMAS"></a> + </TD> + </TR> +</TABLE> + +</BODY> +</HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/FullSearch.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/FullSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..60dc0479c38029601c8d320ae93e11fd61179cd3 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/FullSearch.html @@ -0,0 +1,114 @@ +<HTML><HEAD><TITLE>Search -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PeriodSelect.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi NAME = "searchForm" ENCTYPE="multipart/form-data" METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpsrch.html#Full_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" 
BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> +<CENTER><STRONG><FONT SIZE=4>Full Search</FONT><BR></STRONG></CENTER><INPUT TYPE=CHECKBOX NAME="editions" VALUE="D"> +<A HREF=help/helptoc.html#sci>Science Citation Index Expanded (SCI-EXPANDED)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="S"> +<A HREF=help/helptoc.html#ssci>Social Sciences Citation Index (SSCI)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="H"> +<A HREF=help/helptoc.html#ahci>Arts & Humanities Citation Index (A&HCI)--1981-present</A><BR> +<HR><INPUT TYPE=RADIO NAME="Period" VALUE="This Week" onClick="clear_years();"> +This week's update (Updated April 26, 2002)<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 2 Weeks" onClick="clear_years();"> +Latest 2 Weeks<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 4 Weeks" onClick="clear_years();"> +Latest 4 Weeks<BR><INPUT TYPE=RADIO NAME="Period" CHECKED VALUE="All Years" onClick="clear_years();"> +All years<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Year Selection"> +Limit search to years selected below<BR><TABLE> +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="2002" onClick="set_period(4);"> +2002 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2001" onClick="set_period(4);"> +2001 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2000" onClick="set_period(4);"> +2000 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1999" onClick="set_period(4);"> +1999 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1998" onClick="set_period(4);"> +1998 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1997" onClick="set_period(4);"> +1997 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1996" onClick="set_period(4);"> +1996 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1995" onClick="set_period(4);"> +1995 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1994" onClick="set_period(4);"> +1994 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1993" 
onClick="set_period(4);"> +1993 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1992" onClick="set_period(4);"> +1992 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1991" onClick="set_period(4);"> +1991 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1990" onClick="set_period(4);"> +1990 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1989" onClick="set_period(4);"> +1989 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1988" onClick="set_period(4);"> +1988 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1987" onClick="set_period(4);"> +1987 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1986" onClick="set_period(4);"> +1986 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1985" onClick="set_period(4);"> +1985 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1984" onClick="set_period(4);"> +1984 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1983" onClick="set_period(4);"> +1983 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1982" onClick="set_period(4);"> +1982 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1981" onClick="set_period(4);"> +1981 +</TABLE><HR><TABLE> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/gensrch.gif NAME="General Search" ALT="General Search"></TD> + + <TD> + Search for articles by subject term, author name, journal title, or author affiliation<BR></TD> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/crsrch.gif NAME="Cited Ref Search" ALT="Cited Ref Search"></TD> + + <TD>Search for articles that cite an author or work</TD> </TR> </TABLE> + <HR> + <TABLE> + <TR > + <TD NOWRAP> <A HREF= http://tame.mimas.ac.uk:80/isicgi/CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=LoadQuery>Using Saved Queries:</A></TD><TD> Instructions for editing and running saved queries.</TD> + </TR> </TABLE> + + Enter full pathname of saved query (e.g., c:\myqueries\query1) or use Browse.<BR> + <TABLE> + <TR> + <TD NOWRAP> + <INPUT TYPE=file NAME=fileToUpload VALUE = "" ALT="Browse""> + </TD> + <TD> + <INPUT TYPE=SUBMIT NAME=Func VALUE="Open Query" ALT="Open 
Query"> + </TD> + </TR> + </TABLE> + <INPUT TYPE=HIDDEN NAME=Form VALUE=Full> + <HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/GeneralSearch.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/GeneralSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..f5ba69fa7b46d47aad1f893b02ce8fb7a319a704 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/GeneralSearch.html @@ -0,0 +1,178 @@ +<HTML><HEAD><TITLE>General Search -- Web of Science v4.31</TITLE> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=http://tame.mimas.ac.uk:80/isicgi/CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + <TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#General_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" 
SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + <CENTER><STRONG><FONT SIZE=4> + General Search</FONT><BR></STRONG></CENTER> + Enter individual search terms or phrases separated by search operators such as AND or OR then press SEARCH below.<BR> + <A href=#setlimits><FONT SIZE=+1> + Set language and document type limits and sort option.</A></FONT><BR> + <TABLE><TR> + <TD ALIGN=right HEIGHT="1" WIDTH="74"><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif NAME="Search" ALT="Search"></TD> + + <TD> + Search using terms entered below.</TD></TABLE><HR> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Basic_Index> + TOPIC:</A> Enter terms from the article title, keywords, or abstract + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#topic_search_examples> + Examples</A><BR> + <INPUT TYPE=TEXT NAME="topic" SIZE="50" VALUE=""> + <INPUT TYPE=CHECKBOX NAME="titleonly"> +Title only<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Author> + AUTHOR:</A> + Enter one or more author names as O'BRIAN C* OR OBRIAN C*<BR> + <INPUT TYPE=TEXT NAME="author" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Journal> + SOURCE TITLE:</A> + Enter journal title or copy and paste from the <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/A_fulljt.html> + source list</A><BR> + <INPUT TYPE=TEXT NAME="journal" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Address> + ADDRESS:</A> + Enter terms from an author's affiliation as YALE UNIV SAME HOSP (see <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/adabrv.html>abbreviations list</A>)<BR> + <INPUT TYPE=TEXT NAME="address" SIZE="50" VALUE=""> +<BR> + <HR> + <TABLE> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE 
SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif ALT="Search" BORDER=0 VSPACE=0 HSPACE=1 NAME="Search"></TD> + + <TD> + Search using terms entered above.<BR></TD> <TR> + <TD ALIGN=RIGHT><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/saveq.gif ALT="Save query" NAME="Save query"></TD> + <TD> + Save the search terms for future use.<BR></TD> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/clear.gif NAME="Clear" ALT="Clear"></TD> + <TD> + Clear all search terms entered above.</TD> + </TABLE> + <A NAME=setlimits> + <HR> + <STRONG> + SET LIMITS AND SORT OPTION</STRONG><P> + <TABLE FRAME=VOID> <TR ALIGN=LEFT VALIGN=TOP> + <TH COLSPAN=2> Restrict search to a specific language or document type: <TR ALIGN=LEFT VALIGN=TOP> +<TH COLSPAN=2> (Multiple items may be selected from lists) <TH>Sort results by: <TR ALIGN=LEFT VALIGN=TOP> +<TD> <SELECT NAME="languagetype" MULTIPLE SIZE="5"> +<OPTION VALUE="All languages" SELECTED>All languages +<OPTION VALUE="English">English +<OPTION VALUE="Afrikaans">Afrikaans +<OPTION VALUE="Arabic">Arabic +<OPTION VALUE="Bengali">Bengali +<OPTION VALUE="Bulgarian">Bulgarian +<OPTION VALUE="Byelorussian">Byelorussian +<OPTION VALUE="Catalan">Catalan +<OPTION VALUE="Chinese">Chinese +<OPTION VALUE="Croatian">Croatian +<OPTION VALUE="Czech">Czech +<OPTION VALUE="Danish">Danish +<OPTION VALUE="Dutch">Dutch +<OPTION VALUE="Estonian">Estonian +<OPTION VALUE="Finnish">Finnish +<OPTION VALUE="Flemish">Flemish +<OPTION VALUE="French">French +<OPTION VALUE="Gaelic">Gaelic +<OPTION VALUE="Galician">Galician +<OPTION VALUE="Georgian">Georgian +<OPTION VALUE="German">German +<OPTION VALUE="Greek">Greek +<OPTION VALUE="Hebrew">Hebrew +<OPTION VALUE="Hungarian">Hungarian +<OPTION VALUE="Icelandic">Icelandic +<OPTION VALUE="Italian">Italian +<OPTION VALUE="Japanese">Japanese +<OPTION VALUE="Korean">Korean +<OPTION VALUE="Latin">Latin +<OPTION 
VALUE="Macedonian">Macedonian +<OPTION VALUE="Multi-Language">Multi-Language +<OPTION VALUE="Norwegian">Norwegian +<OPTION VALUE="Persian">Persian +<OPTION VALUE="Polish">Polish +<OPTION VALUE="Portuguese">Portuguese +<OPTION VALUE="Provencal">Provencal +<OPTION VALUE="Rumanian">Rumanian +<OPTION VALUE="Russian">Russian +<OPTION VALUE="Serbian">Serbian +<OPTION VALUE="Serbo-Croatian">Serbo-Croatian +<OPTION VALUE="Slovak">Slovak +<OPTION VALUE="Slovene">Slovene +<OPTION VALUE="Spanish">Spanish +<OPTION VALUE="Swedish">Swedish +<OPTION VALUE="Turkish">Turkish +<OPTION VALUE="Ukrainian">Ukrainian +<OPTION VALUE="Welsh">Welsh +</SELECT> +<TD><SELECT NAME="doctype" MULTIPLE SIZE="5"> +<OPTION VALUE="All document types" SELECTED>All document types +<OPTION VALUE="Article">Article +<OPTION VALUE="Abstract of Published Item">Abstract of Published Item +<OPTION VALUE="Art Exhibit Review">Art Exhibit Review +<OPTION VALUE="Bibliography">Bibliography +<OPTION VALUE="Biographical-Item">Biographical-Item +<OPTION VALUE="Book Review">Book Review +<OPTION VALUE="Chronology">Chronology +<OPTION VALUE="Correction">Correction +<OPTION VALUE="Correction, Addition">Correction, Addition +<OPTION VALUE="Dance Performance Review">Dance Performance Review +<OPTION VALUE="Database Review">Database Review +<OPTION VALUE="Discussion">Discussion +<OPTION VALUE="Editorial Material">Editorial Material +<OPTION VALUE="Excerpt">Excerpt +<OPTION VALUE="Fiction, Creative Prose">Fiction, Creative Prose +<OPTION VALUE="Film Review">Film Review +<OPTION VALUE="Hardware Review">Hardware Review +<OPTION VALUE="Item About an Individual">Item About an Individual +<OPTION VALUE="Letter">Letter +<OPTION VALUE="Meeting Abstract">Meeting Abstract +<OPTION VALUE="Meeting-Abstract">Meeting-Abstract +<OPTION VALUE="Music Performance Review">Music Performance Review +<OPTION VALUE="Music Score">Music Score +<OPTION VALUE="Music Score Review">Music Score Review +<OPTION VALUE="News Item">News Item +<OPTION 
VALUE="Note">Note +<OPTION VALUE="Poetry">Poetry +<OPTION VALUE="Record Review">Record Review +<OPTION VALUE="Reprint">Reprint +<OPTION VALUE="Review">Review +<OPTION VALUE="Script">Script +<OPTION VALUE="Software Review">Software Review +<OPTION VALUE="TV Review, Radio Review">TV Review, Radio Review +<OPTION VALUE="TV Review, Radio Review, Video">TV Review, Radio Review, Video +<OPTION VALUE="Theater Review">Theater Review +</SELECT> +<TD><SELECT NAME="Sort" SIZE="5"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="Times Cited">Times Cited +<OPTION VALUE="Relevance">Relevance +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +</SELECT> +</TABLE>Back to <A HREF=#top> + top of Search</A> + page <P> + <HR><BR> + </OL> + <INPUT TYPE=HIDDEN NAME=Form VALUE=General> + <INPUT TYPE=HIDDEN NAME=Func VALUE=Search> + </FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedRecords.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedRecords.html new file mode 100644 index 0000000000000000000000000000000000000000..8fb05bd747faae312aab40dcb4e41dde6ab3d217 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedRecords.html @@ -0,0 +1,152 @@ +<HTML><HEAD><TITLE>Marked Records -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> +<INPUT TYPE=HIDDEN NAME="Form" VALUE="Marked_Records"> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" 
CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpprn.html#Print_&_Export_Marked_Records"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="General Search" ALT="General Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbgsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + +<INPUT TYPE=HIDDEN NAME=id VALUE=6> + +<div align="center"> + <table width="650" border="0" cellspacing="0" cellpadding="0"> + <tr> + <td width="231" align="center"> + </td> + <td width="215"> + <p align="center"><br> + <STRONG><FONT SIZE=4>Marked Records</FONT></STRONG> + </td> + <td align="right"> </td> + </tr> + <tr> + <td width="231" align="center"> + <p align="right"><b>500</b></td> + <td width="215"> + <p align="center"> <b>Records on the marked list</b></p> + </td> + <td align="right"><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List onClick="this.href = confirmLink( 'Warning: Pressing OK will clear the marked list.', 'CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List', 'javascript:void(0)');"> +<IMG 
SRC="Images/clearlst.gif" ALT="Clear Marked List" BORDER="0"></A></td> + </tr> + </table> +</div> +<hr> + +<font size="+1"><b>STEP 1: Select sort and output fields for the entire marked list.</b></font> + +<div align="center"> +<table width="92%" border="1" height="124"> + <tr> + <td width="21%" valign="top" height="124"> + <div align="left"> + <p align="center"><b>Select sort option:</b> + </p> + </div> + <div align="left"> + <p> + <SELECT NAME="MarkedSort" SIZE="4"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +<OPTION VALUE="Times Cited">Times Cited +</SELECT> + + </p> + </div> + </td> + <td width="79%" height="124"> + + + <p align="center"><b>Select fields to include in addition to the author(s), + article title and source.</b> </p> + + + <table width="481"> + <tr> + + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=include_refs >cited references*</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=address >addresses</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=abstract >abstract</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=language >language</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=publisher >publisher information</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=ISSN >ISSN</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=doctype >document type</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=keywords >keywords</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=timescited >times cited</td> + </tr> + </table> + + <FONT SIZE=-1><i>*Selecting the cited references may cause the server + to time out with large numbers of records.</i></FONT> + + </td> + </tr> +</table> +</div> + +<br> + +<font size="+1"><b>STEP 2: Select action for output.</b></font><br> + +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE 
SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<hr> + <BR> +<DL><DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174872000059 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/1>Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002<!000174872000059> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174858300003 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/2>Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002<!000174858300003> +<BR><BR> +<!--snip--> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081310100003 CHECKED> Disney RHL<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/499>A troublesome sibling species complex of scuttle flies (Diptera : Phoridae) revisited</A><BR>J NAT HIST 33 (8): 1159-1216 AUG 1999<!000081310100003> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081297200008 CHECKED> Rosanowski F, Eysholdt U<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/500>Medical expertise prior to voice change surgery in male-to-female transsexuals</A><BR>HNO 47 (6): 556-562 JUN 1999<!000081297200008> +<BR><BR> +</DL> +<hr> +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE 
SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<BR>Back to <A HREF=#top>top of Marked Records</A> page<BR><BR><HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedResults.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedResults.html new file mode 100644 index 0000000000000000000000000000000000000000..cb5b2bc228579eb23e08a8b5fe20a29aa43f9eff --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/MarkedResults.html @@ -0,0 +1,97 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<HR><TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR 
ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> 
+ <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 CHECKED> Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of 
pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 
onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <BR>1783 of 16635816 documents matched the query. 
(500 shown)<HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Results.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Results.html new file mode 100644 index 0000000000000000000000000000000000000000..ee31c1fd8370a02969a3098b37d815f6c5be2352 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/Results.html @@ -0,0 +1,94 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<HR> +<TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR><TR><TD> </TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records 
retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> </TD> +<TD>. . . + <TD><IMG SRC=Images/frwrd10i.gif ALT="Next 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. 
Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 > Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 > Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 > Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD 
WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A 
HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + +<BR>1783 of 16635816 documents matched the query. (500 shown)<HR> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/SearchType.html b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/SearchType.html new file mode 100644 index 0000000000000000000000000000000000000000..a895c3e0a5c95db2dd1d9e3534a414f801710391 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm-0.1.17/testdata/SearchType.html @@ -0,0 +1,55 @@ +<HTML><HEAD><TITLE>Welcome -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + + +<SCRIPT LANGUAGE=JavaScript> +<!-- Hide script from old browsers. +function main(){ + JavaScriptTest(); +} +// End script hide. --> +</SCRIPT> + +</HEAD> +<BODY BGCOLOR=#FFFFFF onLoad="main()" ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <A NAME=top></A> + <CENTER><IMG SRC=Images/main.jpg ALT="Institute for Scientific Information"></CENTER> + + <P> + <CENTER> + <TABLE BORDER=2 CELLPADDING=0> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/fullsch.gif NAME="Full Search" ALT="Full Search" BORDER=0> + <TD> + <TABLE> + <TR><TD><TD>Search by bibliographic information (topic, author, source, address) or by cited reference. 
+</TABLE> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/quiksch.gif NAME="Easy Search" ALT="Easy Search" BORDER=0> + <TD><TABLE> + <TR><TD><TD>Search for a limited number of articles on a specific topic, person, or address.</TABLE> +<TR><TD ALIGN=CENTER> + <INPUT TYPE=IMAGE SRC=Images/newsession.gif NAME="New Session" ALT="New Session" BORDER=0> + <TD> + <TABLE><TR><TD><TD> +Clear all search forms and the marked list.</TABLE> <TR> + <TD ALIGN=CENTER> +<INPUT TYPE=IMAGE SRC=Images/logoff.gif NAME="Log off" ALT="Log off" BORDER=0> + <TD><TABLE> + <TR><TD> +Fully disconnect from the database and make your connection available to another user at your institution.</TD></TABLE><INPUT TYPE=HIDDEN NAME=Form Value=Welcome> +</TABLE></CENTER> +<HR> +<INPUT TYPE=HIDDEN NAME="JavaScript" VALUE="No"> +<P><CENTER><IMG SRC=Images/isilogo.gif ALT="ISI Thomson Scientific"></CENTER><P> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/ClientForm.py b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm.py new file mode 100644 index 0000000000000000000000000000000000000000..dec49815aa549d1a6b9973cf7f26012aeb04afc6 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/ClientForm.py @@ -0,0 +1,2854 @@ +"""HTML form handling for web clients. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It has developed from a port of Gisle +Aas' Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. + +The most useful docstring is the one for HTMLForm. 
+ +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2005 John J. Lee <jjl@pobox.com> +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included with +the distribution). + +""" + +# XXX +# Fix or work around attibute quoting bug. +# Add .clear() method. +# Add some functional tests +# Especially single and multiple file upload on the internet. +# Does file upload work when name is missing? Sourceforge tracker form +# doesn't like it. Check standards, and test with Apache. Test +# binary upload with Apache. +# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py. +# Controls can have name=None (eg. forms constructed partly with +# JavaScript), but find_control can't be told to find a control +# with that name, because None there means 'unspecified'. Can still +# get at by nr, but would be nice to be able to specify something +# equivalent to name=None, too. +# Support for list item ids. How to handle missing ids? (How do I deal +# with duplicate OPTION labels ATM? Can't remember...) +# Deal with character sets properly. Not sure what the issues are here. +# Do URL encodings need any attention? +# I don't *think* any encoding of control names, filenames or data is +# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6 +# doesn't seem to do it. +# Add charset parameter to Content-type headers? How to find value?? +# Add label support for CHECKBOX and RADIO. Actually, I may not bother +# to fix this, since a discussion with Gisle on libwww-perl list seemed +# to show that it wouldn't be very useful. 
+# I'm not going to fix this unless somebody tells me what real servers +# that want this encoding actually expect: If enctype is +# application/x-www-form-urlencoded and there's a FILE control present. +# Strictly, it should be 'name=data' (see HTML 4.01 spec., section +# 17.13.2), but I send "name=" ATM. What about multiple file upload?? +# Get rid of MapBase, AList and MimeWriter. +# Should really use sgmllib, not htmllib. +# Remove single-selection code: can be special case of multi-selection, +# with a few variations, I think. +# Factor out multiple-selection list code? May not be easy. Maybe like +# this: + +# ListControl +# ^ +# | MultipleListControlMixin +# | ^ +# SelectControl / +# ^ / +# \ / +# MultiSelectControl + + +# Plan +# ---- +# Maybe a 0.2.x, cleaned up a bit and with id support for list items? +# Not sure it's worth it... +# Unify single / multiple selection code. +# action should probably be an absolute URI, like DOMForm. +# Remove toggle methods. +# Replace by_label with choice between value / id / label / +# element contents (see discussion with Gisle about labels on +# libwww-perl list). +# ...what else? +# Work on DOMForm. +# XForms? Don't know if there's a need here. + + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +import sys, urllib, urllib2, types, string, mimetools, copy, urlparse, \ + htmlentitydefs +from urlparse import urljoin +from cStringIO import StringIO +try: + from types import UnicodeType +except ImportError: + UNICODE = False +else: + UNICODE = True + +VERSION = "0.1.17" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +# This version of urlencode is from my Python 1.5.2 back-port of the +# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence +# of pairs instead of a mapping -- the 2.0 version only accepts a mapping. 
+def urlencode(query,doseq=False,): + """Encode a sequence of two-element tuples or dictionary into a URL query \ +string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... + try: + # non-sequence items should not work with len() + x = len(query) + # non-empty strings will fail this + if len(query) and type(query[0]) != types.TupleType: + raise TypeError() + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping " + "object", tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = urllib.quote_plus(str(k)) + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = urllib.quote_plus(str(k)) + if type(v) == types.StringType: + v = urllib.quote_plus(v) + l.append(k + '=' + v) + elif UNICODE and type(v) == types.UnicodeType: + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = urllib.quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? 
+ x = len(v) + except TypeError: + # not a sequence + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + urllib.quote_plus(str(elt))) + return string.join(l, '&') + +# Grabbed from 2.4 xml.sax.saxutils. Modification: accept None. +def __dict_replace(s, d): + """Replace substrings of a string using a dictionary.""" + for key, value in d.items(): + s = string.replace(s, key, value) + return s +def unescape(data, entities={}): + """Unescape &, <, and > in a string of data. + + You can unescape other strings of data by passing a dictionary as + the optional entities parameter. The keys and values must all be + strings; each key will be replaced with its corresponding value. + """ + if data is None: + return None + data = string.replace(data, "<", "<") + data = string.replace(data, ">", ">") + if entities: + data = __dict_replace(data, entities) + # must do ampersand last + return string.replace(data, "&", "&") + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +# XXX don't really want to drag this along (MapBase, AList, MimeWriter, +# _choose_boundary) + +# This is essentially the same as UserDict.DictMixin. I wrote this before +# that, and DictMixin isn't available in 1.5.2 anyway. +class MapBase: + """Mapping designed to be easily derived from. + + Subclass it and override __init__, __setitem__, __getitem__, __delitem__ + and keys. Nothing else should need to be overridden, unlike UserDict. + This significantly simplifies dictionary-like classes. + + Also different from UserDict in that it has a redonly flag, and can be + updated (and initialised) with a sequence of pairs (key, value). 
+ + """ + def __init__(self, init=None): + self._data = {} + self.readonly = False + if init is not None: self.update(init) + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, item): + if not self.readonly: + self._data[key] = item + else: + raise TypeError("object doesn't support item assignment") + + def __delitem__(self, key): + if not self.readonly: + del self._data[key] + else: + raise TypeError("object doesn't support item deletion") + + def keys(self): + return self._data.keys() + + # now the internal workings, there should be no need to override these: + + def clear(self): + for k in self.keys(): + del self[k] + + def __repr__(self): + rep = [] + for k, v in self.items(): + rep.append("%s: %s" % (repr(k), repr(v))) + return self.__class__.__name__+"{"+(string.join(rep, ", "))+"}" + + def copy(self): + return copy.copy(self) + + def __cmp__(self, dict): + # note: return value is *not* boolean + for k, v in self.items(): + if not (dict.has_key(k) and dict[k] == v): + return 1 # different + return 0 # the same + + def __len__(self): + return len(self.keys()) + + def values(self): + r = [] + for k in self.keys(): + r.append(self[k]) + return r + + def items(self): + keys = self.keys() + vals = self.values() + r = [] + for i in len(self): + r.append((keys[i], vals[i])) + return r + + def has_key(self, key): + return key in self.keys() + + def update(self, map): + if issequence(map) and not isstringlike(map): + items = map + else: + items = map.items() + for tup in items: + if not isinstance(tup, TupleType): + raise TypeError( + "MapBase.update requires a map or a sequence of pairs") + k, v = tup + self[k] = v + + def get(self, key, failobj=None): + if key in self.keys(): + return self[key] + else: + return failobj + + def setdefault(self, key, failobj=None): + if not self.has_key(key): + self[key] = failobj + return self[key] + + +class AList(MapBase): + """Read-only ordered mapping.""" + def __init__(self, seq=[]): + 
self.readonly = True + self._inverted = False + self._data = list(seq[:]) + self._keys = [] + self._values = [] + for key, value in seq: + self._keys.append(key) + self._values.append(value) + + def set_inverted(self, inverted): + if (inverted and not self._inverted) or ( + not inverted and self._inverted): + self._keys, self._values = self._values, self._keys + if inverted: self._inverted = True + else: self._inverted = False + + def __getitem__(self, key): + try: + i = self._keys.index(key) + except ValueError: + raise KeyError(key) + return self._values[i] + + def __delitem__(self, key): + try: + i = self._keys.index[key] + except ValueError: + raise KeyError(key) + del self._values[i] + + def keys(self): return list(self._keys[:]) + def values(self): return list(self._values[:]) + def items(self): + data = self._data[:] + if not self._inverted: + return data + else: + newdata = [] + for k, v in data: + newdata.append((v, k)) + return newdata + +# -------------------------------------------------------------------- +# grabbed from Python standard library mimetools module and tweaked to +# avoid socket.gaierror +try: + import thread + _thread = thread; del thread +except ImportError: + import dummy_thread + _thread = dummy_thread; del dummy_thread +_counter_lock = _thread.allocate_lock() +del _thread + +_counter = 0 +def _get_next_counter(): + global _counter + _counter_lock.acquire() + _counter = _counter + 1 + result = _counter + _counter_lock.release() + return result + +_prefix = None + +def _choose_boundary(): + """Return a string usable as a multipart boundary. + + The string chosen is unique within a single program run, and + incorporates the user id (if available), process id (if available), + and current time. So it's very unlikely the returned string appears + in message text, but there's no guarantee. 
+ + The boundary contains dots so you have to quote it in the header.""" + + global _prefix + import time + import os + import socket + if _prefix is None: + try: + socket.gaierror + except AttributeError: + exc = socket.error + else: + exc = socket.gaierror + + try: + hostid = socket.gethostbyname(socket.gethostname()) + except exc: + hostid = 'localhost' + try: + uid = repr(os.getuid()) + except AttributeError: + uid = '1' + try: + pid = repr(os.getpid()) + except AttributeError: + pid = '1' + _prefix = hostid + '.' + uid + '.' + pid + return "%s.%.3f.%d" % (_prefix, time.time(), _get_next_counter()) + +# end of code from mimetools module +# -------------------------------------------------------------------- + +def choose_boundary(): + b = _choose_boundary() + string.replace(b, ".", "") + return b + +# This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. 
This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! + + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + lines = string.split(value, "\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = string.join(lines, "") + self._http_hdrs.append((key, value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + string.strip(lines[i]) + value = string.join(lines, "\r\n") + "\r\n" + line = key + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class ControlNotFoundError(ValueError): pass +class ItemNotFoundError(ValueError): pass +class ItemCountError(ValueError): pass + +class ParseError(Exception): pass + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances 
on completion.""" + # pinched (and modified) from Moshe Zadka + def __init__(self, entitydefs=None): + if entitydefs is not None: + self.entitydefs = entitydefs + self.base = None + self.forms = [] + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + def do_base(self, attrs): + for key, value in attrs: + if key == "href": + self.base = value + + def start_form(self, attrs): + if self._current_form is not None: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = value + elif key == "action": + action = value + elif key == "method": + method = string.upper(value) + elif key == "enctype": + enctype = string.lower(value) + d[key] = value + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + if self._current_form is None: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = None + + def start_select(self, attrs): + if self._current_form is None: + raise ParseError("start of SELECT before start of FORM") + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = val + + self._select = d + + self._append_select_control({"__select": d}) + + def end_select(self): + if self._current_form is None: + raise ParseError("end of SELECT before start of FORM") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = val + + self._optgroup = d + + def end_optgroup(self): + if 
self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = val + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = string.strip(self._option.get("contents", "")) + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + if self._current_form is None: + raise ParseError("start of TEXTAREA before start of FORM") + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = val + + self._textarea = d + + def end_textarea(self): + if self._current_form is None: + raise ParseError("end of TEXTAREA before start of FORM") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def handle_data(self, data): + if self._option is not None: + # self._option is a 
dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + else: + return + + if not map.has_key(key): + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + if self._current_form is None: + raise ParseError("start of BUTTON before start of FORM") + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # eg. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + controls.append((type, name, d)) + + def do_input(self, attrs): + if self._current_form is None: + raise ParseError("start of INPUT before start of FORM") + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + controls.append((type, name, d)) + + def do_isindex(self, attrs): + if self._current_form is None: + raise ParseError("start of ISINDEX before start of FORM") + d = {} + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + +# HTMLParser.HTMLParser is recent, so live without it if it's not available +# (also, htmllib.HTMLParser is much more tolerant of bad HTML) +try: + import HTMLParser +except ImportError: + class XHTMLCompatibleFormParser: + def __init__(self, entitydefs=None): + raise ValueError("HTMLParser could not be imported") +else: + class XHTMLCompatibleFormParser(_AbstractFormParser, 
HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! + def __init__(self, entitydefs=None): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + # handle_charref, handle_entityref and default entitydefs are taken + # from sgmllib + def handle_charref(self, name): + try: + n = int(name) + except ValueError: + self.unknown_charref(name) + return + if not 0 <= n <= 255: + self.unknown_charref(name) + return + self.handle_data(chr(n)) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + def handle_entityref(self, name): + table = self.entitydefs + if name in table: + self.handle_data(table[name]) + else: + self.unknown_entityref(name) + return + + # These methods would have passed through the ref intact if I'd thought + # of it earlier, but since the old parser silently swallows unknown + # refs, so does this new parser. 
+ def unknown_entityref(self, ref): pass + def unknown_charref(self, ref): pass + +import htmllib, formatter +class FormParser(_AbstractFormParser, htmllib.HTMLParser): + """Good for tolerance of incorrect HTML, bad for XHTML.""" + def __init__(self, entitydefs=None): + htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) + _AbstractFormParser.__init__(self, entitydefs) + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + +#FormParser = XHTMLCompatibleFormParser # testing hack + +def get_entitydefs(): + entitydefs = {} + for name, char in htmlentitydefs.entitydefs.items(): + entitydefs["&%s;" % name] = char + # unescape already does these three + del entitydefs["<"] + del entitydefs[">"] + del entitydefs["&"] + return entitydefs + +def unescape_attrs(attrs, entitydefs): + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = unescape(val, entitydefs) + else: + # eg. "__select" -- yuck! + escaped_attrs[key] = unescape_attrs(val, entitydefs) + return escaped_attrs + +def ParseResponse(response, select_default=False, + ignore_errors=False, # ignored! + form_parser_class=FormParser, + request_class=urllib2.Request): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of urllib2.urlopen can be conveniently passed to this + function as the response parameter. + + ClientForm.ParseError is raised on parse errors. 
+ + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses + htmllib.HTMLParser) (the default) works best for ordinary grubby HTML. + Note that HTMLParser is only available in Python 2.2 and later. You can + pass your own class in here as a hack to work around bad HTML, but at your + own risk: there is no well-defined interface. + + """ + return ParseFile(response, response.geturl(), select_default, + False, + form_parser_class, + request_class) + +def ParseFile(file, base_uri, select_default=False, + ignore_errors=False, # ignored! + form_parser_class=FormParser, + request_class=urllib2.Request): + """Parse HTML and return a list of HTMLForm instances. + + ClientForm.ParseError is raised on parse errors. 
+ + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. + + """ + use_htmllib = True + fp = form_parser_class() + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + forms = [] + entitydefs = get_entitydefs() + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = urljoin(base_uri, action) + form = HTMLForm(action, method, enctype, name, attrs, request_class) + for type, name, attr in controls: + attr = unescape_attrs(attr, entitydefs) + name = unescape(name, entitydefs) + form.new_control(type, name, attr, select_default=select_default) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. HTMLForm delegates lots of + things to Control objects, and most of Control's methods are, in effect, + documented by the HTMLForm docstrings. + + The Controls in an HTMLForm can be got at via the HTMLForm.find_control + method or the HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions, so you can probably ignore the rest of this + paragraph. A Control is only properly initialised after the fixup method + has been called. In fact, this is only strictly necessary for ListControl + instances. 
This is necessary because ListControls are built up from + ListControls each containing only a single item, and their initial value(s) + can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by `greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + `successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. + + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. 
+ + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + raise NotImplementedError() + + def _write_mime_data(self, mw): + """Write data for this control to a MimeWriter.""" + # called by HTMLForm + for name, value in self.pairs(): + mw2 = mw.nextpart() + mw2.addheader("Content-disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + +#--------------------------------------------------- +class ScalarControl(Control): + """Control whose value is not restricted to one of a prescribed set. + + Some ScalarControls don't accept any value attribute. Otherwise, takes a + single value, which must be string-like. 
+ + Additional read-only public attribute: + + attrs: dictionary mapping the names of original HTML attributes of the + control to their values + + """ + def __init__(self, type, name, attrs): + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = attrs.has_key("disabled") + self.readonly = attrs.has_key("readonly") + self.id = attrs.get("id") + + self.attrs = attrs.copy() + + self._clicked = False + + def __getattr__(self, name): + if name == "value": + return self.__dict__["_value"] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if not isstringlike(value): + raise TypeError("must assign a string") + elif self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + elif self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + self.__dict__["_value"] = value + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def pairs(self): + name = self.name + value = self.value + if name is None or value is None or self.disabled: + return [] + return [(name, value)] + + def __str__(self): + name = self.name + value = self.value + if name is None: name = "<None>" + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", ") + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class TextControl(ScalarControl): + """Textual input control. 
+ + Covers: + + INPUT/TEXT + INPUT/PASSWORD + INPUT/FILE + INPUT/HIDDEN + TEXTAREA + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + if self.type == "hidden": self.readonly = True + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind == "text" + +#--------------------------------------------------- +class FileControl(ScalarControl): + """File upload with INPUT TYPE=FILE. + + The value attribute of a FileControl is always None. Use add_file instead. + + Additional public method: add_file + + """ + + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + self._value = None + self._upload_data = [] + + def is_of_kind(self, kind): return kind == "file" + + def __setattr__(self, name, value): + if name in ("value", "name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def add_file(self, file_object, content_type=None, filename=None): + if not hasattr(file_object, "read"): + raise TypeError("file-like object must have read method") + if content_type is not None and not isstringlike(content_type): + raise TypeError("content type must be None or string-like") + if filename is not None and not isstringlike(filename): + raise TypeError("filename must be None or string-like") + if content_type is None: + content_type = "application/octet-stream" + self._upload_data.append((file_object, content_type, filename)) + + def pairs(self): + # XXX should it be successful even if unnamed? 
+ if self.name is None or self.disabled: + return [] + return [(self.name, "")] + + def _write_mime_data(self, mw): + # called by HTMLForm + if len(self._upload_data) == 1: + # single file + file_object, content_type, filename = self._upload_data[0] + mw2 = mw.nextpart() + fn_part = filename and ('; filename="%s"' % filename) or '' + disp = 'form-data; name="%s"%s' % (self.name, fn_part) + mw2.addheader("Content-disposition", disp, prefix=1) + fh = mw2.startbody(content_type, prefix=0) + fh.write(file_object.read()) + elif len(self._upload_data) != 0: + # multiple files + mw2 = mw.nextpart() + disp = 'form-data; name="%s"' % self.name + mw2.addheader("Content-disposition", disp, prefix=1) + fh = mw2.startmultipartbody("mixed", prefix=0) + for file_object, content_type, filename in self._upload_data: + mw3 = mw2.nextpart() + fn_part = filename and ('; filename="%s"' % filename) or '' + disp = 'file%s' % fn_part + mw3.addheader("Content-disposition", disp, prefix=1) + fh2 = mw3.startbody(content_type, prefix=0) + fh2.write(file_object.read()) + mw2.lastpart() + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + if not self._upload_data: + value = "<No files added>" + else: + value = [] + for file, ctype, filename in self._upload_data: + if filename is None: + value.append("<Unnamed file>") + else: + value.append(filename) + value = string.join(value, ", ") + + info = [] + if self.disabled: info.append("disabled") + if self.readonly: info.append("readonly") + info = string.join(info, ", ") + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class IsindexControl(ScalarControl): + """ISINDEX control. + + ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really + part of regular HTML forms at all, and predates it. You're only allowed + one ISINDEX per HTML document. 
ISINDEX and regular form submission are + mutually exclusive -- either submit a form, or the ISINDEX. + + Having said this, since ISINDEX controls may appear in forms (which is + probably bad HTML), ParseFile / ParseResponse will include them in the + HTMLForm instances it returns. You can set the ISINDEX's value, as with + any other control (but note that ISINDEX controls have no name, so you'll + need to use the type argument of set_value!). When you submit the form, + the ISINDEX will not be successful (ie., no data will get returned to the + server as a result of its presence), unless you click on the ISINDEX + control, in which case the ISINDEX gets submitted instead of the form: + + form.set_value("my isindex value", type="isindex") + urllib2.urlopen(form.click(type="isindex")) + + ISINDEX elements outside of FORMs are ignored. If you want to submit one + by hand, do it like so: + + url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value")) + result = urllib2.urlopen(url) + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind in ["text", "clickable"] + + def pairs(self): + return [] + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + # Relative URL for ISINDEX submission: instead of "foo=bar+baz", + # want "bar+baz". + # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is + # deprecated in 4.01, but it should still say how to submit it). + # Submission of ISINDEX is explained in the HTML 3.2 spec, though. 
+ parts = urlparse.urlparse(form.action) + rest, (query, frag) = parts[:-2], parts[-2:] + parts = rest + (urllib.quote_plus(self.value), "") + url = urlparse.urlunparse(parts) + req_data = url, None, [] + + if return_type == "pairs": + return [] + elif return_type == "request_data": + return req_data + else: + return request_class(url) + + def __str__(self): + value = self.value + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", ") + if info: info = " (%s)" % info + + return "<%s(%s)%s>" % (self.__class__.__name__, value, info) + + +#--------------------------------------------------- +class IgnoreControl(ScalarControl): + """Control that we're not interested in. + + Covers: + + INPUT/RESET + BUTTON/RESET + INPUT/BUTTON + BUTTON/BUTTON + + These controls are always unsuccessful, in the terminology of HTML 4 (ie. + they never require any information to be returned to the server). + + BUTTON/BUTTON is used to generate events for script embedded in HTML. + + The value attribute of IgnoreControl is always None. + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + self._value = None + + def is_of_kind(self, kind): return False + + def __setattr__(self, name, value): + if name == "value": + raise AttributeError( + "control '%s' is ignored, hence read-only" % self.name) + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + +#--------------------------------------------------- +class ListControl(Control): + """Control representing a sequence of items. + + The value attribute of a ListControl represents the selected list items in + the control. + + ListControl implements both list controls that take a single value and + those that take multiple values. + + ListControls accept sequence values only. 
Some controls only accept + sequences of length 0 or 1 (RADIO, and single-selection SELECT). + In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. + + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the by_label argument, and the set_value_by_label, get_value_by_label + methods. + + XXX RadioControl and CheckboxControl don't implement by_label yet. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + """ + + # ListControls are built up by the parser from their component items by + # creating one ListControl per item, consolidating them into a single + # master ListControl held by the HTMLForm: + + # -User calls form.new_control(...) + # -Form creates Control, and calls control.add_to_form(self). 
+ # -Control looks for a Control with the same name and type in the form, + # and if it finds one, merges itself with that control by calling + # control.merge_control(self). The first Control added to the form, of + # a particular name and type, is the only one that survives in the + # form. + # -Form calls control.fixup for all its controls. ListControls in the + # form know they can now safely pick their default values. + + # To create a ListControl without an HTMLForm, use: + + # control.merge_control(new_control) + + # (actually, it's much easier just to use ParseFile) + + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + + self._attrs = attrs.copy() + # As Controls are merged in with .merge_control(), self._attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + if attrs: + self._attrs_list = [self._attrs] # extended by .merge_control() + self._disabled_list = [self._attrs.has_key("disabled")] # ditto + else: + self._attrs_list = [] # extended by .merge_control() + self._disabled_list = [] # ditto + + self._select_default = select_default + self._clicked = False + # Some list controls can have their default set only after all items + # are known. If so, self._value_is_set is false, and the self.fixup + # method, called after all items have been added, sets the default. 
+ self._value_is_set = False + + def is_of_kind(self, kind): + if kind == "list": + return True + elif kind == "multilist": + return bool(self.multiple) + elif kind == "singlelist": + return not self.multiple + else: + return False + + def _value_from_label(self, label): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + + def toggle(self, name, by_label=False): + return self._set_selected_state(name, 2, by_label) + def set(self, selected, name, by_label=False): + action = int(bool(selected)) + return self._set_selected_state(name, action, by_label) + + def _set_selected_state(self, name, action, by_label): + """ + name: item name + action: + 0: clear + 1: set + 2: toggle + + """ + if not isstringlike(name): + raise TypeError("item name must be string-like") + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError("no item named '%s'" % name) + + if self.multiple: + if action == 2: + action = not self._selected[i] + if action and self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % name) + self._selected[i] = bool(action) + else: + if action == 2: + if self._selected == name: + action = 0 + else: + action = 1 + if action == 0 and self._selected == name: + self._selected = None + elif action == 1: + if self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % name) + self._selected = name + + def toggle_single(self, by_label=False): + self._set_single_selected_state(2, by_label) + def set_single(self, selected, by_label=False): + action = int(bool(selected)) + self._set_single_selected_state(action, by_label) + + def _set_single_selected_state(self, action, by_label): + if len(self._menu) != 1: + raise ItemCountError("'%s' is not a single-item 
control" % + self.name) + + name = self._menu[0] + if by_label: + name = self._value_from_label(name) + self._set_selected_state(name, action, by_label) + + def get_item_disabled(self, name, by_label=False): + """Get disabled state of named list item in a ListControl.""" + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + return self._disabled_list[i] + + def set_item_disabled(self, disabled, name, by_label=False): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + self._disabled_list[i] = bool(disabled) + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for i in range(len(self._disabled_list)): + self._disabled_list[i] = bool(disabled) + + def get_item_attrs(self, name, by_label=False): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. + + Note that for SELECT controls, the returned dictionary contains a + special key "contents" -- see SelectControl.__doc__. 
+ + """ + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + return self._attrs_list[i] + + def add_to_form(self, form): + try: + control = form.find_control(self.name, self.type) + except ControlNotFoundError: + Control.add_to_form(self, form) + else: + control.merge_control(self) + + def merge_control(self, control): + assert bool(control.multiple) == bool(self.multiple) + assert isinstance(control, self.__class__) + self._menu.extend(control._menu) + self._attrs_list.extend(control._attrs_list) + self._disabled_list.extend(control._disabled_list) + if control.multiple: + self._selected.extend(control._selected) + else: + if control._value_is_set: + self._selected = control._selected + if control._value_is_set: + self._value_is_set = True + + def fixup(self): + """ + ListControls are built up from component list items (which are also + ListControls) during parsing. This method should be called after all + items have been added. See ListControl.__doc__ for the reason this is + required. + + """ + # Need to set default selection where no item was indicated as being + # selected by the HTML: + + # CHECKBOX: + # Nothing should be selected. + # SELECT/single, SELECT/multiple and RADIO: + # RFC 1866 (HTML 2.0): says first item should be selected. + # W3C HTML 4.01 Specification: says that client behaviour is + # undefined in this case. For RADIO, exactly one must be selected, + # though which one is undefined. + # Both Netscape and Microsoft Internet Explorer (IE) choose first + # item for SELECT/single. However, both IE5 and Mozilla (both 1.0 + # and Firebird 0.6) leave all items unselected for RADIO and + # SELECT/multiple. + + # Since both Netscape and IE all choose the first item for + # SELECT/single, we do the same. 
OTOH, both Netscape and IE + # leave SELECT/multiple with nothing selected, in violation of RFC 1866 + # (but not in violation of the W3C HTML 4 standard); the same is true + # of RADIO (which *is* in violation of the HTML 4 standard). We follow + # RFC 1866 if the select_default attribute is set, and Netscape and IE + # otherwise. RFC 1866 and HTML 4 are always violated insofar as you + # can deselect all items in a RadioControl. + + raise NotImplementedError() + + def __getattr__(self, name): + if name == "value": + menu = self._menu + if self.multiple: + values = [] + for i in range(len(menu)): + if self._selected[i]: values.append(menu[i]) + return values + else: + if self._selected is None: return [] + else: return [self._selected] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._set_value(value) + elif name in ("name", "type", "multiple"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _set_value(self, value): + if self.multiple: + self._multiple_set_value(value) + else: + self._single_set_value(value) + + def _single_set_value(self, value): + if value is None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + nr = len(value) + if not (0 <= nr <= 1): + raise ItemCountError("single selection list, must set sequence of " + "length 0 or 1") + + if nr == 0: + self._selected = None + else: + value = value[0] + try: + i = self._menu.index(value) + except ValueError: + raise ItemNotFoundError("no item named '%s'" % + repr(value)) + if self._disabled_list[i]: + raise AttributeError("item '%s' is disabled" % value) + self._selected = value + + def _multiple_set_value(self, value): + if value is 
None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + + selected = [False]*len(self._selected) + menu = self._menu + disabled_list = self._disabled_list + + for v in value: + found = False + for i in range(len(menu)): + item_name = menu[i] + if v == item_name: + if disabled_list[i]: + raise AttributeError("item '%s' is disabled" % value) + selected[i] = True + found = True + break + if not found: + raise ItemNotFoundError("no item named '%s'" % repr(v)) + self._selected = selected + + def set_value_by_label(self, value): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + def get_value_by_label(self): + raise NotImplementedError("control '%s' does not yet support " + "by_label" % self.name) + + def possible_items(self, by_label=False): + if by_label: + raise NotImplementedError( + "control '%s' does not yet support by_label" % self.name) + return copy.copy(self._menu) + + def pairs(self): + if self.disabled: + return [] + + if not self.multiple: + name = self.name + value = self._selected + if name is None or value is None: + return [] + return [(name, value)] + else: + control_name = self.name # usually the name HTML attribute + pairs = [] + for i in range(len(self._menu)): + item_name = self._menu[i] # usually the value HTML attribute + if self._selected[i]: + pairs.append((control_name, item_name)) + return pairs + + def _item_str(self, i): + item_name = self._menu[i] + if self.multiple: + if self._selected[i]: + item_name = "*"+item_name + else: + if self._selected == item_name: + item_name = "*"+item_name + if self._disabled_list[i]: + item_name = "(%s)" % item_name + return item_name + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + display = [] + for i in range(len(self._menu)): + s = self._item_str(i) + display.append(s) + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", 
") + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, string.join(display, ", "), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = False + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + if checked: + self._value_is_set = True + self._selected = value + else: + self._selected = None + + def fixup(self): + if not self._value_is_set: + # no item explicitly selected + assert self._selected is None + if self._select_default: + self._selected = self._menu[0] + self._value_is_set = True + + +class CheckboxControl(ListControl): + """ + Covers: + + INPUT/CHECKBOX + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = True + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + self._selected = [checked] + self._value_is_set = True + + def fixup(self): + # If no items were explicitly checked in HTML, that's how we must + # leave it, so we have nothing to do here. + assert self._value_is_set + + +class SelectControl(ListControl): + """ + Covers: + + SELECT (and OPTION) + + SELECT control values and labels are subject to some messy defaulting + rules. For example, if the HTML repreentation of the control is: + + <SELECT name=year> + <OPTION value=0 label="2002">current year</OPTION> + <OPTION value=1>2001</OPTION> + <OPTION>2000</OPTION> + </SELECT> + + The items, in order, have labels "2002", "2001" and "2000", whereas their + values are "0", "1" and "2000" respectively. 
Note that the value of the + last OPTION in this example defaults to its contents, as specified by RFC + 1866, as do the labels of the second and third OPTIONs. + + The OPTION labels are sometimes more meaningful than the OPTION values, + which can make for more maintainable code. + + Additional read-only public attribute: attrs + + The attrs attribute is a dictionary of the original HTML attributes of the + SELECT element. Other ListControls do not have this attribute, because in + other cases the control as a whole does not correspond to any single HTML + element. The get_item_attrs method may be used as usual to get at the + HTML attributes of the HTML elements corresponding to individual list items + (for SELECT controls, these are OPTION elements). + + Another special case is that the attributes dictionaries returned by + get_item_attrs have a special key "contents" which does not correspond to + any real HTML attribute, but rather contains the contents of the OPTION + element: + + <OPTION>this bit</OPTION> + + """ + # HTML attributes here are treated slightly from other list controls: + # -The SELECT HTML attributes dictionary is stuffed into the OPTION + # HTML attributes dictionary under the "__select" key. + # -The content of each OPTION element is stored under the special + # "contents" key of the dictionary. + # After all this, the dictionary is passed to the SelectControl constructor + # as the attrs argument, as usual. However: + # -The first SelectControl constructed when building up a SELECT control + # has a constructor attrs argument containing only the __select key -- so + # this SelectControl represents an empty SELECT control. + # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and + # the __select dictionary containing the SELECT HTML-attributes. 
+ def __init__(self, type, name, attrs, select_default=False): + # fish out the SELECT HTML attributes from the OPTION HTML attributes + # dictionary + self.attrs = attrs["__select"].copy() + attrs = attrs.copy() + del attrs["__select"] + + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + + self._label_map = None + self.disabled = self.attrs.has_key("disabled") + self.id = self.attrs.get("id") + + self._menu = [] + self._selected = [] + self._value_is_set = False + if self.attrs.has_key("multiple"): + self.__dict__["multiple"] = True + self._selected = [] + else: + self.__dict__["multiple"] = False + self._selected = None + + if attrs: # OPTION item data was provided + value = attrs["value"] + self._menu.append(value) + selected = attrs.has_key("selected") + if selected: + self._value_is_set = True + if self.attrs.has_key("multiple"): + self._selected.append(selected) + elif selected: + self._selected = value + + def _build_select_label_map(self): + """Return an ordered mapping of labels to values. + + For example, if the HTML repreentation of the control is as given in + SelectControl.__doc__, this function will return a mapping like: + + {"2002": "0", "2001": "1", "2000": "2000"} + + """ + alist = [] + for val in self._menu: + attrs = self.get_item_attrs(val) + alist.append((attrs["label"], val)) + return AList(alist) + + def _value_from_label(self, label): + try: + return self._label_map[label] + except KeyError: + raise ItemNotFoundError("no item has label '%s'" % label) + + def fixup(self): + if not self._value_is_set: + # No item explicitly selected. 
+ if len(self._menu) > 0: + if self.multiple: + if self._select_default: + self._selected[0] = True + else: + assert self._selected is None + self._selected = self._menu[0] + self._value_is_set = True + self._label_map = self._build_select_label_map() + + def _delete_items(self): + # useful for simulating JavaScript code, but not a stable interface yet + self._menu = [] + self._value_is_set = False + if self.multiple: + self._selected = [] + else: + self._selected = None + + def possible_items(self, by_label=False): + if not by_label: + return copy.copy(self._menu) + else: + self._label_map.set_inverted(True) + try: + r = map(lambda v, self=self: self._label_map[v], self._menu) + finally: + self._label_map.set_inverted(False) + return r + + def set_value_by_label(self, value): + if isstringlike(value): + raise TypeError("ListControl, must set a sequence, not a string") + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + + try: + value = map(lambda v, self=self: self._label_map[v], value) + except KeyError, e: + raise ItemNotFoundError("no item has label '%s'" % e.args[0]) + self._set_value(value) + + def get_value_by_label(self): + menu = self._menu + self._label_map.set_inverted(True) + try: + if self.multiple: + values = [] + for i in range(len(menu)): + if self._selected[i]: + values.append(self._label_map[menu[i]]) + return values + else: + return [self._label_map[self._selected]] + finally: + self._label_map.set_inverted(False) + + +#--------------------------------------------------- +class SubmitControl(ScalarControl): + """ + Covers: + + INPUT/SUBMIT + BUTTON/SUBMIT + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it + # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem + # to define this. 
+ if self.value is None: self.value = "" + self.readonly = True + + def is_of_kind(self, kind): return kind == "clickable" + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + self._clicked = coord + r = form._switch_click(return_type, request_class) + self._clicked = False + return r + + def pairs(self): + if not self._clicked: + return [] + return ScalarControl.pairs(self) + + +#--------------------------------------------------- +class ImageControl(SubmitControl): + """ + Covers: + + INPUT/IMAGE + + Coordinates are specified using one of the HTMLForm.click* methods. + + """ + def __init__(self, type, name, attrs): + SubmitControl.__init__(self, type, name, attrs) + self.readonly = False + + def pairs(self): + clicked = self._clicked + if self.disabled or not clicked: + return [] + name = self.name + if name is None: return [] + pairs = [ + ("%s.x" % name, str(clicked[0])), + ("%s.y" % name, str(clicked[1])), + ] + value = self._value + if value: + pairs.append((name, value)) + return pairs + +# aliases, just to make str(control) and str(form) clearer +class PasswordControl(TextControl): pass +class HiddenControl(TextControl): pass +class TextareaControl(TextControl): pass +class SubmitButtonControl(SubmitControl): pass + + +def is_listcontrol(control): return control.is_of_kind("list") + + +class HTMLForm: + """Represents a single HTML <form> ... </form> element. + + A form consists of a sequence of controls that usually have names, and + which can take on various values. The values of the various types of + controls represent variously: text, zero-or-one-of-many or many-of-many + choices, and files to be uploaded. Some controls can be clicked on to + submit the form, and clickable controls' values sometimes include the + coordinates of the click. 
+ + Forms can be filled in with data to be returned to the server, and then + submitted, using the click method to generate a request object suitable for + passing to urllib2.urlopen (or the click_request_data or click_pairs + methods if you're not using urllib2). + + import ClientForm + forms = ClientForm.ParseFile(html, base_uri) + form = forms[0] + + form["query"] = "Python" + form.set("lots", "nr_results") + + response = urllib2.urlopen(form.click()) + + Usually, HTMLForm instances are not created directly. Instead, the + ParseFile or ParseResponse factory functions are used. If you do construct + HTMLForm objects yourself, however, note that an HTMLForm instance is only + properly initialised after the fixup method has been called (ParseFile and + ParseResponse do this for you). See ListControl.__doc__ for the reason + this is required. + + Indexing a form (form["control_name"]) returns the named Control's value + attribute. Assignment to a form index (form["control_name"] = something) + is equivalent to assignment to the named Control's value attribute. If you + need to be more specific than just supplying the control's name, use the + set_value and get_value methods. + + ListControl values are lists of item names. The list item's name is the + value of the corresponding HTML element's "value" attribute. + + Example: + + <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT> + <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT> + + defines a CHECKBOX control with name "cheeses" which has two items, named + "leicester" and "cheddar". + + Another example: + + <SELECT name="more_cheeses"> + <OPTION>1</OPTION> + <OPTION value="2" label="CHEDDAR">cheddar</OPTION> + </SELECT> + + defines a SELECT control with name "more_cheeses" which has two items, + named "1" and "2" (because the OPTION element's value HTML attribute + defaults to the element contents). + + To set, clear or toggle individual list items, use the set and toggle + methods. 
To set the whole value, do as for any other control:use indexing + or the set_/get_value methods. + + Example: + + # select *only* the item named "cheddar" + form["cheeses"] = ["cheddar"] + # select "cheddar", leave other items unaffected + form.set("cheddar", "cheeses") + + Some controls (RADIO and SELECT without the multiple attribute) can only + have zero or one items selected at a time. Some controls (CHECKBOX and + SELECT with the multiple attribute) can have multiple items selected at a + time. To set the whole value of a ListControl, assign a sequence to a form + index: + + form["cheeses"] = ["cheddar", "leicester"] + + If the ListControl is not multiple-selection, the assigned list must be of + length one. + + To check whether a control has an item, or whether an item is selected, + respectively: + + "cheddar" in form.possible_items("cheeses") + "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses")) + + Note that some list items may be disabled (see below). + + Note the following mistake: + + form[control_name] = control_value + assert form[control_name] == control_value # not necessarily true + + The reason for this is that form[control_name] always gives the list items + in the order they were listed in the HTML. + + List items (hence list values, too) can be referred to in terms of list + item labels rather than list item names. Currently, this is only possible + for SELECT controls (this is a bug). To use this feature, use the by_label + arguments to the various HTMLForm methods. Note that it is *item* names + (hence ListControl values also), not *control* names, that can be referred + to by label. + + The question of default values of OPTION contents, labels and values is + somewhat complicated: see SelectControl.__doc__ and + ListControl.get_item_attrs.__doc__ if you think you need to know. + + Controls can be disabled or readonly. 
In either case, the control's value + cannot be changed until you clear those flags (see example below). + Disabled is the state typically represented by browsers by `greying out' a + control. Disabled controls are not `successful' -- they don't cause data + to get returned to the server. Readonly controls usually appear in + browsers as read-only text boxes. Readonly controls are successful. List + items can also be disabled. Attempts to select disabled items (with + form[name] = value, or using the ListControl.set method, for example) fail. + Attempts to clear disabled items are allowed. + + If a lot of controls are readonly, it can be useful to do this: + + form.set_all_readonly(False) + + When you want to do several things with a single control, or want to do + less common things, like changing which controls and items are disabled, + you can get at a particular control: + + control = form.find_control("cheeses") + control.disabled = False + control.readonly = False + control.set_item_disabled(False, "gruyere") + control.set("gruyere") + + Most methods on HTMLForm just delegate to the contained controls, so see + the docstrings of the various Control classes for further documentation. + Most of these delegating methods take name, type, kind, id and nr arguments + to specify the control to be operated on: see + HTMLForm.find_control.__doc__. + + ControlNotFoundError (subclass of ValueError) is raised if the specified + control can't be found. This includes occasions where a non-ListControl + is found, but the method (set, for example) requires a ListControl. + ItemNotFoundError (subclass of ValueError) is raised if a list item can't + be found. ItemCountError (subclass of ValueError) is raised if an attempt + is made to select more than one item and the control doesn't allow that, or + set/get_single are called and the control contains more than one item. 
+ AttributeError is raised if a control or item is readonly or disabled and + an attempt is made to alter its value. + + XXX CheckBoxControl and RadioControl don't yet support item access by label + + Security note: Remember that any passwords you store in HTMLForm instances + will be saved to disk in the clear if you pickle them (directly or + indirectly). The simplest solution to this is to avoid pickling HTMLForm + objects. You could also pickle before filling in any password, or just set + the password to "" before pickling. + + + Public attributes: + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form (None if no name was specified) + attrs: dictionary mapping original HTML form attributes to their values + + controls: list of Control instances; do not alter this list + (instead, call form.new_control to make a Control and add it to the + form, or control.add_to_form if you already have a Control instance) + + + + Methods for form filling: + ------------------------- + + Most of the these methods have very similar arguments. See + HTMLForm.find_control.__doc__ for details of the name, type, kind and nr + arguments. See above for a description of by_label. 
+ + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None) + + get_value(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + set_value(value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set_all_readonly(readonly) + + + Methods applying only to ListControls: + + possible_items(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set(selected, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle(item_name, + name=None, type=None, id=None, nr=None, + by_label=False) + + set_single(selected, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle_single(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + + Method applying only to FileControls: + + add_file(file_object, + content_type="application/octet-stream", filename=None, + name=None, id=None, nr=None) + + + Methods applying only to clickable controls: + + click(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1)) + + """ + + type2class = { + "text": TextControl, + "password": PasswordControl, + "hidden": HiddenControl, + "textarea": TextareaControl, + + "isindex": IsindexControl, + + "file": FileControl, + + "button": IgnoreControl, + "buttonbutton": IgnoreControl, + "reset": IgnoreControl, + "resetbutton": IgnoreControl, + + "submit": SubmitControl, + "submitbutton": SubmitButtonControl, + "image": ImageControl, + + "radio": RadioControl, + "checkbox": CheckboxControl, + "select": SelectControl, + } + +#--------------------------------------------------- +# Initialisation. Use ParseResponse / ParseFile instead. 
+ + def __init__(self, action, method="GET", + enctype="application/x-www-form-urlencoded", + name=None, attrs=None, + request_class=urllib2.Request): + """ + In the usual case, use ParseResponse (or ParseFile) to create new + HTMLForm objects. + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form + attrs: dictionary mapping original HTML form attributes to their values + + """ + self.action = action + self.method = method + self.enctype = enctype + self.name = name + if attrs is not None: + self.attrs = attrs.copy() + else: + self.attrs = {} + self.controls = [] + self._request_class = request_class + + def new_control(self, type, name, attrs, + ignore_unknown=False, select_default=False): + """Adds a new control to the form. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + Note that controls representing lists of items are built up from + controls holding only a single list item. See ListControl.__doc__ for + further information. + + type: type of control (see Control.__doc__ for a list) + attrs: HTML attributes of control + ignore_unknown: if true, use a dummy Control instance for controls of + unknown type; otherwise, raise ValueError + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present (this defaulting happens when the HTMLForm.fixup method is + called) + + """ + type = string.lower(type) + klass = self.type2class.get(type) + if klass is None: + if ignore_unknown: + klass = IgnoreControl + else: + raise ValueError("Unknown control type '%s'" % type) + + a = attrs.copy() + if issubclass(klass, ListControl): + control = klass(type, name, a, select_default) + else: + control = klass(type, name, a) + control.add_to_form(self) + + def fixup(self): + """Normalise form after all controls have been added. 
+ + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + This method should only be called once, after all controls have been + added to the form. + + """ + for control in self.controls: + control.fixup() + +#--------------------------------------------------- + def __str__(self): + header = "%s %s %s" % (self.method, self.action, self.enctype) + rep = [header] + for control in self.controls: + rep.append(" %s" % str(control)) + return "<%s>" % string.join(rep, "\n") + +#--------------------------------------------------- +# Form-filling methods. + + def __getitem__(self, name): + return self.find_control(name).value + def __setitem__(self, name, value): + control = self.find_control(name) + try: + control.value = value + except AttributeError, e: + raise ValueError(str(e)) + + def get_value(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Return value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] + + """ + c = self.find_control(name, type, kind, id, nr=nr) + if by_label: + try: + meth = c.get_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + return meth() + else: + return c.value + def set_value(self, value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Set value of control. 
+ + If only name and value arguments are supplied, equivalent to + + form[name] = value + + """ + c = self.find_control(name, type, kind, id, nr=nr) + if by_label: + try: + meth = c.set_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + meth(value) + else: + c.value = value + + def set_all_readonly(self, readonly): + for control in self.controls: + control.readonly = bool(readonly) + + +#--------------------------------------------------- +# Form-filling methods applying only to ListControls. + + def possible_items(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Return a list of all values that the specified control can take.""" + c = self._find_list_control(name, type, kind, id, nr) + return c.possible_items(by_label) + + def set(self, selected, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Select / deselect named list item. + + selected: boolean selected state + + """ + self._find_list_control(name, type, kind, id, nr).set( + selected, item_name, by_label) + def toggle(self, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Toggle selected state of named list item.""" + self._find_list_control(name, type, kind, id, nr).toggle( + item_name, by_label) + + def set_single(self, selected, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Select / deselect list item in a control having only one item. + + If the control has multiple list items, ItemCountError is raised. + + This is just a convenience method, so you don't need to know the item's + name -- the item name in these single-item controls is usually + something meaningless like "1" or "on". 
+ + For example, if a checkbox has a single item named "on", the following + two calls are equivalent: + + control.toggle("on") + control.toggle_single() + + """ + self._find_list_control(name, type, kind, id, nr).set_single( + selected, by_label) + def toggle_single(self, name=None, type=None, kind=None, id=None, nr=None, + by_label=False): + """Toggle selected state of list item in control having only one item. + + The rest is as for HTMLForm.set_single.__doc__. + + """ + self._find_list_control(name, type, kind, id, nr).toggle_single( + by_label) + +#--------------------------------------------------- +# Form-filling method applying only to FileControls. + + def add_file(self, file_object, content_type=None, filename=None, + name=None, id=None, nr=None): + """Add a file to be uploaded. + + file_object: file-like object (with read method) from which to read + data to upload + content_type: MIME content type of data to upload + filename: filename to pass to server + + If filename is None, no filename is sent to the server. + + If content_type is None, the content type is guessed based on the + filename and the data from read from the file object. + + XXX + At the moment, guessed content type is always application/octet-stream. + Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and + plain text. + + Note the following useful HTML attributes of file upload controls (see + HTML 4.01 spec, section 17): + + accept: comma-separated list of content types that the server will + handle correctly; you can use this to filter out non-conforming files + size: XXX IIRC, this is indicative of whether form wants multiple or + single files + maxlength: XXX hint of max content length in bytes? + + """ + self.find_control(name, "file", id=id, nr=nr).add_file( + file_object, content_type, filename) + +#--------------------------------------------------- +# Form submission methods, applying only to clickable controls. 
+ + def click(self, name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=urllib2.Request): + """Return request that would result from clicking on a control. + + The request object is a urllib2.Request instance, which you can pass to + urllib2.urlopen (or ClientCookie.urlopen). + + Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and + IMAGEs) can be clicked. + + Will click on the first clickable control, subject to the name, type + and nr arguments (as for find_control). If no name, type, id or number + is specified and there are no clickable controls, a request will be + returned for the form in its current, un-clicked, state. + + IndexError is raised if any of name, type, id or nr is specified but no + matching control is found. ValueError is raised if the HTMLForm has an + enctype attribute that is not recognised. + + You can optionally specify a coordinate to click at, which only makes a + difference if you clicked on an image. + + """ + return self._click(name, type, id, nr, coord, "request", + self._request_class) + + def click_request_data(self, + name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=urllib2.Request): + """As for click method, but return a tuple (url, data, headers). + + You can use this data to send a request to the server. This is useful + if you're using httplib or urllib rather than urllib2. Otherwise, use + the click method. + + # Untested. Have to subclass to add headers, I think -- so use urllib2 + # instead! + import urllib + url, data, hdrs = form.click_request_data() + r = urllib.urlopen(url, data) + + # Untested. I don't know of any reason to use httplib -- you can get + # just as much control with urllib2. 
+ import httplib, urlparse + url, data, hdrs = form.click_request_data() + tup = urlparse(url) + host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:]) + conn = httplib.HTTPConnection(host) + if data: + httplib.request("POST", path, data, hdrs) + else: + httplib.request("GET", path, headers=hdrs) + r = conn.getresponse() + + """ + return self._click(name, type, id, nr, coord, "request_data", + self._request_class) + + def click_pairs(self, name=None, type=None, id=None, nr=0, coord=(1,1)): + """As for click_request_data, but returns a list of (key, value) pairs. + + You can use this list as an argument to ClientForm.urlencode. This is + usually only useful if you're using httplib or urllib rather than + urllib2 or ClientCookie. It may also be useful if you want to manually + tweak the keys and/or values, but this should not be necessary. + Otherwise, use the click method. + + Note that this method is only useful for forms of MIME type + x-www-form-urlencoded. In particular, it does not return the + information required for file upload. If you need file upload and are + not using urllib2, use click_request_data. + + Also note that Python 2.0's urllib.urlencode is slightly broken: it + only accepts a mapping, not a sequence of pairs, as an argument. This + messes up any ordering in the argument. Use ClientForm.urlencode + instead. + + """ + return self._click(name, type, id, nr, coord, "pairs", + self._request_class) + +#--------------------------------------------------- + + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None): + """Locate and return some specific control within the form. + + At least one of the name, type, kind, predicate and nr arguments must + be supplied. If no matching control is found, ControlNotFoundError is + raised. + + If name is specified, then the control must have the indicated name. 
+ + If type is specified then the control must have the specified type (in + addition to the types possible for <input> HTML tags: "text", + "password", "hidden", "submit", "image", "button", "radio", "checkbox", + "file" we also have "reset", "buttonbutton", "submitbutton", + "resetbutton", "textarea", "select" and "isindex"). + + If kind is specified, then the control must fall into the specified + group, each of which satisfies a particular interface. The types are + "text", "list", "multilist", "singlelist", "clickable" and "file". + + If id is specified, then the control must have the indicated id. + + If predicate is specified, then the control must match that function. + The predicate function is passed the control as its single argument, + and should return a boolean value indicating whether the control + matched. + + nr, if supplied, is the sequence number of the control (where 0 is the + first). Note that control 0 is the first control matching all the + other arguments (if supplied); it is not necessarily the first control + in the form. + + """ + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (predicate is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + if nr is None: nr = 0 + + return self._find_control(name, type, kind, id, predicate, nr) + +#--------------------------------------------------- +# Private methods. 
+ + def _find_list_control(self, + name=None, type=None, kind=None, id=None, nr=None): + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + if nr is None: nr = 0 + + return self._find_control(name, type, kind, id, is_listcontrol, nr) + + def _find_control(self, name, type, kind, id, predicate, nr): + if (name is not None) and not isstringlike(name): + raise TypeError("control name must be string-like") + if (type is not None) and not isstringlike(type): + raise TypeError("control type must be string-like") + if (kind is not None) and not isstringlike(kind): + raise TypeError("control kind must be string-like") + if (id is not None) and not isstringlike(id): + raise TypeError("control id must be string-like") + if (predicate is not None) and not callable(predicate): + raise TypeError("control predicate must be callable") + if nr < 0: raise ValueError("control number must be a positive " + "integer") + + orig_nr = nr + + for control in self.controls: + if name is not None and name != control.name: + continue + if type is not None and type != control.type: + continue + if kind is not None and not control.is_of_kind(kind): + continue + if id is not None and id != control.id: + continue + if predicate and not predicate(control): + continue + if nr: + nr = nr - 1 + continue + return control + + description = [] + if name is not None: description.append("name '%s'" % name) + if type is not None: description.append("type '%s'" % type) + if kind is not None: description.append("kind '%s'" % kind) + if id is not None: description.append("id '%s'" % id) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr: description.append("nr %d" % orig_nr) + description = string.join(description, ", ") + raise ControlNotFoundError("no control matching "+description) + + def _click(self, name, type, id, nr, coord, return_type, 
+ request_class=urllib2.Request): + try: + control = self._find_control(name, type, "clickable", id, None, nr) + except ControlNotFoundError: + if ((name is not None) or (type is not None) or (id is not None) or + (nr != 0)): + raise + # no clickable controls, but no control was explicitly requested, + # so return state without clicking any control + return self._switch_click(return_type) + else: + return control._click(self, coord, return_type, request_class) + + def _pairs(self): + """Return sequence of (key, value) pairs suitable for urlencoding.""" + pairs = [] + for control in self.controls: + pairs.extend(control.pairs()) + return pairs + + def _request_data(self): + """Return a tuple (url, data, headers).""" + method = string.upper(self.method) + #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action) + parts = urlparse.urlparse(self.action) + rest, (query, frag) = parts[:-2], parts[-2:] + + if method == "GET": + if self.enctype != "application/x-www-form-urlencoded": + raise ValueError( + "unknown GET form encoding type '%s'" % self.enctype) + parts = rest + (urlencode(self._pairs()), "") + uri = urlparse.urlunparse(parts) + return uri, None, [] + elif method == "POST": + parts = rest + (query, "") + uri = urlparse.urlunparse(parts) + if self.enctype == "application/x-www-form-urlencoded": + return (uri, urlencode(self._pairs()), + [("Content-type", self.enctype)]) + elif self.enctype == "multipart/form-data": + data = StringIO() + http_hdrs = [] + mw = MimeWriter(data, http_hdrs) + f = mw.startmultipartbody("form-data", add_to_http_hdrs=True, + prefix=0) + for control in self.controls: + control._write_mime_data(mw) + mw.lastpart() + return uri, data.getvalue(), http_hdrs + else: + raise ValueError( + "unknown POST form encoding type '%s'" % self.enctype) + else: + raise ValueError("Unknown method '%s'" % method) + + def _switch_click(self, return_type, request_class=urllib2.Request): + # This is called by HTMLForm and clickable 
Controls to hide switching + # on return_type. + if return_type == "pairs": + return self._pairs() + elif return_type == "request_data": + return self._request_data() + else: + req_data = self._request_data() + req = request_class(req_data[0], req_data[1]) + for key, val in req_data[2]: + req.add_header(key, val) + return req diff --git a/LTA/LTAIngest/ClientForm-0.1.17/GeneralFAQ.html b/LTA/LTAIngest/ClientForm-0.1.17/GeneralFAQ.html new file mode 100644 index 0000000000000000000000000000000000000000..878b54f0b4fb84354d93aa230f71ae93bb6ed525 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/GeneralFAQ.html @@ -0,0 +1,139 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="FAQ,cookie,HTTP,HTML,form,table,Python,web,client,client-side,testing,sniffer,https,script,embedded"> + <title>Python web-client programming general FAQs</title> + <style type="text/css" media="screen">@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/bits/clientx.html"> +</head> +<body> + +<div id="sf"><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> +<!--<img src="../images/sflogo.png"--> + +<h1>Python web-client programming general FAQs</h1> + +<div id="Content"> +<ul> + <li>Is there any example code? + <p>There's (still!) a bit of a shortage of example code for ClientCookie + and ClientForm &co., because the stuff I've written tends to either + require access to restricted-access sites, or is proprietary code (and the + same goes for other people's code). + <li>HTTPS on Windows? 
+ <p>Use this <a href="http://pypgsql.sourceforge.net/misc/python22-win32-ssl.zip"> + _socket.pyd</a>, or use Python 2.3. + <li>I want to see what my web browser is doing, but standard network sniffers + like <a href="http://www.ethereal.com/">ethereal</a> or netcat (nc) don't + work for HTTPS. How do I sniff HTTPS traffic? + <p>Three good options: + <ul> + <li>Mozilla plugin: <a href="http://livehttpheaders.mozdev.org/"> + livehttpheaders</a>. + <li><a href="http://www.blunck.info/iehttpheaders.html">ieHTTPHeaders</a> + does the same for MSIE. + <li>Use <a href="http://lynx.browser.org/">lynx</a> <code>-trace</code>, + and filter out the junk with a script. + </ul> + <p>I'm told you can also use a proxy like <a + href="http://www.proxomitron.info/">proxomitron</a> (never tried it + myself). There's also a commercial <a href="http://www.simtec.ltd.uk/">MSIE + plugin</a>. + <li>Embedded script is messing up my web-scraping. What do I do? + <p>It is possible to embed script in HTML pages (sandwiched between + <code><SCRIPT>here</SCRIPT></code> tags, and in + <code>javascript:</code> URLs) - JavaScript / ECMAScript, VBScript, or + even Python. These scripts can do all sorts of things, including causing + cookies to be set in a browser, submitting or filling in parts of forms in + response to user actions, changing link colours as the mouse moves over a + link, etc. + + <p>If you come across this in a page you want to automate, you + have four options. Here they are, roughly in order of simplicity. + + <ul> + <li>Simply figure out what the embedded script is doing and emulate it + in your Python code: for example, by manually adding cookies to your + <code>CookieJar</code> instance, calling methods on + <code>HTMLForm</code>s, calling <code>urlopen</code>, etc. + <li>Dump ClientCookie and ClientForm and automate a browser instead + (eg. 
use MS Internet Explorer via its COM automation interfaces, using + the <a href="http://starship.python.net/crew/mhammond/">Python for + Windows extensions</a>, XXX Mozilla automation & XPCOM / PyXPCOM, + Konqueror & DCOP / KParts / PyKDE). + <li>Use Java's <a href="httpunit.sourceforge.net">httpunit</a> from + Jython, since it knows some JavaScript. + <li>Get ambitious and automatically delegate the work to an appropriate + interpreter (Mozilla's JavaScript interpreter, for instance). This + approach is the one taken by <a href="../DOMForm">DOMForm</a> (the + JavaScript support is "very alpha", though!). + </ul> + <li>Misc links + <ul> + <li>Another Java thing: <a href="http://maxq.tigris.org/">maxq</a>, + which provides a proxy to aid automatic generation of functional tests + written in Jython using the standard library unittest module (PyUnit) + and the "Jakarta Commons" HttpClient library. + <li>A useful set Zope-oriented links on <a + href="http://viii.dclxvi.org/bookmarks/tech/zope/test">tools for testing + web applications</a>. + <li>O'Reilly book: <a href="">Spidering Hacks</a>. Very Perl-oriented. + <li>Useful + <a href="http://chrispederick.myacen.com/work/firebird/webdeveloper/"> + Mozilla plugin</a> which, amongst other things, can display HTML form + information and HTML table structure(thanks to Erno Kuusela for this + link). + <li> + <a href="http://www.iopus.com/iim.htm">IOpus Internet Macros</a> Cheap + and nasty macro recording for IE. It works, just barely. Commercial + software. + <li> + <a href="http://www.opensourcetesting.org/functional.php">Open source + functional testing tools</a>. A nice list. + <li><a href="http://www.rexx.com/~dkuhlman/quixote_htmlscraping.html"> + A HOWTO on web scraping</a> from Dave Kuhlman. + </ul> + <li>Will any of this code make its way into the Python standard library? + <p>The request / response processing extensions to urllib2 from ClientCookie + have been merged into urllib2 for Python 2.4. 
The cookie processing has + been added, as module cookielib. Eventually, I'll submit patches to get + the http-equiv, refresh, and robots.txt code in there too, and maybe + <code>mechanize.UserAgent</code> too (but <em>not</em> + <code>mechanize.Browser</code>). The rest, probably not. +</ul> +</div> <!--id="Content"--> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, January 2005. + +<hr> + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie">ClientCookie</a><br> +<a href="../ClientForm">ClientForm</a><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<span class="thispage">General FAQs</span><br> +<a href="./urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="./urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/INSTALL b/LTA/LTAIngest/ClientForm-0.1.17/INSTALL new file mode 100644 index 0000000000000000000000000000000000000000..ac12b0f0de5f14fa137107f119f09ce078eecdd3 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/INSTALL @@ -0,0 +1,63 @@ +ClientForm installation instructions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Dependencies +~~~~~~~~~~~~ + +Python 1.5.2 or above is required, and urllib2 is recommended. + + +Installation +~~~~~~~~~~~~ + +To install the package, run the following command: + + python setup.py build + +then (with appropriate permissions) + + python setup.py install + + +Alternatively, just copy the whole ClientForm.py into your Python +path (eg. unix: /usr/local/lib/python2.2/site-packages, +Windows: C:\Python21, or C:\Python22\Lib\site-packages). That's all +that setup.py does. 
+ + +To run the tests (none of which access the network), run the following +command: + + python test.py + +This runs the tests against the source files extracted from the +package. For help on command line options: + + python test.py --help + + +If you're using a pre-2.1 version of Python, you'll need to get +unittest.py (from http://pyunit.sourceforge.net) to run the Pyunit +tests. + +Bugs and comments to jjl@pobox.com. + + +NO WARRANTY + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +Copyright Notices + + (C) 2002-2003 John J. Lee. All rights reserved. + (C) 1998-2000 Gisle Aas. All rights reserved. (Original LWP code) + +This code in this package is free software; you can redistribute it +and/or modify it under the terms of the BSD license (see the file +COPYING). + +John J. Lee <jjl@pobox.com> +June 2003 diff --git a/LTA/LTAIngest/ClientForm-0.1.17/MANIFEST.in b/LTA/LTAIngest/ClientForm-0.1.17/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..2393119d33520ec119f5f92368df956da77543d8 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/MANIFEST.in @@ -0,0 +1,10 @@ +include MANIFEST.in +include COPYING +include INSTALL +include GeneralFAQ.html +include README.html.in +include README.html +include README.txt +include ChangeLog +include *.py +recursive-include testdata *.html diff --git a/LTA/LTAIngest/ClientForm-0.1.17/PKG-INFO b/LTA/LTAIngest/ClientForm-0.1.17/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..afc3d70ec4b35bb685b3970d7820f1ea12bfcd36 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/PKG-INFO @@ -0,0 +1,37 @@ +Metadata-Version: 1.0 +Name: ClientForm +Version: 0.1.17 +Summary: Client-side HTML form handling. +Home-page: http://wwwsearch.sourceforge.net/ClientForm/ +Author: John J. 
Lee +Author-email: jjl@pobox.com +License: BSD +Download-URL: http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.1.17.tar.gz +Description: ClientForm is a Python module for handling HTML forms on the client + side, useful for parsing HTML forms, filling them in and returning the + completed forms to the server. It developed from a port of Gisle Aas' + Perl module HTML::Form, from the libwww-perl library, but the + interface is not the same. + +Platform: any +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Topic :: Internet +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Software Development :: Testing +Classifier: Topic :: Software Development :: Testing :: Traffic Generation +Classifier: Topic :: System :: Networking :: Monitoring +Classifier: Topic :: System :: Systems Administration +Classifier: Topic :: Text Processing +Classifier: Topic :: Text Processing :: Markup +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML diff --git a/LTA/LTAIngest/ClientForm-0.1.17/README.html b/LTA/LTAIngest/ClientForm-0.1.17/README.html new file mode 100644 index 0000000000000000000000000000000000000000..519b830eb1d6c68165972787458be834bde4f030 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/README.html @@ -0,0 +1,363 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> 
+<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="form,HTML,Python,web,client,client-side"> + <title>ClientForm</title> + <style type="text/css" media="screen">@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/ClientForm/"> +</head> +<body> + + + +<div id=sf><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> + +<h1>ClientForm</h1> + +<div id="Content"> + +<p>ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module <code>HTML::Form</code>, from the <a +href="http://www.linpro.no/lwp/">libwww-perl</a> library, but the +interface is not the same. 
+ +<p>Simple example: + +<pre> + <span class="pykw">from</span> urllib2 <span class="pykw">import</span> urlopen + <span class="pykw">from</span> ClientForm <span class="pykw">import</span> ParseResponse + + forms = ParseResponse(urlopen(<span class="pystr">"http://www.example.com/form.html"</span>)) + form = forms[0] + <span class="pykw">print</span> form + form[<span class="pystr">"author"</span>] = <span class="pystr">"Gisle Aas"</span> + + <span class="pycmt"># form.click() returns a urllib2.Request object +</span> <span class="pycmt"># (see HTMLForm.click.__doc__ if you don't have urllib2) +</span> response = urlopen(form.click(<span class="pystr">"Thanks"</span>))</pre> + + +<p>A more complicated example: + +<pre> + <span class="pykw">import</span> ClientForm + <span class="pykw">import</span> urllib2 + request = urllib2.Request(<span class="pystr">"http://www.example.com/form.html"</span>) + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + <span class="pykw">print</span> form <span class="pycmt"># very useful!</span> + + <span class="pycmt"># Indexing allows setting and retrieval of control values +</span> original_text = form[<span class="pystr">"comments"</span>] <span class="pycmt"># a string, NOT a Control instance</span> + form[<span class="pystr">"comments"</span>] = <span class="pystr">"Blah."</span> + + <span class="pycmt"># Controls that represent lists (checkbox, select and radio lists) are +</span> <span class="pycmt"># ListControls. Their values are sequences of list item names. 
+</span> <span class="pycmt"># They come in two flavours: single- and multiple-selection: +</span> <span class="pykw">print</span> form.possible_items(<span class="pystr">"cheeses"</span>) + form[<span class="pystr">"favorite_cheese"</span>] = [<span class="pystr">"brie"</span>] <span class="pycmt"># single</span> + form[<span class="pystr">"cheeses"</span>] = [<span class="pystr">"parmesan"</span>, <span class="pystr">"leicester"</span>, <span class="pystr">"cheddar"</span>] <span class="pycmt"># multi</span> + <span class="pycmt"># is the "parmesan" item of the "cheeses" control selected? +</span> <span class="pykw">print</span> <span class="pystr">"parmesan"</span> <span class="pykw">in</span> form[<span class="pystr">"cheeses"</span>] + <span class="pycmt"># does cheeses control have a "caerphilly" item? +</span> <span class="pykw">print</span> <span class="pystr">"caerphilly"</span> <span class="pykw">in</span> form.possible_items(<span class="pystr">"cheeses"</span>) + + <span class="pycmt"># Sometimes one wants to set or clear individual items in a list: +</span> <span class="pycmt"># select the item named "gorgonzola" in the first control named "cheeses" +</span> form.set(True, <span class="pystr">"gorgonzola"</span>, <span class="pystr">"cheeses"</span>) + <span class="pycmt"># You can be more specific: supply at least one of name, type, kind, id +</span> <span class="pycmt"># and nr (most other methods on HTMLForm take the same form of arguments): +</span> <span class="pycmt"># deselect "edam" in third CHECKBOX control +</span> form.set(False, <span class="pystr">"edam"</span>, type=<span class="pystr">"checkbox"</span>, nr=2) + + <span class="pycmt"># You can explicitly say that you're referring to a ListControl: +</span> <span class="pycmt"># set whole value (rather than just one item of) "cheeses" ListControl +</span> form.set_value([<span class="pystr">"gouda"</span>], name=<span class="pystr">"cheeses"</span>, kind=<span class="pystr">"list"</span>) 
+ <span class="pycmt"># last example is almost equivalent to following (but insists that the +</span> <span class="pycmt"># control be a ListControl -- so it will skip any non-list controls that +</span> <span class="pycmt"># come before the control we want) +</span> form[<span class="pystr">"cheeses"</span>] = [<span class="pystr">"gouda"</span>] + <span class="pycmt"># The kind argument can also take values "multilist", "singlelist", "text", +</span> <span class="pycmt"># "clickable" and "file": +</span> <span class="pycmt"># find first control that will accept text, and scribble in it +</span> form.set_value(<span class="pystr">"rhubarb rhubarb"</span>, kind=<span class="pystr">"text"</span>) + form.set_value([<span class="pystr">""</span>], kind=<span class="pystr">"singlelist"</span>) + + <span class="pycmt"># Often, a single checkbox (a CHECKBOX control with a single item) is +</span> <span class="pycmt"># present. In that case, the name of the single item isn't of much +</span> <span class="pycmt"># interest, so it's useful to be able to check and uncheck the box +</span> <span class="pycmt"># without using the item name: +</span> form.set_single(True, <span class="pystr">"smelly"</span>) <span class="pycmt"># check</span> + form.set_single(False, <span class="pystr">"smelly"</span>) <span class="pycmt"># uncheck</span> + + <span class="pycmt"># Add files to FILE controls with .add_file(). Only call this multiple +</span> <span class="pycmt"># times if the server is expecting multiple files. 
+</span> <span class="pycmt"># add a file, default value for MIME type, no filename sent to server +</span> form.add_file(open(<span class="pystr">"data.dat"</span>)) + <span class="pycmt"># add a second file, explicitly giving MIME type, and telling the server +</span> <span class="pycmt"># what the filename is +</span> form.add_file(open(<span class="pystr">"data.txt"</span>), <span class="pystr">"text/plain"</span>, <span class="pystr">"data.txt"</span>) + + <span class="pycmt"># Many methods have a by_label argument, allowing specification of list +</span> <span class="pycmt"># items by label instead of by name. At the moment, only SelectControl +</span> <span class="pycmt"># supports this argument (this will be fixed). Sometimes labels are +</span> <span class="pycmt"># easier to maintain than names, sometimes the other way around. +</span> form.set_value([<span class="pystr">"Mozzarella"</span>, <span class="pystr">"Caerphilly"</span>], <span class="pystr">"cheeses"</span>, by_label=True) + + <span class="pycmt"># It's also possible to get at the individual controls inside the form. +</span> <span class="pycmt"># This is useful for calling several methods in a row on a single control, +</span> <span class="pycmt"># and for the less common operations. 
The methods are quite similar to +</span> <span class="pycmt"># those on HTMLForm: +</span> control = form.find_control(<span class="pystr">"cheeses"</span>, type=<span class="pystr">"select"</span>) + <span class="pykw">print</span> control.value, control.name, control.type + <span class="pykw">print</span> control.possible_items() + control.value = [<span class="pystr">"mascarpone"</span>, <span class="pystr">"curd"</span>] + control.set(True, <span class="pystr">"limburger"</span>) + + <span class="pycmt"># All Controls may be disabled (equivalent of greyed-out in browser) +</span> control = form.find_control(<span class="pystr">"comments"</span>) + <span class="pykw">print</span> control.disabled + <span class="pycmt"># ...or readonly +</span> <span class="pykw">print</span> control.readonly + <span class="pycmt"># readonly and disabled attributes can be assigned to +</span> control.disabled = False + <span class="pycmt"># convenience method, used here to make all controls writable (unless +</span> <span class="pycmt"># they're disabled): +</span> form.set_all_readonly(False) + <span class="pycmt"># ListControl items may also be disabled (setting a disabled item is not +</span> <span class="pycmt"># allowed, but clearing one is allowed): +</span> <span class="pykw">print</span> control.get_item_disabled(<span class="pystr">"emmenthal"</span>) + control.set_item_disabled(True, <span class="pystr">"emmenthal"</span>) + <span class="pycmt"># enable all items in control +</span> control.set_all_items_disabled(False) + + <span class="pycmt"># HTMLForm.controls is a list of all controls in the form +</span> <span class="pykw">for</span> control <span class="pykw">in</span> form.controls: + <span class="pykw">if</span> control.value == <span class="pystr">"inquisition"</span>: sys.exit() + + request2 = form.click() <span class="pycmt"># urllib2.Request object</span> + response2 = urllib2.urlopen(request2) + + <span class="pykw">print</span> response2.geturl() + <span 
class="pykw">print</span> response2.info() <span class="pycmt"># headers</span> + <span class="pykw">print</span> response2.read() <span class="pycmt"># body</span> + response2.close()</pre> + + +<p>All of the standard control types are supported: <code>TEXT</code>, +<code>PASSWORD</code>, <code>HIDDEN</code>, <code>TEXTAREA</code>, +<code>ISINDEX</code>, <code>RESET</code>, <code>BUTTON</code> (<code>INPUT +TYPE=BUTTON</code> and the various <code>BUTTON</code> types), +<code>SUBMIT</code>, <code>IMAGE</code>, <code>RADIO</code>, +<code>CHECKBOX</code>, <code>SELECT</code>/<code>OPTION</code> and +<code>FILE</code> (for file upload). Both standard form encodings +(<code>application/x-www-form-urlencoded</code> and +<code>multipart/form-data</code>) are supported. + +<p>The module is designed for testing and automation of web +interfaces, not for implementing interactive user agents. + +<p><strong><em>Security note</em>: Remember that any passwords you store in +<code>HTMLForm</code> instances will be saved to disk in the clear if you +pickle them (directly or indirectly). The simplest solution to this is to +avoid pickling <code>HTMLForm</code> objects. You could also pickle before +filling in any password, or just set the password to <code>""</code> before +pickling.</strong> + +<p>Python 1.5.2 or above is required. To run the tests, you need the +<code>unittest</code> module (from <a href="http://pyunit.sourceforge.net/">PyUnit</a>). +<code>unittest</code> is a standard library module with Python 2.1 and +above. + +<p>For full documentation, see the docstrings in ClientForm.py. + +<p><em><strong>Note: this page describes the 0.1.x interface. See <a +href="./src/README_0_0_15.html">here</a> for the old 0.0.x interface.</strong> +</em> + + +<a name="download"></a> +<h2>Download</h2> + +<p>For installation instructions, see the INSTALL file included in the +distribution. + +<p><em>Stable release.</em>. 
There have been many interface changes since +0.0.x, so I don't recommend upgrading old code from 0.0.x unless you want the +new features. + +<p>0.1.x includes <code>FILE</code> control support for file upload, handling +of disabled list items, and a redesigned interface. +<ul> +<li><a href="./src/ClientForm-0.1.17.tar.gz">ClientForm-0.1.17.tar.gz</a> +<li><a href="./src/ClientForm-0_1_17.zip">ClientForm-0_1_17.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + +<br> + +<p><em>Old release.</em> +<ul> +<li><a href="./src/ClientForm-0.0.16.tar.gz">ClientForm-0.0.16.tar.gz</a> +<li><a href="./src/ClientForm-0_0_16.zip">ClientForm-0_0_16.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + + +<a name="faq"></a> +<h2>FAQs</h2> +<ul> + <li>Doesn't the standard Python library module, <code>cgi</code>, do this? + <p>No: the <code>cgi</code> module does the server end of the job. It + doesn't know how to parse or fill in a form or how to send it back to the + server. + <li>Which version of Python do I need? + <p>1.5.2 or above. + <li>Is <code>urllib2</code> required? + <p>No. + <li>How do I use it without <code>urllib2</code>? + <p>Use <code>.click_request_data()</code> instead of <code>.click()</code>. + <li>Which <code>urllib2</code> do I need? + <p>You don't. It's convenient, though. If you have Python 2.0, you need to + upgrade to the version from Python 2.1 (available from <a + href="http://www.python.org/">www.python.org</a>). Alternatively, use the + 1.5.2-compatible version. If you have Python 1.5.2, use this <a + href="../bits/urllib2.py"><code>urllib2</code></a> and <a + href="../bits/urllib.py"><code>urllib</code></a>. Otherwise, you're OK. + <li>Which license? + <p>The <a href="http://www.opensource.org/licenses/bsd-license.php"> BSD + license</a> (included in distribution). 
+ + <li>Is XHTML supported? + <p>Yes, since 0.1.12. + <li>How do I figure out what control names and values to use? + <p><code>print form</code> is usually all you need. + <code>HTMLForm.possible_items</code> can be useful. Note that it's + possible to use item labels instead of item names, which can be useful + — use the <code>by_label</code> arguments to the various methods, + and the <code>.get_value_by_label()</code> / + <code>.set_value_by_label()</code> methods on <code>ListControl</code>. + Only <code>SelectControl</code> currently supports item labels (which + default to <code>OPTION</code> element contents). I might not bother to + fix this, since it seems it's probably only useful for <code>SELECT</code> + anyway. + <li>What do those <code>'*'</code> characters mean in the string + representations of list controls? + <p>A <code>*</code> next to an item means that item is selected. + <li>What do those parentheses (round brackets) mean in the string + representations of list controls? + <p>Parentheses <code>(foo)</code> around an item mean that item is disabled. + <li>Why doesn't <some control> turn up in the data returned by + <code>.click*()</code> when that control has non-<code>None</code> value? + <p>Either the control is disabled, or it is not successful for some other + reason. 'Successful' (see HTML 4 specification) means that the control + will cause data to get sent to the server. + <li>Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards for + <code>RADIO</code> and multiple-selection <code>SELECT</code> controls? + <p>Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items explicitly + selected in the HTML. Use the <code>select_default</code> argument to + <code>ParseResponse</code> if you want to follow the RFC 1866 rules + instead. Note that browser behaviour violates the HTML 4.01 specification + in the case of <code>RADIO</code> controls. 
+ <li>Why does <code>.click()</code>ing on a button not work for me? + <ul> + <li>Clicking on a <code>RESET</code> button doesn't do anything, by design + - this is a library for web automation, not an interactive browser. + Even in an interactive browser, clicking on <code>RESET</code> sends + nothing to the server, so there is little point in having + <code>.click()</code> do anything special here. + <li>Clicking on a <code>BUTTON TYPE=BUTTON</code> doesn't do anything + either, also by design. This time, the reason is that that + <code>BUTTON</code> is only in the HTML standard so that one can attach + callbacks to its events. The callbacks are functions in + <code>SCRIPT</code> elements (such as Javascript) embedded in the HTML, + and their execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these callbacks, so + it can't do anything useful with a click on a <code>BUTTON</code> whose + type is <code>BUTTON</code>. + <li>Generally, embedded script may be messing things up in all kinds of + ways. See the answer to the next question. + </ul> + <li>Embedded script is messing up my form filling. What do I do? + <p>See the <a href="../bits/GeneralFAQ.html">General FAQs</a> page for + what to do about this. +<!-- XXX example here --> + <li>I'm having trouble debugging my code. + <p>The <a href="../ClientCookie/">ClientCookie</a> package makes it + easy to get <code>.seek()</code>able response objects, which is + convenient for debugging. See also <a + href="../ClientCookie/doc.html#debugging">here</a> for few + relevant tips. Also see <a href="../bits/GeneralFAQ.html"> General + FAQs</a>. + <li>I have a control containing a list of integers. How do I select the one + whose value is nearest to the one I want? 
+<p><pre> + <span class="pykw">import</span> bisect + <span class="pykw">def</span> closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + <span class="pykw">return</span> str(values[bisect.bisect(values, value) - 1]) + + form[<span class="pystr">"distance"</span>] = [closest_int_value(form, <span class="pystr">"distance"</span>, 23)]</pre> + + </li> + <li>Where can I find out more about the HTML and HTTP standards? + <ul> + <li>W3C <a href="http://www.w3.org/TR/html401/">HTML 4.01 + Specification</a>. + <li><a href="http://www.ietf.org/rfc/rfc1866.txt">RFC 1866</a> - + the HTML 2.0 standard. + <li><a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a> - + Form-based file upload. + <li><a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a> - + HTTP 1.1 Specification. + </ul> +</ul> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, January 2005. + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie/">ClientCookie</a><br> +<span class="thispage">ClientForm</span><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable/">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<a href="../bits/GeneralFAQ.html">General FAQs</a><br> +<a href="../bits/urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="../bits/urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +<a href="../#other">Other stuff</a><br> + +<br> + +<a href="./#download">Download</a><br> +<a href="./#faq">FAQs</a><br> + +</div> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/README.html.in b/LTA/LTAIngest/ClientForm-0.1.17/README.html.in new file mode 100644 index 0000000000000000000000000000000000000000..a2d1112108810556d01e7f475a9fbc9b000f751a --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/README.html.in @@ -0,0 +1,365 @@ +<!DOCTYPE html PUBLIC 
"-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <meta name="author" content="John J. Lee <jjl@@pobox.com>"> + <meta name="date" content="2005-01"> + <meta name="keywords" content="form,HTML,Python,web,client,client-side"> + <title>ClientForm</title> + <style type="text/css" media="screen">@@import "../styles/style.css";</style> + <base href="http://wwwsearch.sourceforge.net/ClientForm/"> +</head> +<body> + +@# This file is processed by EmPy to colorize Python source code +@# http://wwwsearch.sf.net/bits/colorize.py +@{from colorize import colorize} + +<div id=sf><a href="http://sourceforge.net"> +<img src="http://sourceforge.net/sflogo.php?group_id=48205&type=2" + width="125" height="37" alt="SourceForge.net Logo"></a></div> + +<h1>ClientForm</h1> + +<div id="Content"> + +<p>ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module <code>HTML::Form</code>, from the <a +href="http://www.linpro.no/lwp/">libwww-perl</a> library, but the +interface is not the same. + +<p>Simple example: + +@{colorize(r""" + from urllib2 import urlopen + from ClientForm import ParseResponse + + forms = ParseResponse(urlopen("http://www.example.com/form.html")) + form = forms[0] + print form + form["author"] = "Gisle Aas" + + # form.click() returns a urllib2.Request object + # (see HTMLForm.click.__doc__ if you don't have urllib2) + response = urlopen(form.click("Thanks")) +""")} + +<p>A more complicated example: + +@{colorize(r""" + import ClientForm + import urllib2 + request = urllib2.Request("http://www.example.com/form.html") + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + print form # very useful! 
+ + # Indexing allows setting and retrieval of control values + original_text = form["comments"] # a string, NOT a Control instance + form["comments"] = "Blah." + + # Controls that represent lists (checkbox, select and radio lists) are + # ListControls. Their values are sequences of list item names. + # They come in two flavours: single- and multiple-selection: + print form.possible_items("cheeses") + form["favorite_cheese"] = ["brie"] # single + form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi + # is the "parmesan" item of the "cheeses" control selected? + print "parmesan" in form["cheeses"] + # does cheeses control have a "caerphilly" item? + print "caerphilly" in form.possible_items("cheeses") + + # Sometimes one wants to set or clear individual items in a list: + # select the item named "gorgonzola" in the first control named "cheeses" + form.set(True, "gorgonzola", "cheeses") + # You can be more specific: supply at least one of name, type, kind, id + # and nr (most other methods on HTMLForm take the same form of arguments): + # deselect "edam" in third CHECKBOX control + form.set(False, "edam", type="checkbox", nr=2) + + # You can explicitly say that you're referring to a ListControl: + # set whole value (rather than just one item of) "cheeses" ListControl + form.set_value(["gouda"], name="cheeses", kind="list") + # last example is almost equivalent to following (but insists that the + # control be a ListControl -- so it will skip any non-list controls that + # come before the control we want) + form["cheeses"] = ["gouda"] + # The kind argument can also take values "multilist", "singlelist", "text", + # "clickable" and "file": + # find first control that will accept text, and scribble in it + form.set_value("rhubarb rhubarb", kind="text") + form.set_value([""], kind="singlelist") + + # Often, a single checkbox (a CHECKBOX control with a single item) is + # present. 
In that case, the name of the single item isn't of much + # interest, so it's useful to be able to check and uncheck the box + # without using the item name: + form.set_single(True, "smelly") # check + form.set_single(False, "smelly") # uncheck + + # Add files to FILE controls with .add_file(). Only call this multiple + # times if the server is expecting multiple files. + # add a file, default value for MIME type, no filename sent to server + form.add_file(open("data.dat")) + # add a second file, explicitly giving MIME type, and telling the server + # what the filename is + form.add_file(open("data.txt"), "text/plain", "data.txt") + + # Many methods have a by_label argument, allowing specification of list + # items by label instead of by name. At the moment, only SelectControl + # supports this argument (this will be fixed). Sometimes labels are + # easier to maintain than names, sometimes the other way around. + form.set_value(["Mozzarella", "Caerphilly"], "cheeses", by_label=True) + + # It's also possible to get at the individual controls inside the form. + # This is useful for calling several methods in a row on a single control, + # and for the less common operations. 
The methods are quite similar to + # those on HTMLForm: + control = form.find_control("cheeses", type="select") + print control.value, control.name, control.type + print control.possible_items() + control.value = ["mascarpone", "curd"] + control.set(True, "limburger") + + # All Controls may be disabled (equivalent of greyed-out in browser) + control = form.find_control("comments") + print control.disabled + # ...or readonly + print control.readonly + # readonly and disabled attributes can be assigned to + control.disabled = False + # convenience method, used here to make all controls writable (unless + # they're disabled): + form.set_all_readonly(False) + # ListControl items may also be disabled (setting a disabled item is not + # allowed, but clearing one is allowed): + print control.get_item_disabled("emmenthal") + control.set_item_disabled(True, "emmenthal") + # enable all items in control + control.set_all_items_disabled(False) + + # HTMLForm.controls is a list of all controls in the form + for control in form.controls: + if control.value == "inquisition": sys.exit() + + request2 = form.click() # urllib2.Request object + response2 = urllib2.urlopen(request2) + + print response2.geturl() + print response2.info() # headers + print response2.read() # body + response2.close() +""")} + +<p>All of the standard control types are supported: <code>TEXT</code>, +<code>PASSWORD</code>, <code>HIDDEN</code>, <code>TEXTAREA</code>, +<code>ISINDEX</code>, <code>RESET</code>, <code>BUTTON</code> (<code>INPUT +TYPE=BUTTON</code> and the various <code>BUTTON</code> types), +<code>SUBMIT</code>, <code>IMAGE</code>, <code>RADIO</code>, +<code>CHECKBOX</code>, <code>SELECT</code>/<code>OPTION</code> and +<code>FILE</code> (for file upload). Both standard form encodings +(<code>application/x-www-form-urlencoded</code> and +<code>multipart/form-data</code>) are supported. 
+ +<p>The module is designed for testing and automation of web +interfaces, not for implementing interactive user agents. + +<p><strong><em>Security note</em>: Remember that any passwords you store in +<code>HTMLForm</code> instances will be saved to disk in the clear if you +pickle them (directly or indirectly). The simplest solution to this is to +avoid pickling <code>HTMLForm</code> objects. You could also pickle before +filling in any password, or just set the password to <code>""</code> before +pickling.</strong> + +<p>Python 1.5.2 or above is required. To run the tests, you need the +<code>unittest</code> module (from <a href="http://pyunit.sourceforge.net/">PyUnit</a>). +<code>unittest</code> is a standard library module with Python 2.1 and +above. + +<p>For full documentation, see the docstrings in ClientForm.py. + +<p><em><strong>Note: this page describes the 0.1.x interface. See <a +href="./src/README_0_0_15.html">here</a> for the old 0.0.x interface.</strong> +</em> + + +<a name="download"></a> +<h2>Download</h2> + +<p>For installation instructions, see the INSTALL file included in the +distribution. + +<p><em>Stable release.</em>. There have been many interface changes since +0.0.x, so I don't recommend upgrading old code from 0.0.x unless you want the +new features. + +<p>0.1.x includes <code>FILE</code> control support for file upload, handling +of disabled list items, and a redesigned interface. 
+<ul> +<li><a href="./src/ClientForm-0.1.17.tar.gz">ClientForm-0.1.17.tar.gz</a> +<li><a href="./src/ClientForm-0_1_17.zip">ClientForm-0_1_17.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + +<br> + +<p><em>Old release.</em> +<ul> +<li><a href="./src/ClientForm-0.0.16.tar.gz">ClientForm-0.0.16.tar.gz</a> +<li><a href="./src/ClientForm-0_0_16.zip">ClientForm-0_0_16.zip</a> +<li><a href="./src/ChangeLog.txt">Change Log</a> (included in distribution) +<li><a href="./src/">Older versions.</a> +</ul> + + +<a name="faq"></a> +<h2>FAQs</h2> +<ul> + <li>Doesn't the standard Python library module, <code>cgi</code>, do this? + <p>No: the <code>cgi</code> module does the server end of the job. It + doesn't know how to parse or fill in a form or how to send it back to the + server. + <li>Which version of Python do I need? + <p>1.5.2 or above. + <li>Is <code>urllib2</code> required? + <p>No. + <li>How do I use it without <code>urllib2</code>? + <p>Use <code>.click_request_data()</code> instead of <code>.click()</code>. + <li>Which <code>urllib2</code> do I need? + <p>You don't. It's convenient, though. If you have Python 2.0, you need to + upgrade to the version from Python 2.1 (available from <a + href="http://www.python.org/">www.python.org</a>). Alternatively, use the + 1.5.2-compatible version. If you have Python 1.5.2, use this <a + href="../bits/urllib2.py"><code>urllib2</code></a> and <a + href="../bits/urllib.py"><code>urllib</code></a>. Otherwise, you're OK. + <li>Which license? + <p>The <a href="http://www.opensource.org/licenses/bsd-license.php"> BSD + license</a> (included in distribution). + + <li>Is XHTML supported? + <p>Yes, since 0.1.12. + <li>How do I figure out what control names and values to use? + <p><code>print form</code> is usually all you need. + <code>HTMLForm.possible_items</code> can be useful. 
Note that it's + possible to use item labels instead of item names, which can be useful + — use the <code>by_label</code> arguments to the various methods, + and the <code>.get_value_by_label()</code> / + <code>.set_value_by_label()</code> methods on <code>ListControl</code>. + Only <code>SelectControl</code> currently supports item labels (which + default to <code>OPTION</code> element contents). I might not bother to + fix this, since it seems it's probably only useful for <code>SELECT</code> + anyway. + <li>What do those <code>'*'</code> characters mean in the string + representations of list controls? + <p>A <code>*</code> next to an item means that item is selected. + <li>What do those parentheses (round brackets) mean in the string + representations of list controls? + <p>Parentheses <code>(foo)</code> around an item mean that item is disabled. + <li>Why doesn't <some control> turn up in the data returned by + <code>.click*()</code> when that control has non-<code>None</code> value? + <p>Either the control is disabled, or it is not successful for some other + reason. 'Successful' (see HTML 4 specification) means that the control + will cause data to get sent to the server. + <li>Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards for + <code>RADIO</code> and multiple-selection <code>SELECT</code> controls? + <p>Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items explicitly + selected in the HTML. Use the <code>select_default</code> argument to + <code>ParseResponse</code> if you want to follow the RFC 1866 rules + instead. Note that browser behaviour violates the HTML 4.01 specification + in the case of <code>RADIO</code> controls. + <li>Why does <code>.click()</code>ing on a button not work for me? + <ul> + <li>Clicking on a <code>RESET</code> button doesn't do anything, by design + - this is a library for web automation, not an interactive browser. 
+ Even in an interactive browser, clicking on <code>RESET</code> sends + nothing to the server, so there is little point in having + <code>.click()</code> do anything special here. + <li>Clicking on a <code>BUTTON TYPE=BUTTON</code> doesn't do anything + either, also by design. This time, the reason is that that + <code>BUTTON</code> is only in the HTML standard so that one can attach + callbacks to its events. The callbacks are functions in + <code>SCRIPT</code> elements (such as Javascript) embedded in the HTML, + and their execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these callbacks, so + it can't do anything useful with a click on a <code>BUTTON</code> whose + type is <code>BUTTON</code>. + <li>Generally, embedded script may be messing things up in all kinds of + ways. See the answer to the next question. + </ul> + <li>Embedded script is messing up my form filling. What do I do? + <p>See the <a href="../bits/GeneralFAQ.html">General FAQs</a> page for + what to do about this. +<!-- XXX example here --> + <li>I'm having trouble debugging my code. + <p>The <a href="../ClientCookie/">ClientCookie</a> package makes it + easy to get <code>.seek()</code>able response objects, which is + convenient for debugging. See also <a + href="../ClientCookie/doc.html#debugging">here</a> for few + relevant tips. Also see <a href="../bits/GeneralFAQ.html"> General + FAQs</a>. + <li>I have a control containing a list of integers. How do I select the one + whose value is nearest to the one I want? +<p>@{colorize(r""" + import bisect + def closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + return str(values[bisect.bisect(values, value) - 1]) + + form["distance"] = [closest_int_value(form, "distance", 23)] +""")} + </li> + <li>Where can I find out more about the HTML and HTTP standards? + <ul> + <li>W3C <a href="http://www.w3.org/TR/html401/">HTML 4.01 + Specification</a>. 
+ <li><a href="http://www.ietf.org/rfc/rfc1866.txt">RFC 1866</a> - + the HTML 2.0 standard. + <li><a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a> - + Form-based file upload. + <li><a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a> - + HTTP 1.1 Specification. + </ul> +</ul> + +<p><a href="mailto:jjl@@pobox.com">John J. Lee</a>, January 2005. + +</div> + +<div id="Menu"> + +<a href="..">Home</a><br> +<!--<a href=""></a><br>--> + +<br> + +<a href="../ClientCookie/">ClientCookie</a><br> +<span class="thispage">ClientForm</span><br> +<a href="../DOMForm/">DOMForm</a><br> +<a href="../python-spidermonkey/">python-spidermonkey</a><br> +<a href="../ClientTable/">ClientTable</a><br> +<a href="../mechanize/">mechanize</a><br> +<a href="../pullparser/">pullparser</a><br> +<a href="../bits/GeneralFAQ.html">General FAQs</a><br> +<a href="../bits/urllib2_152.py">1.5.2 urllib2.py</a><br> +<a href="../bits/urllib_152.py">1.5.2 urllib.py</a><br> + +<br> + +<a href="../#other">Other stuff</a><br> + +<br> + +<a href="./#download">Download</a><br> +<a href="./#faq">FAQs</a><br> + +</div> + +</body> +</html> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/README.txt b/LTA/LTAIngest/ClientForm-0.1.17/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..737fb3f2a919d32a0022478532043ff07892be01 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/README.txt @@ -0,0 +1,320 @@ + [1]SourceForge.net Logo + + ClientForm + + ClientForm is a Python module for handling HTML forms on the client + side, useful for parsing HTML forms, filling them in and returning the + completed forms to the server. It developed from a port of Gisle Aas' + Perl module HTML::Form, from the [2]libwww-perl library, but the + interface is not the same. 
+ + Simple example: + from urllib2 import urlopen + from ClientForm import ParseResponse + + forms = ParseResponse(urlopen("http://www.example.com/form.html")) + form = forms[0] + print form + form["author"] = "Gisle Aas" + + # form.click() returns a urllib2.Request object + # (see HTMLForm.click.__doc__ if you don't have urllib2) + response = urlopen(form.click("Thanks")) + + A more complicated example: + import ClientForm + import urllib2 + request = urllib2.Request("http://www.example.com/form.html") + response = urllib2.urlopen(request) + forms = ClientForm.ParseResponse(response) + response.close() + form = forms[0] + print form # very useful! + + # Indexing allows setting and retrieval of control values + original_text = form["comments"] # a string, NOT a Control instance + form["comments"] = "Blah." + + # Controls that represent lists (checkbox, select and radio lists) are + # ListControls. Their values are sequences of list item names. + # They come in two flavours: single- and multiple-selection: + print form.possible_items("cheeses") + form["favorite_cheese"] = ["brie"] # single + form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi + # is the "parmesan" item of the "cheeses" control selected? + print "parmesan" in form["cheeses"] + # does cheeses control have a "caerphilly" item? 
+ print "caerphilly" in form.possible_items("cheeses") + + # Sometimes one wants to set or clear individual items in a list: + # select the item named "gorgonzola" in the first control named "cheeses" + form.set(True, "gorgonzola", "cheeses") + # You can be more specific: supply at least one of name, type, kind, id + # and nr (most other methods on HTMLForm take the same form of arguments): + # deselect "edam" in third CHECKBOX control + form.set(False, "edam", type="checkbox", nr=2) + + # You can explicitly say that you're referring to a ListControl: + # set whole value (rather than just one item of) "cheeses" ListControl + form.set_value(["gouda"], name="cheeses", kind="list") + # last example is almost equivalent to following (but insists that the + # control be a ListControl -- so it will skip any non-list controls that + # come before the control we want) + form["cheeses"] = ["gouda"] + # The kind argument can also take values "multilist", "singlelist", "text", + # "clickable" and "file": + # find first control that will accept text, and scribble in it + form.set_value("rhubarb rhubarb", kind="text") + form.set_value([""], kind="singlelist") + + # Often, a single checkbox (a CHECKBOX control with a single item) is + # present. In that case, the name of the single item isn't of much + # interest, so it's useful to be able to check and uncheck the box + # without using the item name: + form.set_single(True, "smelly") # check + form.set_single(False, "smelly") # uncheck + + # Add files to FILE controls with .add_file(). Only call this multiple + # times if the server is expecting multiple files. 
+ # add a file, default value for MIME type, no filename sent to server + form.add_file(open("data.dat")) + # add a second file, explicitly giving MIME type, and telling the server + # what the filename is + form.add_file(open("data.txt"), "text/plain", "data.txt") + + # Many methods have a by_label argument, allowing specification of list + # items by label instead of by name. At the moment, only SelectControl + # supports this argument (this will be fixed). Sometimes labels are + # easier to maintain than names, sometimes the other way around. + form.set_value(["Mozzarella", "Caerphilly"], "cheeses", by_label=True) + + # It's also possible to get at the individual controls inside the form. + # This is useful for calling several methods in a row on a single control, + # and for the less common operations. The methods are quite similar to + # those on HTMLForm: + control = form.find_control("cheeses", type="select") + print control.value, control.name, control.type + print control.possible_items() + control.value = ["mascarpone", "curd"] + control.set(True, "limburger") + + # All Controls may be disabled (equivalent of greyed-out in browser) + control = form.find_control("comments") + print control.disabled + # ...or readonly + print control.readonly + # readonly and disabled attributes can be assigned to + control.disabled = False + # convenience method, used here to make all controls writable (unless + # they're disabled): + form.set_all_readonly(False) + # ListControl items may also be disabled (setting a disabled item is not + # allowed, but clearing one is allowed): + print control.get_item_disabled("emmenthal") + control.set_item_disabled(True, "emmenthal") + # enable all items in control + control.set_all_items_disabled(False) + + # HTMLForm.controls is a list of all controls in the form + for control in form.controls: + if control.value == "inquisition": sys.exit() + + request2 = form.click() # urllib2.Request object + response2 = urllib2.urlopen(request2) 
+ + print response2.geturl() + print response2.info() # headers + print response2.read() # body + response2.close() + + All of the standard control types are supported: TEXT, PASSWORD, + HIDDEN, TEXTAREA, ISINDEX, RESET, BUTTON (INPUT TYPE=BUTTON and the + various BUTTON types), SUBMIT, IMAGE, RADIO, CHECKBOX, SELECT/OPTION + and FILE (for file upload). Both standard form encodings + (application/x-www-form-urlencoded and multipart/form-data) are + supported. + + The module is designed for testing and automation of web interfaces, + not for implementing interactive user agents. + + Security note: Remember that any passwords you store in HTMLForm + instances will be saved to disk in the clear if you pickle them + (directly or indirectly). The simplest solution to this is to avoid + pickling HTMLForm objects. You could also pickle before filling in any + password, or just set the password to "" before pickling. + + Python 1.5.2 or above is required. To run the tests, you need the + unittest module (from [3]PyUnit). unittest is a standard library + module with Python 2.1 and above. + + For full documentation, see the docstrings in ClientForm.py. + + Note: this page describes the 0.1.x interface. See [4]here for the old + 0.0.x interface. + +Download + + For installation instructions, see the INSTALL file included in the + distribution. + + Stable release.. There have been many interface changes since 0.0.x, + so I don't recommend upgrading old code from 0.0.x unless you want the + new features. + + 0.1.x includes FILE control support for file upload, handling of + disabled list items, and a redesigned interface. + * [5]ClientForm-0.1.17.tar.gz + * [6]ClientForm-0_1_17.zip + * [7]Change Log (included in distribution) + * [8]Older versions. + + Old release. + * [9]ClientForm-0.0.16.tar.gz + * [10]ClientForm-0_0_16.zip + * [11]Change Log (included in distribution) + * [12]Older versions. + +FAQs + + * Doesn't the standard Python library module, cgi, do this? 
+ No: the cgi module does the server end of the job. It doesn't know + how to parse or fill in a form or how to send it back to the + server. + * Which version of Python do I need? + 1.5.2 or above. + * Is urllib2 required? + No. + * How do I use it without urllib2? + Use .click_request_data() instead of .click(). + * Which urllib2 do I need? + You don't. It's convenient, though. If you have Python 2.0, you + need to upgrade to the version from Python 2.1 (available from + [13]www.python.org). Alternatively, use the 1.5.2-compatible + version. If you have Python 1.5.2, use this [14]urllib2 and + [15]urllib. Otherwise, you're OK. + * Which license? + The [16]BSD license (included in distribution). + * Is XHTML supported? + Yes, since 0.1.12. + * How do I figure out what control names and values to use? + print form is usually all you need. HTMLForm.possible_items can be + useful. Note that it's possible to use item labels instead of item + names, which can be useful -- use the by_label arguments to the + various methods, and the .get_value_by_label() / + .set_value_by_label() methods on ListControl. Only SelectControl + currently supports item labels (which default to OPTION element + contents). I might not bother to fix this, since it seems it's + probably only useful for SELECT anyway. + * What do those '*' characters mean in the string representations of + list controls? + A * next to an item means that item is selected. + * What do those parentheses (round brackets) mean in the string + representations of list controls? + Parentheses (foo) around an item mean that item is disabled. + * Why doesn't <some control> turn up in the data returned by + .click*() when that control has non-None value? + Either the control is disabled, or it is not successful for some + other reason. 'Successful' (see HTML 4 specification) means that + the control will cause data to get sent to the server. 
+ * Why does ClientForm not follow the HTML 4.0 / RFC 1866 standards + for RADIO and multiple-selection SELECT controls? + Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items + explicitly selected in the HTML. Use the select_default argument + to ParseResponse if you want to follow the RFC 1866 rules instead. + Note that browser behaviour violates the HTML 4.01 specification + in the case of RADIO controls. + * Why does .click()ing on a button not work for me? + + Clicking on a RESET button doesn't do anything, by design - + this is a library for web automation, not an interactive + browser. Even in an interactive browser, clicking on RESET + sends nothing to the server, so there is little point in + having .click() do anything special here. + + Clicking on a BUTTON TYPE=BUTTON doesn't do anything either, + also by design. This time, the reason is that that BUTTON is + only in the HTML standard so that one can attach callbacks to + its events. The callbacks are functions in SCRIPT elements + (such as Javascript) embedded in the HTML, and their + execution may result in information getting sent back to the + server. ClientForm, however, knows nothing about these + callbacks, so it can't do anything useful with a click on a + BUTTON whose type is BUTTON. + + Generally, embedded script may be messing things up in all + kinds of ways. See the answer to the next question. + * Embedded script is messing up my form filling. What do I do? + See the [17]General FAQs page for what to do about this. + * I'm having trouble debugging my code. + The [18]ClientCookie package makes it easy to get .seek()able + response objects, which is convenient for debugging. See also + [19]here for few relevant tips. Also see [20]General FAQs. + * I have a control containing a list of integers. How do I select + the one whose value is nearest to the one I want? 
+ import bisect + def closest_int_value(form, ctrl_name, value): + values = map(int, form.possible_items(ctrl_name)) + return str(values[bisect.bisect(values, value) - 1]) + + form["distance"] = [closest_int_value(form, "distance", 23)] + * Where can I find out more about the HTML and HTTP standards? + + W3C [21]HTML 4.01 Specification. + + [22]RFC 1866 - the HTML 2.0 standard. + + [23]RFC 1867 - Form-based file upload. + + [24]RFC 2616 - HTTP 1.1 Specification. + + [25]John J. Lee, January 2005. + + [26]Home + [27]ClientCookie + ClientForm + [28]DOMForm + [29]python-spidermonkey + [30]ClientTable + [31]mechanize + [32]pullparser + [33]General FAQs + [34]1.5.2 urllib2.py + [35]1.5.2 urllib.py + [36]Other stuff + [37]Download + [38]FAQs + +References + + 1. http://sourceforge.net/ + 2. http://www.linpro.no/lwp/ + 3. http://pyunit.sourceforge.net/ + 4. http://wwwsearch.sourceforge.net/ClientForm/src/README_0_0_15.html + 5. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.1.17.tar.gz + 6. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0_1_17.zip + 7. http://wwwsearch.sourceforge.net/ClientForm/src/ChangeLog.txt + 8. http://wwwsearch.sourceforge.net/ClientForm/src/ + 9. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0.0.16.tar.gz + 10. http://wwwsearch.sourceforge.net/ClientForm/src/ClientForm-0_0_16.zip + 11. http://wwwsearch.sourceforge.net/ClientForm/src/ChangeLog.txt + 12. http://wwwsearch.sourceforge.net/ClientForm/src/ + 13. http://www.python.org/ + 14. http://wwwsearch.sourceforge.net/bits/urllib2.py + 15. http://wwwsearch.sourceforge.net/bits/urllib.py + 16. http://www.opensource.org/licenses/bsd-license.php + 17. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 18. http://wwwsearch.sourceforge.net/ClientCookie/ + 19. http://wwwsearch.sourceforge.net/ClientCookie/doc.html#debugging + 20. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 21. http://www.w3.org/TR/html401/ + 22. 
http://www.ietf.org/rfc/rfc1866.txt + 23. http://www.ietf.org/rfc/rfc1867.txt + 24. http://www.ietf.org/rfc/rfc2616.txt + 25. mailto:jjl@pobox.com + 26. http://wwwsearch.sourceforge.net/ + 27. http://wwwsearch.sourceforge.net/ClientCookie/ + 28. http://wwwsearch.sourceforge.net/DOMForm/ + 29. http://wwwsearch.sourceforge.net/python-spidermonkey/ + 30. http://wwwsearch.sourceforge.net/ClientTable/ + 31. http://wwwsearch.sourceforge.net/mechanize/ + 32. http://wwwsearch.sourceforge.net/pullparser/ + 33. http://wwwsearch.sourceforge.net/bits/GeneralFAQ.html + 34. http://wwwsearch.sourceforge.net/bits/urllib2_152.py + 35. http://wwwsearch.sourceforge.net/bits/urllib_152.py + 36. http://wwwsearch.sourceforge.net/#other + 37. http://wwwsearch.sourceforge.net/ClientForm/#download + 38. http://wwwsearch.sourceforge.net/ClientForm/#faq diff --git a/LTA/LTAIngest/ClientForm-0.1.17/setup.py b/LTA/LTAIngest/ClientForm-0.1.17/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..eb4101af4b5dc5ef0ac3bcf5e8c3a6c4f122c282 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/setup.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +"""Client-side HTML form handling. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It developed from a port of Gisle Aas' +Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. 
+""" + +from ClientForm import VERSION +NAME = "ClientForm" +PACKAGE = False +LICENSE = "BSD" +PLATFORMS = ["any"] +CLASSIFIERS = """\ +Development Status :: 5 - Production/Stable +Intended Audience :: Developers +Intended Audience :: System Administrators +License :: OSI Approved :: BSD License +Natural Language :: English +Operating System :: OS Independent +Programming Language :: Python +Topic :: Internet +Topic :: Internet :: WWW/HTTP +Topic :: Internet :: WWW/HTTP :: Site Management +Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Topic :: Software Development :: Libraries +Topic :: Software Development :: Libraries :: Python Modules +Topic :: Software Development :: Testing +Topic :: Software Development :: Testing :: Traffic Generation +Topic :: System :: Networking :: Monitoring +Topic :: System :: Systems Administration +Topic :: Text Processing +Topic :: Text Processing :: Markup +Topic :: Text Processing :: Markup :: HTML +Topic :: Text Processing :: Markup :: XML +""" + +#------------------------------------------------------- +# the rest is constant for most of my released packages: + +import sys, string +from distutils.core import setup + +_setup = setup +def setup(**kwargs): + if not hasattr(sys, "version_info") or sys.version_info < (2, 3): + # Python version compatibility + # XXX probably download_url came in earlier than 2.3 + for key in ["classifiers", "download_url"]: + if kwargs.has_key(key): + del kwargs[key] + # Only want packages keyword if this is a package, + # only want py_modules keyword if this is a single-file module, + # so get rid of packages or py_modules keyword as appropriate. 
+ if kwargs["packages"] is None: + del kwargs["packages"] + else: + del kwargs["py_modules"] + apply(_setup, (), kwargs) + +if PACKAGE: + packages = [NAME] + py_modules = None +else: + py_modules = [NAME] + packages = None + +doclines = string.split(__doc__, "\n") + +setup(name = NAME, + version = VERSION, + license = LICENSE, + platforms = PLATFORMS, + classifiers = filter(None, string.split(CLASSIFIERS, "\n")), + author = "John J. Lee", + author_email = "jjl@pobox.com", + description = doclines[0], + url = "http://wwwsearch.sourceforge.net/%s/" % NAME, + download_url = ("http://wwwsearch.sourceforge.net/%s/src/" + "%s-%s.tar.gz" % (NAME, NAME, VERSION)), + long_description = string.join(doclines[2:], "\n"), + py_modules = py_modules, + packages = packages, + ) diff --git a/LTA/LTAIngest/ClientForm-0.1.17/test.py b/LTA/LTAIngest/ClientForm-0.1.17/test.py new file mode 100755 index 0000000000000000000000000000000000000000..e88812cad16942c528984b45a28cb500764e119b --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/test.py @@ -0,0 +1,1949 @@ +#!/usr/bin/env python + +import unittest, string +from unittest import TestCase +from cStringIO import StringIO + +import ClientForm +from ClientForm import ControlNotFoundError, ItemNotFoundError, \ + ItemCountError, ParseError + +# XXX +# Base control tests on ParseFile, so can use same tests for DOMForm and +# ClientForm. That wouldn't be unit testing exactly, but saner than the +# current situation with massive duplication of tests between the two +# modules. 
+# HTMLForm.enctype +# XHTML + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +class LWPFormTests(TestCase): + """The original tests from libwww-perl 5.64.""" + def testEmptyParse(self): + forms = ClientForm.ParseFile(StringIO(""), "http://localhost") + self.assert_(len(forms) == 0) + + def _forms(self): + file = StringIO("""<form action="abc"> + + <input name="firstname" value="Gisle"> + + </form> + + """) + return ClientForm.ParseFile(file, "http://localhost/") + + def testParse(self): + forms = self._forms() + self.assert_(len(forms) == 1) + self.assert_(forms[0]["firstname"] == "Gisle") + + def testFillForm(self): + forms = self._forms() + form = forms[0] + form["firstname"] = "Gisle Aas" + req = form.click() + def request_method(req): + if req.has_data(): + return "POST" + else: + return "GET" + self.assert_(request_method(req) == "GET") + self.assert_(req.get_full_url() == "http://localhost/abc?firstname=Gisle+Aas") + + +class ParseTests(TestCase): + def test_parse_error(self): + f = StringIO( +"""<form action="abc"> +<option> +</form> +""") + base_uri = "http://localhost/" + try: + ClientForm.ParseFile(f, base_uri) + except ClientForm.ParseError, e: + self.assert_(e.base_uri == base_uri) + else: + self.assert_(0) + + def test_base_uri(self): + # BASE element takes priority over document URI + file = StringIO( +"""<base HREF="http://example.com"> +<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.action == "http://example.com/abc") + + file = StringIO( +"""<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.action == "http://localhost/abc") + + def testTextarea(self): + file = StringIO( +"""<form action="abc"> + +<input name="firstname" 
value="Gisle"> +<textarea>blah, blah, +Rhubarb. + +</textarea> + +<textarea></textarea> + +<textarea name=""ta"" id="foo&bar">Hello testers & users!</textarea> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.name is None) + control = form.find_control(type="textarea") + self.assert_(control.name is None) + self.assert_(control.value == "blah, blah,\nRhubarb.\n\n") + + empty_control = form.find_control(type="textarea", nr=1) + self.assert_(str(empty_control) == "<TextareaControl(<None>=)>") + self.assert_(empty_control.value == "") + + entity_ctl = form.find_control(type="textarea", nr=2) + self.assert_(entity_ctl.name == '"ta"') + self.assertEqual(entity_ctl.attrs["id"], "foo&bar") + + self.assert_(entity_ctl.value == "Hello testers & users!") + + def testSelect(self): + file = StringIO( +"""<form action="abc"> + +<select name="foo"> + <option>Hello testers & users!</option> + <option></option><option></option> +</select> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + self.assert_(len(forms) == 1) + form = forms[0] + + entity_ctl = form.find_control(type="select") + self.assert_(entity_ctl.name == "foo") + self.assert_(entity_ctl.value[0] == "Hello testers & users!") + opt = entity_ctl.get_item_attrs("Hello testers & users!") + self.assert_(opt["value"] == opt["label"] == opt["contents"] == + "Hello testers & users!") + + def testButton(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="text" value="cow" name="moo"> + +<button name="b">blah, blah, +Rhubarb.</button> + +<button type="reset" name="b2"></button> +<button type="button" name="b3"></button> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.name == "myform") + control = form.find_control(name="b") + self.assert_(control.type == "submitbutton") + self.assert_(control.value == "") + 
self.assert_(form.find_control("b2").type == "resetbutton") + self.assert_(form.find_control("b3").type == "buttonbutton") + pairs = form.click_pairs() + self.assert_(pairs == [("moo", "cow"), ("b", "")]) + + def testIsindex(self): + file = StringIO( +"""<form action="abc"> + +<isindex prompt=">>>"> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control = form.find_control(type="isindex") + self.assert_(control.type == "isindex") + self.assert_(control.name is None) + self.assert_(control.value == "") + control.value = "some stuff" + self.assert_(form.click_pairs() == []) + self.assert_(form.click_request_data() == + ("http://localhost/abc?some+stuff", None, [])) + self.assert_(form.click().get_full_url() == + "http://localhost/abc?some+stuff") + + def testEmptySelect(self): + file = StringIO( +"""<form action="abc"> +<select name="foo"></select> + +<select name="bar" multiple></select> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control0 = form.find_control(type="select", nr=0) + control1 = form.find_control(type="select", nr=1) + self.assert_(str(control0) == "<SelectControl(foo=[])>") + self.assert_(str(control1) == "<SelectControl(bar=[])>") + form.set_value([], "foo") + self.assertRaises(ItemNotFoundError, form.set_value, ["oops"], "foo") + self.assert_(form.click_pairs() == []) + +# XXX figure out what to do in these sorts of cases +## def badSelect(self): +## # what objects should these generate, if any? +## # what should happen on submission of these? +## # what about similar checkboxes and radios? 
+## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ +## <select name="foo"> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select name="foo" multiple> +## <option>4</option> +## <option>5</option> +## <option>6</option> +## </select> +## """ + +## """<form action="abc" name="myform"> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## def testBadCheckbox(self): +## # see comments above +## # split checkbox -- is it one control, or two? 
+ +## """ +## <html> + +## <input type=checkbox name=foo value=bar> +## <input type=checkbox name=foo value=bar> + +## <select> +## <option>1</option> +## <option>2</option> +## </select> + +## <input type=checkbox name=foo value=baz> +## <input type=checkbox name=foo value=bar> + +## </html> +## """ + + def testUnnamedControl(self): + file = StringIO(""" +<form action="./weird.html"> + +<input type="checkbox" value="foo"></input> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.controls[0].name is None) + + def testNamelessListControls(self): + # XXX SELECT + # these controls have no item names + file = StringIO("""<form action="./weird.html"> + +<input type="checkbox" name="foo"></input> + +<input type="radio" name="bar"></input> + +<!-- +<select name="baz"> + <option></option> +</select> + +<select name="baz" multiple> + <option></option> +</select> +--> + +<input type="submit" name="submit"> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.possible_items("foo") == ["on"]) + self.assert_(form.possible_items("bar") == ["on"]) + #self.assert_(form.possible_items("baz") == []) + self.assert_(form["foo"] == []) + self.assert_(form["bar"] == []) + #self.assert_(form["baz"] == []) + form["foo"] = ["on"] + form["bar"] = ["on"] + pairs = form.click_pairs() + self.assert_(pairs == [("foo", "on"), ("bar", "on"), ("submit", "")]) + + def testBadSingleSelect(self): + # HTML 4.01 section 17.6.1: single selection SELECT controls shouldn't + # have > 1 item selected, but if they do, not more than one should end + # up selected. 
+ file = StringIO("""<form action="./bad.html"> + +<select name="spam"> + <option selected>1</option> + <option selected>2</option> +</select> + +<input type="submit" name="submit"> +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + self.assert_(form.possible_items("spam") == ["1", "2"]) + nr_selected = len(form.find_control("spam").pairs()) + self.assert_(nr_selected == 1) + + def testSelectDefault(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="a" multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="b"> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +</form> + +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + control = form.find_control("a") + self.assert_(control.value == []) + single_control = form.find_control("b") + self.assert_(single_control.value == ["1"]) + + file.seek(0) + forms = ClientForm.ParseFile(file, "http://localhost/", + select_default=1) + form = forms[0] + # select_default only affects *multiple* selection select controls + control = form.find_control(type="select") + self.assert_(control.value == ["1"]) + single_control = form.find_control(type="select", nr=1) + self.assert_(single_control.value == ["1"]) + + +class DisabledTests(TestCase): + def testOptgroup(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + <option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +<select name="bar"> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + 
<option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +</form>""") + + def get_control(name, file=file): + file.seek(0) + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + return form.find_control(name) + + # can't call item_disabled with no args + control = get_control("foo") + self.assertRaises(TypeError, control.get_item_disabled) + + control.set_item_disabled(True, "2") + self.assert_(str(control) == "<SelectControl(foo=" + "[1, (2), 3, 4, 5, 6, (*7), (8), 9, (10)])>") + + # list controls only allow assignment to .value if no attempt is + # made to set any disabled item... + + # ...multi selection + control = get_control("foo") + self.assert_(control.value == ["7"]) + control.value = ["1"] + control = get_control("foo") + def assign_8(control=control): control.value = ["8"] + self.assertRaises(AttributeError, assign_8) + self.assert_(control.value == ["7"]) + # even though 7 is set already, attempt to set it again fails + def assign_7(control=control): control.value = ["7"] + self.assertRaises(AttributeError, assign_7) + control.value = ["1", "3"] + control = get_control("foo") + def assign_multi(control=control): control.value = ["1", "7"] + self.assertRaises(AttributeError, assign_multi) + # enable all items + for item in control.possible_items(): + control.set_item_disabled(False, item) + assign_multi() + + control = get_control("foo") + for value in 7, 8, 10: + self.assert_(control.get_item_disabled(str(value))) + self.assertRaises(AttributeError, control.set, True, str(value)) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.set(False, str(value)) + self.assert_((str(value) not in control.value)) + self.assertRaises(AttributeError, control.toggle, str(value)) + self.assert_(str(value) not in control.value) + 
self.assertRaises(AttributeError, control.set, True, str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("foo") + for value in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(value))) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.toggle(str(value)) + self.assert_(str(value) in control.value) + control.set(True, str(value)) + self.assert_(str(value) in control.value) + control.toggle(str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("foo") + self.assert_(control.get_item_disabled("7")) + control.toggle("7") # clearing, not setting, so no problem + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(False, "7") + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + + # ...single-selection + control = get_control("bar") + self.assert_(control.value == ["7"]) + control.value = ["1"] + control = get_control("bar") + def assign_8(control=control): control.value = ["8"] + self.assertRaises(AttributeError, assign_8) + self.assert_(control.value == ["7"]) + # even though 7 is set already, attempt to set it again fails + def assign_7(control=control): control.value = ["7"] + self.assertRaises(AttributeError, assign_7) + # enable all items + for item in control.possible_items(): + control.set_item_disabled(False, item) + assign_7() + + control = get_control("bar") + for value in 7, 8, 10: + self.assert_(control.get_item_disabled(str(value))) + self.assertRaises(AttributeError, control.set, True, str(value)) + control.set(False, str(value)) + self.assert_(str(value) != control.value) + control.set(False, str(value)) + self.assert_(str(value) != control.value) + 
self.assertRaises(AttributeError, control.toggle, str(value)) + self.assert_(str(value) != control.value) + self.assertRaises(AttributeError, control.set, True, str(value)) + self.assert_(str(value) != control.value) + + control = get_control("bar") + for value in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(value))) + control.set(False, str(value)) + self.assert_(str(value) not in control.value) + control.toggle(str(value)) + self.assert_(str(value) == control.value[0]) + control.set(True, str(value)) + self.assert_(str(value) == control.value[0]) + control.toggle(str(value)) + self.assert_(str(value) not in control.value) + + control = get_control("bar") + self.assert_(control.get_item_disabled("7")) + control.toggle("7") # clearing, not setting, so no problem + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(False, "7") + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + + # set_all_items_disabled + for name in "foo", "bar": + control = get_control(name) + control.set_all_items_disabled(False) + control.set(True, "7") + control.set(True, "1") + control.set_all_items_disabled(True) + self.assertRaises(AttributeError, control.set, True, "7") + self.assertRaises(AttributeError, control.set, True, "1") + +# XXX single select + def testDisabledSelect(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option label="a">1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="bar" multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +<select name="baz" disabled multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select 
name="spam" disabled multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +</form> +""") + forms = ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", True, False), + ("spam", True, True)]: + control = form.find_control(name) + self.assert_(bool(control.disabled) == control_disabled) + item = control.get_item_attrs("2") + self.assert_(bool(item.has_key("disabled")) == item_disabled) + + def bad_assign(value, control=control): control.value = value + if control_disabled: + for value in "1", "2", "3": + self.assertRaises(AttributeError, control.set, True, value) + self.assertRaises(AttributeError, bad_assign, [value]) + elif item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + self.assertRaises(AttributeError, bad_assign, ["2"]) + for value in "1", "3": + control.set(True, value) + else: + control.value = ["1", "2", "3"] + + control = form.find_control("foo") + # missing disabled arg + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.assert_(not control.get_item_disabled("a", by_label=True)) + control.set_item_disabled(True, "a", by_label=True) + self.assert_(control.get_item_disabled("a", by_label=True)) + + def testDisabledCheckbox(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="checkbox" name="foo" value="1" label="a"></input> +<input type="checkbox" name="foo" value="2"></input> +<input type="checkbox" name="foo" value="3"></input> + +<input type="checkbox" name="bar" value="1"></input> +<input type="checkbox" name="bar" value="2" disabled></input> +<input type="checkbox" name="bar" value="3"></input> + +<input type="checkbox" name="baz" value="1" disabled></input> +<input type="checkbox" name="baz" value="2" disabled></input> +<input type="checkbox" name="baz" value="3" disabled></input> + +</form>""") + forms = 
ClientForm.ParseFile(file, "http://localhost/") + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", False, True)]: + control = form.find_control(name) + self.assert_(bool(control.disabled) == control_disabled) + item = control.get_item_attrs("2") + self.assert_(bool(item.has_key("disabled")) == item_disabled) + self.assert_(control.get_item_disabled("2") == item_disabled) + + def bad_assign(value, control=control): control.value = value + if item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + self.assertRaises(AttributeError, bad_assign, ["2"]) + if not control.get_item_disabled("1"): + control.set(True, "1") + else: + control.value = ["1", "2", "3"] + + control = form.find_control("foo") + control.set_item_disabled(False, "1") + # missing disabled arg + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.assertRaises(NotImplementedError, + control.get_item_disabled, "a", by_label=True) + self.assert_(not control.get_item_disabled("1")) + self.assertRaises(NotImplementedError, + control.set_item_disabled, True, "a", + by_label=True) + self.assert_(not control.get_item_disabled("1")) + + +class ControlTests(TestCase): + def testTextControl(self): + attrs = {"type": "this is ignored", + "name": "ath_Uname", + "value": "", + "maxlength": "20", + "id": "foo"} + c = ClientForm.TextControl("texT", "ath_Uname", attrs) + c.fixup() + self.assert_(c.type == "text") + self.assert_(c.name == "ath_Uname") + self.assert_(c.id == "foo") + self.assert_(c.value == "") + self.assert_(str(c) == "<TextControl(ath_Uname=)>") + self.assert_(c.pairs() == [("ath_Uname", "")]) + def bad_assign(c=c): c.type = "sometype" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.type == "text") + def bad_assign(c=c): c.name = "somename" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.name == "ath_Uname") + c.value = "2" + self.assert_(c.value 
== "2") + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + def bad_assign(c=c): c.value = ["foo"] + self.assertRaises(TypeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(not c.readonly) + c.readonly = True + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + c.disabled = True + self.assert_(str(c) == + "<TextControl(ath_Uname=2) (disabled, readonly)>") + c.readonly = False + self.assert_(str(c) == "<TextControl(ath_Uname=2) (disabled)>") + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + + self.assert_(c.attrs.has_key("maxlength")) + for key in "name", "type", "value": + self.assert_(c.attrs.has_key(key)) + + # initialisation of readonly and disabled attributes + attrs["readonly"] = True + c = ClientForm.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["readonly"] + attrs["disabled"] = True + c = ClientForm.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["disabled"] + c = ClientForm.TextControl("hidden", "ath_Uname", attrs) + self.assert_(c.readonly) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + + def testIsindexControl(self): + attrs = {"type": "this is ignored", + "prompt": ">>>"} + c = ClientForm.IsindexControl("isIndex", None, attrs) + c.fixup() + self.assert_(c.type == "isindex") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(str(c) == "<IsindexControl()>") + self.assert_(c.pairs() == []) + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "isindex") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, 
set_name) + def set_value(value, c=c): c.value = value + self.assertRaises(TypeError, set_value, [None]) + self.assert_(c.name is None) + c.value = "2" + self.assert_(c.value == "2") + self.assert_(str(c) == "<IsindexControl(2)>") + c.disabled = True + self.assert_(str(c) == "<IsindexControl(2) (disabled)>") + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.readonly = True + self.assert_(str(c) == "<IsindexControl(2) (disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.disabled = False + self.assert_(str(c) == "<IsindexControl(2) (readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.readonly = False + self.assert_(str(c) == "<IsindexControl(2)>") + + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs.has_key("prompt")) + self.assert_(c.attrs["prompt"] == ">>>") + for key in "name", "value": + self.assert_(not c.attrs.has_key(key)) + + c.value = "foo 1 bar 2" + class FakeForm: action = "http://localhost/" + form = FakeForm() + self.assert_(c._click(form, (1,1), "request_data") == + ("http://localhost/?foo+1+bar+2", None, [])) + + def testIgnoreControl(self): + attrs = {"type": "this is ignored"} + c = ClientForm.IgnoreControl("reset", None, attrs) + self.assert_(c.type == "reset") + self.assert_(c.value is None) + self.assert_(str(c) == "<IgnoreControl(<None>=<None>)>") + + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value is None) + + def testSubmitControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "img": "foo.gif"} + c = ClientForm.SubmitControl("submit", "name_value", attrs) + self.assert_(c.type == "submit") + self.assert_(c.name == "name_value") + self.assert_(c.value == "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) (readonly)>") + def set_value(value, c=c): c.value = value + 
self.assertRaises(TypeError, set_value, ["foo"]) + c.disabled = True + self.assertRaises(AttributeError, set_value, "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) " + "(disabled, readonly)>") + c.disabled = False + c.readonly = False + set_value("value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value)>") + c.readonly = True + + # click on button + form = ClientForm.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(c.pairs() == []) + self.assert_(pairs == [("name_value", "value_value")]) + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value=value_value") + self.assert_(data == + ("http://foo.bar.com/?name_value=value_value", None, [])) + c.disabled = True + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(pairs == []) + # XXX not sure if should have '?' on end of this URL, or if it really matters... 
+ self.assert_(request.get_full_url() == "http://foo.bar.com/") + self.assert_(data == ("http://foo.bar.com/", None, [])) + + def testImageControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "img": "foo.gif"} + c = ClientForm.ImageControl("image", "name_value", attrs) + self.assert_(c.type == "image") + self.assert_(c.name == "name_value") + self.assert_(c.value == "") + self.assert_(str(c) == "<ImageControl(name_value=)>") + + # click, at coordinate (0, 55), on image + form = ClientForm.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + request = c._click(form, (0, 55), "request") + self.assert_(c.pairs() == []) + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value.x=0&name_value.y=55") + self.assert_(c._click(form, (0,55), return_type="request_data") == + ("http://foo.bar.com/?name_value.x=0&name_value.y=55", + None, [])) + c.value = "blah" + request = c._click(form, (0, 55), "request") + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value.x=0&name_value.y=55&name_value=blah") + + c.disabled = True + self.assertEqual(c.value, "blah") + self.assert_(str(c) == "<ImageControl(name_value=blah) (disabled)>") + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.readonly = True + self.assert_(str(c) == "<ImageControl(name_value=blah) " + "(disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.disabled = c.readonly = False + self.assert_(c._click(form, (1,1), return_type="pairs") == + [("name_value.x", "1"), ("name_value.y", "1"), ('name_value', 'blah')]) + + def testCheckboxControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string"} + c = ClientForm.CheckboxControl("checkbox", "name_value", 
attrs) + c.fixup() + self.assert_(c.type == "checkbox") + self.assert_(c.name == "name_value") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "checkbox") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, set_name) + self.assert_(c.name == "name_value") + + # construct larger list from length-1 lists + c = ClientForm.CheckboxControl("checkbox", "name_value", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.CheckboxControl("checkbox", "name_value", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + attrs = c.get_item_attrs("value_value") + for key in "alt", "name", "value", "type": + self.assert_(attrs.has_key(key)) + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + + def set_value(value, c=c): c.value = value + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + # set + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, 
"value_value2") + self.assert_(c.value == ["value_value"]) + c.set(False, "value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2])>") + c.disabled = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(disabled)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.readonly = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(disabled, readonly)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(readonly)>") + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == [("name_value", "value_value")]) + c.readonly = False + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple(self): + import copy + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": "", + "alt": "alt_text"}} + # with Netscape / IE default selection... 
+ c = ClientForm.SelectControl("select", "select_name", attrs) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ... and with RFC 1866 default selection + c = ClientForm.SelectControl("select", "select_name", attrs, select_default=True) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + # get_item_attrs + attrs3 = c.get_item_attrs("value_value") + self.assert_(attrs3.has_key("alt")) + self.assert_(not attrs3.has_key("multiple")) + # HTML attributes dictionary should have been copied by ListControl + # constructor. 
+ attrs["new_attr"] = "new" + attrs2["new_attr2"] = "new2" + for key in ("new_attr", "new_attr2"): + self.assert_(not attrs3.has_key(key)) + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assert_(c.value == ["value_value"]) + # test ordering of items + c.value = ["value_value2", "value_value"] + self.assert_(c.value == ["value_value", "value_value2"]) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple_label(self): + import ClientForm +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current 
year", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + c = ClientForm.SelectControl("select", "select_name", attrs) + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c3 = ClientForm.SelectControl("select", "select_name", attrs3) + c.merge_control(c2) + c.merge_control(c3) + c.fixup() + + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + + self.assert_(c.value == []) + c.toggle("2002", by_label=True) + self.assert_(c.value == ["0"]) + c.toggle("0") + self.assert_(c.value == []) + c.toggle("0") + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.toggle("2002", by_label=True) + self.assertRaises(ItemNotFoundError, c.toggle, "blah", by_label=True) + self.assert_(c.value == []) + c.toggle("2000") + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["2000"]) + c.value = ["0"] + self.assert_(c.value == ["0"]) + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + 
self.assert_(c.get_value_by_label() == ["2000"]) + c.set_value_by_label(["2000", "2002"]) + self.assert_(c.value == ["0", "2000"]) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(True, "2002", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + self.assert_(c.value == ["0", "2000"]) + c.set(False, "2000", by_label=True) + self.assert_(c.get_value_by_label() == ["2002"]) + self.assert_(c.value == ["0"]) + c.set(True, "2001", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2001"]) + self.assert_(c.value == ["0", "1"]) + self.assertRaises(ItemNotFoundError, c.set, True, "blah", + by_label=True) + self.assertRaises(ItemNotFoundError, c.set, + False, "blah", by_label=True) + + def testSelectControlSingle_label(self): + import ClientForm +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current year", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name"}} + c = ClientForm.SelectControl("select", "select_name", attrs) + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c3 = ClientForm.SelectControl("select", "select_name", attrs3) + c.merge_control(c2) + 
c.merge_control(c3) + c.fixup() + + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["0"]) + c.value = [] + self.assert_(c.value == []) + c.value = ["0"] + self.assert_(c.value == ["0"]) + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + self.assertRaises(ItemNotFoundError, c.set_value_by_label, ["foo"]) + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + def testSelectControlSingle(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "alt": "alt_text"}} + # Netscape and IE behaviour... + c = ClientForm.SelectControl("select", "select_name", attrs) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == ["value_value"]) + self.assert_(c.possible_items() == ["value_value"]) + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ...and RFC 1866 behaviour are identical (unlike multiple SELECT). 
+ c = ClientForm.SelectControl("select", "select_name", attrs, + select_default=1) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.SelectControl("select", "select_name", attrs2) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + c.value = [] + self.assert_(c.value == []) + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + ["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assert_(c.value == ["value_value"]) + c.toggle("value_value") + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assertRaises(TypeError, c.toggle, ["oops"]) + self.assert_(c.value == []) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assert_(c.value == []) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assert_(c.value 
== []) + c.set(False, "value_value2") + self.assert_(c.value == []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + def testRadioControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "id": "blah"} + # Netscape and IE behaviour... + c = ClientForm.RadioControl("radio", "name_value", attrs) + c.fixup() + self.assert_(c.type == "radio") + self.assert_(c.name == "name_value") + self.assert_(c.id == "blah") + self.assert_(c.value == []) + self.assert_(c.possible_items() == ["value_value"]) + # ...and RFC 1866 behaviour + c = ClientForm.RadioControl("radio", "name_value", attrs, + select_default=True) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = ClientForm.RadioControl("radio", "name_value", attrs, + select_default=True) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = ClientForm.RadioControl("radio", "name_value", attrs2, + select_default=True) + c.merge_control(c2) + c.fixup() + self.assert_(str(c) == "<RadioControl(" + "name_value=[*value_value, value_value2])>") + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + ["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assert_(c.value == ["value_value"]) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assert_(c.value == ["value_value"]) + c.toggle("value_value") + self.assert_(c.value == []) + c.toggle("value_value") + self.assert_(c.value == ["value_value"]) + 
self.assertRaises(TypeError, c.toggle, ["value_value"]) + self.assert_(c.value == ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assert_(c.value == []) + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assert_(c.value == []) + c.set(False, "value_value2") + self.assert_(c.value == []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + + +class FormTests(TestCase): + base_uri = "http://auth.athensams.net/" + def test_click(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="submit" name="foo"></input> +<input type="submit" name="bar"></input> +</form> +""") + form = ClientForm.ParseFile(file, "http://blah/")[0] + self.assertRaises(ControlNotFoundError, form.click, nr=2) + self.assert_(form.click().get_full_url() == "http://blah/abc?foo=") + self.assert_(form.click(name="bar").get_full_url() == "http://blah/abc?bar=") + + # XXX POST, ?, and # + for method in ["GET", "POST"]: + file = StringIO( +"""<form method="%s" action="abc?bang=whizz#doh" name="myform"> + +<input type="submit" name="foo"></input> +</form> +""" % method) + # " (this line is here for emacs) + form = ClientForm.ParseFile(file, "http://blah/")[0] + if method == "GET": + url = "http://blah/abc?foo=" + else: + url = "http://blah/abc?bang=whizz" + self.assert_(form.click().get_full_url() == url) + + def testAuth(self): + file = open("./testdata/Auth.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.action == 
+ "http://auth.athensams.net/" + "?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi" + "%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS") + + self.assertRaises(ControlNotFoundError, + lambda form=form: form.toggle("d'oh", "oops")) + self.assertRaises(ControlNotFoundError, lambda form=form: form["oops"]) + def bad_assign(form=form): form["oops"] = ["d'oh"] + self.assertRaises(ControlNotFoundError, bad_assign) + + self.assertRaises(ValueError, form.find_control) + + keys = ["ath_uname", "ath_passwd"] + values = ["", ""] + types = ["text", "password"] + for i in range(len(keys)): + key = keys[i] + c = form.find_control(key) + self.assert_(c.value == values[i]) + self.assert_(c.type == types[i]) + c = form.find_control(type="image") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(c.type == "image") + + form["ath_uname"] = "jbloggs" + form["ath_passwd"] = "foobar" + + self.assert_(form.click_pairs() == + [("ath_uname", "jbloggs"), + ("ath_passwd", "foobar")]) + + def testSearchType(self): + file = open("./testdata/SearchType.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", "Full Search", "Easy Search", + "New Session", "Log off", "Form", "JavaScript"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", + "", "", "", "", "", "Welcome", "No"] + types = ["hidden", "hidden", "image", "image", "image", "image", + "hidden", "hidden"] + for i in range(len(keys)): + key = keys[i] + self.assert_(form.find_control(key).value == values[i]) + self.assert_(form.find_control(key).type == types[i]) + + pairs = form.click_pairs("Full Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), ("Full Search.x", "1"), ("Full Search.y", "1"), + ("Form", "Welcome"), ("JavaScript", "No")]) + + def testFullSearch(self): + pass # XXX + + def testGeneralSearch(self): + file = 
open("./testdata/GeneralSearch.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", + "Home", "Date & Database Limits", "Cited Ref Search", + "Log off", "Search", + "topic", "titleonly", "author", "journal", "address", + "Search", "Save query", "Clear", + "languagetype", "doctype", "Sort", + "Form", "Func"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", "", + "", "", "", "", "", + "", [], "", "", "", + "", "", "", + ["All languages"], ["All document types"], ["Latest date"], + "General", "Search"] + types = ["hidden", "hidden", + "image", "image", "image", "image", "image", + "text", "checkbox", "text", "text", "text", + "image", "image", "image", + "select", "select", "select", + "hidden", "hidden"] + fc = form.find_control + for i in range(len(keys)): + name = keys[i] + type = types[i] + self.assert_(fc(name).value == form.get_value(name) == values[i]) + self.assert_(fc(name).type == type) + self.assert_(fc(name, type).name == name) + self.assert_(fc(type="hidden").name == "SID") + self.assert_(fc(type="image").name == "Home") + self.assert_(fc(nr=6).name == "Search") + self.assertRaises(ControlNotFoundError, fc, nr=50) + self.assertRaises(ValueError, fc, nr=-1) + self.assert_(fc("Search", "image").name == "Search") + self.assertRaises(ControlNotFoundError, fc, "Search", "hidden") + s0 = fc("Search", "image", nr=0) + s0b = fc("Search", "image", nr=0) + s1 = fc("Search", "image", nr=1) + self.assert_(s0.name == s1.name == "Search") + self.assert_(s0 is s0b) + self.assert_(s0 is not s1) + self.assertRaises(ControlNotFoundError, fc, "Search", "image", nr=2) + self.assert_(fc(type="text", nr=2).name == "journal") + self.assert_(fc("Search", nr=0) is not fc("Search", nr=1)) + + form["topic"] = "foo" + self.assert_(form["topic"] == "foo") + form["author"] = "bar" + form["journal"] = "" + form["address"] = "baz" + form["languagetype"] = ["English", 
"Catalan"] + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form["titleonly"] = ["on"] + self.assert_(form["titleonly"] == ["on"]) + pairs = form.click_pairs("Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), + ("Search.x", "1"), ("Search.y", "1"), + ("topic", "foo"), + ("titleonly", "on"), + ("author", "bar"), + ("journal", ""), ("address", "baz"), + ("languagetype", "English"), ("languagetype", "Catalan"), + ("doctype", "All document types"), ("Sort", "Latest date"), + ("Form", "General"), ("Func", "Search")]) + + pvs = form.possible_items("languagetype") + self.assert_(pvs[0] == "All languages") + self.assert_(len(pvs) == 47) + + self.assertRaises( + ItemNotFoundError, + lambda form=form: form.toggle("d'oh", "languagetype")) + form.toggle("English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + self.assertRaises(TypeError, form.toggle, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.toggle, "Catalan", ["languagetype"]) + + # XXX type, nr, by_label args + + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "SID") + + # multiple select + form["languagetype"] = [] + self.assert_(form["languagetype"] == []) + form.set(True, "Catalan", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(True, "English", "languagetype") + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form.set(False, "English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(False, "Catalan", "languagetype") + self.assert_(form["languagetype"] == []) + self.assertRaises(ItemNotFoundError, form.set, True, "doh", "languagetype") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "languagetype") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Catalan"], "languagetype") + self.assertRaises(TypeError, 
form.set, False, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.set, True, "Catalan", ["languagetype"]) + self.assertRaises(TypeError, form.set, False, "Catalan", ["languagetype"]) + + def setitem(name, value, form=form): form[name] = value + form["languagetype"] = ["Catalan"] + self.assert_(form["languagetype"] == ["Catalan"]) + self.assertRaises(ItemNotFoundError, + setitem, "languagetype", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["languagetype"], "Catalan") + + # single select + form["Sort"] = [] + self.assert_(form["Sort"] == []) + form.set(True, "Relevance", "Sort") + self.assert_(form["Sort"] == ["Relevance"]) + form.set(True, "Times Cited", "Sort") + self.assert_(form["Sort"] == ["Times Cited"]) + form.set(False, "Times Cited", "Sort") + self.assert_(form["Sort"] == []) + self.assertRaises(ItemNotFoundError, form.set, True, "doh", "Sort") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "Sort") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, False, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, True, "Relevance", ["Sort"]) + self.assertRaises(TypeError, form.set, False, "Relevance", ["Sort"]) + + form["Sort"] = ["Relevance"] + self.assert_(form["Sort"] == ["Relevance"]) + self.assertRaises(ItemNotFoundError, + setitem, "Sort", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["Sort"], ["Relevance"]) + + def testResults(self): + file = open("./testdata/Results.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + pvs = form.possible_items("marked_list_candidates") + self.assert_(pvs == [ + "000174872000059/1", "000174858300003/2", "000174827900006/3"]) + def 
bad_setitem(form=form): + form["marked_list_candidates"] = ["blah"] + self.assertRaises(ItemNotFoundError, bad_setitem) + form["marked_list_candidates"] = [pvs[0]] + + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + keys = ["Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list", + "marked_list_candidates", + "Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list" + ] + types = ["image", "image", "image", + "checkbox", + "image", "image", "image"] + values = ["", "", "", + [pvs[0]], + "", "", "", + ] + + for i in range(len(keys)): + key = keys[i] + control = form.find_control(key) + self.assert_(control.value == values[i]) + self.assert_(control.type == types[i]) + + pairs = form.click_pairs("Add all records retrieved to list") + self.assert_(pairs == [ + ("Add all records retrieved to list.x", "1"), + ("Add all records retrieved to list.y", "1"), + ("marked_list_candidates", pvs[0])]) + + def testMarkedResults(self): + file = open("./testdata/MarkedResults.html", "r") + forms = ClientForm.ParseFile(file, self.base_uri) + self.assert_(len(forms) == 1) + form = forms[0] + + pairs = form.click_pairs() + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + self.assert_(pairs == [ + ("Add marked records to list.x", "1"), + ("Add marked records to list.y", "1"), + ("marked_list_candidates", "000174872000059/1"), + ("marked_list_candidates", "000174858300003/2"), + ("marked_list_candidates", "000174827900006/3") + ]) + + def testMarkedRecords(self): + pass # XXX + + +class MoreFormTests(TestCase): + def make_form(self): + f = StringIO("""\ +<form blah="nonsense" name="formname"> + <input type="checkbox" name="a" value="1" id="1a" blah="spam"></input> + <input type="checkbox" name="a" value="2" blah="eggs"></input> + <input type="checkbox" name="a" value="3" id="3a"></input> + + <input type="radio" 
name="b" value="1"></input> + <input type="radio" name="b" value="2" id="2"></input> + <input type="radio" name="b" value="3" id="3"></input> + + <select name="c" id="cselect" blah="foo"> + <option id="coption1" blah="bar">1</option> + <option selected blah="baz">2</option> + <option id="coption3">3</option> + </select> + + <select name="d" multiple> + <option value="v1">l1</option> + <option value="v2">l2</option> + <option blah="fee" rhubarb="fi" value="v3">l3</option> + </select> + + <input type="checkbox" name="e" value="1"></input> +</form> +""") + return ClientForm.ParseFile(f, "http://blah/")[0] + + def test_value(self): + form = self.make_form() + + form.set_value(["v3"], type="select", kind="multilist") + self.assert_(form.get_value("d") == ["v3"]) + form.set_value(["l2"], type="select", kind="multilist", by_label=True) + self.assert_(form.get_value("d", by_label=True) == ["l2"]) + + self.assert_(form.get_value( + "b", "radio", "singlelist", None, 0, False) == []) + self.assertRaises(NotImplementedError, + form.set_value, ["1"], "b", by_label=True) + + def test_id(self): + form = self.make_form() + + self.assert_(form.find_control("c").id == "cselect") + self.assert_(form.find_control("a").id == "1a") + self.assert_(form.find_control("b").id is None) + + self.assert_(form.find_control(id="cselect").id == "cselect") + self.assertRaises(ControlNotFoundError, form.find_control, + id="coption1") + self.assert_(form.find_control(id="1a").id == "1a") + self.assertRaises(ControlNotFoundError, form.find_control, id="1") + + def test_single(self): + form = self.make_form() + + self.assertRaises(ItemCountError, form.set_single, True, "d") + + self.assertRaises(NotImplementedError, + form.set_single, True, "e", by_label=True) + form.toggle_single("e", "checkbox", "list", nr=0) + self.assert_("1" in form.get_value("e")) + form.set_single(False, "e", "checkbox", "list", nr=0) + self.assert_("1" not in form.get_value("e")) + form.set_single(True, "e", "checkbox", 
"list", nr=0) + self.assert_("1" in form.get_value("e")) + + def test_possible_items(self): + form = self.make_form() + + self.assert_(form.possible_items("c") == ["1", "2", "3"]) + self.assert_(form.possible_items("d", by_label=True) == + ["l1", "l2", "l3"]) + + self.assert_(form.possible_items("a") == ["1", "2", "3"]) + self.assertRaises(NotImplementedError, + form.possible_items, "a", by_label=True) + + def test_set_all_readonly(self): + form = self.make_form() + + form.set_all_readonly(True) + for c in form.controls: + self.assert_(c.readonly) + form.set_all_readonly(False) + for c in form.controls: + self.assert_(not c.readonly) + + def test_attrs(self): + form = self.make_form() + + self.assert_(form.attrs["blah"] == "nonsense") + self.assert_(form.attrs["name"] == "formname") + + a = form.find_control("a") + self.assert_(not hasattr(a, "attrs")) + self.assert_(a.get_item_attrs("1")["blah"] == "spam") + self.assert_(a.get_item_attrs("2")["blah"] == "eggs") + self.assert_(not a.get_item_attrs("3").has_key("blah")) + + c = form.find_control("c") + self.assert_(c.attrs["blah"] == "foo") + self.assert_(c.get_item_attrs("1")["blah"] == "bar") + self.assert_(c.get_item_attrs("2")["blah"] == "baz") + self.assert_(not c.get_item_attrs("3").has_key("blah")) + + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +class CaseInsensitiveDict: + def __init__(self, dict): + self._dict = {} + for key, val in dict.items(): + self._dict[string.lower(key)] = val + + def __getitem__(self, key): return self._dict[key] + + def __getattr__(self, name): return getattr(self._dict, name) + + +class UploadTests(TestCase): + def make_form(self): + html = """\ +<form action="/cgi-bin/upload.cgi" method="POST" enctype="multipart/form-data"> +<input type="file" name="data"> +<input type="text" name="user" value="nobody"> +<br> +<input type="submit"> +</form> +""" + + return ClientForm.ParseFile(StringIO(html), + 
"http://localhost/cgi-bin/upload.cgi")[0] + + def test_file_request(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data)) + #print "data_control._upload_data", data_control._upload_data + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assert_(fs["data"].filename is None) + + def test_file_request_with_filename(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="afilename") + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assert_(fs["data"].filename == "afilename") + + def test_multipart_file_request(self): + import cgi + + # fill in a file upload form... 
+ form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="filenamea") + more_data = "rhubarb\nrhubarb\n" + data_control.add_file(StringIO(more_data), filename="filenameb") + yet_more_data = "rheum\nrhaponicum\n" + data_control.add_file(StringIO(yet_more_data), filename="filenamec") + req = form.click() + self.assert_(startswith(req.headers["Content-type"], + 'multipart/form-data; boundary=')) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(req.headers), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + + fss = fs["data"][None] + filenames = "filenamea", "filenameb", "filenamec" + datas = data, more_data, yet_more_data + for i in range(len(fss)): + fs = fss[i] + filename = filenames[i] + data = datas[i] + self.assert_(fs.filename == filename) + self.assert_(fs.value == data) + + def test_upload_data(self): + form = self.make_form() + data = form.click().get_data() + self.assert_(startswith(data, "--")) + + def test_empty_upload(self): + # no controls except for INPUT/SUBMIT + forms = ClientForm.ParseFile(StringIO("""<html> +<form method="POST" action="./weird.html" enctype="multipart/form-data"> +<input type="submit" name="submit"></input> +</form></html>"""), ".") + form = forms[0] + data = form.click().get_data() + lines = string.split(data, "\r\n") + self.assert_(startswith(lines[0], "--")) + self.assert_(lines[1] == + 'Content-disposition: form-data; name="submit"') + self.assert_(lines[2] == lines[3] == "") + self.assert_(startswith(lines[4], "--")) + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/Auth.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/Auth.html new file mode 100644 index 
0000000000000000000000000000000000000000..9c931ba9b26904b7d06a9e121ab9eae4c421f8cb --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/Auth.html @@ -0,0 +1,79 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + +<HTML> +<HEAD> +<TITLE>Athens Authentication Point</TITLE> +<META http-equiv="Content-Type" content="text/html;charset=iso-8859-1"> +</HEAD> + +<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#000000" VLINK="#000000"> + +<TABLE BORDER="0" CELLSPACING="0" CELLPADDING="0" WIDTH=609> + <TR> + <TD ALIGN="RIGHT"> + <IMG SRC="http://wos.mimas.ac.uk/isicgi/Images/main.jpg" ALT="ISI Web of Science" BORDER="0" WIDTH=470 HEIGHT=150> + </TD> + </TR> + <TR> + <TD> + <IMG SRC="http://auth.athensams.net/images/auth_point.gif" ALT="Athens Authentication Point"> + </TD> + </TR> + <TR> + <TD> + <P> <P> + </TD> + </TR> + <TR> + <TD ALIGN="CENTER"> + <FORM METHOD=POST ACTION="/?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS"> + <TABLE ALIGN=CENTER BORDER=0 CELLPADDING=0 CELLSPACING=10 WIDTH="75%"> + <TR> + <TD ALIGN=RIGHT WIDTH="40%"> + <FONT COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Username:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=TEXT NAME="ath_uname" VALUE="" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=RIGHT> + <FONT COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Password:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=PASSWORD NAME="ath_passwd" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=CENTER COLSPAN=2> + <INPUT TYPE=IMAGE SRC="http://auth.athensams.net/images/login.gif" BORDER=0 ALT="Login" ALIGN=MIDDLE><BR> + </TD> + </TR> + </TABLE> + </FORM> + </TD> + </TR> +</TABLE> + +<TABLE WIDTH="609" BORDER="0"> + 
<TR> + <TD> + <FONT FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva" SIZE=1> + Athens is a service of <a href=http://www.eduserv.ac.uk>EduServ</a> + </FONT> + <BR> + <FONT FACE="Verdana, Arial, Helvetica" SIZE=1>(c) <A HREF="http://www.athensams.net/copyright.html">Copyright</a>, EduServ. All rights reserved. February 2002</FONT> + </TD> + <TD> + <A HREF="http://www.mimas.ac.uk"><img align="right" +BORDER="0" SRC="http://wos.mimas.ac.uk/images/small_mimas2.gif" alt="MIMAS"></a> + </TD> + </TR> +</TABLE> + +</BODY> +</HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/FullSearch.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/FullSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..60dc0479c38029601c8d320ae93e11fd61179cd3 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/FullSearch.html @@ -0,0 +1,114 @@ +<HTML><HEAD><TITLE>Search -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PeriodSelect.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi NAME = "searchForm" ENCTYPE="multipart/form-data" METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpsrch.html#Full_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" 
SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> +<CENTER><STRONG><FONT SIZE=4>Full Search</FONT><BR></STRONG></CENTER><INPUT TYPE=CHECKBOX NAME="editions" VALUE="D"> +<A HREF=help/helptoc.html#sci>Science Citation Index Expanded (SCI-EXPANDED)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="S"> +<A HREF=help/helptoc.html#ssci>Social Sciences Citation Index (SSCI)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="H"> +<A HREF=help/helptoc.html#ahci>Arts & Humanities Citation Index (A&HCI)--1981-present</A><BR> +<HR><INPUT TYPE=RADIO NAME="Period" VALUE="This Week" onClick="clear_years();"> +This week's update (Updated April 26, 2002)<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 2 Weeks" onClick="clear_years();"> +Latest 2 Weeks<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 4 Weeks" onClick="clear_years();"> +Latest 4 Weeks<BR><INPUT TYPE=RADIO NAME="Period" CHECKED VALUE="All Years" onClick="clear_years();"> +All years<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Year Selection"> +Limit search to years selected below<BR><TABLE> +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="2002" onClick="set_period(4);"> +2002 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2001" onClick="set_period(4);"> +2001 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2000" onClick="set_period(4);"> +2000 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1999" onClick="set_period(4);"> +1999 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1998" onClick="set_period(4);"> +1998 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1997" onClick="set_period(4);"> +1997 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1996" onClick="set_period(4);"> +1996 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1995" onClick="set_period(4);"> +1995 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1994" onClick="set_period(4);"> +1994 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1993" onClick="set_period(4);"> +1993 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1992" 
onClick="set_period(4);"> +1992 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1991" onClick="set_period(4);"> +1991 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1990" onClick="set_period(4);"> +1990 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1989" onClick="set_period(4);"> +1989 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1988" onClick="set_period(4);"> +1988 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1987" onClick="set_period(4);"> +1987 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1986" onClick="set_period(4);"> +1986 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1985" onClick="set_period(4);"> +1985 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1984" onClick="set_period(4);"> +1984 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1983" onClick="set_period(4);"> +1983 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1982" onClick="set_period(4);"> +1982 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1981" onClick="set_period(4);"> +1981 +</TABLE><HR><TABLE> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/gensrch.gif NAME="General Search" ALT="General Search"></TD> + + <TD> + Search for articles by subject term, author name, journal title, or author affiliation<BR></TD> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/crsrch.gif NAME="Cited Ref Search" ALT="Cited Ref Search"></TD> + + <TD>Search for articles that cite an author or work</TD> </TR> </TABLE> + <HR> + <TABLE> + <TR > + <TD NOWRAP> <A HREF= http://tame.mimas.ac.uk:80/isicgi/CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=LoadQuery>Using Saved Queries:</A></TD><TD> Instructions for editing and running saved queries.</TD> + </TR> </TABLE> + + Enter full pathname of saved query (e.g., c:\myqueries\query1) or use Browse.<BR> + <TABLE> + <TR> + <TD NOWRAP> + <INPUT TYPE=file NAME=fileToUpload VALUE = "" ALT="Browse""> + </TD> + <TD> + <INPUT TYPE=SUBMIT NAME=Func VALUE="Open Query" ALT="Open Query"> + </TD> + </TR> + </TABLE> + <INPUT TYPE=HIDDEN NAME=Form VALUE=Full> + <HR></FORM> 
+<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/GeneralSearch.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/GeneralSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..f5ba69fa7b46d47aad1f893b02ce8fb7a319a704 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/GeneralSearch.html @@ -0,0 +1,178 @@ +<HTML><HEAD><TITLE>General Search -- Web of Science v4.31</TITLE> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=http://tame.mimas.ac.uk:80/isicgi/CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + <TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#General_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" 
SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + <CENTER><STRONG><FONT SIZE=4> + General Search</FONT><BR></STRONG></CENTER> + Enter individual search terms or phrases separated by search operators such as AND or OR then press SEARCH below.<BR> + <A href=#setlimits><FONT SIZE=+1> + Set language and document type limits and sort option.</A></FONT><BR> + <TABLE><TR> + <TD ALIGN=right HEIGHT="1" WIDTH="74"><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif NAME="Search" ALT="Search"></TD> + + <TD> + Search using terms entered below.</TD></TABLE><HR> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Basic_Index> + TOPIC:</A> Enter terms from the article title, keywords, or abstract + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#topic_search_examples> + Examples</A><BR> + <INPUT TYPE=TEXT NAME="topic" SIZE="50" VALUE=""> + <INPUT TYPE=CHECKBOX NAME="titleonly"> +Title only<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Author> + AUTHOR:</A> + Enter one or more author names as O'BRIAN C* OR OBRIAN C*<BR> + <INPUT TYPE=TEXT NAME="author" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Journal> + SOURCE TITLE:</A> + Enter journal title or copy and paste from the <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/A_fulljt.html> + source list</A><BR> + <INPUT TYPE=TEXT NAME="journal" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Address> + ADDRESS:</A> + Enter terms from an author's affiliation as YALE UNIV SAME HOSP (see <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/adabrv.html>abbreviations list</A>)<BR> + <INPUT TYPE=TEXT NAME="address" SIZE="50" VALUE=""> +<BR> + <HR> + <TABLE> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif ALT="Search" BORDER=0 VSPACE=0 HSPACE=1 NAME="Search"></TD> + + <TD> + Search using terms 
entered above.<BR></TD> <TR> + <TD ALIGN=RIGHT><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/saveq.gif ALT="Save query" NAME="Save query"></TD> + <TD> + Save the search terms for future use.<BR></TD> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/clear.gif NAME="Clear" ALT="Clear"></TD> + <TD> + Clear all search terms entered above.</TD> + </TABLE> + <A NAME=setlimits> + <HR> + <STRONG> + SET LIMITS AND SORT OPTION</STRONG><P> + <TABLE FRAME=VOID> <TR ALIGN=LEFT VALIGN=TOP> + <TH COLSPAN=2> Restrict search to a specific language or document type: <TR ALIGN=LEFT VALIGN=TOP> +<TH COLSPAN=2> (Multiple items may be selected from lists) <TH>Sort results by: <TR ALIGN=LEFT VALIGN=TOP> +<TD> <SELECT NAME="languagetype" MULTIPLE SIZE="5"> +<OPTION VALUE="All languages" SELECTED>All languages +<OPTION VALUE="English">English +<OPTION VALUE="Afrikaans">Afrikaans +<OPTION VALUE="Arabic">Arabic +<OPTION VALUE="Bengali">Bengali +<OPTION VALUE="Bulgarian">Bulgarian +<OPTION VALUE="Byelorussian">Byelorussian +<OPTION VALUE="Catalan">Catalan +<OPTION VALUE="Chinese">Chinese +<OPTION VALUE="Croatian">Croatian +<OPTION VALUE="Czech">Czech +<OPTION VALUE="Danish">Danish +<OPTION VALUE="Dutch">Dutch +<OPTION VALUE="Estonian">Estonian +<OPTION VALUE="Finnish">Finnish +<OPTION VALUE="Flemish">Flemish +<OPTION VALUE="French">French +<OPTION VALUE="Gaelic">Gaelic +<OPTION VALUE="Galician">Galician +<OPTION VALUE="Georgian">Georgian +<OPTION VALUE="German">German +<OPTION VALUE="Greek">Greek +<OPTION VALUE="Hebrew">Hebrew +<OPTION VALUE="Hungarian">Hungarian +<OPTION VALUE="Icelandic">Icelandic +<OPTION VALUE="Italian">Italian +<OPTION VALUE="Japanese">Japanese +<OPTION VALUE="Korean">Korean +<OPTION VALUE="Latin">Latin +<OPTION VALUE="Macedonian">Macedonian +<OPTION VALUE="Multi-Language">Multi-Language +<OPTION VALUE="Norwegian">Norwegian +<OPTION VALUE="Persian">Persian 
+<OPTION VALUE="Polish">Polish +<OPTION VALUE="Portuguese">Portuguese +<OPTION VALUE="Provencal">Provencal +<OPTION VALUE="Rumanian">Rumanian +<OPTION VALUE="Russian">Russian +<OPTION VALUE="Serbian">Serbian +<OPTION VALUE="Serbo-Croatian">Serbo-Croatian +<OPTION VALUE="Slovak">Slovak +<OPTION VALUE="Slovene">Slovene +<OPTION VALUE="Spanish">Spanish +<OPTION VALUE="Swedish">Swedish +<OPTION VALUE="Turkish">Turkish +<OPTION VALUE="Ukrainian">Ukrainian +<OPTION VALUE="Welsh">Welsh +</SELECT> +<TD><SELECT NAME="doctype" MULTIPLE SIZE="5"> +<OPTION VALUE="All document types" SELECTED>All document types +<OPTION VALUE="Article">Article +<OPTION VALUE="Abstract of Published Item">Abstract of Published Item +<OPTION VALUE="Art Exhibit Review">Art Exhibit Review +<OPTION VALUE="Bibliography">Bibliography +<OPTION VALUE="Biographical-Item">Biographical-Item +<OPTION VALUE="Book Review">Book Review +<OPTION VALUE="Chronology">Chronology +<OPTION VALUE="Correction">Correction +<OPTION VALUE="Correction, Addition">Correction, Addition +<OPTION VALUE="Dance Performance Review">Dance Performance Review +<OPTION VALUE="Database Review">Database Review +<OPTION VALUE="Discussion">Discussion +<OPTION VALUE="Editorial Material">Editorial Material +<OPTION VALUE="Excerpt">Excerpt +<OPTION VALUE="Fiction, Creative Prose">Fiction, Creative Prose +<OPTION VALUE="Film Review">Film Review +<OPTION VALUE="Hardware Review">Hardware Review +<OPTION VALUE="Item About an Individual">Item About an Individual +<OPTION VALUE="Letter">Letter +<OPTION VALUE="Meeting Abstract">Meeting Abstract +<OPTION VALUE="Meeting-Abstract">Meeting-Abstract +<OPTION VALUE="Music Performance Review">Music Performance Review +<OPTION VALUE="Music Score">Music Score +<OPTION VALUE="Music Score Review">Music Score Review +<OPTION VALUE="News Item">News Item +<OPTION VALUE="Note">Note +<OPTION VALUE="Poetry">Poetry +<OPTION VALUE="Record Review">Record Review +<OPTION VALUE="Reprint">Reprint +<OPTION 
VALUE="Review">Review +<OPTION VALUE="Script">Script +<OPTION VALUE="Software Review">Software Review +<OPTION VALUE="TV Review, Radio Review">TV Review, Radio Review +<OPTION VALUE="TV Review, Radio Review, Video">TV Review, Radio Review, Video +<OPTION VALUE="Theater Review">Theater Review +</SELECT> +<TD><SELECT NAME="Sort" SIZE="5"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="Times Cited">Times Cited +<OPTION VALUE="Relevance">Relevance +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +</SELECT> +</TABLE>Back to <A HREF=#top> + top of Search</A> + page <P> + <HR><BR> + </OL> + <INPUT TYPE=HIDDEN NAME=Form VALUE=General> + <INPUT TYPE=HIDDEN NAME=Func VALUE=Search> + </FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedRecords.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedRecords.html new file mode 100644 index 0000000000000000000000000000000000000000..8fb05bd747faae312aab40dcb4e41dde6ab3d217 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedRecords.html @@ -0,0 +1,152 @@ +<HTML><HEAD><TITLE>Marked Records -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> +<INPUT TYPE=HIDDEN NAME="Form" VALUE="Marked_Records"> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" 
NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpprn.html#Print_&_Export_Marked_Records"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="General Search" ALT="General Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbgsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + +<INPUT TYPE=HIDDEN NAME=id VALUE=6> + +<div align="center"> + <table width="650" border="0" cellspacing="0" cellpadding="0"> + <tr> + <td width="231" align="center"> + </td> + <td width="215"> + <p align="center"><br> + <STRONG><FONT SIZE=4>Marked Records</FONT></STRONG> + </td> + <td align="right"> </td> + </tr> + <tr> + <td width="231" align="center"> + <p align="right"><b>500</b></td> + <td width="215"> + <p align="center"> <b>Records on the marked list</b></p> + </td> + <td align="right"><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List onClick="this.href = confirmLink( 'Warning: Pressing OK will clear the marked list.', 'CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List', 'javascript:void(0)');"> +<IMG SRC="Images/clearlst.gif" ALT="Clear Marked List" BORDER="0"></A></td> + </tr> + </table> +</div> +<hr> + +<font size="+1"><b>STEP 1: Select sort and output fields for the entire marked 
list.</b></font> + +<div align="center"> +<table width="92%" border="1" height="124"> + <tr> + <td width="21%" valign="top" height="124"> + <div align="left"> + <p align="center"><b>Select sort option:</b> + </p> + </div> + <div align="left"> + <p> + <SELECT NAME="MarkedSort" SIZE="4"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +<OPTION VALUE="Times Cited">Times Cited +</SELECT> + + </p> + </div> + </td> + <td width="79%" height="124"> + + + <p align="center"><b>Select fields to include in addition to the author(s), + article title and source.</b> </p> + + + <table width="481"> + <tr> + + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=include_refs >cited references*</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=address >addresses</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=abstract >abstract</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=language >language</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=publisher >publisher information</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=ISSN >ISSN</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=doctype >document type</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=keywords >keywords</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=timescited >times cited</td> + </tr> + </table> + + <FONT SIZE=-1><i>*Selecting the cited references may cause the server + to time out with large numbers of records.</i></FONT> + + </td> + </tr> +</table> +</div> + +<br> + +<font size="+1"><b>STEP 2: Select action for output.</b></font><br> + +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE 
SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<hr> + <BR> +<DL><DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174872000059 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/1>Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002<!000174872000059> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174858300003 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/2>Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002<!000174858300003> +<BR><BR> +<!--snip--> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081310100003 CHECKED> Disney RHL<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/499>A troublesome sibling species complex of scuttle flies (Diptera : Phoridae) revisited</A><BR>J NAT HIST 33 (8): 1159-1216 AUG 1999<!000081310100003> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081297200008 CHECKED> Rosanowski F, Eysholdt U<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/500>Medical expertise prior to voice change surgery in male-to-female transsexuals</A><BR>HNO 47 (6): 556-562 JUN 1999<!000081297200008> +<BR><BR> +</DL> +<hr> +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE 
SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<BR>Back to <A HREF=#top>top of Marked Records</A> page<BR><BR><HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedResults.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedResults.html new file mode 100644 index 0000000000000000000000000000000000000000..cb5b2bc228579eb23e08a8b5fe20a29aa43f9eff --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/MarkedResults.html @@ -0,0 +1,97 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<HR><TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> 
<INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A 
HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 CHECKED> Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE 
WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 
onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <BR>1783 of 16635816 documents matched the query. (500 shown)<HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/Results.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/Results.html new file mode 100644 index 0000000000000000000000000000000000000000..ee31c1fd8370a02969a3098b37d815f6c5be2352 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/Results.html @@ -0,0 +1,94 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + 
+<HR> +<TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR><TR><TD> </TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> </TD> +<TD>. . . + <TD><IMG SRC=Images/frwrd10i.gif ALT="Next 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. 
Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 > Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 > Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 > Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD 
WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A 
HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + +<BR>1783 of 16635816 documents matched the query. (500 shown)<HR> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/ClientForm-0.1.17/testdata/SearchType.html b/LTA/LTAIngest/ClientForm-0.1.17/testdata/SearchType.html new file mode 100644 index 0000000000000000000000000000000000000000..a895c3e0a5c95db2dd1d9e3534a414f801710391 --- /dev/null +++ b/LTA/LTAIngest/ClientForm-0.1.17/testdata/SearchType.html @@ -0,0 +1,55 @@ +<HTML><HEAD><TITLE>Welcome -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + + +<SCRIPT LANGUAGE=JavaScript> +<!-- Hide script from old browsers. +function main(){ + JavaScriptTest(); +} +// End script hide. --> +</SCRIPT> + +</HEAD> +<BODY BGCOLOR=#FFFFFF onLoad="main()" ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <A NAME=top></A> + <CENTER><IMG SRC=Images/main.jpg ALT="Institute for Scientific Information"></CENTER> + + <P> + <CENTER> + <TABLE BORDER=2 CELLPADDING=0> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/fullsch.gif NAME="Full Search" ALT="Full Search" BORDER=0> + <TD> + <TABLE> + <TR><TD><TD>Search by bibliographic information (topic, author, source, address) or by cited reference. 
+</TABLE> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/quiksch.gif NAME="Easy Search" ALT="Easy Search" BORDER=0> + <TD><TABLE> + <TR><TD><TD>Search for a limited number of articles on a specific topic, person, or address.</TABLE> +<TR><TD ALIGN=CENTER> + <INPUT TYPE=IMAGE SRC=Images/newsession.gif NAME="New Session" ALT="New Session" BORDER=0> + <TD> + <TABLE><TR><TD><TD> +Clear all search forms and the marked list.</TABLE> <TR> + <TD ALIGN=CENTER> +<INPUT TYPE=IMAGE SRC=Images/logoff.gif NAME="Log off" ALT="Log off" BORDER=0> + <TD><TABLE> + <TR><TD> +Fully disconnect from the database and make your connection available to another user at your institution.</TD></TABLE><INPUT TYPE=HIDDEN NAME=Form Value=Welcome> +</TABLE></CENTER> +<HR> +<INPUT TYPE=HIDDEN NAME="JavaScript" VALUE="No"> +<P><CENTER><IMG SRC=Images/isilogo.gif ALT="ISI Thomson Scientific"></CENTER><P> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/ChangeLog b/LTA/LTAIngest/SOAPpy-0.12.0/ChangeLog new file mode 100644 index 0000000000000000000000000000000000000000..1e2cfc4911c010bee995ec65544fba09342a6db1 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/ChangeLog @@ -0,0 +1,3195 @@ +2005-02-22 10:58 warnes + + * ChangeLog, README, RELEASE_INFO: Update for 0.12.0 release. + +2005-02-21 23:30 tag SOAPpy_0_12_0 + +2005-02-21 23:30 warnes + + * tests/README: update README since TCtests.py now passes! + +2005-02-21 23:29 warnes + + * SOAPpy/: Parser.py, Types.py: Fix handling of parse rules! + TCtest.py now passes! 
+ +2005-02-21 23:25 warnes + + * tests/: Bug1001646.py, Bug916265.py, ComplexTypes.py, + GoogleTest.py, Makefile, README, TCtest.py, echoClient.py, + echoHeader.py, echoServer.py, largeDataTest.py, testleak.py: + Enhance echoServer with a quit method. Provide a makefile for + running all working tests. + +2005-02-21 15:27 warnes + + * SOAPpy/Client.py: Fix loss of quotes at start of file + description. + +2005-02-21 15:24 warnes + + * SOAPpy/SOAPBuilder.py: Add spaces and newlines to soap + environment declarations when writing SOAP enclosure so that the + results are more human readable. + +2005-02-21 15:21 warnes + + * SOAPpy/Client.py: Add code to handle the case when headers are + empty or not present. + +2005-02-21 15:16 warnes + + * SOAPpy/WSDL.py: Add 'show_methods' which will print the methods + and associated parameters. + +2005-02-21 15:09 warnes + + * docs/: GettingStarted.txt, GlobusSupport.txt, WSDL.txt, + complexTypes.txt, simpleTypes.txt: Update documentation. + +2005-02-18 14:29 warnes + + * SOAPpy/Client.py: If content_length is missing or empty, we need + not check for a the OC4J bug... + +2005-02-18 11:28 warnes + + * README: Add bug reporting and mailing list information. + +2005-02-18 10:42 warnes + + * docs/GettingStarted.txt: Change 'SOAP.py' to 'SOAPpy' + +2005-02-18 10:40 warnes + + * docs/attrs.txt: Remove the long copyright and usage notices. + Clarify the (too brief) text. + +2005-02-18 10:36 warnes + + * docs/: complexTypes.txt, UsingHeaders.txt: Most of the + information in the 'complexTypes.txt' file was actually about the + use of headers. Moved that text to a new file + 'UsingHeaders.txt'. 
+ +2005-02-18 08:50 warnes + + * SOAPpy/wstools/XMLSchema.py: Apply patch submitted by Peter + McA'Nulty of WebReply.com: "class SimpleContent, class + Extension.fromDom() at line ~2313 doesn't handle extensions + without contents -- unusual, but not illegal" + +2005-02-17 01:44 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + + Modified Files: + XMLSchema.py -- added the item trace to an exception + thrown + when an unexpected child is encountered in + complexType.fromDom + method so one can see where the problem is occuring. + + + ---------------------------------------------------------------------- + +2005-02-16 13:39 warnes + + * tests/echoServer.py: Added echo_jheader which returns its + argument and the header, in order to support Bug1001646.py + +2005-02-16 13:34 warnes + + * tests/Bug1001646.py: Test now properly checks to see that the + header is preserved. + +2005-02-16 09:45 tag v1_7 + +2005-02-16 09:45 warnes + + * SOAPpy/wstools/XMLname.py: Remember to paste prefix back onto the + XMLname before returning. + +2005-02-15 23:27 warnes + + * SOAPpy/SOAPBuilder.py: Bug fix for [ 1078051 ] "Arrays of complex + types (doc/lit)" per the patch submited with the bug report by + Nelson Minar. + +2005-02-15 23:25 warnes + + * tests/echoClient.py: Print what we're trying to do so that errors + are easier to track down. + +2005-02-15 23:24 warnes + + * tests/storageTest.py: Updated 'registerUser' to match current + documentation, but still get an error from the server. + +2005-02-15 23:23 warnes + + * tests/echoServer.py: Add 'echo_simple' command that just returns + its arguments as-is. + +2005-02-15 23:22 warnes + + * tests/Bug1001646.py: Turn of some debugging information. 
+ +2005-02-15 23:21 warnes + + * tests/Bug916265.py: Regression test for [ 916265 ] "Arrays of + unicode do not serialize correctly (patch included)" + +2005-02-15 23:09 warnes + + * tests/Bug1001646.py: Regression test for bug [ 1001646 ] SOAPpy + stomps headers when sending multirefs. + +2005-02-15 16:51 warnes + + * SOAPpy/Client.py: Create workaround for a bug in OC4J reported in + [ 1122991 ] "error from SOAPpy/Client.py for content_length + evaluation?" by Merten Schumann. + +2005-02-15 16:09 warnes + + * SOAPpy/SOAPBuilder.py: Fix [ 1106450 ] "Floats are truncated to + 10 digits, causing precision loss" submitted by Kerry 'krdavis'. + +2005-02-15 16:07 warnes + + * SOAPpy/Client.py: Fixed [ 1096971 ] "Parse error: missing HTTP + header 'Content-length'" submitted by 'pure water'. + +2005-02-15 15:59 warnes + + * SOAPpy/Types.py: Fix [ 1064248 ] "Bugs in _asdict() and + _asarray() in Types.py" submitted by Peter Lamb. + +2005-02-15 15:56 warnes + + * SOAPpy/SOAPBuilder.py: Fix handling of 0-length arrays. + +2005-02-15 15:52 warnes + + * SOAPpy/SOAPBuilder.py: Apply [ 1064233 ] "Bug fixes for complex + types" from Peter Lamb. + +2005-02-15 15:41 warnes + + * SOAPpy/SOAPBuilder.py: Fix bug [ 1001646 ] SOAPpy stomps headers + when sending multirefs using included patch provide by Nelson + Minar + +2005-02-15 15:15 warnes + + * SOAPpy/Client.py: Fix [ 925077 ] SOAPpy prints out SOAP fault + /even when Config.debug is off/. + +2005-02-15 15:12 warnes + + * SOAPpy/Parser.py, tests/Bug918216.py: Fix parsing bug & add + regression test. SOAPpy was not allowing anything after the + close of a faulttype block, but multirefs should be permitted + there. Closes bug [ 918216 ] "Parsing faults in SOAPpy 0.11.3" + +2005-02-15 14:30 warnes + + * tests/ZeroLengthArray.py: Test handling of zero-length typed + lists. Currently fails. + +2005-02-15 14:22 warnes + + * SOAPpy/SOAPBuilder.py: Revert broken header generation from last + patch. 
+ +2005-02-15 11:41 warnes + + * SOAPpy/SOAPBuilder.py: Fix bug [ 916265 ] Arrays of unicode do + not serialize correctly, submitted by Nelson Minar. + +2005-02-15 11:37 warnes + + * README: Fix typo in README. + +2005-02-15 11:32 warnes + + * LICENSE, README, setup.py, SOAPpy/Client.py, SOAPpy/Errors.py, + SOAPpy/GSIServer.py, SOAPpy/NS.py, SOAPpy/Server.py, + SOAPpy/wstools/XMLname.py, docs/MethodParameterNaming.txt: Update + email address + +2005-02-09 13:33 tag bogus-12_9_123RC93 + +2005-02-09 13:33 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + + Modified Files: + Utility.py -- need to do isfile check before calling mod + func + "urlopen" for cross-platform compat. + + + ---------------------------------------------------------------------- + +2005-02-07 12:07 irjudson + + * SOAPpy/wstools/WSDLTools.py: Mod to flatten wsdl imports, schemas + not dealt with yet. + +2005-02-04 14:18 boverhof + + * SOAPpy/wstools/Namespaces.py: + ---------------------------------------------------------------------- + Committing in . + + Modified Files: + Namespaces.py -- OASIS.PROPERTIES wasn't quite correct + (suffix .wsdl --> .xsd) + + ---------------------------------------------------------------------- + +2005-02-01 13:10 boverhof + + * SOAPpy/wstools/Namespaces.py: + ---------------------------------------------------------------------- + + Modified Files: + Namespaces.py -- added OASIS BaseFaults. + + + ---------------------------------------------------------------------- + +2005-01-31 09:45 warnes + + * README: Update URL for fpconst. + +2005-01-27 18:01 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added a couple convenience methods for + grabbing + various Schema Items. 
+ + + ---------------------------------------------------------------------- + +2005-01-25 19:31 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + + Modified Files: + Utility.py -- change to accomodate writing out XML + instances + w/o SOAP Envelopes, "known" prefixes aren't + "known" + when the Envelope isn't writen out. + + + ---------------------------------------------------------------------- + +2005-01-18 12:18 boverhof + + * SOAPpy/wstools/: Namespaces.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + Namespaces.py + -- added Oasis base notification ns. + XMLSchema.py + -- modified an exception message. + + ---------------------------------------------------------------------- + +2004-12-20 15:36 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py + -- removed a line that was screwing up imports/includes + of + when WSDL file was specified to "wsdl2py" via a relative + path. + + XMLSchema.py + -- stop requiring suffix "xsd" for schema files. + + + ---------------------------------------------------------------------- + +2004-12-17 16:41 boverhof + + * SOAPpy/wstools/c14n.py: + ---------------------------------------------------------------------- + + Modified Files: + c14n.py -- fixed a bug affecting those who don't have + pyXML installed. + + ---------------------------------------------------------------------- + +2004-12-08 15:04 boverhof + + * SOAPpy/wstools/c14n.py: + ---------------------------------------------------------------------- + + Modified Files: + c14n.py -- just removed import of ZSI so SOAPy can use + w/o ZSI. + + ---------------------------------------------------------------------- + +2004-12-07 10:54 blunck2 + + * SOAPpy/wstools/__init__.py: uncommented import of WSDLTools. 
+ josh commented this out during another commit and i think it was + unintentional + +2004-11-30 01:27 boverhof + + * SOAPpy/wstools/: Namespaces.py, WSDLTools.py: + ---------------------------------------------------------------------- + + Modified Files: + Namespaces.py WSDLTools.py + -- added WSA 2004/08 namespaces, and enabled "PortType" + to + discover it. + + ---------------------------------------------------------------------- + +2004-11-16 15:59 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added a couple helper methods for + discovering + whether or not an element should be qualified. + + + ---------------------------------------------------------------------- + +2004-11-12 18:14 boverhof + + * SOAPpy/wstools/: Namespaces.py, WSDLTools.py, XMLSchema.py, + logging.py: + ---------------------------------------------------------------------- + Modified Files: + Namespaces.py + -- Added a SOAP-1.2 binding + WSDLTools.py + -- Added some methods from grabbing ElementDeclaration + and + TypeDefintion from Message instances. + XMLSchema.py + -- fixed a bug in SchemaReader. + logging.py + -- added a couple more functions, and a level to basic + logger. + + ---------------------------------------------------------------------- + +2004-11-08 14:46 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + Committing in . + + Modified Files: + Utility.py -- removed "sw" property in ElementProxy. + + + ---------------------------------------------------------------------- + +2004-11-05 02:32 boverhof + + * SOAPpy/wstools/c14n.py: + + ---------------------------------------------------------------------- + Modified Files: + c14n.py -- opps, want to call the implementation + constructor + when passed any NodeType, not just Element nodes. 
+ + + ---------------------------------------------------------------------- + +2004-11-05 01:35 boverhof + + * SOAPpy/wstools/: Namespaces.py, Utility.py, WSDLTools.py, + __init__.py, c14n.py: + ---------------------------------------------------------------------- + + Modified Files: + ** removed all "imports" of ZSI or ZSI.wstools, so + wstools + can be used independently by SOAPpy. + + Namespaces.py + -- added a namespace + + Utility.py + -- moved ZSI.utility here, and the "Base" class for + logging. + + WSDLTools.py + -- added a "toDom" and "GetWSDL" methods to several + classes, + so now you can construct a WSDL instance and then call + WSDL.toDom() --> DOM --> and create a WSDL file. + + __init__.py + -- removed "Base" class for logging. + + Added Files: + c14n.py + -- moved the c14n stuff from ZSI.compat here. + + + ---------------------------------------------------------------------- + +2004-11-04 18:06 mateo41 + + * SOAPpy/wstools/Namespaces.py: overloaded the DSIG.C14N (from + PyXML Namespace with a newer canonicalization algorithm. Added + GLOBUS.SIG, which is the namespace for SecureConversation + +2004-10-26 20:09 boverhof + + * SOAPpy/wstools/Namespaces.py: + ---------------------------------------------------------------------- + + Modified Files: + Namespaces.py -- added OASIS.LIFETIME + + ---------------------------------------------------------------------- + +2004-10-22 16:11 boverhof + + * SOAPpy/wstools/: Utility.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- catch any exceptions thrown when a DOM is + loaded up, + throw a ParseError with old ex.args and inform + which file + caused the problem. + + XMLSchema.py -- For attributeGroup incorrectly adding + global + attribute declarations, but these are declared + locally. 
+ + + ---------------------------------------------------------------------- + +2004-10-21 02:40 mateo41 + + * SOAPpy/wstools/Namespaces.py: added another namespace to WSSE, + and created a GLOBUS namespace class with 2 namespaces + +2004-10-20 18:35 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + + Modified Files: + WSDLTools.py -- uncommented some "import" code that I + didn't + think was of any use. Now I need it. + + + ---------------------------------------------------------------------- + +2004-10-19 21:33 mateo41 + + * SOAPpy/wstools/Namespaces.py: added the BEA class, which is an + organization which has created schema for the SecureConversation + protocol + +2004-10-19 18:52 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py + XMLSchema.py + --removed some print statements I left behind. + + ---------------------------------------------------------------------- + +2004-10-18 22:51 boverhof + + * SOAPpy/wstools/Namespaces.py: + + ---------------------------------------------------------------------- + + Modified Files: + Namespaces.py -- added some URLs for Oasis + specifications. + + + ---------------------------------------------------------------------- + +2004-10-15 21:15 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- forgot to look for 2004 WS-Actions. + + + ---------------------------------------------------------------------- + +2004-10-14 04:24 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py + -- resolution of default WS-Action was incorrect when + using + imported portTypes. 
+ + XMLSchema.py + -- added a couple helper functions, and a few more + Markers + for introspecting modelGroups and simpleType derivations. + + + ---------------------------------------------------------------------- + +2004-10-01 00:27 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + + Modified Files: + XMLSchema.py -- SchemaReader wasn't resolving relative + paths correctly. + + + ---------------------------------------------------------------------- + +2004-09-27 16:40 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- Shouldn't check attributes for WSDL + definition + since it's legal to specify <any> attribute in + <definition> + + + ---------------------------------------------------------------------- + +2004-09-27 15:55 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- required another condition in + getItemTrace, + need to check if at <xsd:schema> or <wsdl:definition>, + also added some info to SchemaError throw in + checkAttributes. + Now provides namespace/attribute and the Schema Item + containing + the offending attribute. + + + ---------------------------------------------------------------------- + ~ + +2004-09-22 18:40 boverhof + + * SOAPpy/wstools/: XMLSchema.py, __init__.py, logging.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py + -- added a few convience methods, and 'disabled' a few + methods + to force a sane usage. + + __init__.py + -- removed use of python logging module, and replaced it + with + the below. + + Added Files: + logging.py + -- simple interface to log message to, can write your own + logger class. By default do no logging. 
+ + + ---------------------------------------------------------------------- + +2004-09-21 18:19 boverhof + + * SOAPpy/wstools/XMLSchema.py: + + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added a "getItemTrace" method to + XMLSchemaComponent + Base class to produce a node trace. Added a bunch of + "tag" + class variables for identifying what an instance + represents, + and removed a bunch of unused code. + + + ---------------------------------------------------------------------- + +2004-09-10 23:14 warnes + + * RELEASE_INFO: Update for release 0.11.6 + +2004-09-10 23:07 tag SOAPpy_0_11_6 + +2004-09-10 23:07 warnes + + * ChangeLog, SOAPpy/version.py: Update version number + +2004-09-10 23:03 warnes + + * LICENSE, README, RELEASE_INFO, SOAPpy/Types.py: - Update URLs and + email address. - Use 'dictType' instead of 'dict' in type check. + +2004-09-09 19:32 boverhof + + * SOAPpy/wstools/__init__.py: + ---------------------------------------------------------------------- + + Modified Files: + __init__.py -- changed "Base", which contains a logger, + to + no-op logging if the logging configuration file is not + found and avoid the overhead of using the logging + module. + + + ---------------------------------------------------------------------- + +2004-09-09 00:37 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + + Modified Files: + WSDLTools.py -- Commented out the "imports" collection, + which isn't of much use. + + Now URLs are resolved relative to the importing document. + + Support for this scenario: + + --- + /Users/boverhof/Desktop/Wsdl/Service/whatever.wsdl + <definition> + <import location="../hello/hello.wsdl"/> + ... + </definition> + + --- /Users/boverhof/Desktop/Wsdl/hello/hello.wsdl + <definition> + <import location="goodbye.wsdl"/> + ... 
+ </definition> + + --- /Users/boverhof/Desktop/Wsdl/hello/goodbye.wsdl + <definition> + ... + </definition> + + + ---------------------------------------------------------------------- + +2004-09-01 22:18 tag SOAPpy_0_11_5 + +2004-09-01 22:18 warnes + + * ChangeLog, RELEASE_INFO, SOAPpy/version.py: Update for release + 0.11.5 + +2004-08-18 19:12 boverhof + + * SOAPpy/wstools/__init__.py: + ---------------------------------------------------------------------- + + Modified Files: + __init__.py -- cleaned up the module, removed some old + code. + + + ---------------------------------------------------------------------- + +2004-06-23 16:10 boverhof + + * SOAPpy/wstools/XMLSchema.py: + + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added getAttributeContent method wherever + appropriate. + + + ---------------------------------------------------------------------- + +2004-06-23 14:05 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- SoapBinding item soap:header message + attribute + needs to be stored as a (namespace,name) tuple to + enable + lookup. + + + ---------------------------------------------------------------------- + +2004-06-21 17:40 boverhof + + * SOAPpy/wstools/Namespaces.py: + ---------------------------------------------------------------------- + Modified Files: + Namespaces.py -- added 2004 ws-addressing namespaces. 
+ + ---------------------------------------------------------------------- + +2004-06-05 14:30 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- couple bug fixes for determining default + wsa:Action values + + ---------------------------------------------------------------------- + +2004-06-04 12:53 boverhof + + * SOAPpy/wstools/WSDLTools.py: + + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- added a "getResourceProperties" method to + class PortType. + + ---------------------------------------------------------------------- + +2004-06-04 12:44 boverhof + + * SOAPpy/wstools/__init__.py: + ---------------------------------------------------------------------- + Enter Log. Lines beginning with `CVS:' are removed + automatically + + Committing in . + + Modified Files: + __init__.py -- added some code for doing logging + + + ---------------------------------------------------------------------- + +2004-06-03 09:03 warnes + + * MANIFEST.in: Add text files like LICENSE to the list included in + distributed packages. + +2004-05-14 00:37 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- Had to change how the default wsa:Action + values + are set up. Must wait until entire WSDL is loaded, + otherwise + can get into situations where the objects we need to + access + haven't been initialized yet because of the order of + WSDL + information items in the WSDL definition. + + + ---------------------------------------------------------------------- + +2004-05-13 13:15 tyger23 + + * SOAPpy/wstools/Namespaces.py: added XSD_LIST to SCHEMA because + it's absence broke wsdl2python. 
+ +2004-05-11 04:07 boverhof + + * SOAPpy/wstools/: Namespaces.py, WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + Namespaces.py -- added a bunch of namespaces (encryption, + ws-address, ws-resourcepolicy, etc) + + WSDLTools.py -- added functionality for getting + WS-ResourceProperties and + ws-Address information out of WSDL. Ran all unittests + and passed. + + + ---------------------------------------------------------------------- + +2004-05-10 21:09 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added some code to generate user + interpretable + exceptions. + + BEFORE: + + File + "/System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/site-packages/ZSI/wstools/Utility.py", + line 600, in __getitem__ + return self.data[key] + KeyError: u'xtvd' + + AFTER: File + "/System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/site-packages/ZSI/wstools/XMLSchema.py", + line 465, in getQNameAttribute raise KeyError, + "targetNamespace(%s) collection(%s) has no item(%s)"\ KeyError: + u'targetNamespace(urn:TMSWebServices) collection(types) has no + item(xtvd)' + + + ---------------------------------------------------------------------- + +2004-04-28 21:40 boverhof + + * SOAPpy/wstools/: Namespaces.py, Utility.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py + XMLSchema.py -- just moved SplitQName out of here and + into Utility + + Added Files: + Namespaces.py -- WSDL, SOAP, SCHEMA, XMLNS namespaces + here. 
Doesn't + require PyXml + + + ---------------------------------------------------------------------- + +2004-04-28 17:47 warnes + + * SOAPpy/Server.py: Fix string format error in fault handling + +2004-04-27 11:47 warnes + + * CHANGELOG: Renamed to ChangeLog + +2004-04-27 11:39 warnes + + * ChangeLog, SOAPpy/version.py: Update for 0.11.4. + +2004-04-27 11:38 warnes + + * RELEASE_INFO: + Updated for 0.11.4 release. + +2004-04-27 11:23 warnes + + * SOAPpy/Server.py: Check if header information contains SOAPAction + key before checking its value. + +2004-04-27 11:22 warnes + + * tests/TCtest.py: Convert TCtest.py to unit test framework. + +2004-04-13 23:42 irjudson + + * SOAPpy/Types.py: Added traceback info to exception for + methodnotfound. + +2004-04-13 23:41 irjudson + + * SOAPpy/Server.py: Added traceback back in, without optional + config arg, seems resonable to do it this way. + +2004-04-11 18:01 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py + -- Fix to DOM singleton hasAttr method. _attr and + _attrNS are + instance variables of the minidom Element, and are + implementation + specific. Now hasAttr method will work when using + FtNode.Element, + which is what ZSI ParsedSoap defaults to (Ft.Doc). + + + ---------------------------------------------------------------------- + +2004-04-10 00:29 irjudson + + * SOAPpy/Types.py: Applied patch from antonio.beamud@linkend.com, + for tagging complexType arrays. Looks entirely self-contained, + and therefore safe to apply. Also includes new classes for the + standard soap faults that can occur, so clients can catch them + directly. + +2004-04-10 00:28 irjudson + + * SOAPpy/SOAPBuilder.py: Applied patch from + antonio.beamud@linkend.com, for tagging complexType arrays. Looks + entirely self-contained, and therefore safe to apply. 
+ +2004-04-10 00:26 irjudson + + * SOAPpy/Server.py: Changed faultType construction to be more + client side parsable, make faultstring a non-variable string (ie + no nsmethod in it) so that it can be programmatically checked + more easily. Modified faultdetail to be the nsmethod. Now the + traceback doesn't get passed to the client side. + +2004-04-10 00:22 irjudson + + * SOAPpy/Client.py: Put quotes back in around SOAP Action, for spec + conformance. + +2004-04-02 08:45 irjudson + + * SOAPpy/Client.py: Removed quotes from SOAPAction header, seems to + be more correct. + +2004-04-01 08:25 warnes + + * SOAPpy/SOAPBuilder.py, SOAPpy/Types.py, SOAPpy/version.py, + tests/SOAPtest.py: - Fixed bug that caused typedArrayTypes to + lose their type information when rendered to SOAP. - Added + corresponding test case to SOAPtest.py - Updated version number. + +2004-03-30 19:42 boverhof + + * SOAPpy/wstools/Utility.py: + + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- 'attrsNS' replaced with '_attrNS', the + actual name + + + ---------------------------------------------------------------------- + +2004-03-30 18:19 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- isElement should return False if NOT an + Element. + + + ---------------------------------------------------------------------- + +2004-03-25 15:46 irjudson + + * SOAPpy/Server.py: Modified unregsiterObject function to take + optional namespace/path args to be consistent with + registerObject. + +2004-03-22 14:32 irjudson + + * SOAPpy/Client.py: Fixed indent block bug and removed extraneous + print. + +2004-03-18 19:21 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- qualified Names are now stored as tuples + (namespace, localName). 
+ These tuples are used as keys into CollectionNS + instances. + + + ---------------------------------------------------------------------- + +2004-03-18 09:19 irjudson + + * SOAPpy/Client.py: Introduced the notion of a fault handler, if a + call fails a user specified fault handler can be invoked (if it's + specified). This can be used in many situations, we're using is + specifically to handle credential problems. + +2004-03-11 18:07 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed a few problem with the XML + namespace, mainly + affecting the use of 'xml:lang'. Fixed a bug + identifying + attributeGroup references vs. definitions. Also + changed the + inheritance of MarkerInterface, and moved the classes + involved + before all Schema classes. + + Now will parse "XML Schema Part 1: Structures", and + "XML Schema Part 2: Datatypes" XML Schema + definitions. + + + ---------------------------------------------------------------------- + +2004-03-11 14:14 boverhof + + * SOAPpy/wstools/: Utility.py, WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + + Utility.py -- added a CollectionNS class that keys items + via + (targetNamespace, name). + + WSDLTools.py -- + + Made WSDL Collections into CollectionNS instances, + this fixes problem with collisions, caused by + wsdl:imports, + between items with same name but defined in different + targetNamespaces. So now all items can be accessed + via + (namespace,name), but ONLY those items defined in + WSDL.targetNamepsace (not an import.targetNamespace) + can + be accessed using just 'name'. + + Also changed how portType is "loaded". Now instead + of + dropping all the operation nodes in "load", I drop + the portType node into "load". 
This makes sense + because + portType really should know about itself, and the + XML Schema definition of "portType" includes an + "anyAttribute" + and I need to make this stuff available. I may + change the + other WSDL information items to do this to be + consistent. + + + ---------------------------------------------------------------------- + +2004-03-09 17:53 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py - small fix for the creation of the output + parameters, + to enable return messages to be properly typecoded. + + + ---------------------------------------------------------------------- + +2004-03-04 14:50 irjudson + + * SOAPpy/SOAPBuilder.py: Added calls in dumpers to ensure tags are + built using toXMLname. + +2004-03-01 20:34 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- fixed bug in _clone_node patch. This + method + was being called with qualifiedName for namespaced + attributes, + and bombing. It is supposed to be called like so: + + getAttributeNodeNS(self, namespaceURI, localName) + + Really could remove the if/else clause here but I + decided to leave it there just to keep the 3 + distinctions + in the code. + + + ---------------------------------------------------------------------- + +2004-03-01 18:27 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed bugs concerning Model Group + References (missing minOccurs/maxOccurs), + and simpleType annotations. + + + ---------------------------------------------------------------------- + +2004-02-19 10:37 irjudson + + * SOAPpy/Server.py: Added an unregisterObject function, which + unfortunately looks through the entire self.objmap dictionary to + find the object to remove, but it does remove it. 
+ +2004-02-18 16:48 warnes + + * SOAPpy/: SOAPBuilder.py, Types.py: Changes to allow SOAPBuilder + to allow it to be passed a 'raw' Python object. + +2004-02-18 16:27 warnes + + * tests/: esj_test_client.py, esj_test_server.py: Add quit() method + to server and appropriate call to client. + +2004-02-18 16:22 warnes + + * tests/: cardClient.py, cardServer.py: Add quit() method to + cardServer, and have cardClient call it when done. + +2004-02-17 23:53 warnes + + * CHANGELOG, RELEASE_INFO: Update CHANGELOG and RELEASE_INFO for + 0.11.3 release. + +2004-02-17 23:36 warnes + + * tests/SOAPtest.py: Fix a change to how SOAPpy returns namespace + URI's that are non-string. They used to be converted to strings + and now are not. I'm not entirely sure what the correct behavior + is... + +2004-02-17 23:28 warnes + + * SOAPpy/Types.py: Accept provosed revision by Ivan Judson to the + handling of faultType objects in simplify(). + +2004-02-17 23:25 warnes + + * SOAPpy/version.py: Update version number for 0.11.1 release. + +2004-02-17 23:17 warnes + + * docs/GettingStarted.txt: Add \n to end of file. + +2004-02-17 23:16 warnes + + * tests/testClient1.py: Add test of echo with named parameter in + call to catch bugs related to providing names. + +2004-02-17 23:15 warnes + + * SOAPpy/SOAPBuilder.py: Fix bug #875977: no escaping of bad + tagnames for NoneTypes. Both bug and fix suggested by Robert + Zimmermann. + +2004-02-17 23:10 warnes + + * SOAPpy/SOAPBuilder.py: Apply patch to fix bug #888345: Python 2.3 + boolean type serialized as int, both provided by Nelson Minar. + +2004-02-17 23:05 warnes + + * SOAPpy/Client.py: Nelson Minar reported bug 888352 and provided a + patch: + + If the server does not send a content-length header for the + response, SOAPpy's Client fails to read the response. The + Google + Web APIs at api.google.com are one such service, as of around + 2004-01-20. 
+ + A patch is included below to just read the whole socket's + contents + if there is no content-length. This should work fine for + HTTP/1.0, + but will cause troubles with HTTP/1.1 and chunked + encoding. SOAPpy's Client.py uses an old HTTP compatibility + class + from httplib which sends 1.0 requests, so this seems safe. + +2004-02-13 14:04 irjudson + + * SOAPpy/Types.py: I've added a SOAPException(Exception) class. The + simplify_objects option now raises a SOAPException instead of a + faultType. This seems to make more sense to me, but I could be + wrong. + +2004-02-13 14:02 irjudson + + * tests/echoClient.py: Added another missing call. + +2004-02-03 22:21 irjudson + + * SOAPpy/GSIServer.py: Modified GSIServer to have a GSIConfig that + handles the pyGlobus specific configuartion details. Hides this + from users. + +2004-02-03 01:39 irjudson + + * docs/GlobusSupport.txt: Updated for simpler client usage. + +2004-02-03 01:38 irjudson + + * SOAPpy/Server.py: Added a fix for exception handling that cleans + up exception data structures. This plugs a memory leak when + exceptions are raised. + +2004-02-03 01:21 irjudson + + * tests/echoServer.py: Added Context Test, cleaned up output to + honor Config.debug flag more. + +2004-02-03 01:20 irjudson + + * tests/echoClient.py: Cleaned up client usage of globus, added in + simplest test. + +2004-02-03 01:11 irjudson + + * SOAPpy/Client.py: Changed the use of SOAPAction, it used to + default to setting it to "", now it defaults to setting it to the + method (not the nsmethod). There is a clause in Server.py that + catches 'old style' SOAPActions (aka "") and sets them to the + method. When this is confirmed to be what everyone wants and we + decide it's alright to (possibly) break client/server interop, we + can take the clause out of Server.py and just handle SOAPActions + of "" as a possible error/warning. + +2004-02-03 01:08 irjudson + + * SOAPpy/GSIServer.py: Turned off default logging. 
+ +2004-02-03 01:08 irjudson + + * SOAPpy/Server.py: Added context handling. Each call builds a SOAP + context and puts it in the global dict _contexts (indexed by + thread id). There is also a new function: GetSOAPContext() that + returns the context for the current thread. Removed previously + added method keyword args,. Turned off default log = 1 in + RequestHandler Classes. + +2004-01-30 23:20 warnes + + * SOAPpy/: Client.py, Config.py, Errors.py, GSIServer.py, NS.py, + Parser.py, SOAP.py, SOAPBuilder.py, Server.py, Types.py, + URLopener.py, Utilities.py, WSDL.py, __init__.py, version.py: + Add ident and __version string to all files. + +2004-01-30 23:19 warnes + + * SOAPpy/Server.py: + Fix bug reported by Dan Nathan that occurs using named arguments. + +2004-01-26 01:39 dwrobertson + + * SOAPpy/wstools/test/test_WSDLReader.py: Removed duplicate test. + +2004-01-26 01:38 dwrobertson + + * SOAPpy/wstools/test/: README, __init__.py, test_wsdl.py, + test_wstools.py, test_wstools_net.py: Brought README up to date, + made changes having to do with moving of ZSI-specific material to + zsi/test/wsdlpy, removed dependencies on utils.py. + +2004-01-26 01:35 dwrobertson + + * SOAPpy/wstools/test/: config.py, config.txt: config.py renamed to + config.txt + +2004-01-26 01:34 dwrobertson + + * SOAPpy/wstools/test/: test_wsdl2python.py, utils.py: Code moved + to zsi/test/wsdl2py + +2004-01-21 16:54 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added substutionGroup to the list of + attributes + that can be specified in an ElementDeclaration. + + ---------------------------------------------------------------------- + +2004-01-15 01:09 warnes + + * docs/GlobusSupport.txt: + Added GlobusSupport.txt documentation submitted by Ivan R. Judson + <judson@mcs.anl.gov>. + +2004-01-15 00:39 warnes + + * tests/testClient1.py: + Add test for dataTimeType objects. 
+ +2004-01-15 00:34 warnes + + * SOAPpy/Server.py, tests/echoServer.py: + Integrate patch from Ivan R. Judson [mailto:judson@mcs.anl.gov]: + + "There's a namespace collision in the Server.py module for + deciding what namespace should be used, ie, r._ns or the + path. I've resolved it with a big comment that basically + says, + r._ns is the preferred specification, but that if it is + missing + and self.path exists, use that. + + "This patch also includes the previous patch to include + "method" : + nsmethod in the keywords for invocations so that + authorization + gets the methodname." + +2004-01-05 13:03 warnes + + * SOAPpy/Types.py: + - Code assumes nested scopes, so I added the proper import so + this will work under python 2.2.x + + - _setAttr was assuming that all attributes are strings. Fixed. + +2004-01-01 23:17 rsalz + + * SOAPpy/.cvsignore: added + +2003-12-23 05:19 tag SOAPpy_0_11_1 + +2003-12-23 05:19 warnes + + * CHANGELOG, RELEASE_INFO, TODO: + - Updated documentation for 0.11.1 release. + +2003-12-23 05:05 warnes + + * SOAPpy/version.py: + - Update version number for new release. + +2003-12-23 05:04 warnes + + * SOAPpy/Server.py: + - Changes suggested by Richard Au (richardau) to fix ssl support. + See bug report [ 752882 ] "SSL SOAP Server no longer working." + +2003-12-23 04:33 warnes + + * SOAPpy/SOAPBuilder.py: + - Fixed bug [ 792258 ] "SOAPBuilder.SOAPBuilder.dump can catch + wrong exceptions" in SOAPBuilder.dump() submitted by Greg + Chapman (glchapman). + + - Fixed a bug in SOAPBuilder.dump_instance() that had been masked + by bug #792258. + +2003-12-23 04:24 warnes + + * tests/SOAPtest.py: + - Add call to structType.__init__ for user classes that inherit + from structType. This fixes a bug in the tests that was masked + by incorrectly catching & ignoring exceptions in + SOAPBuilder.dump(). + +2003-12-23 04:10 warnes + + * tests/testClient1.py: + - testClient1.py now works. 
The problem was failing to set
+	  'quit' back to zero after the SOAP server exited.
+
+2003-12-23 03:22 warnes
+
+	* SOAPpy/SOAPBuilder.py:
+	- Remove call to gentag from 'dump' and add to 'dump_float', per
+	  bug report [ 792600 ] "SOAPBuilder.SOAPBuilder.dump possibly
+	  should not call gentag" by Greg Chapman (glchapman).
+
+2003-12-23 03:21 warnes
+
+	* tests/SOAPtest.py: - Add tests for handling of nil="true" and
+	  nil="false".
+
+2003-12-23 03:17 warnes
+
+	* SOAPpy/Parser.py:
+	- Correctly handle testing for nil="true" and for nil=1.
+
+2003-12-23 02:56 warnes
+
+	* SOAPpy/wstools/WSDLTools.py:
+	- Fix syntax error from last change.
+
+2003-12-23 02:42 warnes
+
+	* SOAPpy/wstools/WSDLTools.py:
+	- Added 'strict' option to the WSDL class.  If strict is true, a
+	  RuntimeException will be raised if an unrecognized message is
+	  received.  If strict is false, a warning will be printed to the
+	  console, the message type will be added to the WSDL schema, and
+	  processing will continue.  This is in response to the second half
+	  of bug report [ 817331 ] "Some WSDL.py changes", submitted by
+	  Rudolf Ruland.
+
+2003-12-23 02:21 warnes
+
+	* SOAPpy/wstools/WSDLTools.py:
+	- rename loadFromStream's 'file' argument to 'stream' to make it
+	  clear that the stream need not be a file.
+
+2003-12-23 02:11 warnes
+
+	* SOAPpy/Parser.py:
+	- Fix bug [ 858168 ] 'xsi:nil="true" causes exception', reported
+	  by Robert Zimmermann (robertzett@user.sf.net).
+
+2003-12-19 22:21 warnes
+
+	* RELEASE_INFO:
+	- Mistyped 'unwrap_results' instead of 'simplify_objects'
+
+2003-12-19 22:03  tag SOAPpy_0_11_0
+
+2003-12-19 22:03 warnes
+
+	* SOAPpy/version.py:
+	- Update version number for release.
+
+2003-12-19 17:23 warnes
+
+	* TODO:
+	- Clarify what documentation needs to be done.
+
+2003-12-19 17:20 warnes
+
+	* docs/quickstart.txt:
+	Renamed quickstart.txt to GettingStarted.txt.
+ +2003-12-19 17:19 warnes + + * CHANGELOG, LICENSE, README, RELEASE_INFO, + docs/GettingStarted.txt, docs/WSDL.txt: + Updated README to be shorter and clearer: - Moved most example to + docs/GettingStarted.txt + + - Moved WSDL example to docs/WSDL.txt + + - Moved license text to LICENSE + + Updated RELEASE_INFO + + Updated CHANGELOG + +2003-12-19 15:19 warnes + + * SOAPpy/Client.py, SOAPpy/Parser.py, SOAPpy/SOAPBuilder.py, + tests/SOAPtest.py, tests/simpleWSDL.py: + - Updated calls to fpconst - SOAPpy now checks for nonstandard + infinity encoding produced by some older SOAP tools like SOAP4J + and Apache SOAP (both now superceeded by Axis SOAP). + + - Added the offending XML string to the exception thrown when a + float/double underflow or overflow occurs. + + - Minor twiddles in the examples. + +2003-12-19 13:22 warnes + + * SOAPpy/: Client.py, Config.py, Types.py: + - Modified SOAPProxy code to correctly pull value of + unwrap_results and simplify_objects from Config unless + specifically provided in the instantiation call. + + - Modified Config to better handle configuration variables that + are only valid when pyGlobus is available. + + - Added more documentation to Config variables. + + - Better documentation for simplify() and simplify_contents() + functions. + +2003-12-19 13:16 warnes + + * tests/: README, SOAPtest.py, echoClient.py, echoHeader.py, + echoServer.py, esj_test_client.py, simpleWSDL.py: + - Add more documentation - Modify echoClient to work properly + when pyGlobus is not available. - Misc cleanup. + +2003-12-19 12:04 warnes + + * SOAPpy/Client.py, SOAPpy/Config.py, SOAPpy/Parser.py, + SOAPpy/Server.py, tests/SOAPtest.py, tests/echoClient.py, + tests/echoServer.py, tests/esj_test_client.py, + tests/esj_test_server.py: + - Restored Config.unwrap_results to its previous meaning. + + - Added Config.simplify_objects to control whether SOAPpy objects + are simplified into basic python types. 
+ + - Moved simplification code out of ParseSOAPRPC into + SOAPRequestHandler.do_POST and SOAPProxy.__call. + + - Modified test code appropriately. + +2003-12-19 10:58 warnes + + * tests/simpleWSDL.py: + - Add WSDL example from the README as simpleWSDL.py + +2003-12-18 17:46 warnes + + * tests/README: + - More information + +2003-12-18 17:42 warnes + + * tests/: README, esj_test_client.py, esj_test_server.py, + largeDataTest.py, testClient1.py: + - Update test/README with more information - Remove/comment out + verbose debugging from tests. + +2003-12-18 17:40 warnes + + * SOAPpy/: Client.py, Config.py, GSIServer.py, Parser.py, + SOAPBuilder.py, Server.py, Types.py, WSDL.py, version.py: + - More changes to enable proper return of simple python objects + when Config.unwrap_results=1. + + - Addition of support for Globus, contributed by Ivan R. Judson + <judson@mcs.anl.gov> + + - Resolution of merge conflicts due to unsynchronized copies at + home and work. + +2003-12-18 13:28 warnes + + * tests/: echoHeader.py, esj_test_client.py, esj_test_server.py, + largeDataTest.py, testClient1.py, testleak.py: + - Added more test scripts. + +2003-12-18 13:26 warnes + + * tests/: echoClient.py, echoServer.py: + - Add testing of Globus Support code - Turn off verbose debugging + messages + +2003-12-18 13:25 warnes + + * tests/SOAPtest.py: + - Missed a call to parseSOAPRPC when config.uwrap_results=0. + +2003-12-18 02:08 warnes + + * RELEASE_INFO, TODO: + + - Update documentation. + +2003-12-18 01:55 warnes + + * SOAPpy/version.py: + + - Update version string. + +2003-12-18 01:54 warnes + + * SOAPpy/: Config.py, Parser.py, SOAPBuilder.py, Server.py, + Types.py: + + - Config.unwrap_results now works properly. + + - New Config.dict_encoding option to control the encoding of + dictionary keys. By default this is 'ascii' so that + dictionaries have ordinary string keys even if they were + encoded into UTF for transport (the default is UTF-8). 
Any + other encoding known to string.encode is valid. + + - New Config.strict_range option to force Parsing to enforce + range checking on double and float variables. This is off by + default. + +2003-12-18 01:31 warnes + + * tests/: SOAPtest.py, TCtest.py, echoClient.py, echoServer.py, + quoteTest.py, storageTest.py, xmethods.py: + + - Modified SOAPtest.py to work when Config.unwrap_results=1. - + Modified SOAPtest.py to set Config.strict_range to enforce + checking of the range of float and double objects. [This is a + new Config variable and is off by default.] - Fixed wrongly + named arguments in TCtest.py - Turned off excess debugging + information in echoClient.py, echoServer.py, storageTest.py - + Removed unneeded import from quoteTest.py that was generating a + warning. - Removed unnecessary explict mapping to dictionary in + xmethods.py + +2003-12-18 01:23 warnes + + * tests/README: + + - A little documentation can go a long way, ;^) + +2003-12-17 17:18 warnes + + * SOAPpy/wstools/WSDLTools.py: + - Fix problem when WSDL.Proxy was passed WSDL in a string or + other object that does not have a 'name' attribute. + +2003-12-08 03:04 boverhof + + * SOAPpy/wstools/WSDLTools.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- partial bug fix for [ 850905 ] wsdl2py + crash on a more sophisticated WSDL. + We do expect a list, but nmtokens is a list separated + by a single space. So if + "parts" is a string, the just split it to create a + list. + + + ---------------------------------------------------------------------- + +2003-12-05 20:49 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- If a schema needs to 'create' a schema + instance + itself it adds this schema to its import + dictionary. 
+ + + ---------------------------------------------------------------------- + +2003-12-05 19:17 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- error in Import class for accessing a + schemaLocation. + This bug would only show up if a schema had to + 'construct' a + schema instance of a different namespace itself. + Basically the + last option. + + + ---------------------------------------------------------------------- + +2003-12-05 10:34 warnes + + * SOAPpy/: WSDL.py, wstools/WSDLTools.py: + - Fixes submitted by Kirk Strauser <kirk@daycos.com> for WSDL use + in SOAPpy + +2003-12-04 14:05 warnes + + * zope/: README, zope-2.5.0-soappy.diff, zope-2.6.2-soappy.diff, + zope-soap-client.py: + - Added contributed patches from Antonio Beamud Montero + <antonio.beamud@linkend.com> that extend Zope with SOAP support. + +2003-12-04 13:57 warnes + + * README, README.MethodParameterNaming, SOAPpy/Config.py, + SOAPpy/Parser.py, SOAPpy/SOAPBuilder.py, SOAPpy/Types.py, + SOAPpy/version.py: + - Mostly changes to allow returned SOAP objects to be 'unwrapped' + into plain python objects when Config.unwrap_results=1. + +2003-11-21 18:12 warnes + + * tests/SOAPtest.py: + - Fix some errors - Updated to mactch change back to "v#" unnamed + variable naming strategy. - Added config.unwrap=0 to prevent + unwrapping since this test code expects the old behavior. - + Only 1 failure now! + +2003-11-14 14:09 warnes + + * docs/MethodParameterNaming.txt: + Moved /README.MethodParameterNaming to + /docs/MethodParameterNaming.txt + +2003-11-14 13:48 warnes + + * SOAPpy/Types.py: + - Modify compoundType to remove internal _asdict and _aslist + objects which were different views of (some of) the object + attributes. It was possible for these views to get out of sync + causing (at least) confusion. 
Instead, I provide _aslist() and + _asdict() functions to render the existing data in the desired + format. + + - Modify simplify() function to work recursively on compound + types. + +2003-11-14 13:44 warnes + + * SOAPpy/Server.py: - if Config.unwrap_results is True, convert + SOAPpy arrayType and structType to python list and dictionary. + + - Modify special argument handling to use "v[0-9]" for unnamed + ordered arguments. + +2003-11-14 13:36 warnes + + * SOAPpy/SOAPBuilder.py: - Modifid to use "v[0-9]+" as pattern for + ordered but unnamed parameters instead of "_[0-9]". - Modified + dump_instance to simplify writing out object components. + +2003-11-14 13:32 warnes + + * SOAPpy/Parser.py: - Parser.py now converts arrayType and + structType SOAPpy objects to standard python lists and + dictionaries when Config.unwrap_results is True. + +2003-11-14 13:29 warnes + + * SOAPpy/Config.py: - Changed config.specialArgs back to a simple + flag. + +2003-10-31 21:49 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed an unknown bug caused by + overlooking + the potential namespace contents of a "types" node. + + + ---------------------------------------------------------------------- + +2003-10-27 17:42 boverhof + + * SOAPpy/wstools/: Utility.py, WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py + WSDLTools.py + XMLSchema.py + + -- Fixed [ 810251 ] default method broken in + wstools.Utility.Collection + + -- Also fixed problem with includes Collection key, and + changed + Types(Collection) class to declare a different default + key instead + of redefining several methods. 
+ + + ---------------------------------------------------------------------- + +2003-10-27 16:26 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py + XMLSchema.py + + -- Fixed [ 808505 ] ZSI fails with <xs:include ... /> + Now schema include statements should be handled correctly. + + -- Also fixed a problem with relative uri handling for + XMLSchema + and SchemaReader. + + + ---------------------------------------------------------------------- + +2003-10-25 22:54 dwrobertson + + * SOAPpy/wstools/test/utils.py: Fixed bug, where if multiple tests + using one diff file, and one test failed, subsequent tests failed + on assertion errors. + +2003-10-23 15:09 uid27080 + + * README: + Updated for 0.10.4 + +2003-10-23 15:00 uid27080 + + * RELEASE_INFO: Updated for 0.10.4 + +2003-10-23 14:58 uid27080 + + * CHANGELOG: - Updated for 0.10.4. + +2003-10-22 12:46 tag SOAPpy_0_10_4 + +2003-10-22 12:46 warnes + + * SOAPpy/version.py: + Release with improved performace thanks to a patch by Erik + Westra. + +2003-10-22 12:45 warnes + + * SOAPpy/Parser.py: + + Patch improving parser performance submitted by Erik Westra: + + On Tuesday, October 21, 2003, at 09:44 PM, Erik Westra wrote: + + > Hi Greg, + > + > I've been using your SOAPpy library (version 0.10.3) in an + application + > I've been developing, and have had complaints about the amount + of time + > it takes to receive large packets of data. In this + application, the + > server was sending through PDF documents as base64-encoded + strings, + > which were transmitted using a perl SOAP library, and processed + on my + > end using SOAPpy. As soon as the PDF files got reasonably + large, + > SOAPpy was taking a long time to decode the data -- up to five + minutes + > in some cases. 
+ > + > I started digging into the SOAPpy source code, and quickly + found the + > cause of the problem: the Parser.py module was using a Python + string + > to store the character data, and as new character data was + being + > received, it was "appending" the new data like this [from + Parser.py, + > line 404]: + > + > self._data += c + > + > The problem with this is that Python strings are immutable, so + the + > above statement is actually recreating the entire string buffer + from + > scratch every time a new line of character data is received -- + which + > is extremely inefficient. A much better way to do this is to + use a + > (mutable) Python list object, and to append the new character + data to + > the end of this list, like this: + > + > self._data = [] + > + > ... + > + > self._data.append(c) + > + > and then use: + > + > string.join(self._data, "") + > + > to obtain the complete copy of the list once all the lines of + data + > have been processed. I've attached a unified diff file listing + the + > changes I've made to Parser.py to implement this -- they're + pretty + > minimal, and won't affect anything other than the performance + of + > character data processing. + > + > The results of this patch are quite impressive: I tried + processing a + > SOAP response with a single string containing around 675K of + data. + > Using the unpatched Parser.py file, it took 111 seconds to + process -- + > with this patch, it took just 2.4 seconds. + > + > I hope you find this useful... + > + > Cheers, + > + > - Erik. + > + > PS: Thanks for the work you (and the others) are doing on + SOAPpy. + > It's a great library! + +2003-10-21 17:19 dwrobertson + + * SOAPpy/wstools/test/config.py: Latest xmethods. 
+ +2003-10-14 12:08 mbucc + + * SOAPpy/wstools/Utility.py: Use m2crypto for SSL if it's installed + +2003-10-08 13:07 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- now we get reasonable error message back + when + import or include is incorrectly defined. + + File + "/usr/local/lib/python2.3/site-packages/ZSI/wstools/XMLSchema.py", + line 478, in __checkAttributes + raise SchemaError, '%s, unknown attribute' %a + ZSI.wstools.XMLSchema.SchemaError: location, unknown attribute + + + ---------------------------------------------------------------------- + +2003-10-03 13:49 rsalz + + * SOAPpy/wstools/Utility.py: Let lower layers (http lib) raise + exception if trying to use SSL on a non-SSL-enabled system. + +2003-10-03 10:01 mbucc + + * SOAPpy/wstools/XMLSchema.py: Removed pyXml dependency. + +2003-10-01 18:08 dwrobertson + + * SOAPpy/wstools/test/__init__.py: For importing utils + +2003-10-01 17:47 dwrobertson + + * SOAPpy/wstools/test/test_wsdl2python.py: High level client code + generator tests moved elsewhere. + +2003-09-30 04:25 dwrobertson + + * SOAPpy/wstools/test/utils.py: Fixed premature close of string + buffer. + +2003-09-25 14:12 tag SOAPpy_0_10_3 + +2003-09-25 14:12 warnes + + * SOAPpy/version.py: - Updated to 0.10.3 (we missed a cvs tag + point) + +2003-09-25 14:09 tag SOAPpy_0_10_2 + +2003-09-25 14:09 warnes + + * SOAPpy/SOAPBuilder.py: Updated version number for release 0.10.2. + +2003-09-16 20:08 dwrobertson + + * SOAPpy/wstools/test/config.py: Updated with latest xmethods, + removed URL's no longer in xmethods. 
+ +2003-09-16 15:25 rsalz + + * SOAPpy/WSDL.py: Bug 792247: Unnecessarily slow code in + WSDL.Proxy.__getattr__ Use has_key instead of creating + temporary names() list + +2003-09-13 20:38 dwrobertson + + * SOAPpy/wstools/test/utils.py: Added ability to read values from + multiple config file sections, added setUpWsdl utility function, + cleaned up loadTestsFromNames, updated comments. + +2003-09-13 20:36 dwrobertson + + * SOAPpy/wstools/test/test_WSDLReader.py: Now using separate + MatchTestLoader in makeSuite. Improved way config file sections + are selected. + +2003-09-13 20:35 dwrobertson + + * SOAPpy/wstools/test/test_wsdl2python.py: Combined two tests + generating services and services_types files into one method. + Moved setUpWsdl to utils. Added easier choosing of config file + sections. Used separate MatchTestLoader in makeTestSuite. + +2003-09-13 20:32 dwrobertson + + * SOAPpy/wstools/test/: test_t1.py, test_wsdl.py, test_wstools.py, + test_wstools_net.py: Converted to more automated way of + generating test cases from configuration file. + +2003-09-12 02:11 dwrobertson + + * SOAPpy/wstools/test/config.py: Cleaned up names. + +2003-09-11 21:22 dwrobertson + + * SOAPpy/wstools/test/config.py: services_by_http section divided + into services where there is no schema, those with only simple + types, those with complex types, those with WSDLReader errors, + and those with wsdl2python errors. The last contain mostly those + that are raised by that module. + +2003-09-11 18:53 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + + Modified Files: + XMLSchema.py -- fixed two places where 'readFromURL' was + supposed + to be loadFromURL. + + ---------------------------------------------------------------------- + +2003-09-10 02:01 dwrobertson + + * SOAPpy/wstools/test/test_wsdl2python.py: Got rid of pyGridWare + import. 
+ +2003-09-10 02:01 dwrobertson + + * SOAPpy/wstools/test/test_WSDLReader.py: Got rid of pyGridWare + import + +2003-09-10 00:17 dwrobertson + + * SOAPpy/wstools/test/utils.py: Utilities to aid unit tests. + +2003-09-10 00:16 dwrobertson + + * SOAPpy/wstools/test/test_wsdl2python.py: Unit tests for code + generation in wsdl2python + +2003-09-10 00:15 dwrobertson + + * SOAPpy/wstools/test/test_WSDLReader.py: Unit tests for WSDLReader + from WSTools + +2003-09-10 00:14 dwrobertson + + * SOAPpy/wstools/test/config.py: Added many URL's from xmethods to + services_by_http section. + +2003-09-05 15:59 warnes + + * README: + Changed dependency list. SOAPpy does depend on fpconst, but no + longer depends on pyXML. + +2003-09-05 15:53 warnes + + * README: + - Added dependencies list + +2003-09-05 14:57 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added a try clause to catch xml.dom.ext + ImportError, + and added a SplitQName function that matches + xml.dom.ext.SplitQName + output. + + + ---------------------------------------------------------------------- + +2003-08-28 15:03 boverhof + + * SOAPpy/wstools/test/: README, config.py, test_t1.py, + test_wsdl.py, xmethods.tar.gz: + ---------------------------------------------------------------------- + Modified Files: + README config.py test_t1.py test_wsdl.py xmethods.tar.gz + + Added a couple tests and an explanation of how to add new + tests. + + ---------------------------------------------------------------------- + +2003-08-28 13:26 warnes + + * SOAPpy/Client.py: - Fixed missing import needed for basic + authentication. 
+ +2003-08-27 18:27 boverhof + + * SOAPpy/wstools/test/: README, config.py, schema.tar.gz, + test_t1.py, test_wsdl.py, test_wstools.py, test_wstools_net.py, + xmethods.tar.gz: + ---------------------------------------------------------------------- + Added Files: + README config.py schema.tar.gz test_t1.py test_wsdl.py + test_wstools.py test_wstools_net.py xmethods.tar.gz + + -- basic unittesting framework for WSDLTools/XMLSchema, + test_t1 just checks that everything goes where it's + supposed to. + + + ---------------------------------------------------------------------- + +2003-08-27 18:25 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- attribute declarations were going into + wrong collection. + + + ---------------------------------------------------------------------- + +2003-08-26 18:43 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + WSDLTools.py -- added a line in Reader to to set + WSDL.location + for files so that imports with relative paths will + work for + file paths as well as urls. + + XMLSchema.py -- a couple Attribute fixes, and the + WSDLAdapter + wasn't passing it's parent into the XMLSchemaComponent + constructor which was messing up import lookups. + + + ---------------------------------------------------------------------- + +2003-08-25 18:35 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed XMLSchemaComponent.setAttributes, + added + the needed getNamespace method to all + DOMAdapters. All + changes are related to XML attribute handling. 
+ + + ---------------------------------------------------------------------- + +2003-08-25 08:16 warnes + + * README: + - Applied patch submitted by Humberto Diógenes (virtualspirit): + + Corrected examples inside "readme + + Just eliminated some warnings ("import SOAPProxy" instead + of + "import SOAP") + +2003-08-07 00:49 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- Modified/Extended some of the element + classes. + For LocalElementDeclaration inheritance was + duplicitous, + and for ElementReference it was wrong. + + + ---------------------------------------------------------------------- + +2003-08-05 19:42 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- Fixed a few bugs, a few classes + mistakenly thought + they contained global attribute declarations and + I fixed this + to local attribute declarations. Couple spots + where + AttributeGroup declarations and references were + incorreclty + used in place of eachother. Completed a few + classes but + a few remain incomplete. + + + ---------------------------------------------------------------------- + +2003-07-31 02:37 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- Namespaced a couple attributes in + attribute + dictionary that I missed. + + + ---------------------------------------------------------------------- + +2003-07-30 15:45 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- there was a indexing logic error in + Restriction/Extention + classes fromDom method. 
Also changed the + attribute dictionary of all + classes that inherit XMLSchemaComponent, now all + attributes + are organized by namespace. + + + ---------------------------------------------------------------------- + +2003-07-25 17:46 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- + Wasn't resolving schema imports in every scenario. + Now look in parent schema imported_schemas first, + second look + in the parent wsdl, lastly try to resolve + schemaLocation. + + Removed 'what' parameter from marker interface + methods, I don't + know what it was doing there. Check self. + + + ---------------------------------------------------------------------- + +2003-07-23 20:34 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- changed getQNameAttribute to return None + if it + can't find QName obj in it's own tns or in any of + its + imported namespaces. Used to throw an exception. + + + ---------------------------------------------------------------------- + +2003-07-23 18:16 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed some default attribute handling, + added a + few get methods in XMLSchema for + elementFormDefault, + attributeFormDefault, blockDefault, finalDefault. + Also + added a global method GetSchema. Now default + attributes + are set correctly in all schema components. + + + ---------------------------------------------------------------------- + +2003-07-23 16:33 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- checking for wrong class in two methods. 
+ + + ---------------------------------------------------------------------- + +2003-07-23 14:25 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- removed bogus method setType in + SimpleType class. + + + ---------------------------------------------------------------------- + +2003-07-22 13:39 boverhof + + * SOAPpy/wstools/Utility.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- commited a mistake. fixed. + + + ---------------------------------------------------------------------- + +2003-07-22 13:34 boverhof + + * SOAPpy/wstools/: Utility.py, XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + Utility.py -- Added a parameter to Collection class + constructor, + 'name' is the default attribute used for keys but + one + can specify whatever key they want. + + XMLSchema.py -- Used the above parameter to make + Collection + instances use the appropriate 'attribute' as key. + + + ---------------------------------------------------------------------- + +2003-07-22 10:57 warnes + + * SOAPpy/: Parser.py, SOAPBuilder.py: + - More fixes to use fpconst instead of ieee754. + +2003-07-22 10:54 warnes + + * SOAPpy/: Parser.py, SOAPBuilder.py, wstools/__init__.py, + wstools/ieee754.py: + - Remove obsolete ieee754.py. PEP 754 provides a (proposed) + fpconst module which is a newer version of this code. fpconst, + will of course, need to be installed separately. + +2003-07-21 18:13 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- still a couple mistakes in constructors, + changed + XSDNS to SCHEMA.XSD_LIST which was a mistake. 
+ +2003-07-21 17:56 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- missing parent parameter to a few + constructors + that expect to see it. fixed. + +2003-07-21 15:14 boverhof + + * SOAPpy/wstools/: XMLSchema.py, license.txt: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- added LBNL copyright header. + Added Files: + license.txt -- LBNL copyright. + +2003-07-21 10:18 warnes + + * SOAPpy/: version.py, wstools/UserTuple.py, wstools/XMLSchema.py: + + - Modified XMLSchema to extend UserTuple instead of tuple for + python < 2.2. + + - Added UserTuple class, taken from from Stefan Schwarzer's + ftputil library, which is available at + <http://www.ndh.net/home/sschwarzer/python/python_software.html>. + +2003-07-21 09:15 warnes + + * SOAPpy/Utilities.py: + - Unecesssary import was creating a circular import loop. + +2003-07-18 13:36 tyger23 + + * SOAPpy/wstools/XMLSchema.py: fixed a naming issue + +2003-07-18 11:58 warnes + + * SOAPpy/URLopener.py, SOAPpy/WSDL.py, tests/BabelfishWSDLTest.py: + - Modifed WSDL.Proxy to pass along all arguments to SOAPProxy. + This should ensure that all features of SOAPProxy are + accessible to users of WSDL.Proxy + + - Created URLopener.py, which contains a class extending + urllib.FancyURLopener. This class allows reading from URLs that + are protected by basic authenticatoin, have been relocated, etc. + + - Modified WSDL.Proxy to use URLopener. It should now permit + access to WSDL files protected by basic authentication. + +2003-07-18 10:13 warnes + + * SOAPpy/Client.py: - Trivial formatting change + +2003-07-17 18:23 boverhof + + * SOAPpy/wstools/XMLSchema.py: + ---------------------------------------------------------------------- + Modified Files: + XMLSchema.py -- fixed a couple bad variable references. 
+ +2003-07-17 17:48 boverhof + + * SOAPpy/wstools/: WSDLTools.py, XMLSchema.py: Modified Files: + WSDLTools.py -- just a few minor changes so the new + schema class + will be used instead of the schema tns + placeholder. + + Might want to add an argument to WSDL.load method + so that + programmer can specify the placeholder or actual + implementation. + + XMLSchema.py -- mostly new, unused original code is + commented out at the bottom. + +2003-07-02 14:58 warnes + + * SOAPpy/: Client.py, version.py: + - Client.py failed to import faultType from Types.py, and was + getting the python default instead. This caused problems in + properly detecting errors on the server reported via SOAP. + +2003-05-29 17:01 warnes + + * SOAPpy/WSDL.py: + - Add additional arguments to __init__ which will be passed to + Proxy.__init__. This allows specification of proxy server and + other options. + +2003-05-22 22:31 feanor420 + + * SOAPpy/wstools/Utility.py: Running pychecker over ZSI, and I + noticed some problems in wstools. + + I fixed that fact that Notation and Entity were not be found. I + changed them to use the qualified name like the rest of the + symbols from xml.dom.minidom. + + I also discovered that a RecursionError was being thrown, but + RecursionError didn't exist. I created simple sub-class of + Exception to rectify this. + +2003-05-21 13:39 warnes + + * SOAPpy/Client.py: + - Modified getNS pattern to prevent grabbing to much text. + +2003-05-21 12:06 blunck2 + + * SOAPpy/Client.py: changed namespace regular expression so that it + matches what is returned from a stateful (*shiver*) soap server. + for example, the namespace returned from a stateful soap server + looks like: urn:echo@rO0ABXNyACJ3ZWJsb2.... where urn:echo was + the original namespace. + +2003-05-21 11:33 tag SOAPpy_0_10_1 + +2003-05-21 11:33 warnes + + * CHANGELOG, RELEASE_INFO: - Updated CHANGELOG and RELEASE_INFO for + 0.10.1 release. 
+ +2003-05-21 10:52 warnes + + * tests/: SOAPtest.py, TCtest.py, alanbushTest.py, echoClient.py, + echoServer.py, excelTest.py, newsTest.py, quoteTest.py, + speedTest.py, storageTest.py, translateTest.py, weatherTest.py, + whoisTest.py, xmethods.py: + - Add ".." to python module path so that the local SOAPpy code + will be used instead of the globally installed code when + running tests. + +2003-05-21 10:51 warnes + + * setup.py: + - Update setup.py to get version string from + SOAPpy/version.__version__. + +2003-05-21 10:37 warnes + + * SOAPpy/version.py: - I forgot to update the version number + associated with the addition of the file version.py. + +2003-05-21 10:34 warnes + + * SOAPpy/: Client.py, Errors.py, Server.py, version.py: + - Added file 'version.py' whose sole purpose is to hold the + definition of __version__ in a single place. - Modified + Server.py and Client.py to 'from version import __version__'. - + Removed __version__ definition from Error.py, which never used + it. + +2003-05-20 17:25 tag SOAPpy_0_10_0 + +2003-05-20 17:25 warnes + + * RELEASE_INFO: Updated for release 0.10.0. + +2003-05-20 17:10 warnes + + * SOAPpy/wstools/: TimeoutSocket.py, Utility.py, WSDLTools.py, + XMLSchema.py, XMLname.py, __init__.py: + - Added ident string containing CVS version to all files that + were lacking this. + +2003-05-20 17:04 warnes + + * CHANGELOG, TODO, setup.py, SOAPpy/SOAP.py, SOAPpy/Types.py, + SOAPpy/WSDL.py, SOAPpy/__init__.py: + - Added ident string containing CVS version to all files that + were lacking this. + +2003-05-20 16:08 warnes + + * SOAPpy/Client.py: + - Fix bug in getNS that caused loss of namespace by using better + pattern matching to find the namespace in the SOAP message. + +2003-05-20 08:47 warnes + + * setup.py: + - Removed or changed dashes to underscores in version numbers to + make RPM happy. 
+ +2003-05-19 13:45 warnes + + * SOAPpy/Server.py: - Added ThreadingSOAPServer which inherits from + ThreadingTCPServer server so that muliple clients will be + automatically multiplexed. + +2003-05-15 20:31 boverhof + + * SOAPpy/wstools/XMLSchema.py: Modified Files: + XMLSchema.py + + + ---------------------------------------------------------------------- + fixed an obvious bug, added a SchemaError class so it can + actually + be thrown. + + + ---------------------------------------------------------------------- + +2003-05-13 20:22 blunck2 + + * SOAPpy/wstools/WSDLTools.py: changed references to classes that + exist within this module. + +2003-05-09 08:46 warnes + + * README, TODO, setup.py, SOAPpy/Client.py, SOAPpy/Config.py, + SOAPpy/Errors.py, SOAPpy/NS.py, SOAPpy/Parser.py, SOAPpy/SOAP.py, + SOAPpy/SOAPBuilder.py, SOAPpy/Server.py, SOAPpy/Types.py, + SOAPpy/Utilities.py, SOAPpy/WSDL.py, SOAPpy/__init__.py, + tests/SOAPtest.py, tests/TCtest.py, tests/alanbushTest.py, + tests/cardClient.py, tests/cardServer.py, tests/echoClient.py, + tests/echoServer.py, tests/excelTest.py, tests/fortuneTest.py, + tests/guidTest.py, tests/itimeTest.py, tests/newsTest.py, + tests/quoteTest.py, tests/quoteTest1.py, tests/quoteTest2.py, + tests/speedTest.py, tests/storageTest.py, tests/testWSDL.py, + tests/translateTest.py, tests/weatherTest.py, tests/whoisTest.py, + tests/wordFindTest.py, tests/xmethods.py: + - Merge changes splitting SOAP.py file into 10 separate files. + This should make the source much easier to navigate. 
+ +2003-05-09 03:17 warnes + + * setup.py, SOAPpy/Client.py, SOAPpy/Parser.py, SOAPpy/SOAP.py, + SOAPpy/Server.py, SOAPpy/Utilities.py, SOAPpy/WSDL.py, + SOAPpy/__init__.py, tests/SOAPtest.py, tests/TCtest.py, + tests/alanbushTest.py, tests/cardClient.py, tests/cardServer.py, + tests/echoClient.py, tests/echoServer.py, tests/excelTest.py, + tests/newsTest.py, tests/quoteTest.py, tests/speedTest.py, + tests/storageTest.py, tests/testWSDL.py, tests/translateTest.py, + tests/weatherTest.py: - Many changes associated with splitting + SOAP.py into separate files. - Added Mark Bucciarelli's + <mark@hubcapconsulting.com> patch to provide wsdl code on + properly structured .GET requests + +2003-05-09 02:41 warnes + + * tests/translateTest.py: + - Added code to check for http_proxy environment variable and 'do + the right thing' (tm). + +2003-05-09 02:39 warnes + + * tests/whoisTest.py: + - Updated to use whois SOAP service provided by + www.SoapClient.com + +2003-05-09 02:23 warnes + + * tests/wordFindTest.py: + - Service no longer exists. + +2003-05-09 02:16 warnes + + * tests/: quoteTest1.py, quoteTest2.py: + - Service no longer exists. + +2003-05-09 02:13 warnes + + * tests/xmethods.py: + - Added test out to xmethods.net, which looks like a stable site + with lots of useful SOAP/WSDL/... stuff. + +2003-05-09 02:13 warnes + + * tests/itimeTest.py: + - Service no longer exists. + +2003-05-08 23:44 warnes + + * tests/guidTest.py: + - The target SOAP server no longer exists. + +2003-05-08 23:14 warnes + + * tests/fortuneTest.py: + - The target server no longer exists. Delete test. + +2003-05-08 17:32 warnes + + * TODO: + - Add TODO file. 
+ +2003-05-08 17:29 warnes + + * README, setup.py, SOAPpy/Client.py, SOAPpy/Config.py, + SOAPpy/Errors.py, SOAPpy/NS.py, SOAPpy/Parser.py, SOAPpy/SOAP.py, + SOAPpy/SOAPBuilder.py, SOAPpy/Server.py, SOAPpy/Types.py, + SOAPpy/Utilities.py, SOAPpy/__init__.py, tests/cardClient.py, + tests/excelTest.py, tests/testWSDL.py: + - Split up the monolithic SOAPpy/SOAP.py into separate files. + This should make SOAPpy easier to maintain. + + - Other incidental changes.. + +2003-05-08 13:26 rsalz + + * SOAPpy/: WSDL.py, wstools/ServiceProxy.py, wstools/__init__.py: + Finish up what Mark Bucciarelli kicked off and I started with + commit a little while ago. :) That is, wstools is now + independant of SOAPpy and ZSI. This commit does the following: + wstools/ServiceProxy.py is now ZSI/ServiceProxy.py, so some + imports and ZSI docs had to change. ZSI needs some changing, in + case I didn't patch up all the imports right. + +2003-05-08 12:58 rsalz + + * SOAPpy/wstools/: ServiceProxy.py, WSDLTools.py: Move some stuff + from ServiceProxy (which imports ZSI) to WSDLTools (which + doesn't), so that SOAPpy can use wstools without needing ZSI + around... which is kinda the point of generic common-code. :) + + class SOAPCallInfo: + class ParameterInfo: + class HeaderInfo(ParameterInfo): + def callInfoFromWSDL(port, name): + Next step is to move what's left of wstools/ServiceProxy.py into + the ZSI module (and fix up the imports), so that wstools has *no* + soap-stack-specific code in it. + +2003-05-07 17:07 warnes + + * SOAPpy/SOAP.py: + - Fixed XML parse error memory leak fix to still raise the + error... + +2003-05-07 12:50 warnes + + * SOAPpy/SOAP.py: + - Applied patch by bstpierre, which he suggested to fix memory + leaks in bug report 544572 (see + http://sourceforge.net/tracker/index.php?func=detail&aid=544572&group_id=26590&atid=387667). + The leaks seem to have been corrected by other patches, but + the suggested code is cleaner, so I've applied it anyway. 
+ +2003-05-07 11:34 warnes + + * SOAPpy/SOAP.py: + - Applied patch by Mark Bucciarelli to fix memory leak when the + SAX parser throws an exception. + +2003-05-07 10:39 warnes + + * SOAPpy/SOAP.py: + - Commit memory leak fix patch submitted by Jeremy Fincher + (jemfinch). + +2003-04-30 15:38 warnes + + * SOAPpy/SOAP.py: - Fixed display of exception when an internal + error happens. + +2003-04-29 10:53 rsalz + + * SOAPpy/wstools/: ServiceProxy.py, Utility.py: Remove DOS + line-ending ^M chars + +2003-04-28 10:59 rsalz + + * SOAPpy/wstools/.cvsignore: Move .cvsignore from ZSI/wsdl to + wstools + +2003-04-28 09:57 tag SOAPpy_0_9_9_pre5 + +2003-04-28 09:57 warnes + + * SOAPpy/SOAP.py: + - Updated version number + +2003-04-28 09:56 warnes + + * CHANGELOG, MANIFEST.in, README, SOAPpy/WSDL.py, + SOAPpy/__init__.py, tests/TemperatureService.wsdl, + tests/testWSDL.py: - Added client support for WSDL, ported from + ZSI by Mark Bucciarelli <mark@hubcapconsulting.com> + +2003-04-24 22:50 warnes + + * setup.py, SOAPpy/SOAP.py, SOAPpy/__init__.py: - More changes + associated with moving ZSI/SOAPpy common code into wstools CVS + package. + +2003-04-24 19:08 warnes + + * setup.py, SOAPpy/SOAP.py, SOAPpy/XMLname.py, SOAPpy/__init__.py, + SOAPpy/ieee754.py: + - Moved XMLname.py and ieee754 to the wstools CVS package. - + Modified SOAPpy to include these files from thier new location. + +2003-04-24 13:45 warnes + + * SOAPpy/wstools/: XMLname.py, __init__.py, ieee754.py: + - Moved XMLname.py and ieee754.py into the wstools CVS package + from SOAPpy/SOAPpy. + +2003-04-24 13:03 rsalz + + * SOAPpy/wstools/: ServiceProxy.py, TimeoutSocket.py, Utility.py, + WSDLTools.py, XMLSchema.py, ZPL, __init__.py: Import files from + (now outdated) ZSI/wsdl directory + +2003-03-27 11:36 warnes + + * CHANGELOG, SOAPpy/SOAP.py: + Updated version to 0.9.9-pre3 and added reason to changelog. 
+ +2003-03-27 11:22 warnes + + * SOAPpy/SOAP.py: + - Only define SOAPUnixSocketServer if the Unix domain sockets are + supported + +2003-03-27 08:10 tag REL_0_9_9_pre2 + +2003-03-27 08:10 warnes + + * CHANGELOG: + - Added named scope change. + +2003-03-27 08:07 warnes + + * SOAPpy/SOAP.py: + - New argument handling codes needs nested scopes. + +2003-03-27 07:32 warnes + + * CHANGELOG, README, RELEASE_INFO: + - Updated text files for 0.9.9-pre2 release. + +2003-03-26 16:12 warnes + + * SOAPpy/SOAP.py: - Update version number to 0.9.9-pre2 + +2003-03-26 12:55 warnes + + * SOAPpy/__init__.py: + - Added import of ieee754. + +2003-03-26 12:54 warnes + + * SOAPpy/ieee754.py: + - Fixed type in __doc__ text. + +2003-03-26 11:29 warnes + + * SOAPpy/SOAP.py: - Split class SOAPServer into SOAPServerBase and + two sublcasses, SOAPServer and SOAPUnixSocketServer. + SOAPServer has the same functionality as before, while + SOAPUnixSocketServer connects over a Unix domain socket instead + of to a (public) TCP/IP port. + +2003-03-26 00:02 tag REL_0_9_9_pre1 + +2003-03-26 00:02 warnes + + * CHANGELOG: - Updated to note addition of ieee754 module and + changes enablein MS-Windows support + +2003-03-25 23:51 warnes + + * SOAPpy/: SOAP.py, ieee754.py: + - Added ieee754.py, which handles checking for IEEE 754 special + values: Inf, -Inf, NaN, ... - Updated SOAP.py to use the new + ieee754 module instead of the old (broken) windows hack. + +2003-03-25 15:53 warnes + + * SOAPpy/SOAP.py: - Reversed version string to 0.9.9-pre1. + +2003-03-25 15:45 warnes + + * CHANGELOG, README.MethodParameterNaming, SOAPpy/SOAP.py: + - specialArgs handling is now enabled by default. + +2003-03-25 15:26 warnes + + * setup.py: + - Modified setup.py to get version number directly from + SOAPpy/SOAP.py's __version__ variable. + +2003-03-25 12:53 warnes + + * SOAPpy/SOAP.py: - Changed all references from actzero.com to + pywebsvcs.sf.net. 
+ +2003-03-25 12:02 warnes + + * SOAPpy/SOAP.py: + - Unnamed arguments which were lists were being incorrectly given + the name 'Result'. + +2003-03-12 03:14 tag REL_0_9_8 + +2003-03-12 03:14 warnes + + * MANIFEST.in: + - Added MANIFEST.in: needed by setup.py to create source + distribution. + +2003-03-12 02:53 warnes + + * tests/: SOAPtest.py, TCtest.py, echoClient.py, echoServer.py, + excelTest.py, speedTest.py: + - Updates related to change in structure to allow installation + using python distutils (i.e. setup.py) + +2003-03-12 02:47 warnes + + * setup.py: + - Updated version number to 0.9.8 + +2003-03-12 02:38 warnes + + * CHANGELOG: + - Noted directory restructuring in CHANGELOG. + +2003-03-08 00:10 warnes + + * CHANGELOG, README, setup.py, SOAPpy/__init__.py, + bid/inventoryClient.py, bid/inventoryServer.py, + bid/monitorClient.py, contrib/soap_cli.py, + contrib/soap_handler.py, tests/alanbushTest.py, + tests/cardClient.py, tests/cardServer.py, tests/fortuneTest.py, + tests/guidTest.py, tests/itimeTest.py, tests/newsTest.py, + tests/quoteTest.py, tests/quoteTest1.py, tests/quoteTest2.py, + tests/storageTest.py, tests/translateTest.py, + tests/weatherTest.py, tests/whoisTest.py, tests/wordFindTest.py, + validate/silabclient.py, validate/silabserver.py, + validate/soapware.py: + - Updates related to change in structure to allow installation + using python distutils (i.e. setup.py) + +2003-03-08 00:07 warnes + + * SOAPpy/SOAP.py: + - implemented an experimental method of handling method argument + names. + +2003-03-08 00:00 warnes + + * README.MethodParameterNaming: + - Fixed typos, improved wording and formatting. + +2003-03-05 16:43 warnes + + * setup.py: - Initial version of setup.py. Not yet tested! + +2003-02-10 12:06 rsalz + + * SOAPpy.spec: Add RPM spec file from Antonio Beamud Montero + (http://www.agoratechnologies.com). Temporary fix until a + setup.py file is written. 
+ +2002-08-06 14:26 tag Release_1_0_0_beta3 + +2002-08-06 14:26 blunck2 + + * SOAPpy/SOAP.py: - Changed invoke method in SOAPProxy class to + return the value from the __call invocation (there was previously + no way to extract the return values from the call) + +2002-07-30 22:28 blunck2 + + * SOAPpy/SOAP.py: HTTPTransport.call(..) returns the response + message from the HTTP request regardless of the value of + config.dumpSOAPIn. I removed the conditional logic around the + fetching of the response message so that prior to the call to + getNS, the data is guaranteed to be there. + +2002-07-30 20:30 warnes + + * CHANGELOG, README, SOAPpy/SOAP.py: - Added 'no namespace' check + to namespace-rewriting code to avoid problems when no namespace + is specified. + + - Updated CHANGELOG and README + + - Added noroot parameter to the SOAPBuilder and SOAPProxy objects + in order to provide compatibility with an older version of + EasySOAP (v0.2) that balked if the SOAP-ENC:root parameter was + included.(Brad Knotwell) + +2002-07-25 17:38 blunck2 + + * SOAPpy/SOAP.py: - Added support for namespace-rewriting (used by + Apache v2.x SOAP server for error conditions as well as + stateful communication) - Added string <-> str conversion for + array types (Python 2.2+) - Added convenience method (invoke) to + SOAPProxy that calls __call (not sure if it is necessary - feel + free to remove if you want) + +2002-07-25 15:43 warnes + + * SOAPpy/SOAP.py: + - Python 'float' are equivalent to SOAP 'double'. Modified + dump_float and dump_list to use SOAP type string 'double' + appropriately. + +2002-06-27 15:44 tag Release_0_1_b2 + +2002-06-27 15:44 tag Release_0_1b2 + +2002-06-27 15:44 warnes + + * SOAPpy/SOAP.py: + - Patch from Brad Knotwell [b.knotwell@f5.com] to add basic + authentication: + + Hi Gregory-- + + This is untested (except for running some of the example + programs + to ensure it didn't break anything). However, it's trivial + enough (and copied almost verbatim from ZSI. . 
.I helped + Rich + with Authorization there as well) that I would be pretty + confident about committing it. My primary assumption in + saying + this is that the Authorization header can show up *anywhere* + in + the header stream and that I've inserted the putheader in + the + right method call. + + --Brad + +2002-05-24 17:38 warnes + + * SOAPpy/SOAP.py: + Fixes to enble proper handling of SOAP faults by the client. + + - Fixed test of whether message content is text/xml when + recieving a fault. - Added __call__ method to exception classes + to match the current API. - The faultType.__repr__() method now + print details if present + +2002-05-10 10:56 warnes + + * SOAPpy/: SOAP.py, XMLname.py, __init__.py: + - Added XMLnam.py which provides toXMLname() and fromXMLname() + for properly encoding xml tag names per the SOAP 2.1 (draft) + specification. + + - Added calls to toXMLname() and fromXMLname() so that tags names + are properly encoded. This resolves bug [ 548785 ] 'Error + passing dict keys containing space.' + + - Added code to cgi encode contents of tags when they are not a + recognized type. Fixes bug [ 549551 ] 'Error when passing + non-standard types'. + + - Added __init__.py, so that SOAPpy can be used like a standard + python module. 
+ +2002-02-26 22:19 gliptak + + * SOAPpy/SOAP.py, tests/SOAPtest.py: Use array for string concat + when building messages + +2002-02-26 21:33 gliptak + + * SOAPpy/SOAP.py, tests/SOAPtest.py: Correcting arrayType struct + typo + +2002-02-26 20:14 gliptak + + * tests/quoteTest2.py: Another quote test using mybubble.com + +2002-02-26 20:13 gliptak + + * tests/SOAPtest.py: Added test for parameter ordering + +2002-02-26 20:11 gliptak + + * SOAPpy/SOAP.py: Support for explicit parameter ordering + +2002-02-25 22:34 gliptak + + * tests/translateTest.py: Correcting URL and URN + +2002-02-25 22:25 gliptak + + * tests/guidTest.py: Correcting URL + +2002-02-25 22:17 gliptak + + * tests/alanbushTest.py: Correct URI and list categories + +2002-02-25 22:06 gliptak + + * tests/SOAPtest.py: Modified to use PyUnit + +2002-02-25 16:47 gliptak + + * tests/SOAPtest.py: Do not fail for large double parsing for + Python 2.2 + +2002-02-25 10:57 gliptak + + * SOAPpy/SOAP.py: Fixing abs(None) traceback + +2002-02-24 21:50 gliptak + + * tests/quoteTest1.py: Another quote service test + +2002-02-24 21:48 gliptak + + * tests/wordFindTest.py: Corrected import path + +2002-02-24 21:46 gliptak + + * SOAPpy/SOAP.py: Aliases for Python 2.2 (lib\types.py definitions + changed) + +2001-11-05 14:19 tag REL_0_9_9_pre5 + +2001-11-05 14:19 tag v1_2RC4 + +2001-11-05 14:19 tag v1_2RC5 + +2001-11-05 14:19 tag v1_2RC6 + +2001-11-05 14:19 rsalz + + * .cvsignore, bid/.cvsignore, contrib/.cvsignore, tests/.cvsignore, + tools/.cvsignore, validate/.cvsignore: add .cvsignore + +2001-07-06 14:03 tag v1_2RC1 + +2001-07-06 14:03 tag v1_2RC2 + +2001-07-06 14:03 tag v1_2RC3 + +2001-07-06 14:03 cullman + + * SOAPpy/SOAP.py: Fixed the memory leak. + +2001-06-28 16:13 cullman + + * SOAPpy/SOAP.py: Fixed the 500 return code is always a SOAP + response "issue". + +2001-06-27 18:33 tag REL_0_9_6 + +2001-06-27 18:33 cullman + + * CHANGELOG: More changelog changes. 
+ +2001-06-27 18:30 cullman + + * contrib/soap_handler.py: Adding the contributed soap_handler. + +2001-06-27 18:29 cullman + + * contrib/soap_cli.py: Added the medusa example files contributed + by Ng. + +2001-06-27 18:13 cullman + + * CHANGELOG: Added a description of the latest release. + +2001-06-27 17:36 tag start + +2001-06-27 17:36 cullman + + * CHANGELOG, README, SOAPpy/SOAP.py, bid/inventory.servers, + bid/inventoryClient.py, bid/inventoryServer.py, + bid/monitorClient.py, docs/quickstart.txt, docs/simpleTypes.txt, + tests/SOAPtest.py, tests/TCtest.py, tests/echoClient.py, + tests/echoServer.py, tests/excelTest.py, tests/speedTest.py, + docs/attrs.txt, docs/complexTypes.txt, tests/alanbushTest.py, + tests/cardClient.py, tests/cardServer.py, tests/fortuneTest.py, + tests/guidTest.py, tests/itimeTest.py, tests/newsTest.py, + tests/quoteTest.py, tests/storageTest.py, tests/translateTest.py, + tests/weatherTest.py, tests/whoisTest.py, tests/wordFindTest.py, + tools/interop2html.py, validate/server.pem, + validate/silab.servers, validate/silabclient.py, + validate/silabserver.py, validate/soapware.py: Initial SOAP.py + check in. 
+ +2001-06-27 17:36 cullman + + * CHANGELOG, README, SOAPpy/SOAP.py, bid/inventory.servers, + bid/inventoryClient.py, bid/inventoryServer.py, + bid/monitorClient.py, docs/quickstart.txt, docs/simpleTypes.txt, + tests/SOAPtest.py, tests/TCtest.py, tests/echoClient.py, + tests/echoServer.py, tests/excelTest.py, tests/speedTest.py, + docs/attrs.txt, docs/complexTypes.txt, tests/alanbushTest.py, + tests/cardClient.py, tests/cardServer.py, tests/fortuneTest.py, + tests/guidTest.py, tests/itimeTest.py, tests/newsTest.py, + tests/quoteTest.py, tests/storageTest.py, tests/translateTest.py, + tests/weatherTest.py, tests/whoisTest.py, tests/wordFindTest.py, + tools/interop2html.py, validate/server.pem, + validate/silab.servers, validate/silabclient.py, + validate/silabserver.py, validate/soapware.py: Initial revision + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/LICENSE b/LTA/LTAIngest/SOAPpy-0.12.0/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..61ad0e893b338f153f41fc45b012ba58845926bd --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/LICENSE @@ -0,0 +1,56 @@ +============================================== +SOAPpy - Simple to use SOAP library for Python +============================================== + +Current Maintainers: + + Gregory R. Warnes <Gregory.R.Warnes@Pfizer.com> + Christopher Blunck <blunck2@gst.com> + +Original Authors: + + Cayce Ullman <c_ullman@yahoo.com> + Brian Matthews <blm@blmatthews.com> + +Contributions by: + + Brad Knotwell <b.knotwell@f5.com> + Mark Bucciarelli <mark@hubcapconsulting.com> (ported WSDL + client from ZSI) + Ivan R. Judson <judson@mcs.anl.gov> (Globus support) + Kirk Strauser <kirk@daycos.com> + Antonio Beamud Montero <antonio.beamud@linkend.com> (patches + for integrating SOAPpy into Zope) + +Copyright (c) 2002-2003, Pfizer, Inc. +Copyright (c) 2001, Cayce Ullman. +Copyright (c) 2001, Brian Matthews. +All rights reserved. 
+ +LICENSE: +---------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +Neither the name of actzero, inc. nor the names of its contributors may +be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. 
+ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/PKG-INFO b/LTA/LTAIngest/SOAPpy-0.12.0/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..d37a906d66a7151b4c163b29de8047a8d817d94a --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: SOAPpy +Version: 0.12.0 +Summary: SOAP Services for Python +Home-page: http://pywebsvcs.sf.net/ +Author: Gregory Warnes +Author-email: Gregory.R.Warnes@Pfizer.com +License: UNKNOWN +Description: SOAPpy provides tools for building SOAP clients and servers. For more information see http://pywebsvcs.sf.net/ +Platform: UNKNOWN diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/README b/LTA/LTAIngest/SOAPpy-0.12.0/README new file mode 100644 index 0000000000000000000000000000000000000000..c6d469bd7e409943cffaf758657081ad9259ef36 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/README @@ -0,0 +1,237 @@ +============================================== +SOAPpy - Simple to use SOAP library for Python +============================================== + +Current Maintainer: + + Gregory R. Warnes <Gregory.R.Warnes@Pfizer.com> + +Original Authors: + + Cayce Ullman <c_ullman@yahoo.com> + Brian Matthews <blm@blmatthews.com> + +Contributions by: + + Christopher Blunck <blunck2@gst.com> + Brad Knotwell <b.knotwell@f5.com> + Mark Bucciarelli <mark@hubcapconsulting.com> (ported WSDL + client from ZSI) + Ivan R. Judson <judson@mcs.anl.gov> (Globus support) + Kirk Strauser <kirk@daycos.com> + Antonio Beamud Montero <antonio.beamud@linkend.com> (patches + for integrating SOAPpy into Zope) + And others. + +Copyright (c) 2002-2005, Pfizer, Inc. +Copyright (c) 2001, Cayce Ullman. +Copyright (c) 2001, Brian Matthews. +All rights reserved, see the file LICENSE for conditions of use. + +INTRODUCTION +============ + + The goal of the SOAPpy team is to provide a full-featured SOAP library + for Python that is very simple to use and that fully supports dynamic + interaction between clients and servers. 
+ + INCLUDED + -------- + + - General SOAP Parser based on sax.xml + - General SOAP Builder + - SOAP Proxy for RPC client code + - SOAP Server framework for RPC server code + + FEATURES + -------- + + - Handles all SOAP 1.0 types + - Handles faults + - Allows namespace specification + - Allows SOAPAction specification + - Homogeneous typed arrays + - Supports multiple schemas + - Header support (mustUnderstand and actor) + - XML attribute support + - Multi-referencing support (Parser/Builder) + - Understands SOAP-ENC:root attribute + - Good interop, passes all client tests for Frontier, SOAP::LITE, SOAPRMI + - Encodings + - SSL clients (with Python compiled with OpenSSL support) + - SSL servers (with Python compiled with OpenSSL support and M2Crypto + installed) + - Encodes XML tags per SOAP 1.2 name mangling specification (Gregory Warnes) + - Automatic stateful SOAP server support (Apache v2.x) (blunck2) + - WSDL client support + - WSDL server support + + TODO (See RELEASE_INFO and CHANGELOG for recent changes) + ---- + + - Timeout on method calls + - Advanced arrays (sparse, multidimensional and partial) + - Attachments + - mod_python example + - medusa example + - Improved documentation + + MANIFEST + -------- + + Files + + + README This file + RELEASE_NOTES General information about each release + ChangeLog Detailed list of changes + TODO List of tasks that need to be done + + setup.py Python installation control files + MANIFEST + MANIFEST.in + + SOAPpy.spec* RPM package control file + + Directories + + SOAPpy/* Source code for the package + SOAPpy/wstools/* Source code for WSDL tools + tests/* unit tests and examples + validate/* interop client and servers + bid/* N+I interop client and server + doc/* Documentation + contrib/ Contributed examples (also see test/) + docs/ Documentation + tools/ Misc tools useful for the SOAPpy developers + zope/ Patches to Zope allowing it to provide SOAP services + + +INSTALLATION +============ + + REQUIRED PACKAGES: + 
----------------- + + - fpconst 0.6.0 or later, + <http://research.warnes.net/projects/rzope/fpconst/> + + - pyXML 0.8.3 or later, <http://pyxml.sourceforge.net> + + OPTIONAL PACKAGES + ----------------- + + - pyGlobus, optional support for Globus, + <http://www-itg.lbl.gov/gtg/projects/pyGlobus/> + + - M2Crypto.SSL, optional support for server-side SSL + <http://sandbox.rulemaker.net/ngps/m2/> + + - If Python is compiled with SSL support (Python 2.3 does so by + default), client-side use of SSL is supported + + INSTALLATION STEPS + ------------------ + + As of version 0.9.8 SOAPpy can be installed using the standard python + package installation tools. + + To install: + + 1) Unpack the distribution package: + + On Windows, use your favorite zip file uncompression tool. + + On Unix: + + $ tar -xvzf SOAPpy-$VERSION$.tar.gz + + if you have gnu tar, otherwise + + $ gzcat SOAPpy-$VERSION$.tar.gz | tar -xvf - + + 2) Change into the source directory + + $ cd SOAPpy-$VERSION$ + + 3) Compile the package + + $ python setup.py build + + 4) Install the package + + On Windows: + + $ python setup.py install + + On Unix install as the owner of the python directories + (usally root): + + $ su root + Password: XXXXXX + $ python setup.py install + + +DOCUMENTATION +============= + + QUICK START + ----------- + + A simple "Hello World" http SOAP server: + + import SOAPpy + def hello(): + return "Hello World" + + server = SOAPpy.SOAPServer(("localhost", 8080)) + server.registerFunction(hello) + server.serve_forever() + + And the corresponding client: + + import SOAPpy + server = SOAPpy.SOAPProxy("http://localhost:8080/") + print server.hello() + + BASIC TUTORIAL + -------------- + + Mark Pilgrims' _Dive Into Python_, published in printed form by + Apress and online at at http://diveintopython.org provides a + nice tutorial for SOAPpy in Chapter 12, "SOAP Web Services". + See http://diveintopython.org/soap_web_services . 
+ + OTHER DOCUMENTATION + ------------------- + + For further information see the files in the docs/ directory. + + Note that documentation is one of SOAPpy's current weak points. + Please help us out! + + +GETTING HELP +============ + + REPORTING BUGS + -------------- + + Please submit bug reports, feature requests, patches, etc at the + Python Web Services web site: http://pywebsvcs.sourceforge.net. + + MAILING LIST + ============ + + Please address questions and general discussion to the + pywebsvcs-talk mailing list, pywebsvcs-talk@lists.sourceforge.net. + + For subscription information visit + http://lists.sourceforge.net/lists/listinfo/pywebsvcs-talk. + List archives are available at + http://sourceforge.net/mailarchive/forum.php?forum=pywebsvcs-talk + + Please remember that the authors do have day jobs, so please try + the mailing list before contacting them directy. + +$Id: README,v 1.1 2005/05/13 08:20:39 renting Exp $ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/RELEASE_INFO b/LTA/LTAIngest/SOAPpy-0.12.0/RELEASE_INFO new file mode 100644 index 0000000000000000000000000000000000000000..3775af668576df21dc89c3fdfd3f07787016a178 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/RELEASE_INFO @@ -0,0 +1,590 @@ + +Release 0.12.0 of SOAPpy +------------------------ + +This release primarily foces on bug fixes. Primary changes: + +- Fixes for bug reports that have accumulated over the last year + + [ 916265] "Arrays of unicode do not serialize correctly (patch included)" + [ 918216] "Parsing faults in SOAPpy 0.11.3" + [ 925077] "SOAPpy prints out SOAP fault" (even when Config.debug is off). + [1001646] "SOAPpy stomps headers when sending multirefs" + [1001646] "SOAPpy stomps headers when sending multirefs. 
+ [1064233] "Bug fixes for complex types" + [1064248] "Bugs in _asdict() and _asarray() in Types.py" + [1078051] "Arrays of complex types (doc/lit)" + [1096971] "Parse error: missing HTTP header 'Content-length'" + [1106450] "Floats are truncated to 10 digits, causing precision loss" + [1122991] "error from SOAPpy/Client.py for content_length evaluation?" + +- Fixes for 'rules' which allow control of the data types of *incoming* messages. + As a consequence TCtest.py now passes all tests. + +- WSDL support has been improving, due to work on the 'wstools' + module which is shared between ZSI and SOAPpy. + +- Some work has been done to improve documentation. + + +Release 0.11.6 of SOAPpy +------------------------ + +Changes to URLs and email addresses in documentation. + + +Release 0.11.5 of SOAPpy +------------------------ + +- Bug fixes + + - Fix string format error in fault handling + + +Release 0.11.4 of SOAPpy +------------------------ + +- Bug fixes: + + - SOAPpy/Server.py: Check if header information contains SOAPAction + key before checking its value. + + - Fixes for generating SOAP from complexType arrays, contributed by + antonio.beamud@linkend.com + + - Fixed bug that caused typedArrayTypes to lose their type + information when rendered to SOAP and added corresponding + test case. + +- New Features + + - Enhancements to fault handling: The faultType Faultstring is now + a non-variable string (i.e. no nsmethod in it) so that it can be + programmatically checked. In addition fault handlers can now be + registered to handle specific types of faults. + + + - SOAPpy/Server.py: Modified unregsiterObject function to take + optional namespace/path args to be consistent with registerObject. + + - SOAPpy/Server.py: Added an unregisterObject function + + + - Changes to allow SOAPBuilder so it can handle a 'raw' Python object. + + + +Release 0.11.2 of SOAPpy +------------------------ + +- News: + + Ivan R. Judson has joined the SOAPpy team. 
He is focused on + Globus support but is also responsible for a lot of other work for + this release, + +- Bug fixes: + + - Code in Types.py assumes nested scopes, so I added the proper import so + this will work under python 2.2.x + + - Fixing namespace collision + + - Fixed handing of named arguments bug introduced in 0.11.1. + + - Fix memory leak when exceptions are raised. + + - Fix bug when content-length is not present in parsed SOAP message. + + - Fix bug #888345: Python 2.3 boolean type serialized as int + + - Fix bug #875977: no escaping of bad tagnames for NoneTypes + + +- New features: + + - Improved Globus support and documentation. Thanks Ivan! + + - Added context handling + + - Changed the use of SOAPAction, it used to default to setting it + to "", now it defaults to setting it to the method (not the + nsmethod). There is a clause in Server.py that catches 'old style' + SOAPActions (aka "") and sets them to the method. When this is + confirmed to be what everyone wants and we decide it's alright to + (possibly) break client/server interop, we can take the clause out + of Server.py and just handle SOAPActions of "" as a possible + error/warning. + + - Additional test code. + + - Raise a SOAPException instead of returning a SOAPpy.faultType + when a SOAP Fault is encountered and simplify_objects is enabled. + + +Release 0.11.1 of SOAPpy +------------------------ + +- Bug fixes: + + - Fixed bug [ 792258 ] "SOAPBuilder.SOAPBuilder.dump can catch + wrong exceptions" in SOAPBuilder.dump() submitted by Greg Chapman + (glchapman). + + - Changes suggested by Richard Au (richardau) to fix ssl support. + See bug report [ 752882 ] "SSL SOAP Server no longer working." + + - Remove call to gentag from 'dump' and add to 'dump_float', per + bug report [ 792600 ] "SOAPBuilder.SOAPBuilder.dump possibly should + not call gentag" by Greg Chapman (glchapman). + + - Add a tests for handling of nil="true" and nil="false". 
This + fixes bug [ pywebsvcs-Bugs-858168 ] 'xsi:nil="true" causes + exception' reported by Robert Zimmermann (robertzett): + +- testClient1.py now works properly. It had been failing to start the +server thread on the second unit test. It turned out that the +variable 'quit' needed to be reset to zero after the SOAP server +thread for the first unit test exited. With the solution of this +problem testClient1 can now be extended to run unit tests of both +client and server components. + +- Added 'strict' option to the WSDL class. If strict is true, a +RuntimeException will be raised if an unrecogned message is recieved. +If strict is false, a warning will be printed to the console, the +message type will be added to the WSDL schema, and processing will +continue. This is in response to the second half of bug report [ +817331 ] "Some WSDL.py changes", submitted by Rudolf Ruland. + + +Release 0.11.0 of SOAPpy +------------------------ + +- New/Changed configuration settings: + + - Config.simplify_objects=1 now converts all SOAPpy objects into basic + Python types (list, dictionary, tuple, double, float, etc.). By default, + Config.simplify_objects=0 for backward compatibility. + + - Config.dict_encoding='ascii' converts the keys of dictionaries + (e.g. created when Config.simplify_objects=1) to ascii == plain python + strings instead of unicode strings. This variable can be set to any + encoding known to string.encode(). + + - Config.strict_range=1 forces the SOAP parsing routines to perform + range checks on recieved SOAP float and double objects. When + Config.strict_range=0, the default, parsing does not perform range + checking (except for detecting overflows, which always occurs). In + either case, range checking is performed when + generating SOAP float and double objects. + +- Fixes for WSDLProxy. + +- Scripts in the test/ directory + + - Verbose debugging messages have been turned off.. 
+ + - SOAPtest.py now functions when Config.simplify_objects=1 + + - SOAPtest.py now sets Config.strict_range=1 so that range + checks are be properly tested. + + - New README file listing what test scripts fail and why. + +- Initial support for Globus via pyGlobus contributed by Ivan + R. Judson <judson@mcs.anl.gov>. + +Release 0.10.4 of SOAPpy +------------------------ + +Dramatic performance improvements for large data transfers. + +Release 0.10.1 of SOAPpy +------------------------ + +only minor changes + +1) Code now uses a single file to store version number + +2) Client and server now report 'SOAPpy' as the server/user-agent. + +3) All test scripts now use the local SOAPpy source instead of the + globally installed version. + +Release 0.10.0 of SOAPpy +------------------------ + +Enhancements: + +1) The new name handling mechanism has been enabled by default. + + The primary purpose of this release is to allow users to test this + to see if it causes problems. Please take the time to do so. If + there are no problems reported by April 15, 2003, 0.9.9 will be + released with this feature enabled by default. + + Note that running a client under an old release of SOAPpy and a + server under this release will be likely to generate errors due to + the different name handling mechanisms. + +2) MS-Windows systems should now be fully supported. + + This required implementing a new module, ieee754, which provides + functions for detecting and generating IEEE 754 special floating + point values (+Inf, -Inf, NaN) which are not properly handled by + the Windows implementation of the float() function. + +3) Code reorganization: The huge file SOAPpy/SOAP.py (4,122 lines, + 131K) has been split into 10 separate files. In addition code + shared with ZSI has been moved into a separate subdirectory and a + separate CVS module. + +4) Fixed bug 678239 which caused loss of namespace information in the + client. 
+ +5) Mark Bucciarelli's <mark@hubcapconsulting.com> has ported client + support for WSDL from ZSI, as well as providing a mechanism for + SOAPpy servers to provide WSDL on properly structured .GET + requests. + +6) Added ThreadingSOAPServer which inherits from ThreadingTCPServer + server so that multiple clients will be automatically multiplexed. + + +VERSION 0.10.4 +-------------- + +- Integrated a simple patch submitted by Erik Westra that dramatically + improves parser performance. + +- WSDL tools now uses m2crypto for SSL if it's installed. + +- Various other WSDL changes. + +VERSION 0.10.3 +-------------- + +- Removed import of obsoleted ieee753.py. Now use the fpconst module + proposed by PEP 754, available from + <http://research.warnes.net/Zope/projects/fpconst/> + +- SOAPpy should no longer depend on pyXML. + +VERSION 0.10.2 +-------------- + +- Fixed client support for basic authentication + +- Fixed import error in Client.py + +- Improved Client parsing of namespaces to support stateful SOAP servers. + +VERSION 0.10.1 +-------------- + +- Modified setup.py, Server.py, and Client.py to obtain SOAPpy version + number from a new file, version.py. + +- SOAP server/user-agent is now to 'SOAPpy' instead of 'SOAP.py'. + +- Added ident string containing CVS version to all files that were + lacking this. + +VERSION 0.10.0 +-------------- + +CHANGES SINCE VERSION 0.9.9-pre5 + +- Major Change: The huge file SOAPpy/SOAP.py (4,122 lines, 131K) has + been split into 10 separate files: + Client.py NS.py SOAPBuilder.py Utilities.py + Config.py Parser.py Server.py + Errors.py SOAP.py Types.py + This should ease navigation and maintenance. + +- A new CVS module 'wstools' was created to hold code which is used by + both ZSI and SOAPpy. While this module is stored separately in CVS, + it will be distributed as an integral part of both ZSI and SOAPpy, + and will be included as an 'internal' module by both. In the SOAPpy + source, it lives in the directory SOAPpy/wstools. 
+ +- The files XMLname.py, ieee754.py, have been moved into SOAPpy/wstools. + +- Added TODO file + +- Fix bug in getNS that caused loss of namespace by using better + pattern matching to find the namespace in the SOAP message. Fixes bug + 678239 + +- Added Mark Bucciarelli's <mark@hubcapconsulting.com> patch to + provide wsdl code on properly structured .GET requests to the server. + +- Added client support for WSDL, ported from ZSI by Mark Bucciarelli + <mark@hubcapconsulting.com> + +- Added ThreadingSOAPServer which inherits from ThreadingTCPServer + server so that muliple clients will be automatically multiplexed. + +- Removed some files from /test for services that no longer exist. + + +CHANGES SINCE VERSION 0.9.9-pre4 +-------------------------------- + +- Added client support for WSDL, ported from ZSI by Mark Bucciarelli + <mark@hubcapconsulting.com>. + +CHANGES SINCE VERSION 0.9.9-pre3 +-------------------------------- + +- Code shared between SOAPpy and ZSI now lives in + SOAPpy/SOAPpy/wstools and is stored in a separate CVS package. This + will allow ZSI and SOAPpy to keep these files synchronized. + +CHANGES SINCE VERSION 0.9.9-pre2 +-------------------------------- + +- Fixed trivial compilation bug on Win32: Only define + SOAPUnixSocketServer if the Unix domain sockets are supported + +CHANGES SINCE VERSION 0.9.9-pre1 +-------------------------------- + +- Added request for nested scopes, should now work properly in python + 2.1 with named argument calls. + +- Fixed bug caused by omission of the ieee754 module from __init__.py. + +- SOAPpy now provides a SOAPUnixSocketServer class, which uses a unix + domain socket instead of a network TCP/IP socket for communication. A + corresponding client will be provided in the future. [This class + has not yet been tested.] + +CHANGES SINCE VERSION 0.9.8 +--------------------------- + +- IEEE 754 floating point specials (Inf, -Inf, NaN) should now be + properly and consistently handled on all platforms. 
+ + Added code to explicitly check for and handle IEEE 754 floating + point specials (Inf, -Inf, NaN). This replaces an ugly hack for + systems whose python float() doesn't understand the strings "Inf", + "NaN", etc. Floating point specials should now be properly handled + on all operating systems. + + ***SOAPpy should now work properly on all versions of Microsoft Windows.*** + + A new module, ieee754 contains the functions required to detect and + create NaN, Inf, and -Inf values. This module should be usable in + other contexts. + +- *** The new argument handling method (via SOAPpy.SOAP.Config.specialArgs=1) + is now enabled by default.*** + +- Changed all references to actzero.com in SOAP.py to pywebscvs.sf.net. + +- Fixed a bug where lists included as parameters to SOAP method calls + were being incorrectly named 'Results' even when another name was + given. + +CHANGES SINCE VERSION 0.9.7 +--------------------------- + +- Modified structure to allow installation using Python distutils + (i.e. setup.py). Access to the SOAPpy library now requires: + from SOAPpy import SOAP + +- I (Gregory R. Warnes) have implemented an experimental and + non-standard method of handling named and unnamed arguments. This + mechanism is enabled in SOAPpy by setting + SOAPpy.SOAP.Config.specialArgs=1. + + When enabled, parameters with names of the form _#### (i.e., + matching the regexp "^_[0-9]+") are assumed to be unnamed parameters + and are passed to the method in numeric order. All other parameters + are assumed to be named and are passed using the xml tag id as the + parameter name. Outgoing SOAP method calls now always generate + names in this way--whether or not specialArgs is enabled--instead of + using the pattern v#####. + + See the file README.MethodParameterNaming for more details. 
+ +- Added noroot parameter to the SOAPBuilder and SOAPProxy objects + in order to provide compatibility with an older version of + EasySOAP (v0.2) that balked if the SOAP-ENC:root parameter was + included.(Brad Knotwell) + +- Added support for namespace-rewriting (used by Apache v2.x SOAP server for + error conditions as well as stateful communication) (Christopher Blunck) + +- Added string <-> str conversion for array types (Python 2.2+) + (Christopher Blunck) + +- Added convenience method (invoke) to SOAPProxy that calls __call (not sure + if it is necessary - feel free to remove if you want) (Christopher Blunck) + +- Python 'float' are equivalent to SOAP 'double'. Modified dump_float + and dump_list to use SOAP type string 'double' + appropriately. (Gregory R. Warnes) + +- Add basic authentication (Brad Knotwell) + +- Fixes to enable proper handling of SOAP faults by the client: + - Fixed test of whether message content is text/xml when recieving a fault. + - Added __call__ method to exception classes to match the current API. + - The faultType.__repr__() method now print details if present + (Gregory R. Warnes) + +- Added XMLnam.py which provides toXMLname() and fromXMLname() for + properly encoding xml tag names per the SOAP 2.1 (draft) + specification. (Gregory R. Warnes) + +- Added calls to toXMLname() and fromXMLname() so that tags names are + properly encoded. This resolves bug [ 548785 ] 'Error passing dict + keys containing space.' (Gregory R. Warnes) + +- Added code to cgi encode contents of tags when they are not a + recognized type. Fixes bug [ 549551 ] 'Error when passing + non-standard types'. (Gregory R. Warnes) + +- Added __init__.py, so that SOAPpy can be used like a standard python + module. (Gregory R. 
Warnes) + + +VERSION 0.9.7 (6/27/01) +----------------------- + +- Fixed the unamed ordered parameters bug +- Added the ability to specify a http_proxy +- Added a patch provided by Tim MiddelKoop to allow printing of proxy objects +- Added the contrib directory and included a medusa implementation of a + SOAP.py server by Ng Pheng Siong + + +VERSION 0.9.6 (6/08/01) +----------------------- + +- The date and time types now check their initial values when the type + is created, not when the data is marshalled. +- The date and time types are now parsed and returned as tuples (for + multi-element types) or scalars (for single element types) in UTC and thus + can represent the entire range of SOAP dates. +- If an element doesn't have a type but has a name with a namespace, the + name is tried as the type. +- Untyped compound types with more than one element and all the elements + the same name are turned into an array when parsing. +- When parsing a structType, elements with the same name are placed in a + list instead of saving just the last one. _getItemsAsList can be used to + get an element of a structure as a list, whether there was one or many + occurances of the item. +- Added schemaNamespace, schemaNamespaceURI, and namespaceStyle + configuration options. namespaceStyle takes one of 1999, 2000, or 2001, + and sets typesNamespace, typesNamespaceURI, schemaNamespace, and + schemaNamespaceURI. +- Normalized the type class names, replacing Compound with compoundType, + Struct with structType, Header with headerType, Body with bodyType, Array + with arrayType, TypedArray with typedArrayType, Fault with faultType, and + urType with anyType. +- Attributes now appear on an element itself instead of the element's + parent. For elements parsed to builtin python types, the attributes are + stored in a dictionary keyed by the element's python id. 
The dictionary + is in the Context object, can be returned from parseSOAP*, and can be + returned from method calls if the returnAllAttrs configuration option + is set. +- isinstance is used to check for a class, so classes can be subtyped. +- An encoding of None can be specified to not include encoding information. +- Problems with the SOAPProxy URL are now reported when the SOAPProxy + instance is created instead of when the first method call is made. +- The Binary, Boolean and DateTime types have been removed in favor of + binaryType, booleanType, and dateTimeType. + + +VERSION 0.9.5 (5/16/01) +----------------------- + +- Should parse and build all 1999, 2000, 2001, and SOAP-ENC datatypes. +- Initial handling of multi-dimensional, partial, and sparse arrays. +- Supports SSL clients (if Python built with OpenSSL). +- Supports SSL servers (if M2Crypto installed). +- Applies defaults to SOAPproxy URLs (nice for command-line tools). +- Added the _SOAPContext object, gives registered server functions more info + about the current call. +- Now assumes that any type that isn't in a schema could be a struct. +- Added the Config object, now config options can be set globally or on an + individual call level. +- Deprecated the DateTime, Binary and Boolean types, should now + use dateTimeType, binaryType and booleanType. +- Includes N+I interop suite. +- Various bug fixes and improvements. 
+ +VERSION 0.9 (5/01/01) +----------------------- + +- The Envelope now just contains definitions for namespaces actually used + (Builder) +- Namespace definitions are inherited by children but not siblings (Builder) +- Further improved multi-reference parsing -- it handles circular references + (Parser) +- Added support for building recursive and circular types using references + (Builder) +- More types +- Proper handling of overflow and underflow integral and floating point + types (Parser) +- More interop +- Various bug fixes and improvements + +VERSION 0.8.5 (4/25/01) +----------------------- + +- buildSOAP, SOAPProxy, SOAPServer now taking encoding argument +- Much improved multi-referencing (Parser) +- Added base64 and dateTime to interop suite +- Various bug fixes + +VERSION 0.8 (4/23/01) +----------------------- + +- Added more types +- Early multi-referencing support (Parser) +- Reorganized the parser, much cleaner now +- Preserve whitepsace in strings (per the standard) +- Full XML attribute support (Parser/Builder) +- Object (de)serialization now maintains element order +- Fixed the zero-length array problem +- Made indentation uniform (spaces not tabs) +- Made Header and Body work more like real structs +- Changed the parseSOAP api, now returns the body structure, + instead of a list of body elements +- Changed the soapaction and namespaces for the interop server +- New silabclient options +- Initial encoding support + +VERSION 0.7 (4/19/01) +----------------------- + +- Fixed a bug that caused nothing to work with Python 2.1 +- Float work arounds for WIN32 (others?) 
+- DateTime parsing for WIN32 +- Beginnings of XML attribute support +- Better interop + +VERSION 0.6 (4/18/01) +----------------------- + +- Fixed numerous bugs (dateTime, float precision, Response Element, null + strings) +- Added more types +- Homogeneous typed arrays +- Added support for more schemas +- Early Header support and mustUnderstand and actor +- Added interop suite +- Passes validator +- Interop greatly improved, passes all client tests for Frontier, + SOAP::LITE. + +VERSION 0.5 (4/17/01) +----------------------- + +- Initial public release + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Client.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Client.py new file mode 100644 index 0000000000000000000000000000000000000000..67505308017971969901f8559951c30ed6803a84 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Client.py @@ -0,0 +1,494 @@ +""" +################################################################################ +# +# SOAPpy - Cayce Ullman (cayce@actzero.com) +# Brian Matthews (blm@actzero.com) +# Gregory Warnes (Gregory.R.Warnes@Pfizer.com) +# Christopher Blunck (blunck@gst.com) +# +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. 
nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + + +#import xml.sax +import urllib +from types import * +import re +import base64 + +# SOAPpy modules +from Errors import * +from Config import Config +from Parser import parseSOAPRPC +from SOAPBuilder import buildSOAP +from Utilities import * +from Types import faultType, simplify + +################################################################################ +# Client +################################################################################ + + +def SOAPUserAgent(): + return "SOAPpy " + __version__ + " (pywebsvcs.sf.net)" + + +class SOAPAddress: + def __init__(self, url, config = Config): + proto, uri = urllib.splittype(url) + + # apply some defaults + if uri[0:2] != '//': + if proto != None: + uri = proto + ':' + uri + + uri = '//' + uri + proto = 'http' + + host, path = urllib.splithost(uri) + + try: + int(host) + host = 'localhost:' + host + except: + pass + + if not path: + path = '/' + + if 
proto not in ('http', 'https', 'httpg'): + raise IOError, "unsupported SOAP protocol" + if proto == 'httpg' and not config.GSIclient: + raise AttributeError, \ + "GSI client not supported by this Python installation" + if proto == 'https' and not config.SSLclient: + raise AttributeError, \ + "SSL client not supported by this Python installation" + + self.user,host = urllib.splituser(host) + self.proto = proto + self.host = host + self.path = path + + def __str__(self): + return "%(proto)s://%(host)s%(path)s" % self.__dict__ + + __repr__ = __str__ + + +class HTTPTransport: + def getNS(self, original_namespace, data): + """Extract the (possibly extended) namespace from the returned + SOAP message.""" + + if type(original_namespace) == StringType: + pattern="xmlns:\w+=['\"](" + original_namespace + "[^'\"]*)['\"]" + match = re.search(pattern, data) + if match: + return match.group(1) + else: + return original_namespace + else: + return original_namespace + + # Need a Timeout someday? + def call(self, addr, data, namespace, soapaction = None, encoding = None, + http_proxy = None, config = Config): + + import httplib + + if not isinstance(addr, SOAPAddress): + addr = SOAPAddress(addr, config) + + # Build a request + if http_proxy: + real_addr = http_proxy + real_path = addr.proto + "://" + addr.host + addr.path + else: + real_addr = addr.host + real_path = addr.path + + if addr.proto == 'httpg': + from pyGlobus.io import GSIHTTP + r = GSIHTTP(real_addr, tcpAttr = config.tcpAttr) + elif addr.proto == 'https': + r = httplib.HTTPS(real_addr) + else: + r = httplib.HTTP(real_addr) + + r.putrequest("POST", real_path) + + r.putheader("Host", addr.host) + r.putheader("User-agent", SOAPUserAgent()) + t = 'text/xml'; + if encoding != None: + t += '; charset="%s"' % encoding + r.putheader("Content-type", t) + r.putheader("Content-length", str(len(data))) + + # if user is not a user:passwd format + # we'll receive a failure from the server. . .I guess (??) 
+ if addr.user != None: + val = base64.encodestring(addr.user) + r.putheader('Authorization','Basic ' + val.replace('\012','')) + + # This fixes sending either "" or "None" + if soapaction == None or len(soapaction) == 0: + r.putheader("SOAPAction", "") + else: + r.putheader("SOAPAction", '"%s"' % soapaction) + + if config.dumpHeadersOut: + s = 'Outgoing HTTP headers' + debugHeader(s) + print "POST %s %s" % (real_path, r._http_vsn_str) + print "Host:", addr.host + print "User-agent: SOAPpy " + __version__ + " (http://pywebsvcs.sf.net)" + print "Content-type:", t + print "Content-length:", len(data) + print 'SOAPAction: "%s"' % soapaction + debugFooter(s) + + r.endheaders() + + if config.dumpSOAPOut: + s = 'Outgoing SOAP' + debugHeader(s) + print data, + if data[-1] != '\n': + print + debugFooter(s) + + # send the payload + r.send(data) + + # read response line + code, msg, headers = r.getreply() + + if headers: + content_type = headers.get("content-type","text/xml") + content_length = headers.get("Content-length") + else: + content_type=None + content_length=None + + # work around OC4J bug which does '<len>, <len>' for some reaason + if content_length: + comma=content_length.find(',') + if comma>0: + content_length = content_length[:comma] + + # attempt to extract integer message size + try: + message_len = int(content_length) + except: + message_len = -1 + + if message_len < 0: + # Content-Length missing or invalid; just read the whole socket + # This won't work with HTTP/1.1 chunked encoding + data = r.getfile().read() + message_len = len(data) + else: + data = r.getfile().read(message_len) + + if(config.debug): + print "code=",code + print "msg=", msg + print "headers=", headers + print "content-type=", content_type + print "data=", data + + if config.dumpHeadersIn: + s = 'Incoming HTTP headers' + debugHeader(s) + if headers.headers: + print "HTTP/1.? 
%d %s" % (code, msg) + print "\n".join(map (lambda x: x.strip(), headers.headers)) + else: + print "HTTP/0.9 %d %s" % (code, msg) + debugFooter(s) + + def startswith(string, val): + return string[0:len(val)] == val + + if code == 500 and not \ + ( startswith(content_type, "text/xml") and message_len > 0 ): + raise HTTPError(code, msg) + + if config.dumpSOAPIn: + s = 'Incoming SOAP' + debugHeader(s) + print data, + if (len(data)>0) and (data[-1] != '\n'): + print + debugFooter(s) + + if code not in (200, 500): + raise HTTPError(code, msg) + + + # get the new namespace + if namespace is None: + new_ns = None + else: + new_ns = self.getNS(namespace, data) + + # return response payload + return data, new_ns + +################################################################################ +# SOAP Proxy +################################################################################ +class SOAPProxy: + def __init__(self, proxy, namespace = None, soapaction = None, + header = None, methodattrs = None, transport = HTTPTransport, + encoding = 'UTF-8', throw_faults = 1, unwrap_results = None, + http_proxy=None, config = Config, noroot = 0, + simplify_objects=None): + + # Test the encoding, raising an exception if it's not known + if encoding != None: + ''.encode(encoding) + + # get default values for unwrap_results and simplify_objects + # from config + if unwrap_results is None: + self.unwrap_results=config.unwrap_results + else: + self.unwrap_results=unwrap_results + + if simplify_objects is None: + self.simplify_objects=config.simplify_objects + else: + self.simplify_objects=simplify_objects + + self.proxy = SOAPAddress(proxy, config) + self.namespace = namespace + self.soapaction = soapaction + self.header = header + self.methodattrs = methodattrs + self.transport = transport() + self.encoding = encoding + self.throw_faults = throw_faults + self.http_proxy = http_proxy + self.config = config + self.noroot = noroot + + # GSI Additions + if hasattr(config, 
"channel_mode") and \ + hasattr(config, "delegation_mode"): + self.channel_mode = config.channel_mode + self.delegation_mode = config.delegation_mode + #end GSI Additions + + def invoke(self, method, args): + return self.__call(method, args, {}) + + def __call(self, name, args, kw, ns = None, sa = None, hd = None, + ma = None): + + ns = ns or self.namespace + ma = ma or self.methodattrs + + if sa: # Get soapaction + if type(sa) == TupleType: + sa = sa[0] + else: + if self.soapaction: + sa = self.soapaction + else: + sa = name + + if hd: # Get header + if type(hd) == TupleType: + hd = hd[0] + else: + hd = self.header + + hd = hd or self.header + + if ma: # Get methodattrs + if type(ma) == TupleType: ma = ma[0] + else: + ma = self.methodattrs + ma = ma or self.methodattrs + + m = buildSOAP(args = args, kw = kw, method = name, namespace = ns, + header = hd, methodattrs = ma, encoding = self.encoding, + config = self.config, noroot = self.noroot) + + + call_retry = 0 + try: + + r, self.namespace = self.transport.call(self.proxy, m, ns, sa, + encoding = self.encoding, + http_proxy = self.http_proxy, + config = self.config) + + except Exception, ex: + # + # Call failed. + # + # See if we have a fault handling vector installed in our + # config. If we do, invoke it. If it returns a true value, + # retry the call. + # + # In any circumstance other than the fault handler returning + # true, reraise the exception. This keeps the semantics of this + # code the same as without the faultHandler code. 
+ # + + if hasattr(self.config, "faultHandler"): + if callable(self.config.faultHandler): + call_retry = self.config.faultHandler(self.proxy, ex) + if not call_retry: + raise + else: + raise + else: + raise + + if call_retry: + r, self.namespace = self.transport.call(self.proxy, m, ns, sa, + encoding = self.encoding, + http_proxy = self.http_proxy, + config = self.config) + + + p, attrs = parseSOAPRPC(r, attrs = 1) + + try: + throw_struct = self.throw_faults and \ + isinstance (p, faultType) + except: + throw_struct = 0 + + if throw_struct: + if Config.debug: + print p + raise p + + # If unwrap_results=1 and there is only element in the struct, + # SOAPProxy will assume that this element is the result + # and return it rather than the struct containing it. + # Otherwise SOAPproxy will return the struct with all the + # elements as attributes. + if self.unwrap_results: + try: + count = 0 + for i in p.__dict__.keys(): + if i[0] != "_": # don't count the private stuff + count += 1 + t = getattr(p, i) + if count == 1: # Only one piece of data, bubble it up + p = t + except: + pass + + # Automatically simplfy SOAP complex types into the + # corresponding python types. (structType --> dict, + # arrayType --> array, etc.) + if self.simplify_objects: + p = simplify(p) + + if self.config.returnAllAttrs: + return p, attrs + return p + + def _callWithBody(self, body): + return self.__call(None, body, {}) + + def __getattr__(self, name): # hook to catch method calls + if name == '__del__': + raise AttributeError, name + return self.__Method(self.__call, name, config = self.config) + + # To handle attribute wierdness + class __Method: + # Some magic to bind a SOAP method to an RPC server. + # Supports "nested" methods (e.g. examples.getStateName) -- concept + # borrowed from xmlrpc/soaplib -- www.pythonware.com + # Altered (improved?) 
to let you inline namespaces on a per call + # basis ala SOAP::LITE -- www.soaplite.com + + def __init__(self, call, name, ns = None, sa = None, hd = None, + ma = None, config = Config): + + self.__call = call + self.__name = name + self.__ns = ns + self.__sa = sa + self.__hd = hd + self.__ma = ma + self.__config = config + return + + def __call__(self, *args, **kw): + if self.__name[0] == "_": + if self.__name in ["__repr__","__str__"]: + return self.__repr__() + else: + return self.__f_call(*args, **kw) + else: + return self.__r_call(*args, **kw) + + def __getattr__(self, name): + if name == '__del__': + raise AttributeError, name + if self.__name[0] == "_": + # Don't nest method if it is a directive + return self.__class__(self.__call, name, self.__ns, + self.__sa, self.__hd, self.__ma) + + return self.__class__(self.__call, "%s.%s" % (self.__name, name), + self.__ns, self.__sa, self.__hd, self.__ma) + + def __f_call(self, *args, **kw): + if self.__name == "_ns": self.__ns = args + elif self.__name == "_sa": self.__sa = args + elif self.__name == "_hd": self.__hd = args + elif self.__name == "_ma": self.__ma = args + return self + + def __r_call(self, *args, **kw): + return self.__call(self.__name, args, kw, self.__ns, self.__sa, + self.__hd, self.__ma) + + def __repr__(self): + return "<%s at %d>" % (self.__class__, id(self)) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Config.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Config.py new file mode 100644 index 0000000000000000000000000000000000000000..1d82e841273c6496703c6cee26ec1d2695b5de5f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Config.py @@ -0,0 +1,202 @@ +""" +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + +import copy, socket +from types import * + +from NS import NS + +################################################################################ +# Configuration class +################################################################################ + +class SOAPConfig: + __readonly = ('SSLserver', 'SSLclient', 'GSIserver', 'GSIclient') + + def __init__(self, config = None, **kw): + d = self.__dict__ + + if config: + if not isinstance(config, SOAPConfig): + raise AttributeError, \ + "initializer must be SOAPConfig instance" + + s = config.__dict__ + + for k, v in s.items(): + if k[0] != '_': + d[k] = v + else: + # Setting debug also sets returnFaultInfo, + # dumpHeadersIn, dumpHeadersOut, dumpSOAPIn, and dumpSOAPOut + self.debug = 0 + self.dumpFaultInfo = 1 + # Setting namespaceStyle sets typesNamespace, typesNamespaceURI, + # schemaNamespace, and schemaNamespaceURI + self.namespaceStyle = '1999' + self.strictNamespaces = 0 + self.typed = 1 + self.buildWithNamespacePrefix = 1 + self.returnAllAttrs = 0 + + # Strict checking of range for floats and doubles + self.strict_range = 0 + + # Default encoding for dictionary keys + self.dict_encoding = 'ascii' + + # New argument name handling mechanism. See + # README.MethodParameterNaming for details + self.specialArgs = 1 + + # If unwrap_results=1 and there is only element in the struct, + # SOAPProxy will assume that this element is the result + # and return it rather than the struct containing it. + # Otherwise SOAPproxy will return the struct with all the + # elements as attributes. + self.unwrap_results = 1 + + # Automatically convert SOAP complex types, and + # (recursively) public contents into the corresponding + # python types. (Private subobjects have names that start + # with '_'.) 
+ # + # Conversions: + # - faultType --> raise python exception + # - arrayType --> array + # - compoundType --> dictionary + # + self.simplify_objects = 0 + + # Per-class authorization method. If this is set, before + # calling a any class method, the specified authorization + # method will be called. If it returns 1, the method call + # will proceed, otherwise the call will throw with an + # authorization error. + self.authMethod = None + + # Globus Support if pyGlobus.io available + try: + from pyGlobus import io; + d['GSIserver'] = 1 + d['GSIclient'] = 1 + except: + d['GSIserver'] = 0 + d['GSIclient'] = 0 + + + # Server SSL support if M2Crypto.SSL available + try: + from M2Crypto import SSL + d['SSLserver'] = 1 + except: + d['SSLserver'] = 0 + + # Client SSL support if socket.ssl available + try: + from socket import ssl + d['SSLclient'] = 1 + except: + d['SSLclient'] = 0 + + for k, v in kw.items(): + if k[0] != '_': + setattr(self, k, v) + + def __setattr__(self, name, value): + if name in self.__readonly: + raise AttributeError, "readonly configuration setting" + + d = self.__dict__ + + if name in ('typesNamespace', 'typesNamespaceURI', + 'schemaNamespace', 'schemaNamespaceURI'): + + if name[-3:] == 'URI': + base, uri = name[:-3], 1 + else: + base, uri = name, 0 + + if type(value) == StringType: + if NS.NSMAP.has_key(value): + n = (value, NS.NSMAP[value]) + elif NS.NSMAP_R.has_key(value): + n = (NS.NSMAP_R[value], value) + else: + raise AttributeError, "unknown namespace" + elif type(value) in (ListType, TupleType): + if uri: + n = (value[1], value[0]) + else: + n = (value[0], value[1]) + else: + raise AttributeError, "unknown namespace type" + + d[base], d[base + 'URI'] = n + + try: + d['namespaceStyle'] = \ + NS.STMAP_R[(d['typesNamespace'], d['schemaNamespace'])] + except: + d['namespaceStyle'] = '' + + elif name == 'namespaceStyle': + value = str(value) + + if not NS.STMAP.has_key(value): + raise AttributeError, "unknown namespace style" + + d[name] = 
value + n = d['typesNamespace'] = NS.STMAP[value][0] + d['typesNamespaceURI'] = NS.NSMAP[n] + n = d['schemaNamespace'] = NS.STMAP[value][1] + d['schemaNamespaceURI'] = NS.NSMAP[n] + + elif name == 'debug': + d[name] = \ + d['returnFaultInfo'] = \ + d['dumpHeadersIn'] = \ + d['dumpHeadersOut'] = \ + d['dumpSOAPIn'] = \ + d['dumpSOAPOut'] = value + + else: + d[name] = value + + +Config = SOAPConfig() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Errors.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Errors.py new file mode 100644 index 0000000000000000000000000000000000000000..0db6612709391c1d6f0c7060427544e7758a4064 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Errors.py @@ -0,0 +1,79 @@ +""" +################################################################################ +# +# SOAPpy - Cayce Ullman (cayce@actzero.com) +# Brian Matthews (blm@actzero.com) +# Gregory Warnes (Gregory.R.Warnes@Pfizer.com) +# Christopher Blunck (blunck@gst.com) +# +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + +import exceptions + +################################################################################ +# Exceptions +################################################################################ +class Error(exceptions.Exception): + def __init__(self, msg): + self.msg = msg + def __str__(self): + return "<Error : %s>" % self.msg + __repr__ = __str__ + def __call__(self): + return (msg,) + +class RecursionError(Error): + pass + +class UnknownTypeError(Error): + pass + +class HTTPError(Error): + # indicates an HTTP protocol error + def __init__(self, code, msg): + self.code = code + self.msg = msg + def __str__(self): + return "<HTTPError %s %s>" % (self.code, self.msg) + __repr__ = __str__ + def __call___(self): + return (self.code, self.msg, ) + +class UnderflowError(exceptions.ArithmeticError): + pass + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/GSIServer.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/GSIServer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ed58749c0da6109d5af428b4600a086d8b460ad --- /dev/null +++ 
b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/GSIServer.py @@ -0,0 +1,142 @@ +""" +GSIServer - Contributed by Ivan R. Judson <judson@mcs.anl.gov> + + +################################################################################ +# +# SOAPpy - Cayce Ullman (cayce@actzero.com) +# Brian Matthews (blm@actzero.com) +# Gregory Warnes (Gregory.R.Warnes@Pfizer.com) +# Christopher Blunck (blunck@gst.com) +# +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + + +#import xml.sax +import re +import socket +import sys +import SocketServer +from types import * +import BaseHTTPServer + +# SOAPpy modules +from Parser import parseSOAPRPC +from Config import SOAPConfig +from Types import faultType, voidType, simplify +from NS import NS +from SOAPBuilder import buildSOAP +from Utilities import debugHeader, debugFooter + +try: from M2Crypto import SSL +except: pass + +##### + +from Server import * + +from pyGlobus.io import GSITCPSocketServer, ThreadingGSITCPSocketServer +from pyGlobus import ioc + +def GSIConfig(): + config = SOAPConfig() + config.channel_mode = ioc.GLOBUS_IO_SECURE_CHANNEL_MODE_GSI_WRAP + config.delegation_mode = ioc.GLOBUS_IO_SECURE_DELEGATION_MODE_FULL_PROXY + config.tcpAttr = None + config.authMethod = "_authorize" + return config + +Config = GSIConfig() + +class GSISOAPServer(GSITCPSocketServer, SOAPServerBase): + def __init__(self, addr = ('localhost', 8000), + RequestHandler = SOAPRequestHandler, log = 0, + encoding = 'UTF-8', config = Config, namespace = None): + + # Test the encoding, raising an exception if it's not known + if encoding != None: + ''.encode(encoding) + + self.namespace = namespace + self.objmap = {} + self.funcmap = {} + self.encoding = encoding + self.config = config + self.log = log + + self.allow_reuse_address= 1 + + 
GSITCPSocketServer.__init__(self, addr, RequestHandler, + self.config.channel_mode, + self.config.delegation_mode, + tcpAttr = self.config.tcpAttr) + + def get_request(self): + sock, addr = GSITCPSocketServer.get_request(self) + + return sock, addr + +class ThreadingGSISOAPServer(ThreadingGSITCPSocketServer, SOAPServerBase): + + def __init__(self, addr = ('localhost', 8000), + RequestHandler = SOAPRequestHandler, log = 0, + encoding = 'UTF-8', config = Config, namespace = None): + + # Test the encoding, raising an exception if it's not known + if encoding != None: + ''.encode(encoding) + + self.namespace = namespace + self.objmap = {} + self.funcmap = {} + self.encoding = encoding + self.config = config + self.log = log + + self.allow_reuse_address= 1 + + ThreadingGSITCPSocketServer.__init__(self, addr, RequestHandler, + self.config.channel_mode, + self.config.delegation_mode, + tcpAttr = self.config.tcpAttr) + + def get_request(self): + sock, addr = ThreadingGSITCPSocketServer.get_request(self) + + return sock, addr + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/NS.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/NS.py new file mode 100644 index 0000000000000000000000000000000000000000..e04adec508cb57d7896d52e329a7aebcb5b25eee --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/NS.py @@ -0,0 +1,104 @@ +""" +################################################################################ +# +# SOAPpy - Cayce Ullman (cayce@actzero.com) +# Brian Matthews (blm@actzero.com) +# Gregory Warnes (Gregory.R.Warnes@Pfizer.com) +# Christopher Blunck (blunck@gst.com) +# +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +""" + +from __future__ import nested_scopes + +ident = '$Id$' +from version import __version__ + +############################################################################## +# Namespace Class +################################################################################ +def invertDict(dict): + d = {} + + for k, v in dict.items(): + d[v] = k + + return d + +class NS: + XML = "http://www.w3.org/XML/1998/namespace" + + ENV = "http://schemas.xmlsoap.org/soap/envelope/" + ENC = "http://schemas.xmlsoap.org/soap/encoding/" + + XSD = "http://www.w3.org/1999/XMLSchema" + XSD2 = "http://www.w3.org/2000/10/XMLSchema" + XSD3 = "http://www.w3.org/2001/XMLSchema" + + XSD_L = [XSD, XSD2, XSD3] + EXSD_L= [ENC, XSD, XSD2, XSD3] + + XSI = "http://www.w3.org/1999/XMLSchema-instance" + XSI2 = "http://www.w3.org/2000/10/XMLSchema-instance" + XSI3 = "http://www.w3.org/2001/XMLSchema-instance" + XSI_L = [XSI, XSI2, XSI3] + + URN = "http://soapinterop.org/xsd" + + # For generated messages + XML_T = "xml" + ENV_T = "SOAP-ENV" + ENC_T = "SOAP-ENC" + XSD_T = "xsd" + XSD2_T= "xsd2" + XSD3_T= "xsd3" + XSI_T = "xsi" + XSI2_T= "xsi2" + XSI3_T= "xsi3" + URN_T = "urn" + + NSMAP = {ENV_T: ENV, ENC_T: ENC, XSD_T: XSD, XSD2_T: XSD2, + XSD3_T: XSD3, XSI_T: XSI, XSI2_T: XSI2, XSI3_T: XSI3, + URN_T: URN} + NSMAP_R = invertDict(NSMAP) + + STMAP = {'1999': (XSD_T, XSI_T), '2000': (XSD2_T, XSI2_T), + '2001': (XSD3_T, XSI3_T)} + STMAP_R = invertDict(STMAP) + + def __init__(self): + raise Error, "Don't instantiate this" + + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Parser.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4cb2058bffeac3ed446480366c7d27f64af140bc --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Parser.py @@ -0,0 +1,1067 @@ +# SOAPpy modules +from Config import Config +from Types import * +from NS import NS +from 
Utilities import * + +import string +import fpconst +import xml.sax +from wstools.XMLname import fromXMLname + +try: from M2Crypto import SSL +except: pass + +ident = '$Id$' +from version import __version__ + + +################################################################################ +# SOAP Parser +################################################################################ +class RefHolder: + def __init__(self, name, frame): + self.name = name + self.parent = frame + self.pos = len(frame) + self.subpos = frame.namecounts.get(name, 0) + + def __repr__(self): + return "<%s %s at %d>" % (self.__class__, self.name, id(self)) + + def __str__(self): + return "<%s %s at %d>" % (self.__class__, self.name, id(self)) + +class SOAPParser(xml.sax.handler.ContentHandler): + class Frame: + def __init__(self, name, kind = None, attrs = {}, rules = {}): + self.name = name + self.kind = kind + self.attrs = attrs + self.rules = rules + + self.contents = [] + self.names = [] + self.namecounts = {} + self.subattrs = [] + + def append(self, name, data, attrs): + self.names.append(name) + self.contents.append(data) + self.subattrs.append(attrs) + + if self.namecounts.has_key(name): + self.namecounts[name] += 1 + else: + self.namecounts[name] = 1 + + def _placeItem(self, name, value, pos, subpos = 0, attrs = None): + self.contents[pos] = value + + if attrs: + self.attrs.update(attrs) + + def __len__(self): + return len(self.contents) + + def __repr__(self): + return "<%s %s at %d>" % (self.__class__, self.name, id(self)) + + def __init__(self, rules = None): + xml.sax.handler.ContentHandler.__init__(self) + self.body = None + self.header = None + self.attrs = {} + self._data = None + self._next = "E" # Keeping state for message validity + self._stack = [self.Frame('SOAP')] + + # Make two dictionaries to store the prefix <-> URI mappings, and + # initialize them with the default + self._prem = {NS.XML_T: NS.XML} + self._prem_r = {NS.XML: NS.XML_T} + self._ids = {} + 
self._refs = {} + self._rules = rules + + def startElementNS(self, name, qname, attrs): + # Workaround two sax bugs + if name[0] == None and name[1][0] == ' ': + name = (None, name[1][1:]) + else: + name = tuple(name) + + # First some checking of the layout of the message + + if self._next == "E": + if name[1] != 'Envelope': + raise Error, "expected `SOAP-ENV:Envelope', gto `%s:%s'" % \ + (self._prem_r[name[0]], name[1]) + if name[0] != NS.ENV: + raise faultType, ("%s:VersionMismatch" % NS.ENV_T, + "Don't understand version `%s' Envelope" % name[0]) + else: + self._next = "HorB" + elif self._next == "HorB": + if name[0] == NS.ENV and name[1] in ("Header", "Body"): + self._next = None + else: + raise Error, \ + "expected `SOAP-ENV:Header' or `SOAP-ENV:Body', " \ + "got `%s'" % self._prem_r[name[0]] + ':' + name[1] + elif self._next == "B": + if name == (NS.ENV, "Body"): + self._next = None + else: + raise Error, "expected `SOAP-ENV:Body', got `%s'" % \ + self._prem_r[name[0]] + ':' + name[1] + elif self._next == "": + raise Error, "expected nothing, got `%s'" % \ + self._prem_r[name[0]] + ':' + name[1] + + if len(self._stack) == 2: + rules = self._rules + else: + try: + rules = self._stack[-1].rules[name[1]] + except: + rules = None + + if type(rules) not in (NoneType, DictType): + kind = rules + else: + kind = attrs.get((NS.ENC, 'arrayType')) + + if kind != None: + del attrs._attrs[(NS.ENC, 'arrayType')] + + i = kind.find(':') + if i >= 0: + kind = (self._prem[kind[:i]], kind[i + 1:]) + else: + kind = None + + self.pushFrame(self.Frame(name[1], kind, attrs._attrs, rules)) + + self._data = [] # Start accumulating + + def pushFrame(self, frame): + self._stack.append(frame) + + def popFrame(self): + return self._stack.pop() + + def endElementNS(self, name, qname): + # Workaround two sax bugs + if name[0] == None and name[1][0] == ' ': + ns, name = None, name[1][1:] + else: + ns, name = tuple(name) + + name = fromXMLname(name) # convert to SOAP 1.2 XML name encoding + 
+ if self._next == "E": + raise Error, "didn't get SOAP-ENV:Envelope" + if self._next in ("HorB", "B"): + raise Error, "didn't get SOAP-ENV:Body" + + cur = self.popFrame() + attrs = cur.attrs + + idval = None + + if attrs.has_key((None, 'id')): + idval = attrs[(None, 'id')] + + if self._ids.has_key(idval): + raise Error, "duplicate id `%s'" % idval + + del attrs[(None, 'id')] + + root = 1 + + if len(self._stack) == 3: + if attrs.has_key((NS.ENC, 'root')): + root = int(attrs[(NS.ENC, 'root')]) + + # Do some preliminary checks. First, if root="0" is present, + # the element must have an id. Next, if root="n" is present, + # n something other than 0 or 1, raise an exception. + + if root == 0: + if idval == None: + raise Error, "non-root element must have an id" + elif root != 1: + raise Error, "SOAP-ENC:root must be `0' or `1'" + + del attrs[(NS.ENC, 'root')] + + while 1: + href = attrs.get((None, 'href')) + if href: + if href[0] != '#': + raise Error, "Non-local hrefs are not yet suppported." + if self._data != None and \ + string.join(self._data, "").strip() != '': + raise Error, "hrefs can't have data" + + href = href[1:] + + if self._ids.has_key(href): + data = self._ids[href] + else: + data = RefHolder(name, self._stack[-1]) + + if self._refs.has_key(href): + self._refs[href].append(data) + else: + self._refs[href] = [data] + + del attrs[(None, 'href')] + + break + + kind = None + + if attrs: + for i in NS.XSI_L: + if attrs.has_key((i, 'type')): + kind = attrs[(i, 'type')] + del attrs[(i, 'type')] + + if kind != None: + i = kind.find(':') + if i >= 0: + kind = (self._prem[kind[:i]], kind[i + 1:]) + else: +# XXX What to do here? 
(None, kind) is just going to fail in convertType + #print "Kind with no NS:", kind + kind = (None, kind) + + null = 0 + + if attrs: + for i in (NS.XSI, NS.XSI2): + if attrs.has_key((i, 'null')): + null = attrs[(i, 'null')] + del attrs[(i, 'null')] + + if attrs.has_key((NS.XSI3, 'nil')): + null = attrs[(NS.XSI3, 'nil')] + del attrs[(NS.XSI3, 'nil')] + + + ## Check for nil + + # check for nil='true' + if type(null) in (StringType, UnicodeType): + if null.lower() == 'true': + null = 1 + + # check for nil=1, but watch out for string values + try: + null = int(null) + except ValueError, e: + if not e[0].startswith("invalid literal for int()"): + raise e + null = 0 + + if null: + if len(cur) or \ + (self._data != None and string.join(self._data, "").strip() != ''): + raise Error, "nils can't have data" + + data = None + + break + + if len(self._stack) == 2: + if (ns, name) == (NS.ENV, "Header"): + self.header = data = headerType(attrs = attrs) + self._next = "B" + break + elif (ns, name) == (NS.ENV, "Body"): + self.body = data = bodyType(attrs = attrs) + self._next = "" + break + elif len(self._stack) == 3 and self._next == None: + if (ns, name) == (NS.ENV, "Fault"): + data = faultType() + self._next = None # allow followons + break + + #print "\n" + #print "data=", self._data + #print "kind=", kind + #print "cur.kind=", cur.kind + #print "cur.rules=", cur.rules + #print "\n" + + + if cur.rules != None: + rule = cur.rules + + if type(rule) in (StringType, UnicodeType): + rule = (None, rule) # none flags special handling + elif type(rule) == ListType: + rule = tuple(rule) + + #print "kind=",kind + #print "rule=",rule + + +# XXX What if rule != kind? 
+ if callable(rule): + data = rule(string.join(self._data, "")) + elif type(rule) == DictType: + data = structType(name = (ns, name), attrs = attrs) + elif rule[1][:9] == 'arrayType': + data = self.convertType(cur.contents, + rule, attrs) + else: + data = self.convertType(string.join(self._data, ""), + rule, attrs) + + break + + #print "No rules, using kind or cur.kind..." + + if (kind == None and cur.kind != None) or \ + (kind == (NS.ENC, 'Array')): + kind = cur.kind + + if kind == None: + kind = 'ur-type[%d]' % len(cur) + else: + kind = kind[1] + + if len(cur.namecounts) == 1: + elemsname = cur.names[0] + else: + elemsname = None + + data = self.startArray((ns, name), kind, attrs, elemsname) + + break + + if len(self._stack) == 3 and kind == None and \ + len(cur) == 0 and \ + (self._data == None or string.join(self._data, "").strip() == ''): + data = structType(name = (ns, name), attrs = attrs) + break + + if len(cur) == 0 and ns != NS.URN: + # Nothing's been added to the current frame so it must be a + # simple type. + + if kind == None: + # If the current item's container is an array, it will + # have a kind. If so, get the bit before the first [, + # which is the type of the array, therefore the type of + # the current item. 
+ + kind = self._stack[-1].kind + + if kind != None: + i = kind[1].find('[') + if i >= 0: + kind = (kind[0], kind[1][:i]) + elif ns != None: + kind = (ns, name) + + if kind != None: + try: + data = self.convertType(string.join(self._data, ""), + kind, attrs) + except UnknownTypeError: + data = None + else: + data = None + + if data == None: + if self._data == None: + data = '' + else: + data = string.join(self._data, "") + + if len(attrs) == 0: + try: data = str(data) + except: pass + + break + + data = structType(name = (ns, name), attrs = attrs) + + break + + if isinstance(data, compoundType): + for i in range(len(cur)): + v = cur.contents[i] + data._addItem(cur.names[i], v, cur.subattrs[i]) + + if isinstance(v, RefHolder): + v.parent = data + + if root: + self._stack[-1].append(name, data, attrs) + + if idval != None: + self._ids[idval] = data + + if self._refs.has_key(idval): + for i in self._refs[idval]: + i.parent._placeItem(i.name, data, i.pos, i.subpos, attrs) + + del self._refs[idval] + + self.attrs[id(data)] = attrs + + if isinstance(data, anyType): + data._setAttrs(attrs) + + self._data = None # Stop accumulating + + def endDocument(self): + if len(self._refs) == 1: + raise Error, \ + "unresolved reference " + self._refs.keys()[0] + elif len(self._refs) > 1: + raise Error, \ + "unresolved references " + ', '.join(self._refs.keys()) + + def startPrefixMapping(self, prefix, uri): + self._prem[prefix] = uri + self._prem_r[uri] = prefix + + def endPrefixMapping(self, prefix): + try: + del self._prem_r[self._prem[prefix]] + del self._prem[prefix] + except: + pass + + def characters(self, c): + if self._data != None: + self._data.append(c) + + arrayre = '^(?:(?P<ns>[^:]*):)?' \ + '(?P<type>[^[]+)' \ + '(?:\[(?P<rank>,*)\])?' 
\ + '(?:\[(?P<asize>\d+(?:,\d+)*)?\])$' + + def startArray(self, name, kind, attrs, elemsname): + if type(self.arrayre) == StringType: + self.arrayre = re.compile (self.arrayre) + + offset = attrs.get((NS.ENC, "offset")) + + if offset != None: + del attrs[(NS.ENC, "offset")] + + try: + if offset[0] == '[' and offset[-1] == ']': + offset = int(offset[1:-1]) + if offset < 0: + raise Exception + else: + raise Exception + except: + raise AttributeError, "invalid Array offset" + else: + offset = 0 + + try: + m = self.arrayre.search(kind) + + if m == None: + raise Exception + + t = m.group('type') + + if t == 'ur-type': + return arrayType(None, name, attrs, offset, m.group('rank'), + m.group('asize'), elemsname) + elif m.group('ns') != None: + return typedArrayType(None, name, + (self._prem[m.group('ns')], t), attrs, offset, + m.group('rank'), m.group('asize'), elemsname) + else: + return typedArrayType(None, name, (None, t), attrs, offset, + m.group('rank'), m.group('asize'), elemsname) + except: + raise AttributeError, "invalid Array type `%s'" % kind + + # Conversion + + class DATETIMECONSTS: + SIGNre = '(?P<sign>-?)' + CENTURYre = '(?P<century>\d{2,})' + YEARre = '(?P<year>\d{2})' + MONTHre = '(?P<month>\d{2})' + DAYre = '(?P<day>\d{2})' + HOURre = '(?P<hour>\d{2})' + MINUTEre = '(?P<minute>\d{2})' + SECONDre = '(?P<second>\d{2}(?:\.\d*)?)' + TIMEZONEre = '(?P<zulu>Z)|(?P<tzsign>[-+])(?P<tzhour>\d{2}):' \ + '(?P<tzminute>\d{2})' + BOSre = '^\s*' + EOSre = '\s*$' + + __allres = {'sign': SIGNre, 'century': CENTURYre, 'year': YEARre, + 'month': MONTHre, 'day': DAYre, 'hour': HOURre, + 'minute': MINUTEre, 'second': SECONDre, 'timezone': TIMEZONEre, + 'b': BOSre, 'e': EOSre} + + dateTime = '%(b)s%(sign)s%(century)s%(year)s-%(month)s-%(day)sT' \ + '%(hour)s:%(minute)s:%(second)s(%(timezone)s)?%(e)s' % __allres + timeInstant = dateTime + timePeriod = dateTime + time = '%(b)s%(hour)s:%(minute)s:%(second)s(%(timezone)s)?%(e)s' % \ + __allres + date = 
'%(b)s%(sign)s%(century)s%(year)s-%(month)s-%(day)s' \ + '(%(timezone)s)?%(e)s' % __allres + century = '%(b)s%(sign)s%(century)s(%(timezone)s)?%(e)s' % __allres + gYearMonth = '%(b)s%(sign)s%(century)s%(year)s-%(month)s' \ + '(%(timezone)s)?%(e)s' % __allres + gYear = '%(b)s%(sign)s%(century)s%(year)s(%(timezone)s)?%(e)s' % \ + __allres + year = gYear + gMonthDay = '%(b)s--%(month)s-%(day)s(%(timezone)s)?%(e)s' % __allres + recurringDate = gMonthDay + gDay = '%(b)s---%(day)s(%(timezone)s)?%(e)s' % __allres + recurringDay = gDay + gMonth = '%(b)s--%(month)s--(%(timezone)s)?%(e)s' % __allres + month = gMonth + + recurringInstant = '%(b)s%(sign)s(%(century)s|-)(%(year)s|-)-' \ + '(%(month)s|-)-(%(day)s|-)T' \ + '(%(hour)s|-):(%(minute)s|-):(%(second)s|-)' \ + '(%(timezone)s)?%(e)s' % __allres + + duration = '%(b)s%(sign)sP' \ + '((?P<year>\d+)Y)?' \ + '((?P<month>\d+)M)?' \ + '((?P<day>\d+)D)?' \ + '((?P<sep>T)' \ + '((?P<hour>\d+)H)?' \ + '((?P<minute>\d+)M)?' \ + '((?P<second>\d*(?:\.\d*)?)S)?)?%(e)s' % \ + __allres + + timeDuration = duration + + # The extra 31 on the front is: + # - so the tuple is 1-based + # - so months[month-1] is December's days if month is 1 + + months = (31, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + + def convertDateTime(self, value, kind): + def getZoneOffset(d): + zoffs = 0 + + try: + if d['zulu'] == None: + zoffs = 60 * int(d['tzhour']) + int(d['tzminute']) + if d['tzsign'] != '-': + zoffs = -zoffs + except TypeError: + pass + + return zoffs + + def applyZoneOffset(months, zoffs, date, minfield, posday = 1): + if zoffs == 0 and (minfield > 4 or 0 <= date[5] < 60): + return date + + if minfield > 5: date[5] = 0 + if minfield > 4: date[4] = 0 + + if date[5] < 0: + date[4] += int(date[5]) / 60 + date[5] %= 60 + + date[4] += zoffs + + if minfield > 3 or 0 <= date[4] < 60: return date + + date[3] += date[4] / 60 + date[4] %= 60 + + if minfield > 2 or 0 <= date[3] < 24: return date + + date[2] += date[3] / 24 + date[3] %= 24 + + if 
minfield > 1: + if posday and date[2] <= 0: + date[2] += 31 # zoffs is at most 99:59, so the + # day will never be less than -3 + return date + + while 1: + # The date[1] == 3 (instead of == 2) is because we're + # going back a month, so we need to know if the previous + # month is February, so we test if this month is March. + + leap = minfield == 0 and date[1] == 3 and \ + date[0] % 4 == 0 and \ + (date[0] % 100 != 0 or date[0] % 400 == 0) + + if 0 < date[2] <= months[date[1]] + leap: break + + date[2] += months[date[1] - 1] + leap + + date[1] -= 1 + + if date[1] > 0: break + + date[1] = 12 + + if minfield > 0: break + + date[0] -= 1 + + return date + + try: + exp = getattr(self.DATETIMECONSTS, kind) + except AttributeError: + return None + + if type(exp) == StringType: + exp = re.compile(exp) + setattr (self.DATETIMECONSTS, kind, exp) + + m = exp.search(value) + + try: + if m == None: + raise Exception + + d = m.groupdict() + f = ('century', 'year', 'month', 'day', + 'hour', 'minute', 'second') + fn = len(f) # Index of first non-None value + r = [] + + if kind in ('duration', 'timeDuration'): + if d['sep'] != None and d['hour'] == None and \ + d['minute'] == None and d['second'] == None: + raise Exception + + f = f[1:] + + for i in range(len(f)): + s = d[f[i]] + + if s != None: + if f[i] == 'second': + s = float(s) + else: + try: s = int(s) + except ValueError: s = long(s) + + if i < fn: fn = i + + r.append(s) + + if fn > len(r): # Any non-Nones? 
+ raise Exception + + if d['sign'] == '-': + r[fn] = -r[fn] + + return tuple(r) + + if kind == 'recurringInstant': + for i in range(len(f)): + s = d[f[i]] + + if s == None or s == '-': + if i > fn: + raise Exception + s = None + else: + if i < fn: + fn = i + + if f[i] == 'second': + s = float(s) + else: + try: + s = int(s) + except ValueError: + s = long(s) + + r.append(s) + + s = r.pop(0) + + if fn == 0: + r[0] += s * 100 + else: + fn -= 1 + + if fn < len(r) and d['sign'] == '-': + r[fn] = -r[fn] + + cleanDate(r, fn) + + return tuple(applyZoneOffset(self.DATETIMECONSTS.months, + getZoneOffset(d), r, fn, 0)) + + r = [0, 0, 1, 1, 0, 0, 0] + + for i in range(len(f)): + field = f[i] + + s = d.get(field) + + if s != None: + if field == 'second': + s = float(s) + else: + try: + s = int(s) + except ValueError: + s = long(s) + + if i < fn: + fn = i + + r[i] = s + + if fn > len(r): # Any non-Nones? + raise Exception + + s = r.pop(0) + + if fn == 0: + r[0] += s * 100 + else: + fn -= 1 + + if d.get('sign') == '-': + r[fn] = -r[fn] + + cleanDate(r, fn) + + zoffs = getZoneOffset(d) + + if zoffs: + r = applyZoneOffset(self.DATETIMECONSTS.months, zoffs, r, fn) + + if kind == 'century': + return r[0] / 100 + + s = [] + + for i in range(1, len(f)): + if d.has_key(f[i]): + s.append(r[i - 1]) + + if len(s) == 1: + return s[0] + return tuple(s) + except Exception, e: + raise Error, "invalid %s value `%s' - %s" % (kind, value, e) + + intlimits = \ + { + 'nonPositiveInteger': (0, None, 0), + 'non-positive-integer': (0, None, 0), + 'negativeInteger': (0, None, -1), + 'negative-integer': (0, None, -1), + 'long': (1, -9223372036854775808L, + 9223372036854775807L), + 'int': (0, -2147483648L, 2147483647), + 'short': (0, -32768, 32767), + 'byte': (0, -128, 127), + 'nonNegativeInteger': (0, 0, None), + 'non-negative-integer': (0, 0, None), + 'positiveInteger': (0, 1, None), + 'positive-integer': (0, 1, None), + 'unsignedLong': (1, 0, 18446744073709551615L), + 'unsignedInt': (0, 0, 
4294967295L), + 'unsignedShort': (0, 0, 65535), + 'unsignedByte': (0, 0, 255), + } + floatlimits = \ + { + 'float': (7.0064923216240861E-46, -3.4028234663852886E+38, + 3.4028234663852886E+38), + 'double': (2.4703282292062327E-324, -1.7976931348623158E+308, + 1.7976931348623157E+308), + } + zerofloatre = '[1-9]' + + + + + + def convertType(self, d, t, attrs, config=Config): + if t[0] is None and t[1] is not None: + type = t[1].strip() + if type[:9] == 'arrayType': + index_eq = type.find('=') + index_obr = type.find('[') + index_cbr = type.find(']') + elemtype = type[index_eq+1:index_obr] + elemnum = type[index_obr+1:index_cbr] + if elemtype=="ur-type": + return(d) + else: + newarr = map( lambda(di): + self.convertToBasicTypes(d=di, + t = ( NS.XSD, elemtype), + attrs=attrs, + config=config), + d) + return newarr + else: + t = (NS.XSD, t[1]) + + return self.convertToBasicTypes(d, t, attrs, config) + + + def convertToSOAPpyTypes(self, d, t, attrs, config=Config): + pass + + + def convertToBasicTypes(self, d, t, attrs, config=Config): + dnn = d or '' + + #if Config.debug: + #print "convertToBasicTypes:" + #print " requested_type=", t + #print " data=", d + + if t[0] in NS.EXSD_L: + if t[1] == "integer": + try: + d = int(d) + if len(attrs): + d = long(d) + except: + d = long(d) + return d + if self.intlimits.has_key (t[1]): # integer types + l = self.intlimits[t[1]] + try: d = int(d) + except: d = long(d) + + if l[1] != None and d < l[1]: + raise UnderflowError, "%s too small" % d + if l[2] != None and d > l[2]: + raise OverflowError, "%s too large" % d + + if l[0] or len(attrs): + return long(d) + return d + if t[1] == "string": + if len(attrs): + return unicode(dnn) + try: + return str(dnn) + except: + return dnn + if t[1] == "boolean": + d = d.strip().lower() + if d in ('0', 'false'): + return 0 + if d in ('1', 'true'): + return 1 + raise AttributeError, "invalid boolean value" + if t[1] in ('double','float'): + l = self.floatlimits[t[1]] + s = d.strip().lower() + + d 
= float(s) + + if config.strict_range: + if d < l[1]: raise UnderflowError + if d > l[2]: raise OverflowError + else: + # some older SOAP impementations (notably SOAP4J, + # Apache SOAP) return "infinity" instead of "INF" + # so check the first 3 characters for a match. + if s == "nan": + return fpconst.NaN + elif s[0:3] in ("inf", "+inf"): + return fpconst.PosInf + elif s[0:3] == "-inf": + return fpconst.NegInf + + if fpconst.isNaN(d): + if s != 'nan': + raise ValueError, "invalid %s: %s" % (t[1], s) + elif fpconst.isNegInf(d): + if s != '-inf': + raise UnderflowError, "%s too small: %s" % (t[1], s) + elif fpconst.isPosInf(d): + if s != 'inf': + raise OverflowError, "%s too large: %s" % (t[1], s) + elif d < 0 and d < l[1]: + raise UnderflowError, "%s too small: %s" % (t[1], s) + elif d > 0 and ( d < l[0] or d > l[2] ): + raise OverflowError, "%s too large: %s" % (t[1], s) + elif d == 0: + if type(self.zerofloatre) == StringType: + self.zerofloatre = re.compile(self.zerofloatre) + + if self.zerofloatre.search(s): + raise UnderflowError, "invalid %s: %s" % (t[1], s) + + return d + if t[1] in ("dateTime", "date", "timeInstant", "time"): + return self.convertDateTime(d, t[1]) + if t[1] == "decimal": + return float(d) + if t[1] in ("language", "QName", "NOTATION", "NMTOKEN", "Name", + "NCName", "ID", "IDREF", "ENTITY"): + return collapseWhiteSpace(d) + if t[1] in ("IDREFS", "ENTITIES", "NMTOKENS"): + d = collapseWhiteSpace(d) + return d.split() + if t[0] in NS.XSD_L: + if t[1] in ("base64", "base64Binary"): + if d: + return base64.decodestring(d) + else: + return '' + if t[1] == "hexBinary": + if d: + return decodeHexString(d) + else: + return + if t[1] == "anyURI": + return urllib.unquote(collapseWhiteSpace(d)) + if t[1] in ("normalizedString", "token"): + return collapseWhiteSpace(d) + if t[0] == NS.ENC: + if t[1] == "base64": + if d: + return base64.decodestring(d) + else: + return '' + if t[0] == NS.XSD: + if t[1] == "binary": + try: + e = attrs[(None, 'encoding')] 
+ + if d: + if e == 'hex': + return decodeHexString(d) + elif e == 'base64': + return base64.decodestring(d) + else: + return '' + except: + pass + + raise Error, "unknown or missing binary encoding" + if t[1] == "uri": + return urllib.unquote(collapseWhiteSpace(d)) + if t[1] == "recurringInstant": + return self.convertDateTime(d, t[1]) + if t[0] in (NS.XSD2, NS.ENC): + if t[1] == "uriReference": + return urllib.unquote(collapseWhiteSpace(d)) + if t[1] == "timePeriod": + return self.convertDateTime(d, t[1]) + if t[1] in ("century", "year"): + return self.convertDateTime(d, t[1]) + if t[0] in (NS.XSD, NS.XSD2, NS.ENC): + if t[1] == "timeDuration": + return self.convertDateTime(d, t[1]) + if t[0] == NS.XSD3: + if t[1] == "anyURI": + return urllib.unquote(collapseWhiteSpace(d)) + if t[1] in ("gYearMonth", "gMonthDay"): + return self.convertDateTime(d, t[1]) + if t[1] == "gYear": + return self.convertDateTime(d, t[1]) + if t[1] == "gMonth": + return self.convertDateTime(d, t[1]) + if t[1] == "gDay": + return self.convertDateTime(d, t[1]) + if t[1] == "duration": + return self.convertDateTime(d, t[1]) + if t[0] in (NS.XSD2, NS.XSD3): + if t[1] == "token": + return collapseWhiteSpace(d) + if t[1] == "recurringDate": + return self.convertDateTime(d, t[1]) + if t[1] == "month": + return self.convertDateTime(d, t[1]) + if t[1] == "recurringDay": + return self.convertDateTime(d, t[1]) + if t[0] == NS.XSD2: + if t[1] == "CDATA": + return collapseWhiteSpace(d) + + raise UnknownTypeError, "unknown type `%s'" % (str(t[0]) + ':' + t[1]) + + +################################################################################ +# call to SOAPParser that keeps all of the info +################################################################################ +def _parseSOAP(xml_str, rules = None): + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + + parser = xml.sax.make_parser() + t = SOAPParser(rules = rules) + parser.setContentHandler(t) + e 
= xml.sax.handler.ErrorHandler() + parser.setErrorHandler(e) + + inpsrc = xml.sax.xmlreader.InputSource() + inpsrc.setByteStream(StringIO(xml_str)) + + # turn on namespace mangeling + parser.setFeature(xml.sax.handler.feature_namespaces,1) + + try: + parser.parse(inpsrc) + except xml.sax.SAXParseException, e: + parser._parser = None + raise e + + return t + +################################################################################ +# SOAPParser's more public interface +################################################################################ +def parseSOAP(xml_str, attrs = 0): + t = _parseSOAP(xml_str) + + if attrs: + return t.body, t.attrs + return t.body + + +def parseSOAPRPC(xml_str, header = 0, body = 0, attrs = 0, rules = None): + + t = _parseSOAP(xml_str, rules = rules) + p = t.body[0] + + # Empty string, for RPC this translates into a void + if type(p) in (type(''), type(u'')) and p in ('', u''): + name = "Response" + for k in t.body.__dict__.keys(): + if k[0] != "_": + name = k + p = structType(name) + + if header or body or attrs: + ret = (p,) + if header : ret += (t.header,) + if body: ret += (t.body,) + if attrs: ret += (t.attrs,) + return ret + else: + return p diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAP.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAP.py new file mode 100644 index 0000000000000000000000000000000000000000..eb09c3b0669c7c7648695d0a8a3c060093a4dccc --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAP.py @@ -0,0 +1,40 @@ +"""This file is here for backward compatibility with versions <= 0.9.9 + +Delete when 1.0.0 is released! 
+""" + +ident = '$Id$' +from version import __version__ + +from Client import * +from Config import * +from Errors import * +from NS import * +from Parser import * +from SOAPBuilder import * +from Server import * +from Types import * +from Utilities import * +import wstools +import WSDL + +from warnings import warn + +warn(""" + +The sub-module SOAPpy.SOAP is deprecated and is only +provided for short-term backward compatibility. Objects are now +available directly within the SOAPpy module. Thus, instead of + + from SOAPpy import SOAP + ... + SOAP.SOAPProxy(...) + +use + + from SOAPpy import SOAPProxy + ... + SOAPProxy(...) + +instead. +""", DeprecationWarning) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAPBuilder.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAPBuilder.py new file mode 100644 index 0000000000000000000000000000000000000000..536c88b65fb839860e01bdfeeaa399e07f7c4598 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/SOAPBuilder.py @@ -0,0 +1,634 @@ +""" +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + +import cgi +import copy +from wstools.XMLname import toXMLname, fromXMLname +import fpconst + +# SOAPpy modules +from Config import Config +from NS import NS +from Types import * + +# Test whether this Python version has Types.BooleanType +# If it doesn't have it, then False and True are serialized as integers +try: + BooleanType + pythonHasBooleanType = 1 +except NameError: + pythonHasBooleanType = 0 + +################################################################################ +# SOAP Builder +################################################################################ +class SOAPBuilder: + _xml_top = '<?xml version="1.0"?>\n' + _xml_enc_top = '<?xml version="1.0" encoding="%s"?>\n' + _env_top = ( '%(ENV_T)s:Envelope\n' + \ + ' %(ENV_T)s:encodingStyle="%(ENC)s"\n' ) % \ + NS.__dict__ + _env_bot = '</%(ENV_T)s:Envelope>\n' % NS.__dict__ + + # Namespaces potentially defined in the Envelope tag. 
+ + _env_ns = {NS.ENC: NS.ENC_T, NS.ENV: NS.ENV_T, + NS.XSD: NS.XSD_T, NS.XSD2: NS.XSD2_T, NS.XSD3: NS.XSD3_T, + NS.XSI: NS.XSI_T, NS.XSI2: NS.XSI2_T, NS.XSI3: NS.XSI3_T} + + def __init__(self, args = (), kw = {}, method = None, namespace = None, + header = None, methodattrs = None, envelope = 1, encoding = 'UTF-8', + use_refs = 0, config = Config, noroot = 0): + + # Test the encoding, raising an exception if it's not known + if encoding != None: + ''.encode(encoding) + + self.args = args + self.kw = kw + self.envelope = envelope + self.encoding = encoding + self.method = method + self.namespace = namespace + self.header = header + self.methodattrs= methodattrs + self.use_refs = use_refs + self.config = config + self.out = [] + self.tcounter = 0 + self.ncounter = 1 + self.icounter = 1 + self.envns = {} + self.ids = {} + self.depth = 0 + self.multirefs = [] + self.multis = 0 + self.body = not isinstance(args, bodyType) + self.noroot = noroot + + def build(self): + if Config.debug: print "In build." + ns_map = {} + + # Cache whether typing is on or not + typed = self.config.typed + + if self.header: + # Create a header. + self.dump(self.header, "Header", typed = typed) + #self.header = None # Wipe it out so no one is using it. + + if self.body: + # Call genns to record that we've used SOAP-ENV. 
+ self.depth += 1 + body_ns = self.genns(ns_map, NS.ENV)[0] + self.out.append("<%sBody>\n" % body_ns) + + if self.method: + # Save the NS map so that it can be restored when we + # fall out of the scope of the method definition + save_ns_map = ns_map.copy() + self.depth += 1 + a = '' + if self.methodattrs: + for (k, v) in self.methodattrs.items(): + a += ' %s="%s"' % (k, v) + + if self.namespace: # Use the namespace info handed to us + methodns, n = self.genns(ns_map, self.namespace) + else: + methodns, n = '', '' + + self.out.append('<%s%s%s%s%s>\n' % ( + methodns, self.method, n, a, self.genroot(ns_map))) + + try: + if type(self.args) != TupleType: + args = (self.args,) + else: + args = self.args + + for i in args: + self.dump(i, typed = typed, ns_map = ns_map) + + if hasattr(self.config, "argsOrdering") and self.config.argsOrdering.has_key(self.method): + for k in self.config.argsOrdering.get(self.method): + self.dump(self.kw.get(k), k, typed = typed, ns_map = ns_map) + else: + for (k, v) in self.kw.items(): + self.dump(v, k, typed = typed, ns_map = ns_map) + + except RecursionError: + if self.use_refs == 0: + # restart + b = SOAPBuilder(args = self.args, kw = self.kw, + method = self.method, namespace = self.namespace, + header = self.header, methodattrs = self.methodattrs, + envelope = self.envelope, encoding = self.encoding, + use_refs = 1, config = self.config) + return b.build() + raise + + if self.method: + self.out.append("</%s%s>\n" % (methodns, self.method)) + # End of the method definition; drop any local namespaces + ns_map = save_ns_map + self.depth -= 1 + + if self.body: + # dump may add to self.multirefs, but the for loop will keep + # going until it has used all of self.multirefs, even those + # entries added while in the loop. 
+ + self.multis = 1 + + for obj, tag in self.multirefs: + self.dump(obj, tag, typed = typed, ns_map = ns_map) + + self.out.append("</%sBody>\n" % body_ns) + self.depth -= 1 + + if self.envelope: + e = map (lambda ns: ' xmlns:%s="%s"\n' % (ns[1], ns[0]), + self.envns.items()) + + self.out = ['<', self._env_top] + e + ['>\n'] + \ + self.out + \ + [self._env_bot] + + if self.encoding != None: + self.out.insert(0, self._xml_enc_top % self.encoding) + return ''.join(self.out).encode(self.encoding) + + self.out.insert(0, self._xml_top) + return ''.join(self.out) + + def gentag(self): + if Config.debug: print "In gentag." + self.tcounter += 1 + return "v%d" % self.tcounter + + def genns(self, ns_map, nsURI): + if nsURI == None: + return ('', '') + + if type(nsURI) == TupleType: # already a tuple + if len(nsURI) == 2: + ns, nsURI = nsURI + else: + ns, nsURI = None, nsURI[0] + else: + ns = None + + if ns_map.has_key(nsURI): + return (ns_map[nsURI] + ':', '') + + if self._env_ns.has_key(nsURI): + ns = self.envns[nsURI] = ns_map[nsURI] = self._env_ns[nsURI] + return (ns + ':', '') + + if not ns: + ns = "ns%d" % self.ncounter + self.ncounter += 1 + ns_map[nsURI] = ns + if self.config.buildWithNamespacePrefix: + return (ns + ':', ' xmlns:%s="%s"' % (ns, nsURI)) + else: + return ('', ' xmlns="%s"' % (nsURI)) + + def genroot(self, ns_map): + if self.noroot: + return '' + + if self.depth != 2: + return '' + + ns, n = self.genns(ns_map, NS.ENC) + return ' %sroot="%d"%s' % (ns, not self.multis, n) + + # checkref checks an element to see if it needs to be encoded as a + # multi-reference element or not. If it returns None, the element has + # been handled and the caller can continue with subsequent elements. + # If it returns a string, the string should be included in the opening + # tag of the marshaled element. 
+ + def checkref(self, obj, tag, ns_map): + if self.depth < 2: + return '' + + if not self.ids.has_key(id(obj)): + n = self.ids[id(obj)] = self.icounter + self.icounter = n + 1 + + if self.use_refs == 0: + return '' + + if self.depth == 2: + return ' id="i%d"' % n + + self.multirefs.append((obj, tag)) + else: + if self.use_refs == 0: + raise RecursionError, "Cannot serialize recursive object" + + n = self.ids[id(obj)] + + if self.multis and self.depth == 2: + return ' id="i%d"' % n + + self.out.append('<%s href="#i%d"%s/>\n' % + (tag, n, self.genroot(ns_map))) + return None + + # dumpers + + def dump(self, obj, tag = None, typed = 1, ns_map = {}): + if Config.debug: print "In dump.", "obj=", obj + ns_map = ns_map.copy() + self.depth += 1 + + if type(tag) not in (NoneType, StringType, UnicodeType): + raise KeyError, "tag must be a string or None" + + try: + meth = getattr(self, "dump_" + type(obj).__name__) + except AttributeError: + if type(obj) == LongType: + obj_type = "integer" + elif pythonHasBooleanType and type(obj) == BooleanType: + obj_type = "boolean" + else: + obj_type = type(obj).__name__ + + self.out.append(self.dumper(None, obj_type, obj, tag, typed, + ns_map, self.genroot(ns_map))) + else: + meth(obj, tag, typed, ns_map) + + + self.depth -= 1 + + # generic dumper + def dumper(self, nsURI, obj_type, obj, tag, typed = 1, ns_map = {}, + rootattr = '', id = '', + xml = '<%(tag)s%(type)s%(id)s%(attrs)s%(root)s>%(data)s</%(tag)s>\n'): + if Config.debug: print "In dumper." 
+ + if nsURI == None: + nsURI = self.config.typesNamespaceURI + + tag = tag or self.gentag() + + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + a = n = t = '' + if typed and obj_type: + ns, n = self.genns(ns_map, nsURI) + ins = self.genns(ns_map, self.config.schemaNamespaceURI)[0] + t = ' %stype="%s%s"%s' % (ins, ns, obj_type, n) + + try: a = obj._marshalAttrs(ns_map, self) + except: pass + + try: data = obj._marshalData() + except: + if (obj_type != "string"): # strings are already encoded + data = cgi.escape(str(obj)) + else: + data = obj + + + return xml % {"tag": tag, "type": t, "data": data, "root": rootattr, + "id": id, "attrs": a} + + def dump_float(self, obj, tag, typed = 1, ns_map = {}): + if Config.debug: print "In dump_float." + tag = tag or self.gentag() + + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + if Config.strict_range: + doubleType(obj) + + if fpconst.isPosInf(obj): + obj = "INF" + elif fpconst.isNegInf(obj): + obj = "-INF" + elif fpconst.isNaN(obj): + obj = "NaN" + else: + obj = repr(obj) + + # Note: python 'float' is actually a SOAP 'double'. + self.out.append(self.dumper(None, "double", obj, tag, typed, ns_map, + self.genroot(ns_map))) + + def dump_string(self, obj, tag, typed = 0, ns_map = {}): + if Config.debug: print "In dump_string." + tag = tag or self.gentag() + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + id = self.checkref(obj, tag, ns_map) + if id == None: + return + + try: data = obj._marshalData() + except: data = obj + + self.out.append(self.dumper(None, "string", cgi.escape(data), tag, + typed, ns_map, self.genroot(ns_map), id)) + + dump_str = dump_string # For Python 2.2+ + dump_unicode = dump_string + + def dump_None(self, obj, tag, typed = 0, ns_map = {}): + if Config.debug: print "In dump_None." 
+ tag = tag or self.gentag() + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + ns = self.genns(ns_map, self.config.schemaNamespaceURI)[0] + + self.out.append('<%s %snull="1"%s/>\n' % + (tag, ns, self.genroot(ns_map))) + + dump_NoneType = dump_None # For Python 2.2+ + + def dump_list(self, obj, tag, typed = 1, ns_map = {}): + if Config.debug: print "In dump_list.", "obj=", obj + tag = tag or self.gentag() + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + if type(obj) == InstanceType: + data = obj.data + else: + data = obj + + if typed: + id = self.checkref(obj, tag, ns_map) + if id == None: + return + + try: + sample = data[0] + empty = 0 + except: + # preserve type if present + if getattr(obj,"_typed",None) and getattr(obj,"_type",None): + if getattr(obj, "_complexType", None): + sample = typedArrayType(typed=obj._type, + complexType = obj._complexType) + sample._typename = obj._type + if not getattr(obj,"_ns",None): obj._ns = NS.URN + else: + sample = typedArrayType(typed=obj._type) + else: + sample = structType() + empty = 1 + + # First scan list to see if all are the same type + same_type = 1 + + if not empty: + for i in data[1:]: + if type(sample) != type(i) or \ + (type(sample) == InstanceType and \ + sample.__class__ != i.__class__): + same_type = 0 + break + + ndecl = '' + if same_type: + if (isinstance(sample, structType)) or \ + type(sample) == DictType or \ + (isinstance(sample, anyType) and \ + (getattr(sample, "_complexType", None) and \ + sample._complexType)): # force to urn struct + try: + tns = obj._ns or NS.URN + except: + tns = NS.URN + + ns, ndecl = self.genns(ns_map, tns) + + try: + typename = sample._typename + except: + typename = "SOAPStruct" + + t = ns + typename + + elif isinstance(sample, anyType): + ns = sample._validNamespaceURI(self.config.typesNamespaceURI, + self.config.strictNamespaces) + if ns: + ns, ndecl = self.genns(ns_map, ns) + t = ns + str(sample._type) + else: + t = 'ur-type' + else: + 
typename = type(sample).__name__ + + # For Python 2.2+ + if type(sample) == StringType: typename = 'string' + + # HACK: unicode is a SOAP string + if type(sample) == UnicodeType: typename = 'string' + + # HACK: python 'float' is actually a SOAP 'double'. + if typename=="float": typename="double" + t = self.genns(ns_map, self.config.typesNamespaceURI)[0] + \ + typename + + else: + t = self.genns(ns_map, self.config.typesNamespaceURI)[0] + \ + "ur-type" + + try: a = obj._marshalAttrs(ns_map, self) + except: a = '' + + ens, edecl = self.genns(ns_map, NS.ENC) + ins, idecl = self.genns(ns_map, self.config.schemaNamespaceURI) + + if typed: + self.out.append( + '<%s %sarrayType="%s[%d]" %stype="%sArray"%s%s%s%s%s%s>\n' % + (tag, ens, t, len(data), ins, ens, ndecl, edecl, idecl, + self.genroot(ns_map), id, a)) + + if typed: + try: elemsname = obj._elemsname + except: elemsname = "item" + else: + elemsname = tag + + for i in data: + self.dump(i, elemsname, not same_type, ns_map) + + if typed: self.out.append('</%s>\n' % tag) + + dump_tuple = dump_list + + def dump_dictionary(self, obj, tag, typed = 1, ns_map = {}): + if Config.debug: print "In dump_dictionary." + tag = tag or self.gentag() + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + id = self.checkref(obj, tag, ns_map) + if id == None: + return + + try: a = obj._marshalAttrs(ns_map, self) + except: a = '' + + self.out.append('<%s%s%s%s>\n' % + (tag, id, a, self.genroot(ns_map))) + + for (k, v) in obj.items(): + if k[0] != "_": + self.dump(v, k, 1, ns_map) + + self.out.append('</%s>\n' % tag) + + dump_dict = dump_dictionary # For Python 2.2+ + + def dump_instance(self, obj, tag, typed = 1, ns_map = {}): + if Config.debug: print "In dump_instance.", "obj=", obj, "tag=", tag + if not tag: + # If it has a name use it. 
+ if isinstance(obj, anyType) and obj._name: + tag = obj._name + else: + tag = self.gentag() + tag = toXMLname(tag) # convert from SOAP 1.2 XML name encoding + + if isinstance(obj, arrayType): # Array + self.dump_list(obj, tag, typed, ns_map) + return + + if isinstance(obj, faultType): # Fault + cns, cdecl = self.genns(ns_map, NS.ENC) + vns, vdecl = self.genns(ns_map, NS.ENV) + self.out.append('''<%sFault %sroot="1"%s%s> +<faultcode>%s</faultcode> +<faultstring>%s</faultstring> +''' % (vns, cns, vdecl, cdecl, obj.faultcode, obj.faultstring)) + if hasattr(obj, "detail"): + self.dump(obj.detail, "detail", typed, ns_map) + self.out.append("</%sFault>\n" % vns) + return + + r = self.genroot(ns_map) + + try: a = obj._marshalAttrs(ns_map, self) + except: a = '' + + if isinstance(obj, voidType): # void + self.out.append("<%s%s%s></%s>\n" % (tag, a, r, tag)) + return + + id = self.checkref(obj, tag, ns_map) + if id == None: + return + + if isinstance(obj, structType): + # Check for namespace + ndecl = '' + ns = obj._validNamespaceURI(self.config.typesNamespaceURI, + self.config.strictNamespaces) + if ns: + ns, ndecl = self.genns(ns_map, ns) + tag = ns + tag + self.out.append("<%s%s%s%s%s>\n" % (tag, ndecl, id, a, r)) + + keylist = obj.__dict__.keys() + + # first write out items with order information + if hasattr(obj, '_keyord'): + for i in range(len(obj._keyord)): + self.dump(obj._aslist(i), obj._keyord[i], 1, ns_map) + keylist.remove(obj._keyord[i]) + + # now write out the rest + for k in keylist: + if (k[0] != "_"): + self.dump(getattr(obj,k), k, 1, ns_map) + + if isinstance(obj, bodyType): + self.multis = 1 + + for v, k in self.multirefs: + self.dump(v, k, typed = typed, ns_map = ns_map) + + self.out.append('</%s>\n' % tag) + + elif isinstance(obj, anyType): + t = '' + + if typed: + ns = obj._validNamespaceURI(self.config.typesNamespaceURI, + self.config.strictNamespaces) + if ns: + ons, ondecl = self.genns(ns_map, ns) + ins, indecl = self.genns(ns_map, + 
self.config.schemaNamespaceURI) + t = ' %stype="%s%s"%s%s' % \ + (ins, ons, obj._type, ondecl, indecl) + + self.out.append('<%s%s%s%s%s>%s</%s>\n' % + (tag, t, id, a, r, obj._marshalData(), tag)) + + else: # Some Class + self.out.append('<%s%s%s>\n' % (tag, id, r)) + + for (k, v) in obj.__dict__.items(): + if k[0] != "_": + self.dump(v, k, 1, ns_map) + + self.out.append('</%s>\n' % tag) + + +################################################################################ +# SOAPBuilder's more public interface +################################################################################ + +def buildSOAP(args=(), kw={}, method=None, namespace=None, + header=None, methodattrs=None, envelope=1, encoding='UTF-8', + config=Config, noroot = 0): + t = SOAPBuilder(args=args, kw=kw, method=method, namespace=namespace, + header=header, methodattrs=methodattrs,envelope=envelope, + encoding=encoding, config=config,noroot=noroot) + return t.build() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Server.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Server.py new file mode 100644 index 0000000000000000000000000000000000000000..90f2aacda66e0458808b0c0569363f3a99f681f3 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Server.py @@ -0,0 +1,705 @@ +""" +################################################################################ +# +# SOAPpy - Cayce Ullman (cayce@actzero.com) +# Brian Matthews (blm@actzero.com) +# Gregory Warnes (Gregory.R.Warnes@Pfizer.com) +# Christopher Blunck (blunck@gst.com) +# +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + + +#import xml.sax +import re +import socket +import sys +import SocketServer +from types import * +import BaseHTTPServer +import thread + +# SOAPpy modules +from Parser import parseSOAPRPC +from Config import Config +from Types import faultType, voidType, simplify +from NS import NS +from SOAPBuilder import buildSOAP +from Utilities import debugHeader, debugFooter + +try: from M2Crypto import SSL +except: pass + +ident = '$Id$' + +from version import __version__ + +################################################################################ +# Call context dictionary +################################################################################ + +_contexts = dict() + +def GetSOAPContext(): + global _contexts + return _contexts[thread.get_ident()] + +################################################################################ +# Server +################################################################################ + +# Method Signature class for adding extra info to registered funcs, right now +# used just to indicate it should be called with keywords, instead of ordered +# params. +class MethodSig: + def __init__(self, func, keywords=0, context=0): + self.func = func + self.keywords = keywords + self.context = context + self.__name__ = func.__name__ + + def __call__(self, *args, **kw): + return apply(self.func,args,kw) + +class SOAPContext: + def __init__(self, header, body, attrs, xmldata, connection, httpheaders, + soapaction): + + self.header = header + self.body = body + self.attrs = attrs + self.xmldata = xmldata + self.connection = connection + self.httpheaders= httpheaders + self.soapaction = soapaction + +# A class to describe how header messages are handled +class HeaderHandler: + # Initially fail out if there are any problems. 
+ def __init__(self, header, attrs): + for i in header.__dict__.keys(): + if i[0] == "_": + continue + + d = getattr(header, i) + + try: + fault = int(attrs[id(d)][(NS.ENV, 'mustUnderstand')]) + except: + fault = 0 + + if fault: + raise faultType, ("%s:MustUnderstand" % NS.ENV_T, + "Required Header Misunderstood", + "%s" % i) + +################################################################################ +# SOAP Server +################################################################################ +class SOAPServerBase: + + def get_request(self): + sock, addr = SocketServer.TCPServer.get_request(self) + + if self.ssl_context: + sock = SSL.Connection(self.ssl_context, sock) + sock._setup_ssl(addr) + if sock.accept_ssl() != 1: + raise socket.error, "Couldn't accept SSL connection" + + return sock, addr + + def registerObject(self, object, namespace = '', path = ''): + if namespace == '' and path == '': namespace = self.namespace + if namespace == '' and path != '': + namespace = path.replace("/", ":") + if namespace[0] == ":": namespace = namespace[1:] + self.objmap[namespace] = object + + def registerFunction(self, function, namespace = '', funcName = None, + path = ''): + if not funcName : funcName = function.__name__ + if namespace == '' and path == '': namespace = self.namespace + if namespace == '' and path != '': + namespace = path.replace("/", ":") + if namespace[0] == ":": namespace = namespace[1:] + if self.funcmap.has_key(namespace): + self.funcmap[namespace][funcName] = function + else: + self.funcmap[namespace] = {funcName : function} + + def registerKWObject(self, object, namespace = '', path = ''): + if namespace == '' and path == '': namespace = self.namespace + if namespace == '' and path != '': + namespace = path.replace("/", ":") + if namespace[0] == ":": namespace = namespace[1:] + for i in dir(object.__class__): + if i[0] != "_" and callable(getattr(object, i)): + self.registerKWFunction(getattr(object,i), namespace) + + # convenience - 
wraps your func for you.
    def registerKWFunction(self, function, namespace = '', funcName = None,
                           path = ''):
        # Wraps `function` in a MethodSig with keywords=1 before registering.
        if namespace == '' and path == '': namespace = self.namespace
        if namespace == '' and path != '':
            namespace = path.replace("/", ":")
            if namespace[0] == ":": namespace = namespace[1:]
        self.registerFunction(MethodSig(function,keywords=1), namespace,
                              funcName)

    def unregisterObject(self, object, namespace = '', path = ''):
        # Removes a previously registered object; raises KeyError if absent.
        if namespace == '' and path == '': namespace = self.namespace
        if namespace == '' and path != '':
            namespace = path.replace("/", ":")
            if namespace[0] == ":": namespace = namespace[1:]

        del self.objmap[namespace]

class SOAPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    def version_string(self):
        # Advertised in the HTTP Server: header (as an HTML link).
        return '<a href="http://pywebsvcs.sf.net">' + \
            'SOAPpy ' + __version__ + '</a> (Python ' + \
            sys.version.split()[0] + ')'

    def date_time_string(self):
        # Remember the last Date: value so the debug dumps below can echo it.
        self.__last_date_time_string = \
            BaseHTTPServer.BaseHTTPRequestHandler.\
            date_time_string(self)

        return self.__last_date_time_string

    def do_POST(self):
        """Parse an incoming SOAP-RPC request, dispatch it to a registered
        function or object method, and write back a SOAP response (or a SOAP
        fault with HTTP status 500)."""
        global _contexts

        status = 500
        try:
            if self.server.config.dumpHeadersIn:
                s = 'Incoming HTTP headers'
                debugHeader(s)
                print self.raw_requestline.strip()
                print "\n".join(map (lambda x: x.strip(),
                                     self.headers.headers))
                debugFooter(s)

            data = self.rfile.read(int(self.headers["Content-length"]))

            if self.server.config.dumpSOAPIn:
                s = 'Incoming SOAP'
                debugHeader(s)
                print data,
                if data[-1] != '\n':
                    print
                debugFooter(s)

            (r, header, body, attrs) = \
                parseSOAPRPC(data, header = 1, body = 1, attrs = 1)

            method = r._name
            args   = r._aslist()
            kw     = r._asdict()

            if Config.simplify_objects:
                args = simplify(args)
                kw = simplify(kw)

            # Handle mixed named and unnamed arguments by assuming
            # that all arguments with names of the form "v[0-9]+"
            # are unnamed and should be passed in numeric order,
            # other arguments are named and should be passed using
            # this name.

            # This is a non-standard extension to the SOAP protocol,
            # but is supported by Apache AXIS.

            # It is enabled by default.  To disable, set
            # Config.specialArgs to False.

            if Config.specialArgs:

                ordered_args = {}
                named_args   = {}

                for (k,v) in  kw.items():

                    if k[0]=="v":
                        try:
                            i = int(k[1:])
                            ordered_args[i] = v
                        except ValueError:
                            named_args[str(k)] = v

                    else:
                        named_args[str(k)] = v

            # We have to decide namespace precedence
            # I'm happy with the following scenario
            # if r._ns is specified use it, if not check for
            # a path, if it's specified convert it and use it as the
            # namespace. If both are specified, use r._ns.

            ns = r._ns

            if len(self.path) > 1 and not ns:
                ns = self.path.replace("/", ":")
                if ns[0] == ":": ns = ns[1:]

            # authorization method
            a = None

            keylist = ordered_args.keys()
            keylist.sort()

            # create list in proper order w/o names
            tmp = map( lambda x: ordered_args[x], keylist)
            ordered_args = tmp

            #print '<-> Argument Matching Yielded:'
            #print '<-> Ordered Arguments:' + str(ordered_args)
            #print '<-> Named Arguments  :' + str(named_args)

            resp = ""

            # For fault messages
            if ns:
                nsmethod = "%s:%s" % (ns, method)
            else:
                nsmethod = method

            try:
                # First look for registered functions
                if self.server.funcmap.has_key(ns) and \
                   self.server.funcmap[ns].has_key(method):
                    f = self.server.funcmap[ns][method]

                    # look for the authorization method
                    if self.server.config.authMethod != None:
                        authmethod = self.server.config.authMethod
                        if self.server.funcmap.has_key(ns) and \
                           self.server.funcmap[ns].has_key(authmethod):
                            a = self.server.funcmap[ns][authmethod]
                else:
                    # Now look at registered objects
                    # Check for nested attributes. This works even if
                    # there are none, because the split will return
                    # [method]
                    f = self.server.objmap[ns]

                    # Look for the authorization method
                    if self.server.config.authMethod != None:
                        authmethod = self.server.config.authMethod
                        if hasattr(f, authmethod):
                            a = getattr(f, authmethod)

                    # then continue looking for the method
                    l = method.split(".")
                    for i in l:
                        f = getattr(f, i)
            except:
                # Lookup failed: answer with a Client "Method Not Found" fault.
                info = sys.exc_info()
                try:
                    resp = buildSOAP(faultType("%s:Client" % NS.ENV_T,
                                               "Method Not Found",
                                               "%s : %s %s %s" % (nsmethod,
                                                                  info[0],
                                                                  info[1],
                                                                  info[2])),
                                     encoding = self.server.encoding,
                                     config = self.server.config)
                finally:
                    del info
                status = 500
            else:
                try:
                    if header:
                        x = HeaderHandler(header, attrs)

                    fr = 1

                    # call context book keeping
                    # We're stuffing the method into the soapaction if there
                    # isn't one, someday, we'll set that on the client
                    # and it won't be necessary here
                    # for now we're doing both

                    if "SOAPAction".lower() not in self.headers.keys() or \
                       self.headers["SOAPAction"] == "\"\"":
                        self.headers["SOAPAction"] = method

                    thread_id = thread.get_ident()
                    _contexts[thread_id] = SOAPContext(header, body,
                                                       attrs, data,
                                                       self.connection,
                                                       self.headers,
                                                       self.headers["SOAPAction"])

                    # Do an authorization check
                    if a != None:
                        if not apply(a, (), {"_SOAPContext" :
                                             _contexts[thread_id] }):
                            raise faultType("%s:Server" % NS.ENV_T,
                                            "Authorization failed.",
                                            "%s" % nsmethod)

                    # If it's wrapped, some special action may be needed
                    if isinstance(f, MethodSig):
                        c = None

                        if f.context:  # retrieve context object
                            c = _contexts[thread_id]

                        if Config.specialArgs:
                            if c:
                                named_args["_SOAPContext"] = c
                            fr = apply(f, ordered_args, named_args)
                        elif f.keywords:
                            # This is lame, but have to de-unicode
                            # keywords

                            strkw = {}

                            for (k, v) in kw.items():
                                strkw[str(k)] = v
                            if c:
                                strkw["_SOAPContext"] = c
                            fr = apply(f, (), strkw)
                        elif c:
                            fr = apply(f, args, {'_SOAPContext':c})
                        else:
                            fr = apply(f, args, {})

                    else:
                        if Config.specialArgs:
                            fr = apply(f, ordered_args, named_args)
                        else:
                            fr = apply(f, args, {})


                    if type(fr) == type(self) and \
                       isinstance(fr, voidType):
                        resp = buildSOAP(kw = {'%sResponse' % method: fr},
                                         encoding = self.server.encoding,
                                         config = self.server.config)
                    else:
                        resp = buildSOAP(kw =
                                         {'%sResponse' % method: {'Result': fr}},
                                         encoding = self.server.encoding,
                                         config = self.server.config)

                    # Clean up _contexts
                    if _contexts.has_key(thread_id):
                        del _contexts[thread_id]

                except Exception, e:
                    # The method itself raised: report a Server fault.
                    import traceback
                    info = sys.exc_info()

                    try:
                        if self.server.config.dumpFaultInfo:
                            s = 'Method %s exception' % nsmethod
                            debugHeader(s)
                            traceback.print_exception(info[0], info[1],
                                                      info[2])
                            debugFooter(s)

                        if isinstance(e, faultType):
                            f = e
                        else:
                            f = faultType("%s:Server" % NS.ENV_T,
                                          "Method Failed",
                                          "%s" % nsmethod)

                        if self.server.config.returnFaultInfo:
                            f._setDetail("".join(traceback.format_exception(
                                info[0], info[1], info[2])))
                        elif not hasattr(f, 'detail'):
                            f._setDetail("%s %s" % (info[0], info[1]))
                    finally:
                        del info

                    resp = buildSOAP(f, encoding = self.server.encoding,
                                     config = self.server.config)
                    status = 500
                else:
                    status = 200
        except faultType, e:
            # A fault raised during parsing/dispatch bookkeeping above.
            import traceback
            info = sys.exc_info()
            try:
                if self.server.config.dumpFaultInfo:
                    s = 'Received fault exception'
                    debugHeader(s)
                    traceback.print_exception(info[0], info[1],
                                              info[2])
                    debugFooter(s)

                if self.server.config.returnFaultInfo:
                    e._setDetail("".join(traceback.format_exception(
                        info[0], info[1], info[2])))
                elif not hasattr(e, 'detail'):
                    e._setDetail("%s %s" % (info[0], info[1]))
            finally:
                del info

            resp = buildSOAP(e, encoding = self.server.encoding,
                             config = self.server.config)
            status = 500
        except Exception, e:
            # internal error, report as HTTP server error

            if self.server.config.dumpFaultInfo:
                s = 'Internal exception %s' % e
                import traceback
                debugHeader(s)
                info = sys.exc_info()
                try:
                    traceback.print_exception(info[0], info[1], info[2])
                finally:
                    del info

                debugFooter(s)

            self.send_response(500)
            self.end_headers()

            if self.server.config.dumpHeadersOut and \
               self.request_version != 'HTTP/0.9':
                s = 'Outgoing HTTP headers'
                debugHeader(s)
                if self.responses.has_key(status):
                    s = ' ' + self.responses[status][0]
                else:
                    s = ''
                print "%s %d%s" % (self.protocol_version, 500, s)
                print "Server:", self.version_string()
                print "Date:", self.__last_date_time_string
                debugFooter(s)
        else:
            # got a valid SOAP response
            self.send_response(status)

            t = 'text/xml';
            if self.server.encoding != None:
                t += '; charset="%s"' % self.server.encoding
            self.send_header("Content-type", t)
            self.send_header("Content-length", str(len(resp)))
            self.end_headers()

            if self.server.config.dumpHeadersOut and \
               self.request_version != 'HTTP/0.9':
                s = 'Outgoing HTTP headers'
                debugHeader(s)
                if self.responses.has_key(status):
                    s = ' ' + self.responses[status][0]
                else:
                    s = ''
                print "%s %d%s" % (self.protocol_version, status, s)
                print "Server:", self.version_string()
                print "Date:", self.__last_date_time_string
                print "Content-type:", t
                print "Content-length:", len(resp)
                debugFooter(s)

            if self.server.config.dumpSOAPOut:
                s = 'Outgoing SOAP'
                debugHeader(s)
                print resp,
                if resp[-1] != '\n':
                    print
                debugFooter(s)

            self.wfile.write(resp)
            self.wfile.flush()

            # We should be able to shut down both a regular and an SSL
            # connection, but under Python 2.1, calling shutdown on an
            # SSL connections drops the output, so this work-around.
            # This should be investigated more someday.
+ + if self.server.config.SSLserver and \ + isinstance(self.connection, SSL.Connection): + self.connection.set_shutdown(SSL.SSL_SENT_SHUTDOWN | + SSL.SSL_RECEIVED_SHUTDOWN) + else: + self.connection.shutdown(1) + + def do_GET(self): + + #print 'command ', self.command + #print 'path ', self.path + #print 'request_version', self.request_version + #print 'headers' + #print ' type ', self.headers.type + #print ' maintype', self.headers.maintype + #print ' subtype ', self.headers.subtype + #print ' params ', self.headers.plist + + path = self.path.lower() + if path.endswith('wsdl'): + method = 'wsdl' + function = namespace = None + if self.server.funcmap.has_key(namespace) \ + and self.server.funcmap[namespace].has_key(method): + function = self.server.funcmap[namespace][method] + else: + if namespace in self.server.objmap.keys(): + function = self.server.objmap[namespace] + l = method.split(".") + for i in l: + function = getattr(function, i) + + if function: + self.send_response(200) + self.send_header("Content-type", 'text/plain') + self.end_headers() + response = apply(function, ()) + self.wfile.write(str(response)) + return + + # return error + self.send_response(200) + self.send_header("Content-type", 'text/html') + self.end_headers() + self.wfile.write('''\ +<title> +<head>Error!</head> +</title> + +<body> +<h1>Oops!</h1> + +<p> + This server supports HTTP GET requests only for the the purpose of + obtaining Web Services Description Language (WSDL) for a specific + service. + + Either you requested an URL that does not end in "wsdl" or this + server does not implement a wsdl method. 
</p>


</body>''')


    def log_message(self, format, *args):
        # Standard BaseHTTPServer access logging, gated on the server's
        # ``log`` flag so it can be silenced entirely.
        if self.server.log:
            BaseHTTPServer.BaseHTTPRequestHandler.\
            log_message (self, format, *args)



class SOAPServer(SOAPServerBase, SocketServer.TCPServer):
    # Single-threaded SOAP server: one request at a time over TCP.

    def __init__(self, addr = ('localhost', 8000),
        RequestHandler = SOAPRequestHandler, log = 0, encoding = 'UTF-8',
        config = Config, namespace = None, ssl_context = None):

        # Test the encoding, raising an exception if it's not known
        if encoding != None:
            ''.encode(encoding)

        if ssl_context != None and not config.SSLserver:
            raise AttributeError, \
                "SSL server not supported by this Python installation"

        self.namespace          = namespace
        self.objmap             = {}
        self.funcmap            = {}
        self.ssl_context        = ssl_context
        self.encoding           = encoding
        self.config             = config
        self.log                = log

        self.allow_reuse_address= 1

        SocketServer.TCPServer.__init__(self, addr, RequestHandler)


class ThreadingSOAPServer(SOAPServerBase, SocketServer.ThreadingTCPServer):
    # Same as SOAPServer but handles each request in its own thread.

    def __init__(self, addr = ('localhost', 8000),
        RequestHandler = SOAPRequestHandler, log = 0, encoding = 'UTF-8',
        config = Config, namespace = None, ssl_context = None):

        # Test the encoding, raising an exception if it's not known
        if encoding != None:
            ''.encode(encoding)

        if ssl_context != None and not config.SSLserver:
            raise AttributeError, \
                "SSL server not supported by this Python installation"

        self.namespace          = namespace
        self.objmap             = {}
        self.funcmap            = {}
        self.ssl_context        = ssl_context
        self.encoding           = encoding
        self.config             = config
        self.log                = log

        self.allow_reuse_address= 1

        SocketServer.ThreadingTCPServer.__init__(self, addr, RequestHandler)

# only define class if Unix domain sockets are available
if hasattr(socket, "AF_UNIX"):

    class SOAPUnixSocketServer(SOAPServerBase, SocketServer.UnixStreamServer):
        # SOAP over a Unix domain socket; ``addr`` is the socket path.

        def __init__(self, addr = 8000,
            RequestHandler = SOAPRequestHandler, log = 0, encoding = 'UTF-8',
            config = Config, 
namespace = None, ssl_context = None):

            # Test the encoding, raising an exception if it's not known
            if encoding != None:
                ''.encode(encoding)

            if ssl_context != None and not config.SSLserver:
                raise AttributeError, \
                    "SSL server not supported by this Python installation"

            self.namespace          = namespace
            self.objmap             = {}
            self.funcmap            = {}
            self.ssl_context        = ssl_context
            self.encoding           = encoding
            self.config             = config
            self.log                = log

            self.allow_reuse_address= 1

            # str(addr): UnixStreamServer binds to a filesystem path.
            SocketServer.UnixStreamServer.__init__(self, str(addr), RequestHandler)

diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Types.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Types.py
new file mode 100644
index 0000000000000000000000000000000000000000..693637c55234c965b885ebe65ec218ed70cf868b
--- /dev/null
+++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Types.py
@@ -0,0 +1,1735 @@
"""
################################################################################
# Copyright (c) 2003, Pfizer
# Copyright (c) 2001, Cayce Ullman.
# Copyright (c) 2001, Brian Matthews.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# Neither the name of actzero, inc. nor the names of its contributors may
# be used to endorse or promote products derived from this software without
# specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + + +import UserList +import base64 +import cgi +import urllib +import copy +import re +import time +from types import * + +# SOAPpy modules +from Errors import * +from NS import NS +from Utilities import encodeHexString, cleanDate +from Config import Config + +############################################################################### +# Utility functions +############################################################################### + +def isPrivate(name): return name[0]=='_' +def isPublic(name): return name[0]!='_' + +############################################################################### +# Types and Wrappers +############################################################################### + +class anyType: + _validURIs = (NS.XSD, NS.XSD2, NS.XSD3, NS.ENC) + + def __init__(self, data = None, name = None, typed = 1, attrs = None): + if self.__class__ == anyType: + raise Error, "anyType can't be instantiated directly" + + if type(name) in (ListType, TupleType): + self._ns, self._name = name + else: + self._ns = self._validURIs[0] + self._name = 
name

        self._typed = typed
        self._attrs = {}

        # _cache holds the marshalled string form once computed.
        self._cache = None
        self._type = self._typeName()

        self._data = self._checkValueSpace(data)

        if attrs != None:
            self._setAttrs(attrs)

    def __str__(self):
        if hasattr(self,'_name') and self._name:
            return "<%s %s at %d>" % (self.__class__, self._name, id(self))
        return "<%s at %d>" % (self.__class__, id(self))

    __repr__ = __str__

    def _checkValueSpace(self, data):
        # Subclasses override this to validate/normalize their value space.
        return data

    def _marshalData(self):
        return str(self._data)

    def _marshalAttrs(self, ns_map, builder):
        # Render the collected XML attributes, escaping values for XML.
        a = ''

        for attr, value in self._attrs.items():
            ns, n = builder.genns(ns_map, attr[0])
            a += n + ' %s%s="%s"' % \
                (ns, attr[1], cgi.escape(str(value), 1))

        return a

    def _fixAttr(self, attr):
        # Normalize an attribute key to a (namespace-or-None, name) tuple.
        if type(attr) in (StringType, UnicodeType):
            attr = (None, attr)
        elif type(attr) == ListType:
            attr = tuple(attr)
        elif type(attr) != TupleType:
            raise AttributeError, "invalid attribute type"

        if len(attr) != 2:
            raise AttributeError, "invalid attribute length"

        if type(attr[0]) not in (NoneType, StringType, UnicodeType):
            raise AttributeError, "invalid attribute namespace URI type"

        return attr

    def _getAttr(self, attr):
        attr = self._fixAttr(attr)

        try:
            return self._attrs[attr]
        except:
            return None

    def _setAttr(self, attr, value):
        attr = self._fixAttr(attr)

        if type(value) is StringType:
            value = unicode(value)

        self._attrs[attr] = value


    def _setAttrs(self, attrs):
        # Accepts a flat [key, value, ...] sequence, a dict, or another
        # anyType (whose attributes are copied).
        if type(attrs) in (ListType, TupleType):
            for i in range(0, len(attrs), 2):
                self._setAttr(attrs[i], attrs[i + 1])

            return

        if type(attrs) == DictType:
            d = attrs
        elif isinstance(attrs, anyType):
            d = attrs._attrs
        else:
            raise AttributeError, "invalid attribute type"

        for attr, value in d.items():
            self._setAttr(attr, value)

    def _setMustUnderstand(self, val):
        self._setAttr((NS.ENV, "mustUnderstand"), val)

    def _getMustUnderstand(self):
        return self._getAttr((NS.ENV, "mustUnderstand"))

    def _setActor(self, val):
        self._setAttr((NS.ENV, "actor"), val)

    def _getActor(self):
        return self._getAttr((NS.ENV, "actor"))

    def _typeName(self):
        # Class naming convention: fooType -> XML schema type name "foo".
        return self.__class__.__name__[:-4]

    def _validNamespaceURI(self, URI, strict):
        if not hasattr(self, '_typed') or not self._typed:
            return None
        if URI in self._validURIs:
            return URI
        if not strict:
            return self._ns
        raise AttributeError, \
            "not a valid namespace for type %s" % self._type

class voidType(anyType):
    pass

class stringType(anyType):
    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        if type(data) not in (StringType, UnicodeType):
            raise AttributeError, "invalid %s type:" % self._type

        return data

class untypedType(stringType):
    def __init__(self, data = None, name = None, attrs = None):
        # typed=0: serialized without an xsi:type attribute.
        stringType.__init__(self, data, name, 0, attrs)

class IDType(stringType): pass
class NCNameType(stringType): pass
class NameType(stringType): pass
class ENTITYType(stringType): pass
class IDREFType(stringType): pass
class languageType(stringType): pass
class NMTOKENType(stringType): pass
class QNameType(stringType): pass

class tokenType(anyType):
    _validURIs = (NS.XSD2, NS.XSD3)
    # Rejects tabs/newlines, leading/trailing or doubled spaces.
    __invalidre = '[\n\t]|^ | $|  '

    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        if type(data) not in (StringType, UnicodeType):
            raise AttributeError, "invalid %s type" % self._type

        # Compile the class-level pattern lazily on first use.
        if type(self.__invalidre) == StringType:
            self.__invalidre = re.compile(self.__invalidre)

        if self.__invalidre.search(data):
            raise ValueError, "invalid %s value" % self._type

        return data

class normalizedStringType(anyType):
    _validURIs = (NS.XSD3,)
    __invalidre = '[\n\r\t]'

    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        if type(data) not in (StringType, UnicodeType):
            raise AttributeError, "invalid %s type" % self._type

        if type(self.__invalidre) == StringType:
            self.__invalidre = re.compile(self.__invalidre)

        if self.__invalidre.search(data):
            raise ValueError, "invalid %s value" % self._type

        return data

class CDATAType(normalizedStringType):
    _validURIs = (NS.XSD2,)

class booleanType(anyType):
    def __int__(self):
        return self._data

    __nonzero__ = __int__

    def _marshalData(self):
        return ['false', 'true'][self._data]

    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        if data in (0, '0', 'false', ''):
            return 0
        if data in (1, '1', 'true'):
            return 1
        raise ValueError, "invalid %s value" % self._type

class decimalType(anyType):
    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        if type(data) not in (IntType, LongType, FloatType):
            raise Error, "invalid %s value" % self._type

        return data

class floatType(anyType):
    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        # Bounds are the IEEE-754 single-precision extremes.
        if type(data) not in (IntType, LongType, FloatType) or \
            data < -3.4028234663852886E+38 or \
            data > 3.4028234663852886E+38:
            raise ValueError, "invalid %s value: %s" % (self._type, repr(data))

        return data

    def _marshalData(self):
        return "%.18g" % self._data # More precision

class doubleType(anyType):
    def _checkValueSpace(self, data):
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        # Bounds are the IEEE-754 double-precision extremes.
        if type(data) not in (IntType, LongType, FloatType) or \
            data < -1.7976931348623158E+308 or \
            data > 1.7976931348623157E+308:
            raise ValueError, "invalid %s value: %s" % (self._type, repr(data))

        return data

    def _marshalData(self):
        return "%.18g" % self._data # More precision

class durationType(anyType):
    _validURIs = (NS.XSD3,)

    
def _checkValueSpace(self, data):
        # Accepts a scalar or up-to-6 element (Y, M, D, H, M, S) sequence.
        if data == None:
            raise ValueError, "must supply initial %s value" % self._type

        try:
            # A tuple or a scalar is OK, but make them into a list

            if type(data) == TupleType:
                data = list(data)
            elif type(data) != ListType:
                data = [data]

            if len(data) > 6:
                raise Exception, "too many values"

            # Now check the types of all the components, and find
            # the first nonzero element along the way.

            f = -1

            for i in range(len(data)):
                if data[i] == None:
                    data[i] = 0
                    continue

                if type(data[i]) not in \
                    (IntType, LongType, FloatType):
                    raise Exception, "element %d a bad type" % i

                if data[i] and f == -1:
                    f = i

            # If they're all 0, just use zero seconds.

            if f == -1:
                self._cache = 'PT0S'

                return (0,) * 6

            # Make sure only the last nonzero element has a decimal fraction
            # and only the first element is negative.

            d = -1

            for i in range(f, len(data)):
                if data[i]:
                    if d != -1:
                        raise Exception, \
                            "all except the last nonzero element must be " \
                            "integers"
                    if data[i] < 0 and i > f:
                        raise Exception, \
                            "only the first nonzero element can be negative"
                    elif data[i] != long(data[i]):
                        d = i

            # Pad the list on the left if necessary.

            if len(data) < 6:
                n = 6 - len(data)
                f += n
                d += n
                data = [0] * n + data

            # Save index of the first nonzero element and the decimal
            # element for _marshalData.

            self.__firstnonzero = f
            self.__decimal = d

        except Exception, e:
            raise ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        # Render an ISO 8601 duration, e.g. P1Y2MT3.5S; the 'T' separator
        # is emitted before the first time (H/M/S) component.
        if self._cache == None:
            d = self._data
            t = 0

            if d[self.__firstnonzero] < 0:
                s = '-P'
            else:
                s = 'P'

            t = 0

            for i in range(self.__firstnonzero, len(d)):
                if d[i]:
                    if i > 2 and not t:
                        s += 'T'
                        t = 1
                    if self.__decimal == i:
                        s += "%g" % abs(d[i])
                    else:
                        s += "%d" % long(abs(d[i]))
                    s += ['Y', 'M', 'D', 'H', 'M', 'S'][i]

            self._cache = s

        return self._cache

class timeDurationType(durationType):
    _validURIs = (NS.XSD, NS.XSD2, NS.ENC)

class dateTimeType(anyType):
    _validURIs = (NS.XSD3,)

    def _checkValueSpace(self, data):
        # Accepts None (now), an epoch int/float, or a >=6 element
        # (y, mo, d, h, mi, s[, ...]) sequence; stored as a 6-tuple UTC.
        try:
            if data == None:
                data = time.time()

            if (type(data) in (IntType, LongType)):
                data = list(time.gmtime(data)[:6])
            elif (type(data) == FloatType):
                f = data - int(data)
                data = list(time.gmtime(int(data))[:6])
                data[5] += f
            elif type(data) in (ListType, TupleType):
                if len(data) < 6:
                    raise Exception, "not enough values"
                if len(data) > 9:
                    raise Exception, "too many values"

                data = list(data[:6])

                cleanDate(data)
            else:
                raise Exception, "invalid type"
        except Exception, e:
            raise ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        if self._cache == None:
            d = self._data
            s = "%04d-%02d-%02dT%02d:%02d:%02d" % ((abs(d[0]),) + d[1:])
            if d[0] < 0:
                s = '-' + s
            f = d[5] - int(d[5])
            if f != 0:
                s += ("%g" % f)[1:]
            s += 'Z'

            self._cache = s

        return self._cache

class recurringInstantType(anyType):
    _validURIs = (NS.XSD,)

    def _checkValueSpace(self, data):
        try:
            if data == None:
                data = list(time.gmtime(time.time())[:6])
            if (type(data) in (IntType, LongType)):
                data = list(time.gmtime(data)[:6])
            elif (type(data) == FloatType):
                f = data - int(data)
                data = list(time.gmtime(int(data))[:6])
                data[5] += 
f
            elif type(data) in (ListType, TupleType):
                if len(data) < 1:
                    raise Exception, "not enough values"
                if len(data) > 9:
                    raise Exception, "too many values"

                data = list(data[:6])

                if len(data) < 6:
                    data += [0] * (6 - len(data))

                # Leading None elements mark "recurring" (unspecified) parts;
                # f ends up as the index of the first specified element.
                f = len(data)

                for i in range(f):
                    if data[i] == None:
                        if f < i:
                            raise Exception, \
                                "only leftmost elements can be none"
                        else:
                            f = i
                            break

                cleanDate(data, f)
            else:
                raise Exception, "invalid type"
        except Exception, e:
            raise ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        # Unspecified (None/zero) leading parts render as '-' placeholders.
        if self._cache == None:
            d = self._data
            e = list(d)
            neg = ''

            if not e[0]:
                e[0] = '--'
            else:
                if e[0] < 0:
                    neg = '-'
                    e[0] = abs(e[0])
                if e[0] < 100:
                    e[0] = '-' + "%02d" % e[0]
                else:
                    e[0] = "%04d" % e[0]

            for i in range(1, len(e)):
                if e[i] == None or (i < 3 and e[i] == 0):
                    e[i] = '-'
                else:
                    if e[i] < 0:
                        neg = '-'
                        e[i] = abs(e[i])

                    e[i] = "%02d" % e[i]

            if d[5]:
                f = abs(d[5] - int(d[5]))

                if f:
                    e[5] += ("%g" % f)[1:]

            s = "%s%s-%s-%sT%s:%s:%sZ" % ((neg,) + tuple(e))

            self._cache = s

        return self._cache

class timeInstantType(dateTimeType):
    _validURIs = (NS.XSD, NS.XSD2, NS.ENC)

class timePeriodType(dateTimeType):
    _validURIs = (NS.XSD2, NS.ENC)

class timeType(anyType):
    def _checkValueSpace(self, data):
        # Accepts None (now), epoch int/float, or an (h, m, s) / struct_time
        # sequence; stored as an (h, m, s) tuple.
        try:
            if data == None:
                data = time.gmtime(time.time())[3:6]
            elif (type(data) == FloatType):
                f = data - int(data)
                data = list(time.gmtime(int(data))[3:6])
                data[2] += f
            elif type(data) in (IntType, LongType):
                data = time.gmtime(data)[3:6]
            elif type(data) in (ListType, TupleType):
                if len(data) == 9:
                    data = data[3:6]
                elif len(data) > 3:
                    raise Exception, "too many values"

                data = [None, None, None] + list(data)

                if len(data) < 6:
                    data += [0] * (6 - len(data))

                cleanDate(data, 3)

                data = data[3:]
            else:
                raise Exception, "invalid type"
        except Exception, e:
            raise 
ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        if self._cache == None:
            d = self._data
            s = ''

            s = time.strftime("%H:%M:%S", (0, 0, 0) + d + (0, 0, -1))
            f = d[2] - int(d[2])
            if f != 0:
                s += ("%g" % f)[1:]
            s += 'Z'

            self._cache = s

        return self._cache

class dateType(anyType):
    def _checkValueSpace(self, data):
        # Accepts None (today), epoch value, or (y[, mo[, d]]) / struct_time;
        # stored as a (y, mo, d) tuple.
        try:
            if data == None:
                data = time.gmtime(time.time())[0:3]
            elif type(data) in (IntType, LongType, FloatType):
                data = time.gmtime(data)[0:3]
            elif type(data) in (ListType, TupleType):
                if len(data) == 9:
                    data = data[0:3]
                elif len(data) > 3:
                    raise Exception, "too many values"

                data = list(data)

                if len(data) < 3:
                    data += [1, 1, 1][len(data):]

                data += [0, 0, 0]

                cleanDate(data)

                data = data[:3]
            else:
                raise Exception, "invalid type"
        except Exception, e:
            raise ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        if self._cache == None:
            d = self._data
            s = "%04d-%02d-%02dZ" % ((abs(d[0]),) + d[1:])
            if d[0] < 0:
                s = '-' + s

            self._cache = s

        return self._cache

class gYearMonthType(anyType):
    _validURIs = (NS.XSD3,)

    def _checkValueSpace(self, data):
        # Stored as a (year, month) tuple.
        try:
            if data == None:
                data = time.gmtime(time.time())[0:2]
            elif type(data) in (IntType, LongType, FloatType):
                data = time.gmtime(data)[0:2]
            elif type(data) in (ListType, TupleType):
                if len(data) == 9:
                    data = data[0:2]
                elif len(data) > 2:
                    raise Exception, "too many values"

                data = list(data)

                if len(data) < 2:
                    data += [1, 1][len(data):]

                data += [1, 0, 0, 0]

                cleanDate(data)

                data = data[:2]
            else:
                raise Exception, "invalid type"
        except Exception, e:
            raise ValueError, "invalid %s value - %s" % (self._type, e)

        return tuple(data)

    def _marshalData(self):
        if self._cache == None:
            d = self._data
            s = "%04d-%02dZ" % ((abs(d[0]),) + d[1:])
            if d[0] < 0:
                s = '-' + s

            
self._cache = s + + return self._cache + +class gYearType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + try: + if data == None: + data = time.gmtime(time.time())[0:1] + elif type(data) in (IntType, LongType, FloatType): + data = [data] + + if type(data) in (ListType, TupleType): + if len(data) == 9: + data = data[0:1] + elif len(data) < 1: + raise Exception, "too few values" + elif len(data) > 1: + raise Exception, "too many values" + + if type(data[0]) == FloatType: + try: s = int(data[0]) + except: s = long(data[0]) + + if s != data[0]: + raise Exception, "not integral" + + data = [s] + elif type(data[0]) not in (IntType, LongType): + raise Exception, "bad type" + else: + raise Exception, "invalid type" + except Exception, e: + raise ValueError, "invalid %s value - %s" % (self._type, e) + + return data[0] + + def _marshalData(self): + if self._cache == None: + d = self._data + s = "%04dZ" % abs(d) + if d < 0: + s = '-' + s + + self._cache = s + + return self._cache + +class centuryType(anyType): + _validURIs = (NS.XSD2, NS.ENC) + + def _checkValueSpace(self, data): + try: + if data == None: + data = time.gmtime(time.time())[0:1] / 100 + elif type(data) in (IntType, LongType, FloatType): + data = [data] + + if type(data) in (ListType, TupleType): + if len(data) == 9: + data = data[0:1] / 100 + elif len(data) < 1: + raise Exception, "too few values" + elif len(data) > 1: + raise Exception, "too many values" + + if type(data[0]) == FloatType: + try: s = int(data[0]) + except: s = long(data[0]) + + if s != data[0]: + raise Exception, "not integral" + + data = [s] + elif type(data[0]) not in (IntType, LongType): + raise Exception, "bad type" + else: + raise Exception, "invalid type" + except Exception, e: + raise ValueError, "invalid %s value - %s" % (self._type, e) + + return data[0] + + def _marshalData(self): + if self._cache == None: + d = self._data + s = "%02dZ" % abs(d) + if d < 0: + s = '-' + s + + self._cache = s + + return 
self._cache + +class yearType(gYearType): + _validURIs = (NS.XSD2, NS.ENC) + +class gMonthDayType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + try: + if data == None: + data = time.gmtime(time.time())[1:3] + elif type(data) in (IntType, LongType, FloatType): + data = time.gmtime(data)[1:3] + elif type(data) in (ListType, TupleType): + if len(data) == 9: + data = data[0:2] + elif len(data) > 2: + raise Exception, "too many values" + + data = list(data) + + if len(data) < 2: + data += [1, 1][len(data):] + + data = [0] + data + [0, 0, 0] + + cleanDate(data, 1) + + data = data[1:3] + else: + raise Exception, "invalid type" + except Exception, e: + raise ValueError, "invalid %s value - %s" % (self._type, e) + + return tuple(data) + + def _marshalData(self): + if self._cache == None: + self._cache = "--%02d-%02dZ" % self._data + + return self._cache + +class recurringDateType(gMonthDayType): + _validURIs = (NS.XSD2, NS.ENC) + +class gMonthType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + try: + if data == None: + data = time.gmtime(time.time())[1:2] + elif type(data) in (IntType, LongType, FloatType): + data = [data] + + if type(data) in (ListType, TupleType): + if len(data) == 9: + data = data[1:2] + elif len(data) < 1: + raise Exception, "too few values" + elif len(data) > 1: + raise Exception, "too many values" + + if type(data[0]) == FloatType: + try: s = int(data[0]) + except: s = long(data[0]) + + if s != data[0]: + raise Exception, "not integral" + + data = [s] + elif type(data[0]) not in (IntType, LongType): + raise Exception, "bad type" + + if data[0] < 1 or data[0] > 12: + raise Exception, "bad value" + else: + raise Exception, "invalid type" + except Exception, e: + raise ValueError, "invalid %s value - %s" % (self._type, e) + + return data[0] + + def _marshalData(self): + if self._cache == None: + self._cache = "--%02d--Z" % self._data + + return self._cache + +class monthType(gMonthType): + 
_validURIs = (NS.XSD2, NS.ENC) + +class gDayType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + try: + if data == None: + data = time.gmtime(time.time())[2:3] + elif type(data) in (IntType, LongType, FloatType): + data = [data] + + if type(data) in (ListType, TupleType): + if len(data) == 9: + data = data[2:3] + elif len(data) < 1: + raise Exception, "too few values" + elif len(data) > 1: + raise Exception, "too many values" + + if type(data[0]) == FloatType: + try: s = int(data[0]) + except: s = long(data[0]) + + if s != data[0]: + raise Exception, "not integral" + + data = [s] + elif type(data[0]) not in (IntType, LongType): + raise Exception, "bad type" + + if data[0] < 1 or data[0] > 31: + raise Exception, "bad value" + else: + raise Exception, "invalid type" + except Exception, e: + raise ValueError, "invalid %s value - %s" % (self._type, e) + + return data[0] + + def _marshalData(self): + if self._cache == None: + self._cache = "---%02dZ" % self._data + + return self._cache + +class recurringDayType(gDayType): + _validURIs = (NS.XSD2, NS.ENC) + +class hexBinaryType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (StringType, UnicodeType): + raise AttributeError, "invalid %s type" % self._type + + return data + + def _marshalData(self): + if self._cache == None: + self._cache = encodeHexString(self._data) + + return self._cache + +class base64BinaryType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (StringType, UnicodeType): + raise AttributeError, "invalid %s type" % self._type + + return data + + def _marshalData(self): + if self._cache == None: + self._cache = base64.encodestring(self._data) + + return self._cache + +class base64Type(base64BinaryType): + 
_validURIs = (NS.ENC,) + +class binaryType(anyType): + _validURIs = (NS.XSD, NS.ENC) + + def __init__(self, data, name = None, typed = 1, encoding = 'base64', + attrs = None): + + anyType.__init__(self, data, name, typed, attrs) + + self._setAttr('encoding', encoding) + + def _marshalData(self): + if self._cache == None: + if self._getAttr((None, 'encoding')) == 'base64': + self._cache = base64.encodestring(self._data) + else: + self._cache = encodeHexString(self._data) + + return self._cache + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (StringType, UnicodeType): + raise AttributeError, "invalid %s type" % self._type + + return data + + def _setAttr(self, attr, value): + attr = self._fixAttr(attr) + + if attr[1] == 'encoding': + if attr[0] != None or value not in ('base64', 'hex'): + raise AttributeError, "invalid encoding" + + self._cache = None + + anyType._setAttr(self, attr, value) + + +class anyURIType(anyType): + _validURIs = (NS.XSD3,) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (StringType, UnicodeType): + raise AttributeError, "invalid %s type" % self._type + + return data + + def _marshalData(self): + if self._cache == None: + self._cache = urllib.quote(self._data) + + return self._cache + +class uriType(anyURIType): + _validURIs = (NS.XSD,) + +class uriReferenceType(anyURIType): + _validURIs = (NS.XSD2,) + +class NOTATIONType(anyType): + def __init__(self, data, name = None, typed = 1, attrs = None): + + if self.__class__ == NOTATIONType: + raise Error, "a NOTATION can't be instantiated directly" + + anyType.__init__(self, data, name, typed, attrs) + +class ENTITIESType(anyType): + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) in (StringType, UnicodeType): + return 
(data,) + + if type(data) not in (ListType, TupleType) or \ + filter (lambda x: type(x) not in (StringType, UnicodeType), data): + raise AttributeError, "invalid %s type" % self._type + + return data + + def _marshalData(self): + return ' '.join(self._data) + +class IDREFSType(ENTITIESType): pass +class NMTOKENSType(ENTITIESType): pass + +class integerType(anyType): + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType): + raise ValueError, "invalid %s value" % self._type + + return data + +class nonPositiveIntegerType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or data > 0: + raise ValueError, "invalid %s value" % self._type + + return data + +class non_Positive_IntegerType(nonPositiveIntegerType): + _validURIs = (NS.XSD,) + + def _typeName(self): + return 'non-positive-integer' + +class negativeIntegerType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or data >= 0: + raise ValueError, "invalid %s value" % self._type + + return data + +class negative_IntegerType(negativeIntegerType): + _validURIs = (NS.XSD,) + + def _typeName(self): + return 'negative-integer' + +class longType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < -9223372036854775808L or \ + data > 9223372036854775807L: + raise ValueError, "invalid %s value" % self._type + + return data + +class intType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def 
_checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < -2147483648L or \ + data > 2147483647: + raise ValueError, "invalid %s value" % self._type + + return data + +class shortType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < -32768 or \ + data > 32767: + raise ValueError, "invalid %s value" % self._type + + return data + +class byteType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < -128 or \ + data > 127: + raise ValueError, "invalid %s value" % self._type + + return data + +class nonNegativeIntegerType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or data < 0: + raise ValueError, "invalid %s value" % self._type + + return data + +class non_Negative_IntegerType(nonNegativeIntegerType): + _validURIs = (NS.XSD,) + + def _typeName(self): + return 'non-negative-integer' + +class unsignedLongType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < 0 or \ + data > 18446744073709551615L: + raise ValueError, "invalid %s value" % self._type + + return data + +class unsignedIntType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % 
self._type + + if type(data) not in (IntType, LongType) or \ + data < 0 or \ + data > 4294967295L: + raise ValueError, "invalid %s value" % self._type + + return data + +class unsignedShortType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < 0 or \ + data > 65535: + raise ValueError, "invalid %s value" % self._type + + return data + +class unsignedByteType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or \ + data < 0 or \ + data > 255: + raise ValueError, "invalid %s value" % self._type + + return data + +class positiveIntegerType(anyType): + _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) + + def _checkValueSpace(self, data): + if data == None: + raise ValueError, "must supply initial %s value" % self._type + + if type(data) not in (IntType, LongType) or data <= 0: + raise ValueError, "invalid %s value" % self._type + + return data + +class positive_IntegerType(positiveIntegerType): + _validURIs = (NS.XSD,) + + def _typeName(self): + return 'positive-integer' + +# Now compound types + +class compoundType(anyType): + def __init__(self, data = None, name = None, typed = 1, attrs = None): + if self.__class__ == compoundType: + raise Error, "a compound can't be instantiated directly" + + anyType.__init__(self, data, name, typed, attrs) + self._keyord = [] + + if type(data) == DictType: + self.__dict__.update(data) + + def _aslist(self, item=None): + if item is not None: + return self.__dict__[self._keyord[item]] + else: + return map( lambda x: self.__dict__[x], self._keyord) + + def _asdict(self, item=None, encoding=Config.dict_encoding): + if item is not None: + if type(item) in (UnicodeType,StringType): + item = 
item.encode(encoding) + return self.__dict__[item] + else: + retval = {} + def fun(x): retval[x.encode(encoding)] = self.__dict__[x] + + if hasattr(self, '_keyord'): + map( fun, self._keyord) + else: + for name in dir(self): + if isPublic(name): + retval[name] = getattr(self,name) + return retval + + + def __getitem__(self, item): + if type(item) == IntType: + return self.__dict__[self._keyord[item]] + else: + return getattr(self, item) + + def __len__(self): + return len(self._keyord) + + def __nonzero__(self): + return 1 + + def _keys(self): + return filter(lambda x: x[0] != '_', self.__dict__.keys()) + + def _addItem(self, name, value, attrs = None): + + if name in self._keyord: + if type(self.__dict__[name]) != ListType: + self.__dict__[name] = [self.__dict__[name]] + self.__dict__[name].append(value) + else: + self.__dict__[name] = value + self._keyord.append(name) + + def _placeItem(self, name, value, pos, subpos = 0, attrs = None): + + if subpos == 0 and type(self.__dict__[name]) != ListType: + self.__dict__[name] = value + else: + self.__dict__[name][subpos] = value + + self._keyord[pos] = name + + + def _getItemAsList(self, name, default = []): + try: + d = self.__dict__[name] + except: + return default + + if type(d) == ListType: + return d + return [d] + + def __str__(self): + return anyType.__str__(self) + ": " + str(self._asdict()) + + def __repr__(self): + return self.__str__() + +class structType(compoundType): + pass + +class headerType(structType): + _validURIs = (NS.ENV,) + + def __init__(self, data = None, typed = 1, attrs = None): + structType.__init__(self, data, "Header", typed, attrs) + +class bodyType(structType): + _validURIs = (NS.ENV,) + + def __init__(self, data = None, typed = 1, attrs = None): + structType.__init__(self, data, "Body", typed, attrs) + +class arrayType(UserList.UserList, compoundType): + def __init__(self, data = None, name = None, attrs = None, + offset = 0, rank = None, asize = 0, elemsname = None): + + if data: + if 
type(data) not in (ListType, TupleType): + raise Error, "Data must be a sequence" + + UserList.UserList.__init__(self, data) + compoundType.__init__(self, data, name, 0, attrs) + + self._elemsname = elemsname or "item" + + if data == None: + self._rank = rank + + # According to 5.4.2.2 in the SOAP spec, each element in a + # sparse array must have a position. _posstate keeps track of + # whether we've seen a position or not. It's possible values + # are: + # -1 No elements have been added, so the state is indeterminate + # 0 An element without a position has been added, so no + # elements can have positions + # 1 An element with a position has been added, so all elements + # must have positions + + self._posstate = -1 + + self._full = 0 + + if asize in ('', None): + asize = '0' + + self._dims = map (lambda x: int(x), str(asize).split(',')) + self._dims.reverse() # It's easier to work with this way + self._poss = [0] * len(self._dims) # This will end up + # reversed too + + for i in range(len(self._dims)): + if self._dims[i] < 0 or \ + self._dims[i] == 0 and len(self._dims) > 1: + raise TypeError, "invalid Array dimensions" + + if offset > 0: + self._poss[i] = offset % self._dims[i] + offset = int(offset / self._dims[i]) + + # Don't break out of the loop if offset is 0 so we test all the + # dimensions for > 0. 
+ if offset: + raise AttributeError, "invalid Array offset" + + a = [None] * self._dims[0] + + for i in range(1, len(self._dims)): + b = [] + + for j in range(self._dims[i]): + b.append(copy.deepcopy(a)) + + a = b + + self.data = a + + + def _aslist(self, item=None): + if item is not None: + return self.data[int(item)] + else: + return self.data + + def _asdict(self, item=None, encoding=Config.dict_encoding): + if item is not None: + if type(item) in (UnicodeType,StringType): + item = item.encode(encoding) + return self.data[int(item)] + else: + retval = {} + def fun(x): retval[str(x).encode(encoding)] = self.data[x] + + map( fun, range(len(self.data)) ) + return retval + + def __getitem__(self, item): + try: + return self.data[int(item)] + except ValueError: + return getattr(self, item) + + def __len__(self): + return len(self.data) + + def __nonzero__(self): + return 1 + + def __str__(self): + return anyType.__str__(self) + ": " + str(self._aslist()) + + def _keys(self): + return filter(lambda x: x[0] != '_', self.__dict__.keys()) + + def _addItem(self, name, value, attrs): + if self._full: + raise ValueError, "Array is full" + + pos = attrs.get((NS.ENC, 'position')) + + if pos != None: + if self._posstate == 0: + raise AttributeError, \ + "all elements in a sparse Array must have a " \ + "position attribute" + + self._posstate = 1 + + try: + if pos[0] == '[' and pos[-1] == ']': + pos = map (lambda x: int(x), pos[1:-1].split(',')) + pos.reverse() + + if len(pos) == 1: + pos = pos[0] + + curpos = [0] * len(self._dims) + + for i in range(len(self._dims)): + curpos[i] = pos % self._dims[i] + pos = int(pos / self._dims[i]) + + if pos == 0: + break + + if pos: + raise Exception + elif len(pos) != len(self._dims): + raise Exception + else: + for i in range(len(self._dims)): + if pos[i] >= self._dims[i]: + raise Exception + + curpos = pos + else: + raise Exception + except: + raise AttributeError, \ + "invalid Array element position %s" % str(pos) + else: + if 
self._posstate == 1: + raise AttributeError, \ + "only elements in a sparse Array may have a " \ + "position attribute" + + self._posstate = 0 + + curpos = self._poss + + a = self.data + + for i in range(len(self._dims) - 1, 0, -1): + a = a[curpos[i]] + + if curpos[0] >= len(a): + a += [None] * (len(a) - curpos[0] + 1) + + a[curpos[0]] = value + + if pos == None: + self._poss[0] += 1 + + for i in range(len(self._dims) - 1): + if self._poss[i] < self._dims[i]: + break + + self._poss[i] = 0 + self._poss[i + 1] += 1 + + if self._dims[-1] and self._poss[-1] >= self._dims[-1]: + #self._full = 1 + #FIXME: why is this occuring? + pass + + def _placeItem(self, name, value, pos, subpos, attrs = None): + curpos = [0] * len(self._dims) + + for i in range(len(self._dims)): + if self._dims[i] == 0: + curpos[0] = pos + break + + curpos[i] = pos % self._dims[i] + pos = int(pos / self._dims[i]) + + if pos == 0: + break + + if self._dims[i] != 0 and pos: + raise Error, "array index out of range" + + a = self.data + + for i in range(len(self._dims) - 1, 0, -1): + a = a[curpos[i]] + + if curpos[0] >= len(a): + a += [None] * (len(a) - curpos[0] + 1) + + a[curpos[0]] = value + +class typedArrayType(arrayType): + def __init__(self, data = None, name = None, typed = None, attrs = None, + offset = 0, rank = None, asize = 0, elemsname = None, complexType = 0): + + arrayType.__init__(self, data, name, attrs, offset, rank, asize, + elemsname) + + self._typed = 1 + self._type = typed + self._complexType = complexType + +class faultType(structType, Error): + def __init__(self, faultcode = "", faultstring = "", detail = None): + self.faultcode = faultcode + self.faultstring = faultstring + if detail != None: + self.detail = detail + + structType.__init__(self, None, 0) + + def _setDetail(self, detail = None): + if detail != None: + self.detail = detail + else: + try: del self.detail + except AttributeError: pass + + def __repr__(self): + if getattr(self, 'detail', None) != None: + return 
"<Fault %s: %s: %s>" % (self.faultcode, + self.faultstring, + self.detail) + else: + return "<Fault %s: %s>" % (self.faultcode, self.faultstring) + + __str__ = __repr__ + + def __call__(self): + return (self.faultcode, self.faultstring, self.detail) + +class SOAPException(Exception): + def __init__(self, code="", string="", detail=None): + self.value = ("SOAPpy SOAP Exception", code, string, detail) + self.code = code + self.string = string + self.detail = detail + + def __str__(self): + return repr(self.value) + +class RequiredHeaderMismatch(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + +class MethodNotFound(Exception): + def __init__(self, value): + (val, detail) = value.split(":") + self.value = val + self.detail = detail + + def __str__(self): + return repr(self.value, self.detail) + +class AuthorizationFailed(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + +class MethodFailed(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + +####### +# Convert complex SOAPpy objects to native python equivalents +####### + +def simplify(object, level=0): + """ + Convert the SOAPpy objects and thier contents to simple python types. + + This function recursively converts the passed 'container' object, + and all public subobjects. (Private subobjects have names that + start with '_'.) 
+ + Conversions: + - faultType --> raise python exception + - arrayType --> array + - compoundType --> dictionary + """ + + if level > 10: + return object + + if isinstance( object, faultType ): + if object.faultstring == "Required Header Misunderstood": + raise RequiredHeaderMismatch(object.detail) + elif object.faultstring == "Method Not Found": + raise MethodNotFound(object.detail) + elif object.faultstring == "Authorization Failed": + raise AuthorizationFailed(object.detail) + elif object.faultstring == "Method Failed": + raise MethodFailed(object.detail) + else: + se = SOAPException(object.faultcode, object.faultstring, + object.detail) + raise se + elif isinstance( object, arrayType ): + data = object._aslist() + for k in range(len(data)): + data[k] = simplify(data[k], level=level+1) + return data + elif isinstance( object, compoundType ) or isinstance(object, structType): + data = object._asdict() + for k in data.keys(): + if isPublic(k): + data[k] = simplify(data[k], level=level+1) + return data + elif type(object)==DictType: + for k in object.keys(): + if isPublic(k): + object[k] = simplify(object[k]) + return object + elif type(object)==list: + for k in range(len(object)): + object[k] = simplify(object[k]) + return object + else: + return object + + +def simplify_contents(object, level=0): + """ + Convert the contents of SOAPpy objects to simple python types. + + This function recursively converts the sub-objects contained in a + 'container' object to simple python types. 
+ + Conversions: + - faultType --> raise python exception + - arrayType --> array + - compoundType --> dictionary + """ + + if level>10: return object + + if isinstance( object, faultType ): + for k in object._keys(): + if isPublic(k): + setattr(object, k, simplify(object[k], level=level+1)) + raise object + elif isinstance( object, arrayType ): + data = object._aslist() + for k in range(len(data)): + object[k] = simplify(data[k], level=level+1) + elif isinstance(object, structType): + data = object._asdict() + for k in data.keys(): + if isPublic(k): + setattr(object, k, simplify(data[k], level=level+1)) + elif isinstance( object, compoundType ) : + data = object._asdict() + for k in data.keys(): + if isPublic(k): + object[k] = simplify(data[k], level=level+1) + elif type(object)==DictType: + for k in object.keys(): + if isPublic(k): + object[k] = simplify(object[k]) + elif type(object)==list: + for k in range(len(object)): + object[k] = simplify(object[k]) + + return object + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/URLopener.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/URLopener.py new file mode 100644 index 0000000000000000000000000000000000000000..09d4b84107e5a82a1bdfde1a70428fef42774e36 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/URLopener.py @@ -0,0 +1,23 @@ +"""Provide a class for loading data from URL's that handles basic +authentication""" + +ident = '$Id$' +from version import __version__ + +from Config import Config +from urllib import FancyURLopener + +class URLopener(FancyURLopener): + + username = None + passwd = None + + + def __init__(self, username=None, passwd=None, *args, **kw): + FancyURLopener.__init__( self, *args, **kw) + self.username = username + self.passwd = passwd + + + def prompt_user_passwd(self, host, realm): + return self.username, self.passwd diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Utilities.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Utilities.py new file mode 100644 index 
0000000000000000000000000000000000000000..5944ee17098fd741da23be3eb3216b48179d1ca2 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/Utilities.py @@ -0,0 +1,178 @@ +""" +################################################################################ +# Copyright (c) 2003, Pfizer +# Copyright (c) 2001, Cayce Ullman. +# Copyright (c) 2001, Brian Matthews. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of actzero, inc. nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +""" + +ident = '$Id$' +from version import __version__ + +import exceptions +import copy +import re +import string +import sys +from types import * + +# SOAPpy modules +from Errors import * + +################################################################################ +# Utility infielders +################################################################################ +def collapseWhiteSpace(s): + return re.sub('\s+', ' ', s).strip() + +def decodeHexString(data): + conv = { + '0': 0x0, '1': 0x1, '2': 0x2, '3': 0x3, '4': 0x4, + '5': 0x5, '6': 0x6, '7': 0x7, '8': 0x8, '9': 0x9, + + 'a': 0xa, 'b': 0xb, 'c': 0xc, 'd': 0xd, 'e': 0xe, + 'f': 0xf, + + 'A': 0xa, 'B': 0xb, 'C': 0xc, 'D': 0xd, 'E': 0xe, + 'F': 0xf, + } + + ws = string.whitespace + + bin = '' + + i = 0 + + while i < len(data): + if data[i] not in ws: + break + i += 1 + + low = 0 + + while i < len(data): + c = data[i] + + if c in string.whitespace: + break + + try: + c = conv[c] + except KeyError: + raise ValueError, \ + "invalid hex string character `%s'" % c + + if low: + bin += chr(high * 16 + c) + low = 0 + else: + high = c + low = 1 + + i += 1 + + if low: + raise ValueError, "invalid hex string length" + + while i < len(data): + if data[i] not in string.whitespace: + raise ValueError, \ + "invalid hex string character `%s'" % c + + i += 1 + + return bin + +def encodeHexString(data): + h = '' + + for i in data: + h += "%02X" % ord(i) + + return h + +def leapMonth(year, month): + return month == 2 and \ + year % 4 == 0 and \ + (year % 100 != 0 or year % 400 == 0) + +def cleanDate(d, first = 0): + ranges = (None, (1, 12), (1, 31), (0, 23), (0, 59), (0, 61)) + months = (0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + names = ('year', 'month', 'day', 'hours', 'minutes', 'seconds') + + if len(d) != 6: + raise ValueError, "date must have 6 elements" + + for i in range(first, 6): + s = d[i] + + if type(s) == 
FloatType: + if i < 5: + try: + s = int(s) + except OverflowError: + if i > 0: + raise + s = long(s) + + if s != d[i]: + raise ValueError, "%s must be integral" % names[i] + + d[i] = s + elif type(s) == LongType: + try: s = int(s) + except: pass + elif type(s) != IntType: + raise TypeError, "%s isn't a valid type" % names[i] + + if i == first and s < 0: + continue + + if ranges[i] != None and \ + (s < ranges[i][0] or ranges[i][1] < s): + raise ValueError, "%s out of range" % names[i] + + if first < 6 and d[5] >= 61: + raise ValueError, "seconds out of range" + + if first < 2: + leap = first < 1 and leapMonth(d[0], d[1]) + + if d[2] > months[d[1]] + leap: + raise ValueError, "day out of range" + +def debugHeader(title): + s = '*** ' + title + ' ' + print s + ('*' * (72 - len(s))) + +def debugFooter(title): + print '*' * 72 + sys.stdout.flush() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/WSDL.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/WSDL.py new file mode 100644 index 0000000000000000000000000000000000000000..50b8a6bd1ece27c008179125d9cbd2615eab5bca --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/WSDL.py @@ -0,0 +1,119 @@ +"""Parse web services description language to get SOAP methods. + +Rudimentary support.""" + +ident = '$Id$' +from version import __version__ + +import wstools +from Client import SOAPProxy, SOAPAddress +from Config import Config +import urllib + +class Proxy: + """WSDL Proxy. + + SOAPProxy wrapper that parses method names, namespaces, soap actions from + the web service description language (WSDL) file passed into the + constructor. The WSDL reference can be passed in as a stream, an url, a + file name, or a string. + + Loads info into self.methods, a dictionary with methodname keys and values + of WSDLTools.SOAPCallinfo. 
+ + For example, + + url = 'http://www.xmethods.org/sd/2001/TemperatureService.wsdl' + wsdl = WSDL.Proxy(url) + print len(wsdl.methods) # 1 + print wsdl.methods.keys() # getTemp + + + See WSDLTools.SOAPCallinfo for more info on each method's attributes. + """ + + def __init__(self, wsdlsource, config=Config, **kw ): + + reader = wstools.WSDLTools.WSDLReader() + self.wsdl = None + + # From Mark Pilgrim's "Dive Into Python" toolkit.py--open anything. + if self.wsdl is None and hasattr(wsdlsource, "read"): + #print 'stream' + self.wsdl = reader.loadFromStream(wsdlsource) + + # NOT TESTED (as of April 17, 2003) + #if self.wsdl is None and wsdlsource == '-': + # import sys + # self.wsdl = reader.loadFromStream(sys.stdin) + # print 'stdin' + + if self.wsdl is None: + try: + file(wsdlsource) + self.wsdl = reader.loadFromFile(wsdlsource) + #print 'file' + except (IOError, OSError): + pass + + if self.wsdl is None: + try: + stream = urllib.urlopen(wsdlsource) + self.wsdl = reader.loadFromStream(stream, wsdlsource) + except (IOError, OSError): pass + + if self.wsdl is None: + import StringIO + self.wsdl = reader.loadFromString(str(wsdlsource)) + #print 'string' + + # Package wsdl info as a dictionary of remote methods, with method name + # as key (based on ServiceProxy.__init__ in ZSI library). + self.methods = {} + service = self.wsdl.services[0] + port = service.ports[0] + name = service.name + binding = port.getBinding() + portType = binding.getPortType() + for operation in portType.operations: + callinfo = wstools.WSDLTools.callInfoFromWSDL(port, operation.name) + self.methods[callinfo.methodName] = callinfo + + self.soapproxy = SOAPProxy('http://localhost/dummy.webservice', + config=config, **kw) + + def __str__(self): + s = '' + for method in self.methods.values(): + s += str(method) + return s + + def __getattr__(self, name): + """Set up environment then let parent class handle call. 
+ + Raises AttributeError is method name is not found.""" + + if not self.methods.has_key(name): raise AttributeError, name + + callinfo = self.methods[name] + self.soapproxy.proxy = SOAPAddress(callinfo.location) + self.soapproxy.namespace = callinfo.namespace + self.soapproxy.soapaction = callinfo.soapAction + return self.soapproxy.__getattr__(name) + + def show_methods(self): + for key in self.methods.keys(): + method = self.methods[key] + print "Method Name:", key.ljust(15) + print + inps = method.inparams + for parm in range(len(inps)): + details = inps[parm] + print " In #%d: %s (%s)" % (parm, details.name, details.type) + print + outps = method.outparams + for parm in range(len(outps)): + details = outps[parm] + print " Out #%d: %s (%s)" % (parm, details.name, details.type) + print + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/__init__.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ecd0bfe7bc1f293751af4fa0a07762c34ece8ba6 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/__init__.py @@ -0,0 +1,15 @@ + +ident = '$Id$' +from version import __version__ + +from Client import * +from Config import * +from Errors import * +from NS import * +from Parser import * +from SOAPBuilder import * +from Server import * +from Types import * +from Utilities import * +import wstools +import WSDL diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/version.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/version.py new file mode 100644 index 0000000000000000000000000000000000000000..17a0a3e4def812221da23f048ba5533e9f872955 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/version.py @@ -0,0 +1,2 @@ +__version__="0.12.0" + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Namespaces.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Namespaces.py new file mode 100755 index 0000000000000000000000000000000000000000..4d29eacee6548adfb2d51ae53a3b5e33a0e7de1f --- /dev/null +++ 
b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Namespaces.py @@ -0,0 +1,125 @@ +#! /usr/bin/env python +"""Namespace module, so you don't need PyXML +""" + +try: + from xml.ns import SOAP, SCHEMA, WSDL, XMLNS, DSIG, ENCRYPTION + DSIG.C14N = "http://www.w3.org/TR/2001/REC-xml-c14n-20010315" + +except: + class SOAP: + ENV = "http://schemas.xmlsoap.org/soap/envelope/" + ENC = "http://schemas.xmlsoap.org/soap/encoding/" + ACTOR_NEXT = "http://schemas.xmlsoap.org/soap/actor/next" + + class SCHEMA: + XSD1 = "http://www.w3.org/1999/XMLSchema" + XSD2 = "http://www.w3.org/2000/10/XMLSchema" + XSD3 = "http://www.w3.org/2001/XMLSchema" + XSD_LIST = [ XSD1, XSD2, XSD3 ] + XSI1 = "http://www.w3.org/1999/XMLSchema-instance" + XSI2 = "http://www.w3.org/2000/10/XMLSchema-instance" + XSI3 = "http://www.w3.org/2001/XMLSchema-instance" + XSI_LIST = [ XSI1, XSI2, XSI3 ] + BASE = XSD3 + + class WSDL: + BASE = "http://schemas.xmlsoap.org/wsdl/" + BIND_HTTP = "http://schemas.xmlsoap.org/wsdl/http/" + BIND_MIME = "http://schemas.xmlsoap.org/wsdl/mime/" + BIND_SOAP = "http://schemas.xmlsoap.org/wsdl/soap/" + BIND_SOAP12 = "http://schemas.xmlsoap.org/wsdl/soap12/" + + class XMLNS: + BASE = "http://www.w3.org/2000/xmlns/" + XML = "http://www.w3.org/XML/1998/namespace" + HTML = "http://www.w3.org/TR/REC-html40" + + class DSIG: + BASE = "http://www.w3.org/2000/09/xmldsig#" + C14N = "http://www.w3.org/TR/2001/REC-xml-c14n-20010315" + C14N_COMM = "http://www.w3.org/TR/2000/CR-xml-c14n-20010315#WithComments" + C14N_EXCL = "http://www.w3.org/2001/10/xml-exc-c14n#" + DIGEST_MD2 = "http://www.w3.org/2000/09/xmldsig#md2" + DIGEST_MD5 = "http://www.w3.org/2000/09/xmldsig#md5" + DIGEST_SHA1 = "http://www.w3.org/2000/09/xmldsig#sha1" + ENC_BASE64 = "http://www.w3.org/2000/09/xmldsig#base64" + ENVELOPED = "http://www.w3.org/2000/09/xmldsig#enveloped-signature" + HMAC_SHA1 = "http://www.w3.org/2000/09/xmldsig#hmac-sha1" + SIG_DSA_SHA1 = "http://www.w3.org/2000/09/xmldsig#dsa-sha1" + SIG_RSA_SHA1 = 
"http://www.w3.org/2000/09/xmldsig#rsa-sha1" + XPATH = "http://www.w3.org/TR/1999/REC-xpath-19991116" + XSLT = "http://www.w3.org/TR/1999/REC-xslt-19991116" + + class ENCRYPTION: + BASE = "http://www.w3.org/2001/04/xmlenc#" + BLOCK_3DES = "http://www.w3.org/2001/04/xmlenc#des-cbc" + BLOCK_AES128 = "http://www.w3.org/2001/04/xmlenc#aes128-cbc" + BLOCK_AES192 = "http://www.w3.org/2001/04/xmlenc#aes192-cbc" + BLOCK_AES256 = "http://www.w3.org/2001/04/xmlenc#aes256-cbc" + DIGEST_RIPEMD160 = "http://www.w3.org/2001/04/xmlenc#ripemd160" + DIGEST_SHA256 = "http://www.w3.org/2001/04/xmlenc#sha256" + DIGEST_SHA512 = "http://www.w3.org/2001/04/xmlenc#sha512" + KA_DH = "http://www.w3.org/2001/04/xmlenc#dh" + KT_RSA_1_5 = "http://www.w3.org/2001/04/xmlenc#rsa-1_5" + KT_RSA_OAEP = "http://www.w3.org/2001/04/xmlenc#rsa-oaep-mgf1p" + STREAM_ARCFOUR = "http://www.w3.org/2001/04/xmlenc#arcfour" + WRAP_3DES = "http://www.w3.org/2001/04/xmlenc#kw-3des" + WRAP_AES128 = "http://www.w3.org/2001/04/xmlenc#kw-aes128" + WRAP_AES192 = "http://www.w3.org/2001/04/xmlenc#kw-aes192" + WRAP_AES256 = "http://www.w3.org/2001/04/xmlenc#kw-aes256" + + +class OASIS: + '''URLs for Oasis specifications + ''' + WSSE = "http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd" + UTILITY = "http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd" + LIFETIME = "http://docs.oasis-open.org/wsrf/2004/06/wsrf-WS-ResourceLifetime-1.2-draft-01.xsd" + PROPERTIES = "http://docs.oasis-open.org/wsrf/2004/06/wsrf-WS-ResourceProperties-1.2-draft-01.xsd" + BASENOTIFICATION = "http://docs.oasis-open.org/wsn/2004/06/wsn-WS-BaseNotification-1.2-draft-01.xsd" + BASEFAULTS = "http://docs.oasis-open.org/wsrf/2004/06/wsrf-WS-BaseFaults-1.2-draft-01.xsd" + +class WSSE: + BASE = "http://schemas.xmlsoap.org/ws/2002/04/secext" + TRUST = "http://schemas.xmlsoap.org/ws/2004/04/trust" + + +class WSU: + BASE = "http://schemas.xmlsoap.org/ws/2002/04/utility" + UTILITY = 
"http://schemas.xmlsoap.org/ws/2002/07/utility" + + +class WSR: + PROPERTIES = "http://www.ibm.com/xmlns/stdwip/web-services/WS-ResourceProperties" + LIFETIME = "http://www.ibm.com/xmlns/stdwip/web-services/WS-ResourceLifetime" + + +class WSA200408: + ADDRESS = "http://schemas.xmlsoap.org/ws/2004/08/addressing" + ANONYMOUS = "%s/role/anonymous" %ADDRESS + FAULT = "%s/fault" %ADDRESS +WSA = WSA200408 + +class WSA200403: + ADDRESS = "http://schemas.xmlsoap.org/ws/2004/03/addressing" + ANONYMOUS = "%s/role/anonymous" %ADDRESS + FAULT = "%s/fault" %ADDRESS + +class WSA200303: + ADDRESS = "http://schemas.xmlsoap.org/ws/2003/03/addressing" + ANONYMOUS = "%s/role/anonymous" %ADDRESS + FAULT = None + +class WSP: + POLICY = "http://schemas.xmlsoap.org/ws/2002/12/policy" + +class BEA: + SECCONV = "http://schemas.xmlsoap.org/ws/2004/04/sc" + +class GLOBUS: + SECCONV = "http://wsrf.globus.org/core/2004/07/security/secconv" + CORE = "http://www.globus.org/namespaces/2004/06/core" + SIG = "http://www.globus.org/2002/04/xmlenc#gssapi-sign" + +ZSI_SCHEMA_URI = 'http://www.zolera.com/schemas/ZSI/' diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/TimeoutSocket.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/TimeoutSocket.py new file mode 100755 index 0000000000000000000000000000000000000000..48b898d8962a30ac3f110f7f624b4f2babc38f13 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/TimeoutSocket.py @@ -0,0 +1,179 @@ +"""Based on code from timeout_socket.py, with some tweaks for compatibility. + These tweaks should really be rolled back into timeout_socket, but it's + not totally clear who is maintaining it at this point. In the meantime, + we'll use a different module name for our tweaked version to avoid any + confusion. 
+ + The original timeout_socket is by: + + Scott Cotton <scott@chronis.pobox.com> + Lloyd Zusman <ljz@asfast.com> + Phil Mayes <pmayes@olivebr.com> + Piers Lauder <piers@cs.su.oz.au> + Radovan Garabik <garabik@melkor.dnp.fmph.uniba.sk> +""" + +ident = "$Id$" + +import string, socket, select, errno + +WSAEINVAL = getattr(errno, 'WSAEINVAL', 10022) + + +class TimeoutSocket: + """A socket imposter that supports timeout limits.""" + + def __init__(self, timeout=20, sock=None): + self.timeout = float(timeout) + self.inbuf = '' + if sock is None: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.sock = sock + self.sock.setblocking(0) + self._rbuf = '' + self._wbuf = '' + + def __getattr__(self, name): + # Delegate to real socket attributes. + return getattr(self.sock, name) + + def connect(self, *addr): + timeout = self.timeout + sock = self.sock + try: + # Non-blocking mode + sock.setblocking(0) + apply(sock.connect, addr) + sock.setblocking(timeout != 0) + return 1 + except socket.error,why: + if not timeout: + raise + sock.setblocking(1) + if len(why.args) == 1: + code = 0 + else: + code, why = why + if code not in ( + errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK + ): + raise + r,w,e = select.select([],[sock],[],timeout) + if w: + try: + apply(sock.connect, addr) + return 1 + except socket.error,why: + if len(why.args) == 1: + code = 0 + else: + code, why = why + if code in (errno.EISCONN, WSAEINVAL): + return 1 + raise + raise TimeoutError('socket connect() timeout.') + + def send(self, data, flags=0): + total = len(data) + next = 0 + while 1: + r, w, e = select.select([],[self.sock], [], self.timeout) + if w: + buff = data[next:next + 8192] + sent = self.sock.send(buff, flags) + next = next + sent + if next == total: + return total + continue + raise TimeoutError('socket send() timeout.') + + def recv(self, amt, flags=0): + if select.select([self.sock], [], [], self.timeout)[0]: + return self.sock.recv(amt, flags) + raise TimeoutError('socket 
recv() timeout.') + + buffsize = 4096 + handles = 1 + + def makefile(self, mode="r", buffsize=-1): + self.handles = self.handles + 1 + self.mode = mode + return self + + def close(self): + self.handles = self.handles - 1 + if self.handles == 0 and self.sock.fileno() >= 0: + self.sock.close() + + def read(self, n=-1): + if not isinstance(n, type(1)): + n = -1 + if n >= 0: + k = len(self._rbuf) + if n <= k: + data = self._rbuf[:n] + self._rbuf = self._rbuf[n:] + return data + n = n - k + L = [self._rbuf] + self._rbuf = "" + while n > 0: + new = self.recv(max(n, self.buffsize)) + if not new: break + k = len(new) + if k > n: + L.append(new[:n]) + self._rbuf = new[n:] + break + L.append(new) + n = n - k + return "".join(L) + k = max(4096, self.buffsize) + L = [self._rbuf] + self._rbuf = "" + while 1: + new = self.recv(k) + if not new: break + L.append(new) + k = min(k*2, 1024**2) + return "".join(L) + + def readline(self, limit=-1): + data = "" + i = self._rbuf.find('\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + new = self.recv(self.buffsize) + if not new: break + i = new.find('\n') + if i >= 0: i = i + len(self._rbuf) + self._rbuf = self._rbuf + new + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + data, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return data + + def readlines(self, sizehint = 0): + total = 0 + list = [] + while 1: + line = self.readline() + if not line: break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + def writelines(self, list): + self.send(''.join(list)) + + def write(self, data): + self.send(data) + + def flush(self): + pass + + +class TimeoutError(Exception): + pass diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/UserTuple.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/UserTuple.py new file mode 100755 index 0000000000000000000000000000000000000000..b8c36539b1632522c1ffeb028d60463a0e85e9d7 --- /dev/null +++ 
b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/UserTuple.py @@ -0,0 +1,99 @@ +""" +A more or less complete user-defined wrapper around tuple objects. +Adapted version of the standard library's UserList. + +Taken from Stefan Schwarzer's ftputil library, available at +<http://www.ndh.net/home/sschwarzer/python/python_software.html>, and used under this license: + + + + +Copyright (C) 1999, Stefan Schwarzer +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +- Neither the name of the above author nor the names of the + contributors to the software may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +""" + + + + +# $Id$ + +#XXX tuple instances (in Python 2.2) contain also: +# __class__, __delattr__, __getattribute__, __hash__, __new__, +# __reduce__, __setattr__, __str__ +# What about these? + +class UserTuple: + def __init__(self, inittuple=None): + self.data = () + if inittuple is not None: + # XXX should this accept an arbitrary sequence? + if type(inittuple) == type(self.data): + self.data = inittuple + elif isinstance(inittuple, UserTuple): + # this results in + # self.data is inittuple.data + # but that's ok for tuples because they are + # immutable. (Builtin tuples behave the same.) + self.data = inittuple.data[:] + else: + # the same applies here; (t is tuple(t)) == 1 + self.data = tuple(inittuple) + def __repr__(self): return repr(self.data) + def __lt__(self, other): return self.data < self.__cast(other) + def __le__(self, other): return self.data <= self.__cast(other) + def __eq__(self, other): return self.data == self.__cast(other) + def __ne__(self, other): return self.data != self.__cast(other) + def __gt__(self, other): return self.data > self.__cast(other) + def __ge__(self, other): return self.data >= self.__cast(other) + def __cast(self, other): + if isinstance(other, UserTuple): return other.data + else: return other + def __cmp__(self, other): + return cmp(self.data, self.__cast(other)) + def __contains__(self, item): return item in self.data + def __len__(self): return len(self.data) + def __getitem__(self, i): return self.data[i] + def __getslice__(self, i, j): + i = max(i, 0); j = max(j, 0) + return self.__class__(self.data[i:j]) + def __add__(self, other): + if isinstance(other, UserTuple): + return self.__class__(self.data + other.data) + elif isinstance(other, type(self.data)): + return self.__class__(self.data + other) + else: + return self.__class__(self.data + tuple(other)) + # dir( () ) contains no __radd__ (at least in Python 2.2) + def __mul__(self, n): + return self.__class__(self.data*n) + __rmul__ = __mul__ + diff --git 
a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Utility.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Utility.py new file mode 100755 index 0000000000000000000000000000000000000000..bbd64e13f59c781e1a91c4bb582a425379e150d0 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/Utility.py @@ -0,0 +1,1348 @@ +# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved. +# +# This software is subject to the provisions of the Zope Public License, +# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +# FOR A PARTICULAR PURPOSE. + +ident = "$Id$" + +import types +import string, httplib, smtplib, urllib, socket, weakref +from os.path import isfile +from string import join, strip, split +from UserDict import UserDict +from cStringIO import StringIO +from TimeoutSocket import TimeoutSocket, TimeoutError +from urlparse import urlparse +from httplib import HTTPConnection, HTTPSConnection +from exceptions import Exception + +import xml.dom.minidom +from xml.dom import Node + +import logging +from c14n import Canonicalize +from Namespaces import SCHEMA, SOAP, XMLNS, ZSI_SCHEMA_URI + + +try: + from xml.dom.ext import SplitQName +except: + def SplitQName(qname): + '''SplitQName(qname) -> (string, string) + + Split Qualified Name into a tuple of len 2, consisting + of the prefix and the local name. 
+ + (prefix, localName) + + Special Cases: + xmlns -- (localName, 'xmlns') + None -- (None, localName) + ''' + + l = qname.split(':') + if len(l) == 1: + l.insert(0, None) + elif len(l) == 2: + if l[0] == 'xmlns': + l.reverse() + else: + return + return tuple(l) + + +class NamespaceError(Exception): + """Used to indicate a Namespace Error.""" + + +class RecursionError(Exception): + """Used to indicate a HTTP redirect recursion.""" + + +class ParseError(Exception): + """Used to indicate a XML parsing error.""" + + +class DOMException(Exception): + """Used to indicate a problem processing DOM.""" + + +class Base: + """Base class for instance level Logging""" + def __init__(self, module=__name__): + self.logger = logging.getLogger('%s-%s(%x)' %(module, self.__class__, id(self))) + + +class HTTPResponse: + """Captures the information in an HTTP response message.""" + + def __init__(self, response): + self.status = response.status + self.reason = response.reason + self.headers = response.msg + self.body = response.read() or None + response.close() + +class TimeoutHTTP(HTTPConnection): + """A custom http connection object that supports socket timeout.""" + def __init__(self, host, port=None, timeout=20): + HTTPConnection.__init__(self, host, port) + self.timeout = timeout + + def connect(self): + self.sock = TimeoutSocket(self.timeout) + self.sock.connect((self.host, self.port)) + + +class TimeoutHTTPS(HTTPSConnection): + """A custom https object that supports socket timeout. Note that this + is not really complete. The builtin SSL support in the Python socket + module requires a real socket (type) to be passed in to be hooked to + SSL. That means our fake socket won't work and our timeout hacks are + bypassed for send and recv calls. 
Since our hack _is_ in place at + connect() time, it should at least provide some timeout protection.""" + def __init__(self, host, port=None, timeout=20, **kwargs): + HTTPSConnection.__init__(self, str(host), port, **kwargs) + self.timeout = timeout + + def connect(self): + sock = TimeoutSocket(self.timeout) + sock.connect((self.host, self.port)) + realsock = getattr(sock.sock, '_sock', sock.sock) + ssl = socket.ssl(realsock, self.key_file, self.cert_file) + self.sock = httplib.FakeSocket(sock, ssl) + +def urlopen(url, timeout=20, redirects=None): + """A minimal urlopen replacement hack that supports timeouts for http. + Note that this supports GET only.""" + scheme, host, path, params, query, frag = urlparse(url) + + if not scheme in ('http', 'https'): + return urllib.urlopen(url) + if params: path = '%s;%s' % (path, params) + if query: path = '%s?%s' % (path, query) + if frag: path = '%s#%s' % (path, frag) + + if scheme == 'https': + # If ssl is not compiled into Python, you will not get an exception + # until a conn.endheaders() call. We need to know sooner, so use + # getattr. + if hasattr(socket, 'ssl'): + conn = TimeoutHTTPS(host, None, timeout) + else: + import M2Crypto + ctx = M2Crypto.SSL.Context() + ctx.set_session_timeout(timeout) + conn = M2Crypto.httpslib.HTTPSConnection(host, ssl_context=ctx) + #conn.set_debuglevel(1) + else: + conn = TimeoutHTTP(host, None, timeout) + + conn.putrequest('GET', path) + conn.putheader('Connection', 'close') + conn.endheaders() + response = None + while 1: + response = conn.getresponse() + if response.status != 100: + break + conn._HTTPConnection__state = httplib._CS_REQ_SENT + conn._HTTPConnection__response = None + + status = response.status + + # If we get an HTTP redirect, we will follow it automatically. 
+ if status >= 300 and status < 400: + location = response.msg.getheader('location') + if location is not None: + response.close() + if redirects is not None and redirects.has_key(location): + raise RecursionError( + 'Circular HTTP redirection detected.' + ) + if redirects is None: + redirects = {} + redirects[location] = 1 + return urlopen(location, timeout, redirects) + raise HTTPResponse(response) + + if not (status >= 200 and status < 300): + raise HTTPResponse(response) + + body = StringIO(response.read()) + response.close() + return body + +class DOM: + """The DOM singleton defines a number of XML related constants and + provides a number of utility methods for DOM related tasks. It + also provides some basic abstractions so that the rest of the + package need not care about actual DOM implementation in use.""" + + # Namespace stuff related to the SOAP specification. + + NS_SOAP_ENV_1_1 = 'http://schemas.xmlsoap.org/soap/envelope/' + NS_SOAP_ENC_1_1 = 'http://schemas.xmlsoap.org/soap/encoding/' + + NS_SOAP_ENV_1_2 = 'http://www.w3.org/2001/06/soap-envelope' + NS_SOAP_ENC_1_2 = 'http://www.w3.org/2001/06/soap-encoding' + + NS_SOAP_ENV_ALL = (NS_SOAP_ENV_1_1, NS_SOAP_ENV_1_2) + NS_SOAP_ENC_ALL = (NS_SOAP_ENC_1_1, NS_SOAP_ENC_1_2) + + NS_SOAP_ENV = NS_SOAP_ENV_1_1 + NS_SOAP_ENC = NS_SOAP_ENC_1_1 + + _soap_uri_mapping = { + NS_SOAP_ENV_1_1 : '1.1', + NS_SOAP_ENV_1_2 : '1.2', + } + + SOAP_ACTOR_NEXT_1_1 = 'http://schemas.xmlsoap.org/soap/actor/next' + SOAP_ACTOR_NEXT_1_2 = 'http://www.w3.org/2001/06/soap-envelope/actor/next' + SOAP_ACTOR_NEXT_ALL = (SOAP_ACTOR_NEXT_1_1, SOAP_ACTOR_NEXT_1_2) + + def SOAPUriToVersion(self, uri): + """Return the SOAP version related to an envelope uri.""" + value = self._soap_uri_mapping.get(uri) + if value is not None: + return value + raise ValueError( + 'Unsupported SOAP envelope uri: %s' % uri + ) + + def GetSOAPEnvUri(self, version): + """Return the appropriate SOAP envelope uri for a given + human-friendly SOAP version string 
(e.g. '1.1').""" + attrname = 'NS_SOAP_ENV_%s' % join(split(version, '.'), '_') + value = getattr(self, attrname, None) + if value is not None: + return value + raise ValueError( + 'Unsupported SOAP version: %s' % version + ) + + def GetSOAPEncUri(self, version): + """Return the appropriate SOAP encoding uri for a given + human-friendly SOAP version string (e.g. '1.1').""" + attrname = 'NS_SOAP_ENC_%s' % join(split(version, '.'), '_') + value = getattr(self, attrname, None) + if value is not None: + return value + raise ValueError( + 'Unsupported SOAP version: %s' % version + ) + + def GetSOAPActorNextUri(self, version): + """Return the right special next-actor uri for a given + human-friendly SOAP version string (e.g. '1.1').""" + attrname = 'SOAP_ACTOR_NEXT_%s' % join(split(version, '.'), '_') + value = getattr(self, attrname, None) + if value is not None: + return value + raise ValueError( + 'Unsupported SOAP version: %s' % version + ) + + + # Namespace stuff related to XML Schema. + + NS_XSD_99 = 'http://www.w3.org/1999/XMLSchema' + NS_XSI_99 = 'http://www.w3.org/1999/XMLSchema-instance' + + NS_XSD_00 = 'http://www.w3.org/2000/10/XMLSchema' + NS_XSI_00 = 'http://www.w3.org/2000/10/XMLSchema-instance' + + NS_XSD_01 = 'http://www.w3.org/2001/XMLSchema' + NS_XSI_01 = 'http://www.w3.org/2001/XMLSchema-instance' + + NS_XSD_ALL = (NS_XSD_99, NS_XSD_00, NS_XSD_01) + NS_XSI_ALL = (NS_XSI_99, NS_XSI_00, NS_XSI_01) + + NS_XSD = NS_XSD_01 + NS_XSI = NS_XSI_01 + + _xsd_uri_mapping = { + NS_XSD_99 : NS_XSI_99, + NS_XSD_00 : NS_XSI_00, + NS_XSD_01 : NS_XSI_01, + } + + for key, value in _xsd_uri_mapping.items(): + _xsd_uri_mapping[value] = key + + + def InstanceUriForSchemaUri(self, uri): + """Return the appropriate matching XML Schema instance uri for + the given XML Schema namespace uri.""" + return self._xsd_uri_mapping.get(uri) + + def SchemaUriForInstanceUri(self, uri): + """Return the appropriate matching XML Schema namespace uri for + the given XML Schema instance 
namespace uri.""" + return self._xsd_uri_mapping.get(uri) + + + # Namespace stuff related to WSDL. + + NS_WSDL_1_1 = 'http://schemas.xmlsoap.org/wsdl/' + NS_WSDL_ALL = (NS_WSDL_1_1,) + NS_WSDL = NS_WSDL_1_1 + + NS_SOAP_BINDING_1_1 = 'http://schemas.xmlsoap.org/wsdl/soap/' + NS_HTTP_BINDING_1_1 = 'http://schemas.xmlsoap.org/wsdl/http/' + NS_MIME_BINDING_1_1 = 'http://schemas.xmlsoap.org/wsdl/mime/' + + NS_SOAP_BINDING_ALL = (NS_SOAP_BINDING_1_1,) + NS_HTTP_BINDING_ALL = (NS_HTTP_BINDING_1_1,) + NS_MIME_BINDING_ALL = (NS_MIME_BINDING_1_1,) + + NS_SOAP_BINDING = NS_SOAP_BINDING_1_1 + NS_HTTP_BINDING = NS_HTTP_BINDING_1_1 + NS_MIME_BINDING = NS_MIME_BINDING_1_1 + + NS_SOAP_HTTP_1_1 = 'http://schemas.xmlsoap.org/soap/http' + NS_SOAP_HTTP_ALL = (NS_SOAP_HTTP_1_1,) + NS_SOAP_HTTP = NS_SOAP_HTTP_1_1 + + + _wsdl_uri_mapping = { + NS_WSDL_1_1 : '1.1', + } + + def WSDLUriToVersion(self, uri): + """Return the WSDL version related to a WSDL namespace uri.""" + value = self._wsdl_uri_mapping.get(uri) + if value is not None: + return value + raise ValueError( + 'Unsupported SOAP envelope uri: %s' % uri + ) + + def GetWSDLUri(self, version): + attr = 'NS_WSDL_%s' % join(split(version, '.'), '_') + value = getattr(self, attr, None) + if value is not None: + return value + raise ValueError( + 'Unsupported WSDL version: %s' % version + ) + + def GetWSDLSoapBindingUri(self, version): + attr = 'NS_SOAP_BINDING_%s' % join(split(version, '.'), '_') + value = getattr(self, attr, None) + if value is not None: + return value + raise ValueError( + 'Unsupported WSDL version: %s' % version + ) + + def GetWSDLHttpBindingUri(self, version): + attr = 'NS_HTTP_BINDING_%s' % join(split(version, '.'), '_') + value = getattr(self, attr, None) + if value is not None: + return value + raise ValueError( + 'Unsupported WSDL version: %s' % version + ) + + def GetWSDLMimeBindingUri(self, version): + attr = 'NS_MIME_BINDING_%s' % join(split(version, '.'), '_') + value = getattr(self, attr, None) + if value 
is not None: + return value + raise ValueError( + 'Unsupported WSDL version: %s' % version + ) + + def GetWSDLHttpTransportUri(self, version): + attr = 'NS_SOAP_HTTP_%s' % join(split(version, '.'), '_') + value = getattr(self, attr, None) + if value is not None: + return value + raise ValueError( + 'Unsupported WSDL version: %s' % version + ) + + + # Other xml namespace constants. + NS_XMLNS = 'http://www.w3.org/2000/xmlns/' + + + + def isElement(self, node, name, nsuri=None): + """Return true if the given node is an element with the given + name and optional namespace uri.""" + if node.nodeType != node.ELEMENT_NODE: + return 0 + return node.localName == name and \ + (nsuri is None or self.nsUriMatch(node.namespaceURI, nsuri)) + + def getElement(self, node, name, nsuri=None, default=join): + """Return the first child of node with a matching name and + namespace uri, or the default if one is provided.""" + nsmatch = self.nsUriMatch + ELEMENT_NODE = node.ELEMENT_NODE + for child in node.childNodes: + if child.nodeType == ELEMENT_NODE: + if ((child.localName == name or name is None) and + (nsuri is None or nsmatch(child.namespaceURI, nsuri)) + ): + return child + if default is not join: + return default + raise KeyError, name + + def getElementById(self, node, id, default=join): + """Return the first child of node matching an id reference.""" + attrget = self.getAttr + ELEMENT_NODE = node.ELEMENT_NODE + for child in node.childNodes: + if child.nodeType == ELEMENT_NODE: + if attrget(child, 'id') == id: + return child + if default is not join: + return default + raise KeyError, id + + def getMappingById(self, document, depth=None, element=None, + mapping=None, level=1): + """Create an id -> element mapping of those elements within a + document that define an id attribute.
The depth of the search + may be controlled by using the (1-based) depth argument.""" + if document is not None: + element = document.documentElement + mapping = {} + attr = element._attrs.get('id', None) + if attr is not None: + mapping[attr.value] = element + if depth is None or depth > level: + level = level + 1 + ELEMENT_NODE = element.ELEMENT_NODE + for child in element.childNodes: + if child.nodeType == ELEMENT_NODE: + self.getMappingById(None, depth, child, mapping, level) + return mapping + + def getElements(self, node, name, nsuri=None): + """Return a sequence of the child elements of the given node that + match the given name and optional namespace uri.""" + nsmatch = self.nsUriMatch + result = [] + ELEMENT_NODE = node.ELEMENT_NODE + for child in node.childNodes: + if child.nodeType == ELEMENT_NODE: + if ((child.localName == name or name is None) and ( + (nsuri is None) or nsmatch(child.namespaceURI, nsuri))): + result.append(child) + return result + + def hasAttr(self, node, name, nsuri=None): + """Return true if element has attribute with the given name and + optional nsuri. If nsuri is not specified, returns true if an + attribute exists with the given name with any namespace.""" + if nsuri is None: + if node.hasAttribute(name): + return True + return False + return node.hasAttributeNS(nsuri, name) + + def getAttr(self, node, name, nsuri=None, default=join): + """Return the value of the attribute named 'name' with the + optional nsuri, or the default if one is specified. 
If + nsuri is not specified, an attribute that matches the + given name will be returned regardless of namespace.""" + if nsuri is None: + result = node._attrs.get(name, None) + if result is None: + for item in node._attrsNS.keys(): + if item[1] == name: + result = node._attrsNS[item] + break + else: + result = node._attrsNS.get((nsuri, name), None) + if result is not None: + return result.value + if default is not join: + return default + return '' + + def getAttrs(self, node): + """Return a Collection of all attributes + """ + attrs = {} + for k,v in node._attrs.items(): + attrs[k] = v.value + return attrs + + def getElementText(self, node, preserve_ws=None): + """Return the text value of an xml element node. Leading and trailing + whitespace is stripped from the value unless the preserve_ws flag + is passed with a true value.""" + result = [] + for child in node.childNodes: + nodetype = child.nodeType + if nodetype == child.TEXT_NODE or \ + nodetype == child.CDATA_SECTION_NODE: + result.append(child.nodeValue) + value = join(result, '') + if preserve_ws is None: + value = strip(value) + return value + + def findNamespaceURI(self, prefix, node): + """Find a namespace uri given a prefix and a context node.""" + attrkey = (self.NS_XMLNS, prefix) + DOCUMENT_NODE = node.DOCUMENT_NODE + ELEMENT_NODE = node.ELEMENT_NODE + while 1: + if node is None: + raise DOMException('Value for prefix %s not found.' % prefix) + if node.nodeType != ELEMENT_NODE: + node = node.parentNode + continue + result = node._attrsNS.get(attrkey, None) + if result is not None: + return result.value + if hasattr(node, '__imported__'): + raise DOMException('Value for prefix %s not found.' % prefix) + node = node.parentNode + if node.nodeType == DOCUMENT_NODE: + raise DOMException('Value for prefix %s not found.' 
% prefix) + + def findDefaultNS(self, node): + """Return the current default namespace uri for the given node.""" + attrkey = (self.NS_XMLNS, 'xmlns') + DOCUMENT_NODE = node.DOCUMENT_NODE + ELEMENT_NODE = node.ELEMENT_NODE + while 1: + if node.nodeType != ELEMENT_NODE: + node = node.parentNode + continue + result = node._attrsNS.get(attrkey, None) + if result is not None: + return result.value + if hasattr(node, '__imported__'): + raise DOMException('Cannot determine default namespace.') + node = node.parentNode + if node.nodeType == DOCUMENT_NODE: + raise DOMException('Cannot determine default namespace.') + + def findTargetNS(self, node): + """Return the defined target namespace uri for the given node.""" + attrget = self.getAttr + attrkey = (self.NS_XMLNS, 'xmlns') + DOCUMENT_NODE = node.DOCUMENT_NODE + ELEMENT_NODE = node.ELEMENT_NODE + while 1: + if node.nodeType != ELEMENT_NODE: + node = node.parentNode + continue + result = attrget(node, 'targetNamespace', default=None) + if result is not None: + return result + node = node.parentNode + if node.nodeType == DOCUMENT_NODE: + raise DOMException('Cannot determine target namespace.') + + def getTypeRef(self, element): + """Return (namespaceURI, name) for a type attribue of the given + element, or None if the element does not have a type attribute.""" + typeattr = self.getAttr(element, 'type', default=None) + if typeattr is None: + return None + parts = typeattr.split(':', 1) + if len(parts) == 2: + nsuri = self.findNamespaceURI(parts[0], element) + else: + nsuri = self.findDefaultNS(element) + return (nsuri, parts[1]) + + def importNode(self, document, node, deep=0): + """Implements (well enough for our purposes) DOM node import.""" + nodetype = node.nodeType + if nodetype in (node.DOCUMENT_NODE, node.DOCUMENT_TYPE_NODE): + raise DOMException('Illegal node type for importNode') + if nodetype == node.ENTITY_REFERENCE_NODE: + deep = 0 + clone = node.cloneNode(deep) + self._setOwnerDoc(document, clone) + 
clone.__imported__ = 1 + return clone + + def _setOwnerDoc(self, document, node): + node.ownerDocument = document + for child in node.childNodes: + self._setOwnerDoc(document, child) + + def nsUriMatch(self, value, wanted, strict=0, tt=type(())): + """Return a true value if two namespace uri values match.""" + if value == wanted or (type(wanted) is tt) and value in wanted: + return 1 + if not strict: + wanted = type(wanted) is tt and wanted or (wanted,) + value = value[-1:] != '/' and value or value[:-1] + for item in wanted: + if item == value or item[:-1] == value: + return 1 + return 0 + + def createDocument(self, nsuri, qname, doctype=None): + """Create a new writable DOM document object.""" + impl = xml.dom.minidom.getDOMImplementation() + return impl.createDocument(nsuri, qname, doctype) + + def loadDocument(self, data): + """Load an xml file from a file-like object and return a DOM + document instance.""" + return xml.dom.minidom.parse(data) + + def loadFromURL(self, url): + """Load an xml file from a URL and return a DOM document.""" + if isfile(url) is True: + file = open(url, 'r') + else: + file = urlopen(url) + + try: + result = self.loadDocument(file) + except Exception, ex: + file.close() + raise ParseError(('Failed to load document %s' %url,) + ex.args) + else: + file.close() + return result + +DOM = DOM() + + +class MessageInterface: + '''Higher Level Interface, delegates to DOM singleton, must + be subclassed and implement all methods that throw NotImplementedError. + ''' + def __init__(self, sw): + '''Constructor, May be extended, do not override. 
+ sw -- soapWriter instance + ''' + self.sw = None + if type(sw) != weakref.ReferenceType and sw is not None: + self.sw = weakref.ref(sw) + else: + self.sw = sw + + def AddCallback(self, func, *arglist): + self.sw().AddCallback(func, *arglist) + + def Known(self, obj): + return self.sw().Known(obj) + + def Forget(self, obj): + return self.sw().Forget(obj) + + def canonicalize(self): + '''canonicalize the underlying DOM, and return as string. + ''' + raise NotImplementedError, '' + + def createDocument(self, namespaceURI=SOAP.ENV, localName='Envelope'): + '''create Document + ''' + raise NotImplementedError, '' + + def createAppendElement(self, namespaceURI, localName): + '''create and append element(namespaceURI,localName), and return + the node. + ''' + raise NotImplementedError, '' + + def findNamespaceURI(self, qualifiedName): + raise NotImplementedError, '' + + def resolvePrefix(self, prefix): + raise NotImplementedError, '' + + def setAttributeNS(self, namespaceURI, localName, value): + '''set attribute (namespaceURI, localName)=value + ''' + raise NotImplementedError, '' + + def setAttributeType(self, namespaceURI, localName): + '''set attribute xsi:type=(namespaceURI, localName) + ''' + raise NotImplementedError, '' + + def setNamespaceAttribute(self, namespaceURI, prefix): + '''set namespace attribute xmlns:prefix=namespaceURI + ''' + raise NotImplementedError, '' + + +class ElementProxy(Base, MessageInterface): + ''' + ''' + _soap_env_prefix = 'SOAP-ENV' + _soap_enc_prefix = 'SOAP-ENC' + _zsi_prefix = 'ZSI' + _xsd_prefix = 'xsd' + _xsi_prefix = 'xsi' + _xml_prefix = 'xml' + _xmlns_prefix = 'xmlns' + + _soap_env_nsuri = SOAP.ENV + _soap_enc_nsuri = SOAP.ENC + _zsi_nsuri = ZSI_SCHEMA_URI + _xsd_nsuri = SCHEMA.XSD3 + _xsi_nsuri = SCHEMA.XSI3 + _xml_nsuri = XMLNS.XML + _xmlns_nsuri = XMLNS.BASE + + standard_ns = {\ + _xml_prefix:_xml_nsuri, + _xmlns_prefix:_xmlns_nsuri + } + reserved_ns = {\ + _soap_env_prefix:_soap_env_nsuri, + 
_soap_enc_prefix:_soap_enc_nsuri, + _zsi_prefix:_zsi_nsuri, + _xsd_prefix:_xsd_nsuri, + _xsi_prefix:_xsi_nsuri, + } + name = None + namespaceURI = None + + def __init__(self, sw, message=None): + '''Initialize. + sw -- SoapWriter + ''' + self._indx = 0 + MessageInterface.__init__(self, sw) + Base.__init__(self) + self._dom = DOM + self.node = None + if type(message) in (types.StringType,types.UnicodeType): + self.loadFromString(message) + elif isinstance(message, ElementProxy): + self.node = message._getNode() + else: + self.node = message + self.processorNss = self.standard_ns.copy() + self.processorNss.update(self.reserved_ns) + + def __str__(self): + return self.toString() + + def evaluate(self, expression, processorNss=None): + '''expression -- XPath compiled expression + ''' + from Ft.Xml import XPath + if not processorNss: + context = XPath.Context.Context(self.node, processorNss=self.processorNss) + else: + context = XPath.Context.Context(self.node, processorNss=processorNss) + nodes = expression.evaluate(context) + return map(lambda node: ElementProxy(self.sw,node), nodes) + + ############################################# + # Methods for checking/setting the + # classes (namespaceURI,name) node. 
+ ############################################# + def checkNode(self, namespaceURI=None, localName=None): + ''' + namespaceURI -- namespace of element + localName -- local name of element + ''' + namespaceURI = namespaceURI or self.namespaceURI + localName = localName or self.name + check = False + if localName and self.node: + check = self._dom.isElement(self.node, localName, namespaceURI) + if not check: + raise NamespaceError, 'unexpected node type %s, expecting %s' %(self.node, localName) + + def setNode(self, node=None): + if node: + if isinstance(node, ElementProxy): + self.node = node._getNode() + else: + self.node = node + elif self.node: + node = self._dom.getElement(self.node, self.name, self.namespaceURI, default=None) + if not node: + raise NamespaceError, 'cant find element (%s,%s)' %(self.namespaceURI,self.name) + self.node = node + else: + #self.node = self._dom.create(self.node, self.name, self.namespaceURI, default=None) + self.createDocument(self.namespaceURI, localName=self.name, doctype=None) + + self.checkNode() + + ############################################# + # Wrapper Methods for direct DOM Element Node access + ############################################# + def _getNode(self): + return self.node + + def _getElements(self): + return self._dom.getElements(self.node, name=None) + + def _getOwnerDocument(self): + return self.node.ownerDocument or self.node + + def _getUniquePrefix(self): + '''I guess we need to resolve all potential prefixes + because when the current node is attached it copies the + namespaces into the parent node. 
+ ''' + while 1: + self._indx += 1 + prefix = 'ns%d' %self._indx + try: + self._dom.findNamespaceURI(prefix, self._getNode()) + except DOMException, ex: + break + return prefix + + def _getPrefix(self, node, nsuri): + ''' + Keyword arguments: + node -- DOM Element Node + nsuri -- namespace of attribute value + ''' + try: + if node and (node.nodeType == node.ELEMENT_NODE) and \ + (nsuri == self._dom.findDefaultNS(node)): + return None + except DOMException, ex: + pass + if nsuri == XMLNS.XML: + return self._xml_prefix + if node.nodeType == Node.ELEMENT_NODE: + for attr in node.attributes.values(): + if attr.namespaceURI == XMLNS.BASE \ + and nsuri == attr.value: + return attr.localName + else: + if node.parentNode: + return self._getPrefix(node.parentNode, nsuri) + raise NamespaceError, 'namespaceURI "%s" is not defined' %nsuri + + def _appendChild(self, node): + ''' + Keyword arguments: + node -- DOM Element Node + ''' + if node is None: + raise TypeError, 'node is None' + self.node.appendChild(node) + + def _insertBefore(self, newChild, refChild): + ''' + Keyword arguments: + child -- DOM Element Node to insert + refChild -- DOM Element Node + ''' + self.node.insertBefore(newChild, refChild) + + def _setAttributeNS(self, namespaceURI, qualifiedName, value): + ''' + Keyword arguments: + namespaceURI -- namespace of attribute + qualifiedName -- qualified name of new attribute value + value -- value of attribute + ''' + self.node.setAttributeNS(namespaceURI, qualifiedName, value) + + ############################################# + #General Methods + ############################################# + def isFault(self): + '''check to see if this is a soap:fault message. 
+ ''' + return False + + def getPrefix(self, namespaceURI): + try: + prefix = self._getPrefix(node=self.node, nsuri=namespaceURI) + except NamespaceError, ex: + prefix = self._getUniquePrefix() + self.setNamespaceAttribute(prefix, namespaceURI) + return prefix + + def getDocument(self): + return self._getOwnerDocument() + + def setDocument(self, document): + self.node = document + + def importFromString(self, xmlString): + doc = self._dom.loadDocument(StringIO(xmlString)) + node = self._dom.getElement(doc, name=None) + clone = self.importNode(node) + self._appendChild(clone) + + def importNode(self, node): + if isinstance(node, ElementProxy): + node = node._getNode() + return self._dom.importNode(self._getOwnerDocument(), node, deep=1) + + def loadFromString(self, data): + self.node = self._dom.loadDocument(StringIO(data)) + + def canonicalize(self): + return Canonicalize(self.node) + + def toString(self): + return self.canonicalize() + + def createDocument(self, namespaceURI, localName, doctype=None): + '''If specified must be a SOAP envelope, else may contruct an empty document. 
+ ''' + prefix = self._soap_env_prefix + + if namespaceURI == self.reserved_ns[prefix]: + qualifiedName = '%s:%s' %(prefix,localName) + elif namespaceURI is localName is None: + self.node = self._dom.createDocument(None,None,None) + return + else: + raise KeyError, 'only support creation of document in %s' %self.reserved_ns[prefix] + + document = self._dom.createDocument(nsuri=namespaceURI, qname=qualifiedName, doctype=doctype) + self.node = document.childNodes[0] + + #set up reserved namespace attributes + for prefix,nsuri in self.reserved_ns.items(): + self._setAttributeNS(namespaceURI=self._xmlns_nsuri, + qualifiedName='%s:%s' %(self._xmlns_prefix,prefix), + value=nsuri) + + ############################################# + #Methods for attributes + ############################################# + def hasAttribute(self, namespaceURI, localName): + return self._dom.hasAttr(self._getNode(), name=localName, nsuri=namespaceURI) + + def setAttributeType(self, namespaceURI, localName): + '''set xsi:type + Keyword arguments: + namespaceURI -- namespace of attribute value + localName -- name of new attribute value + + ''' + self.logger.debug('setAttributeType: (%s,%s)', namespaceURI, localName) + value = localName + if namespaceURI: + value = '%s:%s' %(self.getPrefix(namespaceURI),localName) + + xsi_prefix = self.getPrefix(self._xsi_nsuri) + self._setAttributeNS(self._xsi_nsuri, '%s:type' %xsi_prefix, value) + + def createAttributeNS(self, namespace, name, value): + document = self._getOwnerDocument() + attrNode = document.createAttributeNS(namespace, name, value) + + def setAttributeNS(self, namespaceURI, localName, value): + ''' + Keyword arguments: + namespaceURI -- namespace of attribute to create, None is for + attributes in no namespace. 
+ localName -- local name of new attribute + value -- value of new attribute + ''' + prefix = None + if namespaceURI: + try: + prefix = self.getPrefix(namespaceURI) + except KeyError, ex: + prefix = 'ns2' + self.setNamespaceAttribute(prefix, namespaceURI) + qualifiedName = localName + if prefix: + qualifiedName = '%s:%s' %(prefix, localName) + self._setAttributeNS(namespaceURI, qualifiedName, value) + + def setNamespaceAttribute(self, prefix, namespaceURI): + ''' + Keyword arguments: + prefix -- xmlns prefix + namespaceURI -- value of prefix + ''' + self._setAttributeNS(XMLNS.BASE, 'xmlns:%s' %prefix, namespaceURI) + + ############################################# + #Methods for elements + ############################################# + def createElementNS(self, namespace, qname): + ''' + Keyword arguments: + namespace -- namespace of element to create + qname -- qualified name of new element + ''' + document = self._getOwnerDocument() + node = document.createElementNS(namespace, qname) + return ElementProxy(self.sw, node) + + def createAppendSetElement(self, namespaceURI, localName, prefix=None): + '''Create a new element (namespaceURI,name), append it + to current node, then set it to be the current node. + Keyword arguments: + namespaceURI -- namespace of element to create + localName -- local name of new element + prefix -- if namespaceURI is not defined, declare prefix. defaults + to 'ns1' if left unspecified. + ''' + node = self.createAppendElement(namespaceURI, localName, prefix=None) + node=node._getNode() + self._setNode(node._getNode()) + + def createAppendElement(self, namespaceURI, localName, prefix=None): + '''Create a new element (namespaceURI,name), append it + to current node, and return the newly created node. + Keyword arguments: + namespaceURI -- namespace of element to create + localName -- local name of new element + prefix -- if namespaceURI is not defined, declare prefix. defaults + to 'ns1' if left unspecified. 
+ ''' + declare = False + qualifiedName = localName + if namespaceURI: + try: + prefix = self.getPrefix(namespaceURI) + except: + declare = True + prefix = prefix or self._getUniquePrefix() + if prefix: + qualifiedName = '%s:%s' %(prefix, localName) + node = self.createElementNS(namespaceURI, qualifiedName) + if declare: + node._setAttributeNS(XMLNS.BASE, 'xmlns:%s' %prefix, namespaceURI) + self._appendChild(node=node._getNode()) + return node + + def createInsertBefore(self, namespaceURI, localName, refChild): + qualifiedName = localName + prefix = self.getPrefix(namespaceURI) + if prefix: + qualifiedName = '%s:%s' %(prefix, localName) + node = self.createElementNS(namespaceURI, qualifiedName) + self._insertBefore(newChild=node._getNode(), refChild=refChild._getNode()) + return node + + def getElement(self, namespaceURI, localName): + ''' + Keyword arguments: + namespaceURI -- namespace of element + localName -- local name of element + ''' + node = self._dom.getElement(self.node, localName, namespaceURI, default=None) + if node: + return ElementProxy(self.sw, node) + return None + + def getAttributeValue(self, namespaceURI, localName): + ''' + Keyword arguments: + namespaceURI -- namespace of attribute + localName -- local name of attribute + ''' + if self.hasAttribute(namespaceURI, localName): + attr = self.node.getAttributeNodeNS(namespaceURI,localName) + return attr.value + return None + + def getValue(self): + return self._dom.getElementText(self.node, preserve_ws=True) + + ############################################# + #Methods for text nodes + ############################################# + def createAppendTextNode(self, pyobj): + node = self.createTextNode(pyobj) + self._appendChild(node=node._getNode()) + return node + + def createTextNode(self, pyobj): + document = self._getOwnerDocument() + node = document.createTextNode(pyobj) + return ElementProxy(self.sw, node) + + ############################################# + #Methods for retrieving namespaceURI's 
+ ############################################# + def findNamespaceURI(self, qualifiedName): + parts = SplitQName(qualifiedName) + element = self._getNode() + if len(parts) == 1: + return (self._dom.findTargetNS(element), value) + return self._dom.findNamespaceURI(parts[0], element) + + def resolvePrefix(self, prefix): + element = self._getNode() + return self._dom.findNamespaceURI(prefix, element) + + def getSOAPEnvURI(self): + return self._soap_env_nsuri + + def isEmpty(self): + return not self.node + + + +class Collection(UserDict): + """Helper class for maintaining ordered named collections.""" + default = lambda self,k: k.name + def __init__(self, parent, key=None): + UserDict.__init__(self) + self.parent = weakref.ref(parent) + self.list = [] + self._func = key or self.default + + def __getitem__(self, key): + if type(key) is type(1): + return self.list[key] + return self.data[key] + + def __setitem__(self, key, item): + item.parent = weakref.ref(self) + self.list.append(item) + self.data[key] = item + + def keys(self): + return map(lambda i: self._func(i), self.list) + + def items(self): + return map(lambda i: (self._func(i), i), self.list) + + def values(self): + return self.list + + +class CollectionNS(UserDict): + """Helper class for maintaining ordered named collections.""" + default = lambda self,k: k.name + def __init__(self, parent, key=None): + UserDict.__init__(self) + self.parent = weakref.ref(parent) + self.targetNamespace = None + self.list = [] + self._func = key or self.default + + def __getitem__(self, key): + self.targetNamespace = self.parent().targetNamespace + if type(key) is types.IntType: + return self.list[key] + elif self.__isSequence(key): + nsuri,name = key + return self.data[nsuri][name] + return self.data[self.parent().targetNamespace][key] + + def __setitem__(self, key, item): + item.parent = weakref.ref(self) + self.list.append(item) + targetNamespace = getattr(item, 'targetNamespace', self.parent().targetNamespace) + if not 
self.data.has_key(targetNamespace): + self.data[targetNamespace] = {} + self.data[targetNamespace][key] = item + + def __isSequence(self, key): + return (type(key) in (types.TupleType,types.ListType) and len(key) == 2) + + def keys(self): + keys = [] + for tns in self.data.keys(): + keys.append(map(lambda i: (tns,self._func(i)), self.data[tns].values())) + return keys + + def items(self): + return map(lambda i: (self._func(i), i), self.list) + + def values(self): + return self.list + + + +# This is a runtime guerilla patch for pulldom (used by minidom) so +# that xml namespace declaration attributes are not lost in parsing. +# We need them to do correct QName linking for XML Schema and WSDL. +# The patch has been submitted to SF for the next Python version. + +from xml.dom.pulldom import PullDOM, START_ELEMENT +if 1: + def startPrefixMapping(self, prefix, uri): + if not hasattr(self, '_xmlns_attrs'): + self._xmlns_attrs = [] + self._xmlns_attrs.append((prefix or 'xmlns', uri)) + self._ns_contexts.append(self._current_context.copy()) + self._current_context[uri] = prefix or '' + + PullDOM.startPrefixMapping = startPrefixMapping + + def startElementNS(self, name, tagName , attrs): + # Retrieve xml namespace declaration attributes. + xmlns_uri = 'http://www.w3.org/2000/xmlns/' + xmlns_attrs = getattr(self, '_xmlns_attrs', None) + if xmlns_attrs is not None: + for aname, value in xmlns_attrs: + attrs._attrs[(xmlns_uri, aname)] = value + self._xmlns_attrs = [] + uri, localname = name + if uri: + # When using namespaces, the reader may or may not + # provide us with the original name. If not, create + # *a* valid tagName from the current context. 
+ if tagName is None: + prefix = self._current_context[uri] + if prefix: + tagName = prefix + ":" + localname + else: + tagName = localname + if self.document: + node = self.document.createElementNS(uri, tagName) + else: + node = self.buildDocument(uri, tagName) + else: + # When the tagname is not prefixed, it just appears as + # localname + if self.document: + node = self.document.createElement(localname) + else: + node = self.buildDocument(None, localname) + + for aname,value in attrs.items(): + a_uri, a_localname = aname + if a_uri == xmlns_uri: + if a_localname == 'xmlns': + qname = a_localname + else: + qname = 'xmlns:' + a_localname + attr = self.document.createAttributeNS(a_uri, qname) + node.setAttributeNodeNS(attr) + elif a_uri: + prefix = self._current_context[a_uri] + if prefix: + qname = prefix + ":" + a_localname + else: + qname = a_localname + attr = self.document.createAttributeNS(a_uri, qname) + node.setAttributeNodeNS(attr) + else: + attr = self.document.createAttribute(a_localname) + node.setAttributeNode(attr) + attr.value = value + + self.lastEvent[1] = [(START_ELEMENT, node), None] + self.lastEvent = self.lastEvent[1] + self.push(node) + + PullDOM.startElementNS = startElementNS + +# +# This is a runtime guerilla patch for minidom so +# that xmlns prefixed attributes dont raise AttributeErrors +# during cloning. +# +# Namespace declarations can appear in any start-tag, must look for xmlns +# prefixed attribute names during cloning. +# +# key (attr.namespaceURI, tag) +# ('http://www.w3.org/2000/xmlns/', u'xsd') <xml.dom.minidom.Attr instance at 0x82227c4> +# ('http://www.w3.org/2000/xmlns/', 'xmlns') <xml.dom.minidom.Attr instance at 0x8414b3c> +# +# xml.dom.minidom.Attr.nodeName = xmlns:xsd +# xml.dom.minidom.Attr.value = = http://www.w3.org/2001/XMLSchema + +if 1: + def _clone_node(node, deep, newOwnerDocument): + """ + Clone a node and give it the new owner document. 
+ Called by Node.cloneNode and Document.importNode + """ + if node.ownerDocument.isSameNode(newOwnerDocument): + operation = xml.dom.UserDataHandler.NODE_CLONED + else: + operation = xml.dom.UserDataHandler.NODE_IMPORTED + if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE: + clone = newOwnerDocument.createElementNS(node.namespaceURI, + node.nodeName) + for attr in node.attributes.values(): + clone.setAttributeNS(attr.namespaceURI, attr.nodeName, attr.value) + + prefix, tag = xml.dom.minidom._nssplit(attr.nodeName) + if prefix == 'xmlns': + a = clone.getAttributeNodeNS(attr.namespaceURI, tag) + elif prefix: + a = clone.getAttributeNodeNS(attr.namespaceURI, tag) + else: + a = clone.getAttributeNodeNS(attr.namespaceURI, attr.nodeName) + a.specified = attr.specified + + if deep: + for child in node.childNodes: + c = xml.dom.minidom._clone_node(child, deep, newOwnerDocument) + clone.appendChild(c) + elif node.nodeType == xml.dom.minidom.Node.DOCUMENT_FRAGMENT_NODE: + clone = newOwnerDocument.createDocumentFragment() + if deep: + for child in node.childNodes: + c = xml.dom.minidom._clone_node(child, deep, newOwnerDocument) + clone.appendChild(c) + + elif node.nodeType == xml.dom.minidom.Node.TEXT_NODE: + clone = newOwnerDocument.createTextNode(node.data) + elif node.nodeType == xml.dom.minidom.Node.CDATA_SECTION_NODE: + clone = newOwnerDocument.createCDATASection(node.data) + elif node.nodeType == xml.dom.minidom.Node.PROCESSING_INSTRUCTION_NODE: + clone = newOwnerDocument.createProcessingInstruction(node.target, + node.data) + elif node.nodeType == xml.dom.minidom.Node.COMMENT_NODE: + clone = newOwnerDocument.createComment(node.data) + elif node.nodeType == xml.dom.minidom.Node.ATTRIBUTE_NODE: + clone = newOwnerDocument.createAttributeNS(node.namespaceURI, + node.nodeName) + clone.specified = True + clone.value = node.value + elif node.nodeType == xml.dom.minidom.Node.DOCUMENT_TYPE_NODE: + assert node.ownerDocument is not newOwnerDocument + operation = 
xml.dom.UserDataHandler.NODE_IMPORTED + clone = newOwnerDocument.implementation.createDocumentType( + node.name, node.publicId, node.systemId) + clone.ownerDocument = newOwnerDocument + if deep: + clone.entities._seq = [] + clone.notations._seq = [] + for n in node.notations._seq: + notation = xml.dom.minidom.Notation(n.nodeName, n.publicId, n.systemId) + notation.ownerDocument = newOwnerDocument + clone.notations._seq.append(notation) + if hasattr(n, '_call_user_data_handler'): + n._call_user_data_handler(operation, n, notation) + for e in node.entities._seq: + entity = xml.dom.minidom.Entity(e.nodeName, e.publicId, e.systemId, + e.notationName) + entity.actualEncoding = e.actualEncoding + entity.encoding = e.encoding + entity.version = e.version + entity.ownerDocument = newOwnerDocument + clone.entities._seq.append(entity) + if hasattr(e, '_call_user_data_handler'): + e._call_user_data_handler(operation, n, entity) + else: + # Note the cloning of Document and DocumentType nodes is + # implemenetation specific. minidom handles those cases + # directly in the cloneNode() methods. + raise xml.dom.NotSupportedErr("Cannot clone node %s" % repr(node)) + + # Check for _call_user_data_handler() since this could conceivably + # used with other DOM implementations (one of the FourThought + # DOMs, perhaps?). + if hasattr(node, '_call_user_data_handler'): + node._call_user_data_handler(operation, node, clone) + return clone + + xml.dom.minidom._clone_node = _clone_node + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/WSDLTools.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/WSDLTools.py new file mode 100755 index 0000000000000000000000000000000000000000..f864bf9988ca2ef0f4764bf46a33a2e46034fddd --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/WSDLTools.py @@ -0,0 +1,1602 @@ +# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved. +# +# This software is subject to the provisions of the Zope Public License, +# Version 2.0 (ZPL). 
A copy of the ZPL should accompany this distribution. +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +# FOR A PARTICULAR PURPOSE. + +ident = "$Id$" + +import urllib, weakref +from cStringIO import StringIO +from Namespaces import OASIS, XMLNS, WSA200408, WSA200403, WSA200303 +from Utility import Collection, CollectionNS, DOM, ElementProxy +from XMLSchema import XMLSchema, SchemaReader, WSDLToolsAdapter + + +class WSDLReader: + """A WSDLReader creates WSDL instances from urls and xml data.""" + + # Custom subclasses of WSDLReader may wish to implement a caching + # strategy or other optimizations. Because application needs vary + # so widely, we don't try to provide any caching by default. + + def loadFromStream(self, stream, name=None): + """Return a WSDL instance loaded from a stream object.""" + document = DOM.loadDocument(stream) + wsdl = WSDL() + if name: + wsdl.location = name + elif hasattr(stream, 'name'): + wsdl.location = stream.name + wsdl.load(document) + return wsdl + + def loadFromURL(self, url): + """Return a WSDL instance loaded from the given url.""" + document = DOM.loadFromURL(url) + wsdl = WSDL() + wsdl.location = url + wsdl.load(document) + return wsdl + + def loadFromString(self, data): + """Return a WSDL instance loaded from an xml string.""" + return self.loadFromStream(StringIO(data)) + + def loadFromFile(self, filename): + """Return a WSDL instance loaded from the given file.""" + file = open(filename, 'rb') + try: + wsdl = self.loadFromStream(file) + finally: + file.close() + return wsdl + +class WSDL: + """A WSDL object models a WSDL service description. 
WSDL objects + may be created manually or loaded from an xml representation + using a WSDLReader instance.""" + + def __init__(self, targetNamespace=None, strict=1): + self.targetNamespace = targetNamespace or 'urn:this-document.wsdl' + self.documentation = '' + self.location = None + self.document = None + self.name = None + self.services = CollectionNS(self) + self.messages = CollectionNS(self) + self.portTypes = CollectionNS(self) + self.bindings = CollectionNS(self) + self.imports = Collection(self) + self.types = Types(self) + self.extensions = [] + self.strict = strict + + def __del__(self): + if self.document is not None: + self.document.unlink() + + version = '1.1' + + def addService(self, name, documentation='', targetNamespace=None): + if self.services.has_key(name): + raise WSDLError( + 'Duplicate service element: %s' % name + ) + item = Service(name, documentation) + if targetNamespace: + item.targetNamespace = targetNamespace + self.services[name] = item + return item + + def addMessage(self, name, documentation='', targetNamespace=None): + if self.messages.has_key(name): + raise WSDLError( + 'Duplicate message element: %s.' 
% name + ) + item = Message(name, documentation) + if targetNamespace: + item.targetNamespace = targetNamespace + self.messages[name] = item + return item + + def addPortType(self, name, documentation='', targetNamespace=None): + if self.portTypes.has_key(name): + raise WSDLError( + 'Duplicate portType element: name' + ) + item = PortType(name, documentation) + if targetNamespace: + item.targetNamespace = targetNamespace + self.portTypes[name] = item + return item + + def addBinding(self, name, type, documentation='', targetNamespace=None): + if self.bindings.has_key(name): + raise WSDLError( + 'Duplicate binding element: %s' % name + ) + item = Binding(name, type, documentation) + if targetNamespace: + item.targetNamespace = targetNamespace + self.bindings[name] = item + return item + + def addImport(self, namespace, location): + item = ImportElement(namespace, location) + self.imports[namespace] = item + return item + + def toDom(self): + """ Generate a DOM representation of the WSDL instance. + Not dealing with generating XML Schema, thus the targetNamespace + of all XML Schema elements or types used by WSDL message parts + needs to be specified via import information items. + """ + namespaceURI = DOM.GetWSDLUri(self.version) + self.document = DOM.createDocument(namespaceURI ,'wsdl:definitions') + + # Set up a couple prefixes for easy reading. 
+ child = DOM.getElement(self.document, None) + child.setAttributeNS(None, 'targetNamespace', self.targetNamespace) + child.setAttributeNS(XMLNS.BASE, 'xmlns:wsdl', namespaceURI) + child.setAttributeNS(XMLNS.BASE, 'xmlns:xsd', 'http://www.w3.org/1999/XMLSchema') + child.setAttributeNS(XMLNS.BASE, 'xmlns:soap', 'http://schemas.xmlsoap.org/wsdl/soap/') + child.setAttributeNS(XMLNS.BASE, 'xmlns:tns', self.targetNamespace) + + # wsdl:import + for item in self.imports: + item.toDom() + # wsdl:message + for item in self.messages: + item.toDom() + # wsdl:portType + for item in self.portTypes: + item.toDom() + # wsdl:binding + for item in self.bindings: + item.toDom() + # wsdl:service + for item in self.services: + item.toDom() + + def load(self, document): + # We save a reference to the DOM document to ensure that elements + # saved as "extensions" will continue to have a meaningful context + # for things like namespace references. The lifetime of the DOM + # document is bound to the lifetime of the WSDL instance. + self.document = document + + definitions = DOM.getElement(document, 'definitions', None, None) + if definitions is None: + raise WSDLError( + 'Missing <definitions> element.' + ) + self.version = DOM.WSDLUriToVersion(definitions.namespaceURI) + NS_WSDL = DOM.GetWSDLUri(self.version) + + self.targetNamespace = DOM.getAttr(definitions, 'targetNamespace', + None, None) + self.name = DOM.getAttr(definitions, 'name', None, None) + self.documentation = GetDocumentation(definitions) + + # Resolve (recursively) any import elements in the document. 
+ imported = {} + base_location = self.location + while len(DOM.getElements(definitions, 'import', NS_WSDL)): + for element in DOM.getElements(definitions, 'import', NS_WSDL): + location = DOM.getAttr(element, 'location') + location = urllib.basejoin(base_location, location) + self._import(self.document, element, base_location) + + #reader = SchemaReader(base_url=self.location) + for element in DOM.getElements(definitions, None, None): + targetNamespace = DOM.getAttr(element, 'targetNamespace') + localName = element.localName + + if not DOM.nsUriMatch(element.namespaceURI, NS_WSDL): + if localName == 'schema': + reader = SchemaReader(base_url=self.location) + schema = reader.loadFromNode(WSDLToolsAdapter(self), element) + schema.setBaseUrl(self.location) + self.types.addSchema(schema) + else: + self.extensions.append(element) + continue + + elif localName == 'message': + name = DOM.getAttr(element, 'name') + docs = GetDocumentation(element) + message = self.addMessage(name, docs, targetNamespace) + parts = DOM.getElements(element, 'part', NS_WSDL) + message.load(parts) + continue + + elif localName == 'portType': + name = DOM.getAttr(element, 'name') + docs = GetDocumentation(element) + ptype = self.addPortType(name, docs, targetNamespace) + #operations = DOM.getElements(element, 'operation', NS_WSDL) + #ptype.load(operations) + ptype.load(element) + continue + + elif localName == 'binding': + name = DOM.getAttr(element, 'name') + type = DOM.getAttr(element, 'type', default=None) + if type is None: + raise WSDLError( + 'Missing type attribute for binding %s.' 
% name + ) + type = ParseQName(type, element) + docs = GetDocumentation(element) + binding = self.addBinding(name, type, docs, targetNamespace) + operations = DOM.getElements(element, 'operation', NS_WSDL) + binding.load(operations) + binding.load_ex(GetExtensions(element)) + continue + + elif localName == 'service': + name = DOM.getAttr(element, 'name') + docs = GetDocumentation(element) + service = self.addService(name, docs, targetNamespace) + ports = DOM.getElements(element, 'port', NS_WSDL) + service.load(ports) + service.load_ex(GetExtensions(element)) + continue + + elif localName == 'types': + self.types.documentation = GetDocumentation(element) + base_location = DOM.getAttr(element, 'base-location') + if base_location: + element.removeAttribute('base-location') + base_location = base_location or self.location + reader = SchemaReader(base_url=base_location) + for item in DOM.getElements(element, None, None): + if item.localName == 'schema': + schema = reader.loadFromNode(WSDLToolsAdapter(self), item) + # XXX <types> could have been imported + #schema.setBaseUrl(self.location) + schema.setBaseUrl(base_location) + self.types.addSchema(schema) + else: + self.types.addExtension(item) + # XXX remove the attribute + # element.removeAttribute('base-location') + continue + + def _import(self, document, element, base_location=None): + '''Algo take <import> element's children, clone them, + and add them to the main document. Support for relative + locations is a bit complicated. The orig document context + is lost, so we need to store base location in DOM elements + representing <types>, by creating a special temporary + "base-location" attribute, and <import>, by resolving + the relative "location" and storing it as "location". + + document -- document we are loading + element -- DOM Element representing <import> + base_location -- location of document from which this + <import> was gleaned. 
+ ''' + namespace = DOM.getAttr(element, 'namespace', default=None) + location = DOM.getAttr(element, 'location', default=None) + if namespace is None or location is None: + raise WSDLError( + 'Invalid import element (missing namespace or location).' + ) + if base_location: + location = urllib.basejoin(base_location, location) + element.setAttributeNS(None, 'location', location) + + obimport = self.addImport(namespace, location) + obimport._loaded = 1 + + importdoc = DOM.loadFromURL(location) + try: + if location.find('#') > -1: + idref = location.split('#')[-1] + imported = DOM.getElementById(importdoc, idref) + else: + imported = importdoc.documentElement + if imported is None: + raise WSDLError( + 'Import target element not found for: %s' % location + ) + + imported_tns = DOM.findTargetNS(imported) + if imported_tns != namespace: + return + + if imported.localName == 'definitions': + imported_nodes = imported.childNodes + else: + imported_nodes = [imported] + parent = element.parentNode + + parent.removeChild(element) + + for node in imported_nodes: + if node.nodeType != node.ELEMENT_NODE: + continue + child = DOM.importNode(document, node, 1) + parent.appendChild(child) + child.setAttribute('targetNamespace', namespace) + attrsNS = imported._attrsNS + for attrkey in attrsNS.keys(): + if attrkey[0] == DOM.NS_XMLNS: + attr = attrsNS[attrkey].cloneNode(1) + child.setAttributeNode(attr) + + #XXX Quick Hack, should be in WSDL Namespace. 
+ if child.localName == 'import': + rlocation = child.getAttributeNS(None, 'location') + alocation = urllib.basejoin(location, rlocation) + child.setAttribute('location', alocation) + elif child.localName == 'types': + child.setAttribute('base-location', location) + + finally: + importdoc.unlink() + return location + +class Element: + """A class that provides common functions for WSDL element classes.""" + def __init__(self, name=None, documentation=''): + self.name = name + self.documentation = documentation + self.extensions = [] + + def addExtension(self, item): + item.parent = weakref.ref(self) + self.extensions.append(item) + + +class ImportElement(Element): + def __init__(self, namespace, location): + self.namespace = namespace + self.location = location + + def getWSDL(self): + """Return the WSDL object that contains this Message Part.""" + return self.parent().parent() + + def toDom(self): + wsdl = self.getWSDL() + ep = ElementProxy(None, DOM.getElement(wsdl.document, None)) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'import') + epc.setAttributeNS(None, 'namespace', self.namespace) + epc.setAttributeNS(None, 'location', self.location) + + _loaded = None + + +class Types(Collection): + default = lambda self,k: k.targetNamespace + def __init__(self, parent): + Collection.__init__(self, parent) + self.documentation = '' + self.extensions = [] + + def addSchema(self, schema): + name = schema.targetNamespace + self[name] = schema + return schema + + def addExtension(self, item): + self.extensions.append(item) + + +class Message(Element): + def __init__(self, name, documentation=''): + Element.__init__(self, name, documentation) + self.parts = Collection(self) + + def addPart(self, name, type=None, element=None): + if self.parts.has_key(name): + raise WSDLError( + 'Duplicate message part element: %s' % name + ) + if type is None and element is None: + raise WSDLError( + 'Missing type or element attribute for part: %s' % name + ) + item = 
MessagePart(name) + item.element = element + item.type = type + self.parts[name] = item + return item + + def load(self, elements): + for element in elements: + name = DOM.getAttr(element, 'name') + part = MessagePart(name) + self.parts[name] = part + elemref = DOM.getAttr(element, 'element', default=None) + typeref = DOM.getAttr(element, 'type', default=None) + if typeref is None and elemref is None: + raise WSDLError( + 'No type or element attribute for part: %s' % name + ) + if typeref is not None: + part.type = ParseTypeRef(typeref, element) + if elemref is not None: + part.element = ParseTypeRef(elemref, element) + + def getElementDeclaration(self): + """Return the XMLSchema.ElementDeclaration instance or None""" + element = None + if self.element: + nsuri,name = self.element + wsdl = self.getWSDL() + if wsdl.types.has_key(nsuri) and wsdl.types[nsuri].elements.has_key(name): + element = wsdl.types[nsuri].elements[name] + return element + + def getTypeDefinition(self): + """Return the XMLSchema.TypeDefinition instance or None""" + type = None + if self.type: + nsuri,name = self.type + wsdl = self.getWSDL() + if wsdl.types.has_key(nsuri) and wsdl.types[nsuri].types.has_key(name): + type = wsdl.types[nsuri].types[name] + return type + + def getWSDL(self): + """Return the WSDL object that contains this Message Part.""" + return self.parent().parent() + + def toDom(self): + wsdl = self.getWSDL() + ep = ElementProxy(None, DOM.getElement(wsdl.document, None)) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'message') + epc.setAttributeNS(None, 'name', self.name) + + for part in self.parts: + part.toDom(epc._getNode()) + + +class MessagePart(Element): + def __init__(self, name): + Element.__init__(self, name, '') + self.element = None + self.type = None + + def getWSDL(self): + """Return the WSDL object that contains this Message Part.""" + return self.parent().parent().parent().parent() + + def getTypeDefinition(self): + wsdl = self.getWSDL() + 
nsuri,name = self.type + schema = wsdl.types.get(nsuri, {}) + return schema.get(name) + + def getElementDeclaration(self): + wsdl = self.getWSDL() + nsuri,name = self.element + schema = wsdl.types.get(nsuri, {}) + return schema.get(name) + + def toDom(self, node): + """node -- node representing message""" + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'part') + epc.setAttributeNS(None, 'name', self.name) + + if self.element is not None: + ns,name = self.element + prefix = epc.getPrefix(ns) + epc.setAttributeNS(None, 'element', '%s:%s'%(prefix,name)) + elif self.type is not None: + ns,name = self.type + prefix = epc.getPrefix(ns) + epc.setAttributeNS(None, 'type', '%s:%s'%(prefix,name)) + + +class PortType(Element): + '''PortType has a anyAttribute, thus must provide for an extensible + mechanism for supporting such attributes. ResourceProperties is + specified in WS-ResourceProperties. wsa:Action is specified in + WS-Address. + + Instance Data: + name -- name attribute + resourceProperties -- optional. wsr:ResourceProperties attribute, + value is a QName this is Parsed into a (namespaceURI, name) + that represents a Global Element Declaration. 
+ operations + ''' + + def __init__(self, name, documentation=''): + Element.__init__(self, name, documentation) + self.operations = Collection(self) + self.resourceProperties = None + + def getWSDL(self): + return self.parent().parent() + + def getTargetNamespace(self): + return self.targetNamespace or self.getWSDL().targetNamespace + + def getResourceProperties(self): + return self.resourceProperties + + def addOperation(self, name, documentation='', parameterOrder=None): + item = Operation(name, documentation, parameterOrder) + self.operations[name] = item + return item + + def load(self, element): + self.name = DOM.getAttr(element, 'name') + self.documentation = GetDocumentation(element) + self.targetNamespace = DOM.getAttr(element, 'targetNamespace') + if DOM.hasAttr(element, 'ResourceProperties', OASIS.PROPERTIES): + rpref = DOM.getAttr(element, 'ResourceProperties', OASIS.PROPERTIES) + self.resourceProperties = ParseQName(rpref, element) + + lookfor = (WSA200408, WSA200403, WSA200303,) + NS_WSDL = DOM.GetWSDLUri(self.getWSDL().version) + elements = DOM.getElements(element, 'operation', NS_WSDL) + for element in elements: + name = DOM.getAttr(element, 'name') + docs = GetDocumentation(element) + param_order = DOM.getAttr(element, 'parameterOrder', default=None) + if param_order is not None: + param_order = param_order.split(' ') + operation = self.addOperation(name, docs, param_order) + + item = DOM.getElement(element, 'input', None, None) + if item is not None: + name = DOM.getAttr(item, 'name') + docs = GetDocumentation(item) + msgref = DOM.getAttr(item, 'message') + message = ParseQName(msgref, item) + for WSA in lookfor: + action = DOM.getAttr(item, 'Action', WSA.ADDRESS, None) + if action: break + operation.setInput(message, name, docs, action) + + item = DOM.getElement(element, 'output', None, None) + if item is not None: + name = DOM.getAttr(item, 'name') + docs = GetDocumentation(item) + msgref = DOM.getAttr(item, 'message') + message = 
ParseQName(msgref, item) + for WSA in lookfor: + action = DOM.getAttr(item, 'Action', WSA.ADDRESS, None) + if action: break + operation.setOutput(message, name, docs, action) + + for item in DOM.getElements(element, 'fault', None): + name = DOM.getAttr(item, 'name') + docs = GetDocumentation(item) + msgref = DOM.getAttr(item, 'message') + message = ParseQName(msgref, item) + for WSA in lookfor: + action = DOM.getAttr(item, 'Action', WSA.ADDRESS, None) + if action: break + operation.addFault(message, name, docs, action) + + def toDom(self): + wsdl = self.getWSDL() + + ep = ElementProxy(None, DOM.getElement(wsdl.document, None)) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'portType') + epc.setAttributeNS(None, 'name', self.name) + if self.resourceProperties: + ns,name = self.resourceProperties + prefix = epc.getPrefix(ns) + epc.setAttributeNS(OASIS.PROPERTIES, 'ResourceProperties', '%s:%s'%(prefix,name)) + + for op in self.operations: + op.toDom(epc._getNode()) + + + +class Operation(Element): + def __init__(self, name, documentation='', parameterOrder=None): + Element.__init__(self, name, documentation) + self.parameterOrder = parameterOrder + self.faults = Collection(self) + self.input = None + self.output = None + + def getWSDL(self): + """Return the WSDL object that contains this Operation.""" + return self.parent().parent().parent().parent() + + def getPortType(self): + return self.parent().parent() + + def getInputAction(self): + """wsa:Action attribute""" + return GetWSAActionInput(self) + + def getInputMessage(self): + if self.input is None: + return None + wsdl = self.getPortType().getWSDL() + return wsdl.messages[self.input.message] + + def getOutputAction(self): + """wsa:Action attribute""" + return GetWSAActionOutput(self) + + def getOutputMessage(self): + if self.output is None: + return None + wsdl = self.getPortType().getWSDL() + return wsdl.messages[self.output.message] + + def getFaultAction(self, name): + """wsa:Action attribute""" 
+ return GetWSAActionFault(self, name) + + def getFaultMessage(self, name): + wsdl = self.getPortType().getWSDL() + return wsdl.messages[self.faults[name].message] + + def addFault(self, message, name, documentation='', action=None): + if self.faults.has_key(name): + raise WSDLError( + 'Duplicate fault element: %s' % name + ) + item = MessageRole('fault', message, name, documentation, action) + self.faults[name] = item + return item + + def setInput(self, message, name='', documentation='', action=None): + self.input = MessageRole('input', message, name, documentation, action) + self.input.parent = weakref.ref(self) + return self.input + + def setOutput(self, message, name='', documentation='', action=None): + self.output = MessageRole('output', message, name, documentation, action) + self.output.parent = weakref.ref(self) + return self.output + + def toDom(self, node): + wsdl = self.getWSDL() + + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'operation') + epc.setAttributeNS(None, 'name', self.name) + node = epc._getNode() + if self.input: + self.input.toDom(node) + if self.output: + self.output.toDom(node) + for fault in self.faults: + fault.toDom(node) + + +class MessageRole(Element): + def __init__(self, type, message, name='', documentation='', action=None): + Element.__init__(self, name, documentation) + self.message = message + self.type = type + self.action = action + + def getWSDL(self): + """Return the WSDL object that contains this MessageRole.""" + if self.parent().getWSDL() == 'fault': + return self.parent().parent().getWSDL() + return self.parent().getWSDL() + + def getMessage(self): + """Return the WSDL object that represents the attribute message + (namespaceURI, name) tuple + """ + wsdl = self.getWSDL() + return wsdl.messages[self.message] + + def toDom(self, node): + wsdl = self.getWSDL() + + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), self.type) + 
epc.setAttributeNS(None, 'message', self.message) + + if self.action: + epc.setAttributeNS(WSA200408.ADDRESS, 'Action', self.action) + + +class Binding(Element): + def __init__(self, name, type, documentation=''): + Element.__init__(self, name, documentation) + self.operations = Collection(self) + self.type = type + + def getWSDL(self): + """Return the WSDL object that contains this binding.""" + return self.parent().parent() + + def getPortType(self): + """Return the PortType object associated with this binding.""" + return self.getWSDL().portTypes[self.type] + + def findBinding(self, kind): + for item in self.extensions: + if isinstance(item, kind): + return item + return None + + def findBindings(self, kind): + return [ item for item in self.extensions if isinstance(item, kind) ] + + def addOperationBinding(self, name, documentation=''): + item = OperationBinding(name, documentation) + self.operations[name] = item + return item + + def load(self, elements): + for element in elements: + name = DOM.getAttr(element, 'name') + docs = GetDocumentation(element) + opbinding = self.addOperationBinding(name, docs) + opbinding.load_ex(GetExtensions(element)) + + item = DOM.getElement(element, 'input', None, None) + if item is not None: + mbinding = MessageRoleBinding('input') + mbinding.documentation = GetDocumentation(item) + opbinding.input = mbinding + mbinding.load_ex(GetExtensions(item)) + + item = DOM.getElement(element, 'output', None, None) + if item is not None: + mbinding = MessageRoleBinding('output') + mbinding.documentation = GetDocumentation(item) + opbinding.output = mbinding + mbinding.load_ex(GetExtensions(item)) + + for item in DOM.getElements(element, 'fault', None): + name = DOM.getAttr(item, 'name') + mbinding = MessageRoleBinding('fault', name) + mbinding.documentation = GetDocumentation(item) + opbinding.faults[name] = mbinding + mbinding.load_ex(GetExtensions(item)) + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, 
e.localName + if ns in DOM.NS_SOAP_BINDING_ALL and name == 'binding': + transport = DOM.getAttr(e, 'transport', default=None) + style = DOM.getAttr(e, 'style', default='document') + ob = SoapBinding(transport, style) + self.addExtension(ob) + continue + elif ns in DOM.NS_HTTP_BINDING_ALL and name == 'binding': + verb = DOM.getAttr(e, 'verb') + ob = HttpBinding(verb) + self.addExtension(ob) + continue + else: + self.addExtension(e) + + def toDom(self): + wsdl = self.getWSDL() + ep = ElementProxy(None, DOM.getElement(wsdl.document, None)) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'binding') + epc.setAttributeNS(None, 'name', self.name) + + ns,name = self.type + prefix = epc.getPrefix(ns) + epc.setAttributeNS(None, 'type', '%s:%s' %(prefix,name)) + + node = epc._getNode() + for ext in self.extensions: + ext.toDom(node) + for op_binding in self.operations: + op_binding.toDom(node) + + +class OperationBinding(Element): + def __init__(self, name, documentation=''): + Element.__init__(self, name, documentation) + self.input = None + self.output = None + self.faults = Collection(self) + + def getWSDL(self): + """Return the WSDL object that contains this binding.""" + return self.parent().parent().parent().parent() + + + def getBinding(self): + """Return the parent Binding object of the operation binding.""" + return self.parent().parent() + + def getOperation(self): + """Return the abstract Operation associated with this binding.""" + return self.getBinding().getPortType().operations[self.name] + + def findBinding(self, kind): + for item in self.extensions: + if isinstance(item, kind): + return item + return None + + def findBindings(self, kind): + return [ item for item in self.extensions if isinstance(item, kind) ] + + def addInputBinding(self, binding): + if self.input is None: + self.input = MessageRoleBinding('input') + self.input.parent = weakref.ref(self) + self.input.addExtension(binding) + return binding + + def addOutputBinding(self, binding): 
+ if self.output is None: + self.output = MessageRoleBinding('output') + self.output.parent = weakref.ref(self) + self.output.addExtension(binding) + return binding + + def addFaultBinding(self, name, binding): + fault = self.get(name, None) + if fault is None: + fault = MessageRoleBinding('fault', name) + fault.addExtension(binding) + return binding + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, e.localName + if ns in DOM.NS_SOAP_BINDING_ALL and name == 'operation': + soapaction = DOM.getAttr(e, 'soapAction', default=None) + style = DOM.getAttr(e, 'style', default=None) + ob = SoapOperationBinding(soapaction, style) + self.addExtension(ob) + continue + elif ns in DOM.NS_HTTP_BINDING_ALL and name == 'operation': + location = DOM.getAttr(e, 'location') + ob = HttpOperationBinding(location) + self.addExtension(ob) + continue + else: + self.addExtension(e) + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), 'operation') + epc.setAttributeNS(None, 'name', self.name) + + node = epc._getNode() + for ext in self.extensions: + ext.toDom(node) + if self.input: + self.input.toDom(node) + if self.output: + self.output.toDom(node) + for fault in self.faults: + fault.toDom(node) + + +class MessageRoleBinding(Element): + def __init__(self, type, name='', documentation=''): + Element.__init__(self, name, documentation) + self.type = type + + def getWSDL(self): + """Return the WSDL object that contains this MessageRole.""" + if self.type == 'fault': + return self.parent().parent().getWSDL() + return self.parent().getWSDL() + + def findBinding(self, kind): + for item in self.extensions: + if isinstance(item, kind): + return item + return None + + def findBindings(self, kind): + return [ item for item in self.extensions if isinstance(item, kind) ] + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, e.localName + if ns in 
DOM.NS_SOAP_BINDING_ALL and name == 'body': + encstyle = DOM.getAttr(e, 'encodingStyle', default=None) + namespace = DOM.getAttr(e, 'namespace', default=None) + parts = DOM.getAttr(e, 'parts', default=None) + use = DOM.getAttr(e, 'use', default=None) + if use is None: + raise WSDLError( + 'Invalid soap:body binding element.' + ) + ob = SoapBodyBinding(use, namespace, encstyle, parts) + self.addExtension(ob) + continue + + elif ns in DOM.NS_SOAP_BINDING_ALL and name == 'fault': + encstyle = DOM.getAttr(e, 'encodingStyle', default=None) + namespace = DOM.getAttr(e, 'namespace', default=None) + name = DOM.getAttr(e, 'name', default=None) + use = DOM.getAttr(e, 'use', default=None) + if use is None or name is None: + raise WSDLError( + 'Invalid soap:fault binding element.' + ) + ob = SoapFaultBinding(name, use, namespace, encstyle) + self.addExtension(ob) + continue + + elif ns in DOM.NS_SOAP_BINDING_ALL and name in ( + 'header', 'headerfault' + ): + encstyle = DOM.getAttr(e, 'encodingStyle', default=None) + namespace = DOM.getAttr(e, 'namespace', default=None) + message = DOM.getAttr(e, 'message') + part = DOM.getAttr(e, 'part') + use = DOM.getAttr(e, 'use') + if name == 'header': + _class = SoapHeaderBinding + else: + _class = SoapHeaderFaultBinding + message = ParseQName(message, e) + ob = _class(message, part, use, namespace, encstyle) + self.addExtension(ob) + continue + + elif ns in DOM.NS_HTTP_BINDING_ALL and name == 'urlReplacement': + ob = HttpUrlReplacementBinding() + self.addExtension(ob) + continue + + elif ns in DOM.NS_HTTP_BINDING_ALL and name == 'urlEncoded': + ob = HttpUrlEncodedBinding() + self.addExtension(ob) + continue + + elif ns in DOM.NS_MIME_BINDING_ALL and name == 'multipartRelated': + ob = MimeMultipartRelatedBinding() + self.addExtension(ob) + ob.load_ex(GetExtensions(e)) + continue + + elif ns in DOM.NS_MIME_BINDING_ALL and name == 'content': + part = DOM.getAttr(e, 'part', default=None) + type = DOM.getAttr(e, 'type', default=None) + ob = 
MimeContentBinding(part, type) + self.addExtension(ob) + continue + + elif ns in DOM.NS_MIME_BINDING_ALL and name == 'mimeXml': + part = DOM.getAttr(e, 'part', default=None) + ob = MimeXmlBinding(part) + self.addExtension(ob) + continue + + else: + self.addExtension(e) + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), self.type) + + node = epc._getNode() + for item in self.extensions: + if item: item.toDom(node) + + +class Service(Element): + def __init__(self, name, documentation=''): + Element.__init__(self, name, documentation) + self.ports = Collection(self) + + def getWSDL(self): + return self.parent().parent() + + def addPort(self, name, binding, documentation=''): + item = Port(name, binding, documentation) + self.ports[name] = item + return item + + def load(self, elements): + for element in elements: + name = DOM.getAttr(element, 'name', default=None) + docs = GetDocumentation(element) + binding = DOM.getAttr(element, 'binding', default=None) + if name is None or binding is None: + raise WSDLError( + 'Invalid port element.' 
+ ) + binding = ParseQName(binding, element) + port = self.addPort(name, binding, docs) + port.load_ex(GetExtensions(element)) + + def load_ex(self, elements): + for e in elements: + self.addExtension(e) + + def toDom(self): + wsdl = self.getWSDL() + ep = ElementProxy(None, DOM.getElement(wsdl.document, None)) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), "service") + epc.setAttributeNS(None, "name", self.name) + + node = epc._getNode() + for port in self.ports: + port.toDom(node) + + +class Port(Element): + def __init__(self, name, binding, documentation=''): + Element.__init__(self, name, documentation) + self.binding = binding + + def getWSDL(self): + return self.parent().parent().getWSDL() + + def getService(self): + """Return the Service object associated with this port.""" + return self.parent().parent() + + def getBinding(self): + """Return the Binding object that is referenced by this port.""" + wsdl = self.getService().getWSDL() + return wsdl.bindings[self.binding] + + def getPortType(self): + """Return the PortType object that is referenced by this port.""" + wsdl = self.getService().getWSDL() + binding = wsdl.bindings[self.binding] + return wsdl.portTypes[binding.type] + + def getAddressBinding(self): + """A convenience method to obtain the extension element used + as the address binding for the port.""" + for item in self.extensions: + if isinstance(item, SoapAddressBinding) or \ + isinstance(item, HttpAddressBinding): + return item + raise WSDLError( + 'No address binding found in port.' 
+ ) + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, e.localName + if ns in DOM.NS_SOAP_BINDING_ALL and name == 'address': + location = DOM.getAttr(e, 'location', default=None) + ob = SoapAddressBinding(location) + self.addExtension(ob) + continue + elif ns in DOM.NS_HTTP_BINDING_ALL and name == 'address': + location = DOM.getAttr(e, 'location', default=None) + ob = HttpAddressBinding(location) + self.addExtension(ob) + continue + else: + self.addExtension(e) + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLUri(wsdl.version), "port") + epc.setAttributeNS(None, "name", self.name) + + ns,name = self.binding + prefix = epc.getPrefix(ns) + epc.setAttributeNS(None, "binding", "%s:%s" %(prefix,name)) + + node = epc._getNode() + for ext in self.extensions: + ext.toDom(node) + + +class SoapBinding: + def __init__(self, transport, style='rpc'): + self.transport = transport + self.style = style + + def getWSDL(self): + return self.parent().getWSDL() + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLSoapBindingUri(wsdl.version), 'binding') + if self.transport: + epc.setAttributeNS(None, "transport", self.transport) + if self.style: + epc.setAttributeNS(None, "style", self.style) + +class SoapAddressBinding: + def __init__(self, location): + self.location = location + + def getWSDL(self): + return self.parent().getWSDL() + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLSoapBindingUri(wsdl.version), 'address') + epc.setAttributeNS(None, "location", self.location) + + +class SoapOperationBinding: + def __init__(self, soapAction=None, style=None): + self.soapAction = soapAction + self.style = style + + def getWSDL(self): + return self.parent().getWSDL() + + def toDom(self, node): + wsdl = self.getWSDL() + ep = 
ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLSoapBindingUri(wsdl.version), 'operation') + if self.soapAction: + epc.setAttributeNS(None, 'soapAction', self.soapAction) + if self.style: + epc.setAttributeNS(None, 'style', self.style) + + +class SoapBodyBinding: + def __init__(self, use, namespace=None, encodingStyle=None, parts=None): + if not use in ('literal', 'encoded'): + raise WSDLError( + 'Invalid use attribute value: %s' % use + ) + self.encodingStyle = encodingStyle + self.namespace = namespace + if type(parts) in (type(''), type(u'')): + parts = parts.split() + self.parts = parts + self.use = use + + def getWSDL(self): + return self.parent().getWSDL() + + def toDom(self, node): + wsdl = self.getWSDL() + ep = ElementProxy(None, node) + epc = ep.createAppendElement(DOM.GetWSDLSoapBindingUri(wsdl.version), 'body') + epc.setAttributeNS(None, "use", self.use) + epc.setAttributeNS(None, "namespace", self.namespace) + + +class SoapFaultBinding: + def __init__(self, name, use, namespace=None, encodingStyle=None): + if not use in ('literal', 'encoded'): + raise WSDLError( + 'Invalid use attribute value: %s' % use + ) + self.encodingStyle = encodingStyle + self.namespace = namespace + self.name = name + self.use = use + + +class SoapHeaderBinding: + def __init__(self, message, part, use, namespace=None, encodingStyle=None): + if not use in ('literal', 'encoded'): + raise WSDLError( + 'Invalid use attribute value: %s' % use + ) + self.encodingStyle = encodingStyle + self.namespace = namespace + self.message = message + self.part = part + self.use = use + + tagname = 'header' + +class SoapHeaderFaultBinding(SoapHeaderBinding): + tagname = 'headerfault' + + +class HttpBinding: + def __init__(self, verb): + self.verb = verb + +class HttpAddressBinding: + def __init__(self, location): + self.location = location + + +class HttpOperationBinding: + def __init__(self, location): + self.location = location + +class HttpUrlReplacementBinding: + pass + + 
+class HttpUrlEncodedBinding: + pass + + +class MimeContentBinding: + def __init__(self, part=None, type=None): + self.part = part + self.type = type + + +class MimeXmlBinding: + def __init__(self, part=None): + self.part = part + + +class MimeMultipartRelatedBinding: + def __init__(self): + self.parts = [] + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, e.localName + if ns in DOM.NS_MIME_BINDING_ALL and name == 'part': + self.parts.append(MimePartBinding()) + continue + + +class MimePartBinding: + def __init__(self): + self.items = [] + + def load_ex(self, elements): + for e in elements: + ns, name = e.namespaceURI, e.localName + if ns in DOM.NS_MIME_BINDING_ALL and name == 'content': + part = DOM.getAttr(e, 'part', default=None) + type = DOM.getAttr(e, 'type', default=None) + ob = MimeContentBinding(part, type) + self.items.append(ob) + continue + + elif ns in DOM.NS_MIME_BINDING_ALL and name == 'mimeXml': + part = DOM.getAttr(e, 'part', default=None) + ob = MimeXmlBinding(part) + self.items.append(ob) + continue + + elif ns in DOM.NS_SOAP_BINDING_ALL and name == 'body': + encstyle = DOM.getAttr(e, 'encodingStyle', default=None) + namespace = DOM.getAttr(e, 'namespace', default=None) + parts = DOM.getAttr(e, 'parts', default=None) + use = DOM.getAttr(e, 'use', default=None) + if use is None: + raise WSDLError( + 'Invalid soap:body binding element.' 
+ ) + ob = SoapBodyBinding(use, namespace, encstyle, parts) + self.items.append(ob) + continue + + +class WSDLError(Exception): + pass + + + +def DeclareNSPrefix(writer, prefix, nsuri): + if writer.hasNSPrefix(nsuri): + return + writer.declareNSPrefix(prefix, nsuri) + +def ParseTypeRef(value, element): + parts = value.split(':', 1) + if len(parts) == 1: + return (DOM.findTargetNS(element), value) + nsuri = DOM.findNamespaceURI(parts[0], element) + return (nsuri, parts[1]) + +def ParseQName(value, element): + nameref = value.split(':', 1) + if len(nameref) == 2: + nsuri = DOM.findNamespaceURI(nameref[0], element) + name = nameref[-1] + else: + nsuri = DOM.findTargetNS(element) + name = nameref[-1] + return nsuri, name + +def GetDocumentation(element): + docnode = DOM.getElement(element, 'documentation', None, None) + if docnode is not None: + return DOM.getElementText(docnode) + return '' + +def GetExtensions(element): + return [ item for item in DOM.getElements(element, None, None) + if item.namespaceURI != DOM.NS_WSDL ] + +def GetWSAActionFault(operation, name): + """Find wsa:Action attribute, and return value or WSA.FAULT + for the default. 
+ """ + attr = operation.faults[name].action + if attr is not None: + return attr + return WSA.FAULT + +def GetWSAActionInput(operation): + """Find wsa:Action attribute, and return value or the default.""" + attr = operation.input.action + if attr is not None: + return attr + portType = operation.getPortType() + targetNamespace = portType.getTargetNamespace() + ptName = portType.name + msgName = operation.input.name + if not msgName: + msgName = operation.name + 'Request' + if targetNamespace.endswith('/'): + return '%s%s/%s' %(targetNamespace, ptName, msgName) + return '%s/%s/%s' %(targetNamespace, ptName, msgName) + +def GetWSAActionOutput(operation): + """Find wsa:Action attribute, and return value or the default.""" + attr = operation.output.action + if attr is not None: + return attr + targetNamespace = operation.getPortType().getTargetNamespace() + ptName = operation.getPortType().name + msgName = operation.output.name + if not msgName: + msgName = operation.name + 'Response' + if targetNamespace.endswith('/'): + return '%s%s/%s' %(targetNamespace, ptName, msgName) + return '%s/%s/%s' %(targetNamespace, ptName, msgName) + +def FindExtensions(object, kind, t_type=type(())): + if isinstance(kind, t_type): + result = [] + namespaceURI, name = kind + return [ item for item in object.extensions + if hasattr(item, 'nodeType') \ + and DOM.nsUriMatch(namespaceURI, item.namespaceURI) \ + and item.name == name ] + return [ item for item in object.extensions if isinstance(item, kind) ] + +def FindExtension(object, kind, t_type=type(())): + if isinstance(kind, t_type): + namespaceURI, name = kind + for item in object.extensions: + if hasattr(item, 'nodeType') \ + and DOM.nsUriMatch(namespaceURI, item.namespaceURI) \ + and item.name == name: + return item + else: + for item in object.extensions: + if isinstance(item, kind): + return item + return None + + +class SOAPCallInfo: + """SOAPCallInfo captures the important binding information about a + SOAP operation, in a 
structure that is easier to work with than + raw WSDL structures.""" + + def __init__(self, methodName): + self.methodName = methodName + self.inheaders = [] + self.outheaders = [] + self.inparams = [] + self.outparams = [] + self.retval = None + + encodingStyle = DOM.NS_SOAP_ENC + documentation = '' + soapAction = None + transport = None + namespace = None + location = None + use = 'encoded' + style = 'rpc' + + def addInParameter(self, name, type, namespace=None, element_type=0): + """Add an input parameter description to the call info.""" + parameter = ParameterInfo(name, type, namespace, element_type) + self.inparams.append(parameter) + return parameter + + def addOutParameter(self, name, type, namespace=None, element_type=0): + """Add an output parameter description to the call info.""" + parameter = ParameterInfo(name, type, namespace, element_type) + self.outparams.append(parameter) + return parameter + + def setReturnParameter(self, name, type, namespace=None, element_type=0): + """Set the return parameter description for the call info.""" + parameter = ParameterInfo(name, type, namespace, element_type) + self.retval = parameter + return parameter + + def addInHeaderInfo(self, name, type, namespace, element_type=0, + mustUnderstand=0): + """Add an input SOAP header description to the call info.""" + headerinfo = HeaderInfo(name, type, namespace, element_type) + if mustUnderstand: + headerinfo.mustUnderstand = 1 + self.inheaders.append(headerinfo) + return headerinfo + + def addOutHeaderInfo(self, name, type, namespace, element_type=0, + mustUnderstand=0): + """Add an output SOAP header description to the call info.""" + headerinfo = HeaderInfo(name, type, namespace, element_type) + if mustUnderstand: + headerinfo.mustUnderstand = 1 + self.outheaders.append(headerinfo) + return headerinfo + + def getInParameters(self): + """Return a sequence of the in parameters of the method.""" + return self.inparams + + def getOutParameters(self): + """Return a sequence of 
the out parameters of the method.""" + return self.outparams + + def getReturnParameter(self): + """Return param info about the return value of the method.""" + return self.retval + + def getInHeaders(self): + """Return a sequence of the in headers of the method.""" + return self.inheaders + + def getOutHeaders(self): + """Return a sequence of the out headers of the method.""" + return self.outheaders + + +class ParameterInfo: + """A ParameterInfo object captures parameter binding information.""" + def __init__(self, name, type, namespace=None, element_type=0): + if element_type: + self.element_type = 1 + if namespace is not None: + self.namespace = namespace + self.name = name + self.type = type + + element_type = 0 + namespace = None + default = None + + +class HeaderInfo(ParameterInfo): + """A HeaderInfo object captures SOAP header binding information.""" + def __init__(self, name, type, namespace, element_type=None): + ParameterInfo.__init__(self, name, type, namespace, element_type) + + mustUnderstand = 0 + actor = None + + +def callInfoFromWSDL(port, name): + """Return a SOAPCallInfo given a WSDL port and operation name.""" + wsdl = port.getService().getWSDL() + binding = port.getBinding() + portType = binding.getPortType() + operation = portType.operations[name] + opbinding = binding.operations[name] + messages = wsdl.messages + callinfo = SOAPCallInfo(name) + + addrbinding = port.getAddressBinding() + if not isinstance(addrbinding, SoapAddressBinding): + raise ValueError, 'Unsupported binding type.' + callinfo.location = addrbinding.location + + soapbinding = binding.findBinding(SoapBinding) + if soapbinding is None: + raise ValueError, 'Missing soap:binding element.' 
+ callinfo.transport = soapbinding.transport + callinfo.style = soapbinding.style or 'document' + + soap_op_binding = opbinding.findBinding(SoapOperationBinding) + if soap_op_binding is not None: + callinfo.soapAction = soap_op_binding.soapAction + callinfo.style = soap_op_binding.style or callinfo.style + + parameterOrder = operation.parameterOrder + + if operation.input is not None: + message = messages[operation.input.message] + msgrole = opbinding.input + + mime = msgrole.findBinding(MimeMultipartRelatedBinding) + if mime is not None: + raise ValueError, 'Mime bindings are not supported.' + else: + for item in msgrole.findBindings(SoapHeaderBinding): + part = messages[item.message].parts[item.part] + header = callinfo.addInHeaderInfo( + part.name, + part.element or part.type, + item.namespace, + element_type = part.element and 1 or 0 + ) + header.encodingStyle = item.encodingStyle + + body = msgrole.findBinding(SoapBodyBinding) + if body is None: + raise ValueError, 'Missing soap:body binding.' + callinfo.encodingStyle = body.encodingStyle + callinfo.namespace = body.namespace + callinfo.use = body.use + + if body.parts is not None: + parts = [] + for name in body.parts: + parts.append(message.parts[name]) + else: + parts = message.parts.values() + + for part in parts: + callinfo.addInParameter( + part.name, + part.element or part.type, + element_type = part.element and 1 or 0 + ) + + if operation.output is not None: + try: + message = messages[operation.output.message] + except KeyError: + if self.strict: + raise RuntimeError( + "Recieved message not defined in the WSDL schema: %s" % + operation.output.message) + else: + message = wsdl.addMessage(operation.output.message) + print "Warning:", \ + "Recieved message not defined in the WSDL schema.", \ + "Adding it." 
+ print "Message:", operation.output.message + + msgrole = opbinding.output + + mime = msgrole.findBinding(MimeMultipartRelatedBinding) + if mime is not None: + raise ValueError, 'Mime bindings are not supported.' + else: + for item in msgrole.findBindings(SoapHeaderBinding): + part = messages[item.message].parts[item.part] + header = callinfo.addOutHeaderInfo( + part.name, + part.element or part.type, + item.namespace, + element_type = part.element and 1 or 0 + ) + header.encodingStyle = item.encodingStyle + + body = msgrole.findBinding(SoapBodyBinding) + if body is None: + raise ValueError, 'Missing soap:body binding.' + callinfo.encodingStyle = body.encodingStyle + callinfo.namespace = body.namespace + callinfo.use = body.use + + if body.parts is not None: + parts = [] + for name in body.parts: + parts.append(message.parts[name]) + else: + parts = message.parts.values() + + if parts: + for part in parts: + callinfo.addOutParameter( + part.name, + part.element or part.type, + element_type = part.element and 1 or 0 + ) + + return callinfo diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLSchema.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLSchema.py new file mode 100755 index 0000000000000000000000000000000000000000..d785a3eabaa518e68c04bc3dead280382922d17f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLSchema.py @@ -0,0 +1,2879 @@ +# Copyright (c) 2003, The Regents of the University of California, +# through Lawrence Berkeley National Laboratory (subject to receipt of +# any required approvals from the U.S. Dept. of Energy). All rights +# reserved. +# +# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved. +# +# This software is subject to the provisions of the Zope Public License, +# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. 
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +# FOR A PARTICULAR PURPOSE. + +ident = "$Id$" + +import types, weakref, urllib, sys +from threading import RLock +from Namespaces import XMLNS +from Utility import DOM, DOMException, Collection, SplitQName +from StringIO import StringIO + +def GetSchema(component): + """convience function for finding the parent XMLSchema instance. + """ + parent = component + while not isinstance(parent, XMLSchema): + parent = parent._parent() + return parent + +class SchemaReader: + """A SchemaReader creates XMLSchema objects from urls and xml data. + """ + def __init__(self, domReader=None, base_url=None): + """domReader -- class must implement DOMAdapterInterface + base_url -- base url string + """ + self.__base_url = base_url + self.__readerClass = domReader + if not self.__readerClass: + self.__readerClass = DOMAdapter + self._includes = {} + self._imports = {} + + def __setImports(self, schema): + """Add dictionary of imports to schema instance. + schema -- XMLSchema instance + """ + for ns,val in schema.imports.items(): + if self._imports.has_key(ns): + schema.addImportSchema(self._imports[ns]) + + def __setIncludes(self, schema): + """Add dictionary of includes to schema instance. + schema -- XMLSchema instance + """ + for schemaLocation, val in schema.includes.items(): + if self._includes.has_key(schemaLocation): + schema.addIncludeSchema(self._imports[schemaLocation]) + + def addSchemaByLocation(self, location, schema): + """provide reader with schema document for a location. + """ + self._includes[location] = schema + + def addSchemaByNamespace(self, schema): + """provide reader with schema document for a targetNamespace. 
+ """ + self._imports[schema.targetNamespace] = schema + + def loadFromNode(self, parent, element): + """element -- DOM node or document + parent -- WSDLAdapter instance + """ + reader = self.__readerClass(element) + schema = XMLSchema(parent) + #HACK to keep a reference + schema.wsdl = parent + schema.setBaseUrl(self.__base_url) + schema.load(reader) + return schema + + def loadFromStream(self, file, url=None): + """Return an XMLSchema instance loaded from a file object. + file -- file object + url -- base location for resolving imports/includes. + """ + reader = self.__readerClass() + reader.loadDocument(file) + schema = XMLSchema() + if url is not None: + schema.setBaseUrl(url) + schema.load(reader) + self.__setIncludes(schema) + self.__setImports(schema) + return schema + + def loadFromString(self, data): + """Return an XMLSchema instance loaded from an XML string. + data -- XML string + """ + return self.loadFromStream(StringIO(data)) + + def loadFromURL(self, url): + """Return an XMLSchema instance loaded from the given url. + url -- URL to dereference + """ + reader = self.__readerClass() + if self.__base_url: + url = urllib.basejoin(self.__base_url,url) + reader.loadFromURL(url) + schema = XMLSchema() + schema.setBaseUrl(url) + schema.load(reader) + self.__setIncludes(schema) + self.__setImports(schema) + return schema + + def loadFromFile(self, filename): + """Return an XMLSchema instance loaded from the given file. 
+ filename -- name of file to open + """ + if self.__base_url: + filename = urllib.basejoin(self.__base_url,filename) + file = open(filename, 'rb') + try: + schema = self.loadFromStream(file, filename) + finally: + file.close() + + return schema + + +class SchemaError(Exception): + pass + +########################### +# DOM Utility Adapters +########################## +class DOMAdapterInterface: + def hasattr(self, attr, ns=None): + """return true if node has attribute + attr -- attribute to check for + ns -- namespace of attribute, by default None + """ + raise NotImplementedError, 'adapter method not implemented' + + def getContentList(self, *contents): + """returns an ordered list of child nodes + *contents -- list of node names to return + """ + raise NotImplementedError, 'adapter method not implemented' + + def setAttributeDictionary(self, attributes): + """set attribute dictionary + """ + raise NotImplementedError, 'adapter method not implemented' + + def getAttributeDictionary(self): + """returns a dict of node's attributes + """ + raise NotImplementedError, 'adapter method not implemented' + + def getNamespace(self, prefix): + """returns namespace referenced by prefix. + """ + raise NotImplementedError, 'adapter method not implemented' + + def getTagName(self): + """returns tagName of node + """ + raise NotImplementedError, 'adapter method not implemented' + + + def getParentNode(self): + """returns parent element in DOMAdapter or None + """ + raise NotImplementedError, 'adapter method not implemented' + + def loadDocument(self, file): + """load a Document from a file object + file -- + """ + raise NotImplementedError, 'adapter method not implemented' + + def loadFromURL(self, url): + """load a Document from an url + url -- URL to dereference + """ + raise NotImplementedError, 'adapter method not implemented' + + +class DOMAdapter(DOMAdapterInterface): + """Adapter for ZSI.Utility.DOM + """ + def __init__(self, node=None): + """Reset all instance variables. 
+ element -- DOM document, node, or None + """ + if hasattr(node, 'documentElement'): + self.__node = node.documentElement + else: + self.__node = node + self.__attributes = None + + def hasattr(self, attr, ns=None): + """attr -- attribute + ns -- optional namespace, None means unprefixed attribute. + """ + if not self.__attributes: + self.setAttributeDictionary() + if ns: + return self.__attributes.get(ns,{}).has_key(attr) + return self.__attributes.has_key(attr) + + def getContentList(self, *contents): + nodes = [] + ELEMENT_NODE = self.__node.ELEMENT_NODE + for child in DOM.getElements(self.__node, None): + if child.nodeType == ELEMENT_NODE and\ + SplitQName(child.tagName)[1] in contents: + nodes.append(child) + return map(self.__class__, nodes) + + def setAttributeDictionary(self): + self.__attributes = {} + for v in self.__node._attrs.values(): + self.__attributes[v.nodeName] = v.nodeValue + + def getAttributeDictionary(self): + if not self.__attributes: + self.setAttributeDictionary() + return self.__attributes + + def getTagName(self): + return self.__node.tagName + + def getParentNode(self): + if self.__node.parentNode.nodeType == self.__node.ELEMENT_NODE: + return DOMAdapter(self.__node.parentNode) + return None + + def getNamespace(self, prefix): + """prefix -- deference namespace prefix in node's context. + Ascends parent nodes until found. 
+ """ + namespace = None + if prefix == 'xmlns': + namespace = DOM.findDefaultNS(prefix, self.__node) + else: + try: + namespace = DOM.findNamespaceURI(prefix, self.__node) + except DOMException, ex: + if prefix != 'xml': + raise SchemaError, '%s namespace not declared for %s'\ + %(prefix, self.__node._get_tagName()) + namespace = XMLNS.XML + return namespace + + def loadDocument(self, file): + self.__node = DOM.loadDocument(file) + if hasattr(self.__node, 'documentElement'): + self.__node = self.__node.documentElement + + def loadFromURL(self, url): + self.__node = DOM.loadFromURL(url) + if hasattr(self.__node, 'documentElement'): + self.__node = self.__node.documentElement + + +class XMLBase: + """ These class variables are for string indentation. + """ + tag = None + __indent = 0 + __rlock = RLock() + + def __str__(self): + XMLBase.__rlock.acquire() + XMLBase.__indent += 1 + tmp = "<" + str(self.__class__) + '>\n' + for k,v in self.__dict__.items(): + tmp += "%s* %s = %s\n" %(XMLBase.__indent*' ', k, v) + XMLBase.__indent -= 1 + XMLBase.__rlock.release() + return tmp + + +"""Marker Interface: can determine something about an instances properties by using + the provided convenience functions. 
+ +""" +class DefinitionMarker: + """marker for definitions + """ + pass + +class DeclarationMarker: + """marker for declarations + """ + pass + +class AttributeMarker: + """marker for attributes + """ + pass + +class AttributeGroupMarker: + """marker for attribute groups + """ + pass + +class WildCardMarker: + """marker for wildcards + """ + pass + +class ElementMarker: + """marker for wildcards + """ + pass + +class ReferenceMarker: + """marker for references + """ + pass + +class ModelGroupMarker: + """marker for model groups + """ + pass + +class AllMarker(ModelGroupMarker): + """marker for all model group + """ + pass + +class ChoiceMarker(ModelGroupMarker): + """marker for choice model group + """ + pass + +class SequenceMarker(ModelGroupMarker): + """marker for sequence model group + """ + pass + +class ExtensionMarker: + """marker for extensions + """ + pass + +class RestrictionMarker: + """marker for restrictions + """ + facets = ['enumeration', 'length', 'maxExclusive', 'maxInclusive',\ + 'maxLength', 'minExclusive', 'minInclusive', 'minLength',\ + 'pattern', 'fractionDigits', 'totalDigits', 'whiteSpace'] + +class SimpleMarker: + """marker for simple type information + """ + pass + +class ListMarker: + """marker for simple type list + """ + pass + +class UnionMarker: + """marker for simple type Union + """ + pass + + +class ComplexMarker: + """marker for complex type information + """ + pass + +class LocalMarker: + """marker for complex type information + """ + pass + + +class MarkerInterface: + def isDefinition(self): + return isinstance(self, DefinitionMarker) + + def isDeclaration(self): + return isinstance(self, DeclarationMarker) + + def isAttribute(self): + return isinstance(self, AttributeMarker) + + def isAttributeGroup(self): + return isinstance(self, AttributeGroupMarker) + + def isElement(self): + return isinstance(self, ElementMarker) + + def isReference(self): + return isinstance(self, ReferenceMarker) + + def isWildCard(self): + return 
isinstance(self, WildCardMarker) + + def isModelGroup(self): + return isinstance(self, ModelGroupMarker) + + def isAll(self): + return isinstance(self, AllMarker) + + def isChoice(self): + return isinstance(self, ChoiceMarker) + + def isSequence(self): + return isinstance(self, SequenceMarker) + + def isExtension(self): + return isinstance(self, ExtensionMarker) + + def isRestriction(self): + return isinstance(self, RestrictionMarker) + + def isSimple(self): + return isinstance(self, SimpleMarker) + + def isComplex(self): + return isinstance(self, ComplexMarker) + + def isLocal(self): + return isinstance(self, LocalMarker) + + def isList(self): + return isinstance(self, ListMarker) + + def isUnion(self): + return isinstance(self, UnionMarker) + + +########################################################## +# Schema Components +######################################################### +class XMLSchemaComponent(XMLBase, MarkerInterface): + """ + class variables: + required -- list of required attributes + attributes -- dict of default attribute values, including None. + Value can be a function for runtime dependencies. + contents -- dict of namespace keyed content lists. + 'xsd' content of xsd namespace. + xmlns_key -- key for declared xmlns namespace. + xmlns -- xmlns is special prefix for namespace dictionary + xml -- special xml prefix for xml namespace. 
+ """ + required = [] + attributes = {} + contents = {} + xmlns_key = '' + xmlns = 'xmlns' + xml = 'xml' + + def __init__(self, parent=None): + """parent -- parent instance + instance variables: + attributes -- dictionary of node's attributes + """ + self.attributes = None + self._parent = parent + if self._parent: + self._parent = weakref.ref(parent) + + if not self.__class__ == XMLSchemaComponent\ + and not (type(self.__class__.required) == type(XMLSchemaComponent.required)\ + and type(self.__class__.attributes) == type(XMLSchemaComponent.attributes)\ + and type(self.__class__.contents) == type(XMLSchemaComponent.contents)): + raise RuntimeError, 'Bad type for a class variable in %s' %self.__class__ + + def getItemTrace(self): + """Returns a node trace up to the <schema> item. + """ + item, path, name, ref = self, [], 'name', 'ref' + while not isinstance(item,XMLSchema) and not isinstance(item,WSDLToolsAdapter): + attr = item.getAttribute(name) + if attr is None: + attr = item.getAttribute(ref) + if attr is None: path.append('<%s>' %(item.tag)) + else: path.append('<%s ref="%s">' %(item.tag, attr)) + else: + path.append('<%s name="%s">' %(item.tag,attr)) + item = item._parent() + try: + tns = item.getTargetNamespace() + except: + tns = '' + path.append('<%s targetNamespace="%s">' %(item.tag, tns)) + path.reverse() + return ''.join(path) + + def getTargetNamespace(self): + """return targetNamespace + """ + parent = self + targetNamespace = 'targetNamespace' + tns = self.attributes.get(targetNamespace) + while not tns: + parent = parent._parent() + tns = parent.attributes.get(targetNamespace) + return tns + + def getAttributeDeclaration(self, attribute): + """attribute -- attribute with a QName value (eg. type). + collection -- check types collection in parent Schema instance + """ + return self.getQNameAttribute('attr_decl', attribute) + + def getAttributeGroup(self, attribute): + """attribute -- attribute with a QName value (eg. type). 
+ collection -- check types collection in parent Schema instance + """ + return self.getQNameAttribute('attr_groups', attribute) + + def getTypeDefinition(self, attribute): + """attribute -- attribute with a QName value (eg. type). + collection -- check types collection in parent Schema instance + """ + return self.getQNameAttribute('types', attribute) + + def getElementDeclaration(self, attribute): + """attribute -- attribute with a QName value (eg. element). + collection -- check elements collection in parent Schema instance. + """ + return self.getQNameAttribute('elements', attribute) + + def getModelGroup(self, attribute): + """attribute -- attribute with a QName value (eg. ref). + collection -- check model_group collection in parent Schema instance. + """ + return self.getQNameAttribute('model_groups', attribute) + + def getQNameAttribute(self, collection, attribute): + """returns object instance representing QName --> (namespace,name), + or if does not exist return None. + attribute -- an information item attribute, with a QName value. + collection -- collection in parent Schema instance to search. + """ + obj = None + tdc = self.attributes.get(attribute) + if tdc: + parent = GetSchema(self) + targetNamespace = tdc.getTargetNamespace() + if parent.targetNamespace == targetNamespace: + item = tdc.getName() + try: + obj = getattr(parent, collection)[item] + except KeyError, ex: + raise KeyError, "targetNamespace(%s) collection(%s) has no item(%s)"\ + %(targetNamespace, collection, item) + elif parent.imports.has_key(targetNamespace): + schema = parent.imports[targetNamespace].getSchema() + item = tdc.getName() + try: + obj = getattr(schema, collection)[item] + except KeyError, ex: + raise KeyError, "targetNamespace(%s) collection(%s) has no item(%s)"\ + %(targetNamespace, collection, item) + return obj + + def getXMLNS(self, prefix=None): + """deference prefix or by default xmlns, returns namespace. 
+ """ + if prefix == XMLSchemaComponent.xml: + return XMLNS.XML + parent = self + ns = self.attributes[XMLSchemaComponent.xmlns].get(prefix or\ + XMLSchemaComponent.xmlns_key) + while not ns: + parent = parent._parent() + ns = parent.attributes[XMLSchemaComponent.xmlns].get(prefix or\ + XMLSchemaComponent.xmlns_key) + if not ns and isinstance(parent, WSDLToolsAdapter): + raise SchemaError, 'unknown prefix %s' %prefix + return ns + + def getAttribute(self, attribute): + """return requested attribute or None + """ + return self.attributes.get(attribute) + + def setAttributes(self, node): + """Sets up attribute dictionary, checks for required attributes and + sets default attribute values. attr is for default attribute values + determined at runtime. + + structure of attributes dictionary + ['xmlns'][xmlns_key] -- xmlns namespace + ['xmlns'][prefix] -- declared namespace prefix + [namespace][prefix] -- attributes declared in a namespace + [attribute] -- attributes w/o prefix, default namespaces do + not directly apply to attributes, ie Name can't collide + with QName. 
+ """ + self.attributes = {XMLSchemaComponent.xmlns:{}} + for k,v in node.getAttributeDictionary().items(): + prefix,value = SplitQName(k) + if value == XMLSchemaComponent.xmlns: + self.attributes[value][prefix or XMLSchemaComponent.xmlns_key] = v + elif prefix: + ns = node.getNamespace(prefix) + if not ns: + raise SchemaError, 'no namespace for attribute prefix %s'\ + %prefix + if not self.attributes.has_key(ns): + self.attributes[ns] = {} + elif self.attributes[ns].has_key(value): + raise SchemaError, 'attribute %s declared multiple times in %s'\ + %(value, ns) + self.attributes[ns][value] = v + elif not self.attributes.has_key(value): + self.attributes[value] = v + else: + raise SchemaError, 'attribute %s declared multiple times' %value + + if not isinstance(self, WSDLToolsAdapter): + self.__checkAttributes() + self.__setAttributeDefaults() + + #set QNames + for k in ['type', 'element', 'base', 'ref', 'substitutionGroup', 'itemType']: + if self.attributes.has_key(k): + prefix, value = SplitQName(self.attributes.get(k)) + self.attributes[k] = \ + TypeDescriptionComponent((self.getXMLNS(prefix), value)) + + #Union, memberTypes is a whitespace separated list of QNames + for k in ['memberTypes']: + if self.attributes.has_key(k): + qnames = self.attributes[k] + self.attributes[k] = [] + for qname in qnames.split(): + prefix, value = SplitQName(qname) + self.attributes['memberTypes'].append(\ + TypeDescriptionComponent(\ + (self.getXMLNS(prefix), value))) + + def getContents(self, node): + """retrieve xsd contents + """ + return node.getContentList(*self.__class__.contents['xsd']) + + def __setAttributeDefaults(self): + """Looks for default values for unset attributes. If + class variable representing attribute is None, then + it must be defined as an instance variable. 
+ """ + for k,v in self.__class__.attributes.items(): + if v and not self.attributes.has_key(k): + if isinstance(v, types.FunctionType): + self.attributes[k] = v(self) + else: + self.attributes[k] = v + + def __checkAttributes(self): + """Checks that required attributes have been defined, + attributes w/default cannot be required. Checks + all defined attributes are legal, attribute + references are not subject to this test. + """ + for a in self.__class__.required: + if not self.attributes.has_key(a): + raise SchemaError,\ + 'class instance %s, missing required attribute %s'\ + %(self.__class__, a) + for a in self.attributes.keys(): + if (a not in (XMLSchemaComponent.xmlns, XMLNS.XML)) and\ + (a not in self.__class__.attributes.keys()) and not\ + (self.isAttribute() and self.isReference()): + raise SchemaError, '%s, unknown attribute(%s,%s)' \ + %(self.getItemTrace(), a, self.attributes[a]) + + +class WSDLToolsAdapter(XMLSchemaComponent): + """WSDL Adapter to grab the attributes from the wsdl document node. + """ + attributes = {'name':None, 'targetNamespace':None} + tag = 'definitions' + + def __init__(self, wsdl): + XMLSchemaComponent.__init__(self, parent=wsdl) + self.setAttributes(DOMAdapter(wsdl.document)) + + def getImportSchemas(self): + """returns WSDLTools.WSDL types Collection + """ + return self._parent().types + + +class Notation(XMLSchemaComponent): + """<notation> + parent: + schema + attributes: + id -- ID + name -- NCName, Required + public -- token, Required + system -- anyURI + contents: + annotation? 
+ """ + required = ['name', 'public'] + attributes = {'id':None, 'name':None, 'public':None, 'system':None} + contents = {'xsd':('annotation')} + tag = 'notation' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class Annotation(XMLSchemaComponent): + """<annotation> + parent: + all,any,anyAttribute,attribute,attributeGroup,choice,complexContent, + complexType,element,extension,field,group,import,include,key,keyref, + list,notation,redefine,restriction,schema,selector,simpleContent, + simpleType,union,unique + attributes: + id -- ID + contents: + (documentation | appinfo)* + """ + attributes = {'id':None} + contents = {'xsd':('documentation', 'appinfo')} + tag = 'annotation' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.content = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'documentation': + #print_debug('class %s, documentation skipped' %self.__class__, 5) + continue + elif component == 'appinfo': + #print_debug('class %s, appinfo skipped' %self.__class__, 5) + continue + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.content = tuple(content) + + + class Documentation(XMLSchemaComponent): + """<documentation> + parent: + annotation + attributes: + source, anyURI + xml:lang, language + contents: + mixed, any + """ + attributes = {'source':None, 'xml:lang':None} + contents = {'xsd':('mixed', 'any')} + tag = 'documentation' + + def __init__(self, 
parent): + XMLSchemaComponent.__init__(self, parent) + self.content = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'mixed': + #print_debug('class %s, mixed skipped' %self.__class__, 5) + continue + elif component == 'any': + #print_debug('class %s, any skipped' %self.__class__, 5) + continue + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.content = tuple(content) + + + class Appinfo(XMLSchemaComponent): + """<appinfo> + parent: + annotation + attributes: + source, anyURI + contents: + mixed, any + """ + attributes = {'source':None, 'anyURI':None} + contents = {'xsd':('mixed', 'any')} + tag = 'appinfo' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.content = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'mixed': + #print_debug('class %s, mixed skipped' %self.__class__, 5) + continue + elif component == 'any': + #print_debug('class %s, any skipped' %self.__class__, 5) + continue + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.content = tuple(content) + + +class XMLSchemaFake: + # This is temporary, for the benefit of WSDL until the real thing works. + def __init__(self, element): + self.targetNamespace = DOM.getAttr(element, 'targetNamespace') + self.element = element + +class XMLSchema(XMLSchemaComponent): + """A schema is a collection of schema components derived from one + or more schema documents, that is, one or more <schema> element + information items. It represents the abstract notion of a schema + rather than a single schema document (or other representation). 

    <schema>
       parent:
           ROOT
       attributes:
           id -- ID
           version -- token
           xml:lang -- language
           targetNamespace -- anyURI
           attributeFormDefault -- 'qualified' | 'unqualified', 'unqualified'
           elementFormDefault -- 'qualified' | 'unqualified', 'unqualified'
           blockDefault -- '#all' | list of
               ('substitution' | 'extension' | 'restriction')
           finalDefault -- '#all' | list of
               ('extension' | 'restriction' | 'list' | 'union')

       contents:
           ((include | import | redefine | annotation)*,
            (attribute, attributeGroup, complexType, element, group,
             notation, simpleType)*, annotation*)*


       attributes -- schema attributes
       imports -- import statements
       includes -- include statements
       redefines --
       types -- global simpleType, complexType definitions
       elements -- global element declarations
       attr_decl -- global attribute declarations
       attr_groups -- attribute Groups
       model_groups -- model Groups
       notations -- global notations
    """
    attributes = {'id':None,
                  'version':None,
                  'xml:lang':None,
                  'targetNamespace':None,
                  'attributeFormDefault':'unqualified',
                  'elementFormDefault':'unqualified',
                  'blockDefault':None,
                  'finalDefault':None}
    contents = {'xsd':('include', 'import', 'redefine', 'annotation', 'attribute',\
                       'attributeGroup', 'complexType', 'element', 'group',\
                       'notation', 'simpleType', 'annotation')}
    # Key under which an <import> with no namespace attribute is stored.
    empty_namespace = ''
    tag = 'schema'

    def __init__(self, parent=None):
        """parent --
           instance variables:
           targetNamespace -- schema's declared targetNamespace, or empty string.
           _imported_schemas -- namespace keyed dict of schema dependencies, if
              a schema is provided instance will not resolve import statement.
           _included_schemas -- schemaLocation keyed dict of component schemas,
              if schema is provided instance will not resolve include statement.
           _base_url -- needed for relative URLs support, only works with URLs
               relative to initial document.
           includes -- collection of include statements
           imports -- collection of import statements
           elements -- collection of global element declarations
           types -- collection of global type definitions
           attr_decl -- collection of global attribute declarations
           attr_groups -- collection of global attribute group definitions
           model_groups -- collection of model group definitions
           notations -- collection of notations

        """
        self.targetNamespace = None
        XMLSchemaComponent.__init__(self, parent)
        # Key-extraction functions for the Collections below: global components
        # are keyed by name, imports by namespace, includes by schemaLocation.
        f = lambda k: k.attributes['name']
        ns = lambda k: k.attributes['namespace']
        sl = lambda k: k.attributes['schemaLocation']
        self.includes = Collection(self, key=sl)
        self.imports = Collection(self, key=ns)
        self.elements = Collection(self, key=f)
        self.types = Collection(self, key=f)
        self.attr_decl = Collection(self, key=f)
        self.attr_groups = Collection(self, key=f)
        self.model_groups = Collection(self, key=f)
        self.notations = Collection(self, key=f)

        self._imported_schemas = {}
        self._included_schemas = {}
        self._base_url = None

    def addImportSchema(self, schema):
        """for resolving import statements in Schema instance
           schema -- schema instance
           _imported_schemas
        """
        if not isinstance(schema, XMLSchema):
            raise TypeError, 'expecting a Schema instance'
        # An import must target a *different* namespace than this schema.
        if schema.targetNamespace != self.targetNamespace:
            self._imported_schemas[schema.targetNamespace] = schema
        else:
            raise SchemaError, 'import schema bad targetNamespace'

    def addIncludeSchema(self, schemaLocation, schema):
        """for resolving include statements in Schema instance
           schemaLocation -- schema location
           schema -- schema instance
           _included_schemas
        """
        if not isinstance(schema, XMLSchema):
            raise TypeError, 'expecting a Schema instance'
        # An include must be chameleon (no targetNamespace) or share ours.
        if not schema.targetNamespace or\
           schema.targetNamespace == self.targetNamespace:
            self._included_schemas[schemaLocation] = schema
        else:
            raise SchemaError, 'include schema bad targetNamespace'

    def setImportSchemas(self, schema_dict):
        """set the import schema dictionary, which is used to
           reference dependent schemas.
        """
        self._imported_schemas = schema_dict

    def getImportSchemas(self):
        """get the import schema dictionary, which is used to
           reference dependent schemas.
        """
        return self._imported_schemas

    def getSchemaNamespacesToImport(self):
        """returns tuple of namespaces the schema instance has declared
           itself to be dependent upon.
        """
        # NOTE(review): this returns the keys of self.includes, which are
        # schemaLocation URIs, not the import namespaces the docstring
        # describes -- confirm against upstream wstools before relying on it.
        return tuple(self.includes.keys())

    def setIncludeSchemas(self, schema_dict):
        """set the include schema dictionary, which is keyed with
           schemaLocation (uri).
           This is a means of providing
           schemas to the current schema for content inclusion.
        """
        self._included_schemas = schema_dict

    def getIncludeSchemas(self):
        """get the include schema dictionary, which is keyed with
           schemaLocation (uri).
        """
        return self._included_schemas

    def getBaseUrl(self):
        """get base url, used for normalizing all relative uri's
        """
        return self._base_url

    def setBaseUrl(self, url):
        """set base url, used for normalizing all relative uri's
        """
        self._base_url = url

    def getElementFormDefault(self):
        """return elementFormDefault attribute
        """
        return self.attributes.get('elementFormDefault')

    def isElementFormDefaultQualified(self):
        # True when the schema declared elementFormDefault="qualified".
        return self.attributes.get('elementFormDefault') == 'qualified'

    def getAttributeFormDefault(self):
        """return attributeFormDefault attribute
        """
        return self.attributes.get('attributeFormDefault')

    def getBlockDefault(self):
        """return blockDefault attribute
        """
        return self.attributes.get('blockDefault')

    def getFinalDefault(self):
        """return finalDefault attribute
        """
        return self.attributes.get('finalDefault')

    def load(self, node):
        """Populate this schema instance from a DOM <schema> node.

           node -- DOM wrapper for the <schema> element.  When the schema is
           nested under a WSDL <types> element, the parent's attributes
           (notably xmlns declarations) are merged in first so QName
           resolution still works.
        """
        pnode = node.getParentNode()
        if pnode:
            pname = SplitQName(pnode.getTagName())[1]
            if pname == 'types':
                # Schema embedded in wsdl:types -- inherit the parent's
                # xmlns declarations without overriding our own.
                attributes = {}
                self.setAttributes(pnode)
                attributes.update(self.attributes)
                self.setAttributes(node)
                for k,v in attributes['xmlns'].items():
                    if not self.attributes['xmlns'].has_key(k):
                        self.attributes['xmlns'][k] = v
            else:
                self.setAttributes(node)
        else:
            self.setAttributes(node)

        self.targetNamespace = self.getTargetNamespace()
        contents = self.getContents(node)

        indx = 0
        num = len(contents)
        # The XSD content model is ((include|import|redefine|annotation)*,
        # (global components)*, annotation*)* -- hence the nested while loops
        # walking each phase in order.
        while indx < num:
            while indx < num:
                node = contents[indx]
                component = SplitQName(node.getTagName())[1]

                if component == 'include':
                    tp = self.__class__.Include(self)
                    tp.fromDom(node)
                    self.includes[tp.attributes['schemaLocation']] = tp

                    schema = tp.getSchema()
                    if schema.targetNamespace and \
                       schema.targetNamespace != self.targetNamespace:
                        raise SchemaError, 'included schema bad targetNamespace'

                    # Merge the included schema's global components into this
                    # schema, without clobbering components we already own.
                    for collection in ['imports','elements','types',\
                                       'attr_decl','attr_groups','model_groups','notations']:
                        for k,v in getattr(schema,collection).items():
                            if not getattr(self,collection).has_key(k):
                                # Re-parent the merged component to this schema;
                                # _parent is stored as a weakref callable (see
                                # the self._parent() usage elsewhere).
                                v._parent = weakref.ref(self)
                                getattr(self,collection)[k] = v

                elif component == 'import':
                    tp = self.__class__.Import(self)
                    tp.fromDom(node)
                    import_ns = tp.getAttribute('namespace')
                    if import_ns:
                        if import_ns == self.targetNamespace:
                            raise SchemaError,\
                                'import and schema have same targetNamespace'
                        self.imports[import_ns] = tp
                    else:
                        # Import without a namespace attribute: keyed under ''.
                        self.imports[self.__class__.empty_namespace] = tp

                    # Only resolve the import now if it is not already known
                    # and a schemaLocation is available to fetch from.
                    if not self.getImportSchemas().has_key(import_ns) and\
                       tp.getAttribute('schemaLocation'):
                        self.addImportSchema(tp.getSchema())

                elif component == 'redefine':
                    #print_debug('class %s, redefine skipped' %self.__class__, 5)
                    pass
                elif component == 'annotation':
                    #print_debug('class %s, annotation skipped' %self.__class__, 5)
                    pass
                else:
                    break
                indx += 1

            # (attribute, attributeGroup, complexType, element, group,
            # notation, simpleType)*, annotation*)*
            while indx < num:
                node = contents[indx]
                component = SplitQName(node.getTagName())[1]

                if component == 'attribute':
                    tp = AttributeDeclaration(self)
                    tp.fromDom(node)
                    self.attr_decl[tp.getAttribute('name')] = tp
                elif component == 'attributeGroup':
                    tp = AttributeGroupDefinition(self)
                    tp.fromDom(node)
                    self.attr_groups[tp.getAttribute('name')] = tp
                elif component == 'complexType':
                    tp = ComplexType(self)
                    tp.fromDom(node)
                    self.types[tp.getAttribute('name')] = tp
                elif component == 'element':
                    tp = ElementDeclaration(self)
                    tp.fromDom(node)
                    self.elements[tp.getAttribute('name')] = tp
                elif component == 'group':
                    tp = ModelGroupDefinition(self)
                    tp.fromDom(node)
                    self.model_groups[tp.getAttribute('name')] = tp
                elif component == 'notation':
                    tp = Notation(self)
                    tp.fromDom(node)
                    self.notations[tp.getAttribute('name')] = tp
                elif component == 'simpleType':
                    tp = SimpleType(self)
                    tp.fromDom(node)
                    self.types[tp.getAttribute('name')] = tp
                else:
                    break
                indx += 1

            # Trailing annotations after the global components.
            while indx < num:
                node = contents[indx]
                component = SplitQName(node.getTagName())[1]

                if component == 'annotation':
                    #print_debug('class %s, annotation 2 skipped' %self.__class__, 5)
                    pass
                else:
                    break
                indx += 1


    class Import(XMLSchemaComponent):
        """<import>
           parent:
               schema
           attributes:
               id -- ID
               namespace -- anyURI
               schemaLocation -- anyURI
           contents:
               annotation?
        """
        attributes = {'id':None,
                      'namespace':None,
                      'schemaLocation':None}
        contents = {'xsd':['annotation']}
        tag = 'import'

        def __init__(self, parent):
            XMLSchemaComponent.__init__(self, parent)
            self.annotation = None
            self._schema = None

        def fromDom(self, node):
            # Parse the <import> element: attributes plus an optional
            # <annotation> child; any other child is an error.
            self.setAttributes(node)
            contents = self.getContents(node)

            if self.attributes['namespace'] == self.getTargetNamespace():
                raise SchemaError, 'namespace of schema and import match'

            for i in contents:
                component = SplitQName(i.getTagName())[1]
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())

        def getSchema(self):
            """if schema is not defined, first look for a Schema class instance
               in parent Schema.  Else if not defined resolve schemaLocation
               and create a new Schema class instance, and keep a hard reference.
            """
            if not self._schema:
                ns = self.attributes['namespace']
                # _parent is a weakref callable; look in the enclosing schema,
                # then one level further up, before fetching from the network.
                schema = self._parent().getImportSchemas().get(ns)
                if not schema and self._parent()._parent:
                    schema = self._parent()._parent().getImportSchemas().get(ns)
                if not schema:
                    url = self.attributes.get('schemaLocation')
                    if not url:
                        raise SchemaError, 'namespace(%s) is unknown' %ns
                    base_url = self._parent().getBaseUrl()
                    reader = SchemaReader(base_url=base_url)
                    reader._imports = self._parent().getImportSchemas()
                    reader._includes = self._parent().getIncludeSchemas()
                    self._schema = reader.loadFromURL(url)
            return self._schema or schema


    class Include(XMLSchemaComponent):
        """<include schemaLocation>
           parent:
               schema
           attributes:
               id -- ID
               schemaLocation -- anyURI, required
           contents:
               annotation?
+ """ + required = ['schemaLocation'] + attributes = {'id':None, + 'schemaLocation':None} + contents = {'xsd':['annotation']} + tag = 'include' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self._schema = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + def getSchema(self): + """if schema is not defined, first look for a Schema class instance + in parent Schema. Else if not defined resolve schemaLocation + and create a new Schema class instance. + """ + if not self._schema: + schema = self._parent() + self._schema = schema.getIncludeSchemas().get(\ + self.attributes['schemaLocation'] + ) + if not self._schema: + url = self.attributes['schemaLocation'] + reader = SchemaReader(base_url=schema.getBaseUrl()) + reader._imports = schema.getImportSchemas() + reader._includes = schema.getIncludeSchemas() + self._schema = reader.loadFromURL(url) + return self._schema + + +class AttributeDeclaration(XMLSchemaComponent,\ + AttributeMarker,\ + DeclarationMarker): + """<attribute name> + parent: + schema + attributes: + id -- ID + name -- NCName, required + type -- QName + default -- string + fixed -- string + contents: + annotation?, simpleType? 
+ """ + required = ['name'] + attributes = {'id':None, + 'name':None, + 'type':None, + 'default':None, + 'fixed':None} + contents = {'xsd':['annotation','simpleType']} + tag = 'attribute' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + + def fromDom(self, node): + """ No list or union support + """ + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + elif component == 'simpleType': + self.content = AnonymousSimpleType(self) + self.content.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class LocalAttributeDeclaration(AttributeDeclaration,\ + AttributeMarker,\ + LocalMarker,\ + DeclarationMarker): + """<attribute name> + parent: + complexType, restriction, extension, attributeGroup + attributes: + id -- ID + name -- NCName, required + type -- QName + form -- ('qualified' | 'unqualified'), schema.attributeFormDefault + use -- ('optional' | 'prohibited' | 'required'), optional + default -- string + fixed -- string + contents: + annotation?, simpleType? 
+ """ + required = ['name'] + attributes = {'id':None, + 'name':None, + 'type':None, + 'form':lambda self: GetSchema(self).getAttributeFormDefault(), + 'use':'optional', + 'default':None, + 'fixed':None} + contents = {'xsd':['annotation','simpleType']} + + def __init__(self, parent): + AttributeDeclaration.__init__(self, parent) + self.annotation = None + self.content = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + elif component == 'simpleType': + self.content = AnonymousSimpleType(self) + self.content.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class AttributeWildCard(XMLSchemaComponent,\ + AttributeMarker,\ + DeclarationMarker,\ + WildCardMarker): + """<anyAttribute> + parents: + complexType, restriction, extension, attributeGroup + attributes: + id -- ID + namespace -- '##any' | '##other' | + (anyURI* | '##targetNamespace' | '##local'), ##any + processContents -- 'lax' | 'skip' | 'strict', strict + contents: + annotation? 
+ """ + attributes = {'id':None, + 'namespace':'##any', + 'processContents':'strict'} + contents = {'xsd':['annotation']} + tag = 'anyAttribute' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class AttributeReference(XMLSchemaComponent,\ + AttributeMarker,\ + ReferenceMarker): + """<attribute ref> + parents: + complexType, restriction, extension, attributeGroup + attributes: + id -- ID + ref -- QName, required + use -- ('optional' | 'prohibited' | 'required'), optional + default -- string + fixed -- string + contents: + annotation? + """ + required = ['ref'] + attributes = {'id':None, + 'ref':None, + 'use':'optional', + 'default':None, + 'fixed':None} + contents = {'xsd':['annotation']} + tag = 'attribute' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + + def getAttributeDeclaration(self, attribute='ref'): + return XMLSchemaComponent.getAttributeDeclaration(self, attribute) + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class AttributeGroupDefinition(XMLSchemaComponent,\ + AttributeGroupMarker,\ + DefinitionMarker): + """<attributeGroup name> + parents: + schema, redefine + attributes: + id -- ID + name -- NCName, required + contents: + annotation?, (attribute | attributeGroup)*, anyAttribute? 
+ """ + required = ['name'] + attributes = {'id':None, + 'name':None} + contents = {'xsd':['annotation', 'attribute', 'attributeGroup', 'anyAttribute']} + tag = 'attributeGroup' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.attr_content = None + + def getAttributeContent(self): + return self.attr_content + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for indx in range(len(contents)): + component = SplitQName(contents[indx].getTagName())[1] + if (component == 'annotation') and (not indx): + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + elif component == 'attribute': + if contents[indx].hasattr('name'): + content.append(LocalAttributeDeclaration(self)) + elif contents[indx].hasattr('ref'): + content.append(AttributeReference(self)) + else: + raise SchemaError, 'Unknown attribute type' + content[-1].fromDom(contents[indx]) + elif component == 'attributeGroup': + content.append(AttributeGroupReference(self)) + content[-1].fromDom(contents[indx]) + elif component == 'anyAttribute': + if len(contents) != indx+1: + raise SchemaError, 'anyAttribute is out of order in %s' %self.getItemTrace() + content.append(AttributeWildCard(self)) + content[-1].fromDom(contents[indx]) + else: + raise SchemaError, 'Unknown component (%s)' %(contents[indx].getTagName()) + + self.attr_content = tuple(content) + +class AttributeGroupReference(XMLSchemaComponent,\ + AttributeGroupMarker,\ + ReferenceMarker): + """<attributeGroup ref> + parents: + complexType, restriction, extension, attributeGroup + attributes: + id -- ID + ref -- QName, required + contents: + annotation? 
+ """ + required = ['ref'] + attributes = {'id':None, + 'ref':None} + contents = {'xsd':['annotation']} + tag = 'attributeGroup' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + + def getAttributeGroup(self, attribute='ref'): + """attribute -- attribute with a QName value (eg. type). + collection -- check types collection in parent Schema instance + """ + return XMLSchemaComponent.getQNameAttribute(self, 'attr_groups', attribute) + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + + +###################################################### +# Elements +##################################################### +class IdentityConstrants(XMLSchemaComponent): + """Allow one to uniquely identify nodes in a document and ensure the + integrity of references between them. 

       attributes -- dictionary of attributes
       selector -- XPath to selected nodes
       fields -- list of XPath to key field
    """
    # NOTE: the class name ("Constrants") is misspelled upstream; it is kept
    # as-is for API compatibility with code importing this vendored module.
    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.selector = None
        self.fields = None
        self.annotation = None

    def fromDom(self, node):
        # Parse the constraint element: optional <annotation>, one <selector>,
        # and one or more <field> children, collected into self.fields.
        self.setAttributes(node)
        contents = self.getContents(node)
        fields = []

        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                elif component == 'selector':
                    self.selector = self.Selector(self)
                    self.selector.fromDom(i)
                    continue
                elif component == 'field':
                    fields.append(self.Field(self))
                    fields[-1].fromDom(i)
                    continue
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
        self.fields = tuple(fields)


    class Constraint(XMLSchemaComponent):
        # Shared base for Selector/Field: an element carrying an xpath
        # attribute and at most one <annotation> child.
        def __init__(self, parent):
            XMLSchemaComponent.__init__(self, parent)
            self.annotation = None

        def fromDom(self, node):
            self.setAttributes(node)
            contents = self.getContents(node)

            for i in contents:
                component = SplitQName(i.getTagName())[1]
                if component in self.__class__.contents['xsd']:
                    if component == 'annotation' and not self.annotation:
                        self.annotation = Annotation(self)
                        self.annotation.fromDom(i)
                    else:
                        raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())

    class Selector(Constraint):
        """<selector xpath>
           parent:
               unique, key, keyref
           attributes:
               id -- ID
               xpath -- XPath subset, required
           contents:
               annotation?
        """
        required = ['xpath']
        attributes = {'id':None,
                      'xpath':None}
        contents = {'xsd':['annotation']}
        tag = 'selector'

    class Field(Constraint):
        """<field xpath>
           parent:
               unique, key, keyref
           attributes:
               id -- ID
               xpath -- XPath subset, required
           contents:
               annotation?
        """
        required = ['xpath']
        attributes = {'id':None,
                      'xpath':None}
        contents = {'xsd':['annotation']}
        tag = 'field'


class Unique(IdentityConstrants):
    """<unique name> Enforce fields are unique w/i a specified scope.

       parent:
           element
       attributes:
           id -- ID
           name -- NCName, required
       contents:
           annotation?, selector, field+
    """
    required = ['name']
    attributes = {'id':None,
                  'name':None}
    contents = {'xsd':['annotation', 'selector', 'field']}
    tag = 'unique'


class Key(IdentityConstrants):
    """<key name> Enforce fields are unique w/i a specified scope, and all
           field values are present w/i document.  Fields cannot
           be nillable.

       parent:
           element
       attributes:
           id -- ID
           name -- NCName, required
       contents:
           annotation?, selector, field+
    """
    required = ['name']
    attributes = {'id':None,
                  'name':None}
    contents = {'xsd':['annotation', 'selector', 'field']}
    tag = 'key'


class KeyRef(IdentityConstrants):
    """<keyref name refer> Ensure a match between two sets of values in an
           instance.

       parent:
           element
       attributes:
           id -- ID
           name -- NCName, required
           refer -- QName, required
       contents:
           annotation?, selector, field+
    """
    required = ['name', 'refer']
    attributes = {'id':None,
                  'name':None,
                  'refer':None}
    contents = {'xsd':['annotation', 'selector', 'field']}
    tag = 'keyref'


class ElementDeclaration(XMLSchemaComponent,\
                         ElementMarker,\
                         DeclarationMarker):
    """<element name>
       parents:
           schema
       attributes:
           id -- ID
           name -- NCName, required
           type -- QName
           default -- string
           fixed -- string
           nillable -- boolean, false
           abstract -- boolean, false
           substitutionGroup -- QName
           block -- ('#all' | ('substitution' | 'extension' | 'restriction')*),
               schema.blockDefault
           final -- ('#all' | ('extension' | 'restriction')*),
               schema.finalDefault
       contents:
           annotation?, (simpleType,complexType)?, (key | keyref | unique)*

    """
    required = ['name']
    attributes = {'id':None,
                  'name':None,
                  'type':None,
                  'default':None,
                  'fixed':None,
                  'nillable':0,
                  'abstract':0,
                  'substitutionGroup':None,
                  # block/final default lazily to the enclosing schema's values.
                  'block':lambda self: self._parent().getBlockDefault(),
                  'final':lambda self: self._parent().getFinalDefault()}
    contents = {'xsd':['annotation', 'simpleType', 'complexType', 'key',\
                       'keyref', 'unique']}
    tag = 'element'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None
        self.content = None
        self.constraints = ()

    def isQualified(self):
        '''Global elements are always qualified.
        '''
        return True

    def getElementDeclaration(self, attribute):
        # Not meaningful on a declaration itself (only on references).
        raise Warning, 'invalid operation for <%s>' %self.tag

    def getTypeDefinition(self, attribute=None):
        '''If attribute is None, "type" is assumed, return the corresponding
        representation of the global type definition (TypeDefinition),
        or the local definition if don't find "type".  To maintain backwards
        compat, if attribute is provided call base class method.
        '''
        if attribute:
            return XMLSchemaComponent.getTypeDefinition(self, attribute)
        gt = XMLSchemaComponent.getTypeDefinition(self, 'type')
        if gt:
            return gt
        # No global 'type' -- fall back to the inline (anonymous) definition.
        return self.content

    def getConstraints(self):
        return self._constraints
    def setConstraints(self, constraints):
        self._constraints = tuple(constraints)
    constraints = property(getConstraints, setConstraints, None, "tuple of key, keyref, unique constraints")

    def fromDom(self, node):
        # Parse <element name=...>: optional <annotation>, at most one inline
        # type (<simpleType> or <complexType>), and any number of identity
        # constraints (key/keyref/unique) gathered into self.constraints.
        self.setAttributes(node)
        contents = self.getContents(node)
        constraints = []
        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                elif component == 'simpleType' and not self.content:
                    self.content = AnonymousSimpleType(self)
                    self.content.fromDom(i)
                elif component == 'complexType' and not self.content:
                    self.content = LocalComplexType(self)
                    self.content.fromDom(i)
                elif component == 'key':
                    constraints.append(Key(self))
                    constraints[-1].fromDom(i)
                elif component == 'keyref':
                    constraints.append(KeyRef(self))
                    constraints[-1].fromDom(i)
                elif component == 'unique':
                    constraints.append(Unique(self))
                    constraints[-1].fromDom(i)
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())

        # Assigned through the 'constraints' property, which tuple-izes.
        self.constraints = constraints


class LocalElementDeclaration(ElementDeclaration,\
                              LocalMarker):
    """<element>
       parents:
           all, choice, sequence
       attributes:
           id -- ID
           name -- NCName, required
           form -- ('qualified' | 'unqualified'), schema.elementFormDefault
           type -- QName
           minOccurs -- Whole Number, 1
           maxOccurs -- (Whole Number | 'unbounded'), 1
           default -- string
           fixed -- string
           nillable -- boolean, false
           block -- ('#all' | ('extension' | 'restriction')*), schema.blockDefault
       contents:
           annotation?, (simpleType,complexType)?, (key | keyref | unique)*
    """
    required = ['name']
    attributes = {'id':None,
                  'name':None,
                  # 'form' defaults lazily to the schema's elementFormDefault.
                  'form':lambda self: GetSchema(self).getElementFormDefault(),
                  'type':None,
                  'minOccurs':'1',
                  'maxOccurs':'1',
                  'default':None,
                  'fixed':None,
                  'nillable':0,
                  'abstract':0,
                  'block':lambda self: GetSchema(self).getBlockDefault()}
    contents = {'xsd':['annotation', 'simpleType', 'complexType', 'key',\
                       'keyref', 'unique']}

    def isQualified(self):
        '''Local elements can be qualified or unqualified according
        to the attribute form, or the elementFormDefault.  By default
        local elements are unqualified.
        '''
        form = self.getAttribute('form')
        if form == 'qualified':
            return True
        if form == 'unqualified':
            return False
        raise SchemaError, 'Bad form (%s) for element: %s' %(form, self.getItemTrace())


class ElementReference(XMLSchemaComponent,\
                       ElementMarker,\
                       ReferenceMarker):
    """<element ref>
       parents:
           all, choice, sequence
       attributes:
           id -- ID
           ref -- QName, required
           minOccurs -- Whole Number, 1
           maxOccurs -- (Whole Number | 'unbounded'), 1
       contents:
           annotation?
    """
    required = ['ref']
    attributes = {'id':None,
                  'ref':None,
                  'minOccurs':'1',
                  'maxOccurs':'1'}
    contents = {'xsd':['annotation']}
    tag = 'element'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None

    def getElementDeclaration(self, attribute=None):
        '''If attribute is None, "ref" is assumed, return the corresponding
        representation of the global element declaration (ElementDeclaration),
        To maintain backwards compat, if attribute is provided call base class method.
        '''
        return GetSchema(self).isElementFormDefaultQualified()

    def getTypeDefinition(self, attribute):
        # A wildcard has no declared type to resolve.
        raise Warning, 'invalid operation for <%s>' %self.tag

    def fromDom(self, node):
        # Parse <any>: attributes plus an optional <annotation> child.
        self.annotation = None
        self.setAttributes(node)
        for i in self.getContents(node):
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())


######################################################
# Model Groups
#####################################################
class Sequence(XMLSchemaComponent,\
               SequenceMarker):
    """<sequence>
       parents:
           complexType, extension, restriction, group, choice, sequence
       attributes:
           id -- ID
           minOccurs -- Whole Number, 1
           maxOccurs -- (Whole Number | 'unbounded'), 1

       contents:
           annotation?, (element | group | choice | sequence | any)*
    """
    attributes = {'id':None,
                  'minOccurs':'1',
                  'maxOccurs':'1'}
    contents = {'xsd':['annotation', 'element', 'group', 'choice', 'sequence',\
                       'any']}
    tag = 'sequence'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None
        self.content = None

    def fromDom(self, node):
        # Parse the particle children in order into self.content; each branch
        # appends an unparsed component, and the shared content[-1].fromDom(i)
        # after the chain parses it (the annotation branch 'continue's past it).
        self.setAttributes(node)
        contents = self.getContents(node)
        content = []

        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                    continue
                elif component == 'element':
                    # 'ref' means a reference, otherwise a local declaration.
                    if i.hasattr('ref'):
                        content.append(ElementReference(self))
                    else:
                        content.append(LocalElementDeclaration(self))
                elif component == 'group':
                    content.append(ModelGroupReference(self))
                elif component == 'choice':
                    content.append(Choice(self))
                elif component == 'sequence':
                    content.append(Sequence(self))
                elif component == 'any':
                    content.append(ElementWildCard(self))
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
                content[-1].fromDom(i)
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
        self.content = tuple(content)


class All(XMLSchemaComponent,\
          AllMarker):
    """<all>
       parents:
           complexType, extension, restriction, group
       attributes:
           id -- ID
           minOccurs -- '0' | '1', 1
           maxOccurs -- '1', 1

       contents:
           annotation?, element*
    """
    attributes = {'id':None,
                  'minOccurs':'1',
                  'maxOccurs':'1'}
    contents = {'xsd':['annotation', 'element']}
    tag = 'all'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None
        self.content = None

    def fromDom(self, node):
        # Like Sequence.fromDom, but <all> only admits element children.
        self.setAttributes(node)
        contents = self.getContents(node)
        content = []

        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                    continue
                elif component == 'element':
                    if i.hasattr('ref'):
                        content.append(ElementReference(self))
                    else:
                        content.append(LocalElementDeclaration(self))
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
                content[-1].fromDom(i)
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
        self.content = tuple(content)


class Choice(XMLSchemaComponent,\
             ChoiceMarker):
    """<choice>
       parents:
           complexType, extension, restriction, group, choice, sequence
       attributes:
           id -- ID
           minOccurs -- Whole Number, 1
           maxOccurs -- (Whole Number | 'unbounded'), 1

       contents:
           annotation?, (element | group | choice | sequence | any)*
    """
    attributes = {'id':None,
                  'minOccurs':'1',
                  'maxOccurs':'1'}
    contents = {'xsd':['annotation', 'element', 'group', 'choice', 'sequence',\
                       'any']}
    tag = 'choice'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None
        self.content = None

    def fromDom(self, node):
        # Same particle parse as Sequence.fromDom (choice admits the same
        # child components).
        self.setAttributes(node)
        contents = self.getContents(node)
        content = []

        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                    continue
                elif component == 'element':
                    if i.hasattr('ref'):
                        content.append(ElementReference(self))
                    else:
                        content.append(LocalElementDeclaration(self))
                elif component == 'group':
                    content.append(ModelGroupReference(self))
                elif component == 'choice':
                    content.append(Choice(self))
                elif component == 'sequence':
                    content.append(Sequence(self))
                elif component == 'any':
                    content.append(ElementWildCard(self))
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
                content[-1].fromDom(i)
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
        self.content = tuple(content)


class ModelGroupDefinition(XMLSchemaComponent,\
                           ModelGroupMarker,\
                           DefinitionMarker):
    """<group name>
       parents:
           redefine, schema
       attributes:
           id -- ID
           name -- NCName, required

       contents:
           annotation?, (all | choice | sequence)?
    """
    required = ['name']
    attributes = {'id':None,
                  'name':None}
    contents = {'xsd':['annotation', 'all', 'choice', 'sequence']}
    tag = 'group'

    def __init__(self, parent):
        XMLSchemaComponent.__init__(self, parent)
        self.annotation = None
        self.content = None

    def fromDom(self, node):
        # Parse <group name=...>: optional <annotation> plus at most one
        # compositor (all | choice | sequence) stored as self.content.
        self.setAttributes(node)
        contents = self.getContents(node)

        for i in contents:
            component = SplitQName(i.getTagName())[1]
            if component in self.__class__.contents['xsd']:
                if component == 'annotation' and not self.annotation:
                    self.annotation = Annotation(self)
                    self.annotation.fromDom(i)
                    continue
                elif component == 'all' and not self.content:
                    self.content = All(self)
                elif component == 'choice' and not self.content:
                    self.content = Choice(self)
                elif component == 'sequence' and not self.content:
                    self.content = Sequence(self)
                else:
                    raise SchemaError, 'Unknown component (%s)' %(i.getTagName())
                self.content.fromDom(i)
            else:
                raise SchemaError, 'Unknown component (%s)' %(i.getTagName())


class ModelGroupReference(XMLSchemaComponent,\
                          ModelGroupMarker,\
                          ReferenceMarker):
    """<group ref>
       parents:
           choice, complexType, extension, restriction, sequence
       attributes:
           id -- ID
           ref -- NCName, required
           minOccurs -- Whole Number, 1
           maxOccurs -- (Whole Number | 'unbounded'), 1

       contents:
           annotation?
+ """ + required = ['ref'] + attributes = {'id':None, + 'ref':None, + 'minOccurs':'1', + 'maxOccurs':'1'} + contents = {'xsd':['annotation']} + tag = 'group' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + + def getModelGroupReference(self): + return self.getModelGroup('ref') + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component in self.__class__.contents['xsd']: + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + + +class ComplexType(XMLSchemaComponent,\ + DefinitionMarker,\ + ComplexMarker): + """<complexType name> + parents: + redefine, schema + attributes: + id -- ID + name -- NCName, required + mixed -- boolean, false + abstract -- boolean, false + block -- ('#all' | ('extension' | 'restriction')*), schema.blockDefault + final -- ('#all' | ('extension' | 'restriction')*), schema.finalDefault + + contents: + annotation?, (simpleContent | complexContent | + ((group | all | choice | sequence)?, (attribute | attributeGroup)*, anyAttribute?)) + """ + required = ['name'] + attributes = {'id':None, + 'name':None, + 'mixed':0, + 'abstract':0, + 'block':lambda self: self._parent().getBlockDefault(), + 'final':lambda self: self._parent().getFinalDefault()} + contents = {'xsd':['annotation', 'simpleContent', 'complexContent',\ + 'group', 'all', 'choice', 'sequence', 'attribute', 'attributeGroup',\ + 'anyAttribute', 'any']} + tag = 'complexType' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + self.attr_content = None + + def getAttributeContent(self): + return self.attr_content + + def 
getElementDeclaration(self, attribute): + raise Warning, 'invalid operation for <%s>' %self.tag + + def getTypeDefinition(self, attribute): + raise Warning, 'invalid operation for <%s>' %self.tag + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + indx = 0 + num = len(contents) + #XXX ugly + if not num: + return + component = SplitQName(contents[indx].getTagName())[1] + if component == 'annotation': + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + indx += 1 + component = SplitQName(contents[indx].getTagName())[1] + + self.content = None + if component == 'simpleContent': + self.content = self.__class__.SimpleContent(self) + self.content.fromDom(contents[indx]) + elif component == 'complexContent': + self.content = self.__class__.ComplexContent(self) + self.content.fromDom(contents[indx]) + else: + if component == 'all': + self.content = All(self) + elif component == 'choice': + self.content = Choice(self) + elif component == 'sequence': + self.content = Sequence(self) + elif component == 'group': + self.content = ModelGroupReference(self) + + if self.content: + self.content.fromDom(contents[indx]) + indx += 1 + + self.attr_content = [] + while indx < num: + component = SplitQName(contents[indx].getTagName())[1] + if component == 'attribute': + if contents[indx].hasattr('ref'): + self.attr_content.append(AttributeReference(self)) + else: + self.attr_content.append(LocalAttributeDeclaration(self)) + elif component == 'attributeGroup': + self.attr_content.append(AttributeGroupReference(self)) + elif component == 'anyAttribute': + self.attr_content.append(AttributeWildCard(self)) + else: + raise SchemaError, 'Unknown component (%s): %s' \ + %(contents[indx].getTagName(),self.getItemTrace()) + self.attr_content[-1].fromDom(contents[indx]) + indx += 1 + + class _DerivedType(XMLSchemaComponent): + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + 
self.derivation = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + for i in contents: + component = SplitQName(i.getTagName())[1] + if component in self.__class__.contents['xsd']: + if component == 'annotation' and not self.annotation: + self.annotation = Annotation(self) + self.annotation.fromDom(i) + continue + elif component == 'restriction' and not self.derivation: + self.derivation = self.__class__.Restriction(self) + elif component == 'extension' and not self.derivation: + self.derivation = self.__class__.Extension(self) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.derivation.fromDom(i) + + class ComplexContent(_DerivedType,\ + ComplexMarker): + """<complexContent> + parents: + complexType + attributes: + id -- ID + mixed -- boolean, false + + contents: + annotation?, (restriction | extension) + """ + attributes = {'id':None, + 'mixed':0 } + contents = {'xsd':['annotation', 'restriction', 'extension']} + tag = 'complexContent' + + class _DerivationBase(XMLSchemaComponent): + """<extension>,<restriction> + parents: + complexContent + attributes: + id -- ID + base -- QName, required + + contents: + annotation?, (group | all | choice | sequence)?, + (attribute | attributeGroup)*, anyAttribute? 
+ """ + required = ['base'] + attributes = {'id':None, + 'base':None } + contents = {'xsd':['annotation', 'group', 'all', 'choice',\ + 'sequence', 'attribute', 'attributeGroup', 'anyAttribute']} + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + self.attr_content = None + + def getAttributeContent(self): + return self.attr_content + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + indx = 0 + num = len(contents) + #XXX ugly + if not num: + return + component = SplitQName(contents[indx].getTagName())[1] + if component == 'annotation': + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + indx += 1 + component = SplitQName(contents[indx].getTagName())[1] + + if component == 'all': + self.content = All(self) + self.content.fromDom(contents[indx]) + indx += 1 + elif component == 'choice': + self.content = Choice(self) + self.content.fromDom(contents[indx]) + indx += 1 + elif component == 'sequence': + self.content = Sequence(self) + self.content.fromDom(contents[indx]) + indx += 1 + elif component == 'group': + self.content = ModelGroupReference(self) + self.content.fromDom(contents[indx]) + indx += 1 + else: + self.content = None + + self.attr_content = [] + while indx < num: + component = SplitQName(contents[indx].getTagName())[1] + if component == 'attribute': + if contents[indx].hasattr('ref'): + self.attr_content.append(AttributeReference(self)) + else: + self.attr_content.append(LocalAttributeDeclaration(self)) + elif component == 'attributeGroup': + if contents[indx].hasattr('ref'): + self.attr_content.append(AttributeGroupReference(self)) + else: + self.attr_content.append(AttributeGroupDefinition(self)) + elif component == 'anyAttribute': + self.attr_content.append(AttributeWildCard(self)) + else: + raise SchemaError, 'Unknown component (%s)' %(contents[indx].getTagName()) + 
self.attr_content[-1].fromDom(contents[indx]) + indx += 1 + + class Extension(_DerivationBase, + ExtensionMarker): + """<extension base> + parents: + complexContent + attributes: + id -- ID + base -- QName, required + + contents: + annotation?, (group | all | choice | sequence)?, + (attribute | attributeGroup)*, anyAttribute? + """ + tag = 'extension' + + class Restriction(_DerivationBase,\ + RestrictionMarker): + """<restriction base> + parents: + complexContent + attributes: + id -- ID + base -- QName, required + + contents: + annotation?, (group | all | choice | sequence)?, + (attribute | attributeGroup)*, anyAttribute? + """ + tag = 'restriction' + + + class SimpleContent(_DerivedType,\ + SimpleMarker): + """<simpleContent> + parents: + complexType + attributes: + id -- ID + + contents: + annotation?, (restriction | extension) + """ + attributes = {'id':None} + contents = {'xsd':['annotation', 'restriction', 'extension']} + tag = 'simpleContent' + + class Extension(XMLSchemaComponent,\ + ExtensionMarker): + """<extension base> + parents: + simpleContent + attributes: + id -- ID + base -- QName, required + + contents: + annotation?, (attribute | attributeGroup)*, anyAttribute? 
+ """ + required = ['base'] + attributes = {'id':None, + 'base':None } + contents = {'xsd':['annotation', 'attribute', 'attributeGroup', + 'anyAttribute']} + tag = 'extension' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.attr_content = None + + def getAttributeContent(self): + return self.attr_content + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + + indx = 0 + num = len(contents) + + if num: + component = SplitQName(contents[indx].getTagName())[1] + if component == 'annotation': + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + indx += 1 + component = SplitQName(contents[indx].getTagName())[1] + + content = [] + while indx < num: + component = SplitQName(contents[indx].getTagName())[1] + if component == 'attribute': + if contents[indx].hasattr('ref'): + content.append(AttributeReference(self)) + else: + content.append(LocalAttributeDeclaration(self)) + elif component == 'attributeGroup': + content.append(AttributeGroupReference(self)) + elif component == 'anyAttribute': + content.append(AttributeWildCard(self)) + else: + raise SchemaError, 'Unknown component (%s)'\ + %(contents[indx].getTagName()) + content[-1].fromDom(contents[indx]) + indx += 1 + self.attr_content = tuple(content) + + + class Restriction(XMLSchemaComponent,\ + RestrictionMarker): + """<restriction base> + parents: + simpleContent + attributes: + id -- ID + base -- QName, required + + contents: + annotation?, simpleType?, (enumeration | length | + maxExclusive | maxInclusive | maxLength | minExclusive | + minInclusive | minLength | pattern | fractionDigits | + totalDigits | whiteSpace)*, (attribute | attributeGroup)*, + anyAttribute? 
+ """ + required = ['base'] + attributes = {'id':None, + 'base':None } + contents = {'xsd':['annotation', 'simpleType', 'attribute',\ + 'attributeGroup', 'anyAttribute'] + RestrictionMarker.facets} + tag = 'restriction' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + self.attr_content = None + + def getAttributeContent(self): + return self.attr_content + + def fromDom(self, node): + self.content = [] + self.setAttributes(node) + contents = self.getContents(node) + + indx = 0 + num = len(contents) + component = SplitQName(contents[indx].getTagName())[1] + if component == 'annotation': + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + indx += 1 + component = SplitQName(contents[indx].getTagName())[1] + + content = [] + while indx < num: + component = SplitQName(contents[indx].getTagName())[1] + if component == 'attribute': + if contents[indx].hasattr('ref'): + content.append(AttributeReference(self)) + else: + content.append(LocalAttributeDeclaration(self)) + elif component == 'attributeGroup': + content.append(AttributeGroupReference(self)) + elif component == 'anyAttribute': + content.append(AttributeWildCard(self)) + elif component == 'simpleType': + self.content.append(LocalSimpleType(self)) + self.content[-1].fromDom(contents[indx]) + else: + raise SchemaError, 'Unknown component (%s)'\ + %(contents[indx].getTagName()) + content[-1].fromDom(contents[indx]) + indx += 1 + self.attr_content = tuple(content) + + +class LocalComplexType(ComplexType,\ + LocalMarker): + """<complexType> + parents: + element + attributes: + id -- ID + mixed -- boolean, false + + contents: + annotation?, (simpleContent | complexContent | + ((group | all | choice | sequence)?, (attribute | attributeGroup)*, anyAttribute?)) + """ + required = [] + attributes = {'id':None, + 'mixed':0} + tag = 'complexType' + + +class SimpleType(XMLSchemaComponent,\ + DefinitionMarker,\ + 
SimpleMarker): + """<simpleType name> + parents: + redefine, schema + attributes: + id -- ID + name -- NCName, required + final -- ('#all' | ('extension' | 'restriction' | 'list' | 'union')*), + schema.finalDefault + + contents: + annotation?, (restriction | list | union) + """ + required = ['name'] + attributes = {'id':None, + 'name':None, + 'final':lambda self: self._parent().getFinalDefault()} + contents = {'xsd':['annotation', 'restriction', 'list', 'union']} + tag = 'simpleType' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + + def getElementDeclaration(self, attribute): + raise Warning, 'invalid operation for <%s>' %self.tag + + def getTypeDefinition(self, attribute): + raise Warning, 'invalid operation for <%s>' %self.tag + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + for child in contents: + component = SplitQName(child.getTagName())[1] + if component == 'annotation': + self.annotation = Annotation(self) + self.annotation.fromDom(child) + continue + break + else: + return + if component == 'restriction': + self.content = self.__class__.Restriction(self) + elif component == 'list': + self.content = self.__class__.List(self) + elif component == 'union': + self.content = self.__class__.Union(self) + else: + raise SchemaError, 'Unknown component (%s)' %(component) + self.content.fromDom(child) + + class Restriction(XMLSchemaComponent,\ + RestrictionMarker): + """<restriction base> + parents: + simpleType + attributes: + id -- ID + base -- QName, required or simpleType child + + contents: + annotation?, simpleType?, (enumeration | length | + maxExclusive | maxInclusive | maxLength | minExclusive | + minInclusive | minLength | pattern | fractionDigits | + totalDigits | whiteSpace)* + """ + attributes = {'id':None, + 'base':None } + contents = {'xsd':['annotation', 'simpleType']+RestrictionMarker.facets} + tag = 'restriction' + + def 
__init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + + def getSimpleTypeContent(self): + for el in self.content: + if el.isSimple(): return el + return None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for indx in range(len(contents)): + component = SplitQName(contents[indx].getTagName())[1] + if (component == 'annotation') and (not indx): + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + continue + elif (component == 'simpleType') and (not indx or indx == 1): + content.append(AnonymousSimpleType(self)) + content[-1].fromDom(contents[indx]) + elif component in RestrictionMarker.facets: + #print_debug('%s class instance, skipping %s' %(self.__class__, component)) + pass + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.content = tuple(content) + + + class Union(XMLSchemaComponent, + UnionMarker): + """<union> + parents: + simpleType + attributes: + id -- ID + memberTypes -- list of QNames, required or simpleType child. 
+ + contents: + annotation?, simpleType* + """ + attributes = {'id':None, + 'memberTypes':None } + contents = {'xsd':['annotation', 'simpleType']} + tag = 'union' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + + def fromDom(self, node): + self.setAttributes(node) + contents = self.getContents(node) + content = [] + + for indx in range(len(contents)): + component = SplitQName(contents[indx].getTagName())[1] + if (component == 'annotation') and (not indx): + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + elif (component == 'simpleType'): + content.append(AnonymousSimpleType(self)) + content[-1].fromDom(contents[indx]) + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + self.content = tuple(content) + + class List(XMLSchemaComponent, + ListMarker): + """<list> + parents: + simpleType + attributes: + id -- ID + itemType -- QName, required or simpleType child. + + contents: + annotation?, simpleType? + """ + attributes = {'id':None, + 'itemType':None } + contents = {'xsd':['annotation', 'simpleType']} + tag = 'list' + + def __init__(self, parent): + XMLSchemaComponent.__init__(self, parent) + self.annotation = None + self.content = None + + def getItemType(self): + return self.attributes.get('itemType') + + def getTypeDefinition(self, attribute='itemType'): + '''return the type refered to by itemType attribute or + the simpleType content. If returns None, then the + type refered to by itemType is primitive. 
+ ''' + tp = XMLSchemaComponent.getTypeDefinition(self, attribute) + return tp or self.content + + def fromDom(self, node): + self.annotation = None + self.content = None + self.setAttributes(node) + contents = self.getContents(node) + for indx in range(len(contents)): + component = SplitQName(contents[indx].getTagName())[1] + if (component == 'annotation') and (not indx): + self.annotation = Annotation(self) + self.annotation.fromDom(contents[indx]) + elif (component == 'simpleType'): + self.content = AnonymousSimpleType(self) + self.content.fromDom(contents[indx]) + break + else: + raise SchemaError, 'Unknown component (%s)' %(i.getTagName()) + + +class AnonymousSimpleType(SimpleType,\ + SimpleMarker): + """<simpleType> + parents: + attribute, element, list, restriction, union + attributes: + id -- ID + + contents: + annotation?, (restriction | list | union) + """ + required = [] + attributes = {'id':None} + tag = 'simpleType' + + +class Redefine: + """<redefine> + parents: + attributes: + + contents: + """ + tag = 'redefine' + + +########################### +########################### + + +if sys.version_info[:2] >= (2, 2): + tupleClass = tuple +else: + import UserTuple + tupleClass = UserTuple.UserTuple + +class TypeDescriptionComponent(tupleClass): + """Tuple of length 2, consisting of + a namespace and unprefixed name. + """ + def __init__(self, args): + """args -- (namespace, name) + Remove the name's prefix, irrelevant. 
+ """ + if len(args) != 2: + raise TypeError, 'expecting tuple (namespace, name), got %s' %args + elif args[1].find(':') >= 0: + args = (args[0], SplitQName(args[1])[1]) + tuple.__init__(self, args) + return + + def getTargetNamespace(self): + return self[0] + + def getName(self): + return self[1] + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLname.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLname.py new file mode 100644 index 0000000000000000000000000000000000000000..5961160a13422e133513911c01d86c4d624e2572 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/XMLname.py @@ -0,0 +1,90 @@ +"""Translate strings to and from SOAP 1.2 XML name encoding + +Implements rules for mapping application defined name to XML names +specified by the w3 SOAP working group for SOAP version 1.2 in +Appendix A of "SOAP Version 1.2 Part 2: Adjuncts", W3C Working Draft +17, December 2001, <http://www.w3.org/TR/soap12-part2/#namemap> + +Also see <http://www.w3.org/2000/xp/Group/xmlp-issues>. + +Author: Gregory R. Warnes <Gregory.R.Warnes@Pfizer.com> +Date:: 2002-04-25 +Version 0.9.0 + +""" + +ident = "$Id$" + +from re import * + + +def _NCNameChar(x): + return x.isalpha() or x.isdigit() or x=="." 
or x=='-' or x=="_" + + +def _NCNameStartChar(x): + return x.isalpha() or x=="_" + + +def _toUnicodeHex(x): + hexval = hex(ord(x[0]))[2:] + hexlen = len(hexval) + # Make hexval have either 4 or 8 digits by prepending 0's + if (hexlen==1): hexval = "000" + hexval + elif (hexlen==2): hexval = "00" + hexval + elif (hexlen==3): hexval = "0" + hexval + elif (hexlen==4): hexval = "" + hexval + elif (hexlen==5): hexval = "000" + hexval + elif (hexlen==6): hexval = "00" + hexval + elif (hexlen==7): hexval = "0" + hexval + elif (hexlen==8): hexval = "" + hexval + else: raise Exception, "Illegal Value returned from hex(ord(x))" + + return "_x"+ hexval + "_" + + +def _fromUnicodeHex(x): + return eval( r'u"\u'+x[2:-1]+'"' ) + + +def toXMLname(string): + """Convert string to a XML name.""" + if string.find(':') != -1 : + (prefix, localname) = string.split(':',1) + else: + prefix = None + localname = string + + T = unicode(localname) + + N = len(localname) + X = []; + for i in range(N) : + if i< N-1 and T[i]==u'_' and T[i+1]==u'x': + X.append(u'_x005F_') + elif i==0 and N >= 3 and \ + ( T[0]==u'x' or T[0]==u'X' ) and \ + ( T[1]==u'm' or T[1]==u'M' ) and \ + ( T[2]==u'l' or T[2]==u'L' ): + X.append(u'_xFFFF_' + T[0]) + elif (not _NCNameChar(T[i])) or (i==0 and not _NCNameStartChar(T[i])): + X.append(_toUnicodeHex(T[i])) + else: + X.append(T[i]) + + if prefix: + return "%s:%s" % (prefix, u''.join(X)) + return u''.join(X) + + +def fromXMLname(string): + """Convert XML name to unicode string.""" + + retval = sub(r'_xFFFF_','', string ) + + def fun( matchobj ): + return _fromUnicodeHex( matchobj.group(0) ) + + retval = sub(r'_x[0-9A-Za-z]+_', fun, retval ) + + return retval diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/__init__.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b6f7ef027ea1edab6eff22e407802710916a859 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/__init__.py @@ 
-0,0 +1,9 @@ +#! /usr/bin/env python +"""WSDL parsing services package for Web Services for Python.""" + +ident = "$Id$" + +import WSDLTools +import XMLname +import logging + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/c14n.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/c14n.py new file mode 100755 index 0000000000000000000000000000000000000000..781f256c90731248352167a68179a9deac0ef46d --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/c14n.py @@ -0,0 +1,536 @@ +#! /usr/bin/env python +"""Compatibility module, imported by ZSI if you don't have PyXML 0.7. + +No copyright violations -- we're only using parts of PyXML that we +wrote. +""" + +_copyright = '''ZSI: Zolera Soap Infrastructure. + +Copyright 2001, Zolera Systems, Inc. All Rights Reserved. +Copyright 2002-2003, Rich Salz. All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or +sell copies of the Software, and to permit persons to whom the Software +is furnished to do so, provided that the above copyright notice(s) and +this permission notice appear in all copies of the Software and that +both the above copyright notice(s) and this permission notice appear in +supporting documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT +OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, use +or other dealings in this Software without prior written authorization +of the copyright holder. +''' + +_copyright += "\n\nPortions are also: " +_copyright += '''Copyright 2001, Zolera Systems Inc. All Rights Reserved. +Copyright 2001, MIT. All Rights Reserved. + +Distributed under the terms of: + Python 2.0 License or later. + http://www.python.org/2.0.1/license.html +or + W3C Software License + http://www.w3.org/Consortium/Legal/copyright-software-19980720 +''' + +from xml.dom import Node +from Namespaces import XMLNS +import cStringIO as StringIO +try: + from xml.dom.ext import c14n +except ImportError, ex: + _implementation2 = None + _attrs = lambda E: (E.attributes and E.attributes.values()) or [] + _children = lambda E: E.childNodes or [] +else: + class _implementation2(c14n._implementation): + """Patch for exclusive c14n + """ + def __init__(self, node, write, **kw): + self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes') + self._exclusive = None + if node.nodeType == Node.ELEMENT_NODE: + if not c14n._inclusive(self): + self._exclusive = self._inherit_context(node) + c14n._implementation.__init__(self, node, write, **kw) + + def _do_element(self, node, initial_other_attrs = []): + """Patch for the xml.dom.ext.c14n implemenation _do_element method. + This fixes a problem with sorting of namespaces. + """ + # Get state (from the stack) make local copies. 
+ # ns_parent -- NS declarations in parent + # ns_rendered -- NS nodes rendered by ancestors + # ns_local -- NS declarations relevant to this element + # xml_attrs -- Attributes in XML namespace from parent + # xml_attrs_local -- Local attributes in XML namespace. + ns_parent, ns_rendered, xml_attrs = \ + self.state[0], self.state[1].copy(), self.state[2].copy() #0422 + ns_local = ns_parent.copy() + xml_attrs_local = {} + + # Divide attributes into NS, XML, and others. + #other_attrs = initial_other_attrs[:] + other_attrs = [] + sort_these_attrs = initial_other_attrs[:] + + in_subset = c14n._in_subset(self.subset, node) + #for a in _attrs(node): + sort_these_attrs +=c14n._attrs(node) + + for a in sort_these_attrs: + if a.namespaceURI == c14n.XMLNS.BASE: + n = a.nodeName + if n == "xmlns:": n = "xmlns" # DOM bug workaround + ns_local[n] = a.nodeValue + elif a.namespaceURI == c14n.XMLNS.XML: + if c14n._inclusive(self) or (in_subset and c14n._in_subset(self.subset, a)): #020925 Test to see if attribute node in subset + xml_attrs_local[a.nodeName] = a #0426 + else: + if c14n._in_subset(self.subset, a): #020925 Test to see if attribute node in subset + other_attrs.append(a) + #add local xml:foo attributes to ancestor's xml:foo attributes + xml_attrs.update(xml_attrs_local) + + # Render the node + W, name = self.write, None + if in_subset: + name = node.nodeName + W('<') + W(name) + + # Create list of NS attributes to render. + ns_to_render = [] + for n,v in ns_local.items(): + + # If default namespace is XMLNS.BASE or empty, + # and if an ancestor was the same + if n == "xmlns" and v in [ c14n.XMLNS.BASE, '' ] \ + and ns_rendered.get('xmlns') in [ c14n.XMLNS.BASE, '', None ]: + continue + + # "omit namespace node with local name xml, which defines + # the xml prefix, if its string value is + # http://www.w3.org/XML/1998/namespace." 
+ if n in ["xmlns:xml", "xml"] \ + and v in [ 'http://www.w3.org/XML/1998/namespace' ]: + continue + + + # If not previously rendered + # and it's inclusive or utilized + if (n,v) not in ns_rendered.items() \ + and (c14n._inclusive(self) or \ + c14n._utilized(n, node, other_attrs, self.unsuppressedPrefixes)): + ns_to_render.append((n, v)) + + ##################################### + # JRB + ##################################### + if not c14n._inclusive(self): + if node.prefix is None: + look_for = [('xmlns', node.namespaceURI),] + else: + look_for = [('xmlns:%s' %node.prefix, node.namespaceURI),] + for a in c14n._attrs(node): + if a.namespaceURI != XMLNS.BASE: + #print "ATTRIBUTE: ", (a.namespaceURI, a.prefix) + if a.prefix: + #print "APREFIX: ", a.prefix + look_for.append(('xmlns:%s' %a.prefix, a.namespaceURI)) + + for key,namespaceURI in look_for: + if ns_rendered.has_key(key): + if ns_rendered[key] == namespaceURI: + # Dont write out + pass + else: + #ns_to_render += [(key, namespaceURI)] + pass + elif (key,namespaceURI) in ns_to_render: + # Dont write out + pass + else: + # Unique write out, rewrite to render + ns_local[key] = namespaceURI + for a in self._exclusive: + if a.nodeName == key: + #self._do_attr(a.nodeName, a.value) + #ns_rendered[key] = namespaceURI + #break + ns_to_render += [(a.nodeName, a.value)] + break + elif key is None and a.nodeName == 'xmlns': + #print "DEFAULT: ", (a.nodeName, a.value) + ns_to_render += [(a.nodeName, a.value)] + break + #print "KEY: ", key + else: + #print "Look for: ", look_for + #print "NS_TO_RENDER: ", ns_to_render + #print "EXCLUSIVE NS: ", map(lambda f: (f.nodeName,f.value),self._exclusive) + raise RuntimeError, \ + 'can not find namespace (%s="%s") for exclusive canonicalization'\ + %(key, namespaceURI) + ##################################### + + + + # Sort and render the ns, marking what was rendered. 
+ ns_to_render.sort(c14n._sorter_ns) + for n,v in ns_to_render: + #XXX JRB, getting 'xmlns,None' here when xmlns='' + if v: self._do_attr(n, v) + else: + v = '' + self._do_attr(n, v) + ns_rendered[n]=v #0417 + + # If exclusive or the parent is in the subset, add the local xml attributes + # Else, add all local and ancestor xml attributes + # Sort and render the attributes. + if not c14n._inclusive(self) or c14n._in_subset(self.subset,node.parentNode): #0426 + other_attrs.extend(xml_attrs_local.values()) + else: + other_attrs.extend(xml_attrs.values()) + #print "OTHER: ", other_attrs + other_attrs.sort(c14n._sorter) + for a in other_attrs: + self._do_attr(a.nodeName, a.value) + W('>') + + + # Push state, recurse, pop state. + state, self.state = self.state, (ns_local, ns_rendered, xml_attrs) + for c in c14n._children(node): + c14n._implementation.handlers[c.nodeType](self, c) + self.state = state + + if name: W('</%s>' % name) + c14n._implementation.handlers[c14n.Node.ELEMENT_NODE] = _do_element + + +_IN_XML_NS = lambda n: n.namespaceURI == XMLNS.XML + +# Does a document/PI has lesser/greater document order than the +# first element? 
+_LesserElement, _Element, _GreaterElement = range(3) + +def _sorter(n1,n2): + '''_sorter(n1,n2) -> int + Sorting predicate for non-NS attributes.''' + + i = cmp(n1.namespaceURI, n2.namespaceURI) + if i: return i + return cmp(n1.localName, n2.localName) + + +def _sorter_ns(n1,n2): + '''_sorter_ns((n,v),(n,v)) -> int + "(an empty namespace URI is lexicographically least)."''' + + if n1[0] == 'xmlns': return -1 + if n2[0] == 'xmlns': return 1 + return cmp(n1[0], n2[0]) + +def _utilized(n, node, other_attrs, unsuppressedPrefixes): + '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean + Return true if that nodespace is utilized within the node''' + + if n.startswith('xmlns:'): + n = n[6:] + elif n.startswith('xmlns'): + n = n[5:] + if n == node.prefix or n in unsuppressedPrefixes: return 1 + for attr in other_attrs: + if n == attr.prefix: return 1 + return 0 + +_in_subset = lambda subset, node: not subset or node in subset + +# +# JRB. Currently there is a bug in do_element, but since the underlying +# Data Structures in c14n have changed I can't just apply the +# _implementation2 patch above. But this will work OK for most uses, +# just not XML Signatures. +# +class _implementation: + '''Implementation class for C14N. This accompanies a node during it's + processing and includes the parameters and processing state.''' + + # Handler for each node type; populated during module instantiation. + handlers = {} + + def __init__(self, node, write, **kw): + '''Create and run the implementation.''' + + self.write = write + self.subset = kw.get('subset') + if self.subset: + self.comments = kw.get('comments', 1) + else: + self.comments = kw.get('comments', 0) + self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes') + nsdict = kw.get('nsdict', { 'xml': XMLNS.XML, 'xmlns': XMLNS.BASE }) + + # Processing state. 
+ self.state = (nsdict, ['xml'], []) + + if node.nodeType == Node.DOCUMENT_NODE: + self._do_document(node) + elif node.nodeType == Node.ELEMENT_NODE: + self.documentOrder = _Element # At document element + if self.unsuppressedPrefixes is not None: + self._do_element(node) + else: + inherited = self._inherit_context(node) + self._do_element(node, inherited) + elif node.nodeType == Node.DOCUMENT_TYPE_NODE: + pass + else: + raise TypeError, str(node) + + + def _inherit_context(self, node): + '''_inherit_context(self, node) -> list + Scan ancestors of attribute and namespace context. Used only + for single element node canonicalization, not for subset + canonicalization.''' + + # Collect the initial list of xml:foo attributes. + xmlattrs = filter(_IN_XML_NS, _attrs(node)) + + # Walk up and get all xml:XXX attributes we inherit. + inherited, parent = [], node.parentNode + while parent and parent.nodeType == Node.ELEMENT_NODE: + for a in filter(_IN_XML_NS, _attrs(parent)): + n = a.localName + if n not in xmlattrs: + xmlattrs.append(n) + inherited.append(a) + parent = parent.parentNode + return inherited + + + def _do_document(self, node): + '''_do_document(self, node) -> None + Process a document node. documentOrder holds whether the document + element has been encountered such that PIs/comments can be written + as specified.''' + + self.documentOrder = _LesserElement + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE: + self.documentOrder = _Element # At document element + self._do_element(child) + self.documentOrder = _GreaterElement # After document element + elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE: + self._do_pi(child) + elif child.nodeType == Node.COMMENT_NODE: + self._do_comment(child) + elif child.nodeType == Node.DOCUMENT_TYPE_NODE: + pass + else: + raise TypeError, str(child) + handlers[Node.DOCUMENT_NODE] = _do_document + + + def _do_text(self, node): + '''_do_text(self, node) -> None + Process a text or CDATA node. 
 Render various special characters
+        as their C14N entity representations.'''
+        if not _in_subset(self.subset, node): return
+        s = node.data \
+            .replace("&", "&amp;") \
+            .replace("<", "&lt;") \
+            .replace(">", "&gt;") \
+            .replace("\015", "&#xD;")
+        if s: self.write(s)
+    handlers[Node.TEXT_NODE] = _do_text
+    handlers[Node.CDATA_SECTION_NODE] = _do_text
+
+
+    def _do_pi(self, node):
+        '''_do_pi(self, node) -> None
+        Process a PI node. Render a leading or trailing #xA if the
+        document order of the PI is greater or lesser (respectively)
+        than the document element.
+        '''
+        if not _in_subset(self.subset, node): return
+        W = self.write
+        if self.documentOrder == _GreaterElement: W('\n')
+        W('<?')
+        W(node.nodeName)
+        s = node.data
+        if s:
+            W(' ')
+            W(s)
+        W('?>')
+        if self.documentOrder == _LesserElement: W('\n')
+    handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi
+
+
+    def _do_comment(self, node):
+        '''_do_comment(self, node) -> None
+        Process a comment node. Render a leading or trailing #xA if the
+        document order of the comment is greater or lesser (respectively)
+        than the document element.
+        '''
+        if not _in_subset(self.subset, node): return
+        if self.comments:
+            W = self.write
+            if self.documentOrder == _GreaterElement: W('\n')
+            W('<!--')
+            W(node.data)
+            W('-->')
+            if self.documentOrder == _LesserElement: W('\n')
+    handlers[Node.COMMENT_NODE] = _do_comment
+
+
+    def _do_attr(self, n, value):
+        ''''_do_attr(self, node) -> None
+        Process an attribute.'''
+
+        W = self.write
+        W(' ')
+        W(n)
+        W('="')
+        s = value \
+            .replace("&", "&amp;") \
+            .replace("<", "&lt;") \
+            .replace('"', '&quot;') \
+            .replace('\011', '&#x9;') \
+            .replace('\012', '&#xA;') \
+            .replace('\015', '&#xD;')
+        W(s)
+        W('"')
+
+    def _do_element(self, node, initial_other_attrs = []):
+        '''_do_element(self, node, initial_other_attrs = []) -> None
+        Process an element (and its children).'''
+
+        # Get state (from the stack) make local copies.
+        #   ns_parent -- NS declarations in parent
+        #   ns_rendered -- NS nodes rendered by ancestors
+        #   xml_attrs -- Attributes in XML namespace from parent
+        #   ns_local -- NS declarations relevant to this element
+        ns_parent, ns_rendered, xml_attrs = \
+                self.state[0], self.state[1][:], self.state[2][:]
+        ns_local = ns_parent.copy()
+
+        # Divide attributes into NS, XML, and others.
+        other_attrs = initial_other_attrs[:]
+        in_subset = _in_subset(self.subset, node)
+        for a in _attrs(node):
+            if a.namespaceURI == XMLNS.BASE:
+                n = a.nodeName
+                if n == "xmlns:": n = "xmlns"        # DOM bug workaround
+                ns_local[n] = a.nodeValue
+            elif a.namespaceURI == XMLNS.XML:
+                if self.unsuppressedPrefixes is None or in_subset:
+                    xml_attrs.append(a)
+            else:
+                other_attrs.append(a)
+
+        # Render the node
+        W, name = self.write, None
+        if in_subset:
+            name = node.nodeName
+            W('<')
+            W(name)
+
+            # Create list of NS attributes to render.
+            ns_to_render = []
+            for n,v in ns_local.items():
+                pval = ns_parent.get(n)
+
+                # If default namespace is XMLNS.BASE or empty, skip
+                if n == "xmlns" \
+                and v in [ XMLNS.BASE, '' ] and pval in [ XMLNS.BASE, '' ]:
+                    continue
+
+                # "omit namespace node with local name xml, which defines
+                # the xml prefix, if its string value is
+                # http://www.w3.org/XML/1998/namespace."
+                if n == "xmlns:xml" \
+                and v in [ 'http://www.w3.org/XML/1998/namespace' ]:
+                    continue
+
+                # If different from parent, or parent didn't render
+                # and if not exclusive, or this prefix is needed or
+                # not suppressed
+                if (v != pval or n not in ns_rendered) \
+                  and (self.unsuppressedPrefixes is None or \
+                  _utilized(n, node, other_attrs, self.unsuppressedPrefixes)):
+                    ns_to_render.append((n, v))
+
+            # Sort and render the ns, marking what was rendered.
+ ns_to_render.sort(_sorter_ns) + for n,v in ns_to_render: + self._do_attr(n, v) + ns_rendered.append(n) + + # Add in the XML attributes (don't pass to children, since + # we're rendering them), sort, and render. + other_attrs.extend(xml_attrs) + xml_attrs = [] + other_attrs.sort(_sorter) + for a in other_attrs: + self._do_attr(a.nodeName, a.value) + W('>') + + # Push state, recurse, pop state. + state, self.state = self.state, (ns_local, ns_rendered, xml_attrs) + for c in _children(node): + _implementation.handlers[c.nodeType](self, c) + self.state = state + + if name: W('</%s>' % name) + handlers[Node.ELEMENT_NODE] = _do_element + + +def Canonicalize(node, output=None, **kw): + '''Canonicalize(node, output=None, **kw) -> UTF-8 + + Canonicalize a DOM document/element node and all descendents. + Return the text; if output is specified then output.write will + be called to output the text and None will be returned + Keyword parameters: + nsdict: a dictionary of prefix:uri namespace entries + assumed to exist in the surrounding context + comments: keep comments if non-zero (default is 0) + subset: Canonical XML subsetting resulting from XPath + (default is []) + unsuppressedPrefixes: do exclusive C14N, and this specifies the + prefixes that should be inherited. + ''' + if output: + if _implementation2 is None: + _implementation(node, output.write, **kw) + else: + apply(_implementation2, (node, output.write), kw) + else: + s = StringIO.StringIO() + if _implementation2 is None: + _implementation(node, s.write, **kw) + else: + apply(_implementation2, (node, s.write), kw) + return s.getvalue() + + +if __name__ == '__main__': print _copyright diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/logging.py b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..76ce6dc1864304077cca7ec71c3a47b1472cb929 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/SOAPpy/wstools/logging.py @@ -0,0 +1,85 @@ +#! 
/usr/bin/env python +"""Logging""" +import sys + + +class ILogger: + '''Logger interface, by default this class + will be used and logging calls are no-ops. + ''' + level = 0 + def __init__(self, msg): + return + def warning(self, *args): + return + def debug(self, *args): + return + def error(self, *args): + return + def setLevel(cls, level): + cls.level = level + setLevel = classmethod(setLevel) +_LoggerClass = ILogger + + +class BasicLogger(ILogger): + def __init__(self, msg, out=sys.stdout): + self.msg, self.out = msg, out + + def warning(self, msg, *args): + if self.level < 1: return + print >>self, self.WARN, self.msg, + print >>self, msg %args + WARN = 'WARN' + def debug(self, msg, *args): + if self.level < 2: return + print >>self, self.DEBUG, self.msg, + print >>self, msg %args + DEBUG = 'DEBUG' + def error(self, msg, *args): + print >>self, self.ERROR, self.msg, + print >>self, msg %args + ERROR = 'ERROR' + + def write(self, *args): + '''Write convenience function; writes strings. + ''' + for s in args: self.out.write(s) + + +def setBasicLogger(): + '''Use Basic Logger. + ''' + setLoggerClass(BasicLogger) + BasicLogger.setLevel(0) + +def setBasicLoggerWARN(): + '''Use Basic Logger. + ''' + setLoggerClass(BasicLogger) + BasicLogger.setLevel(1) + +def setBasicLoggerDEBUG(): + '''Use Basic Logger. + ''' + setLoggerClass(BasicLogger) + BasicLogger.setLevel(2) + +def setLoggerClass(loggingClass): + '''Set Logging Class. + ''' + assert issubclass(loggingClass, ILogger), 'loggingClass must subclass ILogger' + global _LoggerClass + _LoggerClass = loggingClass + +def setLevel(level=0): + '''Set Global Logging Level. + ''' + ILogger.level = level + +def getLogger(msg): + '''Return instance of Logging class. 
+ ''' + return _LoggerClass(msg) + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/TODO b/LTA/LTAIngest/SOAPpy-0.12.0/TODO new file mode 100644 index 0000000000000000000000000000000000000000..18b686d80e2b0053ad1cbf6be5bd732a2514eeb3 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/TODO @@ -0,0 +1,14 @@ +# $Id: TODO,v 1.1 2005/05/13 08:20:39 renting Exp $ + +- figure out why parsing rules are broken + +- generate a test harness that will run all of the test code. + +- create unit-tests for all features, as well as for reported bugs. + +- write better documentation (!!!) + - topics: WSDL, Globus, Authentication, ... + - general introduction article + - ... + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/__init__.py b/LTA/LTAIngest/SOAPpy-0.12.0/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryClient.py b/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryClient.py new file mode 100755 index 0000000000000000000000000000000000000000..2bc71f0dc8dc769ccda1fcddcf8c98cb00a95051 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryClient.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python + + +import getopt +import sys +import string +import re +import time +sys.path.insert(1,"..") +from SOAPpy import SOAP +import traceback + +DEFAULT_SERVERS_FILE = './inventory.servers' + +DEFAULT_METHODS = ('SimpleBuy', 'RequestForQuote','Buy','Ping') + +def usage (error = None): + sys.stdout = sys.stderr + + if error != None: + print error + + print """usage: %s [options] [server ...] + If a long option shows an argument is mandatory, it's mandatory for the + equivalent short option also. + + -?, --help display this usage + -d, --debug turn on debugging in the SOAP library + -i, --invert test servers *not* in the list of servers given + -m, --method=METHOD#[,METHOD#...] + call only the given methods, specify a METHOD# of ? 
+ for the list of method numbers + -o, --output=TYPE turn on output, TYPE is one or more of s(uccess), + f(ailure), n(ot implemented), F(ailed (as expected)), + a(ll) + [f] + -s, --servers=FILE use FILE as list of servers to test [%s] + -t, --stacktrace print a stack trace on each unexpected failure + -T, --always-stacktrace + print a stack trace on any failure +""" % (sys.argv[0], DEFAULT_SERVERS_FILE), + + sys.exit (0) + + +def methodUsage (): + sys.stdout = sys.stderr + + print "Methods are specified by number. Multiple methods can be " \ + "specified using a\ncomma-separated list of numbers or ranges. " \ + "For example 1,4-6,8 specifies\nmethods 1, 4, 5, 6, and 8.\n" + + print "The available methods are:\n" + + half = (len (DEFAULT_METHODS) + 1) / 2 + for i in range (half): + print "%4d. %-25s" % (i + 1, DEFAULT_METHODS[i]), + if i + half < len (DEFAULT_METHODS): + print "%4d. %-25s" % (i + 1 + half, DEFAULT_METHODS[i + half]), + print + + sys.exit (0) + + +def readServers (file): + servers = [] + f = open (file, 'r') + + while 1: + line = f.readline () + + if line == '': + break + + if line[0] in ('#', '\n') or line[0] in string.whitespace: + continue + + cur = {'nonfunctional': {}} + tag = None + servers.append (cur) + + while 1: + if line[0] in string.whitespace: + if tag == 'nonfunctional': + value = method + ' ' + cur[tag][method] + else: + value = cur[tag] + value += ' ' + line.strip () + else: + tag, value = line.split (':', 1) + + tag = tag.strip ().lower () + value = value.strip () + + if value[0] == '"' and value[-1] == '"': + value = value[1:-1] + + if tag == 'nonfunctional': + value = value.split (' ', 1) + [''] + + method = value[0] + cur[tag][method] = value[1] + else: + cur[tag] = value + + line = f.readline () + + if line == '' or line[0] == '\n': + break + + return servers + +def str2list (s): + l = {} + + for i in s.split (','): + if i.find ('-') != -1: + i = i.split ('-') + for i in range (int (i[0]),int (i[1]) + 1): + l[i] = 1 + else: + 
l[int (i)] = 1 + + l = l.keys () + l.sort () + + return l + +def SimpleBuy(serv, sa, epname): + serv = serv._sa (sa % {'methodname':'SimpleBuy'}) + return serv.SimpleBuy(ProductName="widget", Quantity = 50, Address = "this is my address") #JHawk, Phalanx require this order of params + + +def RequestForQuote(serv, sa, epname): + serv = serv._sa (sa % {'methodname':'RequestForQuote'}) + return serv.RequestForQuote(Quantity=3, ProductName = "thing") # for Phalanx, JHawk + + +def Buy(serv, sa, epname): + import copy + serv = serv._sa (sa % {'methodname':'Buy'}) + billTo_d = {"name":"Buyer One", "address":"1 1st Street", + "city":"New York", "state":"NY", "zipCode":"10000"} + shipTo_d = {"name":"Buyer One ", "address":"1 1st Street ", + "city":"New York ", "state":"NY ", "zipCode":"10000 "} + + for k,v in shipTo_d.items(): + shipTo_d[k] = v[:-1] + + itemd1 = SOAP.structType( {"name":"widg1","quantity":200,"price":SOAP.decimalType(45.99), "_typename":"LineItem"}) + itemd2 = SOAP.structType( {"name":"widg2","quantity":400,"price":SOAP.decimalType(33.45), "_typename":"LineItem"}) + + items_d = SOAP.arrayType( [itemd1, itemd2] ) + items_d._ns = "http://www.soapinterop.org/Bid" + po_d = SOAP.structType( data = {"poID":"myord","createDate":SOAP.dateTimeType(),"shipTo":shipTo_d, "billTo":billTo_d, "items":items_d}) + try: + # it's called PO by MST (MS SOAP Toolkit), JHawk (.NET Remoting), + # Idoox WASP, Paul (SOAP::Lite), PranishK (ATL), GLUE, Aumsoft, + # HP, EasySoap, and Jake (Frontier). 
[Actzero accepts either] + return serv.Buy(PO=po_d) + except: + # called PurchaseOrder by KeithBa + return serv.Buy(PurchaseOrder=po_d) + + +def Ping(serv, sa, epname): + serv = serv._sa (sa % {'methodname':'Ping'}) + return serv.Ping() + +def main(): + servers = DEFAULT_SERVERS_FILE + methodnums = None + output = 'f' + invert = 0 + succeed = 0 + printtrace = 0 + stats = 1 + total = 0 + fail = 0 + failok = 0 + notimp = 0 + + try: + opts,args = getopt.getopt (sys.argv[1:], '?dm:io:s:t', + ['help', 'method', 'debug', 'invert', + 'output', 'servers=']) + for opt, arg in opts: + if opt in ('-?', '--help'): + usage () + elif opt in ('-d', '--debug'): + SOAP.Config.debug = 1 + elif opt in ('-i', '--invert'): + invert = 1 + elif opt in ('-m', '--method'): + if arg == '?': + methodUsage () + methodnums = str2list (arg) + elif opt in ('-o', '--output'): + output = arg + elif opt in ('-s', '--servers'): + servers = arg + else: + raise AttributeError, \ + "Recognized but unimplemented option `%s'" % opt + except SystemExit: + raise + except: + usage (sys.exc_info ()[1]) + + if 'a' in output: + output = 'fFns' + + servers = readServers(servers) + + if methodnums == None: + methodnums = range (1, len (DEFAULT_METHODS) + 1) + + limitre = re.compile ('|'.join (args), re.IGNORECASE) + + for s in servers: + if (not not limitre.match (s['name'])) == invert: + continue + + serv = SOAP.SOAPProxy(s['endpoint'], namespace = s['namespace']) + + for num in (methodnums): + if num > len(DEFAULT_METHODS): + break + + total += 1 + + name = DEFAULT_METHODS[num - 1] + + title = '%s: %s (#%d)' % (s['name'], name, num) + + try: + fn = globals ()[name] + except KeyboardInterrupt: + raise + except: + if 'n' in output: + print title, "test not yet implemented" + notimp += 1 + continue + + try: + res = fn (serv, s['soapaction'], s['name']) + if s['nonfunctional'].has_key (name): + print title, "succeeded despite marked nonfunctional" + elif 's' in output: + print title, "succeeded " + succeed += 1 + 
except KeyboardInterrupt: + print "fail" + raise + except: + if s['nonfunctional'].has_key (name): + if 'F' in output: + t = 'as expected' + if s['nonfunctional'][name] != '': + t += ', ' + s['nonfunctional'][name] + print title, "failed (%s) -" %t, sys.exc_info()[1] + failok += 1 + else: + if 'f' in output: + print title, "failed -", str (sys.exc_info()[1]) + fail += 1 + + if stats: + print " Tests ended at:", time.ctime (time.time()) + if stats > 0: + print " Total tests: %d" % total + print " Successes: %d (%3.2f%%)" % \ + (succeed, 100.0 * succeed / total) + if stats > 0 or fail > 0: + print "Failed unexpectedly: %d (%3.2f%%)" % \ + (fail, 100.0 * fail / total) + if stats > 0: + print " Failed as expected: %d (%3.2f%%)" % \ + (failok, 100.0 * failok / total) + if stats > 0 or notimp > 0: + print " Not implemented: %d (%3.2f%%)" % \ + (notimp, 100.0 * notimp / total) + + return fail + notimp + + + +if __name__ == "__main__": + main() + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryServer.py b/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryServer.py new file mode 100755 index 0000000000000000000000000000000000000000..b48eb12593d6f9e6f94df4c17ad072eed8b372e6 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/bid/inventoryServer.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +# Copyright (c) 2001, actzero, inc. 
+import sys +sys.path.insert(1,"..") +from SOAPpy import SOAP +#SOAP.Config.debug = 1 +serverstring = "SOAP.py (actzero.com) running "+sys.platform +NUMBUYS = 0 +NUMSIMPLEBUYS = 0 +NUMREQUESTS = 0 +NUMPINGS = 0 + +def SimpleBuy(Address, ProductName, Quantity): + # currently, this type-checks the params, and makes sure + # the strings are of len > 0 + global NUMSIMPLEBUYS + NUMSIMPLEBUYS += 1 + if Quantity < 1: raise ValueError, "must order at least one" + else: return "Receipt for %d %s(s) bought from %s" % (int(Quantity), ProductName, serverstring) + + +def RequestForQuote(ProductName, Quantity): + # type-checks and makes sure Quantity >= 1 + global NUMREQUESTS + NUMREQUESTS += 1 + if Quantity < 1: raise ValueError, "must order at least 1" + else: + import whrandom + mult = whrandom.random() + times = 0 + while mult > 0.25: + mult = mult - 0.25 + times += 1 + mult += 0.5 + mult = round(mult, 3) + print mult, times + return SOAP.doubleType(round(mult*int(Quantity),2)) + + +def Buy(**kw): + + global NUMBUYS + NUMBUYS += 1 + try: + PurchaseOrder = kw["PurchaseOrder"] + except: + PurchaseOrder = kw["PO"] + try: + POkeys = PurchaseOrder['_keyord'] + POkeys.sort() + POkeys_expected = ["shipTo","billTo","items","poID","createDate"] + POkeys_expected.sort() + if POkeys != POkeys_expected: + raise ValueError, "struct 'PurchaseOrder' needs %s, %s, %s, %s, and %s" % tuple(POkeys_expected) + except: + raise TypeError, "'PurchaseOrder' missing one or more element(s)" + + try: + btkeys = PurchaseOrder["billTo"]["_keyord"] + btkeys.sort() + btkeys_expected = ["address","zipCode","name","state","city"] + btkeys_expected.sort() + except: + raise TypeError, "'billTo' missing one or more elements" + + try: + stkeys = PurchaseOrder["shipTo"]["_keyord"] + stkeys.sort() + stkeys_expected = ["address","zipCode","name","state","city"] + stkeys_expected.sort() + except: + raise TypeError, "'shipTo' missing one or more elements" + + + try: + items = PurchaseOrder["items"].__dict__ + data = 
items["data"] + retstring = "" + for item in data: + itemdict = item["_asdict"] + q = itemdict["quantity"] + p = itemdict["price"] + name = itemdict["name"] + if retstring != "": + retstring += ", " + else: + retstring = "bought " + retstring += "%d %s(s) for %.2f" % (q,name,p) + retstring += " from "+serverstring + return retstring + + except: + raise TypeError, "items must be an array of 'item' structs" + +def Ping(): + global NUMPINGS + NUMPINGS += 1 + return + +def Monitor(str): + if str=="actzero": + global NUMBUYS + global NUMREQUESTS + global NUMSIMPLEBUYS + global NUMPINGS + return "(Buys, RequestForQuote(s),SimpleBuy(s), Ping(s)) = " + \ + repr( (NUMBUYS,NUMREQUESTS,NUMSIMPLEBUYS, NUMPINGS) ) + else: + raise ValueError, "not the right string" + +def Clear(str): + if str=="actzero": + global NUMBUYS + global NUMREQUESTS + global NUMSIMPLEBUYS + global NUMPINGS + NUMBUYS = 0 + NUMREQUESTS = 0 + NUMSIMPLEBUYS = 0 + NUMPINGS = 0 + return "(Buys, RequestForQuote(s),SimpleBuy(s), Ping(s)) = " + \ + repr( (NUMBUYS,NUMREQUESTS,NUMSIMPLEBUYS, NUMPINGS) ) + else: + raise ValueError, "not the right string" + + +if __name__ == "__main__": + if len(sys.argv) > 1: + try: + port = int(sys.argv[1]) + if port not in range(2000,15000): raise ValueError + except: + print "port must be a number between 2000 and 15000" + sys.exit(1) + else: port = 9000 + namespace = "http://www.soapinterop.org/Bid" + server = SOAP.SOAPServer( ('zoo',port) ) + + server.registerKWFunction(SimpleBuy, namespace ) + server.registerKWFunction(RequestForQuote, namespace ) + server.registerKWFunction(Buy, namespace ) + server.registerKWFunction(Ping, namespace ) + server.registerKWFunction(Monitor, namespace ) + server.registerKWFunction(Clear, namespace ) + + try: + server.serve_forever() + except KeyboardInterrupt: + pass diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/bid/monitorClient.py b/LTA/LTAIngest/SOAPpy-0.12.0/bid/monitorClient.py new file mode 100755 index 
0000000000000000000000000000000000000000..424f6919e85a41ad48727e76c72c9f5929bfcaef --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/bid/monitorClient.py @@ -0,0 +1,50 @@ +from SOAPpy import SOAP +import sys +import getopt + + +def usage(): + print """usage: %s [options] + -m, --method=METHOD#[,METHOD#...] specify METHOD# of ? for the list + -p, --port=PORT# allows to specify PORT# of server + """ + sys.exit(1) + +def methodUsage(): + print "The available methods are:" + print "1. Monitor \t\t2. Clear" + sys.exit(0) + + +port = 12080 +methodnum = 1 + +try: + opts, args = getopt.getopt (sys.argv[1:], 'p:m:', ['method','port']) + for opt, arg in opts: + if opt in ('-m','--method'): + if arg == '?': + methodUsage() + methodnum = int(arg) + elif opt in ('-p', '--port'): + port = int(arg) + else: + raise AttributeError, "Recognized but unimpl option '%s'" % opt +except SystemExit: + raise +except: + usage () + +ep = "http://208.177.157.221:%d/xmethodsInterop" % (port) +sa = "urn:soapinterop" +ns = "http://www.soapinterop.org/Bid" + +serv = SOAP.SOAPProxy(ep, namespace =ns, soapaction = sa) +if methodnum == 1: + print serv.Monitor(str="actzero") +elif methodnum == 2: + print serv.Clear(str="actzero") +else: + print "invalid methodnum" + methodUsage() + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_cli.py b/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_cli.py new file mode 100755 index 0000000000000000000000000000000000000000..694333330567eddf15c1bc6be9eb2a713b9f7183 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_cli.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +import time +from SOAPpy import SOAP + +srv = SOAP.SOAPProxy('http://localhost:10080/') + +for p in ('good param', 'ok param'): + ret = srv.badparam(p) + if isinstance(ret, SOAP.faultType): + print ret + else: + print 'ok' + +dt = SOAP.dateTimeType(time.localtime(time.time())) +print srv.dt(dt) + + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_handler.py 
b/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_handler.py new file mode 100755 index 0000000000000000000000000000000000000000..bcb0c4df8fd702eff5c949091047e3bce3b3d676 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/contrib/soap_handler.py @@ -0,0 +1,233 @@ + +import http_server +from SOAPpy.SOAP import * +Fault = faultType +import string, sys + +Config = SOAPConfig(debug=1) + +class soap_handler: + def __init__(self, encoding='UTF-8', config=Config, namespace=None): + self.namespace = namespace + self.objmap = {} + self.funcmap = {} + self.config = config + self.encoding = encoding + + def match (self, request): + return 1 + + def handle_request (self, request): + [path, params, query, fragment] = request.split_uri() + if request.command == 'post': + request.collector = collector(self, request) + else: + request.error(400) + + def continue_request(self, data, request): + # Everthing that follows is cripped from do_POST(). + if self.config.debug: + print "\n***RECEIVING***\n", data, "*" * 13 + "\n" + sys.stdout.flush() + + try: + r, header, body = parseSOAPRPC(data, header=1, body=1) + + method = r._name + args = r._aslist + kw = r._asdict + + ns = r._ns + resp = "" + # For faults messages + if ns: + nsmethod = "%s:%s" % (ns, method) + else: + nsmethod = method + + try: + # First look for registered functions + if self.funcmap.has_key(ns) and \ + self.funcmap[ns].has_key(method): + f = self.funcmap[ns][method] + else: # Now look at registered objects + # Check for nested attributes + if method.find(".") != -1: + t = self.objmap[ns] + l = method.split(".") + for i in l: + t = getattr(t,i) + f = t + else: + f = getattr(self.objmap[ns], method) + except: + if self.config.debug: + import traceback + traceback.print_exc () + + resp = buildSOAP(Fault("%s:Client" % NS.ENV_T, + "No method %s found" % nsmethod, + "%s %s" % tuple(sys.exc_info()[0:2])), + encoding = self.encoding, config = self.config) + status = 500 + else: + try: + # If it's wrapped to indicate it takes 
keywords + # send it keywords + if header: + x = HeaderHandler(header) + + if isinstance(f,MethodSig): + c = None + if f.context: # Build context object + c = SOAPContext(header, body, d, self.connection, self.headers, + self.headers["soapaction"]) + + if f.keywords: + tkw = {} + # This is lame, but have to de-unicode keywords + for (k,v) in kw.items(): + tkw[str(k)] = v + if c: + tkw["_SOAPContext"] = c + fr = apply(f,(),tkw) + else: + if c: + fr = apply(f,args,{'_SOAPContext':c}) + else: + fr = apply(f,args,{}) + else: + fr = apply(f,args,{}) + if type(fr) == type(self) and isinstance(fr, voidType): + resp = buildSOAP(kw = {'%sResponse' % method:fr}, + encoding = self.encoding, + config = self.config) + else: + resp = buildSOAP(kw = + {'%sResponse' % method:{'Result':fr}}, + encoding = self.encoding, + config = self.config) + except Fault, e: + resp = buildSOAP(e, config = self.config) + status = 500 + except: + if self.config.debug: + import traceback + traceback.print_exc () + + resp = buildSOAP(Fault("%s:Server" % NS.ENV_T, \ + "Method %s failed." 
% nsmethod, + "%s %s" % tuple(sys.exc_info()[0:2])), + encoding = self.encoding, + config = self.config) + status = 500 + else: + status = 200 + except Fault,e: + resp = buildSOAP(e, encoding = self.encoding, + config = self.config) + status = 500 + except: + # internal error, report as HTTP server error + if self.config.debug: + import traceback + traceback.print_exc () + request.error(500) + #self.send_response(500) + #self.end_headers() + else: + request['Content-Type'] = 'text/xml; charset="%s"' % self.encoding + request.push(resp) + request.done() + # got a valid SOAP response + #self.send_response(status) + #self.send_header("Content-type", + # 'text/xml; charset="%s"' % self.encoding) + #self.send_header("Content-length", str(len(resp))) + #self.end_headers() + + if self.config.debug: + print "\n***SENDING***\n", resp, "*" * 13 + "\n" + sys.stdout.flush() + + """ + # We should be able to shut down both a regular and an SSL + # connection, but under Python 2.1, calling shutdown on an + # SSL connections drops the output, so this work-around. + # This should be investigated more someday. 
+ + if self.config.SSLserver and \ + isinstance(self.connection, SSL.Connection): + self.connection.set_shutdown(SSL.SSL_SENT_SHUTDOWN | + SSL.SSL_RECEIVED_SHUTDOWN) + else: + self.connection.shutdown(1) + """ + + def registerObject(self, object, namespace = ''): + if namespace == '': namespace = self.namespace + self.objmap[namespace] = object + + def registerFunction(self, function, namespace = '', funcName = None): + if not funcName : funcName = function.__name__ + if namespace == '': namespace = self.namespace + if self.funcmap.has_key(namespace): + self.funcmap[namespace][funcName] = function + else: + self.funcmap[namespace] = {funcName : function} + + + +class collector: + "gathers input for POST and PUT requests" + + def __init__ (self, handler, request): + + self.handler = handler + self.request = request + self.data = '' + + # make sure there's a content-length header + cl = request.get_header ('content-length') + + if not cl: + request.error (411) + else: + cl = string.atoi (cl) + # using a 'numeric' terminator + self.request.channel.set_terminator (cl) + + def collect_incoming_data (self, data): + self.data = self.data + data + + def found_terminator (self): + # set the terminator back to the default + self.request.channel.set_terminator ('\r\n\r\n') + self.handler.continue_request (self.data, self.request) + + +if __name__ == '__main__': + + import asyncore + import http_server + + class Thing: + + def badparam(self, param): + if param == 'good param': + return 1 + else: + return Fault(faultstring='bad param') + + def dt(self, aDateTime): + return aDateTime + + thing = Thing() + soaph = soap_handler() + soaph.registerObject(thing) + + hs = http_server.http_server('', 10080) + hs.install_handler(soaph) + + asyncore.loop() + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/GettingStarted.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/GettingStarted.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdf77dc29567680b18b73ea040b5084fce2f526b --- 
/dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/GettingStarted.txt @@ -0,0 +1,186 @@ + +Getting Started +=============== + +NEW: + + Mark Pilgrims' online book _Dive Into Python_ at + http://diveintopython.org includes a nice tutorial for SOAPpy in + Chapter 12. "SOAP Web Services" at + http://diveintopython.org/soap_web_services. + + + +The easiest way to get up to speed is to run and read the scripts in the +tests directory. Better documentation is coming. + +Here are some examples of how to use SOAPpy: + + +CLIENT EXAMPLES: + + ## CODE + from SOAPpy import SOAPProxy + server = SOAPProxy("http://localhost:8080/") + print server.echo("Hello world") + ## /CODE + +This opens a connection to the server listening on localhost:8080, calls the +method echo with the ordered parameter of "Hello World", and prints the +results. + + + ## CODE + from SOAPpy import SOAPProxy + server = SOAPProxy("https://localhost:8443/") + print server.echo("Hello world") + ## /CODE + +This opens a secure connection to the SSL server listening on +localhost:8443, calls the method echo with the ordered parameter of +"Hello World" and prints the results. Python must be built with OpenSSL. + + + ## CODE + from SOAPpy import SOAPProxy + server = SOAPProxy("http://services.xmethods.com/soap", + namespace = "urn:xmethods-delayed-quotes") + print server.getQuote(symbol = "IBM") + ## /CODE + +This calls method getQuote that is in the namespace URI of +urn:xmethods-delayed-quotes on server services.xmethods.com. getQuote is +passed a named parameter, symbol. + + + ## CODE + from SOAPpy import SOAPProxy + server = SOAPProxy("http://services.xmethods.com/soap") + + print server._ns("urn:xmethods-delayed-quotes").getQuote(symbol = "IBM") + ## /CODE + +This does the same thing as the previous example, however namespace is +specified inline on a per call basis rather than at the server level. 
+
+
+  ## CODE
+  from SOAPpy import SOAPProxy
+  server = SOAPProxy("http://services.xmethods.com/soap",
+                     soapaction = "http://somesite.com/myaction")
+
+  print server._ns("urn:xmethods-delayed-quotes").getQuote(symbol = "IBM")
+  ## /CODE
+
+This is the same quote call with a soapaction specified.
+
+
+  ## CODE
+  from SOAPpy import SOAPProxy
+  server = SOAPProxy("http://services.xmethods.com:80/soap")
+
+  ns = "urn:xmethods-delayed-quotes"
+  sa = "http://somesite.com/myaction"
+  my_call = server._ns(ns)._sa(sa)
+  my_call.getQuote(symbol = "IBM")
+  my_call.getQuote(symbol = "IBM")
+  my_call.getQuote(symbol = "IBM")
+  ## /CODE
+
+The same example, this time with both the soapaction and the namespace
+specified inline and saved in a local variable for getQuote to be called
+against.
+
+** What SOAPpy does with the results of a call could seem surprising. If
+there is only one element in the structType that has the return value and
+unwrap_results is turned on (the default) it will bubble up the single
+attribute, otherwise it will return you a structType object with all of the
+attributes.
+
+
+
+SERVER EXAMPLES:
+
+  ## CODE
+  from SOAPpy import SOAPServer
+  def echo(s):
+      return s + s # repeats a string twice
+
+  server = SOAPServer(("localhost", 8080))
+  server.registerFunction(echo)
+  server.serve_forever()
+  ## /CODE
+
+This exposes the function echo (that takes an unnamed argument) on a server
+running on localhost:8080.
+
+
+  ## CODE
+  from SOAPpy import SOAPServer
+  def echo(s):
+      return s + s # repeats a string twice
+
+  server = SOAPServer()
+  server.registerFunction(echo, "echo-space")
+  server.serve_forever()
+  ## /CODE
+
+The same as above, but this time the method is available in the namespace
+"echo-space".
+ + + ## CODE + from SOAPpy import SOAPServer + + class echoBuilder: + def echo(self, val): + return val + val + + server = SOAPServer() + e = echoBuilder() + server.registerObject(e) + server.serve_forever() + ## /CODE + +This registers the whole instance of the object echoBuilder, e. Every +method of the instance is exposed on the server. + + + ## CODE + from SOAPpy import SOAPServer + + def echo(**kw): + return kw['first'] + kw['second'] + kw['third'] + + server = SOAPServer() + server.registerKWFunction(echo) + server.serve_forever() + ## /CODE + +This time the method echo is exposed and it expects named arguments. The +main thing to notice here is the use of the method registerKWFunction over +registerFunction. + + + ## CODE + from SOAPpy import SOAPServer + + from M2Crypto import SSL + + def echo(s): + return s+s # repeats a string twice + + ssl_context = SSL.Context() + ssl_context.load_cert('server.pem') + + server = SOAPServer(("localhost",8443), ssl_context = ssl_context) + server.registerFunction(echo) + server.serve_forever() + ## /CODE + +This exposes the function echo (taking an unnamed arguement) on a server +accepting SSL connections at localhost:8443. Ng Pheng Siong's M2Crypto +package (available at <http://www.pobox.org.sg/home/ngps/m2/>) must be +installed. Also see tests/silabserver.py. + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/GlobusSupport.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/GlobusSupport.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ffd5cabd0ffac4a047ccb749d5f19645e68995f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/GlobusSupport.txt @@ -0,0 +1,97 @@ + +Globus Support +============== + +Extensions have been added to the SOAPpy module to allow the use of the +Globus Toolkit v2 for secure transport of SOAP calls. 
These extensions are +possible by using the Globus Toolkit (http://www.globus.org) and the +pyGlobus software (http://www-itg.lbl.gov/gtg/projects/pyGlobus/), which +exposes the Globus Toolkit via a set of Python interfaces. This enables +bi-directional PKI authentication so that the server and client are both +guaranteed of the identity of the other. Using PKI this way also allows a +more robust authorization solution above the SOAP hosting layer, which +provides better application level authorization control. These tools are +used by the Access Grid Project (http://www.accessgrid.org) to build a +Grid-based, Web Services based, real-time collaboration environment. + +In order to use the SOAPpy module with the Globus Toolkit, you must first +obtain and install the Globus Toolkit and pyGlobus software. Information on +how to do that is at the respective web sites listed below. In order to use +the Globus Toolkit it is necessary to have an x509 identity certificate. +Information on how to obtain one of those is available on the web as well. + +To use GSI with an authorization method, set the SOAPConfig.authMethod = +"methodname". You must have this method defined on any objects you register +with SOAPpy, and/or as a registered method. It should return 0 or 1 to +indicate if authorization is allowed or not. + +Once the software is installed, you have obtained your certificate, and the +SOAPpy module is installed, the following code shows how to run a GSI +secured SOAP server (These snippets are directly from the echoServer.py and +echoClient.py in the test directory). + +Server +------ + + def _authorize(self, *args, **kw): + return 1 + + Config.authMethod = "_authorize" + + addr = ('localhost', 9900) + from SOAPpy.GSIServer import GSISOAPServer + server = GSISOAPServer(addr) + + server.registerFunction(_authorize) + server.registerFunction(echo) + + Then you use the server like the SSL server or the standard server. 
+ +Client +------ + + import pyGlobus + + # The httpg distinguishes this as a GSI TCP connection, so after + # this you can use the SOAP proxy as you would any other SOAP Proxy. + + server = SOAPProxy("httpg://localhost:9900/") + print server.echo("moo") + + + +Globus Toolkit http://www.globus.org +------------------------------------ + + The Globus Toolkit is an open source software toolkit used for + building grids. It is being developed by the Globus Alliance and + many others all over the world. A growing number of projects and + companies are using the Globus Toolkit to unlock the potential + of grids for their cause. + +PyGlobus http://www-itg.lbl.gov/gtg/projects/pyGlobus/ +------------------------------------------------------ + + The goal of this project is to allow the use of the entire + Globus toolkit from Python, a high-level scripting + language. SWIG is used to generate the necessary interface + code. Currently a substantial subset of the 2.2.4 and 2.4 + versions of the Globus toolkit has been wrapped. + +The Access Grid http://www.accessgrid.org/ +------------------------------------------ + + The Access GridT is an ensemble of resources including + multimedia large-format displays, presentation and interactive + environments, and interfaces to Grid middleware and to + visualization environments. These resources are used to support + group-to-group interactions across the Grid. For example, the + Access Grid (AG) is used for large-scale distributed meetings, + collaborative work sessions, seminars, lectures, tutorials, and + training. The Access Grid thus differs from desktop-to-desktop + tools that focus on individual communication. + +- Submitted 2004-01-08 by Ivan R. 
Judson <mailto:judson@mcs.anl.gov> + + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/MethodParameterNaming.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/MethodParameterNaming.txt new file mode 100755 index 0000000000000000000000000000000000000000..0d10d9fb88c4752f0b8cd88e574b0e78e5c6dd2b --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/MethodParameterNaming.txt @@ -0,0 +1,71 @@ + +Experimental method for handing ordered vs named method parameters +------------------------------------------------------------------ + +There is an incompatibility with the way that Python and SOAP handle +method arguments: SOAP requires that all arguments have names and that +they are presented in the same order as the method signature. Python +(like other scripting languages, notably the S language) has the +concept of unnamed arguments. Further Python does not preserve the +order of named arguments, since they are handled using the dictionary +data type. It seems to me that this makes it impossible to fully meet +the SOAP specification without significant modifications to the Python +method of handling named arguments or to the Python dictionary class. + +Historically SOAPpy has attempted to work around this issue by +handling all arguments as unnamed unless the method or function was +explicitly flagged, in which case all arguments were considered named. +This has resulted in a several problems, particularly for a SOAPpy +client communicating with a SOAPpy server. First, when named +arguments were used in call to a non-flagged function, the argument +would silently be reordered by the sender (since they were stored +using a Python dictionary), *and* the names would be ignored by the +receiver, which assumed that the parameters were unnamed and only the +order was significant. This results in incorrect argument matching. +This problem also occurred with mixed named and unnamed arguments. 
+ +For my primary SOAP application, it is not reasonable to flag all of +the SOAPpy methods as requiring named arguments, for a variety of +reasons. One reason is that the available methods are not known +a priori by the client software, hence the names of the arguments are +not known. Second, many of the methods provide a large number of +optional arguments, making it impractical to specify them all. + +In an attempt to overcome this problem, I implemented an experimental +and non-standard method of handling named and unnamed arguments. This +mechanism is enabled in SOAPpy by setting +SOAPpy.SOAP.Config.specialArgs=1, and disabled by setting +SOAPpy.SOAP.Config.specialArgs=0. + +When enabled, parameters with names of the form v#### (i.e., matching +the regexp "^v[0-9]+$") are assumed to be unnamed parameters and are +passed to the method in numeric order. All other parameters are +assumed to be named and are passed using the name. Outgoing SOAP +method calls now always generate names in this way--whether or not +specialArgs is enabled. + + +I selected the form v#### because it is a valid XML name, but is +unlikely to be used as a parameter name. + +[As it turns out, this choice was fortuitous because Apache's SOAP tool +uses the same system.] + +In my testing, this mechanism for handling method parameter names +works fine between a SOAPpy client and a SOAPpy server, and resolves +the parameter reordering problems I was experiencing. This system +seems unlikely to have any negative side effects on other SOAP +applications, except in the (hopefully) rare case when v#### might be +used as an actual parameter name. + +**In version 0.9.9-pre1, this feature is enabled by default.** Please +let me know if there are situations where this causes problems. + +Note that this mechanism is only a partial solution, since it is still +impossible to return named parameters in a specified order using 
SOAP applications or implementations which require this +feature are simply not compatible with SOAPpy. + +-Greg Warnes <Gregory.R.Warnes@Pfizer.com> +2003-03-07 (updated 2003-11-14) + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/UsingHeaders.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/UsingHeaders.txt new file mode 100755 index 0000000000000000000000000000000000000000..cf50341b81844a827009327da1dcdf1d62adad70 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/UsingHeaders.txt @@ -0,0 +1,104 @@ +Using Headers +============= + +SOAPpy has a Header class to hold data for the header of a SOAP message. +Each Header instance has methods to set/get the MustUnderstand attribute, and +methods to set/get the Actor attribute. + +SOAPpy also has a SOAPContext class so that each server method can be +implemented in such a way that it gets the context of the connecting client. +This includes both common SOAP information and connection information (see +below for an example). + +CLIENT EXAMPLES +--------------- + +## CODE +import SOAPpy +test = 42 +server = SOAPpy.SOAPProxy("http://localhost:8888") +server = server._sa ("urn:soapinterop") + +hd = SOAPpy.Header() +hd.InteropTestHeader ='This should fault, as you don\'t understand the header.' +hd._setMustUnderstand ('InteropTestHeader', 0) +hd._setActor ('InteropTestHeader','http://schemas.xmlsoap.org/soap/actor/next') +server = server._hd (hd) + +print server.echoInteger (test) +## /CODE + +This should succeed (provided the server has defined echoInteger), as it +builds a valid header into this client with MustUnderstand set to 0 +and then sends the SOAP with this header. + + +## CODE +import SOAPpy +test = 42 +server = SOAPpy.SOAPProxy("http://localhost:8888") +server = server._sa ("urn:soapinterop") +#Header +hd = SOAPpy.Header() +hd.InteropTestHeader = 'This should fault,as you don\'t understand the header.' 
+hd._setMustUnderstand ('InteropTestHeader', 1) +hd._setActor ('InteropTestHeader','http://schemas.xmlsoap.org/soap/actor/next') +server = server._hd (hd) + +print server.echoInteger (test) +## /CODE + +This should fail (even if the server has defined 'echoInteger'), as it +builds a valid header into this client, but sets MustUnderstand to 1 +for a message that the server presumably won't understand before sending. + + + + +SERVER EXAMPLES +--------------- + +## CODE +import SOAPpy +def echoInteger (inputInteger): + return inputInteger +server = SOAPpy.SOAPServer ( ('localhost', 8080) ) +server.registerFunction (echoInteger) +server.serve_forever() +## /CODE + +This is a simple server designed to work with the first 2 clients above. + + +## CODE +import SOAPpy +def echoInteger (inputInteger, _SOAPContext): + c = _SOAPContext + print c.xmldata + print c.header + print c.body + print c.connection.getpeername() + print c.soapaction + print c.httpheaders + return inputInteger + +host = 'localhost' +port = 8888 + +server = SOAPpy.SOAPServer ( (host, port) ) +server.registerFunction (SOAPpy.MethodSig(echoInteger, keywords=0,context=1)) + +server.serve_forever() +## /CODE + +This is a server which shows off the SOAPContext feature. This +server gets a context from the client that has connected to it, and +prints some of the pertinent aspects of that client before +returning. This server should also work with the code for the two +clients written above. + + + + + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/WSDL.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/WSDL.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fe3560df040a4fd0e06b4e304fc557b4ff39cfa --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/WSDL.txt @@ -0,0 +1,22 @@ + +WSDL NOTES: + +Release 0.9.9 and later include logic for dealing with web service +description language (WSDL) files. 
+ + - SOAPpy.WSDL provides a SOAP Proxy object that parses a WSDL file + and provides access to the listed services: + + url = 'http://www.xmethods.org/sd/2001/TemperatureService.wsdl' + zip = '01072' + proxy = SOAPpy.WSDL.Proxy(url) + temp = proxy.getTemp(zip) + print 'Temperature at', zip, 'is', temp + + - On the server, you can allow the client to download the WSDL for + a service by sending a request of the form by adding a do_GET + method to the SOAPRequestHandler. [Not yet working when + debug=FALSE. Add example here when working] + + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/attrs.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/attrs.txt new file mode 100644 index 0000000000000000000000000000000000000000..e019d3aced1949228ec9370aae19432865f0aa0b --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/attrs.txt @@ -0,0 +1,15 @@ + +Using SOAPpy Attributes +======================= + +All SOAPpy data classes implement methods to access and mutate +individual attributes. + +The _setAttr method has as parameters a 'tag', the attribute name, and the +value to which the attribute should be set. + +The _getAttrs method simply has the 'tag' parameter. + + + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/complexTypes.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/complexTypes.txt new file mode 100644 index 0000000000000000000000000000000000000000..d95c0d366cccb0ebf1c1136233d889fde7a0f7bf --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/complexTypes.txt @@ -0,0 +1,19 @@ +COMPLEX TYPES HOWTO +=================== + +The easiest way (at the moment) to create complex SOAP types is to +use the SOAPpy.structType class, which allows you to create an +object with named arguments of arbitrary types. 
For example: + +>>> in0 = SOAPpy.structType() +>>> in0._addItem('outwardDate', dep) +>>> in0._addItem('returnDate', ret) +>>> in0._addItem('originAirport', 'den') +>>> in0._addItem('destinationAirport', 'iad') + +SOAPpy has code for declaring structured object templates including +the type for each component, but this broke sometime in the past and +has not yet been corrected. (See tests/TCtypes.py to see how it +should work.) + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/docs/simpleTypes.txt b/LTA/LTAIngest/SOAPpy-0.12.0/docs/simpleTypes.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9216bfe885f57f70fd34962bff89167263c448e --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/docs/simpleTypes.txt @@ -0,0 +1,220 @@ +Simple Types HOWTO +================== + +The easiest way to understand use of data types is look at and run the examples +already written (in tests/, validate/ and bid/) , and to write your own +clients, looking at the xml as it is sent (by setting SOAP.Config.debug=1). + +As far as the built-in types are concerned, SOAP.py will preserve type +as expected. That is: python integer will be of type integer, and +equivalently for string and float. To access more than just these types, +there are classes in SOAP.py. These allow invoking a certain type by making +an instance of the corresponding class. + +The SOAPBuilder in SOAP.py will automatically convert python lists to Arrays +and python dictionaries to Structs- these are two of the most frequently used +data types. + +CLIENT EXAMPLES +--------------- + + ## CODE + import SOAP + server = SOAP.SOAPProxy("http://localhost:8080/") + print server.echo("Hello world") + ## /CODE + + This example (taken from quickstart.txt) sends an ordered parameter of type + string. 
+ + ## CODE + import SOAP + import time + #SOAP.Config.debug = 1 + test = time.gmtime (time.time ()) + server = SOAP.SOAPProxy("http://localhost:8080/") + print server.echoDate (inputDate = SOAP.DateTime(test)) + ## /CODE + + This test calls echoDate with the named parameter inputDate, which is a + TimeInstant. It prints the result. + **Note: The reason that it is a TimeInstant and not a DateTime + is that SOAP.py uses the 1999 schema instead of the 2001 schema. To make it + a DateTime, one would just use SOAP.dateTimeType() in place of SOAP.DateTime(). + ** + + + ## CODE + import SOAP + server = SOAP.SOAPProxy("http://localhost:8080/") + test = [0, 1, -1, 3853] + print server.echoIntegerArray (inputIntegerArray = test) + ## /CODE + + This calls echoIntegerArray with the named parameter inputIntegerArray, which + is a four-member array of type int. It prints the result. + + ## CODE + import SOAP + test = {'varFloat': 2.256, 'varInt': 474, 'varString': 'Utah'} + server = SOAP.SOAPProxy("http://localhost:8080/") + print server.echoStruct (inputStruct = test) + ## /CODE + + This code calls the method echoStruct with the named parameter inputStruct, + which is of type Struct. It then prints the result. + + + ## CODE + import SOAP + item1 = SOAP.Struct( data = {"name":"widget","quantity":200,"price":SOAP.decimalType(45.99), "_typename":"LineItem"}) + items = SOAP.Array ( data = [item1] ) + items._ns = "http://www.soapinterop.org/Bid" + server = SOAP.SOAPProxy("http://localhost:8080") + server = server._sa ("http://www.soapinterop.org/Buy") + server = server._ns ("http://www.soapinterop.org/Bid") + po = SOAP.Struct( data = {"poID":"Order 1234", "createDate": SOAP.dateTimeType(), "items": items} ) + print server.Buy(PurchaseOrder = po) + ## /CODE + + A few new things here. + -First, we are creating an Array, 'items', with components of (made up) type + 'LineItem'. (Notice the use of "_typename" to specify type). 
+ -This code associates a namespace with the Array, rather than use the default. + -SOAP.dateTimeType() is called directly to get a dateTime instead of SOAP.py's + default, 'timeInstant'. + -Note that when creating a Struct or Array, the data must be passed in as a + named 'data' param (as the first param, by order, is 'name'). + -The proxy is instantiated and then the values for its namespace (_ns) and + soapaction (_sa) are assigned. + -This call will work for a server expecting a parameter with the same + components as those in the variable 'po' above. It will work whether the + server has a named param 'PurchaseOrder' or has an unnamed param, but will + not work if the server expects a named param with a name of anything but + 'PurchaseOrder'. + + +SERVER EXAMPLES +--------------- + + ## CODE + import SOAP + def echo(s): + return s + s # repeats a string twice + + server = SOAP.SOAPServer(("localhost", 8080)) + server.registerFunction(echo) + server.serve_forever() + ## /CODE + + This server example, from quickstart.txt, echoes (as type string) the + string that is passed in, s. + + + ## CODE + import SOAP + + def echoDate (inputDate): + return SOAP.DateTime(inputDate) + + server = SOAP.SOAPServer(("localhost", 8080)) + server.registerKWFunction(echoDate ) + server.serve_forever() + ## /CODE + + This code accepts an inputDate and returns the same date, ensuring that it + is of type TimeInstant by returning an instance of DateTime instead of + simply returning the value. 
+ + + ## CODE + import SOAP + def echoIntegerArray (inputIntegerArray): + if type(inputIntegerArray) != type([]) or len(inputIntegerArray) != 4: + for elem in inputIntegerArray: + if type(elem) != type(1): + raise TypeError, "expected 4-member Array of ints" + return inputIntegerArray + server = SOAP.SOAPServer(("localhost", 8080)) + server.registerKWFunction(echoIntegerArray ) + server.serve_forever() + ## /CODE + + This server supports the method echoIntegerArray, requiring the named parameter + inputIntegerArray, which must be a four-member array of type int. + + + ## CODE + import SOAP + + def echoStruct (inputStruct): + myfloat = inputStruct["varFloat"] + mystr = inputStruct["varString"] + myint = inputStruct["varInt"] + return inputStruct + + server = SOAP.SOAPServer(("localhost", 8080)) + server.registerKWFunction(echoStruct ) + server.serve_forever() + ## /CODE + + This code creates a server with a method echoStruct, which requires that the + incoming Struct have elements named varFloat, varString, and varInt. That is, + the server will fault if the incoming Struct does not have any of those + elements. **Note, this server code does NOT require that these be the only + elements in the struct- just that they be present**. This method simply + returns the Struct passed in. 
+ + + ## CODE + import sys + import SOAP + serverstring = "SOAP.py (actzero.com) running "+sys.platform + def Buy(**kw): + try: + PurchaseOrder = kw["PurchaseOrder"] + except: + PurchaseOrder = kw["PO"] + + POkeys = PurchaseOrder['_keyord'] + POkeys.sort() + POkeys_expected = ["items","poID","createDate"] + POkeys_expected.sort() + if POkeys != POkeys_expected: + raise ValueError, "struct 'PurchaseOrder' needs %s, %s, and %s" % tuple(POkeys_expected) + + items = PurchaseOrder["items"].__dict__ + data = items["data"] + retstring = "" + for item in data: + itemdict = item["_asdict"] + q = itemdict["quantity"] + p = itemdict["price"] + name = itemdict["name"] + if retstring != "": + retstring += ", " + else: + retstring = "bought " + retstring += "%d %s(s) for %.2f" % (q,name,p) + retstring += " from "+serverstring + return retstring + + server = SOAP.SOAPServer(("localhost", 8080)) + namespace = "http://www.soapinterop.org/Bid" + server.registerKWFunction(Buy, namespace ) + server.serve_forever() + ## /CODE + + This example creates a server to implement 'Buy', which takes a parameter + named either PurchaseOrder or PO. (Notice the use of **kw as the input + parameter to the method for this functionality). + The server gets the names of the Struct's members by using the '_keyord' + key of the Struct-as-dictionary. It checks these names against what it + expects from the client, and raises a fault if the two are not the same. + By using the __dict__ attribute, the server gets the 'items' (an elemnent of + the PurchaseOrder Struct) as a dictionary. Then it checks that 'items' is + formatted as expected. Finally, it returns a confirmation of what was bought. 
+ + + +$Id$ diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/setup.py b/LTA/LTAIngest/SOAPpy-0.12.0/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..56f01be595016a7016a8deff0cf25c673d3c7da9 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# +# $Id$ + +CVS=0 + +from distutils.core import setup, Command, Extension +from SOAPpy.version import __version__ + +url="http://pywebsvcs.sf.net/" + +long_description="SOAPpy provides tools for building SOAP clients and servers. For more information see " + url + + +if CVS: + import time + __version__ += "_CVS_" + time.strftime('%Y_%m_%d') + + +setup(name="SOAPpy", + version=__version__, + description="SOAP Services for Python", + maintainer="Gregory Warnes", + maintainer_email="Gregory.R.Warnes@Pfizer.com", + url = url, + long_description=long_description, + packages=['SOAPpy','SOAPpy/wstools'] + ) + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/BabelfishWSDLTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/BabelfishWSDLTest.py new file mode 100755 index 0000000000000000000000000000000000000000..8905c9bed13f18703415ff07e0247672eca66b5e --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/BabelfishWSDLTest.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import WSDL + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +server = WSDL.Proxy('http://www.xmethods.net/sd/2001/BabelFishService.wsdl', + http_proxy=proxy) + +english = "Hi Friend!" 
+ +print "Babelfish Translations" +print "------------------------" +print "English: '%s'" % english +print "French: '%s'" % server.BabelFish('en_fr',english) +print "Spanish: '%s'" % server.BabelFish('en_es',english) +print "Italian: '%s'" % server.BabelFish('en_it',english) +print "German: '%s'" % server.BabelFish('en_de',english) + +print "Done." diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug1001646.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug1001646.py new file mode 100644 index 0000000000000000000000000000000000000000..56fa6879b224267c5c56214639bdb58e5b4e2954 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug1001646.py @@ -0,0 +1,75 @@ +""" +Check handing of unicode. +""" + +import sys +sys.path.insert(1, "..") +from SOAPpy import * + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +#Config.debug = 1 +#Config.dumpHeadersIn = 1 +#Config.dumpSOAPIn = 1 +#Config.dumpSOAPOut = 1 + +# ask for returned SOAP responses to be converted to basic python types +Config.simplify_objects = 1 + +#Config.BuildWithNoType = 1 +#Config.BuildWithNoNamespacePrefix = 1 + + +def headers(): + '''Return a soap header containing all the needed information.''' + hd = Types.headerType() + hd.useragent = Types.stringType("foo") + return hd + +server = SOAPProxy("http://localhost:9900/",header=headers()) + +adgroupid = 197497504 +keyword1 = { 'status': 'Moderate', + 'adGroupId': 197497504, + 'destinationURL': None, + 'language': '', + 'text': 'does not work', + 'negative': bool(0), + 'maxCpc': 50000, + 'type': 'Keyword', + 'id': 1 } +keyword2 = { 'status': 'Moderate', + 'adGroupId': 197497504, + 'destinationURL': None, + 'language': '', + 'text': 'yes it does not', + 'negative': bool(0), + 'maxCpc': 50000, + 'type': 'Keyword', + 'id': 2 } +keylist = [keyword1, keyword2] + +# Check that the data goes through properly + +retval = server.echo_simple(adgroupid, keylist) + +kw1 = retval[1][0] +kw2 = retval[1][1] + +assert(retval[0] == adgroupid) + +for key in kw1.keys(): 
+ assert(kw1[key]==keyword1[key]) + +for key in kw2.keys(): + assert(kw2[key]==keyword2[key]) + +# Check that the header is preserved +retval = server.echo_header((adgroupid, keylist)) + +assert(retval[1].has_key('useragent')) +assert(retval[1]['useragent'] == 'foo') + +server.quit() + +print "Success!" + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug916265.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug916265.py new file mode 100644 index 0000000000000000000000000000000000000000..36c58bf173b67ee52141c503e910bdcddf22bd60 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug916265.py @@ -0,0 +1,43 @@ +""" +Check handing of unicode. +""" + +import sys +sys.path.insert(1, "..") +from SOAPpy import * + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +#Config.debug = 1 +#Config.dumpHeadersIn = 1 +#Config.dumpSOAPIn = 1 +#Config.dumpSOAPOut = 1 + +# ask for returned SOAP responses to be converted to basic python types +Config.simplify_objects = 0 + +#Config.BuildWithNoType = 1 +#Config.BuildWithNoNamespacePrefix = 1 + +server = SOAPProxy("http://localhost:9900/") + +x = u'uMOO' # Single unicode string +y = server.echo_simple((x,)) +assert( x==y[0] ) + +x = [u'uMoo1',u'uMoo2'] # array of unicode strings +y = server.echo_simple(x) +assert( x[0] == y[0] ) +assert( x[1] == y[1] ) + +x = { + u'A':1, + u'B':u'B', + 'C':u'C', + 'D':'D' + } +y = server.echo_simple(x) + +for key in x.keys(): + assert( x[key] == y[0][key] ) + +print "Success" diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug918216.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug918216.py new file mode 100644 index 0000000000000000000000000000000000000000..04c02c1cba9a6d04a91deed8c1db9c7b5120ab0f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/Bug918216.py @@ -0,0 +1,38 @@ +import sys +sys.path.insert(1, "..") +from SOAPpy import * + +detailed_fault = \ +""" +<?xml version="1.0" encoding="UTF-8"?> +<SOAP-ENV:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" SOAP-ENV:encodingStyle="http://schemas.microsoft.com/soap/encoding/clr/1.0 http://schemas.xmlsoap.org/soap/encoding/" xmlns:a1="http://schemas.microsoft.com/clr/ns/System.Runtime.Serialization.Formatters"> +<SOAP-ENV:Body> +<SOAP-ENV:Fault id="ref-1"> + +<faultcode>soapenv:Server.generalException</faultcode> +<faultstring>Exception thrown on Server</faultstring> + +<detail> +<loginFailureFault href="#id0"/> +<exceptionName xsi:type="xsd:string">...</exceptionName> +</detail> + +</SOAP-ENV:Fault> + +<multiRef id="id0"> +<description xsi:type="xsd:string">Login failure (504):Unknown User</description> +<module xsi:type="xsd:string"> ... </module> +<timestamp xsi:type="xsd:string">...</timestamp> +<faultcode xsi:type="xsd:string"> ...</faultcode> +<parameter xsi:type="xsd:string"> ... </parameter> +</multiRef> + +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +""" + +z = parseSOAPRPC(detailed_fault.strip() ) +assert(z.__class__==faultType) +assert(z.faultstring=="Exception thrown on Server") +assert(z.detail.loginFailureFault.description=='Login failure (504):Unknown User') +print "Success" diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/ComplexTypes.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/ComplexTypes.py new file mode 100644 index 0000000000000000000000000000000000000000..34d8f9b828e7b3a39ab2b6b337e46aec400514cb --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/ComplexTypes.py @@ -0,0 +1,26 @@ +import sys +sys.path.insert(1, "..") + +import SOAPpy + +import time +dep = SOAPpy.dateTimeType((2004, 3, 24, 12, 30, 59, 4, 86, 0)) +ret = SOAPpy.dateTimeType((2004, 3, 26, 12, 30, 59, 4, 86, 0)) + +in0 = SOAPpy.structType() +in0._addItem('outwardDate', dep) +in0._addItem('returnDate', ret) +in0._addItem('originAirport', 'den') +in0._addItem('destinationAirport', 'iad') + + +x = SOAPpy.buildSOAP( + in0, + 
method="getAirFareQuote", + namespace="urn:SBGAirFareQuotes.sbg.travel.ws.dsdata.co.uk" + ) + + +wsdl = 'http://www.xmethods.net/sd/2001/TemperatureService.wsdl' +proxy = SOAPpy.WSDL.Proxy(wsdl) + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/GoogleTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/GoogleTest.py new file mode 100644 index 0000000000000000000000000000000000000000..bd75e12b9d31f4be39ea87368856ae6d7389a1e0 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/GoogleTest.py @@ -0,0 +1,11 @@ +from SOAPpy import WSDL +server = WSDL.Proxy('/home/warneg/src/google/googleapi/GoogleSearch.wsdl') +key = "6k0oDPZQFHL0zpjy6ZO6ufUVFKBgvqTo" + +results = server.doGoogleSearch(key, 'warnes', 0, 10, False, "", + False, "", "utf-8", "utf-8") + + +for i in range(len(results.resultElements)): + res = results.resultElements[i] + print '%d: %s --> %s' % ( i, res.title, res.URL ) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/SOAPtest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/SOAPtest.py new file mode 100755 index 0000000000000000000000000000000000000000..65b7130206ab7a599a9f4e1a763c7914c7d42532 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/SOAPtest.py @@ -0,0 +1,3807 @@ +#!/usr/bin/env python + +################################################################################ +# +# A bunch of regression type tests for the builder and parser. +# +################################################################################ + +ident = '$Id$' + +import urllib +import sys +import unittest +import re + +sys.path.insert(1, "..") +from SOAPpy import * +config=Config +config.strict_range=1 + + +# run these tests with this variable set both to 1 and 0 +config.simplify_objects=0 + +# as borrowed from jake.soapware.org for float compares. 
+def nearlyeq(a, b, prec = 1e-7): + return abs(a - b) <= abs(a) * prec + +# helper +def negfloat(x): + return float(x) * -1.0 + +class Book(structType): + def __init__(self): + self.title = "Title of a book" + structType.__init__(self) + +class Person(structType): + def __init__(self): + self.age = "49" + self.height = "5.5" + structType.__init__(self) + +class Result(structType): + def __init__(self): + structType.__init__(self, name = 'Result') + self.Book = Book() + self.Person = Person() + +class one: + def __init__(self): + self.str = "one" + +class two: + def __init__(self): + self.str = "two" + +class three: + def __init__(self): + self.str = "three" + +ws = ' \t\r\n' +N = None + +class SOAPTestCase(unittest.TestCase): + # big message + def notestBigMessage(self): + x=[] + for y in string.lowercase: + x.append(y*999999) + buildSOAP(x) + + # test arrayType + def testArrayType(self): + x = structType( {"name":"widg1","quantity":200, + "price":decimalType(45.99), + "_typename":"LineItem"}) + y = buildSOAP([x, x]) + # could be parsed using an XML parser? + self.failUnless(string.find(y, "LineItem")>-1) + + # test arguments ordering + def testOrdering(self): + x = buildSOAP(method="newCustomer", namespace="urn:customer", \ + kw={"name":"foo1", "address":"bar"}, \ + config=SOAPConfig(argsOrdering={"newCustomer":("address", "name")})) + # could be parsed using an XML parser? + self.failUnless(string.find(x, "<address ")<string.find(x, "<name ")) + x = buildSOAP(method="newCustomer", namespace="urn:customer", \ + kw={"name":"foo1", "address":"bar"}, \ + config=SOAPConfig(argsOrdering={"newCustomer":("name", "address")})) + # could be parsed using an XML parser? 
+ self.failUnless(string.find(x, "<address ")>string.find(x, "<name ")) + + # test struct + def testStructIn(self): + x = '''<?xml version="1.0" encoding="utf-8"?> +<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"> +<soap:Body soap:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<SomeMethod> +<Result> +<Book> + <title>My Life and Work</title> +</Book> +<Person> + <name>Henry Ford</name> + <age> 49 </age> + <height> 5.5 </height> +</Person> +</Result> +</SomeMethod> +</soap:Body> +</soap:Envelope> +''' + # parse rules + pr = {'SomeMethod': + {'Result': + {'Book': {'title':(NS.XSD, "string")}, + 'Person': {'age':(NS.XSD, "int"), + 'height':negfloat} + } + } + } + y = parseSOAPRPC(x, rules=pr) + if config.simplify_objects: + self.assertEquals(y['Result']['Person']['age'], 49); + self.assertEquals(y['Result']['Person']['height'], -5.5); + else: + self.assertEquals(y.Result.Person.age, 49); + self.assertEquals(y.Result.Person.height, -5.5); + + # Try the reverse + def testStructOut(self): + x = buildSOAP(Result()) + + def testIntFloat(self): + x='''<SOAP-ENV:Envelope + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:xsd="http://www.w3.org/2001/XMLSchema" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + SOAP-ENV:encodingStyle="http://schemas.microsoft.com/soap/encoding/clr/1.0 + http://schemas.xmlsoap.org/soap/encoding/" + xmlns:i3="http://soapinterop.org/xsd" xmlns:i2="http://soapinterop.org/"> + <SOAP-ENV:Body> + <i2:echoStructArray id="ref-1"> + <return href="#ref-4"/> + </i2:echoStructArray> + <SOAP-ENC:Array id="ref-4" SOAP-ENC:arrayType="i3:SOAPStruct[3]"> + <item href="#ref-5"/> + <item href="#ref-6"/> + <item href="#ref-7"/> + </SOAP-ENC:Array> + <i3:SOAPStruct id="ref-5"> + <varString 
xsi:type="xsd:string">West Virginia</varString> + <varInt xsi:type="xsd:int">-546</varInt> + <varFloat xsi:type="xsd:float">-5.398</varFloat> + </i3:SOAPStruct> + <i3:SOAPStruct id="ref-6"> + <varString xsi:type="xsd:string">New Mexico</varString> + <varInt xsi:type="xsd:int">-641</varInt> + <varFloat xsi:type="xsd:float">-9.351</varFloat> + </i3:SOAPStruct> + <i3:SOAPStruct id="ref-7"> + <varString xsi:type="xsd:string">Missouri</varString> + <varInt xsi:type="xsd:int">-819</varInt> + <varFloat xsi:type="xsd:float">1.375</varFloat> + </i3:SOAPStruct> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope>''' + y = parseSOAPRPC(x) + if(config.simplify_objects): + self.assertEquals(y['return'][0]['varString'], "West Virginia") + self.assertEquals(y['return'][1]['varInt'], -641) + self.assertEquals(y['return'][2]['varFloat'], 1.375) + else: + self.assertEquals(getattr(y,"return")[0].varString, "West Virginia") + self.assertEquals(getattr(y,"return")[1].varInt, -641) + self.assertEquals(getattr(y,"return")[2].varFloat, 1.375) + + def testArray1(self): + x='''<SOAP-ENV:Envelope + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:xsd="http://www.w3.org/2001/XMLSchema" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + SOAP-ENV:encodingStyle="http://schemas.microsoft.com/soap/encoding/clr/1.0 + http://schemas.xmlsoap.org/soap/encoding/" + xmlns:i3="http://soapinterop.org/xsd" xmlns:i2="http://soapinterop.org/"> + <SOAP-ENV:Body> + <i2:echoStructArray id="ref-1"> + <return href="#ref-4"/> + </i2:echoStructArray> + <SOAP-ENC:Array id="ref-4" SOAP-ENC:arrayType="i3:SOAPStruct[3]"> + <item href="#ref-5"/> + <item href="#ref-6"/> + <item href="#ref-7"/> + </SOAP-ENC:Array> + <i3:SOAPStruct id="ref-5"> + <xsd:string>West Virginia</xsd:string> + <xsd:int>-546</xsd:int> + <xsd:float>-5.398</xsd:float> + </i3:SOAPStruct> + <i3:SOAPStruct id="ref-6"> + <xsd:string>New Mexico</xsd:string> + 
<xsd:int>-641</xsd:int> + <xsd:float>-9.351</xsd:float> + </i3:SOAPStruct> + <i3:SOAPStruct id="ref-7"> + <xsd:string>Missouri</xsd:string> + <xsd:int>-819</xsd:int> + <xsd:float>1.375</xsd:float> + </i3:SOAPStruct> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope>''' + y = parseSOAPRPC(x) + if(config.simplify_objects): + self.assertEquals(y["return"][0]['string'], "West Virginia") + self.assertEquals(y["return"][1]['int'], -641) + self.assertEquals(y["return"][2]['float'], 1.375) + else: + self.assertEquals(getattr(y,"return")[0].string, "West Virginia") + self.assertEquals(getattr(y,"return")[1].int, -641) + self.assertEquals(getattr(y,"return")[2].float, 1.375) + + def testUTF8Encoding1(self): + x = '''<?xml version="1.0" encoding="UTF-8"?> +<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"> +<SOAP-ENV:Body SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsd2="http://www.w3.org/2000/10/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" xmlns:xsi2="http://www.w3.org/2000/10/XMLSchema-instance"> +<ns0:echoStringArrayResponse xmlns:ns0="http://soapinterop.org/"> +<return2 href="#id3"/> +</ns0:echoStringArrayResponse> +<a id="id0" xmlns:ns0="http://soapinterop.org/" xsi2:type="xsd:string" xsi:type="xsd:string"></a> +<a id="id1" xmlns:ns0="http://soapinterop.org/" xsi2:type="xsd:string" xsi:type="xsd:string">Hello</a> +<a id="id2" xmlns:ns0="http://soapinterop.org/" xsi2:type="xsd:string" xsi:type="xsd:string">\'<&>"</a> +<return2 SOAP-ENC:arrayType="xsd:string[3]" id="id3" xmlns:ns0="http://soapinterop.org/"> +<a href="#id0"/> +<a href="#id1"/> +<a href="#id2"/> +</return2> +</SOAP-ENV:Body></SOAP-ENV:Envelope>''' + y = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals(y['return2'][1], "Hello") + else: + self.assertEquals(y.return2[1], "Hello") + + def testUTF8Encoding2(self): + x = '''<?xml 
version="1.0" encoding="UTF-8"?> +<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"> +<SOAP-ENV:Body SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> +<ns0:echoStringArrayResponse xmlns:ns0="http://soapinterop.org/"> +<a xsi:type="xsd:string"></a> +<a xsi:type="xsd:string">Hello</a> +<a xsi:type="xsd:string">\'<&>"</a> +<b xsi:type="xsd:string">Goodbye</b> +</ns0:echoStringArrayResponse> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope>''' + y = parseSOAPRPC(x) + self.assertEquals(type(y.a), type([])) + self.assertEquals(type(y.b), type('')) + self.assertEquals(type(y._getItemAsList('a')), type([])) + self.assertEquals(type(y._getItemAsList('b')), type([])) + self.assertEquals(y.b, 'Goodbye') + self.assertEquals(y.a, ['', 'Hello', '\'<&>"']) + self.assertEquals(y._getItemAsList('b'), ['Goodbye']) + self.assertEquals(y._getItemAsList('c'), []) + self.assertEquals(y._getItemAsList('c', 'hello'), 'hello') + + def testUTF8Encoding2(self): + x = '''<?xml version="1.0" encoding="UTF-8"?> +<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"> +<SOAP-ENV:Body + SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:xsd="http://www.w3.org/1999/XMLSchema" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> +<a1 SOAP-ENC:root="1">Hello</a1> +<a2 SOAP-ENC:root="0" id="id">\'<&>"</a2> +<a3>Goodbye</a3> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope>''' + y = parseSOAP(x) + self.assertEquals(y.a1, 'Hello') + self.assertEquals(y.a3, 'Goodbye') + self.failIf(hasattr(y, 'a2')) + + def testUTF8Encoding3(self): + x = '''<?xml version="1.0" encoding="utf-8"?> +<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"> +<soap:Body soap:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<SomeMethod> +<Result> +<Book> + <title>My Life and Work</title> + <author href="#Person-1"/> +</Book> +<Person id="Person-1"> + <name>Henry Ford</name> + <address href="#Address-2"/> +</Person> +<Address id="Address-2"> + <email>mailto:henryford@hotmail.com</email> + <web>http://www.henryford.com</web> + <pers href="#Person-1"/> +</Address> +</Result> +</SomeMethod> +</soap:Body> +</soap:Envelope> +''' + y = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals(y['Result']['Book']['author']['name'], "Henry Ford") + self.assertEquals(y['Result']['Book']['author']['address']['web'], "http://www.henryford.com") + self.assertEquals(y['Result']['Book']['author']['address']['pers']['name'], "Henry Ford") + else: + self.assertEquals(y.Result.Book.author.name, "Henry Ford") + self.assertEquals(y.Result.Book.author.address.web, "http://www.henryford.com") + self.assertEquals(y.Result.Book.author.address.pers.name, "Henry Ford") + + # ref example + def testRef(self): + x = '''<?xml version="1.0" encoding="utf-8"?> +<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"> +<soap:Body soap:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<echoFloatArrayResponse xmlns="http://soapinterop.org/"> +<Return href="#i1" xmlns="" /> +</echoFloatArrayResponse> +<soapenc:Array id="i1" soapenc:arrayType="xsd:float[4]"> +<Item>0</Item> +<Item>1</Item> +<Item>-1</Item> +<Item>3853.33325</Item> +</soapenc:Array> +</soap:Body> +</soap:Envelope> +''' + y = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals(y['Return'][0], 0) + self.assertEquals(y['Return'][1], 1) + self.assertEquals(y['Return'][2], -1) + 
self.failUnless(nearlyeq(y['Return'][3], 3853.33325)) + else: + self.assertEquals(y.Return[0], 0) + self.assertEquals(y.Return[1], 1) + self.assertEquals(y.Return[2], -1) + self.failUnless(nearlyeq(y.Return[3], 3853.33325)) + + # Make sure passing in our own bodyType works. + def testBodyType(self): + a = [23, 42] + b = bodyType() + b.a = b.b = a + + x = buildSOAP(b) + y = parseSOAP(x) + + self.assertEquals(id(y.a), id(y.b)) + self.assertEquals(y.a, a) + self.assertEquals(y.b, a) + + # Test Envelope versioning (see section 4.1.2 of http://www.w3.org/TR/SOAP). + def testEnvelopeVersioning(self): + xml = '''<SOAP-ENV:Envelope + SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:xsd="http://www.w3.org/1999/XMLSchema" + xmlns:SOAP-ENV="http://new/envelope/version/" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/"> + <SOAP-ENV:Body> + <_1 xsi:type="xsd:int" SOAP-ENC:root="1">1</_1> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope>''' + + try: + parseSOAP(xml) + except Exception, e: + self.failUnless(isinstance(e, faultType)) + self.assertEquals(e.faultcode, '%s:VersionMismatch' % NS.ENV_T) + self.failIf(hasattr(e, 'detail')) + + # Big terrible ordered data with attributes test. 
+ def testBigOrderedData(self): + data = '''<?xml version="1.0" encoding="UTF-8" ?> +<Envelope xmlns="http://schemas.xmlsoap.org/soap/envelope/"> +<Body> +<replyBlock generic="1.0" attrib1="false" attrib2='hello'> +<itemList> +<mainItem mainattrib1='uno'> +<name>first_main_item</name> +<description>whatever etc.</description> +<infoList> +<itemInfo a1='123' a2='abc'> +<name>unoItem1</name> +</itemInfo> +<itemInfo a1='456' a2='def'> +<name>unoItem2</name> +</itemInfo> +<itemInfo a1='789' a2='ghi'> +<name>unoItem3</name> +</itemInfo> +</infoList> +</mainItem> +<mainItem mainattrib1='dos'> +<name>second_main_item</name> +<description>whatever etc.</description> +<infoList> +<itemInfo a1='3123' a2='3abc'> +<name>dosItem1</name> +</itemInfo> +<itemInfo a1='3456' a2='3def'> +<name>dosItem2</name> +</itemInfo> +<itemInfo a1='3789' a2='3ghi'> +<name>dosItem3</name> +</itemInfo> +</infoList> +</mainItem> +</itemList> +<itemList> +<mainItem mainattrib1='single'> +<name>single_main_item</name> +<description>whatever etc.</description> +<infoList> +<itemInfo a1='666' a2='xxx'> +<name>singleItem1</name> +</itemInfo> +</infoList> +</mainItem> +</itemList> +</replyBlock> +</Body> +</Envelope>''' + + x = parseSOAP(data) + # print ".>",x.replyBlock.itemList._ns + y = buildSOAP(x) + + def testEnvelope1(self): + my_xml2 = ''' +<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<SOAP-ENV:Header> +<t:Transaction xmlns:t="some-URI" SOAP-ENV:mustUnderstand="1"> +5 +</t:Transaction> +</SOAP-ENV:Header> + <SOAP-ENV:Body> + <m:GetLastTradePriceResponse xmlns:m="Some-URI"> + <PriceAndVolume> + <LastTradePrice> + 34.5 + </LastTradePrice> + <DayVolume> + 10000 + </DayVolume> + </PriceAndVolume> + </m:GetLastTradePriceResponse> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + (x,h) = parseSOAPRPC(my_xml2,header=1) + + def testEnvelope2(self): + x =''' +<V:Envelope + 
xmlns:V="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:C="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:i="http://www.w3.org/1999/XMLSchema-instance" + xmlns:d="http://www.w3.org/1999/XMLSchema" + V:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<V:Body> +<m:echoStructArray + xmlns:m="urn:xmethodsInterop"> +<inputStructArray + i:type="C:Array" + C:arrayType="ns3:SOAPStruct[0]" + xmlns:ns3="http://soapinterop.org/xsd"/> +</m:echoStructArray> +</V:Body> +</V:Envelope>''' + x = parseSOAPRPC(x) + + def testEnvelope3(self): + x = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"> +<SOAP-ENV:Body> +<m:echoStringResponse xmlns:m="http://soapinterop.org/"> +<Result name="fred">hello</Result> +</m:echoStringResponse> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + x, a = parseSOAPRPC(x, attrs = 1) + if config.simplify_objects: + self.assertEquals(a[id(x['Result'])][(None, 'name')], 'fred') + else: + self.assertEquals(a[id(x.Result)][(None, 'name')], 'fred') + + def testParseException(self): + x='''<SOAP-ENV:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" SOAP-ENV:encodingStyle="http://schemas.microsoft.com/soap/encoding/clr/1.0 http://schemas.xmlsoap.org/soap/encoding/" xmlns:a1="http://schemas.microsoft.com/clr/ns/System.Runtime.Serialization.Formatters"> +<SOAP-ENV:Body> +<SOAP-ENV:Fault id="ref-1"> +<faultcode id="ref-2">SOAP-ENV:Server</faultcode> +<faultstring id="ref-3">Exception thrown on Server</faultstring> +<detail xsi:type="a1:ServerFault"> + +<exceptionType id="ref-4">System.Runtime.Serialization.SerializationException, mscorlib, Version=1.0.2411.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</exceptionType> + 
+<message id="ref-5">Soap Parser Error System.Runtime.Serialization.SerializationException: Parse Error, xsd type not valid: Array + at System.Runtime.Serialization.Formatters.Soap.SoapHandler.ProcessGetType(String value, String xmlKey) + at System.Runtime.Serialization.Formatters.Soap.SoapHandler.ProcessType(ParseRecord pr, ParseRecord objectPr) + at System.Runtime.Serialization.Formatters.Soap.SoapHandler.ProcessAttributes(ParseRecord pr, ParseRecord objectPr) + at System.Runtime.Serialization.Formatters.Soap.SoapHandler.StartElement(String prefix, String name, String urn) + at System.XML.XmlParser.ParseElement() + at System.XML.XmlParser.ParseTag() + at System.XML.XmlParser.Parse() + at System.XML.XmlParser.Parse0() + at System.XML.XmlParser.Run()</message> + +<stackTrace id="ref-6"> at System.Runtime.Serialization.Formatters.Soap.SoapHandler.Error(IXmlProcessor p, Exception ex) + at System.XML.XmlParser.Run() + at System.Runtime.Serialization.Formatters.Soap.SoapParser.Run() + at System.Runtime.Serialization.Formatters.Soap.ObjectReader.Deserialize(HeaderHandler handler, ISerParser serParser) + at System.Runtime.Serialization.Formatters.Soap.SoapFormatter.Deserialize(Stream serializationStream, HeaderHandler handler) + at System.Runtime.Remoting.Channels.CoreChannel.DeserializeMessage(String mimeType, Stream xstm, Boolean methodRequest, IMessage msg, Header[] h) + at System.Runtime.Remoting.Channels.SoapServerFormatterSink.ProcessMessage(IServerChannelSinkStack sinkStack, ITransportHeaders requestHeaders, Stream requestStream, IMessage& msg, ITransportHeaders& responseHeaders, Stream& responseStream)</stackTrace> +</detail> + +</SOAP-ENV:Fault> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + + z = parseSOAPRPC(x) + self.assertEquals(z.__class__,faultType) + self.assertEquals(z.faultstring, "Exception thrown on Server") + + + def testFlatEnvelope(self): + x = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<SOAP-ENV:Envelope 
SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"><SOAP-ENV:Body><m:echoStringResponse xmlns:m="http://soapinterop.org/"><Result></Result></m:echoStringResponse></SOAP-ENV:Body></SOAP-ENV:Envelope> +''' + z = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals(type(z['Result']), type('')) + else: + self.assertEquals(type(z.Result), type('')) + + def testNumericArray(self): + x = [1,2,3,4,5] + y = buildSOAP(x) + z = parseSOAPRPC(y) + self.assertEquals(x, z) + + def testStringArray(self): + x = ["cayce", "asd", "buy"] + y = buildSOAP(x) + z = parseSOAPRPC(y) + self.assertEquals(x, z) + + def testStringArray1(self): + x = arrayType(['a', 'b', 'c']) + y = buildSOAP(x) + z = parseSOAP(y) + if config.simplify_objects: + self.assertEquals(z.v1._elemsname, 'item') + self.assertEquals(z.v1, x) + else: + self.assertEquals(z['v1']['_elemsname'], 'item') + self.assertEquals(z['v1'], x) + + def testStringArray2(self): + x = arrayType(['d', 'e', 'f'], elemsname = 'elementals') + y = buildSOAP(x) + z = parseSOAP(y) + if config.simplify_objects: + self.assertEquals(z.v1._elemsname, 'elementals') + self.assertEquals(z.v1, x) + else: + self.assertEquals(z['v1']['_elemsname'], 'elementals') + self.assertEquals(z['v1'], x) + + def testInt1(self): + my_xml = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <m:getStateName xmlns:m="http://www.soapware.org/"> + <statenum xsi:type="xsd:int">41</statenum> + </m:getStateName> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml) + if config.simplify_objects: + self.assertEquals(s['statenum'], 41) + self.assertEquals(type(s['statenum']), type(0)) + else: + 
self.assertEquals(s.statenum, 41) + self.assertEquals(type(s.statenum), type(0)) + + def testInt2(self): + my_xml_ns = ''' +<XSOAP-ENV:Envelope XSOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:XSOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:XSOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:Xxsd="http://www.w3.org/1999/XMLSchema" xmlns:Xxsi="http://www.w3.org/1999/XMLSchema-instance"> + <XSOAP-ENV:Body> + <m:getStateName xmlns:m="http://www.soapware.org/"> + <statenum Xxsi:type="Xxsd:int">41</statenum> + </m:getStateName> + </XSOAP-ENV:Body> +</XSOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml_ns) + if config.simplify_objects: + self.assertEquals(s['statenum'], 41, "NS one failed") + self.assertEquals(type(s['statenum']), type(0)) + else: + self.assertEquals(s.statenum, 41, "NS one failed") + self.assertEquals(type(s.statenum), type(0)) + + def testPriceAndVolume(self): + my_xml2 = ''' +<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> +<SOAP-ENV:Header> +<t:Transaction xmlns:t="some-URI" SOAP-ENV:mustUnderstand="1"> +5 +</t:Transaction> +</SOAP-ENV:Header> + <SOAP-ENV:Body> + <m:GetLastTradePriceResponse xmlns:m="Some-URI"> + <PriceAndVolume> + <LastTradePrice> + 34.5 + </LastTradePrice> + <DayVolume> + 10000 + </DayVolume> + </PriceAndVolume> + </m:GetLastTradePriceResponse> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml2) + if config.simplify_objects: + self.assertEquals(s['PriceAndVolume']['LastTradePrice'].strip(), "34.5") + self.assertEquals(s['PriceAndVolume']['DayVolume'].strip(), "10000") + else: + self.assertEquals(s.PriceAndVolume.LastTradePrice.strip(), "34.5") + self.assertEquals(s.PriceAndVolume.DayVolume.strip(), "10000") + + def testInt3(self): + my_xml3 = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" 
xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <Bounds> + <param> + <lowerBound xsi:type="xsd:int"> 18 </lowerBound> + <upperBound xsi:type="xsd:int"> 139</upperBound> + </param> + </Bounds> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml3) + + if config.simplify_objects: + self.assertEquals(s['param']['lowerBound'], 18) + self.assertEquals(s['param']['upperBound'], 139) + else: + self.assertEquals(s.param.lowerBound, 18) + self.assertEquals(s.param.upperBound, 139) + + def testBoolean(self): + my_xml4 = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <Bounds> +<param SOAP-ENC:arrayType="xsd:ur-type[4]" xsi:type="SOAP-ENC:Array"><item xsi:type="xsd:int">12</item> + <item xsi:type="xsd:string">Egypt</item> + <item xsi:type="xsd:boolean">0</item> + <item xsi:type="xsd:int">-31</item> + </param> + <param1 xsi:null="1"></param1> + <param2 xsi:null="true"></param2> + <param3 xsi:type="xsd:int" xsi:null="false">7</param3> + </Bounds> + </SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml4) + if config.simplify_objects: + self.assertEquals(s['param'][0], 12) + self.assertEquals(s['param'][1], "Egypt") + self.assertEquals(s['param'][2], 0) + self.assertEquals(s['param'][3], -31) + self.assertEquals(s['param1'], None) + self.assertEquals(s['param2'], None) + self.assertEquals(s['param3'], 7) + else: + self.assertEquals(s.param[0], 12) + self.assertEquals(s.param[1], "Egypt") + self.assertEquals(s.param[2], 0) + self.assertEquals(s.param[3], -31) + 
self.assertEquals(s.param1, None) + self.assertEquals(s.param2, None) + self.assertEquals(s.param3, 7) + + def testFault(self): + my_xml5 = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <SOAP-ENV:Fault> + <faultcode>SOAP-ENV:Client</faultcode> + <faultstring>Cant call getStateName because there are too many parameters.</faultstring> + </SOAP-ENV:Fault> + </SOAP-ENV:Body> + </SOAP-ENV:Envelope> +''' + s = parseSOAPRPC(my_xml5) + self.assertEquals(s.__class__, faultType) + self.assertEquals(s.faultcode, "SOAP-ENV:Client") + + def testArray2(self): + my_xml6 = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> +<SOAP-ENV:Body> +<h SOAP-ENC:arrayType="xsd:ur-type[6]" xsi:type="SOAP-ENC:Array"> +<item xsi:type="xsd:int">5</item> +<item xsi:type="xsd:int">3</item> +<item xsi:type="xsd:int">2</item> +<item xsi:type="xsd:string">monkey</item> +<item xsi:type="xsd:string">cay</item> +<item> +<cat xsi:type="xsd:string">hello</cat> +<ferret SOAP-ENC:arrayType="xsd:ur-type[6]" xsi:type="SOAP-ENC:Array"> +<item xsi:type="xsd:int">5</item> +<item xsi:type="xsd:int">4</item> +<item xsi:type="xsd:int">3</item> +<item xsi:type="xsd:int">2</item> +<item xsi:type="xsd:int">1</item> +<item> +<cow xsi:type="xsd:string">moose</cow> +</item> +</ferret> +<monkey xsi:type="xsd:int">5</monkey> +</item> +</h> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + q = parseSOAPRPC(my_xml6) + self.assertEquals(q[0], 5) + self.assertEquals(q[1], 3) + 
self.assertEquals(q[2], 2) + self.assertEquals(q[3], 'monkey') + self.assertEquals(q[4], 'cay') + x = q[5] + if config.simplify_objects: + self.assertEquals(x['monkey'], 5) + self.assertEquals(x['cat'], "hello") + self.assertEquals(x['ferret'][0], 5) + self.assertEquals(x['ferret'][3], 2) + self.assertEquals(x['ferret'][5]['cow'], "moose") + else: + self.assertEquals(x.monkey, 5) + self.assertEquals(x.cat, "hello") + self.assertEquals(x.ferret[0], 5) + self.assertEquals(x.ferret[3], 2) + self.assertEquals(x.ferret[5].cow, "moose") + + def testArray3(self): + x = arrayType([5,4,3,21], "spam") + y = buildSOAP(x) + z = parseSOAPRPC(y) + self.assertEquals(x, z) + + # test struct + def testStruct(self): + x = structType(name = "eggs") + x.test = 5 + y = buildSOAP(x) + z = parseSOAPRPC(y) + if config.simplify_objects: + self.assertEquals( x['test'], z['test'] ) + else: + self.assertEquals( x.test, z.test ) + + # test faults + def testFault1(self): + x = faultType("ServerError","Howdy",[5,4,3,2,1]) + y = buildSOAP(x) + + z = parseSOAPRPC(y) + self.assertEquals( x.faultcode , z.faultcode) + self.assertEquals( x.faultstring , z.faultstring) + self.assertEquals( x.detail , z.detail) + + # Test the recursion + def testRecursion(self): + o = one() + t = two() + o.t = t + t.o = o + tre = three() + tre.o = o + tre.t = t + x = buildSOAP(tre) + y = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals( y['t']['o']['t']['o']['t']['o']['t']['str'] , "two") + else: + self.assertEquals( y.t.o.t.o.t.o.t.str , "two") + + # Test the recursion with structs + def testRecursionWithStructs(self): + o = structType("one") + t = structType("two") + o.t = t + o.str = "one" + t.o = o + t.str = "two" + tre = structType("three") + tre.o = o + tre.t = t + tre.str = "three" + x = buildSOAP(tre) + y = parseSOAPRPC(x) + if config.simplify_objects: + self.assertEquals( y['t']['o']['t']['o']['t']['o']['t']['str'] , "two") + else: + self.assertEquals( y.t.o.t.o.t.o.t.str , "two") + + def 
testAmp(self): + m = "Test Message <tag> & </tag>" + x = structType("test") + x.msg = m + y = buildSOAP(x) + z = parseSOAPRPC(y) + if config.simplify_objects: + self.assertEquals( m , z['msg']) + else: + self.assertEquals( m , z.msg) + + def testInt4(self): + my_xml7 = ''' +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> +<SOAP-ENV:Body> +<Bounds> +<param> +<lowerBound xsi:type="xsd:int"> 18 </lowerBound> +<upperBound xsi:type="xsd:int"> 139</upperBound> +</param> +</Bounds> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + x = parseSOAPRPC(my_xml7) + y = buildSOAP(x) + + # Does buildSOAP require a valid encoding? + def testBuildSOAPEncoding(self): + try: + x = buildSOAP('hello', encoding = 'gleck') + except LookupError, e: + if str (e)[0:16] != 'unknown encoding': raise + x = None + except: + print "Got unexpected exception: %s %s" % tuple (sys.exc_info ()[0:2]) + x = '' + self.assertEquals( x , None) + + # Does SOAPProxy require a valid encoding? + def testSOAPProxyEncoding(self): + try: + x = SOAPProxy('', encoding = 'gleck') + except LookupError, e: + if str (e)[0:16] != 'unknown encoding': raise + x = None + except: + print "Got unexpected exception: %s %s" % tuple (sys.exc_info ()[0:2]) + x = '' + self.assertEquals( x , None) + + # Does SOAPServer require a valid encoding? 
+ def testSOAPServerEncoding(self): + try: + x = SOAPServer(('localhost', 0), encoding = 'gleck') + except LookupError, e: + if str (e)[0:16] != 'unknown encoding': raise + x = None + except: + print "Got unexpected exception: %s %s" % tuple (sys.exc_info ()[0:2]) + x = '' + self.assertEquals( x , None) + + def testEncodings(self): + encodings = ('US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16') + + tests = ('A', u'\u0041') + for t in tests: + for i in range (len (encodings)): + x = buildSOAP (t, encoding = encodings[i]) + y = parseSOAPRPC (x) + self.assertEquals( y , t) + + tests = (u'\u00a1',) + for t in tests: + for i in range (len (encodings)): + try: + x = buildSOAP (t, encoding = encodings[i]) + except: + if i > 0: raise + continue + y = parseSOAPRPC (x) + self.assertEquals( y , t) + + tests = (u'\u01a1', u'\u2342') + for t in tests: + for i in range (len (encodings)): + try: + x = buildSOAP (t, encoding = encodings[i]) + except: + if i > 1: raise + continue + y = parseSOAPRPC (x) + self.assertEquals( y , t) + + def build_xml(self, schema, type, value, attrs = ''): + return '''<?xml version="1.0" encoding="UTF-8"?> + <SOAP-ENV:Envelope + SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:xsd="%(schema)s" + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <_1 xsi:type="xsd:%(type)s"%(attrs)s>%(value)s</_1> + </SOAP-ENV:Body> + </SOAP-ENV:Envelope>''' % {'schema': schema, 'type': type, 'value': value, + 'attrs': attrs} + + # Make sure the various limits are checked when parsing + def testIntegerLimits(self): + for t, l in SOAPParser.intlimits.items(): + try: + parseSOAP(xml % (NS.XSD, t, 'hello')) + raise AssertionError, "parsed %s of 'hello' without error" % t + except AssertionError: + raise + except: + pass + + if l[1] != None: + try: + parseSOAP(self.build_xml(NS.XSD, t, l[1] - 1)) + raise AssertionError, "parsed %s of %s without error" % \ + (t, l[1] - 
1) + except AssertionError: + raise + except UnderflowError: + pass + + if l[2] != None: + try: + parseSOAP(self.build_xml(NS.XSD, t, l[2] + 1)) + raise AssertionError, "parsed %s of %s without error" % \ + (t, l[2] + 1) + except AssertionError: + raise + except OverflowError: + pass + + # Make sure the various limits are checked when parsing + # Next, floats. Note that chances are good this won't work in any non-Unix Pythons. + def testFloatLimits(self): + for i in \ + ( + ('float', '-3.402823466391E+38'), + ('float', '3.402823466391E+38'), + ('float', '3.5e+38'), + ('float', '6.9e-46'), + ('double', '-1.7976931348623159E+308'), + ('double', '1.7976931348623159E+308'), + ('double', '1.8e308'), + ('double', '2.4e-324'), + ): + try: + parseSOAP(self.build_xml(NS.XSD, i[0], i[1])) + + # Hide this error for now, cause it is a bug in python 2.0 and 2.1 + #if not (sys.version_info[0] == 2 and sys.version_info[1] <= 2) \ + # and i[1]=='1.7976931348623159E+308': + raise AssertionError, "parsed %s of %s without error" % i + except AssertionError: + raise + except (UnderflowError, OverflowError): + pass + + # Make sure we can't instantiate the base classes + def testCannotInstantiateBaseClasses(self): + for t in (anyType, NOTATIONType): + try: + x = t() + raise AssertionError, "instantiated %s directly" % repr(t) + except: + pass + + # Try everything that requires initial data without any. 
+ def testMustBeInitialized(self): + for t in (CDATAType, ENTITIESType, ENTITYType, IDType, IDREFType, + IDREFSType, NCNameType, NMTOKENType, NMTOKENSType, NOTATIONType, + NameType, QNameType, anyURIType, base64Type, base64BinaryType, + binaryType, booleanType, byteType, decimalType, doubleType, + durationType, floatType, hexBinaryType, intType, integerType, + languageType, longType, negative_IntegerType, negativeIntegerType, + non_Negative_IntegerType, non_Positive_IntegerType, + nonNegativeIntegerType, nonPositiveIntegerType, normalizedStringType, + positive_IntegerType, positiveIntegerType, shortType, stringType, + timeDurationType, tokenType, unsignedByteType, unsignedIntType, + unsignedLongType, unsignedShortType, untypedType, uriType, + uriReferenceType): + try: + t() + raise AssertionError, "instantiated a %s with no value" % t.__name__ + except AssertionError: + raise + except: + pass + + + def testInstantiations(self): + # string, ENTITY, ID, IDREF, language, Name, NCName, + # NMTOKEN, QName, untypedType + for t in (stringType, ENTITYType, IDType, IDREFType, + languageType, NameType, NCNameType, NMTOKENType, + QNameType, untypedType): + # First some things that shouldn't be taken as the current type + + test = (10, (), [], {}) + for i in test: + try: + t(i) + + raise AssertionError, \ + "instantiated a %s with a bad type (%s)" % \ + (repr(t), repr(type(i))) + except AssertionError: + raise + except: + pass + + # Now some things that should + + for i in ('hello', u'goodbye'): + x = t(i) + d = x._marshalData() + + if d != i: + raise AssertionError, "expected %s, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (i, z) + + # ENTITIES, IDREFS, NMTOKENS + for t in (ENTITIESType, IDREFSType, NMTOKENSType): + # First some things that shouldn't be taken as the current type + + test = ({}, lambda x: x, ((),), ([],), [{}], [()]) + + for i in test: + try: + t(i) + raise AssertionError, \ + 
"instantiated a %s with a bad type (%s)" % \ + repr(t), repr(type(i)) + except AssertionError: + raise + except: + pass + + # Now some things that should + + for i in ('hello', (), [], ('hello', 'goodbye'), ['aloha', 'guten_tag']): + x = t(i) + d = x._marshalData() + + if type(i) in (type(()), type([])): + j = list(i) + else: + j = [i] + k = ' '.join(j) + + if d != k: + raise AssertionError, "expected %s, got %s" % (k, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != j: + raise AssertionError, "expected %s, got %s" % (repr(j), repr(z)) + + # uri, uriReference, anyURI + for t in (uriType, uriReferenceType, anyURIType): + # First some things that shouldn't be taken as the current type + + test = (10, (), [], {}) + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad type (%s)" % \ + t.__name__, repr(type(i)) + except AssertionError: + raise + except: + pass + + # Now some things that should + + for i in ('hello', u'goodbye', '!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~'): + x = t(i) + d = x._marshalData() + + j = urllib.quote(i) + + if d != j: + raise AssertionError, "expected %s, got %s" % (j, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # token First some things that shouldn't be valid because of type + test = (42, 3.14, (), [], {}) + t = tokenType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad type (%s)" % (t.__name__, repr(i)) + except AssertionError: + raise + except AttributeError: + pass + + # Now some things that shouldn't be valid because of content + + test = (' hello', 'hello ', 'hel\nlo', 'hel\tlo', 'hel lo', ' \n \t ') + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should be valid + + for i in ('', 'hello', u'hello'): + x = t(i) 
+ d = x._marshalData() + + if d != i: + raise AssertionError, "expected %s, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i and i != '': + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + #### CDATA, normalizedString + + for t in (CDATAType, normalizedStringType): + # First some things that shouldn't be valid because of type + + test = (42, 3.14, (), [], {}) + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad type (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except AttributeError: + pass + + # Now some things that shouldn't be valid because of content + + test = ('hel\nlo', 'hel\rlo', 'hel\tlo', '\n\r\t') + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should be valid + + for i in ('', 'hello', u'hello', 'hel lo'): + x = t(i) + d = x._marshalData() + + if d != i: + raise AssertionError, "expected %s, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i and i != '': + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + #### boolean + + # First some things that shouldn't be valid + + test = (10, 'hello', (), [], {}) + t = booleanType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % (t.__name__, repr(i)) + except AssertionError: + raise + except: + pass + + # Now some things that should + + for i in ((0, 'false'), ('false', 'false'), (1, 'true'), + ('true', 'true'), (0.0, 'false'), (1.0, 'true')): + x = t(i[0]) + d = x._marshalData() + + if d != i[1]: + raise AssertionError, "%s: expected %s, got %s" % (i[0], i[1], d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + j = ('false', 'true')[z] + + if j != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], repr(i[1]), repr(j)) + + # Now 
test parsing, both valid and invalid + + test = (('10', None), ('hello', None), ('false', 0), ('FALSE', 0), + (ws + 'false' + ws, 0), (ws + '0' + ws, 0), + ('0', 0), ('true', 1), ('TRUE', 1), ('1', 1), + (ws + 'true' + ws, 1), (ws + '1' + ws, 1)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + # Can we give it a name and no type? + + #print + x = t(1, name = 'George', typed = 0) + #print "x=",x + y = buildSOAP(x) + #print "y=",y + z = parseSOAP(y) + #print "z=",z + + test = 'true' + if z.George != test: + raise AssertionError, "expected %s, got %s" % (repr(test), repr(z)) + + # How about some attributes, set in various and sundry manners? + + x = t(1, attrs = {'nonamespaceURI': 1}) + x._setAttrs({(None, 'NonenamespaceURI'): 2, + ('http://some/namespace', 'namespaceURIattr1'): 3}) + x._setAttr(('http://some/other/namespace', 'namespaceURIattr2'), 4) + + self.assertEquals( x._getAttr('nonamespaceURI') , 1) + self.assertEquals( x._getAttr('NonenamespaceURI') , 2) + self.assertEquals( x._getAttr(('http://some/namespace', + 'namespaceURIattr1')) , 3) + self.assertEquals( x._getAttr(('http://some/other/namespace', + 'namespaceURIattr2')) , 4) + self.assertEquals( x._getAttr('non-extant attr') , None) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + self.assertEquals( z , 1) + + #### decimal + + # First some things that shouldn't be valid + + test = ('hello', (), [], {}) + t = decimalType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad type (%s)" % \ + (t.__name__, repr(type(i))) + except AssertionError: + raise + except: + pass + + # Now some things that should + + for i in (10, 3.14, 23L): 
+ x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %f, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', None), ('1.2.3', None), ('10', 10), ('10.', 10), + ('.1', .1), ('.1000000', .1), (ws + '10.4' + ws, 10.4)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + #### float + + # First some things that shouldn't be valid + + test = ('hello', (), [], {}, -3.402823466391E+38, 3.402823466391E+38) + t = floatType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (10, 3.14, 23L, -3.4028234663852886E+38, 3.4028234663852886E+38): + x = t(i) + d = x._marshalData() + + if not nearlyeq(float(d), i): + raise AssertionError, "expected %f, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if not nearlyeq(z, i): + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', None), ('1.2.3', None), ('10', 10), ('10.', 10), + ('.1', .1), ('.1000000', .1), (ws + '10.4' + ws, 10.4), + ('-3.402823466391E+38', None), ('3.402823466391E+38', None), + ('-3.4028234663852886E+38', -3.4028234663852886E+38), + ('3.4028234663852886E+38', 3.4028234663852886E+38)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if abs(z - i[1]) > 1e-6: + 
raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + #### double + + # First some things that shouldn't be valid + + test = ('hello', (), [], {}, + -1.7976931348623159E+308, 1.7976931348623159E+308) + t = doubleType + + for i in test: + try: + t(i) + # Hide this error for now, cause it is a bug in python 2.0 and 2.1 + if not (sys.version_info[0] == 2 and sys.version_info[1] <= 2 + and i==1.7976931348623159E+308): + raise AssertionError, \ + "instantiated a double with a bad value (%s)" % repr(i) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (10, 3.14, 23L, -1.79769313486E+308, 1.79769313486E+308): + x = t(i) + d = x._marshalData() + + if not nearlyeq(float(d), i): + raise AssertionError, "expected %s, got %s" % (i, str(x)) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if not nearlyeq(z, i): + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', None), ('1.2.3', None), ('10', 10), ('10.', 10), + ('.1', .1), ('.1000000', .1), (ws + '10.4' + ws, 10.4), + ('-1.7976931348623159E+308', None), ('1.7976931348623158E+308', None), + ('-1.79769313486E+308', -1.79769313486E+308), + ('1.79769313486E+308', 1.79769313486E+308)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if abs(z - i[1]) > 1e-6: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + #### hexBinary + + x = '' + for i in range(256): + x += chr(i) + test = ('', x, 'hello') + t = 
hexBinaryType + + l = [] + for i in test: + l.append(hexBinaryType(i)) + + x = buildSOAP(l) + y = parseSOAPRPC(x) + + for i in range(len(test)): + if test[i] != y[i]: + raise AssertionError, "@ %d expected '%s', got '%s'" % \ + (i, test[i], y[i]) + + # Now test parsing, both valid and invalid + + test = (('hello', None), ('6163 747A65726F', None), ('6163747A65726', None), + ('6163747A65726F', 'actzero'), (ws + '6163747A65726F' + ws, 'actzero')) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + #### base64Binary and base64 + + s = '' + for i in range(256): + s += chr(i) + + for t in (base64BinaryType, base64Type): + # First some things that shouldn't be valid + + test = ((), [], {}, lambda x: x) + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except AttributeError: + pass + + # Now some things that should + + test = ('', s, u'hello') + + l = [] + for i in test: + l.append(t(i)) + + x = buildSOAP(l) + y = parseSOAPRPC(x) + + for i in range(len(test)): + if test[i] != y[i]: + raise AssertionError, "@ %d expected '%s', got '%s'" % \ + (i, test[i], y[i]) + + # Now test parsing, both valid and invalid + + test = (('hello', None), ('YWN0emVybw=', None), + ('YWN 0emVybw==', 'actzero'), ('YWN0emVybw==', 'actzero'), + (ws + 'YWN0emVybw==' + ws, 'actzero')) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise 
AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + #### binary (uses s from above) + + # First some check invalid encodings + + try: + x = binaryType('hello', encoding = 'yellow') + raise AssertionError, "created binary with invalid encoding" + except AssertionError: + raise + except: + pass + + for t in ('hex', 'base64'): + # First some things that shouldn't be valid + + test = ((), [], {}, lambda x: x) + + for i in test: + try: + binaryType(i, encoding = t) + raise AssertionError, \ + "instantiated a %s binary with a bad value (%s)" % \ + (e, repr(i)) + except AssertionError: + raise + except AttributeError: + pass + + # Now some things that should + + test = ('', s, u'hello') + + l = [] + for i in test: + l.append(binaryType(i, encoding = t)) + + x = buildSOAP(l) + y = parseSOAPRPC(x) + + for i in range(len(test)): + if test[i] != y[i]: + raise AssertionError, "@ %d expected '%s', got '%s'" % \ + (i, test[i], y[i]) + + # Now test parsing, both valid and invalid + + if t == 'hex': + test = (('hello', None), ('6163 747A65726F', None), + ('6163747A65726', None), ('6163747A65726F', 'actzero'), + (ws + '6163747A65726F' + ws, 'actzero')) + else: + test = (('hello', None), ('YWN0emVybw=', None), + ('YWN 0emVybw==', 'actzero'), ('YWN0emVybw==', 'actzero'), + (ws + 'YWN0emVybw==' + ws, 'actzero')) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(NS.XSD, 'binary', i[0], + ' encoding="%s"' % t)) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != None: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t, sys.exc_info()[0], sys.exc_info()[1]) + + # Finally try an Array of binaries (with references!) 
+ + test = ('', s, u'hello') + + l = [] + for i in test: + l.append(binaryType(i, encoding = t)) + + l.append(l[1]) + x = buildSOAP(l) + y = parseSOAPRPC(x) + + for i in range(len(test)): + if test[i] != y[i]: + raise AssertionError, "@ %d expected '%s', got '%s'" % \ + (i, test[i], y[i]) + + # Make sure the references worked + + self.assertEquals( id(y[1]) , id(y[3])) + + def badTest(self, t, data): + for i in data: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except: + pass + + def goodTest(self, t, data): + for i in data: + x = t(i[0]) + d = x._marshalData() + + if d != i[1]: + raise AssertionError, "%s(%s): expected %s, got %s" % \ + (t.__name__, repr(i[0]), i[1], d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i[2]: + raise AssertionError, "%s(%s): expected %s, got %s" % \ + (t.__name__, repr(i[0]), repr(i[2]), repr(z)) + + def parseTest(self, t, data): + for i in data: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], + i[0])) + + if z != i[1]: + raise AssertionError, "%s(%s): expected %s, got %s" % \ + (t.__name__, repr(i[0]), i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def allTests(self, t, baddata, gooddata, parsedata): + self.badTest(t, baddata) + self.goodTest(t, gooddata) + self.parseTest(t, parsedata) + + # duration and timeDuration + def testTimeDuration(self): + baddata = \ + ( + 'hello', + ('hello',), + (-10, -10), + (-10, 0, -10), + (10.5, 10.5), + (0, 10.5, 0, 10.5, 0), + (1, 2, 3, 4, 5, 6, 7), + (1, 2, 'hello', 4, 5, 6), + (1, 2, 3.5, 4, 5, 6), + ) + gooddata = \ + ( + (0, 'PT0S', (N, N, N, N, N, 0.0,)), + ((), 'PT0S', (N, N, N, N, N, 0.0,)), + ([], 'PT0S', (N, N, N, N, N, 0.0,)), + ((0.5,), 'PT0.5S', (N, N, N, N, N, 0.5,)), + (10L, 'PT10S', (N, N, 
N, N, N, 10.0,)), + (-10, '-PT10S', (N, N, N, N, N, -10.0,)), + (10.5, 'PT10.5S', (N, N, N, N, N, 10.5,)), + ((10L, 20), 'PT10M20S', (N, N, N, N, 10, 20.0)), + ((-10, 20), '-PT10M20S', (N, N, N, N, -10, 20.0)), + ((10, 0), 'PT10M', (N, N, N, N, 10, N)), + ((10, 0, 0), 'PT10H', (N, N, N, 10, N, N)), + ((10, 0L, 0, 0), 'P10D', (N, N, 10, N, N, N)), + ((10, 0, 0, 0, 0), 'P10M', (N, 10, N, N, N, N)), + ((10, 0, 0, 0L, 0, 0), 'P10Y', (10, N, N, N, N, N)), + ((-10, 0, 0, 0, 0, 0), '-P10Y', (-10, N, N, N, N, N)), + ((10, 0, 0, 0, 0, 20L), 'P10YT20S', (10, N, N, N, N, 20.0,)), + ((1, 2, 3, 4, 5, 6.75), 'P1Y2M3DT4H5M6.75S', + (1, 2, 3, 4, 5, 6.75)), + ((-1, 2, 3, 4, 5, 6.75), '-P1Y2M3DT4H5M6.75S', + (-1, 2, 3, 4, 5, 6.75)), + ((1, 2, 3, 10, 30, 0), 'P1Y2M3DT10H30M', + (1, 2, 3, 10, 30, N)), + ((1e6, 2e6, 3e6, 4e6, 5e6, 6.7e6), + 'P1000000Y2000000M3000000DT4000000H5000000M6700000S', + (1e6, 2e6, 3e6, 4e6, 5e6, 6.7e6)), + ((1347, 0, N, 0, 0), 'P1347M', (N, 1347, N, N, N, N)), + ((-1347, 0, 0, 0, N), '-P1347M', (N, -1347, N, N, N, N)), + ((1e15, 0, 0, 0, 0), 'P1000000000000000M', + (N, 1000000000000000L, N, N, N, N)), + ((-1e15, 0, 0, 0, 0), '-P1000000000000000M', + (N, -1000000000000000L, N, N, N, N)), + ((1000000000000000L, 0, 0, 0, 0), 'P1000000000000000M', + (N, 1000000000000000L, N, N, N, N)), + ((-1000000000000000L, 0, 0, 0, 0), '-P1000000000000000M', + (N, -1000000000000000L, N, N, N, N)), + ) + parsedata = ( + ('hello', N), + ('P T0S', N), + ('P10.5Y10.5M', N), + ('P1Y2MT', N), + ('PT0S', (N, N, N, N, N, 0,)), + ('P10Y', (10, N, N, N, N, N)), + (ws + 'P10M' + ws, (N, 10, N, N, N, N)), + ('P0Y1347M', (0, 1347, N, N, N, N)), + ('P0Y1347M0D', (0, 1347, 0, N, N, N)), + ('P0MT0M', (N, 0, N, N, 0, N)), + ) + + for t in (durationType, timeDurationType): + self.allTests(t, baddata, gooddata, parsedata) + + # dateTime, timeInstant, and timePeriod + def testTimePeriod(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2, 3, 4, 5), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (1, 2, 
3, 4, 5, 'hello'), + (1, 2.5, 3, 4, 5, 6), + (1, 0, 3, 4, 5, 6), + (1, 13, 3, 4, 5, 6), + (1, 1, 0, 4, 5, 6), + (1, 1, 32, 4, 5, 6), + (1, 2, 29, 4, 5, 6), + (0, 2, 30, 4, 5, 6), + (100, 2, 29, 4, 5, 6), + (1, 2, 3, -1, 5, 6), + (1, 2, 3, 24, 5, 6), + (1, 2, 3, 4, -1, 6), + (1, 2, 3, 4, 60, 6), + (1, 2, 3, 4, 5, -1), + (1, 2, 3, 4, 5, 61), + (1, 3, 32, 4, 5, 6), + (1, 4, 31, 4, 5, 6), + (1, 5, 32, 4, 5, 6), + (1, 6, 31, 4, 5, 6), + (1, 7, 32, 4, 5, 6), + (1, 8, 32, 4, 5, 6), + (1, 9, 31, 4, 5, 6), + (1, 10, 32, 4, 5, 6), + (1, 11, 31, 4, 5, 6), + (1, 12, 32, 4, 5, 6), + ) + gooddata = \ + ( + (1L, '1970-01-01T00:00:01Z', (1970, 1, 1, 0, 0, 1.0)), + (1.5, '1970-01-01T00:00:01.5Z', (1970, 1, 1, 0, 0, 1.5)), + ((-1, 2, 3, 4, 5, 6), '-0001-02-03T04:05:06Z', + (-1, 2, 3, 4, 5, 6.0)), + ((1, 2, 3, 4, 5, 6), '0001-02-03T04:05:06Z', + (1, 2, 3, 4, 5, 6.0)), + ((10, 2, 3, 4, 5, 6), '0010-02-03T04:05:06Z', + (10, 2, 3, 4, 5, 6.0)), + ((100, 2, 3, 4, 5, 6), '0100-02-03T04:05:06Z', + (100, 2, 3, 4, 5, 6.0)), + ((1970, 2, 3, 4, 5, 6), '1970-02-03T04:05:06Z', + (1970, 2, 3, 4, 5, 6.0)), + ((-1970, 2, 3, 4, 5, 6), '-1970-02-03T04:05:06Z', + (-1970, 2, 3, 4, 5, 6.0)), + ((1970L, 2.0, 3.0, 4L, 5L, 6.875), '1970-02-03T04:05:06.875Z', + (1970, 2, 3, 4, 5, 6.875)), + ((11990, 1, 2, 3, 4L, 5.25, 0, 0, 0), + '11990-01-02T03:04:05.25Z', + (11990, 1, 2, 3, 4, 5.25)), + ((1e15, 1, 2, 3, 4L, 5.25, 0, 0, 0), + '1000000000000000-01-02T03:04:05.25Z', + (1e15, 1, 2, 3, 4, 5.25)), + ((-1e15, 1, 2, 3, 4L, 5.25, 0, 0, 0), + '-1000000000000000-01-02T03:04:05.25Z', + (-1e15, 1, 2, 3, 4, 5.25)), + ((1000000000000000L, 1, 2, 3, 4L, 5.25, 0, 0, 0), + '1000000000000000-01-02T03:04:05.25Z', + (1e15, 1, 2, 3, 4, 5.25)), + ((-1000000000000000L, 1, 2, 3, 4L, 5.25, 0, 0, 0), + '-1000000000000000-01-02T03:04:05.25Z', + (-1e15, 1, 2, 3, 4, 5.25)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('1970 -01 -01T00:00:01Z', N), + ('0001-02-03t07:08:23Z', N), + + # Invalid ranges + ('2001-00-03T07:08:23Z', N), + ('2001-13-03T07:08:23Z', N), + ('2001-02-00T07:08:23Z', N), + ('2001-02-29T07:08:23Z', N), + ('2000-02-30T07:08:23Z', N), + ('1900-02-29T07:08:23Z', N), + ('2001-02-03T24:08:23Z', N), + ('2001-02-03T04:60:23Z', N), + ('2001-02-03T04:05:61Z', N), + ('2001-01-32T04:05:06Z', N), + ('2001-03-32T04:05:06Z', N), + ('2001-04-31T04:05:06Z', N), + ('2001-05-32T04:05:06Z', N), + ('2001-06-31T04:05:06Z', N), + ('2001-07-32T04:05:06Z', N), + ('2001-08-32T04:05:06Z', N), + ('2001-09-31T04:05:06Z', N), + ('2001-10-32T04:05:06Z', N), + ('2001-11-31T04:05:06Z', N), + ('2001-12-32T04:05:06Z', N), + + # Whitespace + (ws + '1970-01-01T00:00:00Z' + ws, (1970, 1, 1, 0, 0, 0)), + + # No timezones + ('11971-02-03T04:05:06.125', (11971, 2, 3, 4, 5, 6.125)), + ('1971-02-03T04:05:06.125', (1971, 2, 3, 4, 5, 6.125)), + ('-1971-02-03T04:05:06.125', (-1971, 2, 3, 4, 5, 6.125)), + + # Non-zulu + ('11971-02-03T04:05:06.125-07:08', (11971, 2, 3, 11, 13, 6.125)), + ('11971-02-03T04:05:06.125+07:08', (11971, 2, 2, 20, 57, 6.125)), + ('-11971-02-03T04:05:06.125-07:08', (-11971, 2, 3, 11, 13, 6.125)), + ('-11971-02-03T04:05:06.125+07:08', (-11971, 2, 2, 20, 57, 6.125)), + ('1971-02-03T04:05:06.125-07:08', (1971, 2, 3, 11, 13, 6.125)), + ('1971-02-03T04:05:06.125+07:08', (1971, 2, 2, 20, 57, 6.125)), + ('-1971-02-03T04:05:06.125-07:08', (-1971, 2, 3, 11, 13, 6.125)), + ('-1971-02-03T04:05:06.125+07:08', (-1971, 2, 2, 20, 57, 6.125)), + + # Edgepoints (ranges) + ('2001-01-03T07:08:09Z', (2001, 1, 3, 7, 8, 9)), + ('2001-12-03T07:08:09Z', (2001, 12, 3, 7, 8, 9)), + ('2001-02-01T07:08:09Z', (2001, 2, 1, 7, 8, 9)), + ('2001-02-28T07:08:09Z', (2001, 2, 28, 7, 8, 9)), + ('2000-02-29T07:08:09Z', (2000, 2, 29, 7, 8, 9)), + ('1900-02-28T07:08:09Z', (1900, 2, 28, 7, 8, 9)), + ('2001-02-03T00:08:09Z', (2001, 2, 3, 0, 8, 9)), + ('2001-02-03T23:08:09Z', (2001, 2, 3, 23, 8, 9)), + 
('2001-02-03T04:00:09Z', (2001, 2, 3, 4, 0, 9)), + ('2001-02-03T04:59:09Z', (2001, 2, 3, 4, 59, 9)), + ('2001-02-03T04:05:00Z', (2001, 2, 3, 4, 5, 0)), + ('2001-02-03T04:05:60.9Z', (2001, 2, 3, 4, 5, 60.9)), + ('2001-01-31T04:05:06Z', (2001, 1, 31, 4, 5, 6)), + ('2001-03-31T04:05:06Z', (2001, 3, 31, 4, 5, 6)), + ('2001-04-30T04:05:06Z', (2001, 4, 30, 4, 5, 6)), + ('2001-05-31T04:05:06Z', (2001, 5, 31, 4, 5, 6)), + ('2001-06-30T04:05:06Z', (2001, 6, 30, 4, 5, 6)), + ('2001-07-31T04:05:06Z', (2001, 7, 31, 4, 5, 6)), + ('2001-08-31T04:05:06Z', (2001, 8, 31, 4, 5, 6)), + ('2001-09-30T04:05:06Z', (2001, 9, 30, 4, 5, 6)), + ('2001-10-31T04:05:06Z', (2001, 10, 31, 4, 5, 6)), + ('2001-11-30T04:05:06Z', (2001, 11, 30, 4, 5, 6)), + ('2001-12-31T04:05:06Z', (2001, 12, 31, 4, 5, 6)), + + # Edgepoints (crossing boundaries) + ('0001-01-01T07:08:23+07:08', (1, 1, 1, 0, 0, 23)), + ('0001-01-01T07:07:42+07:08', (0, 12, 31, 23, 59, 42)), + ('-0004-01-01T07:07:42+07:08', (-5, 12, 31, 23, 59, 42)), + ('2001-03-01T07:07:42+07:08', (2001, 2, 28, 23, 59, 42)), + ('2000-03-01T07:07:42+07:08', (2000, 2, 29, 23, 59, 42)), + ('1900-03-01T07:07:42+07:08', (1900, 2, 28, 23, 59, 42)), + ) + + for t in (dateTimeType, timeInstantType, timePeriodType): + self.allTests(t, baddata, gooddata, parsedata) + + # recurringInstant + def testRecurringInstant(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2, N, 3, 4, 5), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (1, 2, 3, 4, 5, 'hello'), + (1, 2, 3.5, 4, 5, 6), + ) + gooddata = \ + ( + (1L, '1970-01-01T00:00:01Z', (1970, 1, 1, 0, 0, 1.0)), + (1.5, '1970-01-01T00:00:01.5Z', (1970, 1, 1, 0, 0, 1.5)), + (1e9, '2001-09-09T01:46:40Z', (2001, 9, 9, 1, 46, 40.0)), + ((1, 1, 2, 3, 4, 5), '-01-01-02T03:04:05Z', + (1, 1, 2, 3, 4, 5)), + ((-1, 1, 2, 3, 4, 5), '--01-01-02T03:04:05Z', + (-1, 1, 2, 3, 4, 5)), + ((10, 1, 2, 3, 4, 5), '-10-01-02T03:04:05Z', + (10, 1, 2, 3, 4, 5)), + ((-10, 1, 2, 3, 4, 5), '--10-01-02T03:04:05Z', + (-10, 1, 2, 3, 4, 5)), + ((100, 1, 2, 
3, 4, 5), '0100-01-02T03:04:05Z', + (100, 1, 2, 3, 4, 5)), + ((-100, 1, 2, 3, 4, 5), '-0100-01-02T03:04:05Z', + (-100, 1, 2, 3, 4, 5)), + ((1970L, 1, 2, 3, 4, 5), '1970-01-02T03:04:05Z', + (1970, 1, 2, 3, 4, 5)), + ((1970L, 1, 2L, 3, 4.0, 5.25), '1970-01-02T03:04:05.25Z', + (1970, 1, 2, 3, 4, 5.25)), + ((11990, 1, 2, 3L, 4, 5.25), '11990-01-02T03:04:05.25Z', + (11990, 1, 2, 3, 4, 5.25)), + ((1e15, 1, 2, 3L, 4, 5.25), + '1000000000000000-01-02T03:04:05.25Z', + (1e15, 1, 2, 3, 4, 5.25)), + ((-1e15, 1, 2, 3L, 4, 5.25), + '-1000000000000000-01-02T03:04:05.25Z', + (-1e15, 1, 2, 3, 4, 5.25)), + ((N, 1, 2, 3, 4L, 5.25), '---01-02T03:04:05.25Z', + (N, 1, 2, 3, 4, 5.25)), + ((N, N, 2, 3, 4, 5.25, 0, 0, 0), '-----02T03:04:05.25Z', + (N, N, 2, 3, 4, 5.25)), + ((N, N, -2, 3, 4, 5.25, 0, 0, 0), '------02T03:04:05.25Z', + (N, N, -2, 3, 4, 5.25)), + ((N, N, N, 3, 4, 5.25), '------T03:04:05.25Z', + (N, N, N, 3, 4, 5.25)), + ((N, N, N, N, 4, 5.25, 0, 0, 0), '------T-:04:05.25Z', + (N, N, N, N, 4, 5.25)), + ((N, N, N, N, N, 5.25), '------T-:-:05.25Z', + (N, N, N, N, N, 5.25)), + ((N, N, N, N, N, -5.25), '-------T-:-:05.25Z', + (N, N, N, N, N, -5.25)), + ((N, N, N, N, N, N, 0, 0, 0), '------T-:-:-Z', + (N, N, N, N, N, N)), + ((N, N, N, N, N, N, N), '------T-:-:-Z', + (N, N, N, N, N, N)), + ((N, N, N, N, N, N, N, N), + '------T-:-:-Z', (N, N, N, N, N, N)), + ((N, N, N, N, N, N, N, N, N), + '------T-:-:-Z', (N, N, N, N, N, N)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('1970 -01 -01T00:00:01Z', N), + ('0001-01-01t07:08:23+07:08', N), + + # Invalid ranges + ('2001-00-03T07:08:23Z', N), + ('2001-13-03T07:08:23Z', N), + ('2001-02-00T07:08:23Z', N), + ('2001-02-29T07:08:23Z', N), + ('2000-02-30T07:08:23Z', N), + ('1900-02-29T07:08:23Z', N), + ('2001-02-03T24:08:23Z', N), + ('2001-02-03T04:60:23Z', N), + ('2001-02-03T04:05:61Z', N), + ('2001-01-32T04:05:06Z', N), + ('2001-03-32T04:05:06Z', N), + ('2001-04-31T04:05:06Z', N), + ('2001-05-32T04:05:06Z', N), + ('2001-06-31T04:05:06Z', N), + ('2001-07-32T04:05:06Z', N), + ('2001-08-32T04:05:06Z', N), + ('2001-09-31T04:05:06Z', N), + ('2001-10-32T04:05:06Z', N), + ('2001-11-31T04:05:06Z', N), + ('2001-12-32T04:05:06Z', N), + + # Whitespace + (ws + '1970-01-01T00:00:01Z' + ws, (1970, 1, 1, 0, 0, 1)), + + # No timezones + ('11971-02-03T04:05:06.125', (11971, 2, 3, 4, 5, 6.125)), + ('-11971-02-03T04:05:06.125', (-11971, 2, 3, 4, 5, 6.125)), + ('1971-02-03T04:05:06.125', (1971, 2, 3, 4, 5, 6.125)), + ('-1971-02-03T04:05:06.125', (-1971, 2, 3, 4, 5, 6.125)), + ('-71-02-03T04:05:06.125', (71, 2, 3, 4, 5, 6.125)), + ('--71-02-03T04:05:06.125', (-71, 2, 3, 4, 5, 6.125)), + ('---02-03T04:05:06.125', (N, 2, 3, 4, 5, 6.125)), + ('----02-03T04:05:06.125', (N, -2, 3, 4, 5, 6.125)), + ('-----03T04:05:06.125', (N, N, 3, 4, 5, 6.125)), + ('------03T04:05:06.125', (N, N, -3, 4, 5, 6.125)), + ('------T04:05:06.125', (N, N, N, 4, 5, 6.125)), + ('-------T04:05:06.125', (N, N, N, -4, 5, 6.125)), + ('------T-:05:06.125', (N, N, N, N, 5, 6.125)), + ('-------T-:05:06.125', (N, N, N, N, -5, 6.125)), + ('------T-:-:06.125', (N, N, N, N, N, 6.125)), + ('-------T-:-:06.125', (N, N, N, N, N, -6.125)), + ('------T-:-:-', (N, N, N, N, N, N)), + ('-------T-:-:-', (N, N, N, N, N, N)), + + # Non-zulu + ('11971-02-03T04:05:06.125-07:08', (11971, 2, 3, 11, 13, 6.125)), + ('11971-02-03T04:05:06.125+07:08', (11971, 2, 2, 20, 57, 6.125)), + ('-11971-02-03T04:05:06.125-07:08', (-11971, 2, 3, 11, 13, 6.125)), + 
('-11971-02-03T04:05:06.125+07:08', (-11971, 2, 2, 20, 57, 6.125)), + ('1971-02-03T04:05:06.125-07:08', (1971, 2, 3, 11, 13, 6.125)), + ('1971-02-03T04:05:06.125+07:08', (1971, 2, 2, 20, 57, 6.125)), + ('-1971-02-03T04:05:06.125-07:08', (-1971, 2, 3, 11, 13, 6.125)), + ('-1971-02-03T04:05:06.125+07:08', (-1971, 2, 2, 20, 57, 6.125)), + ('-71-02-03T04:05:06.125-07:08', (71, 2, 3, 11, 13, 6.125)), + ('-71-02-03T04:05:06.125+07:08', (71, 2, 2, 20, 57, 6.125)), + ('--71-02-03T04:05:06.125-07:08', (-71, 2, 3, 11, 13, 6.125)), + ('--71-02-03T04:05:06.125+07:08', (-71, 2, 2, 20, 57, 6.125)), + ('---02-03T04:05:06.125-07:08', (N, 2, 3, 11, 13, 6.125)), + ('---02-03T04:05:06.125+07:08', (N, 2, 2, 20, 57, 6.125)), + ('----02-03T04:05:06.125-07:08', (N, -2, 3, 11, 13, 6.125)), + ('----02-03T04:05:06.125+07:08', (N, -2, 2, 20, 57, 6.125)), + ('-----03T04:05:06.125-07:08', (N, N, 3, 11, 13, 6.125)), + ('-----03T04:05:06.125+07:08', (N, N, 2, 20, 57, 6.125)), + ('------03T04:05:06.125-07:08', (N, N, -3, 11, 13, 6.125)), + ('------03T04:05:06.125+07:08', (N, N, -4, 20, 57, 6.125)), + ('------T04:05:06.125-07:08', (N, N, N, 11, 13, 6.125)), + ('------T04:05:06.125+07:08', (N, N, N, -4, 57, 6.125)), + ('-------T04:05:06.125-07:08', (N, N, N, 3, 13, 6.125)), + ('-------T04:05:06.125+07:08', (N, N, N, -12, 57, 6.125)), + ('------T-:05:06.125-07:08', (N, N, N, N, 433, 6.125)), + ('------T-:05:06.125+07:08', (N, N, N, N, -423, 6.125)), + ('-------T-:05:06.125-07:08', (N, N, N, N, 423, 6.125)), + ('-------T-:05:06.125+07:08', (N, N, N, N, -433, 6.125)), + ('------T-:-:06.125-07:08', (N, N, N, N, 428, 6.125)), + ('------T-:-:06.125+07:08', (N, N, N, N, -428, 6.125)), + ('-------T-:-:06.125-07:08', (N, N, N, N, 427, 53.875)), + ('-------T-:-:06.125+07:08', (N, N, N, N, -429, 53.875)), + ('------T-:-:--07:08', (N, N, N, N, 428, 0)), + ('------T-:-:-+07:08', (N, N, N, N, -428, 0)), + ('-------T-:-:--07:08', (N, N, N, N, 428, 0)), + ('-------T-:-:-+07:08', (N, N, N, N, -428, 0)), + + # 
Edgepoints (ranges) + ('2001-01-03T07:08:09Z', (2001, 1, 3, 7, 8, 9)), + ('2001-12-03T07:08:09Z', (2001, 12, 3, 7, 8, 9)), + ('2001-02-01T07:08:09Z', (2001, 2, 1, 7, 8, 9)), + ('2001-02-28T07:08:09Z', (2001, 2, 28, 7, 8, 9)), + ('2000-02-29T07:08:09Z', (2000, 2, 29, 7, 8, 9)), + ('1900-02-28T07:08:09Z', (1900, 2, 28, 7, 8, 9)), + ('2001-02-03T00:08:09Z', (2001, 2, 3, 0, 8, 9)), + ('2001-02-03T23:08:09Z', (2001, 2, 3, 23, 8, 9)), + ('2001-02-03T04:00:09Z', (2001, 2, 3, 4, 0, 9)), + ('2001-02-03T04:59:09Z', (2001, 2, 3, 4, 59, 9)), + ('2001-02-03T04:05:00Z', (2001, 2, 3, 4, 5, 0)), + ('2001-02-03T04:05:60.9Z', (2001, 2, 3, 4, 5, 60.9)), + ('2001-01-31T04:05:06Z', (2001, 1, 31, 4, 5, 6)), + ('2001-03-31T04:05:06Z', (2001, 3, 31, 4, 5, 6)), + ('2001-04-30T04:05:06Z', (2001, 4, 30, 4, 5, 6)), + ('2001-05-31T04:05:06Z', (2001, 5, 31, 4, 5, 6)), + ('2001-06-30T04:05:06Z', (2001, 6, 30, 4, 5, 6)), + ('2001-07-31T04:05:06Z', (2001, 7, 31, 4, 5, 6)), + ('2001-08-31T04:05:06Z', (2001, 8, 31, 4, 5, 6)), + ('2001-09-30T04:05:06Z', (2001, 9, 30, 4, 5, 6)), + ('2001-10-31T04:05:06Z', (2001, 10, 31, 4, 5, 6)), + ('2001-11-30T04:05:06Z', (2001, 11, 30, 4, 5, 6)), + ('2001-12-31T04:05:06Z', (2001, 12, 31, 4, 5, 6)), + + # Edgepoints (crossing boundaries) + ('0001-01-01T07:08:23+07:08', (1, 1, 1, 0, 0, 23)), + ('0001-01-01T07:07:42+07:08', (0, 12, 31, 23, 59, 42)), + ('-0004-01-01T07:07:42+07:08', (-5, 12, 31, 23, 59, 42)), + ('2001-03-01T07:07:42+07:08', (2001, 2, 28, 23, 59, 42)), + ('2000-03-01T07:07:42+07:08', (2000, 2, 29, 23, 59, 42)), + ('1900-03-01T07:07:42+07:08', (1900, 2, 28, 23, 59, 42)), + ('---03-01T07:07:42+07:08', (N, 2, 28, 23, 59, 42)), + ) + + for t in (recurringInstantType,): + self.allTests(t, baddata, gooddata, parsedata) + + def testTime(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2, 3, 4, 5), + (1, 2, 3, 4, 5, 6, 7, 8), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (1, 2, 'hello'), + (1, 2.5, 3), + (25, 0, 0), + (1, 60, 0), + (1, 0, 61), + ) + gooddata = \ 
+ ( + (1L, '00:00:01Z', (0, 0, 1.0)), + (1.5, '00:00:01.5Z', (0, 0, 1.5)), + (3661.5, '01:01:01.5Z', (1, 1, 1.5)), + (86399.75, '23:59:59.75Z', (23, 59, 59.75)), + ((1,), '01:00:00Z', (1, 0, 0)), + ((1, 2), '01:02:00Z', (1, 2, 0)), + ((10L, 20.0, 30), '10:20:30Z', (10, 20, 30.0)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. + ('hello', N), + ('00 00:01Z', N), + ('07:O8:23Z', N), + + # Invalid ranges + ('24:08:23Z', N), + ('04:60:23Z', N), + ('04:05:61Z', N), + + # Whitespace + (ws + '00:00:01Z' + ws, (0, 0, 1)), + + # No timezones + ('04:05:06.125', (4, 5, 6.125)), + + # Non-zulu + ('04:05:06.125-07:08', (11, 13, 6.125)), + ('04:05:06.125+07:08', (-4, 57, 6.125)), + + # Edgepoints (ranges) + ('00:08:09Z', (0, 8, 9)), + ('23:08:09Z', (23, 8, 9)), + ('04:00:09Z', (4, 0, 9)), + ('04:59:09Z', (4, 59, 9)), + ('04:05:00Z', (4, 5, 0)), + ('04:05:60.9Z', (4, 5, 60.9)), + + # Edgepoints (crossing boundaries) + ('07:08:23+07:08', (0, 0, 23)), + ('07:07:42+07:08', (-1, 59, 42)), + ) + + for t in (timeType,): + self.allTests(t, baddata, gooddata, parsedata) + + def testDate(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2, 3, 4, 5), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (1, 2, 3, 4, 5, 'hello'), + (1, 2.5, 3, 4, 5, 6), + (1, 2, 3.5), + (1, 0, 3), + (1, 13, 3), + (1, 1, 0), + (1, 1, 32), + (1, 2, 29), + (0, 2, 30), + (100, 2, 29), + (1, 3, 32), + (1, 4, 31), + (1, 5, 32), + (1, 6, 31), + (1, 7, 32), + (1, 8, 32), + (1, 9, 31), + (1, 10, 32), + (1, 11, 31), + (1, 12, 32), + ) + gooddata = \ + ( + (1L, '1970-01-01Z', (1970, 1, 1)), + (1.5, '1970-01-01Z', (1970, 1, 1)), + ((2,), '0002-01-01Z', (2, 1, 1)), + ((2, 3), '0002-03-01Z', (2, 3, 1)), + ((-2, 3, 4), '-0002-03-04Z', (-2, 3, 4)), + ((2, 3, 4), '0002-03-04Z', (2, 3, 4)), + ((10, 2, 3), '0010-02-03Z', (10, 2, 3)), + ((100, 2, 3), '0100-02-03Z', (100, 2, 3)), + ((1970, 2, 3), '1970-02-03Z', (1970, 2, 3)), + ((-1970, 2, 3), '-1970-02-03Z', (-1970, 2, 3)), + ((1970L, 2.0, 3.0), '1970-02-03Z', (1970, 
2, 3)), + ((11990, 1L, 2), '11990-01-02Z', (11990, 1, 2)), + ((1e15, 1, 2), '1000000000000000-01-02Z', (1e15, 1, 2)), + ((-1e15, 1, 2), '-1000000000000000-01-02Z', (-1e15, 1, 2)), + ((1000000000000000L, 1, 2), '1000000000000000-01-02Z', + (1e15, 1, 2)), + ((-1000000000000000L, 1, 2), '-1000000000000000-01-02Z', + (-1e15, 1, 2)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. + ('hello', N), + ('1970 -01 -01Z', N), + ('0001-02-03z', N), + + # Invalid ranges + ('2001-00-03Z', N), + ('2001-13-03Z', N), + ('2001-02-00Z', N), + ('2001-02-29Z', N), + ('2000-02-30Z', N), + ('1900-02-29Z', N), + ('2001-01-32Z', N), + ('2001-03-32Z', N), + ('2001-04-31Z', N), + ('2001-05-32Z', N), + ('2001-06-31Z', N), + ('2001-07-32Z', N), + ('2001-08-32Z', N), + ('2001-09-31Z', N), + ('2001-10-32Z', N), + ('2001-11-31Z', N), + ('2001-12-32Z', N), + + # Whitespace + (ws + '1970-01-01Z' + ws, (1970, 1, 1)), + + # No timezones + ('11971-02-03', (11971, 2, 3)), + ('1971-02-03', (1971, 2, 3)), + ('-1971-02-03', (-1971, 2, 3)), + + # Non-zulu + ('11971-02-03-07:08', (11971, 2, 3)), + ('11971-02-03+07:08', (11971, 2, 2)), + ('-11971-02-03-07:08', (-11971, 2, 3)), + ('-11971-02-03+07:08', (-11971, 2, 2)), + ('1971-02-03-07:08', (1971, 2, 3)), + ('1971-02-03+07:08', (1971, 2, 2)), + ('-1971-02-03-07:08', (-1971, 2, 3)), + ('-1971-02-03+07:08', (-1971, 2, 2)), + + # Edgepoints (ranges) + ('2001-01-03Z', (2001, 1, 3)), + ('2001-12-03Z', (2001, 12, 3)), + ('2001-02-01Z', (2001, 2, 1)), + ('2001-02-28Z', (2001, 2, 28)), + ('2000-02-29Z', (2000, 2, 29)), + ('1900-02-28Z', (1900, 2, 28)), + ('2001-01-31Z', (2001, 1, 31)), + ('2001-03-31Z', (2001, 3, 31)), + ('2001-04-30Z', (2001, 4, 30)), + ('2001-05-31Z', (2001, 5, 31)), + ('2001-06-30Z', (2001, 6, 30)), + ('2001-07-31Z', (2001, 7, 31)), + ('2001-08-31Z', (2001, 8, 31)), + ('2001-09-30Z', (2001, 9, 30)), + ('2001-10-31Z', (2001, 10, 31)), + ('2001-11-30Z', (2001, 11, 30)), + ('2001-12-31Z', (2001, 12, 31)), + + # Edgepoints 
(crossing boundaries) + ('0001-01-01+07:08', (0, 12, 31)), + ('-0004-01-01+07:08', (-5, 12, 31)), + ('2001-03-01+07:08', (2001, 2, 28)), + ('2000-03-01+07:08', (2000, 2, 29)), + ('1900-03-01+07:08', (1900, 2, 28)), + ) + + for t in (dateType,): + self.allTests(t, baddata, gooddata, parsedata) + + def testGYearMonth(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2, 3), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (1, 2, 3.5), + (1, 'hello'), + (1, 2.5), + (1, 0), + (1, 13), + ) + gooddata = \ + ( + (1L, '1970-01Z', (1970, 1)), + (1.5, '1970-01Z', (1970, 1)), + ((2,), '0002-01Z', (2, 1)), + ((2, 3), '0002-03Z', (2, 3)), + ((-2, 3), '-0002-03Z', (-2, 3)), + ((10, 2), '0010-02Z', (10, 2)), + ((100, 2), '0100-02Z', (100, 2)), + ((1970, 2), '1970-02Z', (1970, 2)), + ((-1970, 2), '-1970-02Z', (-1970, 2)), + ((1970L, 2.0), '1970-02Z', (1970, 2)), + ((11990, 1L), '11990-01Z', (11990, 1)), + ((1e15, 1), '1000000000000000-01Z', (1e15, 1)), + ((-1e15, 1), '-1000000000000000-01Z', (-1e15, 1)), + ((1000000000000000L, 1), '1000000000000000-01Z', (1e15, 1)), + ((-1000000000000000L, 1), '-1000000000000000-01Z', (-1e15, 1)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('1970 -01Z', N), + ('0001-02z', N), + + # Invalid ranges + ('2001-00Z', N), + ('2001-13Z', N), + + # Whitespace + (ws + '1970-01Z' + ws, (1970, 1)), + + # No timezones + ('11971-02', (11971, 2)), + ('1971-02', (1971, 2)), + ('-1971-02', (-1971, 2)), + + # Non-zulu + ('11971-02-07:08', (11971, 2)), + ('11971-02+07:08', (11971, 1)), + ('-11971-02-07:08', (-11971, 2)), + ('-11971-02+07:08', (-11971, 1)), + ('1971-02-07:08', (1971, 2)), + ('1971-02+07:08', (1971, 1)), + ('-1971-02-07:08', (-1971, 2)), + ('-1971-02+07:08', (-1971, 1)), + + # Edgepoints (ranges) + ('2001-01Z', (2001, 1)), + ('2001-12Z', (2001, 12)), + + # Edgepoints (crossing boundaries) + ('0001-01+07:08', (0, 12)), + ('-0004-01+07:08', (-5, 12)), + ('2001-03+07:08', (2001, 2)), + ('2000-03+07:08', (2000, 2)), + ('1900-03+07:08', (1900, 2)), + ) + + for t in (gYearMonthType,): + self.allTests(t, baddata, gooddata, parsedata) + + def testGYearAndYear(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (2.5,), + ) + gooddata = \ + ( + (1L, '0001Z', 1), + (10, '0010Z', 10), + (100, '0100Z', 100), + (1970, '1970Z', 1970), + (-1970, '-1970Z', -1970), + (1970L, '1970Z', 1970), + (11990.0, '11990Z', 11990), + (1e15, '1000000000000000Z', 1e15), + (-1e15, '-1000000000000000Z', -1e15), + (1000000000000000L, '1000000000000000Z', 1e15), + (-1000000000000000L, '-1000000000000000Z', -1e15), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('197OZ', N), + ('0001z', N), + + # Whitespace + (ws + '1970Z' + ws, 1970), + + # No timezones + ('11971', 11971), + ('1971', 1971), + ('-1971', -1971), + + # Non-zulu + ('11971-07:08', 11971), + ('11971+07:08', 11970), + ('-11971-07:08', -11971), + ('-11971+07:08', -11972), + ('1971-07:08', 1971), + ('1971+07:08', 1970), + ('-1971-07:08', -1971), + ('-1971+07:08', -1972), + + # Edgepoints (crossing boundaries) + ('0001+07:08', 0), + ('-0004+07:08', -5), + ) + + for t in (gYearType, yearType): + self.allTests(t, baddata, gooddata, parsedata) + + def testCentury(self): + baddata = \ + ( + 'hello', + ('hello',), + (1, 2), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (2.5,), + ) + gooddata = \ + ( + (1L, '01Z', 1), + (10, '10Z', 10), + (100, '100Z', 100), + (19, '19Z', 19), + (-19, '-19Z', -19), + (19L, '19Z', 19), + (119.0, '119Z', 119), + (1e15, '1000000000000000Z', 1e15), + (-1e15, '-1000000000000000Z', -1e15), + (1000000000000000L, '1000000000000000Z', 1e15), + (-1000000000000000L, '-1000000000000000Z', -1e15), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('197OZ', N), + ('0001z', N), + + # Whitespace + (ws + '1970Z' + ws, 1970), + + # No timezones + ('11971', 11971), + ('1971', 1971), + ('-1971', -1971), + + # Non-zulu + ('11971-07:08', 11971), + ('11971+07:08', 11970), + ('-11971-07:08', -11971), + ('-11971+07:08', -11972), + ('1971-07:08', 1971), + ('1971+07:08', 1970), + ('-1971-07:08', -1971), + ('-1971+07:08', -1972), + + # Edgepoints (crossing boundaries) + ('0001+07:08', 0), + ('-0004+07:08', -5), + ) + + for t in (centuryType,): + self.allTests(t, baddata, gooddata, parsedata) + + def testGMonthDayAndRecurringDate(self): + baddata = \ + ( + 'hello', + ('hello',), + (3, 4, 5), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (4, 5, 'hello'), + (2.5, 3), + (0, 3), + (13, 3), + (1, 0), + (1, 32), + (2, 29), + (3, 32), + (4, 31), + (5, 32), + (6, 31), + (7, 32), + (8, 32), + (9, 31), + (10, 32), + (11, 31), + (12, 32), + ) + gooddata = \ + ( + (1L, '--01-01Z', (1, 1)), + (1.5, '--01-01Z', (1, 1)), + ((2,), '--02-01Z', (2, 1)), + ((2, 3), '--02-03Z', (2, 3)), + ((10, 2), '--10-02Z', (10, 2)), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('--01 -01Z', N), + ('--02-03z', N), + + # Invalid ranges + ('--00-03Z', N), + ('--13-03Z', N), + ('--01-32Z', N), + ('--02-00Z', N), + ('--02-29Z', N), + ('--03-32Z', N), + ('--04-31Z', N), + ('--05-32Z', N), + ('--06-31Z', N), + ('--07-32Z', N), + ('--08-32Z', N), + ('--09-31Z', N), + ('--10-32Z', N), + ('--11-31Z', N), + ('--12-32Z', N), + + # Whitespace + (ws + '--01-01Z' + ws, (1, 1)), + + # No timezones + ('--02-03', (2, 3)), + + # Non-zulu + ('--02-03-07:08', (2, 3)), + ('--02-03+07:08', (2, 2)), + + # Edgepoints (ranges) + ('--01-03Z', (1, 3)), + ('--12-03Z', (12, 3)), + ('--01-31Z', (1, 31)), + ('--02-01Z', (2, 1)), + ('--02-28Z', (2, 28)), + ('--03-31Z', (3, 31)), + ('--04-30Z', (4, 30)), + ('--05-31Z', (5, 31)), + ('--06-30Z', (6, 30)), + ('--07-31Z', (7, 31)), + ('--08-31Z', (8, 31)), + ('--09-30Z', (9, 30)), + ('--10-31Z', (10, 31)), + ('--11-30Z', (11, 30)), + ('--12-31Z', (12, 31)), + + # Edgepoints (crossing boundaries) + ('--01-01+07:08', (12, 31)), + ('--03-01+07:08', (2, 28)), + ) + + for t in (gMonthDayType, recurringDateType): + self.allTests(t, baddata, gooddata, parsedata) + + def testGMonthAndMonth(self): + baddata = \ + ( + 'hello', + ('hello',), + (3, 4,), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (2.5,), + (0,), + (13,), + ) + gooddata = \ + ( + (1L, '--01--Z', 1), + ((2,), '--02--Z', 2), + ((10,), '--10--Z', 10), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. 
+ ('hello', N), + ('--01 --Z', N), + ('--03--z', N), + + # Invalid ranges + ('--00--Z', N), + ('--13--Z', N), + + # Whitespace + (ws + '--01--Z' + ws, 1), + + # No timezones + ('--03--', 3), + + # Non-zulu + ('--03---07:08', 3), + ('--03--+07:08', 2), + + # Edgepoints (ranges) + ('--01--Z', 1), + ('--12--Z', 12), + + # Edgepoints (crossing boundaries) + ('--01--+07:08', 12), + ('--12---07:08', 12), + ) + + for t in (gMonthType, monthType): + self.allTests(t, baddata, gooddata, parsedata) + + def testGDayAndRecurringDay(self): + baddata = \ + ( + 'hello', + ('hello',), + (3, 4,), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + (2.5,), + (0,), + (32,), + ) + gooddata = \ + ( + (1L, '---01Z', 1), + ((2,), '---02Z', 2), + ((10,), '---10Z', 10), + ) + parsedata = \ + ( + # Some strings that won't match the r.e. + ('hello', N), + ('---01 Z', N), + ('---03z', N), + + # Invalid ranges + ('---00Z', N), + ('---32Z', N), + + # Whitespace + (ws + '---01Z' + ws, 1), + + # No timezones + ('---03', 3), + + # Non-zulu + ('---03-07:08', 3), + ('---03+07:08', 2), + + # Edgepoints (ranges) + ('---01Z', 1), + ('---31Z', 31), + + # Edgepoints (crossing boundaries) + ('---01+07:08', 31), + ('---31-07:08', 31), + ) + + for t in (gDayType, recurringDayType): + self.allTests(t, baddata, gooddata, parsedata) + + def testInteger(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}) + t = integerType + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (10, 23L, 1111111111111111111111111111111111111111111111111111L): + x = integerType(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both 
valid and invalid + + test = (('hello', N), ('3.14', N), ('10 000', N), + ('1', 1), + ('123456789012345678901234567890', 123456789012345678901234567890L), + (ws + '12' + ws, 12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], + i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testNonPositiveInteger(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, 1, 23) + for t in (nonPositiveIntegerType, non_Positive_IntegerType): + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a t with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, -23L, -1111111111111111111111111111111111111111111111111L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('1', N), + ('0', 0), + ('-1', -1), + ('-123456789012345678901234567890', -123456789012345678901234567890L), + (ws + '-12' + ws, -12)) + + for i in test: + try: + if t == nonPositiveIntegerType: + n = t.__name__[:-4] + else: + n = 'non-positive-integer' + + z = parseSOAPRPC(self.build_xml(t._validURIs[0], n, i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], 
sys.exc_info()[1]) + + def testNegativeInteger(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, 0, 23) + for t in (negativeIntegerType, negative_IntegerType): + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (-1, -23L, -111111111111111111111111111111111111111111111111L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('1', N), + ('0', N), + ('-1', -1), + ('-123456789012345678901234567890', -123456789012345678901234567890L), + (ws + '-12' + ws, -12)) + + for i in test: + try: + if t == negativeIntegerType: + n = t.__name__[:-4] + else: + n = 'negative-integer' + + z = parseSOAPRPC(self.build_xml(t._validURIs[0], n, i[0])) + + if z != i[1]: + raise AssertionError, "expected %s, got %s" % (i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testLong(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, + -9223372036854775809L, 9223372036854775808L) + t = longType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (-1, -23L, -9223372036854775808L, 9223372036854775807L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % 
(i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), + ('-9223372036854775809', N), ('9223372036854775808', N), + ('-1', -1), ('0', 0), ('1', 1), + ('-9223372036854775808', -9223372036854775808L), + ('9223372036854775807', 9223372036854775807L), + (ws + '-12' + ws, -12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testInt(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -2147483649L, 2147483648L) + t = intType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (-1, -23L, -2147483648L, 2147483647): + x = intType(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), + ('-2147483649', N), ('2147483648', N), + ('-1', -1), ('0', 0), ('1', 1), + ('-2147483648', -2147483648L), + ('2147483647', 2147483647), + (ws + '-12' + ws, -12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + 
raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testShort(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -32769, 32768) + t = shortType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (-1, -23L, -32768, 32767): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), + ('-32769', N), ('32768', N), + ('-1', -1), ('0', 0), ('1', 1), + ('-32768', -32768), + ('32767', 32767), + (ws + '-12' + ws, -12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testByte(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -129, 128) + t = byteType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (-1, -23L, -128, 127): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise 
AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), + ('-129', N), ('128', N), + ('-1', -1), ('0', 0), ('1', 1), + ('-128', -128), + ('127', 127), + (ws + '-12' + ws, -12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testNonNegativeInteger(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -42, -1) + for t in (nonNegativeIntegerType, non_Negative_IntegerType): + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, 1, 23L, 111111111111111111111111111111111111111111111111L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('0', 0), + ('1', 1), + ('123456789012345678901234567890', 123456789012345678901234567890L), + (ws + '12' + ws, 12)) + + for i in test: + try: + if t == nonNegativeIntegerType: + n = t.__name__[:-4] + else: + n = 'non-negative-integer' + + z = parseSOAPRPC(self.build_xml(t._validURIs[0], n, i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + 
"parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testUnsignedLong(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -42, -1, 18446744073709551616L) + t = unsignedLongType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, 23L, 18446744073709551615L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('18446744073709551616', N), + ('0', 0), ('1', 1), + ('18446744073709551615', 18446744073709551615L), + (ws + '12' + ws, 12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testUnsignedInt(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -42, -1, 4294967296L) + t = unsignedIntType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, 23L, 4294967295L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + 
raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('4294967296', N), + ('0', 0), ('1', 1), + ('4294967295', 4294967295L), + (ws + '12' + ws, 12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testUnsignedShort(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -42, -1, 65536) + t = unsignedShortType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, 23L, 65535): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('65536', N), + ('0', 0), ('1', 1), + ('65535', 65535), + (ws + '12' + ws, 12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testUnsignedByte(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), 
[], {}, -42, -1, 256) + t = unsignedByteType + + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a %s with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (0, 23L, 255): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('256', N), + ('0', 0), ('1', 1), + ('255', 255), + (ws + '12' + ws, 12)) + + for i in test: + try: + z = parseSOAPRPC(self.build_xml(t._validURIs[0], t.__name__[:-4], i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testPositiveInteger(self): + # First some things that shouldn't be valid + test = ('hello', 3.14, (), [], {}, -42, -1, 0) + for t in (positiveIntegerType, positive_IntegerType): + for i in test: + try: + t(i) + raise AssertionError, \ + "instantiated a t with a bad value (%s)" % \ + (t.__name__, repr(i)) + except AssertionError: + raise + except ValueError: + pass + + # Now some things that should + + for i in (1, 23L, 1111111111111111111111111111111111111111111111111111L): + x = t(i) + d = x._marshalData() + + if d != str(i): + raise AssertionError, "expected %d, got %s" % (i, d) + + y = buildSOAP(x) + z = parseSOAPRPC(y) + + if z != i: + raise AssertionError, "expected %s, got %s" % (repr(i), repr(z)) + + # Now test parsing, both valid and invalid + + test = (('hello', N), ('3.14', N), ('-10 000', N), ('-1', N), + ('0', N), ('1', 1), + 
('123456789012345678901234567890', 123456789012345678901234567890L), + (ws + '12' + ws, 12)) + + for i in test: + try: + if t == positiveIntegerType: + n = t.__name__[:-4] + else: + n = 'positive-integer' + + z = parseSOAPRPC(self.build_xml(t._validURIs[0], n, i[0])) + + if z != i[1]: + raise AssertionError, "%s: expected %s, got %s" % \ + (i[0], i[1], repr(z)) + except AssertionError: + raise + except: + if i[1] != N: + raise AssertionError, \ + "parsing %s as %s threw exception %s:%s" % \ + (i[0], t.__name__, sys.exc_info()[0], sys.exc_info()[1]) + + def testUntyped(self): + # Make sure untypedType really isn't typed + a = stringType('hello', name = 'a') + b = untypedType('earth', name = 'b') + + x = buildSOAP((a, b)) + #print "x=",x + + self.failUnless(x.find('<a xsi:type="xsd:string" SOAP-ENC:root="1">hello</a>') != -1) + self.failUnless(x.find('<b SOAP-ENC:root="1">earth</b>') != -1) + + # Now some Array tests + def testArray(self): + env = '''<?xml version="1.0" encoding="UTF-8"?> +<SOAP-ENV:Envelope SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsd2="http://www.w3.org/2000/10/XMLSchema" xmlns:xsd3="http://www.w3.org/2001/XMLSchema" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/"> +%s +</SOAP-ENV:Envelope>''' + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[4]" SOAP-ENC:offset="[2]" xsi:type="SOAP-ENC:Array"> + <_2 SOAP-ENC:arrayType="xsd:int[2]" xsi:type="SOAP-ENC:Array"> + <item>1</item> + <item>2</item> + </_2> + <_3 SOAP-ENC:arrayType="xsd:int[2]" xsi:type="SOAP-ENC:Array"> + <item>3</item> + <item>4</item> + </_3> +</_1> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [None, None, [1, 2], [3, 4]]) + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[3,4,2]" SOAP-ENC:offset="[17]" 
xsi:type="SOAP-ENC:Array"> + <item>1</item> + <item>2</item> + <item>3</item> + <item>4</item> + <item>5</item> + <item>6</item> + <item>7</item> +</_1> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [ + [[None, None], [None, None], [None, None], [None, None]], + [[None, None], [None, None], [None, None], [None, None]], + [[None, 1], [2, 3], [4, 5], [6, 7]] + ]) + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[3,4,2]" xsi:type="SOAP-ENC:Array"> + <item SOAP-ENC:position="[17]">-17</item> + <item SOAP-ENC:position="[13]">13</item> + <item SOAP-ENC:position="[22]">-22</item> + <item SOAP-ENC:position="[1]">1</item> + <item SOAP-ENC:position="[17]">17</item> + <item SOAP-ENC:position="[23]">23</item> + <item SOAP-ENC:position="[6]">6</item> +</_1> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [ + [[None, 1L], [None, None], [None, None], [6L, None]], + [[None, None], [None, None], [None, 13L], [None, None]], + [[None, 17L], [None, None], [None, None], [-22L, 23L]] + ]) + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[4]" SOAP-ENC:offset="[3]" xsi:type="SOAP-ENC:Array"> + <item SOAP-ENC:position="[2]">2</item> + <item SOAP-ENC:position="[0]">0</item> + <item SOAP-ENC:position="[1]">1</item> + <item SOAP-ENC:position="[3]">3</item> +</_1> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [0, 1, 2, 3]) + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,4]" SOAP-ENC:offset="[23]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [ + [ + [None, None, None, None], + [None, None, None, None], + [None, None, None, None], + ], + [ + [None, None, None, None], + [None, None, None, None], + [None, None, None, None], + ] + ]) + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[4]" SOAP-ENC:offset="[3]" xsi:type="SOAP-ENC:Array"> + <item>2</item> + <item>3</item> +</_1> 
+</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "full array parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,0,4]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with bad dimension (0) parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,-4]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with bad dimension (negative) parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,4.4]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with bad dimension (non-integral) parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,hello,4]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with bad dimension (non-numeric) parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,4]" SOAP-ENC:offset="[-4]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with too large offset parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,4]" SOAP-ENC:offset="[24]" xsi:type="SOAP-ENC:Array"> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "array with too large offset parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> +<_1 SOAP-ENC:arrayType="xsd:int[2,3,4]" xsi:type="SOAP-ENC:Array"> + <item 
SOAP-ENC:position="0">2</item> + <item>3</item> +</_1> +</SOAP-ENV:Body>''' + + try: + x = parseSOAPRPC(xml) + raise AssertionError, "full array parsed" + except AssertionError: + raise + except: + pass + + xml = env % '''<SOAP-ENV:Body> + <myFavoriteNumbers type="SOAP-ENC:Array" SOAP-ENC:arrayType="xsd:int[2]"> + <number>3</number> + <number>4</number> + </myFavoriteNumbers> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [3, 4]) + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xsd:ur-type[4]"> + <thing xsi:type="xsd:int">12345</thing> + <thing xsi:type="xsd:decimal">6.789</thing> + <thing xsi:type="xsd:string">Of Mans First Disobedience, and the Fruit +Of that Forbidden Tree, whose mortal tast +Brought Death into the World, and all our woe,</thing> + <thing xsi:type="xsd2:uriReference"> + http://www.dartmouth.edu/~milton/reading_room/ + </thing> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [12345, 6.789, '''Of Mans First Disobedience, and the Fruit +Of that Forbidden Tree, whose mortal tast +Brought Death into the World, and all our woe,''', + 'http://www.dartmouth.edu/~milton/reading_room/']) + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xyz:Order[2]"> + <Order> + <Product>Apple</Product> + <Price>1.56</Price> + </Order> + <Order> + <Product>Peach</Product> + <Price>1.48</Price> + </Order> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + + #x = parseSOAPRPC(xml) + + #print "x=",x + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xsd:string[3]"> + <item href="#array-1"/> + <item href="#array-2"/> + <item href="#array-2"/> +</SOAP-ENC:Array> +<SOAP-ENC:Array id="array-1" SOAP-ENC:arrayType="xsd:string[3]"> + <item>r1c1</item> + <item>r1c2</item> + <item>r1c3</item> +</SOAP-ENC:Array> +<SOAP-ENC:Array id="array-2" SOAP-ENC:arrayType="xsd:string[2]"> + <item>r2c1</item> + <item>r2c2</item> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + 
+ x = parseSOAPRPC(xml) + + self.assertEquals( x , [['r1c1', 'r1c2', 'r1c3'], + ['r2c1', 'r2c2'], ['r2c1', 'r2c2']]) + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xsd:string[2,3]"> + <item>r1c1</item> + <item>r1c2</item> + <item>r1c3</item> + <item>r2c1</item> + <item>r2c2</item> + <item>r2c3</item> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [['r1c1', 'r1c2', 'r1c3'], ['r2c1', 'r2c2', 'r2c3']]) + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xsd:string[5]" SOAP-ENC:offset="[2]"> + <item>The third element</item> + <item>The fourth element</item> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + self.assertEquals( x , [None, None, 'The third element', 'The fourth element', None]) + + xml = env % '''<SOAP-ENV:Body> +<SOAP-ENC:Array SOAP-ENC:arrayType="xsd:string[,][4]"> + <SOAP-ENC:Array href="#array-1" SOAP-ENC:position="[2]"/> +</SOAP-ENC:Array> +<SOAP-ENC:Array id="array-1" SOAP-ENC:arrayType="xsd:string[10,10]"> + <item SOAP-ENC:position="[2,2]">Third row, third col</item> + <item SOAP-ENC:position="[7,2]">Eighth row, third col</item> +</SOAP-ENC:Array> +</SOAP-ENV:Body>''' + + x = parseSOAPRPC(xml) + + # Example using key data + def testKeyData(self): + xml = '''<?xml version="1.0" encoding="UTF-8"?> +<soap:Envelope xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/" xmlns:xsd="http://www.w3.org/1999/XMLSchema" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> +<soap:Body> + <xkms:RegisterResult xmlns:xkms="http://www.xkms.org/schema/xkms-2001-01-20"> + <xkms:Result>Success</xkms:Result> + <xkms:Answer soapenc:arrayType="KeyBinding[1]"> + <xkms:KeyBinding> + <xkms:Status>Valid</xkms:Status> + <xkms:KeyID>mailto:actzerotestkeyname</xkms:KeyID> + <dsig:KeyInfo> + <dsig:X509Data> + 
<dsig:X509Certificate>MIIDPjCCAqegAwIBAgIEOroMvDANBgkqhkiG9w0BAQUFADAxMQswCQYDVQQGEwJVI3nlMkH84ZdPKIyz60sNcVEwJ8kF+B6ZVNimCF+r7BWgLi/Dolce5CpbfMMyexZ+UQEMADrc7331eYS891KXSDQx</dsig:X509Certificate> + </dsig:X509Data> + <dsig:KeyName>mailto:actzerotestkeyname</dsig:KeyName> + <dsig:KeyValue> + <dsig:RSAKeyValue> + <dsig:Modulus>wgmV2FY6MBKvtaMmCvCoNi/0hycZkiPKC2PXjRLJKFJ5wjNfF+vWsQQUXxOKUQnu +HjJqRkx90jJvnEzW3j9FlZFQcZTfJbE0v6BXhhSre2kZvkgcOERmDMeMs//oEA4u +epnedUwrkPzedWU9AL7c/oN7rk65UuPWf7V8c/4E9bc=</dsig:Modulus> + <dsig:Exponent>AQAB</dsig:Exponent> + </dsig:RSAKeyValue> + </dsig:KeyValue> + </dsig:KeyInfo> + </xkms:KeyBinding> + </xkms:Answer> + <xkms:Private>9GKuRC3ISwE9aEatzDKW0WIp+P/ufOvCxy9d5jVglLaRiTTIelHoGKCE6cDG62HYOu/3ebce6M7Z6LX6l1J9pB5PUx+f2DaMYYEGuOtNA7/ei5Ga/mibRBCehQIcN6FF6ESFOwAJBRLajj+orgYSy0u1sTCla0V4nSBrYA2H6lx8mD3qfDJ4hie7nU0YqZxy50F9f9UxXKIVSeutyIIBjWDDKv0kVpKy7OUerOaZXOW6HBohXuV74kXMUZu+MpLIkMHOrhJeo+edfhmeFuw4kCo5it6GkrOKrGs6zo1hSxWp7uuvKAPbvUrumC6sTsTxAUg4KTGq85IUnBTYI40Q9TZtzMcONtrWfIIF23/7NJyOmygBaFa4wFqHxe7j2gSWCQRv2fPwXo/AAJTeKwsUIY8OgmANHHbFVqJEeg27jbCuSaQFxWD7ms240YurTb55HBLk6JSufDl0CUbxoUgjrDB++gUb8oalroWDIb5NcZ94QER+HiTQfB11HcPDHvONnzk/n+iF+Mcri53ZbAButnfp2x87sh6RedeiUUWruYA4eonRq5+aj2I9cIrGLQaLemna1AQ+PyD2SMelBLukfR7GUc7zaSPjPJh2W/aYAJSyjM98g6ABNntdfhuf+6jRYnYFqSXZL1W1JPf92OMOfwfuXTE2K68sNwCRhcbHDLM=</xkms:Private> + </xkms:RegisterResult> + </soap:Body> +</soap:Envelope>''' + + x = parseSOAPRPC(xml) + + def testZeroLengthTypedArray(self): + """ + Test that zero length typed arrays maintain thier type information when + converted to a SOAP message. + """ + empty_int = typedArrayType(typed="int") + empty_int_message = buildSOAP( empty_int ) + self.assertNotEquals( re.search("xsd:int\[0\]", empty_int_message), + None ) + +if __name__ == '__main__': + + print """ + + NOTE: The 'testArray' test will fail because 'referenced' elements are + included in the return object. This is a known shortcoming of + the current version of SOAPpy. 
+ + All other tests should succeed. + + """ + + unittest.main() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/TCtest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/TCtest.py new file mode 100755 index 0000000000000000000000000000000000000000..82824ef075a4057cb8bb7096ee565a6468257bfa --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/TCtest.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +import sys, unittest +sys.path.insert(1, "..") +from SOAPpy import * +Config.debug=1 + +class ClientTestCase(unittest.TestCase): + def testParseRules(self): + x = """<?xml version="1.0" encoding="utf-8"?> + <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:xsd="http://www.w3.org/2001/XMLSchema"> + <soap:Body + soap:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> + <SomeMethod> + <Result> + <Book> + <title>My Life and Work</title> + </Book> + <Person> + <name>Henry Ford</name> + <age> 49 </age> + <height> 5.5 </height> + </Person> + </Result> + </SomeMethod> + </soap:Body> + </soap:Envelope> + """ + + def negfloat(x): + return float(x) * -1.0 + + # parse rules + pr = {'SomeMethod': + {'Result': + { + 'Book': {'title':'string'}, + 'Person': {'age':'int', + 'height':negfloat} + } + } + } + y = parseSOAPRPC(x, rules=pr) + + assert y.Result.Person.age == 49 + assert y.Result.Person.height == -5.5 + + + x = '''<SOAP-ENV:Envelope + SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:xsd="http://www.w3.org/1999/XMLSchema" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + <SOAP-ENV:Body> + <Bounds> + <param> + <item>12</item> + <item>23</item> + <item>0</item> + <item>-31</item> + </param> + <param1 xsi:null="1"></param1> + </Bounds> + </SOAP-ENV:Body> + </SOAP-ENV:Envelope> + ''' + + + # 
parse rules + pr = {'Bounds': + {'param': 'arrayType=string[]', + } + } + + pr2 = {'Bounds': + {'param': 'arrayType=int[4]', + } + } + + y = parseSOAPRPC(x, rules=pr) + assert y.param[1]=='23' + + y = parseSOAPRPC(x, rules=pr2) + assert y.param[1]==23 + + x = '''<SOAP-ENV:Envelope + SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:xsd="http://www.w3.org/1999/XMLSchema" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"> + + <SOAP-ENV:Body> + <Bounds> + <param> + <item xsi:type="xsd:int">12</item> + <item xsi:type="xsd:string">23</item> + <item xsi:type="xsd:float">0</item> + <item xsi:type="xsd:int">-31</item> + </param> + <param1 xsi:null="1"></param1> + </Bounds> + </SOAP-ENV:Body> + </SOAP-ENV:Envelope> + ''' + + pr = {'Bounds': + {'param': 'arrayType=ur-type[]' + } + } + y = parseSOAPRPC(x, rules=pr) + assert y.param[0]==12 + assert y.param[1]=='23' + assert y.param[2]==float(0) + assert y.param[3]==-31 + + # Try the reverse, not implemented yet. 
+ + def testBuildObject(self): + + class Book(structType): + def __init__(self): + self.title = "Title of a book" + + class Person(structType): + def __init__(self): + self.age = "49" + self.height = "5.5" + + class Library(structType): + def __init__(self): + self._name = "Result" + self.Book = Book() + self.Person = Person() + + obj = Library() + + x = buildSOAP( kw={'Library':obj} ) + + print(x) + +if __name__ == '__main__': + unittest.main() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/TemperatureService.wsdl b/LTA/LTAIngest/SOAPpy-0.12.0/tests/TemperatureService.wsdl new file mode 100755 index 0000000000000000000000000000000000000000..2c0d553ce846f8044db2517006874597edd0546f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/TemperatureService.wsdl @@ -0,0 +1,33 @@ +<?xml version="1.0"?> +<definitions name="TemperatureService" targetNamespace="http://www.xmethods.net/sd/TemperatureService.wsdl" xmlns:tns="http://www.xmethods.net/sd/TemperatureService.wsdl" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/wsdl/soap/" xmlns="http://schemas.xmlsoap.org/wsdl/"> + <message name="getTempRequest"> + <part name="zipcode" type="xsd:string"/> + </message> + <message name="getTempResponse"> + <part name="return" type="xsd:float"/> + </message> + <portType name="TemperaturePortType"> + <operation name="getTemp"> + <input message="tns:getTempRequest"/> + <output message="tns:getTempResponse"/> + </operation> + </portType> + <binding name="TemperatureBinding" type="tns:TemperaturePortType"> + <soap:binding style="rpc" transport="http://schemas.xmlsoap.org/soap/http"/> + <operation name="getTemp"> + <soap:operation soapAction=""/> + <input> + <soap:body use="encoded" namespace="urn:xmethods-Temperature" encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"/> + </input> + <output> + <soap:body use="encoded" namespace="urn:xmethods-Temperature" encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"/> + </output> + </operation> + 
</binding> + <service name="TemperatureService"> + <documentation>Returns current temperature in a given U.S. zipcode </documentation> + <port name="TemperaturePort" binding="tns:TemperatureBinding"> + <soap:address location="http://services.xmethods.net:80/soap/servlet/rpcrouter"/> + </port> + </service> +</definitions> diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/ZeroLengthArray.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/ZeroLengthArray.py new file mode 100644 index 0000000000000000000000000000000000000000..bc461a9e8099ff9628058162a32f281dab3b4737 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/ZeroLengthArray.py @@ -0,0 +1,8 @@ +import sys +sys.path.insert(1, "..") +from SOAPpy import * + +one = typedArrayType(data=[1],typed=type(1)) +tmp = typedArrayType(data=[], typed=type(1)) +print buildSOAP( one ) +print buildSOAP( tmp ) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/alanbushTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/alanbushTest.py new file mode 100755 index 0000000000000000000000000000000000000000..bf6467272fe08522d5050c5b6a6af617281b24d9 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/alanbushTest.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ +ident = '$Id$' + +import os, re,sys + +# add local SOAPpy code to search path +sys.path.insert(1, "..") + +from SOAPpy import * +Config.debug=0 + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +SoapEndpointURL = 'http://www.alanbushtrust.org.uk/soap/compositions.asp' +MethodNamespaceURI = 'urn:alanbushtrust-org-uk:soap.methods' +SoapAction = MethodNamespaceURI + ".GetCategories" + +server = SOAPProxy(SoapEndpointURL, + namespace=MethodNamespaceURI, + soapaction=SoapAction, + http_proxy=proxy + ) + +for category in server.GetCategories(): + print category diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardClient.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardClient.py new file mode 100755 index 0000000000000000000000000000000000000000..4d8d272d0d9f84c002a5917d6badfc1b4948f93d --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardClient.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ +import sys + +sys.path.insert (1, '..') + +from SOAPpy import * + +ident = '$Id$' + +endpoint = "http://localhost:12027/xmethodsInterop" +sa = "urn:soapinterop" +ns = "http://soapinterop.org/" + +serv = SOAPProxy(endpoint, namespace=ns, soapaction=sa) +try: hand = serv.dealHand(NumberOfCards = 13, StringSeparator = '\n') +except: print "no dealHand"; hand = 0 +try: sortedhand = serv.dealArrangedHand(NumberOfCards=13,StringSeparator='\n') +except: print "no sorted"; sortedhand = 0 +try: card = serv.dealCard() +except: print "no card"; card = 0 + +print "*****hand****\n",hand,"\n*********" +print "******sortedhand*****\n",sortedhand,"\n*********" +print "card:",card + +serv.quit() + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardServer.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardServer.py new file mode 100755 index 0000000000000000000000000000000000000000..3b5b0b4f4d096499f319ed9d987b05e733efdd94 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/cardServer.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ +import string +import sys + +sys.path.insert (1, '..') + +from SOAPpy import * + +ident = '$Id$' + +# create the list of all cards, and keep strings for each suit +__cs = "Clubs" +__ds = "Diamonds" +__hs = "Hearts" +__ss = "Spades" +__cards = [] +for suit in [__cs, __ds, __hs, __ss]: + for num in range(9): + num += 1 + __cards.append(str(num+1)+" of "+suit) + for face in ["ace","King","Queen","Jack"]: + __cards.append(face+" of "+suit) + + +def deal(num): + if num not in range(1,53): + return -1 + else: + alreadydealt = [] + ignore = 0 + handdealt = [] + import whrandom + while num > 0: + idx = int(str(whrandom.random())[2:4]) + if idx in range(52) and idx not in alreadydealt: + handdealt.append(__cards[idx]) + alreadydealt.append(idx) + num -= 1 + else: + ignore += 1 + continue + return handdealt + +def arrangeHand(hand): + c = [] + d = [] + h = [] + s = [] + import string + for card in hand: + if string.find(card, __cs) != -1: + c.append(card) + elif string.find(card, __ds) != -1: + d.append(card) + elif string.find(card, __hs) != -1: + h.append(card) + elif string.find(card, __ss) != -1: + s.append(card) + for cards, str in ((c, __cs),(d, __ds),(h,__hs), (s,__ss)): + cards.sort() + idx = 0 + if "10 of "+str in cards: + cards.remove("10 of "+str) + if "Jack of "+str in cards: idx += 1 + if "Queen of "+str in cards: idx += 1 + if "King of "+str in cards: idx += 1 + if "ace of "+str in cards: idx +=1 + cards.insert(len(cards)-idx,"10 of "+str) + if "King of "+str in cards: + cards.remove("King of "+str) + if "ace of "+str in cards: cards.insert(len(cards)-1,"King of "+str) + else: cards.append("King of "+str) + return c+d+h+s + +def dealHand (NumberOfCards, StringSeparator): + hand = deal(NumberOfCards) + return string.join(hand,StringSeparator) + + +def dealArrangedHand (NumberOfCards, StringSeparator): + if NumberOfCards < 1 or NumberOfCards > 52: + raise ValueError, "NumberOfCards must be between 1 and 52" + unarranged = deal(NumberOfCards) + hand = 
arrangeHand(unarranged) + return string.join(hand, StringSeparator) + +def dealCard (): + return deal(1)[0] + +run = 1 + +def quit(): + global run + run=0; + +namespace = 'http://soapinterop.org/' + +server = SOAPServer (("localhost", 12027)) + +server.registerKWFunction (dealHand, namespace) +server.registerKWFunction (dealArrangedHand, namespace) +server.registerKWFunction (dealCard, namespace) +server.registerKWFunction (quit, namespace) + +try: + while run: + server.handle_request() +except KeyboardInterrupt: + pass diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoClient.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoClient.py new file mode 100755 index 0000000000000000000000000000000000000000..42d3d0af2f3c0087a105c2cdf50f5c26488abe9f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoClient.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. + +import sys +sys.path.insert(1, "..") + +from SOAPpy import * + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +#Config.debug = 1 +#Config.dumpHeadersIn = 1 +#Config.dumpSOAPIn = 1 +#Config.dumpSOAPOut = 1 + +# ask for returned SOAP responses to be converted to basic python types +Config.simplify_objects = 1 + +#Config.BuildWithNoType = 1 +#Config.BuildWithNoNamespacePrefix = 1 + +if len(sys.argv) > 1 and sys.argv[1] == '-s': + # Use secure http + pathserver = SOAPProxy("https://localhost:9900/pathtest") + server = SOAPProxy("https://localhost:9900") + +elif len(sys.argv) > 1 and sys.argv[1] == '-g': + # use Globus for communication + import pyGlobus + pathserver = SOAPProxy("httpg://localhost:9900/pathtest") + server = SOAPProxy("httpg://localhost:9900") + +else: + # Default: use standard http + pathserver = SOAPProxy("http://localhost:9900/pathtest") + server = SOAPProxy("http://localhost:9900") + +# Echo... 
+ +try: + print server.echo("MOO") +except Exception, e: + print "Caught exception: ", e +try: + print pathserver.echo("MOO") +except Exception, e: + print "Caught exception: ", e + +# ...in an object +try: + print server.echo_ino("moo") +except Exception, e: + print "Caught exception: ", e +try: + print pathserver.echo_ino("cow") +except Exception, e: + print "Caught exception: ", e + +# ...in an object in an object +try: + print server.prop.echo2("moo") +except Exception, e: + print "Caught exception: ", e + +try: + print pathserver.prop.echo2("cow") +except Exception, e: + print "Caught exception: ", e + +# ...with keyword arguments +try: + print server.echo_wkw(third = "three", first = "one", second = "two") +except Exception, e: + print "Caught exception: ", e +try: + print pathserver.echo_wkw(third = "three", first = "one", second = "two") +except Exception, e: + print "Caught exception: ", e + +# ...with a context object +try: + print server.echo_wc("moo") +except Exception, e: + print "Caught exception: ", e +try: + print pathserver.echo_wc("cow") +except Exception, e: + print "Caught exception: ", e + +# ...with a header +hd = headerType(data = {"mystring": "Hello World"}) +try: + print server._hd(hd).echo_wc("moo") +except Exception, e: + print "Caught exception: ", e +try: + print pathserver._hd(hd).echo_wc("cow") +except Exception, e: + print "Caught exception: ", e + +# close down server +server.quit() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoHeader.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoHeader.py new file mode 100755 index 0000000000000000000000000000000000000000..380e6ce8d79bf43e66019f020264530965a89c73 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoHeader.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ +import sys +sys.path.insert(1, "..") + +from SOAPpy import * + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +#Config.debug = 1 + +Config.BuildWithNoType = 1 +Config.BuildWithNoNamespacePrefix = 1 + + + +hd = headerType(data = {"mystring": "Hello World"}) +server = SOAPProxy("http://localhost:9900/", header=hd) + +print server.echo("Hello world") + +server.quit() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoServer.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoServer.py new file mode 100755 index 0000000000000000000000000000000000000000..57c1064aba68068c1d038cd3e3fd827da8ad596e --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/echoServer.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# +# Copyright (c) 2001 actzero, inc. All rights reserved. + +import sys +sys.path.insert(1, "..") + +from SOAPpy import * + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +Config.dumpSOAPIn = 1 +Config.dumpSOAPOut = 1 +Config.debug = 1 + +# specify name of authorization function +Config.authMethod = "_authorize" + +# Set this to 0 to test authorization +allowAll = 1 + +# ask for returned SOAP responses to be converted to basic python types +Config.simplify_objects = 1 + + +# provide a mechanism to stop the server +run = 1 +def quit(): + global run + run=0; + + +if Config.SSLserver: + from M2Crypto import SSL + +def _authorize(*args, **kw): + global allowAll, Config + + if Config.debug: + print "Authorize (function) called! 
(result = %d)" % allowAll + print "Arguments: %s" % kw + + if allowAll: + return 1 + else: + return 0 + +# Simple echo +def echo(s): + global Config + + # Test of context retrieval + ctx = Server.GetSOAPContext() + if Config.debug: + print "SOAP Context: ", ctx + + return s + s + +# An echo class +class echoBuilder2: + def echo2(self, val): + return val * 3 + +# A class that has an instance variable which is an echo class +class echoBuilder: + def __init__(self): + self.prop = echoBuilder2() + + def echo_ino(self, val): + return val + val + def _authorize(self, *args, **kw): + global allowAll, Config + + if Config.debug: + print "Authorize (method) called with arguments:" + print "*args=%s" % str(args) + print "**kw =%s" % str(kw) + print "Approved -> %d" % allowAll + + if allowAll: + return 1 + else: + return 0 + +# Echo with context +def echo_wc(s, _SOAPContext): + global Config + + c = _SOAPContext + + sep = '-' * 72 + + # The Context object has extra info about the call + if Config.debug: + print "-- XML", sep[7:] + # The original XML request + print c.xmldata + + print "-- Header", sep[10:] + # The SOAP Header or None if not present + print c.header + + if c.header: + print "-- Header.mystring", sep[19:] + # An element of the SOAP Header + print c.header.mystring + + print "-- Body", sep[8:] + # The whole Body object + print c.body + + print "-- Peer", sep[8:] + if not GSI: + # The socket object, useful for + print c.connection.getpeername() + else: + # The socket object, useful for + print c.connection.get_remote_address() + ctx = c.connection.get_security_context() + print ctx.inquire()[0].display() + + print "-- SOAPAction", sep[14:] + # The SOAPaction HTTP header + print c.soapaction + + print "-- HTTP headers", sep[16:] + # All the HTTP headers + print c.httpheaders + + return s + s + +# Echo with keyword arguments +def echo_wkw(**kw): + return kw['first'] + kw['second'] + kw['third'] + +# Simple echo +def echo_simple(*arg): + return arg + +def 
echo_header(s, _SOAPContext): + global Config + + c = _SOAPContext + return s, c.header + + +addr = ('localhost', 9900) +GSI = 0 +SSL = 0 +if len(sys.argv) > 1 and sys.argv[1] == '-s': + SSL = 1 + if not Config.SSLserver: + raise RuntimeError, \ + "this Python installation doesn't have OpenSSL and M2Crypto" + ssl_context = SSL.Context() + ssl_context.load_cert('validate/server.pem') + server = SOAPServer(addr, ssl_context = ssl_context) + prefix = 'https' +elif len(sys.argv) > 1 and sys.argv[1] == '-g': + GSI = 1 + from SOAPpy.GSIServer import GSISOAPServer + server = GSISOAPServer(addr) + prefix = 'httpg' +else: + server = SOAPServer(addr) + prefix = 'http' + +print "Server listening at: %s://%s:%d/" % (prefix, addr[0], addr[1]) + +# register the method +server.registerFunction(echo) +server.registerFunction(echo, path = "/pathtest") +server.registerFunction(_authorize) +server.registerFunction(_authorize, path = "/pathtest") + +# Register a whole object +o = echoBuilder() +server.registerObject(o, path = "/pathtest") +server.registerObject(o) + +# Register a function which gets called with the Context object +server.registerFunction(MethodSig(echo_wc, keywords = 0, context = 1), + path = "/pathtest") +server.registerFunction(MethodSig(echo_wc, keywords = 0, context = 1)) + +# Register a function that takes keywords +server.registerKWFunction(echo_wkw, path = "/pathtest") +server.registerKWFunction(echo_wkw) + +server.registerFunction(echo_simple) +server.registerFunction(MethodSig(echo_header, keywords=0, context=1)) +server.registerFunction(quit) + +# Start the server +try: + while run: + server.handle_request() +except KeyboardInterrupt: + pass diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_client.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_client.py new file mode 100644 index 0000000000000000000000000000000000000000..51d4acbc5112985edfc6cf65101a55233608f91a --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_client.py @@ -0,0 +1,71 @@ 
+#!/usr/bin/python2 + +#standard imports +import syslog, sys + +#domain specific imports +sys.path.insert (1, '..') +import SOAPpy + +SOAPpy.Config.simplify_objects=1 + +## def test_integer(self,pass_integer): +## def test_string(self,pass_string): +## def test_float(self,pass_float): +## def test_tuple(self,pass_tuple): +## def test_list(self,pass_list): +## def test_dictionary(self,pass_dictionary): + +if __name__ == "__main__": + + server = SOAPpy.SOAPProxy("http://localhost:9999") + + original_integer = 5 + result_integer = server.test_integer(original_integer) + print "original_integer %s" % original_integer + print "result_integer %s" % result_integer + assert(result_integer==original_integer) + print + + original_string = "five" + result_string = server.test_string(original_string) + print "original_string %s" % original_string + print "result_string %s" % result_string + assert(result_string==original_string) + print + + original_float = 5.0 + result_float = server.test_float(original_float) + print "original_float %s" % original_float + print "result_float %s" % result_float + assert(result_float==original_float) + print + + original_tuple = (1,2,"three","four",5) + result_tuple = server.test_tuple(original_tuple) + print "original_tuple %s" % str(original_tuple) + print "result_tuple %s" % str(result_tuple) + assert(tuple(result_tuple)==original_tuple) + print + + original_list = [5,4,"three",2,1] + result_list = server.test_list(original_list) + print "original_list %s" % original_list + print "result_list %s" % result_list + assert(result_list==original_list) + print + + original_dictionary = { + 'one': 1, + "two": 2, + "three": 3, + "four": 4, + "five": 5, + } + result_dictionary = server.test_dictionary(original_dictionary) + print "original_dictionary %s" % original_dictionary + print "result_dictionary %s" % result_dictionary + assert(result_dictionary==original_dictionary) + print + + server.quit() diff --git 
a/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_server.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_server.py new file mode 100644 index 0000000000000000000000000000000000000000..8c62400d44e4051f6ae80b34cd017340e67613d3 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/esj_test_server.py @@ -0,0 +1,48 @@ +#!/usr/bin/python2 + +#standard imports +import syslog, sys + +#domain specific imports +sys.path.insert (1, '..') +import SOAPpy + +class test_service: + + run = 1 + + def test_integer(self,pass_integer): + print type(pass_integer) + return pass_integer + + def test_string(self,pass_string): + print type(pass_string) + return pass_string + + def test_float(self,pass_float): + print type(pass_float) + return pass_float + + def test_tuple(self,pass_tuple): + print type(pass_tuple), pass_tuple + return pass_tuple + + def test_list(self,pass_list): + print type(pass_list), pass_list + return pass_list + + def test_dictionary(self,pass_dictionary): + print type(pass_dictionary), pass_dictionary + return pass_dictionary + + def quit(self): + self.run = 0 + +server = SOAPpy.SOAPServer(("localhost",9999)) +SOAPpy.Config.simplify_objects=1 + +access_object = test_service() +server.registerObject(access_object) + +while access_object.run: + server.handle_request() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/excelTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/excelTest.py new file mode 100755 index 0000000000000000000000000000000000000000..b1cfa63c71153957926d66fd174e164f835b5c01 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/excelTest.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import sys +sys.path.insert(1, "..") + +from SOAPpy import * +server = SOAPProxy("http://206.135.217.234:8000/") +server.COM_SetProperty("Visible", 1) +server.Workbooks.Open("c:\\test.xls") +server.COM_NestedCall('ActiveSheet.Range("A2").EntireRow.Delete()') +server.quit() + + + + + + + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/largeDataTest.py 
b/LTA/LTAIngest/SOAPpy-0.12.0/tests/largeDataTest.py new file mode 100755 index 0000000000000000000000000000000000000000..ccacad1a72da149ccb4ebbd9ff8efe37e5a6a112 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/largeDataTest.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. + +import sys +sys.path.insert(1, "..") + +from SOAPpy import * +from SOAPpy import Parser + +# Uncomment to see outgoing HTTP headers and SOAP and incoming +#Config.debug = 1 + +if len(sys.argv) > 1 and sys.argv[1] == '-s': + server = SOAPProxy("https://localhost:9900") +else: + server = SOAPProxy("http://localhost:9900") + + +# BIG data: + +big = repr('.' * (1<<18) ) + +# ...in an object +print "server.echo_ino(big):..", +tmp = server.echo_ino(big) +print "done" + +# ...in an object in an object +print "server.prop.echo2(big)..", +tmp = server.prop.echo2(big) +print "done" + +# ...with keyword arguments +print 'server.echo_wkw(third = big, first = "one", second = "two")..', +tmp = server.echo_wkw(third = big, first = "one", second = "two") +print "done" + +# ...with a context object +print "server.echo_wc(big)..", +tmp = server.echo_wc(big) +print "done" + +# ...with a header +hd = headerType(data = {"mystring": "Hello World"}) +print "server._hd(hd).echo_wc(big)..", +tmp = server._hd(hd).echo_wc(big) +print "done" + +server.quit() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/newsTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/newsTest.py new file mode 100755 index 0000000000000000000000000000000000000000..0d56a1d90a94cb03774503dc6a4a26378a4dfb83 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/newsTest.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = 
"%s:%s" % (phost, pport) +except: + proxy = None + +SoapEndpointURL = 'http://www22.brinkster.com/prasads/BreakingNewsService.asmx?WSDL' + +MethodNamespaceURI = 'http://tempuri.org/' + +# Three ways to do namespaces, force it at the server level + +server = SOAPProxy(SoapEndpointURL, namespace = MethodNamespaceURI, + soapaction='http://tempuri.org/GetCNNNews', encoding = None, + http_proxy=proxy) +print "[server level CNN News call]" +print server.GetCNNNews() + +# Do it inline ala SOAP::LITE, also specify the actually ns (namespace) and +# sa (soapaction) + +server = SOAPProxy(SoapEndpointURL, encoding = None) +print "[inline CNNNews call]" +print server._ns('ns1', + MethodNamespaceURI)._sa('http://tempuri.org/GetCNNNews').GetCNNNews() + +# Create an instance of your server with specific namespace and then use +# inline soapactions for each call + +dq = server._ns(MethodNamespaceURI) +print "[namespaced CNNNews call]" +print dq._sa('http://tempuri.org/GetCNNNews').GetCNNNews() +print "[namespaced CBSNews call]" +print dq._sa('http://tempuri.org/GetCBSNews').GetCBSNews() diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/quoteTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/quoteTest.py new file mode 100755 index 0000000000000000000000000000000000000000..5b56ff771b2d01acafaf666dbb6541fd0e18e6e1 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/quoteTest.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +# Three ways to do namespaces, force it at the server level + +server = SOAPProxy("http://services.xmethods.com:9090/soap", + namespace = 'urn:xmethods-delayed-quotes', + http_proxy=proxy) + +print "IBM>>", server.getQuote(symbol = 'IBM') + +# Do it inline ala SOAP::LITE, also specify the actually ns + +server = SOAPProxy("http://services.xmethods.com:9090/soap", + http_proxy=proxy) +print "IBM>>", server._ns('ns1', + 'urn:xmethods-delayed-quotes').getQuote(symbol = 'IBM') + +# Create a namespaced version of your server + +dq = server._ns('urn:xmethods-delayed-quotes') +print "IBM>>", dq.getQuote(symbol='IBM') +print "ORCL>>", dq.getQuote(symbol='ORCL') +print "INTC>>", dq.getQuote(symbol='INTC') diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/simpleWSDL.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/simpleWSDL.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea2bcc101b139c7b57fdbeed4e5e88552b2fbc5 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/simpleWSDL.py @@ -0,0 +1,10 @@ +import sys + +sys.path.insert(1, "..") +import SOAPpy + +url = 'http://www.xmethods.org/sd/2001/TemperatureService.wsdl' +zip = '06340' +proxy = SOAPpy.WSDL.Proxy(url) +temp = proxy.getTemp(zip) +print 'Temperature at', zip, 'is', temp diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/speedTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/speedTest.py new file mode 100755 index 0000000000000000000000000000000000000000..be5cd39c4dcb1fc8c0121e5454d31e47990d0596 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/speedTest.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import time +import sys +sys.path.insert(1, "..") + 
+x='''<SOAP-ENV:Envelope + xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" + xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" + xmlns:xsd="http://www.w3.org/1999/XMLSchema"> + <SOAP-ENV:Body> + <ns1:getRate xmlns:ns1="urn:demo1:exchange" SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> + <country1 xsi:type="xsd:string">USA</country1> + <country2 xsi:type="xsd:string">japan</country2> + </ns1:getRate> + </SOAP-ENV:Body> + </SOAP-ENV:Envelope>''' + +x2='''<SOAP-ENV:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" SOAP-ENV:encodingStyle="http://schemas.microsoft.com/soap/encoding/clr/1.0 http://schemas.xmlsoap.org/soap/encoding/" xmlns:i3="http://soapinterop.org/xsd" xmlns:i2="http://soapinterop.org/"> +<SOAP-ENV:Body> +<i2:echoStructArray id="ref-1"> +<return href="#ref-4"/> +</i2:echoStructArray> +<SOAP-ENC:Array id="ref-4" SOAP-ENC:arrayType="i3:SOAPStruct[3]"> +<item href="#ref-5"/> +<item href="#ref-6"/> +<item href="#ref-7"/> +</SOAP-ENC:Array> +<i3:SOAPStruct id="ref-5"> +<varString xsi:type="xsd:string">West Virginia</varString> +<varInt xsi:type="xsd:int">-546</varInt> +<varFloat xsi:type="xsd:float">-5.398</varFloat> +</i3:SOAPStruct> +<i3:SOAPStruct id="ref-6"> +<varString xsi:type="xsd:string">New Mexico</varString> +<varInt xsi:type="xsd:int">-641</varInt> +<varFloat xsi:type="xsd:float">-9.351</varFloat> +</i3:SOAPStruct> +<i3:SOAPStruct id="ref-7"> +<varString xsi:type="xsd:string">Missouri</varString> +<varInt xsi:type="xsd:int">-819</varInt> +<varFloat xsi:type="xsd:float">1.495</varFloat> +</i3:SOAPStruct> +</SOAP-ENV:Body> +</SOAP-ENV:Envelope> +''' + +# Import in function, because for some reason they slow each other +# down in same namespace ??? 
+def SOAPParse(inxml): + from SOAPpy import parseSOAPRPC + t= time.time() + parseSOAPRPC(inxml) + return time.time()-t + +def SAXParse(inxml): + import xml.sax + y = xml.sax.handler.ContentHandler() + t= time.time() + xml.sax.parseString(inxml,y) + return time.time()-t + +def DOMParse(inxml): + import xml.dom.minidom + t= time.time() + xml.dom.minidom.parseString(inxml) + return time.time()-t + +# Wierd but the SAX parser runs really slow the first time. +# Probably got to load a c module or something +SAXParse(x) +print +print "Simple XML" +print "SAX Parse, no marshalling ", SAXParse(x) +print "SOAP Parse, and marshalling ", SOAPParse(x) +print "DOM Parse, no marshalling ", DOMParse(x) +print +print "Complex XML (references)" +print "SAX Parse, no marshalling ", SAXParse(x2) +print "SOAP Parse, and marshalling ", SOAPParse(x2) +print "DOM Parse, no marshalling ", DOMParse(x2) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/storageTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/storageTest.py new file mode 100755 index 0000000000000000000000000000000000000000..a88487ed3fd2f554d8e545246ff58983536c491e --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/storageTest.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import sys, os, time, signal, re +sys.path.insert(1, "..") +from SOAPpy import SOAPProxy, SOAPConfig, SOAPUserAgent + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + http_proxy = "%s:%s" % (phost, pport) +except: + http_proxy = None + + +PROXY="http://www.soapware.org/xmlStorageSystem" +EMAIL="SOAPpy@actzero.com" +NAME="test_user" +PASSWORD="mypasswd" +SERIAL=1123214 + +MY_PORT=15600 + +def resourceChanged (url): + print "\n##### NOTIFICATION MESSAGE: Resource %s has changed #####\n" % url + return booleanType(1) + +def printstatus (cmd, stat): + print + if stat.flError: + print "### %s failed: %s ###" % (cmd, 
stat.message) + else: + print "### %s successful: %s ###" % (cmd, stat.message) + return not stat.flError + +server = SOAPProxy(encoding="US-ASCII", + proxy=PROXY, + soapaction="/xmlStorageSystem", + http_proxy=http_proxy, +# config=SOAPConfig(debug=1) + ) + +# Register as a new user or update user information +reg = server.registerUser(email=EMAIL, name=NAME, password=PASSWORD, + clientPort=MY_PORT, userAgent=SOAPUserAgent(), + serialnumber=SERIAL) +printstatus("registerUser", reg) + +# See what this server can do +reg = server.getServerCapabilities (email=EMAIL, password=PASSWORD) +if printstatus("getServerCapabilities", reg): + print "Legal file extensions: " + str(reg.legalFileExtensions) + print "Maximum file size: " + str(reg.maxFileSize) + print "Maximum bytes per user: " + str(reg.maxBytesPerUser) + print "Number of bytes in use by the indicated user: " + str(reg.ctBytesInUse) + print "URL of the folder containing your files: " + str(reg.yourUpstreamFolderUrl) + +# Store some files +reg = server.saveMultipleFiles (email=EMAIL, password=PASSWORD, + relativepathList=['index.html','again.html'], + fileTextList=['<html><title>bennett@actzero.com home page</title><body>' + + '<a href=again.html>Hello Earth</a></body></html>', + '<html><title>bennett@actzero.com home page</title><body>' + + '<a href=index.html>Hello Earth Again</a></body></html>']) +if printstatus("saveMultipleFiles", reg): + print "Files stored:" + for file in reg.urlList: + print " %s" % file + + # Save this for call to test pleaseNotify + mylist = reg.urlList +else: + mylist = [] + +# Check to see what files are stored +reg = server.getMyDirectory (email=EMAIL, password=PASSWORD) +if printstatus("getMyDirectory", reg): + i = 1 + while hasattr(reg.directory, "file%05d" % i): + d = getattr(reg.directory, "file%05d" % i) + print "Relative Path: %s" % d.relativePath + print "Size: %d" % d.size + print "Created: %s" % d.whenCreated + print "Last Uploaded: %s" % d.whenLastUploaded + print "URL: %s" 
% d.url + print + i += 1 + +# Set up notification +reg = server.pleaseNotify(notifyProcedure="resourceChanged", port=MY_PORT, path="/", protocol="soap", urlList=mylist) +printstatus("notifyProcedure", reg) + +pid = os.fork() +if pid == 0: + # I am a child process. Set up SOAP server to receive notification + print + print "## Starting notification server ##" + + s = SOAPServer(('localhost', MY_PORT)) + s.registerFunction(resourceChanged) + s.serve_forever() + +else: + + def handler(signum, frame): + # Kill child process + print "Killing child process %d" % pid + os.kill(pid, signal.SIGINT) + + signal.signal(signal.SIGINT, handler) + + # I am a parent process + # Change some files + time.sleep(3) + reg = server.saveMultipleFiles (email=EMAIL, password=PASSWORD, + relativepathList=['index.html'], + fileTextList=['<html><title>bennett@actzero.com home page</title><body>' + + '<a href=again.html>Hello Bennett</a></body></html>']) + if printstatus("saveMultipleFiles", reg): + print "Files stored:" + for file in reg.urlList: + print " %s" % file + + os.waitpid(pid, 0) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/testClient1.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testClient1.py new file mode 100755 index 0000000000000000000000000000000000000000..a386c139a86a781139409e81fae0534f3b87dd30 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testClient1.py @@ -0,0 +1,118 @@ +import gc +import socket +import threading +import time +import unittest +import sys +sys.path.insert(1, "..") + +import SOAPpy +#SOAPpy.Config.debug=1 + +# global to shut down server +quit = 0 + +def echoDateTime(dt): + return dt + +def echo(s): + """repeats a string twice""" + return s + s + +def kill(): + """tell the server to quit""" + global quit + quit = 1 + +def server1(): + """start a SOAP server on localhost:8000""" + + print "Starting SOAP Server...", + server = SOAPpy.Server.SOAPServer(addr=('127.0.0.1', 8000)) + server.registerFunction(echoDateTime) + server.registerFunction(echo) + 
server.registerFunction(kill) + print "Done." + + global quit + while not quit: + server.handle_request() + quit = 0 + print "Server shut down." + +class ClientTestCase(unittest.TestCase): + + server = None + startup_timeout = 5 # seconds + + def setUp(self): + '''This is run once before each unit test.''' + + serverthread = threading.Thread(target=server1, name="SOAPServer") + serverthread.start() + + start = time.time() + connected = False + server = None + while not connected and time.time() - start < self.startup_timeout: + print "Trying to connect to the SOAP server...", + try: + server = SOAPpy.Client.SOAPProxy('127.0.0.1:8000') + server.echo('Hello World') + except socket.error, e: + print "Failure:", e + time.sleep(0.5) + else: + connected = True + self.server = server + print "Success." + + if not connected: raise 'Server failed to start.' + + def tearDown(self): + '''This is run once after each unit test.''' + + print "Trying to shut down SOAP server..." + if self.server is not None: + self.server.kill() + time.sleep(5) + + return 1 + + def testEcho(self): + '''Test echo function.''' + + server = SOAPpy.Client.SOAPProxy('127.0.0.1:8000') + s = 'Hello World' + self.assertEquals(server.echo(s), s+s) + + def testNamedEcho(self): + '''Test echo function.''' + + server = SOAPpy.Client.SOAPProxy('127.0.0.1:8000') + s = 'Hello World' + self.assertEquals(server.echo(s=s), s+s) + + def testEchoDateTime(self): + '''Test passing DateTime objects.''' + + server = SOAPpy.Client.SOAPProxy('127.0.0.1:8000') + dt = SOAPpy.Types.dateTimeType(data=time.time()) + dt_return = server.echoDateTime(dt) + self.assertEquals(dt_return, dt) + + +# def testNoLeak(self): +# '''Test for memory leak.''' + +# gc.set_debug(gc.DEBUG_SAVEALL) +# for i in range(400): +# server = SOAPpy.Client.SOAPProxy('127.0.0.1:8000') +# s = 'Hello World' +# server.echo(s) +# gc.collect() +# self.assertEquals(len(gc.garbage), 0) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/LTA/LTAIngest/SOAPpy-0.12.0/tests/testWSDL.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testWSDL.py new file mode 100755 index 0000000000000000000000000000000000000000..7803a505b8f35823074d4b50887930e67cb50c2f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testWSDL.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +import unittest +import os, re +import sys +sys.path.insert (1, '..') +import SOAPpy + +ident = '$Id$' + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + http_proxy = "%s:%s" % (phost, pport) +except: + http_proxy = None + + + +class IntegerArithmenticTestCase(unittest.TestCase): + + def setUp(self): + self.wsdlstr1 = '''<?xml version="1.0"?> + <definitions name="TemperatureService" targetNamespace="http://www.xmethods.net/sd/TemperatureService.wsdl" xmlns:tns="http://www.xmethods.net/sd/TemperatureService.wsdl" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/wsdl/soap/" xmlns="http://schemas.xmlsoap.org/wsdl/"> + <message name="getTempRequest"> + <part name="zipcode" type="xsd:string"/> + </message> + <message name="getTempResponse"> + <part name="return" type="xsd:float"/> + </message> + <portType name="TemperaturePortType"> + <operation name="getTemp"> + <input message="tns:getTempRequest"/> + <output message="tns:getTempResponse"/> + </operation> + </portType> + <binding name="TemperatureBinding" type="tns:TemperaturePortType"> + <soap:binding style="rpc" transport="http://schemas.xmlsoap.org/soap/http"/> + <operation name="getTemp"> + <soap:operation soapAction=""/> + <input> + <soap:body use="encoded" namespace="urn:xmethods-Temperature" encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"/> + </input> + <output> + <soap:body use="encoded" namespace="urn:xmethods-Temperature" encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"/> + </output> + </operation> + </binding> + <service 
name="TemperatureService"> + <documentation>Returns current temperature in a given U.S. zipcode </documentation> + <port name="TemperaturePort" binding="tns:TemperatureBinding"> + <soap:address location="http://services.xmethods.net:80/soap/servlet/rpcrouter"/> + </port> + </service> + </definitions> + ''' + + def testParseWsdlString(self): + '''Parse XMethods TemperatureService wsdl from a string.''' + + wsdl = SOAPpy.WSDL.Proxy(self.wsdlstr1, http_proxy=http_proxy) + self.assertEquals(len(wsdl.methods), 1) + method = wsdl.methods.values()[0] + self.assertEquals(method.methodName, 'getTemp') + self.assertEquals(method.namespace, 'urn:xmethods-Temperature') + self.assertEquals(method.location, + 'http://services.xmethods.net:80/soap/servlet/rpcrouter') + + def testParseWsdlFile(self): + '''Parse XMethods TemperatureService wsdl from a file.''' + + # figure out path to the test directory + dir = os.path.abspath('.') + fname = './TemperatureService.wsdl' + + try: + f = file(fname) + except (IOError, OSError): + self.assert_(0, 'Cound not find wsdl file "%s"' % file) + + wsdl = SOAPpy.WSDL.Proxy(fname, http_proxy=http_proxy) + self.assertEquals(len(wsdl.methods), 1) + method = wsdl.methods.values()[0] + self.assertEquals(method.methodName, 'getTemp') + self.assertEquals(method.namespace, 'urn:xmethods-Temperature') + self.assertEquals(method.location, + 'http://services.xmethods.net:80/soap/servlet/rpcrouter') + + def testParseWsdlUrl(self): + '''Parse XMethods TemperatureService wsdl from a url.''' + + wsdl = SOAPpy.WSDL.Proxy('http://www.xmethods.net/sd/2001/TemperatureService.wsdl', http_proxy=http_proxy) + self.assertEquals(len(wsdl.methods), 1) + method = wsdl.methods.values()[0] + self.assertEquals(method.methodName, 'getTemp') + self.assertEquals(method.namespace, 'urn:xmethods-Temperature') + self.assertEquals(method.location, + 'http://services.xmethods.net:80/soap/servlet/rpcrouter') + + def testGetTemp(self): + '''Parse TemperatureService and call 
getTemp.''' + + zip = '01072' + proxy = SOAPpy.WSDL.Proxy(self.wsdlstr1, http_proxy=http_proxy) + temp = proxy.getTemp(zip) + print 'Temperature at', zip, 'is', temp + + +if __name__ == '__main__': + unittest.main() + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/testleak.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testleak.py new file mode 100755 index 0000000000000000000000000000000000000000..34bea27f44d688059684e3b42b9bc17643738345 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/testleak.py @@ -0,0 +1,21 @@ +#!/usr/bin/python + +import sys +sys.path.insert(1, "..") +import SOAPpy +import time +import gc +import types + +gc.set_debug(gc.DEBUG_SAVEALL) + +for i in range(400): + try: + t = SOAPpy.SOAP.parseSOAPRPC('bad soap payload') + except: pass + +gc.collect() +if len(gc.garbage): + print 'still leaking' +else: + print 'no leak' diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/translateTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/translateTest.py new file mode 100755 index 0000000000000000000000000000000000000000..5cee0e0bee962882378403127237053c736aac17 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/translateTest.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. 
+ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +server = SOAPProxy("http://services.xmethods.com:80/perl/soaplite.cgi", + http_proxy=proxy) +babel = server._ns('urn:xmethodsBabelFish#BabelFish') + +print babel.BabelFish(translationmode = "en_fr", + sourcedata = "The quick brown fox did something or other") diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/weatherTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/weatherTest.py new file mode 100755 index 0000000000000000000000000000000000000000..456e5815927b82d11581af999dcc434ee26f015f --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/weatherTest.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +SoapEndpointURL = 'http://services.xmethods.net:80/soap/servlet/rpcrouter' +MethodNamespaceURI = 'urn:xmethods-Temperature' + +# Do it inline ala SOAP::LITE, also specify the actually ns + +server = SOAPProxy(SoapEndpointURL, http_proxy=proxy) +print "inline", server._ns('ns1', MethodNamespaceURI).getTemp(zipcode='94063') diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/whoisTest.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/whoisTest.py new file mode 100755 index 0000000000000000000000000000000000000000..b35fcfc444bc90edb31f54a6d81ee390689159c2 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/whoisTest.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from 
SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + +server = SOAPProxy("http://www.SoapClient.com/xml/SQLDataSoap.WSDL", + http_proxy=proxy) + +print "whois>>", server.ProcessSRL(SRLFile="WHOIS.SRI", + RequestName="whois", + key = "microsoft.com") + diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tests/xmethods.py b/LTA/LTAIngest/SOAPpy-0.12.0/tests/xmethods.py new file mode 100644 index 0000000000000000000000000000000000000000..431fc017d5ef3a7acf3fa8bad4a6fafc9eeeef4a --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tests/xmethods.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. +ident = '$Id$' + +import os, re +import sys +sys.path.insert(1, "..") + +from SOAPpy import SOAPProxy + +# Check for a web proxy definition in environment +try: + proxy_url=os.environ['http_proxy'] + phost, pport = re.search('http://([^:]+):([0-9]+)', proxy_url).group(1,2) + proxy = "%s:%s" % (phost, pport) +except: + proxy = None + + +print "##########################################" +print " SOAP services registered at xmethods.net" +print "##########################################" + +server = SOAPProxy("http://www.xmethods.net/interfaces/query", + namespace = 'urn:xmethods-delayed-quotes', + http_proxy=proxy) + +names = server.getAllServiceNames() + +for item in names: + print 'name:', item['name'] + print 'id :', item['id'] + print diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/tools/interop2html.py b/LTA/LTAIngest/SOAPpy-0.12.0/tools/interop2html.py new file mode 100755 index 0000000000000000000000000000000000000000..599a44c4fea51e0039488d196b826e4b1478c702 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/tools/interop2html.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +import string +import cgi + +ident = '$Id$' + +lines = 
open('output.txt').readlines() +#preserve the tally +tally = lines[-6:] +#whack the tally from lines +lines = lines[:-6] +table={} +for line in lines: + if line[:3] == ' ' or line == '>\n' : continue + line = line[:-1] #delete end of line char + row = [line[:line.find(': ')], line[line.find(': ')+2:]] #split server name from rest of line + restofrow = row[1].split(' ',3) #break out method name, number, status code, status comment + if len(restofrow) > 3: + if restofrow[3].find('as expected') != -1: + restofrow[2] = restofrow[2] + ' (as expected)' + elif restofrow[3][:2] == '- ' : + restofrow[3] = restofrow[3][2:] + try: table[row[0]].append([restofrow[0],restofrow[2:]]) + except KeyError: table[row[0]] = [[restofrow[0],restofrow[2:]]] + +print "<html><body>" +print "<script>function popup(text) {" +print "text = '<html><head><title>Test Detail</title></head><body><p>' + text + '</p></body></html>';" +print "newWin=window.open('','win1','location=no,menubar=no,width=400,height=200');" +print "newWin.document.open();" +print "newWin.document.write(text);" +print "newWin.focus(); } </script>" +print "<br><table style='font-family: Arial; color: #cccccc'><tr><td colspan=2><font face=arial color=#cccccc><b>Summary</b></font></td></tr>" +for x in tally: + z = x[:-1].split(":",1) + print "<tr><td><font face=arial color=#cccccc>",z[0],"</font></td><td><font face=arial color=#cccccc>",z[1],"</font></td></tr>" +print "</table><br>" +c = 0 +totalmethods = len(table[table.keys()[0]]) +while c < totalmethods: + print "<br><table width='95%' style='font-family: Arial'>" + print "<tr><td width='27%' bgcolor='#cccccc'></td>" + cols = [c, c + 1, c + 2] + if c != 16: + cols += [c + 3] + for i in cols: + try: header = table[table.keys()[0]][i][0] + except: break + print "<td width ='17%' align='center' bgcolor='#cccccc'><b>",header,"</b></td>" + print "</tr>" + l = table.keys() + l.sort() + for key in l: + print "<tr><td bgcolor='#cccccc'>", key , "</td>" + for i in cols: + try: 
status = table[key][i][1][0] + except: break + if status.find("succeed") != -1: + bgcolor = "#339900" + status = "Pass" + elif status.find("expected") != -1: + bgcolor = "#FF9900" + hreftitle = table[key][i][1][1].replace("'","") # remove apostrophes from title properties + popuphtml = '"' + cgi.escape(cgi.escape(table[key][i][1][1]).replace("'","'").replace('"',""")) + '"' + status = "<a title='" + hreftitle + "' href='javascript:popup(" + popuphtml + ")'>Failed (expected)</a>" + else: + bgcolor = "#CC0000" + hreftitle = table[key][i][1][1].replace("'","") # remove apostrophes from title properties + popuphtml = '"' + cgi.escape(cgi.escape(table[key][i][1][1]).replace("'","'").replace('"',""")) + '"' + status = "<a title='" + hreftitle + "' href='javascript:popup(" + popuphtml + ")'>Failed</a>" + print "<td align='center' bgcolor=" , bgcolor , ">" , status , "</td>" + print "</tr>" + print "</table>" + c = c + len(cols) +print "</body></html>" diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/validate/server.pem b/LTA/LTAIngest/SOAPpy-0.12.0/validate/server.pem new file mode 100755 index 0000000000000000000000000000000000000000..28cfe15fbac3ebe496f188b3b8559c508895eb60 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/validate/server.pem @@ -0,0 +1,60 @@ +$Id: server.pem,v 1.1 2005/05/13 08:22:02 renting Exp $ +# Test certificate generated using CA.pl written by Steve Hensen +# bundled with OpenSSL. +# +# Steps used to generate server.pem : +# a)CA.pl -newca (creates a new CA heirarchy) +# b)CA.pl -newreq (creates a new certificate request) +# c)CA.pl -sign (sign the certificate request) +# d)openssl rsa <newreq.pem >newkey.pem (unencrypt the private key) +# e)Copy the certificate from newcert.pem, the unencrypted RSA +# private key from newkey.pem and the certificate request from +# newreq.pem and create server.pem to contain all three of them. 
+-----BEGIN CERTIFICATE----- +MIIDhjCCAu+gAwIBAgIBATANBgkqhkiG9w0BAQQFADCBgDELMAkGA1UEBhMCVVMx +CzAJBgNVBAgTAkNBMQswCQYDVQQHEwJSQzEQMA4GA1UEChMHYWN0emVybzETMBEG +A1UECxMKdGVjaG5vbG9neTEPMA0GA1UEAxMGc3lzYWRtMR8wHQYJKoZIhvcNAQkB +FhBpbmZvQGFjdHplcm8uY29tMB4XDTAxMDUxNjIyMzkwM1oXDTAyMDUxNjIyMzkw +M1owgYAxCzAJBgNVBAYTAlVTMQswCQYDVQQIEwJDQTELMAkGA1UEBxMCUkMxEDAO +BgNVBAoTB2FjdHplcm8xEzARBgNVBAsTCnRlY2hub2xvZ3kxDzANBgNVBAMTBnN5 +c2FkbTEfMB0GCSqGSIb3DQEJARYQaW5mb0BhY3R6ZXJvLmNvbTCBnzANBgkqhkiG +9w0BAQEFAAOBjQAwgYkCgYEAyRBB6l+DI3aMNeYf7IuodvZ9nNxnfQHVnGyRtwhb +1g2tugTwFsE67oHA5qvwaDBILtsqkr9agXYDbZwJmV58xtBY675tibf7/1R8mcDO +d4Dremdn0CMyk4+n6Z8GpLJ59TZ3y98DXUOqbLvzzltDz0si2XVa8G7f4K5k/xxB +GZcCAwEAAaOCAQwwggEIMAkGA1UdEwQCMAAwLAYJYIZIAYb4QgENBB8WHU9wZW5T +U0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQWBBT/DGQzyXlwLXMWMaT4 +lp9O928tvzCBrQYDVR0jBIGlMIGigBSdjwZua1AI3XoUtwLyW0Optc/4O6GBhqSB +gzCBgDELMAkGA1UEBhMCVVMxCzAJBgNVBAgTAkNBMQswCQYDVQQHEwJSQzEQMA4G +A1UEChMHYWN0emVybzETMBEGA1UECxMKdGVjaG5vbG9neTEPMA0GA1UEAxMGc3lz +YWRtMR8wHQYJKoZIhvcNAQkBFhBpbmZvQGFjdHplcm8uY29tggEAMA0GCSqGSIb3 +DQEBBAUAA4GBABQodV+rrwMsvTEEza08EeS1Rf2ISuzh6e9VbfiJLVB5Xv1SeEt1 +sOv8ETZyN/4OXvZWQG/5md/5NNkf5K6CeKiwctztkyKTXdPIFS6FJVZdduWhiWPF +6gutQgOogtpCHTLwdSDk75n5MXFlnehORqOREMqqCJtFlHMEV1211Ssi +-----END CERTIFICATE----- +-----BEGIN RSA PRIVATE KEY----- +MIICWwIBAAKBgQDJEEHqX4Mjdow15h/si6h29n2c3Gd9AdWcbJG3CFvWDa26BPAW +wTrugcDmq/BoMEgu2yqSv1qBdgNtnAmZXnzG0Fjrvm2Jt/v/VHyZwM53gOt6Z2fQ +IzKTj6fpnwaksnn1NnfL3wNdQ6psu/POW0PPSyLZdVrwbt/grmT/HEEZlwIDAQAB +AoGALcho6gBjsRCObrt+63MFokkQY0aAviNLy7mhGIdrufsVYvU64kOPsr2S+jOO +o3rTBPBc6ltuNWp072GHggfU61y4Bvfqxq2IRRDVH+yjmsdKSPYoBSIs3ZKjwJGx +pFAT1nfNP05MfqUwZm8HbTnqqakrWm0p53Zvv6NP3vNjmzECQQD6EK5a7bD7VSVz +MawUgUkZGZUtForbZL5nwIo1j94/TbnxUuuwej0MiCJ0MQsPCY/LML/gYaxTdQOg +qYkGyIAPAkEAzdXbgTc81FflECxc5CXw9Yi1g0+nMkH5drlk+sct5dCzokPJZBQ3 +oxIaQcJP/rUMgG0A2mSpOnbAHNHX+z/F+QJAEQGbafGqTJ1wy5HAOzDDsOJNg+B5 +lwwV6uZsP9JF8hYuJBxYjQrzJewIM9C2CNLEpbPuCKt71b0qfv2opP5zvwJAMyjh 
+WveAvgJuo5tzJx2rC0wEWXPVya8OMw0XZSFWbhV2YHFav+4qefSI5ClIurUDO3Rc +TuvQCAD19PPPK9qI+QJADpbLUWw8NsMaHpJgeigXVIsRtJcroDw2r87bJxsgcgQz +CsIH32VLvFOmpJdwnji6GX+vD2i0UH4ythnMCq4NUg== +-----END RSA PRIVATE KEY----- +-----BEGIN CERTIFICATE REQUEST----- +MIIBwTCCASoCAQAwgYAxCzAJBgNVBAYTAlVTMQswCQYDVQQIEwJDQTELMAkGA1UE +BxMCUkMxEDAOBgNVBAoTB2FjdHplcm8xEzARBgNVBAsTCnRlY2hub2xvZ3kxDzAN +BgNVBAMTBnN5c2FkbTEfMB0GCSqGSIb3DQEJARYQaW5mb0BhY3R6ZXJvLmNvbTCB +nzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyRBB6l+DI3aMNeYf7IuodvZ9nNxn +fQHVnGyRtwhb1g2tugTwFsE67oHA5qvwaDBILtsqkr9agXYDbZwJmV58xtBY675t +ibf7/1R8mcDOd4Dremdn0CMyk4+n6Z8GpLJ59TZ3y98DXUOqbLvzzltDz0si2XVa +8G7f4K5k/xxBGZcCAwEAAaAAMA0GCSqGSIb3DQEBBAUAA4GBAIoUVScm4lAkfo1o +n4b2Mpq3oV+dZnnTgYog4vmn/2UF0OSSTWlWPvINVkRtfg0iskZsbcWGn+RDY5e/ +aTqN7Xz+BV5XlbQLZzuQdKPsfBcZ766El1chmUuO5tELpFtQkmlAgAXRMuh0Xeb+ +A9wmVNyCMU6/+ajqwO642nSPOLM0 +-----END CERTIFICATE REQUEST----- diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/validate/silab.servers b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silab.servers new file mode 100755 index 0000000000000000000000000000000000000000..f6a17df4a4d91f17d0cf15c595e4648b31944b61 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silab.servers @@ -0,0 +1,264 @@ +# This list of servers was taken from the SOAPBuilders Interoperability Lab +# (http://www.xmethods.net/ilab/ilab.html) 4/23/01. 
+# +# $Id: silab.servers,v 1.1 2005/05/13 08:22:02 renting Exp $ + +Name: SOAP.py 0.9.6 (1999) +Endpoint: http://208.177.157.221:9595/xmethodsInterop +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: SOAP.py 0.9.6 (2001) +Like: SOAP.py 0.9.6 (1999) +Style: 2001 + +Name: Apache 2.1 +WSDL: http://www.xmethods.net/sd/interop/ApacheInterop11.wsdl +Endpoint: http://nagoya.apache.org:5089/soap/servlet/rpcrouter +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: echoFloatINF server returns 'Infinity' instead of 'INF' +Nonfunctional: echoFloatNegINF server returns '-Infinity' instead of '-INF' +Nonfunctional: echoStruct WSDL specifies 'inputStruct' parameter, method + takes 'echoStruct' parameter +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 not implemented by server + +Name: EasySoap++ +WSDL: http://easysoap.sourceforge.net/interop.wsdl +Endpoint: http://www.xmethods.net/c/easysoap.cgi +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 + +Name: eSoapServer +Endpoint: http://www.connecttel.com/cgi-bin/esoapserver.cgi +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: Frontier 7.0b43 +Endpoint: http://www.soapware.org:80/xmethodsInterop +SOAPAction: "/xmethodsInterop" +Namespace: http://soapinterop.org/ +Style: 2001 + +Name: 4S4C 1.3.3 +WSDL: http://soap.4s4c.com/ilab/soap.asp?WSDL +Endpoint: http://soap.4s4c.com/ilab/soap.asp +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: echoFloatINF server doesn't understand 'INF' +Nonfunctional: echoFloatNaN server doesn't understand 'NaN' +Nonfunctional: echoFloatNegINF server doesn't understand '-INF' + +Name: GLUE +WSDL: http://209.61.190.164:8004/glue/http://soapinterop.org/.wsdl +Endpoint: 
http://209.61.190.164:8004/glue/http://soapinterop.org/ +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 + +Name: HP SOAP +Page: http://soap.bluestone.com/interop/ +WSDL: http://soap.bluestone.com:80/interop/EchoService/EchoService.wsdl +Endpoint: http://soap.bluestone.com:80/scripts/SaISAPI.dll/SaServletEngine.class/hp-soap/soap/rpc/interop/EchoService +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: IDOOX WASP 1.0 +Page: http://soap.idoox.net:7080/IopResults/jsp/index.jsp +WSDL: http://soap.idoox.net:7080/soap/services/ilab.wsdl +Endpoint: http://soap.idoox.net:7080/soap/servlet/soap/ilab +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: Kafka XSLT Interop Service +Page: http://www.vbxml.com/soapworkshop/services/kafka10/services/interop.htm +WSDL: http://www.vbxml.com/soapworkshop/services/kafka10/services/endpoint.asp?service=ilab&type=wsdl +Endpoint: http://www.vbxml.com/soapworkshop/services/kafka10/services/endpoint.asp?service=ilab +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 not implemented by server +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 + +Name: MS ATL Server +WSDL: http://4.34.185.52/ilab/ilab.wsdl +Endpoint: http://4.34.185.52/ilab/ilab.dll?Handler=Default +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Style: 2001 +Typed: no + +Name: MS SOAP Toolkit 2.0 (typed) +Page: http://www.mssoapinterop.org/stk/ilab.htm +WSDL: http://www.mssoapinterop.org/stk/InteropTyped.wsdl +Endpoint: http://www.mssoapinterop.org/stk/InteropTyped.wsdl +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ 
+Nonfunctional: echoBase64 return value doesn't have a type +Nonfunctional: echoFloatINF server doesn't understand 'INF' +Nonfunctional: echoFloatNaN server doesn't understand 'NaN' +Nonfunctional: echoFloatNegINF server doesn't understand '-INF' + +Name: MS SOAP Toolkit 2.0 (untyped) +Like: MS SOAP Toolkit 2.0 (typed) +WSDL: http://www.mssoapinterop.org/stk/Interop.wsdl +Endpoint: http://www.mssoapinterop.org/stk/Interop.wsdl +Typed: no +Functional: echoBase64 + +Name: MS .NET Beta 2 (typed) +WSDL: http://www.mssoapinterop.org/test/typed.asmx?WSDL +Endpoint: http://www.mssoapinterop.org/test/typed.asmx +SOAPAction: "http://soapinterop.org/%(methodname)s" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 +Nonfunctional: echoDate server doesn't recognize time zone of Z +Nonfunctional: echoBase64 not implemented by server + +Name: MS .NET Beta 2 (untyped) +Like: MS .NET Beta 2 (typed) +WSDL: http://www.mssoapinterop.org/test/simple.asmx?WSDL +Endpoint: http://www.mssoapinterop.org/test/simple.asmx +Typed: no + +Name: MS .NET Remoting (1999 typed) +WSDL: http://www.mssoapinterop.org/DotNetRemoting1999Typed/InteropService.WSDL +Endpoint: http://www.mssoapinterop.org/DotNetRemoting1999Typed/InteropService.soap +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: MS .NET Remoting (1999 untyped) +WSDL: http://www.mssoapinterop.org/DotNetRemoting1999/InteropService.WSDL +Endpoint: http://www.mssoapinterop.org/DotNetRemoting1999/InteropService.soap +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Typed: no + +Name: MS .NET Remoting (2001 typed) +WSDL: http://www.mssoapinterop.org/DotNetRemoting2001Typed/InteropService.WSDL +Endpoint: http://www.mssoapinterop.org/DotNetRemoting2001Typed/InteropService.soap +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Style: 2001 + 
+Name: MS .NET Remoting (2001 untyped) +WSDL: http://www.mssoapinterop.org/DotNetRemoting2001/InteropService.WSDL +Endpoint: http://www.mssoapinterop.org/DotNetRemoting2001/InteropService.soap +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Typed: no +Style: 2001 + +Name: Phalanx +WSDL: http://www.phalanxsys.com/interop/interop.wsdl +Endpoint: http://www.phalanxsys.com/interop/listener.asp +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Style: 2001 + +Name: SOAP::Lite +WSDL: http://services.soaplite.com/interop.wsdl +Endpoint: http://services.soaplite.com/interop.cgi +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: SOAPR4 (1999) +Endpoint: http://www.jin.gr.jp/~nahi/Ruby/SOAP4R/SOAPBuildersInterop/1999/ +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 +Nonfunctional: echoVoid server return nil element instead of no elements + +Name: SOAPR4 (2001) +Like: SOAPR4 (1999) +Endpoint: http://www.jin.gr.jp/~nahi/Ruby/SOAP4R/SOAPBuildersInterop/ +Style: 2001 + +Name: SOAPx4 for PHP +Endpoint: http://dietrich.ganx4.com/soapx4/soap.php +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: actorShouldPass server returns type with no namespace +Nonfunctional: actorShouldFail server doesn't fail when mustUnderstand=1 +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 not implemented by server +Nonfunctional: echoFloat server returns type with no namespace +Nonfunctional: echoFloatArray server returns array elements as strings +Nonfunctional: echoFloatINF server returns type with no namespace +Nonfunctional: echoFloatNaN server returns type with no namespace +Nonfunctional: echoFloatNegINF returns float 0 instead of -INF and type + has no namespace +Nonfunctional: echoFloatNegZero 
returns 0 instead of -0 and type has no + namespace +Nonfunctional: echoInteger server returns type with no namespace +Nonfunctional: echoIntegerArray server returns array elements as strings +Nonfunctional: echoString server responds with fault when sent '<&>" +Nonfunctional: echoStringArray server responds with fault when an array + element is '<&>" +Nonfunctional: echoVeryLargeFloat server returns type with no namespace +Nonfunctional: echoVerySmallFloat server returns type with no namespace +Nonfunctional: echoVoid server doesn't return anything +Nonfunctional: mustUnderstandEqualsOne server doesn't fail when + mustUnderstand=1 +Nonfunctional: mustUnderstandEqualsZero server returns type with no namespace + +Name: SoapRMI +Endpoint: http://rainier.extreme.indiana.edu:1568 +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 not implemented by server + +Name: SQLData SOAP Server +WSDL: http://www.SoapClient.com/interop/SQLDataInterop.wsdl +Endpoint: http://www.soapclient.com/interop/sqldatainterop.wsdl +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ + +Name: White Mesa SOAP RPC 2.2 (1999) +WSDL: http://www.whitemesa.net/wsdl/interop.wsdl +Endpoint: http://www.whitemesa.net/interop +SOAPAction: "urn:soapinterop" +Namespace: http://soapinterop.org/ +Nonfunctional: echoFloatINF server doesn't understand 'INF' +Nonfunctional: echoFloatNaN server doesn't understand 'NaN' +Nonfunctional: echoFloatNegINF server doesn't understand '-INF' +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 server returns data containing control character +Style: 1999 +Typed: no + +Name: White Mesa SOAP RPC 2.2 (2001) +WSDL: http://www.whitemesa.net/wsdl/std/interop.wsdl +Endpoint: http://www.whitemesa.net/interop/std +SOAPAction: http://soapinterop.org/ +Namespace: http://soapinterop.org/ +Nonfunctional: echoFloatINF server doesn't understand 'INF' 
+Nonfunctional: echoFloatNaN server doesn't understand 'NaN' +Nonfunctional: echoFloatNegINF server doesn't understand '-INF' +Nonfunctional: echoDate not implemented by server +Nonfunctional: echoBase64 server returns data containing control character +Style: 2001 +Typed: no + +Name: Zolera SOAP Infrastructure +Endpoint: http://63.142.188.184:7000/ +SOAPAction: urn:soapinterop +Namespace: http://soapinterop.org/ +Style: 2001 +Nonfunctional: actorShouldPass server claims message is unparsable +Nonfunctional: echoBase64 server returns data with invalid type +Nonfunctional: echoVoid server doesn't return an empty return value +Nonfunctional: mustUnderstandEqualsZero server claims message is unparsable diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabclient.py b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabclient.py new file mode 100755 index 0000000000000000000000000000000000000000..7d9cd88d05c6037e80f9447e837065aa02503490 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabclient.py @@ -0,0 +1,699 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. + +# This set of clients validates when run against the servers in +# silab.servers. 
+ +import copy +import fileinput +import getopt +import re +import string +import sys +import time +import traceback + +sys.path.insert (1, '..') + +from SOAPpy import SOAP + +SOAP.Config.typesNamespace = SOAP.NS.XSD3 +SOAP.Config.typesNamespace = SOAP.NS.XSD3 + +ident = '$Id$' + +DEFAULT_SERVERS_FILE = 'silab.servers' + +DEFAULT_METHODS = \ + ( + 'actorShouldPass', 'actorShouldFail', + 'echoDate', 'echoBase64', + 'echoFloat', 'echoFloatArray', + 'echoFloatINF', 'echoFloatNaN', + 'echoFloatNegINF', 'echoFloatNegZero', + 'echoInteger', 'echoIntegerArray', + 'echoString', 'echoStringArray', + 'echoStruct', 'echoStructArray', + 'echoVeryLargeFloat', 'echoVerySmallFloat', + 'echoVoid', + 'mustUnderstandEqualsOne', 'mustUnderstandEqualsZero', + ) + + +def usage (error = None): + sys.stdout = sys.stderr + + if error != None: + print error + + print """usage: %s [options] [server ...] + If a long option shows an argument is mandatory, it's mandatory for the + equivalent short option also. + + -?, --help display this usage + -d, --debug turn on debugging in the SOAP library + -e, --exit-on-failure exit on the first (unexpected) failure + -h, --harsh turn on harsh testing: + - look for the documented error code from + mustUnderstand failures + - use non-ASCII strings in the string tests + -i, --invert test servers *not* in the list of servers given + -m, --method=METHOD#[,METHOD#...] + call only the given methods, specify a METHOD# of ? 
+ for the list of method numbers + -n, --no-stats, --no-statistics + don't display success and failure statistics + -N, --no-boring-stats, --no-boring-statistics + only display unexpected failures and unimplemented + tests, and only if non-zero + -o, --output=TYPE turn on output, TYPE is one or more of s(uccess), + f(ailure), n(ot implemented), F(ailed (as expected)), + a(ll) + [f] + -s, --servers=FILE use FILE as list of servers to test [%s] + -t, --stacktrace print a stack trace on each unexpected failure + -T, --always-stacktrace + print a stack trace on any failure +""" % (sys.argv[0], DEFAULT_SERVERS_FILE), + + sys.exit (0) + + +def methodUsage (): + sys.stdout = sys.stderr + + print "Methods are specified by number. Multiple methods can be " \ + "specified using a\ncomma-separated list of numbers or ranges. " \ + "For example 1,4-6,8 specifies\nmethods 1, 4, 5, 6, and 8.\n" + + print "The available methods are:\n" + + half = (len (DEFAULT_METHODS) + 1) / 2 + + for i in range (half): + print "%4d. %-25s" % (i + 1, DEFAULT_METHODS[i]), + if i + half < len (DEFAULT_METHODS): + print "%4d. %-25s" % (i + 1 + half, DEFAULT_METHODS[i + half]), + print + + sys.exit (0) + + +# as borrowed from jake.soapware.org for float compares. 
+def nearlyeq (a, b, prec = 1e-7): + return abs (a - b) <= abs (a) * prec + +def readServers (file): + servers = [] + names = {} + cur = None + + f = fileinput.input(file) + + for line in f: + if line[0] == '#': + continue + + if line == '' or line[0] == '\n': + cur = None + continue + + if cur == None: + cur = {'nonfunctional': {}, '_line': f.filelineno(), + '_file': f.filename()} + tag = None + servers.append (cur) + + if line[0] in string.whitespace: + if tag == 'nonfunctional': + value = method + ' ' + cur[tag][method] + else: + value = cur[tag] + value += ' ' + line.strip () + elif line[0] == '_': + raise ValueError, \ + "%s, line %d: can't have a tag starting with `_'" % \ + (f.filename(), f.filelineno()) + else: + tag, value = line.split (':', 1) + + tag = tag.strip ().lower () + value = value.strip () + + if value[0] == '"' and value[-1] == '"': + value = value[1:-1] + + if tag == 'typed': + if value.lower() in ('0', 'no', 'false'): + value = 0 + elif value.lower() in ('1', 'yes', 'false'): + value = 1 + else: + raise ValueError, \ + "%s, line %d: unknown typed value `%s'" % \ + (f.filename(), f.filelineno(), value) + elif tag == 'name': + if names.has_key(value): + old = names[value] + + raise ValueError, \ + "%s, line %d: already saw a server named `%s' " \ + "(on line %d of %s)" % \ + (f.filename(), f.filelineno(), value, + old['_line'], old['_file']) + names[value] = cur + + if tag == 'nonfunctional': + value = value.split (' ', 1) + [''] + + method = value[0] + cur[tag][method] = value[1] + elif tag == 'functional': + try: + del cur['nonfunctional'][value] + except: + raise ValueError, \ + "%s, line %d: `%s' not marked nonfunctional" % \ + (f.filename(), f.filelineno(), value) + elif tag == 'like': + try: + new = copy.deepcopy(names[value]) + except: + raise ValueError, \ + "%s, line %d: don't know about a server named `%s'" % \ + (f.filename(), f.filelineno(), value) + + # This is so we don't lose the nonfunctional methods in new or + # in cur + + 
new['nonfunctional'].update(cur['nonfunctional']) + del cur['nonfunctional'] + + new.update(cur) + + # This is because servers and possibly names has a reference to + # cur, so we have to keep working with cur so changes are + # reflected in servers and names. + + cur.update(new) + else: + cur[tag] = value + + return servers + +def str2list (s): + l = {} + + for i in s.split (','): + if i.find ('-') != -1: + i = i.split ('-') + for i in range (int (i[0]),int (i[1]) + 1): + l[i] = 1 + else: + l[int (i)] = 1 + + l = l.keys () + l.sort () + + return l + +def testActorShouldPass (server, action, harsh): + test = 42 + server = server._sa (action % {'methodname': 'echoInteger'}) + hd = SOAP.headerType () + hd.InteropTestHeader = SOAP.stringType ("This shouldn't fault because " + "the mustUnderstand attribute is 0") + hd.InteropTestHeader._setMustUnderstand (0) + hd.InteropTestHeader._setActor ( + 'http://schemas.xmlsoap.org/soap/actor/next') + server = server._hd (hd) + + result = server.echoInteger (inputInteger = test) + + if not SOAP.Config.typed: + result = int (result) + + if result != test: + raise Exception, "expected %s, got %s" % (test, result) + +def testActorShouldFail (server, action, harsh): + test = 42 + server = server._sa (action % {'methodname': 'echoInteger'}) + hd = SOAP.headerType () + hd.InteropTestHeader = SOAP.stringType ("This should fault because " + "the mustUnderstand attribute is 1") + hd.InteropTestHeader._setMustUnderstand (1) + hd.InteropTestHeader._setActor ( + 'http://schemas.xmlsoap.org/soap/actor/next') + server = server._hd (hd) + + try: + result = server.echoInteger (inputInteger = test) + except SOAP.faultType, e: + if harsh and e.faultcode != 'SOAP-ENV:MustUnderstand': + raise AttributeError, "unexpected faultcode %s" % e.faultcode + return + + raise Exception, "should fail, succeeded with %s" % result + +def testEchoFloat (server, action, harsh): + server = server._sa (action % {'methodname': 'echoFloat'}) + + for test in (0.0, 
1.0, -1.0, 3853.33333333): + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float (result) + + if not nearlyeq (result, test): + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoFloatArray (server, action, harsh): + test = [0.0, 1.0, -1.0, 3853.33333333] + server = server._sa (action % {'methodname': 'echoFloatArray'}) + result = server.echoFloatArray (inputFloatArray = test) + + for i in range (len (test)): + if not SOAP.Config.typed: + result[i] = float (result[i]) + + if not nearlyeq (result[i], test[i]): + raise Exception, "@ %d expected %s, got %s" % \ + (i, repr (test), repr (result)) + +def testEchoFloatINF (server, action, harsh): + try: + test = float ('INF') + except: + test = float (1e300**2) + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float (result) + + if result != test: + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoFloatNaN (server, action, harsh): + try: + test = float ('NaN') + except: + test = float (0.0) + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float (result) + + if result != test: + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoFloatNegINF (server, action, harsh): + try: + test = float ('-INF') + except: + test = float (-1e300**2) + + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float (result) + + if result != test: + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoFloatNegZero (server, action, harsh): + test = float ('-0.0') + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float 
(result) + + if result != test: + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoInteger (server, action, harsh): + server = server._sa (action % {'methodname': 'echoInteger'}) + + for test in (0, 1, -1, 3853): + result = server.echoInteger (inputInteger = test) + + if not SOAP.Config.typed: + result = int (result) + + if result != test: + raise Exception, "expected %.8f, got %.8f" % (test, result) + +def testEchoIntegerArray (server, action, harsh): + test = [0, 1, -1, 3853] + server = server._sa (action % {'methodname': 'echoIntegerArray'}) + result = server.echoIntegerArray (inputIntegerArray = test) + + for i in range (len (test)): + if not SOAP.Config.typed: + result[i] = int (result[i]) + + if result[i] != test[i]: + raise Exception, "@ %d expected %s, got %s" % \ + (i, repr (test), repr (result)) + +relaxedStringTests = ['', 'Hello', '\'<&>"',] +relaxedStringTests = ['Hello', '\'<&>"',] +harshStringTests = ['', 'Hello', '\'<&>"', + u'\u0041', u'\u00a2', u'\u0141', u'\u2342', + u'\'<\u0041&>"', u'\'<\u00a2&>"', u'\'<\u0141&>"', u'\'<\u2342&>"',] + +def testEchoString (server, action, harsh): + if harsh: + test = harshStringTests + else: + test = relaxedStringTests + server = server._sa (action % {'methodname': 'echoString'}) + + for test in test: + result = server.echoString (inputString = test) + + if result != test: + raise Exception, "expected %s, got %s" % \ + (repr (test), repr (result)) + +def testEchoStringArray (server, action, harsh): + if harsh: + test = harshStringTests + else: + test = relaxedStringTests + server = server._sa (action % {'methodname': 'echoStringArray'}) + result = server.echoStringArray (inputStringArray = test) + + if result != test: + raise Exception, "expected %s, got %s" % (repr (test), repr (result)) + +def testEchoStruct (server, action, harsh): + test = {'varFloat': 2.256, 'varInt': 474, 'varString': 'Utah'} + server = server._sa (action % {'methodname': 'echoStruct'}) + result = 
server.echoStruct (inputStruct = test) + + if not SOAP.Config.typed: + result.varFloat = float (result.varFloat) + result.varInt = int (result.varInt) + + if not nearlyeq (test['varFloat'], result.varFloat): + raise Exception, ".varFloat expected %s, got %s" % \ + (i, repr (test['varFloat']), repr (result.varFloat)) + + for i in test.keys (): + if i == 'varFloat': + continue + + if test[i] != getattr (result, i): + raise Exception, ".%s expected %s, got %s" % \ + (i, repr (test[i]), repr (getattr (result, i))) + + +def testEchoStructArray (server, action, harsh): + test = [{'varFloat': -5.398, 'varInt': -546, 'varString': 'West Virginia'}, + {'varFloat': -9.351, 'varInt': -641, 'varString': 'New Mexico'}, + {'varFloat': 1.495, 'varInt': -819, 'varString': 'Missouri'}] + server = server._sa (action % {'methodname': 'echoStructArray'}) + result = server.echoStructArray (inputStructArray = test) + + for s in range (len (test)): + if not SOAP.Config.typed: + result[s].varFloat = float (result[s].varFloat) + result[s].varInt = int (result[s].varInt) + + if not nearlyeq (test[s]['varFloat'], result[s].varFloat): + raise Exception, \ + "@ %d.varFloat expected %s, got %s" % \ + (s, repr (test[s]['varFloat']), repr (result[s].varFloat)) + + for i in test[s].keys (): + if i == 'varFloat': + continue + + if test[s][i] != getattr (result[s], i): + raise Exception, "@ %d.%s expected %s, got %s" % \ + (s, i, repr (test[s][i]), repr (getattr (result[s], i))) + +def testEchoVeryLargeFloat (server, action, harsh): + test = 2.2535e29 + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + + if not SOAP.Config.typed: + result = float (result) + + if not nearlyeq (result, test): + raise Exception, "expected %s, got %s" % (repr (test), repr (result)) + +def testEchoVerySmallFloat (server, action, harsh): + test = 2.2535e29 + server = server._sa (action % {'methodname': 'echoFloat'}) + result = server.echoFloat (inputFloat = test) + 
+ if not SOAP.Config.typed: + result = float (result) + + if not nearlyeq (result, test): + raise Exception, "expected %s, got %s" % (repr (test), repr (result)) + +def testEchoVoid (server, action, harsh): + server = server._sa (action % {'methodname': 'echoVoid'}) + result = server.echoVoid () + + for k in result.__dict__.keys (): + if k[0] != '_': + raise Exception, "expected an empty structType, got %s" % \ + repr (result.__dict__) + +def testMustUnderstandEqualsOne (server, action, harsh): + test = 42 + server = server._sa (action % {'methodname': 'echoInteger'}) + hd = SOAP.headerType () + hd.MustUnderstandThis = SOAP.stringType ("This should fault because " + "the mustUnderstand attribute is 1") + hd.MustUnderstandThis._setMustUnderstand (1) + server = server._hd (hd) + + try: + result = server.echoInteger (inputInteger = test) + except SOAP.faultType, e: + if harsh and e.faultcode != 'SOAP-ENV:MustUnderstand': + raise AttributeError, "unexpected faultcode %s" % e.faultcode + return + + raise Exception, "should fail, succeeded with %s" % result + +def testMustUnderstandEqualsZero (server, action, harsh): + test = 42 + server = server._sa (action % {'methodname': 'echoInteger'}) + hd = SOAP.headerType () + hd.MustUnderstandThis = SOAP.stringType ("This shouldn't fault because " + "the mustUnderstand attribute is 0") + hd.MustUnderstandThis._setMustUnderstand (0) + server = server._hd (hd) + + result = server.echoInteger (inputInteger = test) + + if not SOAP.Config.typed: + result = int (result) + + if result != test: + raise Exception, "expected %s, got %s" % (test, result) + +def testEchoDate (server, action, harsh): + test = time.gmtime (time.time ()) + server = server._sa (action % {'methodname': 'echoDate'}) + if SOAP.Config.namespaceStyle == '1999': + result = server.echoDate (inputDate = SOAP.timeInstantType (test)) + else: + result = server.echoDate (inputDate = SOAP.dateTimeType (test)) + + if not SOAP.Config.typed and type (result) in (type (''), 
type (u'')): + p = SOAP.SOAPParser() + result = p.convertDateTime(result, 'timeInstant') + + if result != test[:6]: + raise Exception, "expected %s, got %s" % (repr (test), repr (result)) + +def testEchoBase64 (server, action, harsh): + test = '\x00\x10\x20\x30\x40\x50\x60\x70\x80\x90\xa0\xb0\xc0\xd0\xe0\xf0' + server = server._sa (action % {'methodname': 'echoBase64'}) + result = server.echoBase64 (inputBase64 = SOAP.base64Type (test)) + + if not SOAP.Config.typed: + import base64 + result = base64.decodestring(result) + + if result != test: + raise Exception, "expected %s, got %s" % (repr (test), repr (result)) + + +def main (): + stats = 1 + total = 0 + fail = 0 + failok = 0 + succeed = 0 + exitonfailure = 0 + harsh = 0 + invert = 0 + printtrace = 0 + methodnums = None + notimp = 0 + output = 'f' + servers = DEFAULT_SERVERS_FILE + + started = time.time () + + try: + opts, args = getopt.getopt (sys.argv[1:], '?dehim:nNo:s:tT', + ['help', 'debug', 'exit-on-failure', 'harsh', 'invert', + 'method', 'no-stats', 'no-statistics', + 'no-boring-statistics', 'no-boring-stats', 'output', + 'servers=', 'stacktrace', 'always-stacktrace']) + + for opt, arg in opts: + if opt in ('-?', '--help'): + usage () + elif opt in ('-d', '--debug'): + SOAP.Config.debug = 1 + elif opt in ('-h', '--harsh'): + harsh = 1 + elif opt in ('-i', '--invert'): + invert = 1 + elif opt in ('-e', '--exit-on-failure'): + exitonfailure = 1 + elif opt in ('-m', '--method'): + if arg == '?': + methodUsage () + methodnums = str2list (arg) + elif opt in ('-n', '--no-stats', '--no-statistics'): + stats = 0 + elif opt in ('-N', '--no-boring-stats', '--no-boring-statistics'): + stats = -1 + elif opt in ('-o', '--output'): + output = arg + elif opt in ('-s', '--servers'): + servers = arg + elif opt in ('-t', '--stacktrace'): + printtrace = 1 + elif opt in ('-T', '--always-stacktrace'): + printtrace = 2 + else: + raise AttributeError, \ + "Recognized but unimplemented option `%s'" % opt + except SystemExit: + 
raise + except: + usage (sys.exc_info ()[1]) + + if 'a' in output: + output = 'fFns' + + servers = readServers (servers) + + if methodnums == None: + methodnums = range (1, len (DEFAULT_METHODS) + 1) + + limitre = re.compile ('|'.join (args), re.IGNORECASE) + + for s in servers: + if (not not limitre.match (s['name'])) == invert: + continue + + try: typed = s['typed'] + except: typed = 1 + + try: style = s['style'] + except: style = 1999 + + SOAP.Config.typed = typed + SOAP.Config.namespaceStyle = style + + server = SOAP.SOAPProxy (s['endpoint'], ("m", s['namespace'])) + + for num in (methodnums): + if num > len (DEFAULT_METHODS): + break + + total += 1 + + name = DEFAULT_METHODS[num - 1] + + title = '%s: %s (#%d)' % (s['name'], name, num) + + if SOAP.Config.debug: + print "%s:" % title + + try: + fn = globals ()['test' + name[0].upper () + name[1:]] + except KeyboardInterrupt: + raise + except: + if 'n' in output: + print title, "test not yet implemented" + notimp += 1 + continue + + try: + fn (server, s['soapaction'], harsh) + if s['nonfunctional'].has_key (name): + print title, \ + "succeeded despite being marked nonfunctional" + if 's' in output: + print title, "succeeded" + succeed += 1 + except KeyboardInterrupt: + raise + except: + fault = str (sys.exc_info ()[1]) + if fault[-1] == '\n': + fault = fault[:-1] + + if s['nonfunctional'].has_key (name): + if 'F' in output: + t = 'as expected' + if s['nonfunctional'][name] != '': + t += ', ' + s['nonfunctional'][name] + print title, "failed (%s) -" % t, fault + if printtrace > 1: + traceback.print_exc () + failok += 1 + else: + if 'f' in output: + print title, "failed -", fault + if printtrace: + traceback.print_exc () + fail += 1 + + if exitonfailure: + return -1 + + if stats: + print " Tests started at:", time.ctime (started) + if stats > 0: + print " Total tests: %d" % total + print " Successes: %d (%3.2f%%)" % \ + (succeed, 100.0 * succeed / total) + if stats > 0 or fail > 0: + print "Failed unexpectedly: %d 
(%3.2f%%)" % \ + (fail, 100.0 * fail / total) + if stats > 0: + print " Failed as expected: %d (%3.2f%%)" % \ + (failok, 100.0 * failok / total) + if stats > 0 or notimp > 0: + print " Not implemented: %d (%3.2f%%)" % \ + (notimp, 100.0 * notimp / total) + + return fail + notimp + +if __name__ == '__main__': + try: + sys.exit (main ()) + except KeyboardInterrupt: + sys.exit (0) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabserver.py b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabserver.py new file mode 100755 index 0000000000000000000000000000000000000000..51c143a53b4de247c5b320bd9dd560033e57a3f4 --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/validate/silabserver.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +# Copyright (c) 2001 actzero, inc. All rights reserved. + +# This is a server for the XMethods matrix +# (http://jake.soapware.org/currentXmethodsResults). + +import getopt +import sys + +sys.path.insert (1, '..') + +from SOAPpy import SOAP + +if SOAP.Config.SSLserver: + from M2Crypto import SSL + +ident = '$Id$' + +def echoFloat (inputFloat): + return inputFloat + +def echoFloatArray (inputFloatArray): + return inputFloatArray + +def echoInteger (inputInteger): + return inputInteger + +def echoIntegerArray (inputIntegerArray): + return inputIntegerArray + +def echoString (inputString): + return inputString + +def echoStringArray (inputStringArray): + return inputStringArray + +def echoStruct (inputStruct): + return inputStruct + +def echoStructArray (inputStructArray): + return inputStructArray + +def echoVoid (): + return SOAP.voidType() + +def echoDate (inputDate): + return SOAP.dateTimeType (inputDate) + +def echoBase64 (inputBase64): + return SOAP.binaryType (inputBase64) + +namespace = 'http://soapinterop.org/' + +DEFAULT_HOST = 'localhost' +DEFAULT_HTTP_PORT = 8080 +DEFAULT_HTTPS_PORT = 8443 + +def usage (error = None): + sys.stdout = sys.stderr + + if error != None: + print error + + print """usage: %s [options] + If a long option shows an 
argument is mandatory, it's mandatory for the + equivalent short option also. The default (if any) is shown in brackets. + + -?, --help display this usage + -h, --host=HOST use HOST in the address to listen on [%s] + -p, --port=PORT listen on PORT [%d] +""" % (sys.argv[0], DEFAULT_HOST, DEFAULT_HTTP_PORT), + + if SOAP.Config.SSLserver: + print " -s, --ssl serve using SSL" + + sys.exit (0) + +def main (): + host = DEFAULT_HOST + port = None + ssl = 0 + + try: + opts = '?h:p:' + args = ['help', 'host', 'port'] + + if SOAP.Config.SSLserver: + opts += 's' + args += ['ssl'] + + opts, args = getopt.getopt (sys.argv[1:], opts, args) + + for opt, arg in opts: + if opt in ('-?', '--help'): + usage () + elif opt in ('-h', '--host'): + host = arg + elif opt in ('-p', '--port'): + port = int (arg) + elif opt in ('-s', '--ssl'): + ssl = 1 + else: + raise AttributeError, \ + "Recognized but unimplemented option `%s'" % opt + except SystemExit: + raise + except: + usage (sys.exc_info ()[1]) + + if port == None: + port = [DEFAULT_HTTP_PORT, DEFAULT_HTTPS_PORT][ssl] + + if ssl: + ssl_context = SSL.Context() + ssl_context.load_cert('server.pem') + else: + ssl_context = None + + server = SOAP.SOAPServer ((host, port), namespace = namespace, + ssl_context = ssl_context) + + server.registerFunction (echoFloat) + server.registerFunction (echoFloatArray) + server.registerFunction (echoInteger) + server.registerFunction (echoIntegerArray) + server.registerFunction (echoString) + server.registerFunction (echoStringArray) + server.registerFunction (echoStruct) + server.registerFunction (echoStructArray) + server.registerFunction (echoVoid) + server.registerFunction (echoDate) + server.registerFunction (echoBase64) + + server.serve_forever() + +if __name__ == '__main__': + try: + sys.exit (main ()) + except KeyboardInterrupt: + sys.exit (0) diff --git a/LTA/LTAIngest/SOAPpy-0.12.0/validate/soapware.py b/LTA/LTAIngest/SOAPpy-0.12.0/validate/soapware.py new file mode 100755 index 
0000000000000000000000000000000000000000..4dc7ba0830bc6e1cb36db4fbed841d35df96d14b --- /dev/null +++ b/LTA/LTAIngest/SOAPpy-0.12.0/validate/soapware.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# This server validates as of 4/23/01 when run with UserLand's SOAP validator +# (http://validator.soapware.org/). + +import getopt +import sys + +sys.path.insert (1, '..') + +from SOAPpy import SOAP + +ident = '$Id$' + +def whichToolkit (): + return SOAP.SOAPUserAgent () + +def countTheEntities (s): + counts = {'ctLeftAngleBrackets': 0, 'ctRightAngleBrackets': 0, + 'ctAmpersands': 0, 'ctApostrophes': 0, 'ctQuotes': 0} + + for i in s: + if i == '<': + counts['ctLeftAngleBrackets'] += 1 + elif i == '>': + counts['ctRightAngleBrackets'] += 1 + elif i == '&': + counts['ctAmpersands'] += 1 + elif i == "'": + counts['ctApostrophes'] += 1 + elif i == '"': + counts['ctQuotes'] += 1 + + return counts + +def easyStructTest (stooges): + return stooges['larry'] + stooges['moe'] + stooges['curly'] + +def echoStructTest (myStruct): + return myStruct + +def manyTypesTest (num, bool, state, doub, dat, bin): + return [num, SOAP.booleanType (bool), state, doub, + SOAP.dateTimeType (dat), bin] + +def moderateSizeArrayCheck (myArray): + return myArray[0] + myArray[-1] + +def nestedStructTest (myStruct): + return easyStructTest (myStruct.year2000.month04.day01) + +def simpleStructReturnTest (myNumber): + return {'times10': myNumber * 10, 'times100': myNumber * 100, + 'times1000': myNumber * 1000} + +namespace = 'http://www.soapware.org/' + +DEFAULT_HOST = 'localhost' +DEFAULT_PORT = 8080 + +def usage (error = None): + sys.stdout = sys.stderr + + if error != None: + print error + + print """usage: %s [options] + If a long option shows an argument is mandatory, it's mandatory for the + equivalent short option also. The default (if any) is shown in brackets. 
+ + -?, --help display this usage + -h, --host=HOST use HOST in the address to listen on [%s] + -p, --port=PORT listen on PORT [%d] +""" % (sys.argv[0], DEFAULT_HOST, DEFAULT_PORT), + + sys.exit (0) + +def main (): + host = DEFAULT_HOST + port = DEFAULT_PORT + + try: + opts, args = getopt.getopt (sys.argv[1:], '?h:p:', + ['help', 'host', 'port']) + + for opt, arg in opts: + if opt in ('-?', '--help'): + usage () + elif opt in ('-h', '--host'): + host = arg + elif opt in ('-p', '--port'): + port = int (arg) + else: + raise AttributeError, \ + "Recognized but unimplemented option `%s'" % opt + except SystemExit: + raise + except: + usage (sys.exc_info ()[1]) + + server = SOAP.SOAPServer ((host, port)) + + server.registerFunction (whichToolkit, namespace) + server.registerFunction (countTheEntities) + server.registerFunction (easyStructTest) + server.registerFunction (echoStructTest) + server.registerFunction (manyTypesTest) + server.registerFunction (moderateSizeArrayCheck) + server.registerFunction (nestedStructTest) + server.registerFunction (simpleStructReturnTest) + + server.serve_forever() + +if __name__ == '__main__': + try: + sys.exit (main ()) + except KeyboardInterrupt: + sys.exit (0) diff --git a/LTA/LTAIngest/__init__.py b/LTA/LTAIngest/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LTA/LTAIngest/dav/.serverrc b/LTA/LTAIngest/dav/.serverrc new file mode 100644 index 0000000000000000000000000000000000000000..991ac5ee7467e6676481319bd99ec22ffdd81017 --- /dev/null +++ b/LTA/LTAIngest/dav/.serverrc @@ -0,0 +1,54 @@ +name: wop19 +host: 195.169.63.139 +username: pipeline +password: Vser$%er$%esdd4462r2TG%db4t34yhqD5yhw&Bb +localdir: /wop19_2/pipeline +remotedir: /wop19_2/pipeline/ftp + +name: wsrtweb +host: wsrtweb.astron.nl +port: 80 +username: ConversionPipeline +password: cpipeline + +name: webdav_momtest_nico +username: DistributionPipeline +host: 10.87.6.36 +remotedir: 
/repository2 +port: 8080 +password: dpipeline + +name: webdav_momtest +username: DistributionPipeline +host: momtest.astron.nl +remotedir: /repository2 +port: 8080 +password: dpipeline + +name: webdav_lofartest_repository2 +username: DistributionPipeline +host: lofartest.astron.nl +remotedir: /repository2 +port: 8080 +password: dpipeline + +name: webdav_lofar +username: DistributionPipeline +host: lofar.astron.nl +remotedir: /repository2 +port: 8080 +password: dpipeline + +name: webdav_mom_repository2 +username: DistributionPipeline +host: wsrt.astron.nl +remotedir: /repository2 +port: 8080 +password: dpipeline + +name: webdav_lofar_repository2 +username: DistributionPipeline +host: lofar.astron.nl +remotedir: /repository2 +port: 8080 +password: dpipeline \ No newline at end of file diff --git a/LTA/LTAIngest/dav/davhttplib.py b/LTA/LTAIngest/dav/davhttplib.py new file mode 100644 index 0000000000000000000000000000000000000000..b25feb61dbe373fc8a5375ac88ee1dc63aa244b1 --- /dev/null +++ b/LTA/LTAIngest/dav/davhttplib.py @@ -0,0 +1,1459 @@ +"""HTTP/1.1 client library + +<intro stuff goes here> +<other stuff, too> + +HTTPConnection goes through a number of "states", which define when a client +may legally make another request or fetch the response for a particular +request. 
This diagram details these state transitions: + + (null) + | + | HTTPConnection() + v + Idle + | + | putrequest() + v + Request-started + | + | ( putheader() )* endheaders() + v + Request-sent + | + | response = getresponse() + v + Unread-response [Response-headers-read] + |\____________________ + | | + | response.read() | putrequest() + v v + Idle Req-started-unread-response + ______/| + / | + response.read() | | ( putheader() )* endheaders() + v v + Request-started Req-sent-unread-response + | + | response.read() + v + Request-sent + +This diagram presents the following rules: + -- a second request may not be started until {response-headers-read} + -- a response [object] cannot be retrieved until {request-sent} + -- there is no differentiation between an unread response body and a + partially read response body + +Note: this enforcement is applied by the HTTPConnection class. The + HTTPResponse class does not enforce this state machine, which + implies sophisticated clients may accelerate the request/response + pipeline. Caution should be taken, though: accelerating the states + beyond the above pattern may imply knowledge of the server's + connection-close behavior for certain requests. For example, it + is impossible to tell whether the server will close the connection + UNTIL the response headers have been read; this means that further + requests cannot be placed into the pipeline until it is known that + the server will NOT be closing the connection. 
+ +Logical State __state __response +------------- ------- ---------- +Idle _CS_IDLE None +Request-started _CS_REQ_STARTED None +Request-sent _CS_REQ_SENT None +Unread-response _CS_IDLE <response_class> +Req-started-unread-response _CS_REQ_STARTED <response_class> +Req-sent-unread-response _CS_REQ_SENT <response_class> +""" + +import errno +import mimetools +import socket +from urlparse import urlsplit + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +__all__ = ["HTTP", "HTTPResponse", "HTTPConnection", "HTTPSConnection", + "HTTPException", "NotConnected", "UnknownProtocol", + "UnknownTransferEncoding", "UnimplementedFileMode", + "IncompleteRead", "InvalidURL", "ImproperConnectionState", + "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", + "BadStatusLine", "error", "responses"] + +HTTP_PORT = 80 +HTTPS_PORT = 443 + +_UNKNOWN = 'UNKNOWN' + +# connection states +_CS_IDLE = 'Idle' +_CS_REQ_STARTED = 'Request-started' +_CS_REQ_SENT = 'Request-sent' + +# status codes +# informational +CONTINUE = 100 +SWITCHING_PROTOCOLS = 101 +PROCESSING = 102 + +# successful +OK = 200 +CREATED = 201 +ACCEPTED = 202 +NON_AUTHORITATIVE_INFORMATION = 203 +NO_CONTENT = 204 +RESET_CONTENT = 205 +PARTIAL_CONTENT = 206 +MULTI_STATUS = 207 +IM_USED = 226 + +# redirection +MULTIPLE_CHOICES = 300 +MOVED_PERMANENTLY = 301 +FOUND = 302 +SEE_OTHER = 303 +NOT_MODIFIED = 304 +USE_PROXY = 305 +TEMPORARY_REDIRECT = 307 + +# client error +BAD_REQUEST = 400 +UNAUTHORIZED = 401 +PAYMENT_REQUIRED = 402 +FORBIDDEN = 403 +NOT_FOUND = 404 +METHOD_NOT_ALLOWED = 405 +NOT_ACCEPTABLE = 406 +PROXY_AUTHENTICATION_REQUIRED = 407 +REQUEST_TIMEOUT = 408 +CONFLICT = 409 +GONE = 410 +LENGTH_REQUIRED = 411 +PRECONDITION_FAILED = 412 +REQUEST_ENTITY_TOO_LARGE = 413 +REQUEST_URI_TOO_LONG = 414 +UNSUPPORTED_MEDIA_TYPE = 415 +REQUESTED_RANGE_NOT_SATISFIABLE = 416 +EXPECTATION_FAILED = 417 +UNPROCESSABLE_ENTITY = 422 +LOCKED = 423 +FAILED_DEPENDENCY = 424 
+UPGRADE_REQUIRED = 426 + +# server error +INTERNAL_SERVER_ERROR = 500 +NOT_IMPLEMENTED = 501 +BAD_GATEWAY = 502 +SERVICE_UNAVAILABLE = 503 +GATEWAY_TIMEOUT = 504 +HTTP_VERSION_NOT_SUPPORTED = 505 +INSUFFICIENT_STORAGE = 507 +NOT_EXTENDED = 510 + +# Mapping status codes to official W3C names +responses = { + 100: 'Continue', + 101: 'Switching Protocols', + + 200: 'OK', + 201: 'Created', + 202: 'Accepted', + 203: 'Non-Authoritative Information', + 204: 'No Content', + 205: 'Reset Content', + 206: 'Partial Content', + + 300: 'Multiple Choices', + 301: 'Moved Permanently', + 302: 'Found', + 303: 'See Other', + 304: 'Not Modified', + 305: 'Use Proxy', + 306: '(Unused)', + 307: 'Temporary Redirect', + + 400: 'Bad Request', + 401: 'Unauthorized', + 402: 'Payment Required', + 403: 'Forbidden', + 404: 'Not Found', + 405: 'Method Not Allowed', + 406: 'Not Acceptable', + 407: 'Proxy Authentication Required', + 408: 'Request Timeout', + 409: 'Conflict', + 410: 'Gone', + 411: 'Length Required', + 412: 'Precondition Failed', + 413: 'Request Entity Too Large', + 414: 'Request-URI Too Long', + 415: 'Unsupported Media Type', + 416: 'Requested Range Not Satisfiable', + 417: 'Expectation Failed', + + 500: 'Internal Server Error', + 501: 'Not Implemented', + 502: 'Bad Gateway', + 503: 'Service Unavailable', + 504: 'Gateway Timeout', + 505: 'HTTP Version Not Supported', +} + +# maximal amount of data to read at one time in _safe_read +MAXAMOUNT = 1048576 + +class HTTPMessage(mimetools.Message): + + def addheader(self, key, value): + """Add header for field key handling repeats.""" + prev = self.dict.get(key) + if prev is None: + self.dict[key] = value + else: + combined = ", ".join((prev, value)) + self.dict[key] = combined + + def addcontinue(self, key, more): + """Add more field data from a continuation line.""" + prev = self.dict[key] + self.dict[key] = prev + "\n " + more + + def readheaders(self): + """Read header lines. 
+ + Read header lines up to the entirely blank line that terminates them. + The (normally blank) line that ends the headers is skipped, but not + included in the returned list. If a non-header line ends the headers, + (which is an error), an attempt is made to backspace over it; it is + never included in the returned list. + + The variable self.status is set to the empty string if all went well, + otherwise it is an error message. The variable self.headers is a + completely uninterpreted list of lines contained in the header (so + printing them will reproduce the header exactly as it appears in the + file). + + If multiple header fields with the same name occur, they are combined + according to the rules in RFC 2616 sec 4.2: + + Appending each subsequent field-value to the first, each separated + by a comma. The order in which header fields with the same field-name + are received is significant to the interpretation of the combined + field value. + """ + # XXX The implementation overrides the readheaders() method of + # rfc822.Message. The base class design isn't amenable to + # customized behavior here so the method here is a copy of the + # base class code with a few small changes. + + self.dict = {} + self.unixfrom = '' + self.headers = hlist = [] + self.status = '' + headerseen = "" + firstline = 1 + startofline = unread = tell = None + if hasattr(self.fp, 'unread'): + unread = self.fp.unread + elif self.seekable: + tell = self.fp.tell + while True: + if tell: + try: + startofline = tell() + except IOError: + startofline = tell = None + self.seekable = 0 + line = self.fp.readline() + if not line: + self.status = 'EOF in headers' + break + # Skip unix From name time lines + if firstline and line.startswith('From '): + self.unixfrom = self.unixfrom + line + continue + firstline = 0 + if headerseen and line[0] in ' \t': + # XXX Not sure if continuation lines are handled properly + # for http and/or for repeating headers + # It's a continuation line. 
+ hlist.append(line) + self.addcontinue(headerseen, line.strip()) + continue + elif self.iscomment(line): + # It's a comment. Ignore it. + continue + elif self.islast(line): + # Note! No pushback here! The delimiter line gets eaten. + break + headerseen = self.isheader(line) + if headerseen: + # It's a legal header line, save it. + hlist.append(line) + self.addheader(headerseen, line[len(headerseen)+1:].strip()) + continue + else: + # It's not a header line; throw it back and stop here. + if not self.dict: + self.status = 'No headers' + else: + self.status = 'Non-header line where header expected' + # Try to undo the read. + if unread: + unread(line) + elif tell: + self.fp.seek(startofline) + else: + self.status = self.status + '; bad seek' + break + +class HTTPResponse: + + # strict: If true, raise BadStatusLine if the status line can't be + # parsed as a valid HTTP/1.0 or 1.1 status line. By default it is + # false because it prevents clients from talking to HTTP/0.9 + # servers. Note that a response with a sufficiently corrupted + # status line will look like an HTTP/0.9 response. + + # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details. + + def __init__(self, sock, debuglevel=0, strict=0, method=None): + self.fp = sock.makefile('rb', 0) + self.debuglevel = debuglevel + self.strict = strict + self._method = method + + self.msg = None + + # from the Status-Line of the response + self.version = _UNKNOWN # HTTP-Version + self.status = _UNKNOWN # Status-Code + self.reason = _UNKNOWN # Reason-Phrase + + self.chunked = _UNKNOWN # is "chunked" being used? 
+ self.chunk_left = _UNKNOWN # bytes left to read in current chunk + self.length = _UNKNOWN # number of bytes left in response + self.will_close = _UNKNOWN # conn will close at end of response + + def _read_status(self): + # Initialize with Simple-Response defaults + line = self.fp.readline() + if self.debuglevel > 0: + print "reply:", repr(line) + if not line: + # Presumably, the server closed the connection before + # sending a valid response. + raise BadStatusLine(line) + try: + [version, status, reason] = line.split(None, 2) + except ValueError: + try: + [version, status] = line.split(None, 1) + reason = "" + except ValueError: + # empty version will cause next test to fail and status + # will be treated as 0.9 response. + version = "" + if not version.startswith('HTTP/'): + if self.strict: + self.close() + raise BadStatusLine(line) + else: + # assume it's a Simple-Response from an 0.9 server + self.fp = LineAndFileWrapper(line, self.fp) + return "HTTP/0.9", 200, "" + + # The status code is a three-digit number + try: + status = int(status) + if status < 100 or status > 999: + raise BadStatusLine(line) + except ValueError: + raise BadStatusLine(line) + return version, status, reason + + def begin(self): + if self.msg is not None: + # we've already started reading the response + return + + # read until we get a non-100 response + while True: + version, status, reason = self._read_status() + if status != CONTINUE: + break + # skip the header from the 100 response + while True: + skip = self.fp.readline().strip() + if not skip: + break + if self.debuglevel > 0: + print "header:", skip + + self.status = status + self.reason = reason.strip() + if version == 'HTTP/1.0': + self.version = 10 + elif version.startswith('HTTP/1.'): + self.version = 11 # use HTTP/1.1 code for HTTP/1.x where x>=1 + elif version == 'HTTP/0.9': + self.version = 9 + else: + raise UnknownProtocol(version) + + if self.version == 9: + self.length = None + self.chunked = 0 + self.will_close = 1 + 
self.msg = HTTPMessage(StringIO()) + return + + self.msg = HTTPMessage(self.fp, 0) + if self.debuglevel > 0: + for hdr in self.msg.headers: + print "header:", hdr, + + # don't let the msg keep an fp + self.msg.fp = None + + # are we using the chunked-style of transfer encoding? + tr_enc = self.msg.getheader('transfer-encoding') + if tr_enc and tr_enc.lower() == "chunked": + self.chunked = 1 + self.chunk_left = None + else: + self.chunked = 0 + + # will the connection close at the end of the response? + self.will_close = self._check_close() + + # do we have a Content-Length? + # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" + length = self.msg.getheader('content-length') + if length and not self.chunked: + try: + self.length = int(length) + except ValueError: + self.length = None + else: + self.length = None + + # does the body have a fixed length? (of zero) + if (status == NO_CONTENT or status == NOT_MODIFIED or + 100 <= status < 200 or # 1xx codes + self._method == 'HEAD'): + self.length = 0 + + # if the connection remains open, and we aren't using chunked, and + # a content-length was not provided, then assume that the connection + # WILL close. + if not self.will_close and \ + not self.chunked and \ + self.length is None: + self.will_close = 1 + + def _check_close(self): + conn = self.msg.getheader('connection') + if self.version == 11: + # An HTTP/1.1 proxy is assumed to stay open unless + # explicitly closed. + conn = self.msg.getheader('connection') + if conn and "close" in conn.lower(): + return True + return False + + # Some HTTP/1.0 implementations have support for persistent + # connections, using rules different than HTTP/1.1. + + # For older HTTP, Keep-Alive indiciates persistent connection. + if self.msg.getheader('keep-alive'): + return False + + # At least Akamai returns a "Connection: Keep-Alive" header, + # which was supposed to be sent by the client. 
+ if conn and "keep-alive" in conn.lower(): + return False + + # Proxy-Connection is a netscape hack. + pconn = self.msg.getheader('proxy-connection') + if pconn and "keep-alive" in pconn.lower(): + return False + + # otherwise, assume it will close + return True + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + + def isclosed(self): + # NOTE: it is possible that we will not ever call self.close(). This + # case occurs when will_close is TRUE, length is None, and we + # read up to the last byte, but NOT past it. + # + # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be + # called, meaning self.isclosed() is meaningful. + return self.fp is None + + # XXX It would be nice to have readline and __iter__ for this, too. + + def read(self, amt=None): + if self.fp is None: + return '' + + if self.chunked: + return self._read_chunked(amt) + + if amt is None: + # unbounded read + if self.length is None: + s = self.fp.read() + else: + s = self._safe_read(self.length) + self.length = 0 + self.close() # we read everything + return s + + if self.length is not None: + if amt > self.length: + # clip the read to the "end of response" + amt = self.length + + # we do not use _safe_read() here because this may be a .will_close + # connection, and the user is reading more bytes than will be provided + # (for example, reading in 1k chunks) + s = self.fp.read(amt) + if self.length is not None: + self.length -= len(s) + + return s + + def _read_chunked(self, amt): + assert self.chunked != _UNKNOWN + chunk_left = self.chunk_left + value = '' + + # XXX This accumulates chunks by repeated string concatenation, + # which is not efficient as the number or size of chunks gets big. 
+ while True: + if chunk_left is None: + line = self.fp.readline() + i = line.find(';') + if i >= 0: + line = line[:i] # strip chunk-extensions + chunk_left = int(line, 16) + if chunk_left == 0: + break + if amt is None: + value += self._safe_read(chunk_left) + elif amt < chunk_left: + value += self._safe_read(amt) + self.chunk_left = chunk_left - amt + return value + elif amt == chunk_left: + value += self._safe_read(amt) + self._safe_read(2) # toss the CRLF at the end of the chunk + self.chunk_left = None + return value + else: + value += self._safe_read(chunk_left) + amt -= chunk_left + + # we read the whole chunk, get another + self._safe_read(2) # toss the CRLF at the end of the chunk + chunk_left = None + + # read and discard trailer up to the CRLF terminator + ### note: we shouldn't have any trailers! + while True: + line = self.fp.readline() + if not line: + # a vanishingly small number of sites EOF without + # sending the trailer + break + if line == '\r\n': + break + + # we read everything; close the "file" + self.close() + + return value + + def _safe_read(self, amt): + """Read the number of bytes requested, compensating for partial reads. + + Normally, we have a blocking socket, but a read() can be interrupted + by a signal (resulting in a partial read). + + Note that we cannot distinguish between EOF and an interrupt when zero + bytes have been read. IncompleteRead() will be raised in this + situation. + + This function should be used when <amt> bytes "should" be present for + reading. If the bytes are truly not available (due to EOF), then the + IncompleteRead exception can be used to detect the problem. 
+ """ + s = [] + while amt > 0: + chunk = self.fp.read(min(amt, MAXAMOUNT)) + if not chunk: + raise IncompleteRead(s) + s.append(chunk) + amt -= len(chunk) + return ''.join(s) + + def getheader(self, name, default=None): + if self.msg is None: + raise ResponseNotReady() + return self.msg.getheader(name, default) + + def getheaders(self): + """Return list of (header, value) tuples.""" + if self.msg is None: + raise ResponseNotReady() + return self.msg.items() + + +class HTTPConnection: + + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + response_class = HTTPResponse + default_port = HTTP_PORT + auto_open = 1 + debuglevel = 0 + strict = 0 + + def __init__(self, host, port=None, strict=None): + self.sock = None + self._buffer = [] + self.__response = None + self.__state = _CS_IDLE + self._method = None + + self._set_hostport(host, port) + if strict is not None: + self.strict = strict + + def _set_hostport(self, host, port): + if port is None: + i = host.rfind(':') + j = host.rfind(']') # ipv6 addresses have [...] 
+ if i > j: + try: + port = int(host[i+1:]) + except ValueError: + raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) + host = host[:i] + else: + port = self.default_port + if host and host[0] == '[' and host[-1] == ']': + host = host[1:-1] + self.host = host + self.port = port + + def set_debuglevel(self, level): + self.debuglevel = level + + def connect(self): + """Connect to the host and port specified in __init__.""" + msg = "getaddrinfo returns an empty list" + for res in socket.getaddrinfo(self.host, self.port, 0, + socket.SOCK_STREAM): + af, socktype, proto, canonname, sa = res + try: + self.sock = socket.socket(af, socktype, proto) + if self.debuglevel > 0: + print "connect: (%s, %s)" % (self.host, self.port) + self.sock.connect(sa) + except socket.error, msg: + if self.debuglevel > 0: + print 'connect fail:', (self.host, self.port) + if self.sock: + self.sock.close() + self.sock = None + continue + break + if not self.sock: + raise socket.error, msg + + def close(self): + """Close the connection to the HTTP server.""" + if self.sock: + self.sock.close() # close it manually... there may be other refs + self.sock = None + if self.__response: + self.__response.close() + self.__response = None + self.__state = _CS_IDLE + + def send(self, str): + """Send `str' to the server.""" + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + + # send the data to the server. if we get a broken pipe, then close + # the socket. we want to reconnect when somebody tries to send again. + # + # NOTE: we DO propagate the error, though, because we cannot simply + # ignore the error... the caller will know if they can retry. 
+ if self.debuglevel > 0: + print "send:", repr(str) + try: + self.sock.sendall(str) + except socket.error, v: + if v[0] == 32: # Broken pipe + self.close() + raise + + def sendbinary(self, fp, blocksize=8192): + '''Send a file in binary mode.''' + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + if self.debuglevel > 0: + print "sending file: ", fp.name + try: + while 1: + buf = fp.read(blocksize) + if not buf: break + self.sock.sendall(buf) + except socket.error, v: + if v[0] == 32: # Broken pipe + self.close() + raise + + def _output(self, s): + """Add a line of output to the current request buffer. + + Assumes that the line does *not* end with \\r\\n. + """ + self._buffer.append(s) + + def _send_output(self): + """Send the currently buffered request and clear the buffer. + + Appends an extra \\r\\n to the buffer. + """ + self._buffer.extend(("", "")) + msg = "\r\n".join(self._buffer) + del self._buffer[:] + self.send(msg) + + def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0): + """Send a request to the server. + + `method' specifies an HTTP request method, e.g. 'GET'. + `url' specifies the object being requested, e.g. '/index.html'. + `skip_host' if True does not add automatically a 'Host:' header + `skip_accept_encoding' if True does not add automatically an + 'Accept-Encoding:' header + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + + # in certain cases, we cannot issue another request on this connection. + # this occurs when: + # 1) we are in the process of sending a request. (_CS_REQ_STARTED) + # 2) a response to a previous request has signalled that it is going + # to close the connection upon completion. + # 3) the headers for the previous response have not been read, thus + # we cannot determine whether point (2) is true. 
(_CS_REQ_SENT) + # + # if there is no prior response, then we can request at will. + # + # if point (2) is true, then we will have passed the socket to the + # response (effectively meaning, "there is no prior response"), and + # will open a new one when a new request is made. + # + # Note: if a prior response exists, then we *can* start a new request. + # We are not allowed to begin fetching the response to this new + # request, however, until that prior response is complete. + # + if self.__state == _CS_IDLE: + self.__state = _CS_REQ_STARTED + else: + raise CannotSendRequest() + + # Save the method we use, we need it later in the response phase + self._method = method + if not url: + url = '/' + str = '%s %s %s' % (method, url, self._http_vsn_str) + + self._output(str) + + if self._http_vsn == 11: + # Issue some standard headers for better HTTP/1.1 compliance + + if not skip_host: + # this header is issued *only* for HTTP/1.1 + # connections. more specifically, this means it is + # only issued when the client uses the new + # HTTPConnection() class. backwards-compat clients + # will be using HTTP/1.0 and those clients may be + # issuing this header themselves. we should NOT issue + # it twice; some web servers (such as Apache) barf + # when they see two Host: headers + + # If we need a non-standard port,include it in the + # header. If the request is going through a proxy, + # but the host of the actual URL, not the host of the + # proxy. 
+ + netloc = '' + if url.startswith('http'): + nil, netloc, nil, nil, nil = urlsplit(url) + + if netloc: + try: + netloc_enc = netloc.encode("ascii") + except UnicodeEncodeError: + netloc_enc = netloc.encode("idna") + self.putheader('Host', netloc_enc) + else: + try: + host_enc = self.host.encode("ascii") + except UnicodeEncodeError: + host_enc = self.host.encode("idna") + if self.port == HTTP_PORT: + self.putheader('Host', host_enc) + else: + self.putheader('Host', "%s:%s" % (host_enc, self.port)) + + # note: we are assuming that clients will not attempt to set these + # headers since *this* library must deal with the + # consequences. this also means that when the supporting + # libraries are updated to recognize other forms, then this + # code should be changed (removed or updated). + + # we only want a Content-Encoding of "identity" since we don't + # support encodings such as x-gzip or x-deflate. + if not skip_accept_encoding: + self.putheader('Accept-Encoding', 'identity') + + # we can accept "chunked" Transfer-Encodings, but no others + # NOTE: no TE header implies *only* "chunked" + #self.putheader('TE', 'chunked') + + # if TE is supplied in the header, then it must appear in a + # Connection header. + #self.putheader('Connection', 'TE') + + else: + # For HTTP/1.0, the server will assume "not chunked" + pass + + def putheader(self, header, value): + """Send a request header line to the server. 
+ + For example: h.putheader('Accept', 'text/html') + """ + if self.__state != _CS_REQ_STARTED: + raise CannotSendHeader() + + str = '%s: %s' % (header, value) + self._output(str) + + def endheaders(self): + """Indicate that the last header line has been sent to the server.""" + + if self.__state == _CS_REQ_STARTED: + self.__state = _CS_REQ_SENT + else: + raise CannotSendHeader() + + self._send_output() + + def request(self, method, url, body=None, headers={}): + """Send a complete request to the server.""" + + try: + self._send_request(method, url, body, headers) + except socket.error, v: + # trap 'Broken pipe' if we're allowed to automatically reconnect + if v[0] != 32 or not self.auto_open: + raise + # try one more time + self._send_request(method, url, body, headers) + + def _send_request(self, method, url, body, headers): + import os + # honour explicitly requested Host: and Accept-Encoding headers + header_names = dict.fromkeys([k.lower() for k in headers]) + skips = {} + if 'host' in header_names: + skips['skip_host'] = 1 + if 'accept-encoding' in header_names: + skips['skip_accept_encoding'] = 1 + + self.putrequest(method, url, **skips) + + if body: + if isinstance(body, file): + s = os.stat(body.name) ## how big is this file? + self.putheader('Content-Length', str(s.st_size)) + else: + self.putheader('Content-Length', str(len(body))) + for hdr, value in headers.iteritems(): + self.putheader(hdr, value) + self.endheaders() + + if body: + if isinstance(body, file): + self.sendbinary(body) + else: + self.send(body) + + def getresponse(self): + "Get the response from the server." + + # if a prior response has been completed, then forget about it. 
+ if self.__response and self.__response.isclosed(): + self.__response = None + + # + # if a prior response exists, then it must be completed (otherwise, we + # cannot read this response's header to determine the connection-close + # behavior) + # + # note: if a prior response existed, but was connection-close, then the + # socket and response were made independent of this HTTPConnection + # object since a new request requires that we open a whole new + # connection + # + # this means the prior response had one of two states: + # 1) will_close: this connection was reset and the prior socket and + # response operate independently + # 2) persistent: the response was retained and we await its + # isclosed() status to become true. + # + if self.__state != _CS_REQ_SENT or self.__response: + raise ResponseNotReady() + + if self.debuglevel > 0: + response = self.response_class(self.sock, self.debuglevel, + strict=self.strict, + method=self._method) + else: + response = self.response_class(self.sock, strict=self.strict, + method=self._method) + + response.begin() + assert response.will_close != _UNKNOWN + self.__state = _CS_IDLE + + if response.will_close: + # this effectively passes the connection to the response + self.close() + else: + # remember this, so we can tell when it is complete + self.__response = response + + return response + +# The next several classes are used to define FakeSocket, a socket-like +# interface to an SSL connection. + +# The primary complexity comes from faking a makefile() method. The +# standard socket makefile() implementation calls dup() on the socket +# file descriptor. As a consequence, clients can call close() on the +# parent socket and its makefile children in any order. The underlying +# socket isn't closed until they are all closed. + +# The implementation uses reference counting to keep the socket open +# until the last client calls close(). 
SharedSocket keeps track of +# the reference counting and SharedSocketClient provides an constructor +# and close() method that call incref() and decref() correctly. + +class SharedSocket: + + def __init__(self, sock): + self.sock = sock + self._refcnt = 0 + + def incref(self): + self._refcnt += 1 + + def decref(self): + self._refcnt -= 1 + assert self._refcnt >= 0 + if self._refcnt == 0: + self.sock.close() + + def __del__(self): + self.sock.close() + +class SharedSocketClient: + + def __init__(self, shared): + self._closed = 0 + self._shared = shared + self._shared.incref() + self._sock = shared.sock + + def close(self): + if not self._closed: + self._shared.decref() + self._closed = 1 + self._shared = None + +class SSLFile(SharedSocketClient): + """File-like object wrapping an SSL socket.""" + + BUFSIZE = 8192 + + def __init__(self, sock, ssl, bufsize=None): + SharedSocketClient.__init__(self, sock) + self._ssl = ssl + self._buf = '' + self._bufsize = bufsize or self.__class__.BUFSIZE + + def _read(self): + buf = '' + # put in a loop so that we retry on transient errors + while True: + try: + buf = self._ssl.read(self._bufsize) + except socket.sslerror, err: + if (err[0] == socket.SSL_ERROR_WANT_READ + or err[0] == socket.SSL_ERROR_WANT_WRITE): + continue + if (err[0] == socket.SSL_ERROR_ZERO_RETURN + or err[0] == socket.SSL_ERROR_EOF): + break + raise + except socket.error, err: + if err[0] == errno.EINTR: + continue + if err[0] == errno.EBADF: + # XXX socket was closed? 
+ break + raise + else: + break + return buf + + def read(self, size=None): + L = [self._buf] + avail = len(self._buf) + while size is None or avail < size: + s = self._read() + if s == '': + break + L.append(s) + avail += len(s) + all = "".join(L) + if size is None: + self._buf = '' + return all + else: + self._buf = all[size:] + return all[:size] + + def readline(self): + L = [self._buf] + self._buf = '' + while 1: + i = L[-1].find("\n") + if i >= 0: + break + s = self._read() + if s == '': + break + L.append(s) + if i == -1: + # loop exited because there is no more data + return "".join(L) + else: + all = "".join(L) + # XXX could do enough bookkeeping not to do a 2nd search + i = all.find("\n") + 1 + line = all[:i] + self._buf = all[i:] + return line + + def readlines(self, sizehint=0): + total = 0 + list = [] + while True: + line = self.readline() + if not line: + break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + def fileno(self): + return self._sock.fileno() + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if not line: + raise StopIteration + return line + +class FakeSocket(SharedSocketClient): + + class _closedsocket: + def __getattr__(self, name): + raise error(9, 'Bad file descriptor') + + def __init__(self, sock, ssl): + sock = SharedSocket(sock) + SharedSocketClient.__init__(self, sock) + self._ssl = ssl + + def close(self): + SharedSocketClient.close(self) + self._sock = self.__class__._closedsocket() + + def makefile(self, mode, bufsize=None): + if mode != 'r' and mode != 'rb': + raise UnimplementedFileMode() + return SSLFile(self._shared, self._ssl, bufsize) + + def send(self, stuff, flags = 0): + return self._ssl.write(stuff) + + sendall = send + + def recv(self, len = 1024, flags = 0): + return self._ssl.read(len) + + def __getattr__(self, attr): + return getattr(self._sock, attr) + + +class HTTPSConnection(HTTPConnection): + "This class allows 
communication via SSL." + + default_port = HTTPS_PORT + + def __init__(self, host, port=None, key_file=None, cert_file=None, + strict=None): + HTTPConnection.__init__(self, host, port, strict) + self.key_file = key_file + self.cert_file = cert_file + + def connect(self): + "Connect to a host on a given (SSL) port." + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((self.host, self.port)) + ssl = socket.ssl(sock, self.key_file, self.cert_file) + self.sock = FakeSocket(sock, ssl) + + +class HTTP: + "Compatibility class with httplib.py from 1.5." + + _http_vsn = 10 + _http_vsn_str = 'HTTP/1.0' + + debuglevel = 0 + + _connection_class = HTTPConnection + + def __init__(self, host='', port=None, strict=None): + "Provide a default host, since the superclass requires one." + + # some joker passed 0 explicitly, meaning default port + if port == 0: + port = None + + # Note that we may pass an empty string as the host; this will throw + # an error when we attempt to connect. Presumably, the client code + # will call connect before then, with a proper host. + self._setup(self._connection_class(host, port, strict)) + + def _setup(self, conn): + self._conn = conn + + # set up delegation to flesh out interface + self.send = conn.send + self.putrequest = conn.putrequest + self.endheaders = conn.endheaders + self.set_debuglevel = conn.set_debuglevel + + conn._http_vsn = self._http_vsn + conn._http_vsn_str = self._http_vsn_str + + self.file = None + + def connect(self, host=None, port=None): + "Accept arguments to set the host/port, since the superclass doesn't." + + if host is not None: + self._conn._set_hostport(host, port) + self._conn.connect() + + def getfile(self): + "Provide a getfile, since the superclass' does not use this concept." + return self.file + + def putheader(self, header, *values): + "The superclass allows only one value argument." 
+ self._conn.putheader(header, '\r\n\t'.join(values)) + + def getreply(self): + """Compat definition since superclass does not define it. + + Returns a tuple consisting of: + - server status code (e.g. '200' if all goes well) + - server "reason" corresponding to status code + - any RFC822 headers in the response from the server + """ + try: + response = self._conn.getresponse() + except BadStatusLine, e: + ### hmm. if getresponse() ever closes the socket on a bad request, + ### then we are going to have problems with self.sock + + ### should we keep this behavior? do people use it? + # keep the socket open (as a file), and return it + self.file = self._conn.sock.makefile('rb', 0) + + # close our socket -- we want to restart after any protocol error + self.close() + + self.headers = None + return -1, e.line, None + + self.headers = response.msg + self.file = response.fp + return response.status, response.reason, response.msg + + def close(self): + self._conn.close() + + # note that self.file == response.fp, which gets closed by the + # superclass. just clear the object ref here. + ### hmm. messy. if status==-1, then self.file is owned by us. + ### well... we aren't explicitly closing, but losing this ref will + ### do it + self.file = None + +if hasattr(socket, 'ssl'): + class HTTPS(HTTP): + """Compatibility with 1.5 httplib interface + + Python 1.5.2 did not have an HTTPS class, but it defined an + interface for sending http requests that is also useful for + https. + """ + + _connection_class = HTTPSConnection + + def __init__(self, host='', port=None, key_file=None, cert_file=None, + strict=None): + # provide a default host, pass the X509 cert info + + # urf. compensate for bad input. + if port == 0: + port = None + self._setup(self._connection_class(host, port, key_file, + cert_file, strict)) + + # we never actually use these for anything, but we keep them + # here for compatibility with post-1.5.2 CVS. 
+ self.key_file = key_file + self.cert_file = cert_file + + +class HTTPException(Exception): + # Subclasses that define an __init__ must call Exception.__init__ + # or define self.args. Otherwise, str() will fail. + pass + +class NotConnected(HTTPException): + pass + +class InvalidURL(HTTPException): + pass + +class UnknownProtocol(HTTPException): + def __init__(self, version): + self.args = version, + self.version = version + +class UnknownTransferEncoding(HTTPException): + pass + +class UnimplementedFileMode(HTTPException): + pass + +class IncompleteRead(HTTPException): + def __init__(self, partial): + self.args = partial, + self.partial = partial + +class ImproperConnectionState(HTTPException): + pass + +class CannotSendRequest(ImproperConnectionState): + pass + +class CannotSendHeader(ImproperConnectionState): + pass + +class ResponseNotReady(ImproperConnectionState): + pass + +class BadStatusLine(HTTPException): + def __init__(self, line): + self.args = line, + self.line = line + +# for backwards compatibility +error = HTTPException + +class LineAndFileWrapper: + """A limited file-like object for HTTP/0.9 responses.""" + + # The status-line parsing code calls readline(), which normally + # get the HTTP status line. For a 0.9 response, however, this is + # actually the first line of the body! Clients need to get a + # readable file object that contains that line. + + def __init__(self, line, file): + self._line = line + self._file = file + self._line_consumed = 0 + self._line_offset = 0 + self._line_left = len(line) + + def __getattr__(self, attr): + return getattr(self._file, attr) + + def _done(self): + # called when the last byte is read from the line. After the + # call, all read methods are delegated to the underlying file + # object. 
+ self._line_consumed = 1 + self.read = self._file.read + self.readline = self._file.readline + self.readlines = self._file.readlines + + def read(self, amt=None): + if self._line_consumed: + return self._file.read(amt) + assert self._line_left + if amt is None or amt > self._line_left: + s = self._line[self._line_offset:] + self._done() + if amt is None: + return s + self._file.read() + else: + return s + self._file.read(amt - len(s)) + else: + assert amt <= self._line_left + i = self._line_offset + j = i + amt + s = self._line[i:j] + self._line_offset = j + self._line_left -= amt + if self._line_left == 0: + self._done() + return s + + def readline(self): + if self._line_consumed: + return self._file.readline() + assert self._line_left + s = self._line[self._line_offset:] + self._done() + return s + + def readlines(self, size=None): + if self._line_consumed: + return self._file.readlines(size) + assert self._line_left + L = [self._line[self._line_offset:]] + self._done() + if size is None: + return L + self._file.readlines() + else: + return L + self._file.readlines(size) + +def test(): + """Test this module. + + A hodge podge of tests collected here, because they have too many + external dependencies for the regular test suite. 
+ """ + + import sys + import getopt + opts, args = getopt.getopt(sys.argv[1:], 'd') + dl = 0 + for o, a in opts: + if o == '-d': dl = dl + 1 + host = 'www.python.org' + selector = '/' + if args[0:]: host = args[0] + if args[1:]: selector = args[1] + h = HTTP() + h.set_debuglevel(dl) + h.connect(host) + h.putrequest('GET', selector) + h.endheaders() + status, reason, headers = h.getreply() + print 'status =', status + print 'reason =', reason + print "read", len(h.getfile().read()) + print + if headers: + for header in headers.headers: print header.strip() + print + + # minimal test that code to extract host from url works + class HTTP11(HTTP): + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + h = HTTP11('www.python.org') + h.putrequest('GET', 'http://www.python.org/~jeremy/') + h.endheaders() + h.getreply() + h.close() + + if hasattr(socket, 'ssl'): + + for host, selector in (('sourceforge.net', '/projects/python'), + ): + print "https://%s%s" % (host, selector) + hs = HTTPS() + hs.set_debuglevel(dl) + hs.connect(host) + hs.putrequest('GET', selector) + hs.endheaders() + status, reason, headers = hs.getreply() + print 'status =', status + print 'reason =', reason + print "read", len(hs.getfile().read()) + print + if headers: + for header in headers.headers: print header.strip() + print + +if __name__ == '__main__': + test() diff --git a/LTA/LTAIngest/dav/davlib.py b/LTA/LTAIngest/dav/davlib.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4fce277fcc7ce9e12e4bf277305da7a9b75117 --- /dev/null +++ b/LTA/LTAIngest/dav/davlib.py @@ -0,0 +1,336 @@ +# pylint: disable-msg=W0402,W0231,W0141,R0903,C0321,W0701,R0904,C0103,W0201,W0102,R0913,W0622,E1101,C0111,C0121,R0901 +# DAV client library +# +# Copyright (C) 1998-2000 Guido van Rossum. All Rights Reserved. +# Written by Greg Stein. Given to Guido. Licensed using the Python license. 
+# +# This module is maintained by Greg and is available at: +# http://www.lyra.org/greg/python/davlib.py +# +# Since this isn't in the Python distribution yet, we'll use the CVS ID +# for tracking: +# $Id$ +# + +import davhttplib as httplib +import urllib +import string +import types +import mimetypes +import qp_xml + + +INFINITY = 'infinity' +XML_DOC_HEADER = '<?xml version="1.0" encoding="utf-8"?>' +XML_CONTENT_TYPE = 'text/xml; charset="utf-8"' + +# block size for copying files up to the server +BLOCKSIZE = 16384 + + +class HTTPProtocolChooser(httplib.HTTPSConnection): + def __init__(self, *args, **kw): + self.protocol = kw.pop('protocol') + if self.protocol == "https": + self.default_port = 443 + else: + self.default_port = 80 + + apply(httplib.HTTPSConnection.__init__, (self,) + args, kw) + + def connect(self): + if self.protocol == "https": + httplib.HTTPSConnection.connect(self) + else: + httplib.HTTPConnection.connect(self) + + +class HTTPConnectionAuth(HTTPProtocolChooser): + def __init__(self, *args, **kw): + apply(HTTPProtocolChooser.__init__, (self,) + args, kw) + + self.__username = None + self.__password = None + self.__nonce = None + self.__opaque = None + + def setauth(self, username, password): + self.__username = username + self.__password = password + + +def _parse_status(elem): + text = elem.textof() + idx1 = string.find(text, ' ') + idx2 = string.find(text, ' ', idx1+1) + return int(text[idx1:idx2]), text[idx2+1:] + +class _blank: + def __init__(self, **kw): + self.__dict__.update(kw) +class _propstat(_blank): pass +class _response(_blank): pass +class _multistatus(_blank): pass + +def _extract_propstat(elem): + ps = _propstat(prop={}, status=None, responsedescription=None) + for child in elem.children: + if child.ns != 'DAV:': + continue + if child.name == 'prop': + for prop in child.children: + ps.prop[(prop.ns, prop.name)] = prop + elif child.name == 'status': + ps.status = _parse_status(child) + elif child.name == 'responsedescription': + 
ps.responsedescription = child.textof() + ### unknown element name + + return ps + +def _extract_response(elem): + resp = _response(href=[], status=None, responsedescription=None, propstat=[]) + for child in elem.children: + if child.ns != 'DAV:': + continue + if child.name == 'href': + resp.href.append(child.textof()) + elif child.name == 'status': + resp.status = _parse_status(child) + elif child.name == 'responsedescription': + resp.responsedescription = child.textof() + elif child.name == 'propstat': + resp.propstat.append(_extract_propstat(child)) + ### unknown child element + + return resp + +def _extract_msr(root): + if root.ns != 'DAV:' or root.name != 'multistatus': + raise 'invalid response: <DAV:multistatus> expected' + + msr = _multistatus(responses=[ ], responsedescription=None) + + for child in root.children: + if child.ns != 'DAV:': + continue + if child.name == 'responsedescription': + msr.responsedescription = child.textof() + elif child.name == 'response': + msr.responses.append(_extract_response(child)) + ### unknown child element + + return msr + +def _extract_locktoken(root): + if root.ns != 'DAV:' or root.name != 'prop': + raise 'invalid response: <DAV:prop> expected' + elem = root.find('lockdiscovery', 'DAV:') + if not elem: + raise 'invalid response: <DAV:lockdiscovery> expected' + elem = elem.find('activelock', 'DAV:') + if not elem: + raise 'invalid response: <DAV:activelock> expected' + elem = elem.find('locktoken', 'DAV:') + if not elem: + raise 'invalid response: <DAV:locktoken> expected' + elem = elem.find('href', 'DAV:') + if not elem: + raise 'invalid response: <DAV:href> expected' + return elem.textof() + + +class DAVResponse(httplib.HTTPResponse): + def parse_multistatus(self): + self.root = qp_xml.Parser().parse(self) + self.msr = _extract_msr(self.root) + + def parse_lock_response(self): + self.root = qp_xml.Parser().parse(self) + self.locktoken = _extract_locktoken(self.root) + + +class DAV(HTTPConnectionAuth): + + 
response_class = DAVResponse + + def get(self, url, extra_hdrs={ }): + return self._request('GET', url, extra_hdrs=extra_hdrs) + + def head(self, url, extra_hdrs={ }): + return self._request('HEAD', url, extra_hdrs=extra_hdrs) + + def post(self, url, data={ }, body=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + + assert body or data, "body or data must be supplied" + assert not (body and data), "cannot supply both body and data" + if data: + body = '' + for key, value in data.items(): + if isinstance(value, types.ListType): + for item in value: + body = body + '&' + key + '=' + urllib.quote(str(item)) + else: + body = body + '&' + key + '=' + urllib.quote(str(value)) + body = body[1:] + headers['Content-Type'] = 'application/x-www-form-urlencoded' + + return self._request('POST', url, body, headers) + + def options(self, url='*', extra_hdrs={ }): + return self._request('OPTIONS', url, extra_hdrs=extra_hdrs) + + def trace(self, url, extra_hdrs={ }): + return self._request('TRACE', url, extra_hdrs=extra_hdrs) + + def put(self, url, contents, + content_type=None, content_enc=None, extra_hdrs={ }): + + if not content_type: + content_type, content_enc = mimetypes.guess_type(url) + + headers = extra_hdrs.copy() + if content_type: + headers['Content-Type'] = content_type + if content_enc: + headers['Content-Encoding'] = content_enc + return self._request('PUT', url, contents, headers) + + def delete(self, url, extra_hdrs={ }): + return self._request('DELETE', url, extra_hdrs=extra_hdrs) + + def propfind(self, url, body=None, depth=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + if depth is not None: + headers['Depth'] = str(depth) + return self._request('PROPFIND', url, body, headers) + + def proppatch(self, url, body, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + return self._request('PROPPATCH', url, body, headers) + + def mkcol(self, url, extra_hdrs={ }): + 
return self._request('MKCOL', url, extra_hdrs=extra_hdrs) + + def move(self, src, dst, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Destination'] = dst + return self._request('MOVE', src, extra_hdrs=headers) + + def copy(self, src, dst, depth=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Destination'] = dst + if depth is not None: + headers['Depth'] = str(depth) + return self._request('COPY', src, extra_hdrs=headers) + + def lock(self, url, owner='', timeout=None, depth=None, + scope='exclusive', type='write', extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + if depth is not None: + headers['Depth'] = str(depth) + if timeout is not None: + headers['Timeout'] = timeout + body = XML_DOC_HEADER + \ + '<DAV:lockinfo xmlns:DAV="DAV:">' + \ + '<DAV:lockscope><DAV:%s/></DAV:lockscope>' % scope + \ + '<DAV:locktype><DAV:%s/></DAV:locktype>' % type + \ + '<DAV:owner>' + owner + '</DAV:owner>' + \ + '</DAV:lockinfo>' + return self._request('LOCK', url, body, extra_hdrs=headers) + + def unlock(self, url, locktoken, extra_hdrs={ }): + headers = extra_hdrs.copy() + if locktoken[0] != '<': + locktoken = '<' + locktoken + '>' + headers['Lock-Token'] = locktoken + return self._request('UNLOCK', url, extra_hdrs=headers) + + def _request(self, method, url, body=None, extra_hdrs={}): + "Internal method for sending a request." 
+ + self.request(method, url, body, extra_hdrs) + return self.getresponse() + + + # + # Higher-level methods for typical client use + # + + def allprops(self, url, depth=None): + body = XML_DOC_HEADER + \ + '<DAV:propfind xmlns:DAV="DAV:"><DAV:allprop/></DAV:propfind>' + return self.propfind(url, body, depth=depth) + + def propnames(self, url, depth=None): + body = XML_DOC_HEADER + \ + '<DAV:propfind xmlns:DAV="DAV:"><DAV:propname/></DAV:propfind>' + return self.propfind(url, body, depth) + + def getprops(self, url, *names, **kw): + assert names, 'at least one property name must be provided' + if kw.has_key('ns'): + xmlns = ' xmlns:NS="' + kw['ns'] + '"' + ns = 'NS:' + del kw['ns'] + else: + xmlns = ns = '' + if kw.has_key('depth'): + depth = kw['depth'] + del kw['depth'] + else: + depth = 0 + assert not kw, 'unknown arguments' + body = XML_DOC_HEADER + \ + '<DAV:propfind xmlns:DAV="DAV:"' + xmlns + '><DAV:prop><' + ns + \ + string.joinfields(names, '/><' + ns) + \ + '/></DAV:prop></DAV:propfind>' + return self.propfind(url, body, depth) + + def delprops(self, url, *names, **kw): + assert names, 'at least one property name must be provided' + if kw.has_key('ns'): + xmlns = ' xmlns:NS="' + kw['ns'] + '"' + ns = 'NS:' + del kw['ns'] + else: + xmlns = ns = '' + assert not kw, 'unknown arguments' + body = XML_DOC_HEADER + \ + '<DAV:propertyupdate xmlns:DAV="DAV:"' + xmlns + \ + '><DAV:remove><DAV:prop><' + ns + \ + string.joinfields(names, '/><' + ns) + \ + '/></DAV:prop></DAV:remove></DAV:propertyupdate>' + return self.proppatch(url, body) + + def setprops(self, url, *xmlprops, **props): + assert xmlprops or props, 'at least one property must be provided' + xmlprops = list(xmlprops) + if props.has_key('ns'): + xmlns = ' xmlns:NS="' + props['ns'] + '"' + ns = 'NS:' + del props['ns'] + else: + xmlns = ns = '' + for key, value in props.items(): + if value: + xmlprops.append('<%s%s>%s</%s%s>' % (ns, key, value, ns, key)) + else: + xmlprops.append('<%s%s/>' % (ns, key)) + 
elems = string.joinfields(xmlprops, '') + body = XML_DOC_HEADER + \ + '<DAV:propertyupdate xmlns:DAV="DAV:"' + xmlns + \ + '><DAV:set><DAV:prop>' + \ + elems + \ + '</DAV:prop></DAV:set></DAV:propertyupdate>' + return self.proppatch(url, body) + + def get_lock(self, url, owner='', timeout=None, depth=None): + response = self.lock(url, owner, timeout, depth) + response.parse_lock_response() + return response.locktoken + \ No newline at end of file diff --git a/LTA/LTAIngest/dav/davlib_orig.py b/LTA/LTAIngest/dav/davlib_orig.py new file mode 100644 index 0000000000000000000000000000000000000000..00a804c51ff0a10457d5721a0c839d74ff049776 --- /dev/null +++ b/LTA/LTAIngest/dav/davlib_orig.py @@ -0,0 +1,316 @@ +# +# DAV client library +# +# Copyright (C) 1998-2000 Guido van Rossum. All Rights Reserved. +# Written by Greg Stein. Given to Guido. Licensed using the Python license. +# +# This module is maintained by Greg and is available at: +# http://www.lyra.org/greg/python/davlib.py +# +# Since this isn't in the Python distribution yet, we'll use the CVS ID +# for tracking: +# $Id$ +# + +import httplib +import urllib +import string +import types +import mimetypes +import qp_xml + + +INFINITY = 'infinity' +XML_DOC_HEADER = '<?xml version="1.0" encoding="utf-8"?>' +XML_CONTENT_TYPE = 'text/xml; charset="utf-8"' + +# block size for copying files up to the server +BLOCKSIZE = 16384 + + +class HTTPConnectionAuth(httplib.HTTPConnection): + def __init__(self, *args, **kw): + apply(httplib.HTTPConnection.__init__, (self,) + args, kw) + + self.__username = None + self.__password = None + self.__nonce = None + self.__opaque = None + + def setauth(self, username, password): + self.__username = username + self.__password = password + + +def _parse_status(elem): + text = elem.textof() + idx1 = string.find(text, ' ') + idx2 = string.find(text, ' ', idx1+1) + return int(text[idx1:idx2]), text[idx2+1:] + +class _blank: + def __init__(self, **kw): + self.__dict__.update(kw) +class 
_propstat(_blank): pass +class _response(_blank): pass +class _multistatus(_blank): pass + +def _extract_propstat(elem): + ps = _propstat(prop={}, status=None, responsedescription=None) + for child in elem.children: + if child.ns != 'DAV:': + continue + if child.name == 'prop': + for prop in child.children: + ps.prop[(prop.ns, prop.name)] = prop + elif child.name == 'status': + ps.status = _parse_status(child) + elif child.name == 'responsedescription': + ps.responsedescription = child.textof() + ### unknown element name + + return ps + +def _extract_response(elem): + resp = _response(href=[], status=None, responsedescription=None, propstat=[]) + for child in elem.children: + if child.ns != 'DAV:': + continue + if child.name == 'href': + resp.href.append(child.textof()) + elif child.name == 'status': + resp.status = _parse_status(child) + elif child.name == 'responsedescription': + resp.responsedescription = child.textof() + elif child.name == 'propstat': + resp.propstat.append(_extract_propstat(child)) + ### unknown child element + + return resp + +def _extract_msr(root): + if root.ns != 'DAV:' or root.name != 'multistatus': + raise 'invalid response: <DAV:multistatus> expected' + + msr = _multistatus(responses=[ ], responsedescription=None) + + for child in root.children: + if child.ns != 'DAV:': + continue + if child.name == 'responsedescription': + msr.responsedescription = child.textof() + elif child.name == 'response': + msr.responses.append(_extract_response(child)) + ### unknown child element + + return msr + +def _extract_locktoken(root): + if root.ns != 'DAV:' or root.name != 'prop': + raise 'invalid response: <DAV:prop> expected' + elem = root.find('lockdiscovery', 'DAV:') + if not elem: + raise 'invalid response: <DAV:lockdiscovery> expected' + elem = elem.find('activelock', 'DAV:') + if not elem: + raise 'invalid response: <DAV:activelock> expected' + elem = elem.find('locktoken', 'DAV:') + if not elem: + raise 'invalid response: <DAV:locktoken> 
expected' + elem = elem.find('href', 'DAV:') + if not elem: + raise 'invalid response: <DAV:href> expected' + return elem.textof() + + +class DAVResponse(httplib.HTTPResponse): + def parse_multistatus(self): + self.root = qp_xml.Parser().parse(self) + self.msr = _extract_msr(self.root) + + def parse_lock_response(self): + self.root = qp_xml.Parser().parse(self) + self.locktoken = _extract_locktoken(self.root) + + +class DAV(HTTPConnectionAuth): + + response_class = DAVResponse + + def get(self, url, extra_hdrs={ }): + return self._request('GET', url, extra_hdrs=extra_hdrs) + + def head(self, url, extra_hdrs={ }): + return self._request('HEAD', url, extra_hdrs=extra_hdrs) + + def post(self, url, data={ }, body=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + + assert body or data, "body or data must be supplied" + assert not (body and data), "cannot supply both body and data" + if data: + body = '' + for key, value in data.items(): + if isinstance(value, types.ListType): + for item in value: + body = body + '&' + key + '=' + urllib.quote(str(item)) + else: + body = body + '&' + key + '=' + urllib.quote(str(value)) + body = body[1:] + headers['Content-Type'] = 'application/x-www-form-urlencoded' + + return self._request('POST', url, body, headers) + + def options(self, url='*', extra_hdrs={ }): + return self._request('OPTIONS', url, extra_hdrs=extra_hdrs) + + def trace(self, url, extra_hdrs={ }): + return self._request('TRACE', url, extra_hdrs=extra_hdrs) + + def put(self, url, contents, + content_type=None, content_enc=None, extra_hdrs={ }): + + if not content_type: + content_type, content_enc = mimetypes.guess_type(url) + + headers = extra_hdrs.copy() + if content_type: + headers['Content-Type'] = content_type + if content_enc: + headers['Content-Encoding'] = content_enc + return self._request('PUT', url, contents, headers) + + def delete(self, url, extra_hdrs={ }): + return self._request('DELETE', url, extra_hdrs=extra_hdrs) + + def propfind(self, url, 
body=None, depth=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + if depth is not None: + headers['Depth'] = str(depth) + return self._request('PROPFIND', url, body, headers) + + def proppatch(self, url, body, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + return self._request('PROPPATCH', url, body, headers) + + def mkcol(self, url, extra_hdrs={ }): + return self._request('MKCOL', url, extra_hdrs=extra_hdrs) + + def move(self, src, dst, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Destination'] = dst + return self._request('MOVE', src, extra_hdrs=headers) + + def copy(self, src, dst, depth=None, extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Destination'] = dst + if depth is not None: + headers['Depth'] = str(depth) + return self._request('COPY', src, extra_hdrs=headers) + + def lock(self, url, owner='', timeout=None, depth=None, + scope='exclusive', type='write', extra_hdrs={ }): + headers = extra_hdrs.copy() + headers['Content-Type'] = XML_CONTENT_TYPE + if depth is not None: + headers['Depth'] = str(depth) + if timeout is not None: + headers['Timeout'] = timeout + body = XML_DOC_HEADER + \ + '<DAV:lockinfo xmlns:DAV="DAV:">' + \ + '<DAV:lockscope><DAV:%s/></DAV:lockscope>' % scope + \ + '<DAV:locktype><DAV:%s/></DAV:locktype>' % type + \ + owner + \ + '</DAV:lockinfo>' + return self._request('LOCK', url, body, extra_hdrs=headers) + + def unlock(self, url, locktoken, extra_hdrs={ }): + headers = extra_hdrs.copy() + if locktoken[0] != '<': + locktoken = '<' + locktoken + '>' + headers['Lock-Token'] = locktoken + return self._request('UNLOCK', url, extra_hdrs=headers) + + def _request(self, method, url, body=None, extra_hdrs={}): + "Internal method for sending a request." 
+ + self.request(method, url, body, extra_hdrs) + return self.getresponse() + + + # + # Higher-level methods for typical client use + # + + def allprops(self, url, depth=None): + return self.propfind(url, depth=depth) + + def propnames(self, url, depth=None): + body = XML_DOC_HEADER + \ + '<DAV:propfind xmlns:DAV="DAV:"><DAV:propname/></DAV:propfind>' + return self.propfind(url, body, depth) + + def getprops(self, url, *names, **kw): + assert names, 'at least one property name must be provided' + if kw.has_key('ns'): + xmlns = ' xmlns:NS="' + kw['ns'] + '"' + ns = 'NS:' + del kw['ns'] + else: + xmlns = ns = '' + if kw.has_key('depth'): + depth = kw['depth'] + del kw['depth'] + else: + depth = 0 + assert not kw, 'unknown arguments' + body = XML_DOC_HEADER + \ + '<DAV:propfind xmlns:DAV="DAV:"' + xmlns + '><DAV:prop><' + ns + \ + string.joinfields(names, '/><' + ns) + \ + '/></DAV:prop></DAV:propfind>' + return self.propfind(url, body, depth) + + def delprops(self, url, *names, **kw): + assert names, 'at least one property name must be provided' + if kw.has_key('ns'): + xmlns = ' xmlns:NS="' + kw['ns'] + '"' + ns = 'NS:' + del kw['ns'] + else: + xmlns = ns = '' + assert not kw, 'unknown arguments' + body = XML_DOC_HEADER + \ + '<DAV:propertyupdate xmlns:DAV="DAV:"' + xmlns + \ + '><DAV:remove><DAV:prop><' + ns + \ + string.joinfields(names, '/><' + ns) + \ + '/></DAV:prop></DAV:remove></DAV:propertyupdate>' + return self.proppatch(url, body) + + def setprops(self, url, *xmlprops, **props): + assert xmlprops or props, 'at least one property must be provided' + xmlprops = list(xmlprops) + if props.has_key('ns'): + xmlns = ' xmlns:NS="' + props['ns'] + '"' + ns = 'NS:' + del props['ns'] + else: + xmlns = ns = '' + for key, value in props.items(): + if value: + xmlprops.append('<%s%s>%s</%s%s>' % (ns, key, value, ns, key)) + else: + xmlprops.append('<%s%s/>' % (ns, key)) + elems = string.joinfields(xmlprops, '') + body = XML_DOC_HEADER + \ + '<DAV:propertyupdate 
xmlns:DAV="DAV:"' + xmlns + \ + '><DAV:set><DAV:prop>' + \ + elems + \ + '</DAV:prop></DAV:set></DAV:propertyupdate>' + return self.proppatch(url, body) + + def get_lock(self, url, owner='', timeout=None, depth=None): + response = self.lock(url, owner, timeout, depth) + response.parse_lock_response() + return response.locktoken diff --git a/LTA/LTAIngest/dav/httplib.py b/LTA/LTAIngest/dav/httplib.py new file mode 100644 index 0000000000000000000000000000000000000000..b25feb61dbe373fc8a5375ac88ee1dc63aa244b1 --- /dev/null +++ b/LTA/LTAIngest/dav/httplib.py @@ -0,0 +1,1459 @@ +"""HTTP/1.1 client library + +<intro stuff goes here> +<other stuff, too> + +HTTPConnection goes through a number of "states", which define when a client +may legally make another request or fetch the response for a particular +request. This diagram details these state transitions: + + (null) + | + | HTTPConnection() + v + Idle + | + | putrequest() + v + Request-started + | + | ( putheader() )* endheaders() + v + Request-sent + | + | response = getresponse() + v + Unread-response [Response-headers-read] + |\____________________ + | | + | response.read() | putrequest() + v v + Idle Req-started-unread-response + ______/| + / | + response.read() | | ( putheader() )* endheaders() + v v + Request-started Req-sent-unread-response + | + | response.read() + v + Request-sent + +This diagram presents the following rules: + -- a second request may not be started until {response-headers-read} + -- a response [object] cannot be retrieved until {request-sent} + -- there is no differentiation between an unread response body and a + partially read response body + +Note: this enforcement is applied by the HTTPConnection class. The + HTTPResponse class does not enforce this state machine, which + implies sophisticated clients may accelerate the request/response + pipeline. 
Caution should be taken, though: accelerating the states + beyond the above pattern may imply knowledge of the server's + connection-close behavior for certain requests. For example, it + is impossible to tell whether the server will close the connection + UNTIL the response headers have been read; this means that further + requests cannot be placed into the pipeline until it is known that + the server will NOT be closing the connection. + +Logical State __state __response +------------- ------- ---------- +Idle _CS_IDLE None +Request-started _CS_REQ_STARTED None +Request-sent _CS_REQ_SENT None +Unread-response _CS_IDLE <response_class> +Req-started-unread-response _CS_REQ_STARTED <response_class> +Req-sent-unread-response _CS_REQ_SENT <response_class> +""" + +import errno +import mimetools +import socket +from urlparse import urlsplit + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +__all__ = ["HTTP", "HTTPResponse", "HTTPConnection", "HTTPSConnection", + "HTTPException", "NotConnected", "UnknownProtocol", + "UnknownTransferEncoding", "UnimplementedFileMode", + "IncompleteRead", "InvalidURL", "ImproperConnectionState", + "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", + "BadStatusLine", "error", "responses"] + +HTTP_PORT = 80 +HTTPS_PORT = 443 + +_UNKNOWN = 'UNKNOWN' + +# connection states +_CS_IDLE = 'Idle' +_CS_REQ_STARTED = 'Request-started' +_CS_REQ_SENT = 'Request-sent' + +# status codes +# informational +CONTINUE = 100 +SWITCHING_PROTOCOLS = 101 +PROCESSING = 102 + +# successful +OK = 200 +CREATED = 201 +ACCEPTED = 202 +NON_AUTHORITATIVE_INFORMATION = 203 +NO_CONTENT = 204 +RESET_CONTENT = 205 +PARTIAL_CONTENT = 206 +MULTI_STATUS = 207 +IM_USED = 226 + +# redirection +MULTIPLE_CHOICES = 300 +MOVED_PERMANENTLY = 301 +FOUND = 302 +SEE_OTHER = 303 +NOT_MODIFIED = 304 +USE_PROXY = 305 +TEMPORARY_REDIRECT = 307 + +# client error +BAD_REQUEST = 400 +UNAUTHORIZED = 401 +PAYMENT_REQUIRED = 402 +FORBIDDEN = 
403 +NOT_FOUND = 404 +METHOD_NOT_ALLOWED = 405 +NOT_ACCEPTABLE = 406 +PROXY_AUTHENTICATION_REQUIRED = 407 +REQUEST_TIMEOUT = 408 +CONFLICT = 409 +GONE = 410 +LENGTH_REQUIRED = 411 +PRECONDITION_FAILED = 412 +REQUEST_ENTITY_TOO_LARGE = 413 +REQUEST_URI_TOO_LONG = 414 +UNSUPPORTED_MEDIA_TYPE = 415 +REQUESTED_RANGE_NOT_SATISFIABLE = 416 +EXPECTATION_FAILED = 417 +UNPROCESSABLE_ENTITY = 422 +LOCKED = 423 +FAILED_DEPENDENCY = 424 +UPGRADE_REQUIRED = 426 + +# server error +INTERNAL_SERVER_ERROR = 500 +NOT_IMPLEMENTED = 501 +BAD_GATEWAY = 502 +SERVICE_UNAVAILABLE = 503 +GATEWAY_TIMEOUT = 504 +HTTP_VERSION_NOT_SUPPORTED = 505 +INSUFFICIENT_STORAGE = 507 +NOT_EXTENDED = 510 + +# Mapping status codes to official W3C names +responses = { + 100: 'Continue', + 101: 'Switching Protocols', + + 200: 'OK', + 201: 'Created', + 202: 'Accepted', + 203: 'Non-Authoritative Information', + 204: 'No Content', + 205: 'Reset Content', + 206: 'Partial Content', + + 300: 'Multiple Choices', + 301: 'Moved Permanently', + 302: 'Found', + 303: 'See Other', + 304: 'Not Modified', + 305: 'Use Proxy', + 306: '(Unused)', + 307: 'Temporary Redirect', + + 400: 'Bad Request', + 401: 'Unauthorized', + 402: 'Payment Required', + 403: 'Forbidden', + 404: 'Not Found', + 405: 'Method Not Allowed', + 406: 'Not Acceptable', + 407: 'Proxy Authentication Required', + 408: 'Request Timeout', + 409: 'Conflict', + 410: 'Gone', + 411: 'Length Required', + 412: 'Precondition Failed', + 413: 'Request Entity Too Large', + 414: 'Request-URI Too Long', + 415: 'Unsupported Media Type', + 416: 'Requested Range Not Satisfiable', + 417: 'Expectation Failed', + + 500: 'Internal Server Error', + 501: 'Not Implemented', + 502: 'Bad Gateway', + 503: 'Service Unavailable', + 504: 'Gateway Timeout', + 505: 'HTTP Version Not Supported', +} + +# maximal amount of data to read at one time in _safe_read +MAXAMOUNT = 1048576 + +class HTTPMessage(mimetools.Message): + + def addheader(self, key, value): + """Add header for field key 
handling repeats.""" + prev = self.dict.get(key) + if prev is None: + self.dict[key] = value + else: + combined = ", ".join((prev, value)) + self.dict[key] = combined + + def addcontinue(self, key, more): + """Add more field data from a continuation line.""" + prev = self.dict[key] + self.dict[key] = prev + "\n " + more + + def readheaders(self): + """Read header lines. + + Read header lines up to the entirely blank line that terminates them. + The (normally blank) line that ends the headers is skipped, but not + included in the returned list. If a non-header line ends the headers, + (which is an error), an attempt is made to backspace over it; it is + never included in the returned list. + + The variable self.status is set to the empty string if all went well, + otherwise it is an error message. The variable self.headers is a + completely uninterpreted list of lines contained in the header (so + printing them will reproduce the header exactly as it appears in the + file). + + If multiple header fields with the same name occur, they are combined + according to the rules in RFC 2616 sec 4.2: + + Appending each subsequent field-value to the first, each separated + by a comma. The order in which header fields with the same field-name + are received is significant to the interpretation of the combined + field value. + """ + # XXX The implementation overrides the readheaders() method of + # rfc822.Message. The base class design isn't amenable to + # customized behavior here so the method here is a copy of the + # base class code with a few small changes. 
+ + self.dict = {} + self.unixfrom = '' + self.headers = hlist = [] + self.status = '' + headerseen = "" + firstline = 1 + startofline = unread = tell = None + if hasattr(self.fp, 'unread'): + unread = self.fp.unread + elif self.seekable: + tell = self.fp.tell + while True: + if tell: + try: + startofline = tell() + except IOError: + startofline = tell = None + self.seekable = 0 + line = self.fp.readline() + if not line: + self.status = 'EOF in headers' + break + # Skip unix From name time lines + if firstline and line.startswith('From '): + self.unixfrom = self.unixfrom + line + continue + firstline = 0 + if headerseen and line[0] in ' \t': + # XXX Not sure if continuation lines are handled properly + # for http and/or for repeating headers + # It's a continuation line. + hlist.append(line) + self.addcontinue(headerseen, line.strip()) + continue + elif self.iscomment(line): + # It's a comment. Ignore it. + continue + elif self.islast(line): + # Note! No pushback here! The delimiter line gets eaten. + break + headerseen = self.isheader(line) + if headerseen: + # It's a legal header line, save it. + hlist.append(line) + self.addheader(headerseen, line[len(headerseen)+1:].strip()) + continue + else: + # It's not a header line; throw it back and stop here. + if not self.dict: + self.status = 'No headers' + else: + self.status = 'Non-header line where header expected' + # Try to undo the read. + if unread: + unread(line) + elif tell: + self.fp.seek(startofline) + else: + self.status = self.status + '; bad seek' + break + +class HTTPResponse: + + # strict: If true, raise BadStatusLine if the status line can't be + # parsed as a valid HTTP/1.0 or 1.1 status line. By default it is + # false because it prevents clients from talking to HTTP/0.9 + # servers. Note that a response with a sufficiently corrupted + # status line will look like an HTTP/0.9 response. + + # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details. 
+ + def __init__(self, sock, debuglevel=0, strict=0, method=None): + self.fp = sock.makefile('rb', 0) + self.debuglevel = debuglevel + self.strict = strict + self._method = method + + self.msg = None + + # from the Status-Line of the response + self.version = _UNKNOWN # HTTP-Version + self.status = _UNKNOWN # Status-Code + self.reason = _UNKNOWN # Reason-Phrase + + self.chunked = _UNKNOWN # is "chunked" being used? + self.chunk_left = _UNKNOWN # bytes left to read in current chunk + self.length = _UNKNOWN # number of bytes left in response + self.will_close = _UNKNOWN # conn will close at end of response + + def _read_status(self): + # Initialize with Simple-Response defaults + line = self.fp.readline() + if self.debuglevel > 0: + print "reply:", repr(line) + if not line: + # Presumably, the server closed the connection before + # sending a valid response. + raise BadStatusLine(line) + try: + [version, status, reason] = line.split(None, 2) + except ValueError: + try: + [version, status] = line.split(None, 1) + reason = "" + except ValueError: + # empty version will cause next test to fail and status + # will be treated as 0.9 response. 
+ version = "" + if not version.startswith('HTTP/'): + if self.strict: + self.close() + raise BadStatusLine(line) + else: + # assume it's a Simple-Response from an 0.9 server + self.fp = LineAndFileWrapper(line, self.fp) + return "HTTP/0.9", 200, "" + + # The status code is a three-digit number + try: + status = int(status) + if status < 100 or status > 999: + raise BadStatusLine(line) + except ValueError: + raise BadStatusLine(line) + return version, status, reason + + def begin(self): + if self.msg is not None: + # we've already started reading the response + return + + # read until we get a non-100 response + while True: + version, status, reason = self._read_status() + if status != CONTINUE: + break + # skip the header from the 100 response + while True: + skip = self.fp.readline().strip() + if not skip: + break + if self.debuglevel > 0: + print "header:", skip + + self.status = status + self.reason = reason.strip() + if version == 'HTTP/1.0': + self.version = 10 + elif version.startswith('HTTP/1.'): + self.version = 11 # use HTTP/1.1 code for HTTP/1.x where x>=1 + elif version == 'HTTP/0.9': + self.version = 9 + else: + raise UnknownProtocol(version) + + if self.version == 9: + self.length = None + self.chunked = 0 + self.will_close = 1 + self.msg = HTTPMessage(StringIO()) + return + + self.msg = HTTPMessage(self.fp, 0) + if self.debuglevel > 0: + for hdr in self.msg.headers: + print "header:", hdr, + + # don't let the msg keep an fp + self.msg.fp = None + + # are we using the chunked-style of transfer encoding? + tr_enc = self.msg.getheader('transfer-encoding') + if tr_enc and tr_enc.lower() == "chunked": + self.chunked = 1 + self.chunk_left = None + else: + self.chunked = 0 + + # will the connection close at the end of the response? + self.will_close = self._check_close() + + # do we have a Content-Length? 
+ # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" + length = self.msg.getheader('content-length') + if length and not self.chunked: + try: + self.length = int(length) + except ValueError: + self.length = None + else: + self.length = None + + # does the body have a fixed length? (of zero) + if (status == NO_CONTENT or status == NOT_MODIFIED or + 100 <= status < 200 or # 1xx codes + self._method == 'HEAD'): + self.length = 0 + + # if the connection remains open, and we aren't using chunked, and + # a content-length was not provided, then assume that the connection + # WILL close. + if not self.will_close and \ + not self.chunked and \ + self.length is None: + self.will_close = 1 + + def _check_close(self): + conn = self.msg.getheader('connection') + if self.version == 11: + # An HTTP/1.1 proxy is assumed to stay open unless + # explicitly closed. + conn = self.msg.getheader('connection') + if conn and "close" in conn.lower(): + return True + return False + + # Some HTTP/1.0 implementations have support for persistent + # connections, using rules different than HTTP/1.1. + + # For older HTTP, Keep-Alive indicates persistent connection. + if self.msg.getheader('keep-alive'): + return False + + # At least Akamai returns a "Connection: Keep-Alive" header, + # which was supposed to be sent by the client. + if conn and "keep-alive" in conn.lower(): + return False + + # Proxy-Connection is a netscape hack. + pconn = self.msg.getheader('proxy-connection') + if pconn and "keep-alive" in pconn.lower(): + return False + + # otherwise, assume it will close + return True + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + + def isclosed(self): + # NOTE: it is possible that we will not ever call self.close(). This + # case occurs when will_close is TRUE, length is None, and we + # read up to the last byte, but NOT past it. 
+ # + # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be + # called, meaning self.isclosed() is meaningful. + return self.fp is None + + # XXX It would be nice to have readline and __iter__ for this, too. + + def read(self, amt=None): + if self.fp is None: + return '' + + if self.chunked: + return self._read_chunked(amt) + + if amt is None: + # unbounded read + if self.length is None: + s = self.fp.read() + else: + s = self._safe_read(self.length) + self.length = 0 + self.close() # we read everything + return s + + if self.length is not None: + if amt > self.length: + # clip the read to the "end of response" + amt = self.length + + # we do not use _safe_read() here because this may be a .will_close + # connection, and the user is reading more bytes than will be provided + # (for example, reading in 1k chunks) + s = self.fp.read(amt) + if self.length is not None: + self.length -= len(s) + + return s + + def _read_chunked(self, amt): + assert self.chunked != _UNKNOWN + chunk_left = self.chunk_left + value = '' + + # XXX This accumulates chunks by repeated string concatenation, + # which is not efficient as the number or size of chunks gets big. + while True: + if chunk_left is None: + line = self.fp.readline() + i = line.find(';') + if i >= 0: + line = line[:i] # strip chunk-extensions + chunk_left = int(line, 16) + if chunk_left == 0: + break + if amt is None: + value += self._safe_read(chunk_left) + elif amt < chunk_left: + value += self._safe_read(amt) + self.chunk_left = chunk_left - amt + return value + elif amt == chunk_left: + value += self._safe_read(amt) + self._safe_read(2) # toss the CRLF at the end of the chunk + self.chunk_left = None + return value + else: + value += self._safe_read(chunk_left) + amt -= chunk_left + + # we read the whole chunk, get another + self._safe_read(2) # toss the CRLF at the end of the chunk + chunk_left = None + + # read and discard trailer up to the CRLF terminator + ### note: we shouldn't have any trailers! 
+ while True: + line = self.fp.readline() + if not line: + # a vanishingly small number of sites EOF without + # sending the trailer + break + if line == '\r\n': + break + + # we read everything; close the "file" + self.close() + + return value + + def _safe_read(self, amt): + """Read the number of bytes requested, compensating for partial reads. + + Normally, we have a blocking socket, but a read() can be interrupted + by a signal (resulting in a partial read). + + Note that we cannot distinguish between EOF and an interrupt when zero + bytes have been read. IncompleteRead() will be raised in this + situation. + + This function should be used when <amt> bytes "should" be present for + reading. If the bytes are truly not available (due to EOF), then the + IncompleteRead exception can be used to detect the problem. + """ + s = [] + while amt > 0: + chunk = self.fp.read(min(amt, MAXAMOUNT)) + if not chunk: + raise IncompleteRead(s) + s.append(chunk) + amt -= len(chunk) + return ''.join(s) + + def getheader(self, name, default=None): + if self.msg is None: + raise ResponseNotReady() + return self.msg.getheader(name, default) + + def getheaders(self): + """Return list of (header, value) tuples.""" + if self.msg is None: + raise ResponseNotReady() + return self.msg.items() + + +class HTTPConnection: + + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + response_class = HTTPResponse + default_port = HTTP_PORT + auto_open = 1 + debuglevel = 0 + strict = 0 + + def __init__(self, host, port=None, strict=None): + self.sock = None + self._buffer = [] + self.__response = None + self.__state = _CS_IDLE + self._method = None + + self._set_hostport(host, port) + if strict is not None: + self.strict = strict + + def _set_hostport(self, host, port): + if port is None: + i = host.rfind(':') + j = host.rfind(']') # ipv6 addresses have [...] 
+ if i > j: + try: + port = int(host[i+1:]) + except ValueError: + raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) + host = host[:i] + else: + port = self.default_port + if host and host[0] == '[' and host[-1] == ']': + host = host[1:-1] + self.host = host + self.port = port + + def set_debuglevel(self, level): + self.debuglevel = level + + def connect(self): + """Connect to the host and port specified in __init__.""" + msg = "getaddrinfo returns an empty list" + for res in socket.getaddrinfo(self.host, self.port, 0, + socket.SOCK_STREAM): + af, socktype, proto, canonname, sa = res + try: + self.sock = socket.socket(af, socktype, proto) + if self.debuglevel > 0: + print "connect: (%s, %s)" % (self.host, self.port) + self.sock.connect(sa) + except socket.error, msg: + if self.debuglevel > 0: + print 'connect fail:', (self.host, self.port) + if self.sock: + self.sock.close() + self.sock = None + continue + break + if not self.sock: + raise socket.error, msg + + def close(self): + """Close the connection to the HTTP server.""" + if self.sock: + self.sock.close() # close it manually... there may be other refs + self.sock = None + if self.__response: + self.__response.close() + self.__response = None + self.__state = _CS_IDLE + + def send(self, str): + """Send `str' to the server.""" + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + + # send the data to the server. if we get a broken pipe, then close + # the socket. we want to reconnect when somebody tries to send again. + # + # NOTE: we DO propagate the error, though, because we cannot simply + # ignore the error... the caller will know if they can retry. 
+ if self.debuglevel > 0: + print "send:", repr(str) + try: + self.sock.sendall(str) + except socket.error, v: + if v[0] == 32: # Broken pipe + self.close() + raise + + def sendbinary(self, fp, blocksize=8192): + '''Send a file in binary mode.''' + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + if self.debuglevel > 0: + print "sending file: ", fp.name + try: + while 1: + buf = fp.read(blocksize) + if not buf: break + self.sock.sendall(buf) + except socket.error, v: + if v[0] == 32: # Broken pipe + self.close() + raise + + def _output(self, s): + """Add a line of output to the current request buffer. + + Assumes that the line does *not* end with \\r\\n. + """ + self._buffer.append(s) + + def _send_output(self): + """Send the currently buffered request and clear the buffer. + + Appends an extra \\r\\n to the buffer. + """ + self._buffer.extend(("", "")) + msg = "\r\n".join(self._buffer) + del self._buffer[:] + self.send(msg) + + def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0): + """Send a request to the server. + + `method' specifies an HTTP request method, e.g. 'GET'. + `url' specifies the object being requested, e.g. '/index.html'. + `skip_host' if True does not add automatically a 'Host:' header + `skip_accept_encoding' if True does not add automatically an + 'Accept-Encoding:' header + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + + # in certain cases, we cannot issue another request on this connection. + # this occurs when: + # 1) we are in the process of sending a request. (_CS_REQ_STARTED) + # 2) a response to a previous request has signalled that it is going + # to close the connection upon completion. + # 3) the headers for the previous response have not been read, thus + # we cannot determine whether point (2) is true. 
(_CS_REQ_SENT) + # + # if there is no prior response, then we can request at will. + # + # if point (2) is true, then we will have passed the socket to the + # response (effectively meaning, "there is no prior response"), and + # will open a new one when a new request is made. + # + # Note: if a prior response exists, then we *can* start a new request. + # We are not allowed to begin fetching the response to this new + # request, however, until that prior response is complete. + # + if self.__state == _CS_IDLE: + self.__state = _CS_REQ_STARTED + else: + raise CannotSendRequest() + + # Save the method we use, we need it later in the response phase + self._method = method + if not url: + url = '/' + str = '%s %s %s' % (method, url, self._http_vsn_str) + + self._output(str) + + if self._http_vsn == 11: + # Issue some standard headers for better HTTP/1.1 compliance + + if not skip_host: + # this header is issued *only* for HTTP/1.1 + # connections. more specifically, this means it is + # only issued when the client uses the new + # HTTPConnection() class. backwards-compat clients + # will be using HTTP/1.0 and those clients may be + # issuing this header themselves. we should NOT issue + # it twice; some web servers (such as Apache) barf + # when they see two Host: headers + + # If we need a non-standard port,include it in the + # header. If the request is going through a proxy, + # but the host of the actual URL, not the host of the + # proxy. 
+ + netloc = '' + if url.startswith('http'): + nil, netloc, nil, nil, nil = urlsplit(url) + + if netloc: + try: + netloc_enc = netloc.encode("ascii") + except UnicodeEncodeError: + netloc_enc = netloc.encode("idna") + self.putheader('Host', netloc_enc) + else: + try: + host_enc = self.host.encode("ascii") + except UnicodeEncodeError: + host_enc = self.host.encode("idna") + if self.port == HTTP_PORT: + self.putheader('Host', host_enc) + else: + self.putheader('Host', "%s:%s" % (host_enc, self.port)) + + # note: we are assuming that clients will not attempt to set these + # headers since *this* library must deal with the + # consequences. this also means that when the supporting + # libraries are updated to recognize other forms, then this + # code should be changed (removed or updated). + + # we only want a Content-Encoding of "identity" since we don't + # support encodings such as x-gzip or x-deflate. + if not skip_accept_encoding: + self.putheader('Accept-Encoding', 'identity') + + # we can accept "chunked" Transfer-Encodings, but no others + # NOTE: no TE header implies *only* "chunked" + #self.putheader('TE', 'chunked') + + # if TE is supplied in the header, then it must appear in a + # Connection header. + #self.putheader('Connection', 'TE') + + else: + # For HTTP/1.0, the server will assume "not chunked" + pass + + def putheader(self, header, value): + """Send a request header line to the server. 
+ + For example: h.putheader('Accept', 'text/html') + """ + if self.__state != _CS_REQ_STARTED: + raise CannotSendHeader() + + str = '%s: %s' % (header, value) + self._output(str) + + def endheaders(self): + """Indicate that the last header line has been sent to the server.""" + + if self.__state == _CS_REQ_STARTED: + self.__state = _CS_REQ_SENT + else: + raise CannotSendHeader() + + self._send_output() + + def request(self, method, url, body=None, headers={}): + """Send a complete request to the server.""" + + try: + self._send_request(method, url, body, headers) + except socket.error, v: + # trap 'Broken pipe' if we're allowed to automatically reconnect + if v[0] != 32 or not self.auto_open: + raise + # try one more time + self._send_request(method, url, body, headers) + + def _send_request(self, method, url, body, headers): + import os + # honour explicitly requested Host: and Accept-Encoding headers + header_names = dict.fromkeys([k.lower() for k in headers]) + skips = {} + if 'host' in header_names: + skips['skip_host'] = 1 + if 'accept-encoding' in header_names: + skips['skip_accept_encoding'] = 1 + + self.putrequest(method, url, **skips) + + if body: + if isinstance(body, file): + s = os.stat(body.name) ## how big is this file? + self.putheader('Content-Length', str(s.st_size)) + else: + self.putheader('Content-Length', str(len(body))) + for hdr, value in headers.iteritems(): + self.putheader(hdr, value) + self.endheaders() + + if body: + if isinstance(body, file): + self.sendbinary(body) + else: + self.send(body) + + def getresponse(self): + "Get the response from the server." + + # if a prior response has been completed, then forget about it. 
+ if self.__response and self.__response.isclosed(): + self.__response = None + + # + # if a prior response exists, then it must be completed (otherwise, we + # cannot read this response's header to determine the connection-close + # behavior) + # + # note: if a prior response existed, but was connection-close, then the + # socket and response were made independent of this HTTPConnection + # object since a new request requires that we open a whole new + # connection + # + # this means the prior response had one of two states: + # 1) will_close: this connection was reset and the prior socket and + # response operate independently + # 2) persistent: the response was retained and we await its + # isclosed() status to become true. + # + if self.__state != _CS_REQ_SENT or self.__response: + raise ResponseNotReady() + + if self.debuglevel > 0: + response = self.response_class(self.sock, self.debuglevel, + strict=self.strict, + method=self._method) + else: + response = self.response_class(self.sock, strict=self.strict, + method=self._method) + + response.begin() + assert response.will_close != _UNKNOWN + self.__state = _CS_IDLE + + if response.will_close: + # this effectively passes the connection to the response + self.close() + else: + # remember this, so we can tell when it is complete + self.__response = response + + return response + +# The next several classes are used to define FakeSocket, a socket-like +# interface to an SSL connection. + +# The primary complexity comes from faking a makefile() method. The +# standard socket makefile() implementation calls dup() on the socket +# file descriptor. As a consequence, clients can call close() on the +# parent socket and its makefile children in any order. The underlying +# socket isn't closed until they are all closed. + +# The implementation uses reference counting to keep the socket open +# until the last client calls close(). 
SharedSocket keeps track of +# the reference counting and SharedSocketClient provides a constructor +# and close() method that call incref() and decref() correctly. + +class SharedSocket: + + def __init__(self, sock): + self.sock = sock + self._refcnt = 0 + + def incref(self): + self._refcnt += 1 + + def decref(self): + self._refcnt -= 1 + assert self._refcnt >= 0 + if self._refcnt == 0: + self.sock.close() + + def __del__(self): + self.sock.close() + +class SharedSocketClient: + + def __init__(self, shared): + self._closed = 0 + self._shared = shared + self._shared.incref() + self._sock = shared.sock + + def close(self): + if not self._closed: + self._shared.decref() + self._closed = 1 + self._shared = None + +class SSLFile(SharedSocketClient): + """File-like object wrapping an SSL socket.""" + + BUFSIZE = 8192 + + def __init__(self, sock, ssl, bufsize=None): + SharedSocketClient.__init__(self, sock) + self._ssl = ssl + self._buf = '' + self._bufsize = bufsize or self.__class__.BUFSIZE + + def _read(self): + buf = '' + # put in a loop so that we retry on transient errors + while True: + try: + buf = self._ssl.read(self._bufsize) + except socket.sslerror, err: + if (err[0] == socket.SSL_ERROR_WANT_READ + or err[0] == socket.SSL_ERROR_WANT_WRITE): + continue + if (err[0] == socket.SSL_ERROR_ZERO_RETURN + or err[0] == socket.SSL_ERROR_EOF): + break + raise + except socket.error, err: + if err[0] == errno.EINTR: + continue + if err[0] == errno.EBADF: + # XXX socket was closed? 
+ break + raise + else: + break + return buf + + def read(self, size=None): + L = [self._buf] + avail = len(self._buf) + while size is None or avail < size: + s = self._read() + if s == '': + break + L.append(s) + avail += len(s) + all = "".join(L) + if size is None: + self._buf = '' + return all + else: + self._buf = all[size:] + return all[:size] + + def readline(self): + L = [self._buf] + self._buf = '' + while 1: + i = L[-1].find("\n") + if i >= 0: + break + s = self._read() + if s == '': + break + L.append(s) + if i == -1: + # loop exited because there is no more data + return "".join(L) + else: + all = "".join(L) + # XXX could do enough bookkeeping not to do a 2nd search + i = all.find("\n") + 1 + line = all[:i] + self._buf = all[i:] + return line + + def readlines(self, sizehint=0): + total = 0 + list = [] + while True: + line = self.readline() + if not line: + break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + def fileno(self): + return self._sock.fileno() + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if not line: + raise StopIteration + return line + +class FakeSocket(SharedSocketClient): + + class _closedsocket: + def __getattr__(self, name): + raise error(9, 'Bad file descriptor') + + def __init__(self, sock, ssl): + sock = SharedSocket(sock) + SharedSocketClient.__init__(self, sock) + self._ssl = ssl + + def close(self): + SharedSocketClient.close(self) + self._sock = self.__class__._closedsocket() + + def makefile(self, mode, bufsize=None): + if mode != 'r' and mode != 'rb': + raise UnimplementedFileMode() + return SSLFile(self._shared, self._ssl, bufsize) + + def send(self, stuff, flags = 0): + return self._ssl.write(stuff) + + sendall = send + + def recv(self, len = 1024, flags = 0): + return self._ssl.read(len) + + def __getattr__(self, attr): + return getattr(self._sock, attr) + + +class HTTPSConnection(HTTPConnection): + "This class allows 
communication via SSL." + + default_port = HTTPS_PORT + + def __init__(self, host, port=None, key_file=None, cert_file=None, + strict=None): + HTTPConnection.__init__(self, host, port, strict) + self.key_file = key_file + self.cert_file = cert_file + + def connect(self): + "Connect to a host on a given (SSL) port." + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((self.host, self.port)) + ssl = socket.ssl(sock, self.key_file, self.cert_file) + self.sock = FakeSocket(sock, ssl) + + +class HTTP: + "Compatibility class with httplib.py from 1.5." + + _http_vsn = 10 + _http_vsn_str = 'HTTP/1.0' + + debuglevel = 0 + + _connection_class = HTTPConnection + + def __init__(self, host='', port=None, strict=None): + "Provide a default host, since the superclass requires one." + + # some joker passed 0 explicitly, meaning default port + if port == 0: + port = None + + # Note that we may pass an empty string as the host; this will throw + # an error when we attempt to connect. Presumably, the client code + # will call connect before then, with a proper host. + self._setup(self._connection_class(host, port, strict)) + + def _setup(self, conn): + self._conn = conn + + # set up delegation to flesh out interface + self.send = conn.send + self.putrequest = conn.putrequest + self.endheaders = conn.endheaders + self.set_debuglevel = conn.set_debuglevel + + conn._http_vsn = self._http_vsn + conn._http_vsn_str = self._http_vsn_str + + self.file = None + + def connect(self, host=None, port=None): + "Accept arguments to set the host/port, since the superclass doesn't." + + if host is not None: + self._conn._set_hostport(host, port) + self._conn.connect() + + def getfile(self): + "Provide a getfile, since the superclass' does not use this concept." + return self.file + + def putheader(self, header, *values): + "The superclass allows only one value argument." 
+ self._conn.putheader(header, '\r\n\t'.join(values)) + + def getreply(self): + """Compat definition since superclass does not define it. + + Returns a tuple consisting of: + - server status code (e.g. '200' if all goes well) + - server "reason" corresponding to status code + - any RFC822 headers in the response from the server + """ + try: + response = self._conn.getresponse() + except BadStatusLine, e: + ### hmm. if getresponse() ever closes the socket on a bad request, + ### then we are going to have problems with self.sock + + ### should we keep this behavior? do people use it? + # keep the socket open (as a file), and return it + self.file = self._conn.sock.makefile('rb', 0) + + # close our socket -- we want to restart after any protocol error + self.close() + + self.headers = None + return -1, e.line, None + + self.headers = response.msg + self.file = response.fp + return response.status, response.reason, response.msg + + def close(self): + self._conn.close() + + # note that self.file == response.fp, which gets closed by the + # superclass. just clear the object ref here. + ### hmm. messy. if status==-1, then self.file is owned by us. + ### well... we aren't explicitly closing, but losing this ref will + ### do it + self.file = None + +if hasattr(socket, 'ssl'): + class HTTPS(HTTP): + """Compatibility with 1.5 httplib interface + + Python 1.5.2 did not have an HTTPS class, but it defined an + interface for sending http requests that is also useful for + https. + """ + + _connection_class = HTTPSConnection + + def __init__(self, host='', port=None, key_file=None, cert_file=None, + strict=None): + # provide a default host, pass the X509 cert info + + # urf. compensate for bad input. + if port == 0: + port = None + self._setup(self._connection_class(host, port, key_file, + cert_file, strict)) + + # we never actually use these for anything, but we keep them + # here for compatibility with post-1.5.2 CVS. 
+ self.key_file = key_file + self.cert_file = cert_file + + +class HTTPException(Exception): + # Subclasses that define an __init__ must call Exception.__init__ + # or define self.args. Otherwise, str() will fail. + pass + +class NotConnected(HTTPException): + pass + +class InvalidURL(HTTPException): + pass + +class UnknownProtocol(HTTPException): + def __init__(self, version): + self.args = version, + self.version = version + +class UnknownTransferEncoding(HTTPException): + pass + +class UnimplementedFileMode(HTTPException): + pass + +class IncompleteRead(HTTPException): + def __init__(self, partial): + self.args = partial, + self.partial = partial + +class ImproperConnectionState(HTTPException): + pass + +class CannotSendRequest(ImproperConnectionState): + pass + +class CannotSendHeader(ImproperConnectionState): + pass + +class ResponseNotReady(ImproperConnectionState): + pass + +class BadStatusLine(HTTPException): + def __init__(self, line): + self.args = line, + self.line = line + +# for backwards compatibility +error = HTTPException + +class LineAndFileWrapper: + """A limited file-like object for HTTP/0.9 responses.""" + + # The status-line parsing code calls readline(), which normally + # get the HTTP status line. For a 0.9 response, however, this is + # actually the first line of the body! Clients need to get a + # readable file object that contains that line. + + def __init__(self, line, file): + self._line = line + self._file = file + self._line_consumed = 0 + self._line_offset = 0 + self._line_left = len(line) + + def __getattr__(self, attr): + return getattr(self._file, attr) + + def _done(self): + # called when the last byte is read from the line. After the + # call, all read methods are delegated to the underlying file + # object. 
+ self._line_consumed = 1 + self.read = self._file.read + self.readline = self._file.readline + self.readlines = self._file.readlines + + def read(self, amt=None): + if self._line_consumed: + return self._file.read(amt) + assert self._line_left + if amt is None or amt > self._line_left: + s = self._line[self._line_offset:] + self._done() + if amt is None: + return s + self._file.read() + else: + return s + self._file.read(amt - len(s)) + else: + assert amt <= self._line_left + i = self._line_offset + j = i + amt + s = self._line[i:j] + self._line_offset = j + self._line_left -= amt + if self._line_left == 0: + self._done() + return s + + def readline(self): + if self._line_consumed: + return self._file.readline() + assert self._line_left + s = self._line[self._line_offset:] + self._done() + return s + + def readlines(self, size=None): + if self._line_consumed: + return self._file.readlines(size) + assert self._line_left + L = [self._line[self._line_offset:]] + self._done() + if size is None: + return L + self._file.readlines() + else: + return L + self._file.readlines(size) + +def test(): + """Test this module. + + A hodge podge of tests collected here, because they have too many + external dependencies for the regular test suite. 
+ """ + + import sys + import getopt + opts, args = getopt.getopt(sys.argv[1:], 'd') + dl = 0 + for o, a in opts: + if o == '-d': dl = dl + 1 + host = 'www.python.org' + selector = '/' + if args[0:]: host = args[0] + if args[1:]: selector = args[1] + h = HTTP() + h.set_debuglevel(dl) + h.connect(host) + h.putrequest('GET', selector) + h.endheaders() + status, reason, headers = h.getreply() + print 'status =', status + print 'reason =', reason + print "read", len(h.getfile().read()) + print + if headers: + for header in headers.headers: print header.strip() + print + + # minimal test that code to extract host from url works + class HTTP11(HTTP): + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + h = HTTP11('www.python.org') + h.putrequest('GET', 'http://www.python.org/~jeremy/') + h.endheaders() + h.getreply() + h.close() + + if hasattr(socket, 'ssl'): + + for host, selector in (('sourceforge.net', '/projects/python'), + ): + print "https://%s%s" % (host, selector) + hs = HTTPS() + hs.set_debuglevel(dl) + hs.connect(host) + hs.putrequest('GET', selector) + hs.endheaders() + status, reason, headers = hs.getreply() + print 'status =', status + print 'reason =', reason + print "read", len(hs.getfile().read()) + print + if headers: + for header in headers.headers: print header.strip() + print + +if __name__ == '__main__': + test() diff --git a/LTA/LTAIngest/dav/qp_xml.py b/LTA/LTAIngest/dav/qp_xml.py new file mode 100644 index 0000000000000000000000000000000000000000..1c299dc88a574f6ab92752535ea4a9c73398245b --- /dev/null +++ b/LTA/LTAIngest/dav/qp_xml.py @@ -0,0 +1,243 @@ +# pylint: disable-msg=W0311,E1101,E1103,W0201,C0103,W0622,W0402,W0706,R0911,W0613,W0612,R0912,W0141,C0111,C0121 + +# qp_xml: Quick Parsing for XML +# +# Written by Greg Stein. Public Domain. +# No Copyright, no Rights Reserved, and no Warranties. +# +# This module is maintained by Greg and is available as part of the XML-SIG +# distribution. 
This module and its changelog can be fetched at: +# http://www.lyra.org/cgi-bin/viewcvs.cgi/xml/xml/utils/qp_xml.py +# +# Additional information can be found on Greg's Python page at: +# http://www.lyra.org/greg/python/ +# +# This module was added to the XML-SIG distribution on February 14, 2000. +# As part of that distribution, it falls under the XML distribution license. +# + +import string + +try: + import pyexpat +except ImportError: + from xml.parsers import pyexpat + +error = __name__ + '.error' + + +# +# The parsing class. Instantiate and pass a string/file to .parse() +# +class Parser: + def __init__(self): + self.reset() + + def reset(self): + self.root = None + self.cur_elem = None + + def find_prefix(self, prefix): + elem = self.cur_elem + while elem: + if elem.ns_scope.has_key(prefix): + return elem.ns_scope[prefix] + elem = elem.parent + + if prefix == '': + return '' # empty URL for "no namespace" + + return None + + def process_prefix(self, name, use_default): + idx = string.find(name, ':') + if idx == -1: + if use_default: + return self.find_prefix(''), name + return '', name # no namespace + + if string.lower(name[:3]) == 'xml': + return '', name # name is reserved by XML. don't break out a NS. 
+ + ns = self.find_prefix(name[:idx]) + if ns is None: + raise error, 'namespace prefix ("%s") not found' % name[:idx] + + return ns, name[idx+1:] + + def start(self, name, attrs): + elem = _element(name=name, lang=None, parent=None, + children=[], ns_scope={}, attrs={}, + first_cdata='', following_cdata='') + + if self.cur_elem: + elem.parent = self.cur_elem + elem.parent.children.append(elem) + self.cur_elem = elem + else: + self.cur_elem = self.root = elem + + work_attrs = [ ] + + # scan for namespace declarations (and xml:lang while we're at it) + for name, value in attrs.items(): + if name == 'xmlns': + elem.ns_scope[''] = value + elif name[:6] == 'xmlns:': + elem.ns_scope[name[6:]] = value + elif name == 'xml:lang': + elem.lang = value + else: + work_attrs.append((name, value)) + + # inherit xml:lang from parent + if elem.lang is None and elem.parent: + elem.lang = elem.parent.lang + + # process prefix of the element name + elem.ns, elem.name = self.process_prefix(elem.name, 1) + + # process attributes' namespace prefixes + for name, value in work_attrs: + elem.attrs[self.process_prefix(name, 0)] = value + + def end(self, name): + parent = self.cur_elem.parent + + del self.cur_elem.ns_scope + del self.cur_elem.parent + + self.cur_elem = parent + + def cdata(self, data): + elem = self.cur_elem + if elem.children: + last = elem.children[-1] + last.following_cdata = last.following_cdata + data + else: + elem.first_cdata = elem.first_cdata + data + + def parse(self, input): + self.reset() + + p = pyexpat.ParserCreate() + p.StartElementHandler = self.start + p.EndElementHandler = self.end + p.CharacterDataHandler = self.cdata + + try: + if type(input) == type(''): + p.Parse(input, 1) + else: + while 1: + s = input.read(_BLOCKSIZE) + if not s: + p.Parse('', 1) + break + + p.Parse(s, 0) + + finally: + if self.root: + _clean_tree(self.root) + + return self.root + + +# +# handy function for dumping a tree that is returned by Parser +# +def dump(f, root): + 
f.write('<?xml version="1.0"?>\n') + namespaces = _collect_ns(root) + _dump_recurse(f, root, namespaces, dump_ns=1) + f.write('\n') + + +# +# This function returns the element's CDATA. Note: this is not recursive -- +# it only returns the CDATA immediately within the element, excluding the +# CDATA in child elements. +# +def textof(elem): + return elem.textof() + + +######################################################################### +# +# private stuff for qp_xml +# + +_BLOCKSIZE = 16384 # chunk size for parsing input + +class _element: + def __init__(self, **kw): + self.__dict__.update(kw) + + def textof(self): + '''Return the CDATA of this element. + + Note: this is not recursive -- it only returns the CDATA immediately + within the element, excluding the CDATA in child elements. + ''' + s = self.first_cdata + for child in self.children: + s = s + child.following_cdata + return s + + def find(self, name, ns=''): + for elem in self.children: + if elem.name == name and elem.ns == ns: + return elem + return None + + +def _clean_tree(elem): + elem.parent = None + del elem.parent + map(_clean_tree, elem.children) + + +def _collect_recurse(elem, dict): + dict[elem.ns] = None + for ns, name in elem.attrs.keys(): + dict[ns] = None + for child in elem.children: + _collect_recurse(child, dict) + +def _collect_ns(elem): + "Collect all namespaces into a NAMESPACE -> PREFIX mapping." 
+ d = { '' : None } + _collect_recurse(elem, d) + del d[''] # make sure we don't pick up no-namespace entries + keys = d.keys() + for i in range(len(keys)): + d[keys[i]] = i + return d + +def _dump_recurse(f, elem, namespaces, lang=None, dump_ns=0): + if elem.ns: + f.write('<ns%d:%s' % (namespaces[elem.ns], elem.name)) + else: + f.write('<' + elem.name) + for (ns, name), value in elem.attrs.items(): + if ns: + f.write(' ns%d:%s="%s"' % (namespaces[ns], name, value)) + else: + f.write(' %s="%s"' % (name, value)) + if dump_ns: + for ns, id in namespaces.items(): + f.write(' xmlns:ns%d="%s"' % (id, ns)) + if elem.lang != lang: + f.write(' xml:lang="%s"' % elem.lang) + if elem.children or elem.first_cdata: + f.write('>' + elem.first_cdata) + for child in elem.children: + _dump_recurse(f, child, namespaces, elem.lang) + f.write(child.following_cdata) + if elem.ns: + f.write('</ns%d:%s>' % (namespaces[elem.ns], elem.name)) + else: + f.write('</%s>' % elem.name) + else: + f.write('/>') diff --git a/LTA/LTAIngest/dav/webdav/Condition.py b/LTA/LTAIngest/dav/webdav/Condition.py new file mode 100644 index 0000000000000000000000000000000000000000..76acf94ca3d99a5cf24fe7410fe721ff65c18540 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/Condition.py @@ -0,0 +1,475 @@ +# pylint: disable-msg=R0921,W0704,R0901,W0511,R0201 +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +This module contains classes for creating a search condition according to the DASL draft. +The classes will output the WHERE part of a search request to a WebDAV server. + +Instances of the classes defined in this module form a tree data structure which represents +a search condition. This tree is made up of AND-nodes, OR-nodes, Operator- and comparison- +nodes and from property (i.e. variable) and constant leaf nodes. +""" + + +import types +from time import strftime +from calendar import timegm +from rfc822 import formatdate + +from webdav.Constants import NS_DAV, PROP_LAST_MODIFIED, DATE_FORMAT_ISO8601 + + +__version__ = "$Revision$"[11:-2] + + +class ConditionTerm(object): + """ + This is the abstact base class for all condition terms. + """ + def __init__(self): + pass + + def toXML(self): + """ + Abstact method which return a XML string which can be passed to a WebDAV server + for a search condition. + """ + raise NotImplementedError + + # start Tamino workaround for missing like-op: + def postFilter(self, resultSet): + """ + Abstact method for temporary workaround for Tamino's absense of the like-operator. + This method shall filter the given result set for those resources which match + all Contains-trems. + """ + return resultSet + # end of workaround + + +class IsCollectionTerm(ConditionTerm): + """ Leaf condition. Checks if the matching resources are collections. """ + + def __init__(self): + """ Constructor. """ + + ConditionTerm.__init__(self) + + def toXML(self): + """ + Returns XML encoding. + """ + + return "<D:is-collection/>" + + +class Literal(ConditionTerm): + """ + A leaf class for condition expressions representing a constant value. + """ + def __init__(self, literal): + ConditionTerm.__init__(self) + self.literal = literal + + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:literal>" + self.literal + "</D:literal>" + + +class UnaryTerm(ConditionTerm): + """ + Base class of all nodes with a single child node. 
+ """ + def __init__(self, child): + ConditionTerm.__init__(self) + self.child = child + + def toXML(self): + ''' + Returns XML encoding. + ''' + return self.child.toXML() + + +class BinaryTerm(ConditionTerm): + """ + Base class of all nodes with two child nodes + """ + def __init__(self, left, right): + ConditionTerm.__init__(self) + self.left = left + self.right = right + + def toXML(self): + ''' + Returns XML encoding. + ''' + return self.left.toXML() + self.right.toXML() + +class TupleTerm(ConditionTerm): + """ + Base class of all nodes with multiple single child nodes. + """ + def __init__(self, terms): + ConditionTerm.__init__(self) + self.terms = terms + + def addTerm(self, term): + ''' + Removes a term. + + @param term: term to add + ''' + self.terms.append(term) + + def removeTerm(self, term): + ''' + Adds a term. + + @param term: term to remove + ''' + try: + self.terms.remove(term) + except ValueError: + pass + + def toXML(self): + ''' + Returns XML encoding. + ''' + result = "" + for term in self.terms: + result += term.toXML() + return result + + +class AndTerm(TupleTerm): + """ + This class represents and logical AND-condition with multiple sub terms. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:and>" + TupleTerm.toXML(self) + "</D:and>" + + # start Tamino workaround for missing like-op: + def postFilter(self, resultSet): + ''' + Filters the given result set. This is a TAMINO WebDav server workaround + for the missing 'like' tag. + + @param resultSet: the result set that needs to be filtered. + ''' + for term in self.terms: + filtered = term.postFilter(resultSet) + resultSet = filtered + return resultSet + # end of workaround + +class OrTerm(TupleTerm): + """ + This class represents and logical OR-condition with multiple sub terms. + """ + def toXML(self): + ''' + Returns XML encoding. 
+ ''' + return "<D:or>" + TupleTerm.toXML(self) + "</D:or>" + + # start Tamino workaround for missing like-op: + def postFilter(self, resultSet): + ''' + Filters the given result set. This is a TAMINO WebDav server workaround + for the missing 'like' tag. + + @param resultSet: the result set that needs to be filtered. + ''' + raise NotImplementedError + + +class NotTerm(UnaryTerm): + """ + This class represents a negation term for the contained sub term. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + # start Tamino workaround for missing like-op: + if isinstance(self.child, ContainsTerm): + return "" + # end of workaround + return "<D:not>" + UnaryTerm.toXML(self) + "</D:not>" + + # start Tamino workaround for missing like-op: + def postFilter(self, resultSet): + ''' + Filters the given result set. This is a TAMINO WebDav server workaround + for the missing 'like' tag. + + @param resultSet: the result set that needs to be filtered. + ''' + if isinstance(self.child, ContainsTerm): + self.child.negate = 1 + # TODO: pass on filter + return self.child.postFilter(resultSet) + + +class ExistsTerm(UnaryTerm): + """ + Nodes of this class must have a single child with tuple type (of len 2) representing a + WebDAV property. + This leaf term evaluates to true if the (child) property exists. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return '<D:is-defined><D:prop xmlns="%s"><%s' % self.child + ' /></D:prop></D:is-defined>' + +class ContentContainsTerm(UnaryTerm): + """ + This class can be used to search for a given phrase in resources' contents. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:contains>" + self.child + "</D:contains>" + + + +class BinaryRelationTerm(BinaryTerm): + """ + This is the abstact base class for the following relation operands. 
+ """ + def __init__(self, left, right): + BinaryTerm.__init__(self, left, right) + if isinstance(self.left, types.StringType): # Must be namespace + name pair + self.left = ('DAV:', self.left) + if not isinstance(self.right, Literal): + self.right = Literal(self.right) # Must be Literal instance + + def toXML(self): + ''' + Returns XML encoding. + ''' + ## TODO: extract name space and create shortcut for left element + return '<D:prop xmlns="%s"><%s /></D:prop>' % self.left + self.right.toXML() + + +class StringRelationTerm(BinaryRelationTerm): + """ + This is the abstact base class for the following string relation classes. + """ + def __init__(self, left, right, caseless=None): + """ + @param left: webdav property (namespace, name) + @param right: string/unicode literal + qparam caseless: 1 for case sensitive comparison + """ + BinaryRelationTerm.__init__(self, left, Literal(right)) + self.caseless = caseless + if self.caseless: + self.attrCaseless = "yes" + else: + self.attrCaseless = "no" + +class NumberRelationTerm(BinaryRelationTerm): + """ + This is the abstact base class for the following number comparison classes. + """ + def __init__(self, left, right): + """ + @param left: webdav property (namespace, name) + @param right: constant number + """ + ## TODO: implemet typed literal + BinaryRelationTerm.__init__(self, left, Literal(str(right))) + +class DateRelationTerm(BinaryRelationTerm): + """ + This is the abstact base class for the following date comparison classes. 
+ """ + def __init__(self, left, right): + """ + @param left: webdav property (namespace, name) + @param right: string literal containing a date in ISO8601 format + """ + ## TODO: implemet typed literal + assert len(right) == 9, "No time is specified for literal: " + str(right) + BinaryRelationTerm.__init__(self, left, right) + if self.left == (NS_DAV, PROP_LAST_MODIFIED): + rfc822Time = formatdate(timegm(right)) # must not use locale setting + self.right = Literal(rfc822Time) + else: + self.right = Literal(strftime(DATE_FORMAT_ISO8601, right)) + + +class MatchesTerm(StringRelationTerm): + """ + Nodes of this class evaluate to true if the (child) property's value matches the (child) string. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return '<D:eq caseless="%s">' % self.attrCaseless + StringRelationTerm.toXML(self) + "</D:eq>" + +class ContainsTerm(StringRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) property's value contains the + (right child) string. + """ + def __init__(self, left, right, isTaminoWorkaround=False): + right = unicode(right) + StringRelationTerm.__init__(self, left, "%" + right + "%") + # Tamino workaround: operator like is not yet implemented: + self.negate = 0 + self.isTaminoWorkaround = isTaminoWorkaround + + def toXML(self): + ''' + Returns XML encoding. + ''' + # Tamino workaround: operator like is not yet implemented: + # Produce a is-defined-condition instead + if self.isTaminoWorkaround: + return "<D:isdefined><D:prop xmlns='%s'><%s" % self.left + " /></D:prop></D:isdefined>" + else: + return '<D:like caseless="%s">' % self.attrCaseless + StringRelationTerm.toXML(self) + "</D:like>" + + # start Tamino workaround for missing like-op: + def postFilter(self, resultSet): + ''' + Filters the given result set. This is a TAMINO WebDav server workaround + for the missing 'like' tag. + + @param resultSet: the result set that needs to be filtered. 
+ ''' + newResult = {} + word = self.right.literal[1:-1] # remove leading and trailing '%' characters (see __init__()) + for url, properties in resultSet.items(): + value = properties.get(self.left) + if self.negate: + if not value or value.textof().find(word) < 0: + newResult[url] = properties + else: + if value and value.textof().find(word) >= 0: + newResult[url] = properties + return newResult + # end of workaround + +class IsEqualTerm(NumberRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) numerical property's value is equal + to the (right child) number. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:eq>" + NumberRelationTerm.toXML(self) + "</D:eq>" + +class IsGreaterTerm(NumberRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) numerical property's value is greater + than the (right child) number. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:gt>" + NumberRelationTerm.toXML(self) + "</D:gt>" + +class IsGreaterOrEqualTerm(NumberRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) numerical property's value is greater + than or equal to the (right child) number. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:gte>" + NumberRelationTerm.toXML(self) + "</D:gte>" + +class IsSmallerTerm(NumberRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) numerical property's value is less + than the (right child) number. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:lt>" + NumberRelationTerm.toXML(self) + "</D:lt>" + +class IsSmallerOrEqualTerm(NumberRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) numerical property's value is less + than or equal to the (right child) number. + """ + def toXML(self): + ''' + Returns XML encoding. 
+ ''' + return "<D:lte>" + NumberRelationTerm.toXML(self) + "</D:lte>" + + +class OnTerm(DateRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) property's value is a date + equal to the (right child) date. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:eq>" + DateRelationTerm.toXML(self) + "</D:eq>" + +class AfterTerm(DateRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) property's value is a date + succeeding the (right child) date. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:gt>" + DateRelationTerm.toXML(self) + "</D:gt>" + +class BeforeTerm(DateRelationTerm): + """ + Nodes of this class evaluate to true if the (left child) property's value is a date + preceeding the (right child) date. + """ + def toXML(self): + ''' + Returns XML encoding. + ''' + return "<D:lt>" + DateRelationTerm.toXML(self) + "</D:lt>" + + + +# Simple module test +if __name__ == '__main__': + # use the example from the webdav specification + condition = AndTerm( (MatchesTerm('getcontenttype', 'image/gif'), \ + IsGreaterTerm('getcontentlength', 4096)) ) + print "Where: " + condition.toXML() diff --git a/LTA/LTAIngest/dav/webdav/Connection.py b/LTA/LTAIngest/dav/webdav/Connection.py new file mode 100644 index 0000000000000000000000000000000000000000..7d7762fadf282d7f4d3ea018994d34ef15487b95 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/Connection.py @@ -0,0 +1,242 @@ +# pylint: disable-msg=W0142,W0102,R0901,R0904,E0203,E1101,C0103 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +The contained class extends the HTTPConnection class for WebDAV support. +""" + + +from httplib import HTTPConnection, CannotSendRequest, BadStatusLine, ResponseNotReady +from copy import copy +import base64 # for basic authentication +import md5 +import mimetypes +import os # file handling +import urllib +import types +import socket # to "catch" socket.error +from threading import RLock +from davlib import DAV +from qp_xml import Parser + +from webdav.WebdavResponse import MultiStatusResponse, ResponseFormatError +from webdav import Constants +from webdav.logger import getDefaultLogger + + +__version__ = "$LastChangedRevision$" + + +class Connection(DAV): + """ + This class handles a connection to a WebDAV server. + This class is used internally. Client code should prefer classes + L{WebdavClient.ResourceStorer} and L{WebdavClient.CollectionStorer}. + + @author: Roland Betz + """ + + # Constants + # The following switch activates a workaround for the Tamino webdav server: + # Tamino expects URLs which are passed in a HTTP header to be Latin-1 encoded + # instead of Utf-8 encoded. + # Set this switch to zero in order to communicate with conformant servers. 
+ blockSize = 30000 + MaxRetries = 10 + + def __init__(self, *args, **kwArgs): + DAV.__init__(self, *args, **kwArgs) + self.__authorizationInfo = None + self.logger = getDefaultLogger() + self.isConnectedToCatacomb = True + self.serverTypeChecked = False + self.lock = RLock() + + def _request(self, method, url, body=None, extra_hdrs={}): + + self.lock.acquire() + try: + # add the authorization header + extraHeaders = copy(extra_hdrs) + if self.__authorizationInfo: + extraHeaders["AUTHORIZATION"] = self.__authorizationInfo.authorization + + # encode message parts + body = _toUtf8(body) + url = _urlEncode(url) + for key, value in extraHeaders.items(): + extraHeaders[key] = _toUtf8(value) + if key == "Destination": # copy/move header + if self.isConnectedToCatacomb: + extraHeaders[key] = _toUtf8(value.replace(Constants.SHARP, Constants.QUOTED_SHARP)) + + else: # in case of TAMINO 4.4 + extraHeaders[key] = _urlEncode(value) + # pass message to httplib class + for retry in range(0, Connection.MaxRetries): # retry loop + try: + self.logger.debug("REQUEST Send %s for %s" % (method, url)) + self.logger.debug("REQUEST Body: " + repr(body)) + for hdr in extraHeaders.items(): + self.logger.debug("REQUEST Header: " + repr(hdr)) + self.request(method, url, body, extraHeaders) + response = self.getresponse() + break # no retry needed + except (CannotSendRequest, socket.error, BadStatusLine, ResponseNotReady), exc: + # Workaround, start: reconnect and retry... + self.logger.debug("Exception: " + str(exc) + " Retry ... ") + self.close() + try: + self.connect() + except (CannotSendRequest, socket.error, BadStatusLine, ResponseNotReady), exc: + raise WebdavError("Cannot perform request. Connection failed.") + if retry == Connection.MaxRetries - 1: + raise WebdavError("Cannot perform request.") + return self.__evaluateResponse(method, response) + finally: + self.lock.release() + + def __evaluateResponse(self, method, response): + """ Evaluates the response of the WebDAV server. 
""" + + status, reason = response.status, response.reason + self.logger.debug("Method: " + method + " Status %d: " % status + reason) + + if status >= Constants.CODE_LOWEST_ERROR: # error has occured ? + self.logger.debug("ERROR Response: " + response.read()) + response.close() + raise WebdavError(reason, status) + + if status == Constants.CODE_MULTISTATUS: + content = response.read() + ## check for UTF-8 encodig + response.root = Parser().parse(content) + try: + response.msr = MultiStatusResponse(response.root) + except ResponseFormatError: + raise WebdavError("Invalid WebDAV response.") + response.close() + self.logger.debug("RESPONSE (Multi-Status): " + unicode(response.msr)) + elif method == 'LOCK' and status == Constants.CODE_SUCCEEDED: + response.parse_lock_response() + response.close() + elif method != 'GET' and method != 'PUT': + self.logger.debug("RESPONSE Body: " + response.read()) + response.close() + return response + + def addBasicAuthorization(self, user, password, realm=None): + if user and len(user) > 0: + self.__authorizationInfo = _BasicAuthenticationInfo(realm=realm, user=user, password=password) + + def addDigestAuthorization(self, user, password, realm=None): + if user and len(user) > 0: + self.__authorizationInfo = _DigestAuthenticationInfo(realm=realm, user=user, password=password) + + def putFile(self, path, srcfile, header={}): + self.lock.acquire() + try: + # Assemble header + size = os.fstat(srcfile.fileno()).st_size + header["Content-length"] = str(size) + contentType, contentEnc = mimetypes.guess_type(path) + if contentType: + header['Content-Type'] = contentType + if contentEnc: + header['Content-Encoding'] = contentEnc + if self.__authorizationInfo: + header["AUTHORIZATION"] = self.__authorizationInfo.authorization + + # send first request + path = _urlEncode(path) + try: + HTTPConnection.request(self, 'PUT', path, "", header) + self._blockCopySocket(srcfile, self, Connection.blockSize) + srcfile.close() + response = 
self.getresponse() + except (CannotSendRequest, socket.error, BadStatusLine, ResponseNotReady), exc: + self.logger.debug("Exception: " + str(exc) + " Retry ... ") + raise WebdavError("Cannot perform request.") + status, reason = (response.status, response.reason) + self.logger.debug("Status %d: %s" % (status, reason)) + try: + if status >= Constants.CODE_LOWEST_ERROR: # error has occured ? + raise WebdavError(reason, status) + finally: + self.logger.debug("RESPONSE Body: " + response.read()) + response.close() + return response + finally: + self.lock.release() + + def _blockCopySocket(self, source, toSocket, blockSize): + transferedBytes = 0 + block = source.read(blockSize) + #while source.readinto(block, blockSize): + while len(block): + toSocket.send(block) + self.logger.debug("Wrote %d bytes." % len(block)) + transferedBytes += len(block) + block = source.read(blockSize) + self.logger.info("Transfered %d bytes." % transferedBytes) + + def __str__(self): + return self.protocol + "://" + self.host + ':' + str(self.port) + + +class _BasicAuthenticationInfo(object): + def __init__(self, **kwArgs): + self.__dict__.update(kwArgs) + self.cookie = base64.encodestring("%s:%s" % (self.user, self.password) ).strip() + self.authorization = "Basic " + self.cookie + self.password = None # protect password security + +class _DigestAuthenticationInfo(object): + def __init__(self, **kwArgs): + self.__dict__.update(kwArgs) + value = "%s:%s:%s" % (self.user, self.realm, self.password) + value = value.strip() + self.extra = md5.new(value).digest() + self.authorization = "Digest realm=%s,user=%s" + + +class WebdavError(IOError): + def __init__(self, reason, code=0): + IOError.__init__(self, code) + self.code = code + self.reason = reason + def __str__(self): + return self.reason + +def _toUtf8(body): + if body: + if type(body) != types.UnicodeType: + body = unicode(body, 'latin-1') + body = body.encode('utf-8') + return body + +def _toLatin1(body): + if type(body) == 
types.UnicodeType: # unicode text detected + body = body.encode('latin-1') + return body + +def _urlEncode(url): + if type(url) != types.UnicodeType: + url = unicode(url, 'latin-1') + if Constants.CONFIG_UNICODE_URL: + url = url.encode('utf-8') + return urllib.quote(url) diff --git a/LTA/LTAIngest/dav/webdav/Constants.py b/LTA/LTAIngest/dav/webdav/Constants.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c237aebdf0415747b6ff1df98c8c1425c52357 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/Constants.py @@ -0,0 +1,199 @@ +# pylint: disable-msg=C0103 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Contains XML tag names for the WebDAV protocol (RFC 2815) +and further WebDAV related constants. 
"""
Contains XML tag names and other constants for the WebDAV protocol
(RFC 2518, WebDAV class 1 and 2 -- the original header cited "RFC 2815",
which is unrelated; WebDAV distributed authoring is RFC 2518),
plus constants for the DASL search draft, the ACL draft
(draft-ietf-webdav-acl-09) and Delta-V (RFC 3253).
"""


__version__ = "$Revision$"[11:-2]


QUOTED_SHARP = "%23"
SHARP = "#"

# Date formats
DATE_FORMAT_ISO8601 = r"%Y-%m-%dT%H:%M:%SZ"
DATE_FORMAT_HTTP = r"%a, %d %b %Y %H:%M:%S GMT" # not used, substituted by rfc822 function

# XML namespaces
NS_DAV = 'DAV:'
NS_TAMINO = 'http://namespaces.softwareag.com/tamino/response2'

# PROPFIND / PROPPATCH request elements
TAG_PROPERTY_FIND = 'propfind'
TAG_PROPERTY_NAME = 'propname'
TAG_PROPERTY_UPDATE = 'propertyupdate'
TAG_PROPERTY_SET = 'set'
TAG_PROPERTY_REMOVE = 'remove'
TAG_ALL_PROPERTY = 'allprop'
TAG_PROP = 'prop'

# Multi-status response elements
TAG_MULTISTATUS = 'multistatus'
TAG_RESPONSE = 'response'
TAG_HREF = 'href'
TAG_PROPERTY_STATUS = 'propstat'
TAG_STATUS = 'status'
# NOTE(review): RFC 2518 spells this element 'responsedescription'; the
# value below is missing an 'e' and can therefore never match a compliant
# server's element.  Kept as-is because response-parsing code elsewhere may
# depend on the current (broken) value -- TODO confirm and fix globally.
TAG_RESPONSEDESCRIPTION = 'responsdescription'

# WebDAV live property names
PROP_CREATION_DATE = 'creationdate'
PROP_DISPLAY_NAME = 'displayname'
PROP_CONTENT_LANGUAGE = 'getcontentlanguage'
PROP_CONTENT_LENGTH = 'getcontentlength'
PROP_CONTENT_TYPE = 'getcontenttype'
PROP_ETAG = 'getetag'
PROP_MODIFICATION_DATE = 'modificationdate' # this property is supported by
# Tamino 4.4 but not by Catacomb; the date format is ISO8601
PROP_LAST_MODIFIED = 'getlastmodified'
PROP_LOCK_DISCOVERY = 'lockdiscovery'
PROP_RESOURCE_TYPE = 'resourcetype'
PROP_SOURCE = 'source'
PROP_SUPPORTED_LOCK = 'supportedlock'
PROP_OWNER = 'owner'

# Values of the 'resourcetype' property
PROP_RESOURCE_TYPE_RESOURCE = 'resource'
PROP_RESOURCE_TYPE_COLLECTION = 'collection'

# 'source' property child elements
TAG_LINK = 'link'
TAG_LINK_SOURCE = 'src'
TAG_LINK_DESTINATION = 'dst'

# Locking elements
TAG_LOCK_ENTRY = 'lockentry'
TAG_LOCK_SCOPE = 'lockscope'
TAG_LOCK_TYPE = 'locktype'
TAG_LOCK_INFO = 'lockinfo'
TAG_ACTIVE_LOCK = 'activelock'
TAG_LOCK_DEPTH = 'depth'
TAG_LOCK_TOKEN = 'locktoken'
TAG_LOCK_TIMEOUT = 'timeout'
TAG_LOCK_EXCLUSIVE = 'exclusive'
TAG_LOCK_SHARED = 'shared'
TAG_LOCK_OWNER = 'owner'

# HTTP status code constants
CODE_MULTISTATUS = 207
CODE_SUCCEEDED = 200
CODE_CREATED = 201
CODE_NOCONTENT = 204

CODE_LOWEST_ERROR = 300

CODE_UNAUTHORIZED = 401
CODE_FORBIDDEN = 403
CODE_NOT_FOUND = 404
CODE_CONFLICT = 409
CODE_PRECONDITION_FAILED = 412
CODE_LOCKED = 423 # resource is locked (423 "Locked"; the old
# "# no permission" comment was misleading)
CODE_FAILED_DEPENDENCY = 424

CODE_OUTOFMEM = 507

# Module-level switch: 1 = URLs are UTF-8 encoded before quoting,
# 0 = Latin-1.  Toggled via webdav.WebdavClient.switchUnicodeUrlOn().
CONFIG_UNICODE_URL = 1

# constants for WebDAV DASL according to draft

TAG_SEARCH_REQUEST = 'searchrequest'
TAG_SEARCH_BASIC = 'basicsearch'
TAG_SEARCH_SELECT = 'select'
TAG_SEARCH_FROM = 'from'
TAG_SEARCH_SCOPE = 'scope'
TAG_SEARCH_WHERE = 'where'

# constants for WebDAV ACP (according to draft-ietf-webdav-acl-09) below ...

TAG_ACL = 'acl'
TAG_ACE = 'ace'
TAG_GRANT = 'grant'
TAG_DENY = 'deny'
TAG_PRIVILEGE = 'privilege'
TAG_PRINCIPAL = 'principal'
TAG_ALL = 'all'
TAG_AUTHENTICATED = 'authenticated'
TAG_UNAUTHENTICATED = 'unauthenticated'
TAG_OWNER = 'owner'
TAG_PROPERTY = 'property'
TAG_SELF = 'self'
TAG_INHERITED = 'inherited'
TAG_PROTECTED = 'protected'
TAG_SUPPORTED_PRIVILEGE = 'supported-privilege'
TAG_DESCRIPTION = 'description'

# privileges for WebDAV ACP:
TAG_READ = 'read'
TAG_WRITE = 'write'
TAG_WRITE_PROPERTIES = 'write-properties'
TAG_WRITE_CONTENT = 'write-content'
TAG_UNLOCK = 'unlock'
TAG_READ_ACL = 'read-acl'
TAG_READ_CURRENT_USER_PRIVILEGE_SET = 'read-current-user-privilege-set'
TAG_WRITE_ACL = 'write-acl'
# (a redundant second "TAG_ALL = 'all'" assignment that originally stood
# here was removed; it re-bound the same value defined above)
TAG_BIND = 'bind'
TAG_UNBIND = 'unbind'

# tamino specific security option
TAG_TAMINO_SECURITY = 'security'

# maybe this shouldn't be hard coded in here, but for now we'll just have to
# live with it this way ...
TAMINO_PRIVILEGES = (TAG_READ, TAG_WRITE, TAG_READ_ACL, TAG_WRITE_ACL)

# properties for WebDAV ACP:
PROP_CURRENT_USER_PRIVILEGE_SET = 'current-user-privilege-set'
PROP_SUPPORTED_PRIVILEGE_SET = 'supported-privilege-set'
PROP_PRINCIPAL_COLLECTION_SET = 'principal-collection-set'

# reports for WebDAV ACP
REPORT_ACL_PRINCIPAL_PROP_SET = 'acl-principal-prop-set'


# constants for WebDAV Delta-V (RFC 3253)

# WebDAV Delta-V method names
METHOD_REPORT = 'REPORT'
METHOD_VERSION_CONTROL = 'VERSION-CONTROL'
METHOD_UNCHECKOUT = 'UNCHECKOUT'
METHOD_CHECKOUT = 'CHECKOUT'
METHOD_CHECKIN = 'CHECKIN'
METHOD_UPDATE = 'UPDATE'

# Special properties: (namespace, name) tuples
PROP_SUCCESSOR_SET = (NS_DAV, 'successor-set')
PROP_PREDECESSOR_SET = (NS_DAV, 'predecessor-set')
PROP_VERSION_HISTORY = (NS_DAV, 'version-history')
PROP_CREATOR = (NS_DAV, 'creator-displayname')
PROP_VERSION_NAME = (NS_DAV, 'version-name')
PROP_CHECKEDIN = (NS_DAV, 'checked-in')
PROP_CHECKEDOUT = (NS_DAV, 'checked-out')
PROP_COMMENT = (NS_DAV, 'comment')

# XML tags for request body
TAG_VERSION_TREE = 'version-tree'
TAG_LOCATE_BY_HISTORY = 'locate-by-history'
TAG_UPDATE = 'update'
TAG_VERSION = 'version'

# HTTP header constants
HTTP_HEADER_DEPTH_INFINITY = 'infinity'
HTTP_HEADER_IF = 'if'
HTTP_HEADER_DAV = 'dav'
HTTP_HEADER_DASL = 'dasl'
HTTP_HEADER_OPTION_ACL = 'access-control'
HTTP_HEADER_OPTION_DAV_BASIC_SEARCH = 'DAV:basicsearch'
HTTP_HEADER_SERVER = 'server'
HTTP_HEADER_SERVER_TAMINO = 'Apache/2.0.54 (Win32)'
# pylint: disable-msg=R0904,W0142,W0511,W0104,C0321,E1103,W0212
#
# Copyright 2008 German Aerospace Center (DLR)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
Check name of new collections/resources for "illegal" characters.
"""


import re
import unicodedata


__version__ = "$LastChangedRevision$"


# German umlauts and sharp s are the only non-ASCII letters accepted in names.
_unicodeUmlaut = [unicodedata.lookup("LATIN CAPITAL LETTER A WITH DIAERESIS"),
                  unicodedata.lookup("LATIN SMALL LETTER A WITH DIAERESIS"),
                  unicodedata.lookup("LATIN CAPITAL LETTER O WITH DIAERESIS"),
                  unicodedata.lookup("LATIN SMALL LETTER O WITH DIAERESIS"),
                  unicodedata.lookup("LATIN CAPITAL LETTER U WITH DIAERESIS"),
                  unicodedata.lookup("LATIN SMALL LETTER U WITH DIAERESIS"),
                  unicodedata.lookup("LATIN SMALL LETTER SHARP S")]

# Define characters and character base sets.
# Raw strings are used for the backslash-escaped characters: the original
# "\-"-style literals are invalid escape sequences (DeprecationWarning on
# newer Pythons); the resulting string values are identical.
_german = u"".join(_unicodeUmlaut)
_alpha = "A-Za-z"
_num = "0-9"
_alphaNum = _alpha + _num
_space = " "
_under = "_"
_dash = r"\-"
_dot = r"\."
_exclam = r"\!"
_tilde = r"\~"
_dollar = r"\$"
_plus = "+"
_equal = "="
_sharp = "#"

# Define character groups
_letterNum = _alphaNum + _german
_letter = _alpha + _german

# Define character sets for names
firstPropertyChar = _letter + _under
propertyChar = firstPropertyChar + _num + _dash + _dot
firstResourceChar = firstPropertyChar + _num + _tilde + _exclam + _dollar + \
                    _dot + _dash + _plus + _equal + _sharp
resourceChar = firstResourceChar + _space

# Regular expressions for name validation:
# the *First* expressions match a valid FIRST character, the other two
# search for any INVALID character in the whole name.
_propertyFirstRe = re.compile(u"^[" + firstPropertyChar + "]")

_propertyRe = re.compile(u"[^" + propertyChar + "]")
_resourceFirstRe = re.compile(u"^[" + firstResourceChar + "]")
_resourceRe = re.compile(u"[^" + resourceChar + "]")


def isValidPropertyName(name):
    """
    Check if the given property name is valid.

    @param name: Property name.
    @type name: C{unicode}

    @return: Boolean indicating whether the given property name is valid or not.
    @rtype: C{bool}
    """

    # "is None" replaces the original "== None" comparisons (identity idiom)
    return _propertyRe.search(name) is None and \
           _propertyFirstRe.match(name) is not None


def isValidResourceName(name):
    """
    Check if the given resource name is valid.

    @param name: Resource name.
    @type name: C{unicode}

    @return: Boolean indicating whether the given resource name is valid or not.
    @rtype: C{bool}
    """

    return _resourceRe.search(name) is None and \
           _resourceFirstRe.match(name) is not None


def validatePropertyName(name):
    """
    Check if the given property name is valid.

    @param name: Property name.
    @type name: C{unicode}
    @raise WrongNameError: if validation fails (see L{WrongNameError})
    """

    illegalChar = _propertyRe.search(name)
    if illegalChar:
        raise WrongNameError(illegalChar.start(), name[illegalChar.start()])
    if not _propertyFirstRe.match(name):
        if len(name) > 0:
            raise WrongNameError(0, name[0])
        else:
            # empty name: there is no offending character to report
            raise WrongNameError(0, 0)


def validateResourceName(name):
    """
    Check if the given resource name is valid.

    @param name: name of resource/collection
    @type name: C{unicode}
    @raise WrongNameError: if validation fails (see L{WrongNameError})
    """

    illegalChar = _resourceRe.search(name)
    if illegalChar:
        raise WrongNameError(illegalChar.start(), name[illegalChar.start()])
    if not _resourceFirstRe.match(name):
        if len(name) > 0:
            raise WrongNameError(0, name[0])
        else:
            # empty name: there is no offending character to report
            raise WrongNameError(0, 0)


def getResourceNameErrorPosition(name):
    """
    Get position of illegal character (and the error-message).
    This method can be used to get this information if L{isValidPropertyName}
    or L{isValidResourceName} failed.

    @param name: Resource name.
    @type name: C{unicode}

    @return: Tuple of error position and message, or (-1, None) if the
        name is valid.
    @rtype: C{tuple} of C{int} and C{unicode}
    """

    result = (-1, None)
    illegalChar = _resourceRe.search(name)
    if illegalChar:
        result = (illegalChar.start(),
                  u"Illegal character '%s' at index %d." %
                  (name[illegalChar.start()], illegalChar.start()))
    elif not _resourceFirstRe.match(name):
        # guard against the empty string: the original indexed name[0]
        # unconditionally here and crashed with IndexError on u""
        if name:
            result = (0, u"Illegal character '%s' at index %d." % (name[0], 0))
        else:
            result = (0, u"Name must not be empty.")
    return result


class WrongNameError(ValueError):
    """
    Exception raised if an "illegal" character was found.

    @ivar character: character that caused the exception
    @type character: C{unicode}
    @ivar position: position of C{character}
    @type position: C{int}
    """

    def __init__(self, position, character):
        """
        Constructor.

        @param character: Character that caused the exception.
        @type character: C{unicode}
        @param position: Position of C{character}
        @type position: C{int}
        """

        ValueError.__init__(self)
        self.character = character
        self.position = position

    def __str__(self):
        """ Returns string representation. """

        return ValueError.__str__(self) + \
               "Character '%s' at index %d." % (self.character, self.position)
+ + @param character: Character that caused the exception. + @type character: C{unicode} + @param position: Position of C{character} + @type position: C{int} + """ + + ValueError.__init__(self) + self.character = character + self.position = position + + def __str__(self): + """ Returns string representation. """ + + return ValueError.__str__(self) + \ + "Character '%s' at index %d." % (self.character, self.position) diff --git a/LTA/LTAIngest/dav/webdav/Utils.py b/LTA/LTAIngest/dav/webdav/Utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ec05755d251e160ce37f810af10cf6eb57837d8d --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/Utils.py @@ -0,0 +1,154 @@ +# pylint: disable-msg=W0141,R0912 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +The module contains functions to support use of the WebDav functionalities. +""" + + +import os +import sys + +from webdav.WebdavClient import CollectionStorer, ResourceStorer +from webdav.Constants import NS_DAV, PROP_RESOURCE_TYPE, CODE_NOT_FOUND, PROP_RESOURCE_TYPE_RESOURCE +from webdav.Connection import WebdavError + + +__version__ = "$Revision$"[11:-2] + + +def resourceExists(node, name = None, resourceType = PROP_RESOURCE_TYPE_RESOURCE): + """ + Check if resource exists. 
+ + Usage: + - resourceExists(ResourceStorer-object): + check if resource exists + - resourceExists(CollectionStorer-object, name): + check if resource name exists in collection + + @param node: node that has to be checked or node of collection + @type node: L{ResourceStorer<webdav.WebdavClient.ResourceStorer>} + @param name: name of resource (in collection node) that has to be checked + @type name: string + + @return: boolean + + @raise WebdavError: all WebDAV errors except WebDAV error 404 (not found) + """ + + exists = False + if not node: + return exists + try: + myResourceType = "" + if name: + # make sure it's unicode: + if not isinstance(name, unicode): + name = name.decode(sys.getfilesystemencoding()) + url = node.url + if url.endswith("/"): + url = url + name + else: + url = url + "/" + name + newNode = ResourceStorer(url, node.connection) + element = newNode.readProperty(NS_DAV, PROP_RESOURCE_TYPE) + else: # name is "None": + element = node.readProperty(NS_DAV, PROP_RESOURCE_TYPE) + + if len(element.children) > 0: + myResourceType = element.children[0].name + if resourceType == myResourceType or resourceType == PROP_RESOURCE_TYPE_RESOURCE: + exists = True + else: + exists = False + except WebdavError, wderr: + if wderr.code == CODE_NOT_FOUND: + # node doesn't exist -> exists = False: + exists = False + else: + # another exception occured -> "re-raise" it: + raise + return exists + + +def downloadCollectionContent(destinationPath, collectionToDownload): + """ + Downloads the resources contained to the given directory. + + @param destinationPath: Path to download the files to, will be created if it not exists. + @type destinationPath: C{String} + @param collectionToDownload: Collection to download the content from. + @type collectionToDownload: instance of L{CollectionStorer<webdav.WebdavClient.CollectionStorer>} + + @raise WebdavError: If something goes wrong. 
+ """ + + from time import mktime, gmtime + + downloadCount = 0 + + listOfItems = collectionToDownload.getCollectionContents() + + if not os.path.exists(destinationPath): + try: + os.makedirs(destinationPath) + except OSError: + errorMessage = "Cannot create download destination directory '%s'." % destinationPath + raise WebdavError(errorMessage) + + try: + itemsInPath = os.listdir(destinationPath) + except OSError: + errorMessage = "Cannot read the content of download destination directory '%s'." % destinationPath + raise WebdavError(errorMessage) + + for item in listOfItems: + # skip collections + if not isinstance(item[0], CollectionStorer): + itemSavePath = os.path.join(destinationPath, item[0].name) + existsItemSavePath = os.path.exists(itemSavePath) + + # update? + if existsItemSavePath: + try: + isUpdateNecessary = mktime(item[1].getLastModified()) > mktime(gmtime(os.path.getmtime(itemSavePath))) + except (ValueError, OverflowError): + isUpdateNecessary = True + # windows is not case sensitive + for realItem in itemsInPath: + if realItem.lower() == item[0].name.lower(): + itemsInPath.remove(realItem) + else: + isUpdateNecessary = True + + # download + if not existsItemSavePath or (existsItemSavePath and isUpdateNecessary): + item[0].downloadFile(itemSavePath) + downloadCount = downloadCount + 1 + + # delete old items + try: + for item in itemsInPath: + os.remove(os.path.join(destinationPath, item)) + except OSError, e: + if e.errno == 13: # permission error + sys.stderr.write("permission problem on '%s' in %s\n" % (e.filename, e.strerror)) + else: + raise + + return downloadCount diff --git a/LTA/LTAIngest/dav/webdav/VersionHandler.py b/LTA/LTAIngest/dav/webdav/VersionHandler.py new file mode 100644 index 0000000000000000000000000000000000000000..a1962c658b421662a0a95b8e898f392efca42795 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/VersionHandler.py @@ -0,0 +1,198 @@ +# pylint: disable-msg=W0612,W0142 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +The WebDAV client module forwards Delta-V related method invocations to +the following VersionHandler class. +""" + +__version__ = '$Revision$'[11:-2] + + +import types + +from webdav import Constants +from davlib import XML_CONTENT_TYPE, XML_DOC_HEADER + + +class VersionHandler(object): + """ + Implements a client interface for WebDAV Delta-V methods + For the Delta-V see RFC 3253 at http://www.ietf.org/rfc/rfc3253.txt + """ + + # restrict instance variables + __slots__ = ('path', 'connection') + + + def __init__(self, connection, path): + """ + Construct a VersionHandler with a URL path and a WebDAV connection. + This constructor must not be called outside class ResourceStorer. + + @param connection: L{webdav.Connection} instance + @param path: resource's path part of URL + """ + #assert isinstance(connection, Connection), \ + # "Class of connection is %s." % connection.__class__.__name__ + self.connection = connection + self.path = path + + + def activateVersionControl(self): + """ + Turns version control on for this resource. + The resource becomes a version controlled resource (VCR) + """ + response = self.connection._request(Constants.METHOD_VERSION_CONTROL, self.path, None, {}) + # set auto-versioning to DAV:locked-checkout + ## parse response body in case of an error + + def uncheckout(self, lockToken=None): + """ + Undos a previous check-out operation on this VCR. 
+ The VCR is reverted to the state before the checkout/lock operation. + Beware: Property or content changes will be lost ! + A (optional) lock has to be removed seperatedly. + + @param lockToken: returned by a preceeding lock() method invocation or None + """ + headers = {} + if lockToken: + headers = lockToken.toHeader() + response = self.connection._request(Constants.METHOD_UNCHECKOUT, self.path, None, headers) + ## parse response body in case of an error + + def listAllVersions(self): + """ + List version history. + + @return: List of versions for this VCR. Each version entry is a tuple adhering + to the format (URL-path, name, creator, tuple of successor URL-paths). + If there are no branches then there is at most one successor within the tuple. + """ + # implementation is similar to the propfind method + headers = {} + headers['Content-Type'] = XML_CONTENT_TYPE + body = _createReportVersionTreeBody() + response = self.connection._request(Constants.METHOD_REPORT, self.path, body, headers) + # response is multi-status + result = [] + for path, properties in response.msr.items(): + # parse the successor-set value from XML into alist + result.append( (path, str(properties[Constants.PROP_VERSION_NAME]), \ + str(properties[Constants.PROP_CREATOR]), \ + _extractSuccessorList(properties[Constants.PROP_SUCCESSOR_SET])) ) + ## TODO: sort for path and produce list + result.sort() + return result + + # warning: not tested yet + def readVersionProperties(self): + """ + Provide version related information on this VCR. + This include a reference to the latest version resource, + check-out state information and a comment. + + @return: map of version properties with values. + """ + versionProperties = (Constants.PROP_CHECKEDIN, Constants.PROP_CHECKEDOUT, Constants.PROP_COMMENT) + return self.connection.readProperties(*versionProperties) + + + def revertToVersion(self, oldVersion): + """ + Revert this VCR to the given version. 
+ Beware: All versions succeeding the given version are made unavailable. + + @param oldVersion: URL-path of a previous version of this VCR. + """ + ## send an update request + assert isinstance(oldVersion, types.StringType) or isinstance(oldVersion, types.UnicodeType) + response = self.connection._request(Constants.METHOD_UPDATE, self.path, + _createUpdateBody(oldVersion), {}) + return response + + + # the following is not needed when using auto-versioning + + # warning: not tested yet + def checkout(self): + """ + Marks resource as checked-out + This is usually followed by a GET (download) operation. + """ + response = self.connection._request(Constants.METHOD_CHECKOUT, self.path, None, {}) + ## parse response body in case of an error + + # warning: not tested yet + def checkin(self): + """ + Creates a new version from the VCR's content. + This opeartion is usually preceeded by one or more write operations. + """ + response = self.connection._request(Constants.METHOD_CHECKIN, self.path, None, {}) + versionUrl = response.getheader('Location') + return versionUrl + ## parse response body in case of an error + + + + +# Helper functions +def _createReportVersionTreeBody(): + """ + TBD + + @return: ... + @rtype: string + """ + versions = 'D:' + Constants.TAG_VERSION_TREE + prop = 'D:' + Constants.TAG_PROP + nameList = [Constants.PROP_SUCCESSOR_SET, Constants.PROP_VERSION_NAME, Constants.PROP_CREATOR] + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s>' % (versions, prop) + \ + reduce(lambda xml, name: xml + "<D:%s/>" % name[1], [''] + nameList) + \ + '</%s></%s>' % (prop, versions) + +def _createUpdateBody(path): + """ + TBD + + @return: ... 
+ @rtype: string + """ + update = 'D:' + Constants.TAG_UPDATE + version = 'D:' + Constants.TAG_VERSION + href = 'D:' + Constants.TAG_HREF + #PROP = 'D:' + TAG_PROP + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s><%s>' % (update, version, href) + \ + path + \ + '</%s></%s></%s>' % (href, version, update) + +def _extractSuccessorList(element): + """ + TBD + + @return: ... + @rtype: tuple of strings + """ + result = [] + for href in element.children: + result.append(href.textof()) + return tuple(result) diff --git a/LTA/LTAIngest/dav/webdav/WebdavClient.py b/LTA/LTAIngest/dav/webdav/WebdavClient.py new file mode 100644 index 0000000000000000000000000000000000000000..54daae40c540c08b4740dc024e29caa6d14bea64 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/WebdavClient.py @@ -0,0 +1,828 @@ +# pylint: disable-msg=R0904,W0142,W0511,W0104,C0321,E1103,W0212 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This module contains the classes ResourceStorer and CollectionStorer for accessing WebDAV resources. 
+""" + + +from davlib import XML_CONTENT_TYPE + +from urlparse import urlsplit +import types + +from webdav import Constants +from webdav.WebdavResponse import LiveProperties +from webdav.WebdavRequests import createFindBody, createUpdateBody, createDeleteBody, createSearchBody +from webdav.Condition import ConditionTerm +from webdav.Connection import Connection, WebdavError +from webdav.VersionHandler import VersionHandler + +from webdav.acp.Privilege import Privilege +from webdav.acp.Acl import ACL +from webdav.NameCheck import validateResourceName, WrongNameError + + + +__version__ = '$Revision$'[11:-2] + + +def switchUnicodeUrlOn(switch): + """ + Configure whether to use unicode (UTF-8) encoded URLs (default) or + Latin-1 encoded URLs. + + @param switch: 1 if unicode URLs shall be used + """ + assert switch == 0 or switch == 1, "Pass boolean argument, please." + Constants.CONFIG_UNICODE_URL = switch + + +class ResourceStorer(object): + """ + This class provides client access to a WebDAV resource + identified by an URI. It provides all WebDAV class 2 features which include + uploading data, getting and setting properties qualified by a XML name space, + locking and unlocking the resource. + This class does not cache resource data. This has to be performed by its clients. + + @author: Roland Betz + """ + + # Instance properties + url = property(lambda self: str(self.connection) + self.path, None, None, "Resource's URL") + + def __init__(self, url, connection=None, validateResourceNames=True): + """ + Creates an instance for the given URL + User must invoke validate() after construction to check the resource on the server. + + @param url: Unique resource location for this storer. + @type url: C{string} + @param connection: this optional parameter contains a Connection object + for the host part of the given URL. Passing a connection saves + memory by sharing this connection. 
(defaults to None) + @type connection: L{webdav.Connection} + @raise WebdavError: If validation of resource name path parts fails. + """ + + assert connection == None or isinstance(connection, Connection) + parts = urlsplit(url, allow_fragments=False) + self.path = parts[2] + self.validateResourceNames = validateResourceNames + + # validate URL path + for part in self.path.split('/'): + if part != '' and not "ino:" in part: # explicitly allowing this character sequence as a part of a path (Tamino 4.4) + if self.validateResourceNames: + try: + validateResourceName(part) + except WrongNameError: + raise WebdavError("Found invalid resource name part.") + self.name = part + # was: filter(lambda part: part and validateResourceName(part), self.path.split('/')) + # but filter is deprecated + + self.defaultNamespace = None # default XML name space of properties + if connection: + self.connection = connection + else: + conn = parts[1].split(":") + if len(conn) == 1: + self.connection = Connection(conn[0], protocol = parts[0]) # host and protocol + else: + self.connection = Connection(conn[0], int(conn[1]), protocol = parts[0]) # host and port and protocol + self.versionHandler = VersionHandler(self.connection, self.path) + + + def validate(self): + """ + Check whether URL contains a WebDAV resource + Uses the WebDAV OPTIONS method. + + @raise WebdavError: L{WebdavError} if URL does not contain a WebDAV resource + """ + #davHeader = response.getheader(HTTP_HEADER_DAV) + davHeader = self.getSpecificOption(Constants.HTTP_HEADER_DAV) + self.connection.logger.debug("HEADER DAV: %s" % davHeader) + if not(davHeader) or davHeader.find("2") < 0: # DAV class 2 supported ? + raise WebdavError("URL does not support WebDAV", 0) + + def options(self): + """ + Send an OPTIONS request to server and return all HTTP headers. + + @return: map of all HTTP headers returned by the OPTIONS method. 
+ """ + response = self.connection.options(self.path) + result = {} + result.update(response.msg) + self.connection.logger.debug("OPTION returns: " + str(result.keys())) + return result + + def _getAclSupportAvailable(self): + """ + Returns True if the current connection has got ACL support. + + @return: ACL support (True / False) + @rtype: C{bool} + """ + options = self.getSpecificOption(Constants.HTTP_HEADER_DAV) + if options.find(Constants.HTTP_HEADER_OPTION_ACL) >= 0: + return True + else: + return False + + aclSupportAvailable = property(_getAclSupportAvailable) + + def _getDaslBasicsearchSupportAvailable(self): + """ + Returns True if the current connection supports DASL basic search. + + @return: DASL basic search support (True / False) + @rtype: C{bool} + """ + options = self.getSpecificOption(Constants.HTTP_HEADER_DASL) + if not options or \ + not options.find(Constants.HTTP_HEADER_OPTION_DAV_BASIC_SEARCH) >= 0: + return False + else: + return True + + daslBasicsearchSupportAvailable = property(_getDaslBasicsearchSupportAvailable) + + def isConnectedToCatacombServer(self): + """ + Returns True if connected to a Catacomb WebDav server. + + @return: if connected to Catacomb Webdav server (True / False) + @rtype: C{bool} + """ + if not self.connection.serverTypeChecked: + options = self.getSpecificOption(Constants.HTTP_HEADER_SERVER) + if options.find(Constants.HTTP_HEADER_SERVER_TAMINO) >= 0: + self.connection.isConnectedToCatacomb = False + else: + self.connection.isConnectedToCatacomb = True + self.connection.serverTypeChecked = True + return self.connection.isConnectedToCatacomb + + def getSpecificOption(self, option): + """ + Returns specified WebDav options. + @param option: name of the option + + @return: String containing the value of the option. 
+ @rtype: C{string} + """ + options = '' + try: + options = self.options().get(option) + except KeyError: + return options + return options + + ### delegate some method invocations + def __getattr__(self, name): + """ + Build-in method: + Forwards unknow lookups (methods) to delegate object 'versionHandler'. + + @param name: name of unknown attribute + """ + # delegate Delta-V methods + return getattr(self.versionHandler, name) + + def copy(self, toUrl, infinity=True): + """ + Copies this resource. + + @param toUrl: target URI path + @param infinity: Flag that indicates that the complete content of collection is copied. (default) + @type depth: C{boolean} + """ + self.connection.logger.debug("Copy to " + repr(toUrl)); + _checkUrl(toUrl) + if infinity: + response = self.connection.copy(self.path, toUrl) + else: + response = self.connection.copy(self.path, toUrl, 0) + if response.status == Constants.CODE_MULTISTATUS and response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + + def delete(self, lockToken=None): + """ + Deletes this resource. + + @param lockToken: String returned by last lock operation or null. + @type lockToken: L{LockToken} + """ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + response = self.connection.delete(self.path, header) + if response.status == Constants.CODE_MULTISTATUS and response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + + def move(self, toUrl): + """ + Moves this resource to the given path or renames it. 
+ + @param toUrl: new (URI) path + """ + self.connection.logger.debug("Move to " + repr(toUrl)); + _checkUrl(toUrl) + response = self.connection.move(self.path, toUrl) + if response.status == Constants.CODE_MULTISTATUS and response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + + + def lock(self, owner): + """ + Locks this resource for exclusive write access. This means that for succeeding + write operations the returned lock token has to be passed. + If the methode does not throw an exception the lock has been granted. + + @param owner: describes the lock holder + @return: lock token string (automatically generated) + @rtype: L{LockToken} + """ + response = self.connection.lock(self.path, owner) + if response.status == Constants.CODE_MULTISTATUS and response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + return LockToken(self.url, response.locktoken) + + def unlock(self, lockToken): + """ + Removes the lock from this resource. + + @param lockToken: which has been return by the lock() methode + @type lockToken: L{LockToken} + """ + self.connection.unlock(self.path, lockToken.token) + + + def deleteContent(self, lockToken=None): + """ + Delete binary data at permanent storage. + + @param lockToken: None or lock token from last lock request + @type lockToken: L{LockToken} + """ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + self.connection.put(self.path, "", extra_hdrs=header) + + def uploadContent(self, content, lockToken=None): + """ + Write binary data to permanent storage. 
+ + @param content: containing binary data + @param lockToken: None or lock token from last lock request + @type lockToken: L{LockToken} + """ + assert not content or isinstance(content, types.UnicodeType) or\ + isinstance(content, types.StringType), "Content is not a string: " + content.__class__.__name__ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + response = None + try: + response = self.connection.put(self.path, content, extra_hdrs=header) + finally: + if response: + self.connection.logger.debug(response.read()) + response.close() + + def uploadFile(self, newFile, lockToken=None): + """ + Write binary data to permanent storage. + + @param newFile: File containing binary data. + @param lockToken: None or lock token from last lock request + @type lockToken: L{LockToken} + """ + assert isinstance(newFile, types.FileType), "Argument is no file: " + file.__class__.__name__ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + self.connection.putFile(self.path, newFile, header=header) + + def downloadContent(self): + """ + Read binary data from permanent storage. + """ + response = self.connection.get(self.path) + # TODO: Other interface ? return self.connection.getfile() + return response + + def downloadFile(self, localFileName): + """ + Copy binary data from permanent storage to a local file. + + @param localFileName: file to write binary data to + """ + localFile = open(localFileName, 'wb') + remoteFile = self.downloadContent() + _blockCopyFile(remoteFile, localFile, Connection.blockSize) + remoteFile.close() + localFile.close() + + def readProperties(self, *names): + """ + Reads the given properties. + + @param names: a list of property names. + A property name is a (XmlNameSpace, propertyName) tuple. 
+ @return: a map from property names to DOM Element or String values. + """ + assert names, "Property names are missing." + body = createFindBody(names, self.defaultNamespace) + response = self.connection.propfind(self.path, body, depth=0) + properties = response.msr.values()[0] + if properties.errorCount > 0: + raise WebdavError("Property is missing on '%s': %s" % (self.path, properties.reason), properties.code) + return properties + + def readProperty(self, nameSpace, name): + """ + Reads the given property. + + @param nameSpace: XML-namespace + @type nameSpace: string + @param name: A property name. + @type name: string + + @return: a map from property names to DOM Element or String values. + """ + results = self.readProperties((nameSpace, name)) + if len(results) == 0: + raise WebdavError("Property is missing: " + results.reason) + return results.values()[0] + + def readAllProperties(self): + """ + Reads all properties of this resource. + + @return: a map from property names to DOM Element or String values. + """ + response = self.connection.allprops(self.path, depth=0) + return response.msr.values()[0] + + def readAllPropertyNames(self): + """ + Returns the names of all properties attached to this resource. + + @return: List of property names + """ + response = self.connection.propnames(self.path, depth=0) + return response.msr.values()[0] + + def readStandardProperties(self): + """ + Read all WebDAV live properties. + + @return: A L{LiveProperties} instance which contains a getter method for each live property. + """ + body = createFindBody(LiveProperties.NAMES, Constants.NS_DAV) + response = self.connection.propfind(self.path, body, depth=0) + properties = response.msr.values()[0] + return LiveProperties(properties) + + def writeProperties(self, properties, lockToken=None): + """ + Sets or updates the given properties. + + @param lockToken: if the resource has been locked this is the lock token. 
+ @type lockToken: L{LockToken} + @param properties: a map from property names to a String or + DOM element value for each property to add or update. + """ + assert isinstance(properties, types.DictType) + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + body = createUpdateBody(properties, self.defaultNamespace) + response = self.connection.proppatch(self.path, body, header) + if response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + + def deleteProperties(self, lockToken=None, *names): + """ + Removes the given properties from this resource. + + @param lockToken: if the resource has been locked this is the lock token. + @type lockToken: L{LockToken} + @param names: a collection of property names. + A property name is a (XmlNameSpace, propertyName) tuple. + """ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + body = createDeleteBody(names, self.defaultNamespace) + response = self.connection.proppatch(self.path, body, header) + if response.msr.errorCount > 0: + raise WebdavError("Request failed: " + response.msr.reason, response.msr.code) + + # ACP extension + def setAcl(self, acl, lockToken=None): + """ + Sets ACEs in the non-inherited and non-protected ACL or the resource. + This is the implementation of the ACL method of the WebDAV ACP. + + @param acl: ACL to be set on resource as ACL object. + @param lockToken: If the resource has been locked this is the lock token (defaults to None). 
+ @type lockToken: L{LockToken} + """ + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + headers = {} + if lockToken: + headers = lockToken.toHeader() + headers['Content-Type'] = XML_CONTENT_TYPE + body = acl.toXML() + response = self.connection._request('ACL', self.path, body, headers) + return response + ## TODO: parse DAV:error response + + def getAcl(self): + """ + Returns this resource's ACL in an ACL instance. + + @return: Access Control List. + @rtype: L{ACL<webdav.acp.Acl.ACL>} + """ + xmlAcl = self.readProperty(Constants.NS_DAV, Constants.TAG_ACL) + return ACL(xmlAcl) + + def getCurrentUserPrivileges(self): + """ + Returns a tuple of the current user privileges. + + @return: list of Privilege instances + @rtype: list of L{Privilege<webdav.acp.Privilege.Privilege>} + """ + privileges = self.readProperty(Constants.NS_DAV, Constants.PROP_CURRENT_USER_PRIVILEGE_SET) + result = [] + for child in privileges.children: + result.append(Privilege(domroot=child)) + return result + + def getPrincipalCollections(self): + """ + Returns a list principal collection URLs. + + @return: list of principal collection URLs + @rtype: C{list} of C{unicode} elements + """ + webdavQueryResult = self.readProperty(Constants.NS_DAV, Constants.PROP_PRINCIPAL_COLLECTION_SET) + principalCollectionList = [] + for child in webdavQueryResult.children: + principalCollectionList.append(child.first_cdata) + return principalCollectionList + + def getOwnerUrl(self): + """ Explicitly retireve the Url of the owner. """ + + result = self.readProperty(Constants.NS_DAV, Constants.PROP_OWNER) + if result and len(result.children): + return result.children[0].textof() + return None + +class CollectionStorer(ResourceStorer): + """ + This class provides client access to a WebDAV collection resource identified by an URI. + This class does not cache resource data. This has to be performed by its clients. 
+ + @author: Roland Betz + """ + + def __init__(self, url, connection=None, validateResourceNames=True): + """ + Creates a CollectionStorer instance for a URL and an optional Connection object. + User must invoke validate() after constuction to check the resource on the server. + + @see: L{webdav.WebdavClient.ResourceStorer.__init__} + @param url: unique resource location for this storer + @param connection: this optional parameter contains a Connection object for the host part + of the given URL. Passing a connection saves memory by sharing this connection. + """ + if url[-1] != '/': # Collection URL must end with slash + url += '/' + ResourceStorer.__init__(self, url, connection, validateResourceNames) + + def getResourceStorer(self, name): + """ + Return a ResourceStorer instance for a child resource (member) of this Collection. + + @param name: leaf name of child resource + @return: L{ResourceStorer} instance + """ + assert isinstance(name, types.StringType) or isinstance(name, types.UnicodeType) + return ResourceStorer(self.url + name, self.connection, self.validateResourceNames) + + def validate(self): + """ + Check whether this URL contains a WebDAV collection. + Uses the WebDAV OPTION method. + + @raise WebdavError: L{WebdavError} if URL does not contain a WebDAV collection resource. + """ + super(CollectionStorer, self).validate() + isCollection = self.readProperty(Constants.NS_DAV, Constants.PROP_RESOURCE_TYPE) + if not (isCollection and isCollection.children): + raise WebdavError("Not a collection URL.", 0) + + def addCollection(self, name, lockToken=None): + """ + Make a new WebDAV collection resource within this collection. 
+ + @param name: of the new collection + @param lockToken: None or token returned by last lock operation + @type lockToken: L{LockToken} + """ + assert isinstance(name, types.StringType) or isinstance(name, types.UnicodeType) + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + if self.validateResourceNames: + validateResourceName(name) + if name[-1] != '/': # Collection URL must end with slash + name += '/' + self.connection.mkcol(self.path + name, header) + return CollectionStorer(self.url + name, self.connection, self.validateResourceNames) + + def addResource(self, name, content=None, properties=None, lockToken=None): + """ + Create a new empty WebDAV resource contained in this collection with the given + properties. + + @param name: leaf name of the new resource + @param content: None or initial binary content of resource + @param properties: name/value-map containing properties + @param lockToken: None or token returned by last lock operation + @type lockToken: L{LockToken} + """ + assert isinstance(name, types.StringType) or isinstance(name, types.UnicodeType) + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + if self.validateResourceNames: + validateResourceName(name) ## check for invalid characters + resource_ = ResourceStorer(self.url + name, self.connection, self.validateResourceNames) + resource_.uploadContent(content, lockToken) + if properties: + resource_.writeProperties(properties, lockToken) + return resource_ + + def deleteResource(self, name, lockToken=None): + """ + Delete a collection which is contained within this collection + + @param name: leaf name of a contained collection resource + @param lockToken: None or token returned by last lock operation + @type lockToken: L{LockToken} + """ + assert isinstance(name, types.StringType) or 
isinstance(name, types.UnicodeType) + assert lockToken == None or isinstance(lockToken, LockToken), \ + "Invalid lockToken argument %s" % type(lockToken) + header = {} + if lockToken: + header = lockToken.toHeader() + if self.validateResourceNames: + validateResourceName(name) + response = self.connection.delete(self.path + name, header) + if response.status == Constants.CODE_MULTISTATUS and response.msr.errorCount > 0: + raise WebdavError("Request failed: %s" % response.msr.reason, response.msr.code) + + def lockAll(self, owner): + """ + Locks this collection resource for exclusive write access. This means that for + succeeding write operations the returned lock token has to be passed. + The operation is applied recursively to all contained resources. + If the methode does not throw an exception then the lock has been granted. + + @param owner: describes the lock holder + @return: Lock token string (automatically generated). + @rtype: L{LockToken} + """ + assert isinstance(owner, types.StringType) or isinstance(owner, types.UnicodeType) + response = self.connection.lock(self.path, owner, depth=Constants.HTTP_HEADER_DEPTH_INFINITY) + return LockToken(self.url, response.locktoken) + + def listResources(self): + """ + Describe all members within this collection. + + @return: map from URI to a L{LiveProperties} instance containing the WebDAV + live attributes of the contained resource + """ + # *LiveProperties.NAMES denotes the list of all live properties as an + # argument to the method call. 
+ response = self.connection.getprops(self.path, + depth=1, + ns=Constants.NS_DAV, + *LiveProperties.NAMES) + result = {} + for path, properties in response.msr.items(): + if path == self.path: # omit this collection resource + continue + ## some servers do not append a trailing slash to collection paths + if self.path.endswith('/') and self.path[0:-1] == path: + continue + result[path] = LiveProperties(properties=properties) + return result + + def getCollectionContents(self): + """ + Return a list of the tuple (resources or collection) / properties) + + @return: a list of the tuple (resources or collection) / properties) + @rtype: C{list} + """ + self.validate() + collectionContents = [] + result = self.listResources() + for url, properties_ in result.items(): + if not self.path == url: + if properties_.getResourceType() == 'resource': + myWebDavStorer = ResourceStorer(url, self.connection, self.validateResourceNames) + else: + myWebDavStorer = CollectionStorer(url, self.connection, self.validateResourceNames) + collectionContents.append((myWebDavStorer, properties_)) + return collectionContents + + def findProperties(self, *names): + """ + Retrieve given properties for this collection and all directly contained resources. + + @param names: a list of property names + @return: a map from resource URI to a map from property name to value. + """ + assert isinstance(names, types.ListType) or isinstance(names, types.TupleType), \ + "Argument name has type %s" % str(type(names)) + body = createFindBody(names, self.defaultNamespace) + response = self.connection.propfind(self.path, body, depth=1) + return response.msr + + def deepFindProperties(self, *names): + """ + Retrieve given properties for this collection and all contained (nested) resources. + + Note: + ===== + This operation can take a long time if used with recursive=true and is therefore + disabled on some WebDAV servers. 
+ + @param names: a list of property names + @return: a map from resource URI to a map from property name to value. + """ + assert isinstance(names, types.ListType.__class__) or isinstance(names, types.TupleType), \ + "Argument name has type %s" % str(type(names)) + body = createFindBody(names, self.defaultNamespace) + response = self.connection.propfind(self.path, body, depth=Constants.HTTP_HEADER_DEPTH_INFINITY) + return response.msr + + def findAllProperties(self): + """ + Retrieve all properties for this collection and all directly contained resources. + + @return: a map from resource URI to a map from property name to value. + """ + response = self.connection.allprops(self.path, depth=1) + return response.msr + + + # DASL extension + def search(self, conditions, selects): + """ + Search for contained resources which match the given search condition. + + @param conditions: tree of ConditionTerm instances representing a logical search term + @param selects: list of property names to retrieve for the found resources + """ + assert isinstance(conditions, ConditionTerm) + headers = { 'Content-Type' : XML_CONTENT_TYPE, "depth": Constants.HTTP_HEADER_DEPTH_INFINITY} + body = createSearchBody(selects, self.path, conditions) + response = self.connection._request('SEARCH', self.path, body, headers) + return response.msr + + +class LockToken(object): + """ + This class provides help on handling WebDAV lock tokens. + + @author: Roland Betz + """ + # restrict instance variables + __slots__ = ('url', 'token') + + def __init__(self, url, token): + assert isinstance(url, types.StringType) or isinstance(url, types.UnicodeType), \ + "Invalid url argument %s" % type(url) + assert isinstance(token, types.StringType) or isinstance(token, types.UnicodeType), \ + "Invalid lockToken argument %s" % type(token) + self.url = url + self.token = token + + def value(self): + """ + Descriptive string containing the lock token's URL and the token itself. 
+ + @return: Descriptive lock token with URL. + @rtype: C{string} + """ + return "<" + self.url + "> (<" + self.token + ">)" + + def toHeader(self): + """ + Header fragment for WebDAV request. + + @return: Dictionary containing an entry for the lock token query. + @rtype: C{dictionary} + """ + return {Constants.HTTP_HEADER_IF: self.value()} + + def __str__(self): + return self.value() + + +def _blockCopyFile(source, dest, blockSize): + """ + Copies a file in chunks of C{blockSize}. + + @param source: Source file. + @type source: FileIO buffer. + @param dest: Destination file. + @type dest: FileIO buffer. + @param blockSize: Size of block in bytes. + @type blockSize: C{int} + """ + transferedBytes = 0 + block = source.read(blockSize) + while len(block): + dest.write(block) + transferedBytes += len(block); + block = source.read(blockSize) + +def _checkUrl(url): + """ + Checks the given URL for validity. + + @param url: URL to check. + @type url: C{string} + + @raise ValueError: If the URL does not contain valid/usable content. + """ + + parts = urlsplit(url, allow_fragments=False) + if len(parts[0]) == 0 or len(parts[1]) == 0 or len(parts[2]) == 0: + raise ValueError("Invalid URL: " + repr(url)) + +# small test +# asks for WebDAV colection, username, password and lists the content of the collection. 
+if __name__ == "__main__": + import sys + + webdavUrl = raw_input("WebDAV Collection (URL):").strip() + username = raw_input("Username:").strip() + password = raw_input("Password:").strip() + + webdavConnection = CollectionStorer(webdavUrl, validateResourceNames=False) + webdavConnection.connection.addBasicAuthorization(username, password) + print "Contents of resource %s:" % webdavConnection.path + for resource, properties in webdavConnection.getCollectionContents(): + try: + print(resource.path.encode(sys.getfilesystemencoding())) + print(unicode(properties).encode(sys.getfilesystemencoding())) + except UnicodeEncodeError: + print("Cannot encode resource path or properties.") + print("\n") diff --git a/LTA/LTAIngest/dav/webdav/WebdavRequests.py b/LTA/LTAIngest/dav/webdav/WebdavRequests.py new file mode 100644 index 0000000000000000000000000000000000000000..79e586a6b2beaf905dbc553be9348d74ac03c99c --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/WebdavRequests.py @@ -0,0 +1,205 @@ +# pylint: disable-msg=W0511,W0212,E1111 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This module handles WebDav server requests. 
+""" + + +import types +from webdav import Constants +import qp_xml +from tempfile import TemporaryFile + +from davlib import XML_DOC_HEADER + +from webdav.NameCheck import validatePropertyName + + +__version__ = "$LastChangedRevision$" + + +## TODO: create a property list class + +class XmlNameSpaceMangler(object): + ''' + Handles WebDav requests. + ''' + + # restrict instance variables + __slots__ = ('shortcuts', 'defaultNameSpace') + + def __init__(self, nameList, defaultNameSpace = None): + ''' + + @param nameList: + @param defaultNameSpace: + ''' + + assert isinstance(nameList, types.ListType) or isinstance(nameList, types.TupleType), \ + "1. argument has wrong type %s" % type(nameList) + self.shortcuts = {} + self.defaultNameSpace = defaultNameSpace + for name in nameList: + if not isinstance(name, types.TupleType): + name = (defaultNameSpace, name) + assert isinstance(name, types.TupleType) and len(name) == 2, \ + "Name is not a namespace, name tuple: %s" % type(name) + validatePropertyName(name[1]) + if name[0] and not self.shortcuts.has_key(name[0]): + self.shortcuts[name[0]] = 'ns%d' % len(self.shortcuts) + + def getNameSpaces(self): + ''' + Returns the namespace. 
+ ''' + + result = "" + for namespace, short in self.shortcuts.items(): + result += ' xmlns:%s="%s"' % (short, namespace) + return result + + def getUpdateElements(self, valueMap): + ''' + + @param valueMap: + ''' + + elements = "" + for name in valueMap.keys(): + fullname = name + if isinstance(name, types.StringType): + fullname = (self.defaultNameSpace, name) + if not fullname[0]: + tag = fullname[1] + else: + tag = self.shortcuts[fullname[0]] + ':' + fullname[1] + value = valueMap[name] + if value: + if isinstance(value, qp_xml._element): + tmpFile = TemporaryFile('w+') + value = qp_xml.dump(tmpFile, value) + tmpFile.flush() + tmpFile.seek(0) + tmpFile.readline() + value = tmpFile.read() + else: + value = "<![CDATA[%s]]>" % value + else: + value = "" + elements += "<%s>%s</%s>" % (tag, value, tag) + return elements + + def getNameElements(self, nameList): + ''' + + @param nameList: + ''' + + elements = "" + for name in nameList: + if isinstance(name, types.StringType): + name = (self.defaultNameSpace, name) + if not name[0]: + tag = name[1] + else: + tag = self.shortcuts[name[0]] + ':' + name[1] + elements += "<%s />" % tag + return elements + + + +def createUpdateBody(propertyDict, defaultNameSpace = None): + ''' + + @param propertyDict: + @param defaultNameSpace: + ''' + + updateTag = 'D:' + Constants.TAG_PROPERTY_UPDATE + setTag = 'D:' + Constants.TAG_PROPERTY_SET + propTag = 'D:' + Constants.TAG_PROP + mangler = XmlNameSpaceMangler(propertyDict.keys(), defaultNameSpace) + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s><%s %s>' % (updateTag, setTag, propTag, mangler.getNameSpaces()) + \ + mangler.getUpdateElements(propertyDict) + \ + '</%s></%s></%s>' % (propTag, setTag, updateTag) + + +def createDeleteBody(nameList, defaultNameSpace = None): + ''' + + @param nameList: + @param defaultNameSpace: + ''' + + updateTag = 'D:' + Constants.TAG_PROPERTY_UPDATE + removeTag = 'D:' + Constants.TAG_PROPERTY_REMOVE + propTag = 'D:' + Constants.TAG_PROP + mangler = 
XmlNameSpaceMangler(nameList, defaultNameSpace) + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s><%s %s>' % (updateTag, removeTag, propTag, mangler.getNameSpaces()) + \ + mangler.getNameElements(nameList) + \ + '</%s></%s></%s>' % (propTag, removeTag, updateTag) + + +def createFindBody(nameList, defaultNameSpace = None): + ''' + + @param nameList: + @param defaultNameSpace: + ''' + + findTag = 'D:' + Constants.TAG_PROPERTY_FIND + propTag = 'D:' + Constants.TAG_PROP + mangler = XmlNameSpaceMangler(nameList, defaultNameSpace) + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s %s>' % (findTag, propTag, mangler.getNameSpaces()) + \ + mangler.getNameElements(nameList) + \ + '</%s></%s>' % (propTag, findTag) + + +def createSearchBody(selects, path, conditions, defaultNameSpace = None): + ''' + Creates DASL XML body. + + @param selects: list of property names to retrieve for the found resources + @param path: list of conditions + @param conditions: tree of ConditionTerm instances representing a logical search term + @param defaultNameSpace: default namespace + ''' + + searchTag = 'D:' + Constants.TAG_SEARCH_REQUEST + basicTag = 'D:' + Constants.TAG_SEARCH_BASIC + selectTag = 'D:' + Constants.TAG_SEARCH_SELECT + fromTag = 'D:' + Constants.TAG_SEARCH_FROM + scopeTag = 'D:' + Constants.TAG_SEARCH_SCOPE + whereTag = 'D:' + Constants.TAG_SEARCH_WHERE + propTag = 'D:' + Constants.TAG_PROP + hrefTag = 'D:' + Constants.TAG_HREF + depthTag = 'D:' + Constants.TAG_LOCK_DEPTH + depthValue = Constants.HTTP_HEADER_DEPTH_INFINITY + mangler = XmlNameSpaceMangler(selects, defaultNameSpace) + return XML_DOC_HEADER + \ + '<%s xmlns:D="DAV:"><%s>' % (searchTag, basicTag) + \ + '<%s><%s %s>%s</%s></%s>' % (selectTag, propTag, mangler.getNameSpaces(), + mangler.getNameElements(selects), propTag, selectTag) + \ + '<%s><%s><%s>%s</%s><%s>%s</%s></%s></%s>' % (fromTag, scopeTag, hrefTag, path, hrefTag, + depthTag, depthValue, depthTag, scopeTag, fromTag) + \ + '<%s>%s</%s>' % (whereTag, 
conditions.toXML(),whereTag) + \ + '</%s></%s>' % (basicTag, searchTag) + \ No newline at end of file diff --git a/LTA/LTAIngest/dav/webdav/WebdavResponse.py b/LTA/LTAIngest/dav/webdav/WebdavResponse.py new file mode 100644 index 0000000000000000000000000000000000000000..3edd736313c0ea802fe175d8af4d252fb7f724c5 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/WebdavResponse.py @@ -0,0 +1,509 @@ +# pylint: disable-msg=R0903,W0142,W0221,W0212,W0104,W0511,C0103,R0901 +# +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Handles WebDAV responses. +""" + + +from davlib import _parse_status +import qp_xml +from webdav import Constants +import time +import rfc822 +import urllib + + +__version__ = "$LastChangedRevision$" + + +class HttpStatus(object): + """ + TBD + + @ivar code: + @type code: + @ivar reason: + @type reason: + @ivar errorCount: + @type errorCount: int + """ + + def __init__(self, elem): + """ + TBD + + @param elem: ... 
+ @type elem: instance of L{Element} + """ + self.code, self.reason = _parse_status(elem) + self.errorCount = (self.code >= Constants.CODE_LOWEST_ERROR) + def __str__(self): + return "HTTP status %d: %s" % (self.code, self.reason) + + +class MultiStatusResponse(dict): + """ + TBD + + @ivar status: + @type status: + @ivar reason: + @type reason: + @ivar errorCount: + @type errorCount: + """ + + # restrict instance variables + __slots__ = ('errorCount', 'reason', 'status') + + def __init__(self, domroot): + dict.__init__(self) + self.errorCount = 0 + self.reason = None + self.status = Constants.CODE_MULTISTATUS + if (domroot.ns != Constants.NS_DAV) or (domroot.name != Constants.TAG_MULTISTATUS): + raise ResponseFormatError(domroot, 'Invalid response: <DAV:multistatus> expected.') + self._scan(domroot) + + def getCode(self): + if self.errorCount == 0: + return Constants.CODE_SUCCEEDED + if len(self) > self.errorCount: + return Constants.CODE_MULTISTATUS + return self.values()[0].code + + def getReason(self): + result = "" + for response in self.values(): + if response.code > Constants.CODE_LOWEST_ERROR: + result += response.reason + return result + + def __str__(self): + result = "" + for key, value in self.items(): + if isinstance(value, PropertyResponse): + result += "Resource at %s has %d properties and %d errors.\n" % (key, len(value), value.errorCount) + else: + result += "Resource at %s returned " % key + str(value) + return result + + def _scan(self, root): + for child in root.children: + if child.ns != Constants.NS_DAV: + continue + if child.name == Constants.TAG_RESPONSEDESCRIPTION: + self.reason = child.textof() + elif child.name == Constants.TAG_RESPONSE: + self._scanResponse(child) + ### unknown child element + + def _scanResponse(self, elem): + hrefs = [] + response = None + for child in elem.children: + if child.ns != Constants.NS_DAV: + continue + if child.name == Constants.TAG_HREF: + try: + href = _unquoteHref(child.textof()) + except 
UnicodeDecodeError: + raise ResponseFormatError(child, "Invalid 'href' data encoding.") + hrefs.append(href) + elif child.name == Constants.TAG_STATUS: + self._scanStatus(child, *hrefs) + elif child.name == Constants.TAG_PROPERTY_STATUS: + if not response: + if len(hrefs) != 1: + raise ResponseFormatError(child, 'Invalid response: One <DAV:href> expected.') + response = PropertyResponse() + self[hrefs[0]] = response + response._scan(child) + elif child.name == Constants.TAG_RESPONSEDESCRIPTION: + for href in hrefs: + self[href].reasons.append(child.textOf()) + ### unknown child element + if response and response.errorCount > 0: + self.errorCount += 1 + + def _scanStatus(self, elem, *hrefs): + if len(hrefs) == 0: + raise ResponseFormatError(elem, 'Invalid response: <DAV:href> expected.') + status = HttpStatus(elem) + for href in hrefs: + self[href] = status + if status.errorCount: + self.errorCount += 1 + + # Instance properties + code = property(getCode, None, None, "HTTP response code") + + + +class PropertyResponse(dict): + """ + TBD + + @ivar errors: + @type errors: list of ... + @ivar reasons: + @type reasons: list of ... + @ivar failedProperties: + @type failedProperties: dict of ... + """ + + # restrict instance variables + __slots__ = ('errors', 'reasons', 'failedProperties') + + def __init__(self): + dict.__init__(self) + self.errors = [] + self.reasons = [] + self.failedProperties = {} + + def __str__(self): + result = "" + for value in self.values(): + result += value.name + '= ' + value.textof() + '\n' + result += self.getReason() + return result + + def getCode(self): + if len(self.errors) == 0: + return Constants.CODE_SUCCEEDED + if len(self) > 0: + return Constants.CODE_MULTISTATUS + return self.errors[-1].code + + def getReason(self): + result = "" + if len(self.errors) > 0: + result = "Failed for: " + repr(self.failedProperties.keys()) + "\n" + for error in self.errors: + result += "%s (%d). 
" % (error.reason, error.code) + for reason in self.reasons: + result += "%s. " % reason + return result + + def _scan(self, element): + status = None + statusElement = element.find(Constants.TAG_STATUS, Constants.NS_DAV) + if statusElement: + status = HttpStatus(statusElement) + if status.errorCount: + self.errors.append(status) + + propElement = element.find(Constants.TAG_PROP, Constants.NS_DAV) + if propElement: + for prop in propElement.children: + if status.errorCount: + self.failedProperties[(prop.ns, prop.name)]= status + else: + prop.__class__ = Element # bad, bad trick + self[prop.fullname] = prop + reasonElement = element.find(Constants.TAG_RESPONSEDESCRIPTION, Constants.NS_DAV) + if reasonElement: + self.reasons.append(reasonElement.textOf()) + + # Instance properties + code = property(getCode, None, None, "HTTP response code") + errorCount = property(lambda self: len(self.errors), None, None, "HTTP response code") + reason = property(getReason, None, None, "HTTP response code") + + + + +class LiveProperties(object): + """ + This class provides convenient access to the WebDAV 'live' properties of a resource. + WebDav 'live' properties are defined in RFC 2518, Section 13. + Each property is converted from string to its natural data type. 
+ + @version: $Revision$ + @author: Roland Betz + """ + + # restrict instance variables + __slots__ = ('properties') + + NAMES = (Constants.PROP_CREATION_DATE, Constants.PROP_DISPLAY_NAME, + Constants.PROP_CONTENT_LENGTH, Constants.PROP_CONTENT_TYPE, Constants.PROP_ETAG, + Constants.PROP_LAST_MODIFIED, Constants.PROP_OWNER, + Constants.PROP_LOCK_DISCOVERY, Constants.PROP_RESOURCE_TYPE, Constants.PROP_SUPPORTED_LOCK ) + + def __init__(self, properties=None, propElement=None): + """ + Construct <code>StandardProperties</code> from a map of properties containing + live properties or from a XML 'prop' element containing live properties + + @param properties: map as implemented by class L{PropertyResponse} + @param propElement: an C{Element} value + """ + assert isinstance(properties, PropertyResponse) or \ + isinstance(propElement, qp_xml._element), \ + "Argument properties has type %s" % str(type(properties)) + self.properties = {} + for name, value in properties.items(): + if name[0] == Constants.NS_DAV and name[1] in self.NAMES: + self.properties[name[1]] = value + + def getContentLanguage(self): + """ + Return the language of a resource's textual content or null + + @return: string + """ + + result = "" + if not self.properties.get(Constants.PROP_CONTENT_LANGUAGE, None) is None: + result = self.properties.get(Constants.PROP_CONTENT_LANGUAGE).textof() + return result + + def getContentLength(self): + """ + Returns the length of the resource's content in bytes. + + @return: number of bytes + """ + + result = 0 + if not self.properties.get(Constants.PROP_CONTENT_LENGTH, None) is None: + result = int(self.properties.get(Constants.PROP_CONTENT_LENGTH).textof()) + return result + + def getContentType(self): + """ + Return the resource's content MIME type. 
+ + @return: MIME type string + """ + + result = "" + if not self.properties.get(Constants.PROP_CONTENT_TYPE, None) is None: + result = self.properties.get(Constants.PROP_CONTENT_TYPE).textof() + return result + + def getCreationDate(self): + """ + Return date of creation as time tuple. + + @return: time tuple + @rtype: C{time.struct_time} + + @raise ValueError: If string is not in the expected format (ISO 8601). + """ + + datetimeString = "" + if not self.properties.get(Constants.PROP_CREATION_DATE, None) is None: + datetimeString = self.properties.get(Constants.PROP_CREATION_DATE).textof() + return _parseIso8601String(datetimeString) + + def getEntityTag(self): + """ + Return a entity tag which is unique for a particular version of a resource. + Different resources or one resource before and after modification have different etags. + + @return: entity tag string + """ + + result = "" + if not self.properties.get(Constants.PROP_ETAG, None) is None: + result = self.properties.get(Constants.PROP_ETAG).textof() + return result + + def getDisplayName(self): + """ + Returns a resource's display name. + + @return: string + """ + + result = "" + if not self.properties.get(Constants.PROP_DISPLAY_NAME, None) is None: + result = self.properties.get(Constants.PROP_DISPLAY_NAME).textof() + return result + + def getLastModified(self): + """ + Return last modification of resource as time tuple. + + @return: Modification date time. + @rtype: C{time.struct_time} + + @raise ValueError: If the date time string is not in the expected format (RFC 822 / ISO 8601). 
+ """ + + datetimeString = None + if not self.properties.get(Constants.PROP_LAST_MODIFIED, None) is None: + datetimeString = self.properties.get(Constants.PROP_LAST_MODIFIED).textof() + result = rfc822.parsedate(datetimeString) + if result is None: + result = _parseIso8601String(datetimeString) + return time.struct_time(result) + + def getLockDiscovery(self): + """ + Return all current lock's applied to a resource or null if it is not locked. + + @return: a lockdiscovery DOM element according to RFC 2815 + """ + + xml = self.properties.get(Constants.PROP_LOCK_DISCOVERY) + return _scanLockDiscovery(xml) + + def getResourceType(self): + """ + Return a resource's WebDAV type. + + @return: 'collection' or 'resource' + """ + + xml = self.properties.get(Constants.PROP_RESOURCE_TYPE) + if xml and xml.children: + return xml.children[0].name + return "resource" + + def getSupportedLock(self): + """ + Return a DOM element describing all supported lock options for a resource. + Usually this is shared and exclusive write lock. + + @return: supportedlock DOM element according to RFC 2815 + """ + + xml = self.properties.get(Constants.PROP_SUPPORTED_LOCK) + return xml + + def getOwnerAsUrl(self): + """ + Return a resource's owner in form of a URL. + + @return: string + """ + + xml = self.properties.get(Constants.PROP_OWNER) + if xml and len(xml.children): + return xml.children[0].textof() + return None + + def __str__(self): + result = "" + result += " Name=" + self.getDisplayName() + result += "\n Type=" + self.getResourceType() + result += "\n Length=" + str(self.getContentLength()) + result += "\n Content Type="+ self.getContentType() + result += "\n ETag=" + self.getEntityTag() + result += "\n Created=" + time.strftime("%c GMT", self.getCreationDate()) + result += "\n Modified=" + time.strftime("%c GMT", self.getLastModified()) + return result + + +def _parseIso8601String(date): + """ + Parses the given ISO 8601 string and returns a time tuple. 
def _parseIso8601String(date):
    """
    Parse *date* (an RFC 3339 / ISO 8601 profile string) into a time tuple.

    Deviations from RFC 3339: only the "Z" time offset is supported, and any
    fractional seconds are discarded before parsing.

    @raise ValueError: if the string does not match the expected format.
    """
    if "." in date and "Z" in date:  # strip fragments of a second
        fragmentStart = date.rfind(".")
        offsetStart = date.rfind("Z")
        date = date[:fragmentStart] + date[offsetStart:]
    return time.strptime(date, Constants.DATE_FORMAT_ISO8601)


class ResponseFormatError(IOError):
    """
    Raised when the web server returned a WebDAV reply which does not adhere
    to the standard and cannot be recognized.
    """
    def __init__(self, element, message=None):
        IOError.__init__(self, "ResponseFormatError at element %s: %s" % (element.name, message))
        self.element = element
        self.message = message


class Element(qp_xml._element):
    """
    Convenience wrapper improving the DOM (element) interface provided by the
    qp_xml module.  TODO: substitute qp_xml by a 'real' implementation,
    e.g. domlette.
    """
    def __init__(self, namespace, name, cdata=''):
        qp_xml._element.__init__(self, ns=namespace, name=name, lang=None, parent=None,
                                 children=[], ns_scope={}, attrs={},
                                 first_cdata=cdata, following_cdata='')

    def __str__(self):
        return self.textof()

    def __getattr__(self, name):
        # Synthesize the (namespace, localname) pair on demand.
        if name == 'fullname':
            return (self.__dict__['ns'], self.__dict__['name'])
        raise AttributeError(name)

    def add(self, child):
        self.children.append(child)
        return child


def _scanLockDiscovery(root):
    """Extract the first active lock from a DAV:lockdiscovery element, or None."""
    assert root.name == Constants.PROP_LOCK_DISCOVERY, "Invalid lock discovery XML element"
    activeLock = root.find(Constants.TAG_ACTIVE_LOCK, Constants.NS_DAV)
    if activeLock:
        return _scanActivelock(activeLock)
    return None


def _scanActivelock(root):
    """Return (lock token, owner, depth) read from a DAV:activelock element."""
    assert root.name == Constants.TAG_ACTIVE_LOCK, "Invalid active lock XML element"
    tokenElement = _scanOrError(root, Constants.TAG_LOCK_TOKEN)
    tokenValue = _scanOrError(tokenElement, Constants.TAG_HREF)
    lockOwner = _scanOwner(root)
    lockDepth = _scanOrError(root, Constants.TAG_LOCK_DEPTH)
    return (tokenValue.textof(), lockOwner, lockDepth.textof())


def _scanOwner(root):
    """Return the lock owner (href text preferred, else element text), or None."""
    owner = root.find(Constants.TAG_LOCK_OWNER, Constants.NS_DAV)
    if owner:
        href = owner.find(Constants.TAG_HREF, Constants.NS_DAV)
        if href:
            return href.textof()
        return owner.textof()
    return None


def _scanOrError(elem, childName):
    """Return the DAV: child element *childName* or raise ResponseFormatError."""
    child = elem.find(childName, Constants.NS_DAV)
    if not child:
        raise ResponseFormatError(elem, "Invalid response: <" + childName + "> expected")
    return child


def _unquoteHref(href):
    """
    Unquote a URL-encoded href returned by the server and normalize it to a
    unicode string (utf-8 or latin-1, governed by CONFIG_UNICODE_URL).
    """
    #print "*** Response HREF=", repr(href)
    if type(href) == type(u""):
        try:
            href = href.encode('ascii')
        except UnicodeError:  # URL contains an unescaped non-ascii character
            # handle bug in Tamino webdav server
            return urllib.unquote(href)
    href = urllib.unquote(href)
    if Constants.CONFIG_UNICODE_URL:
        return unicode(href, 'utf-8')
    else:
        return unicode(href, 'latin-1')
b/LTA/LTAIngest/dav/webdav/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e466096df84f8adff93c80a524426d47c139960 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +__version__ = "$LastChangedRevision$" diff --git a/LTA/LTAIngest/dav/webdav/acp/Ace.py b/LTA/LTAIngest/dav/webdav/acp/Ace.py new file mode 100644 index 0000000000000000000000000000000000000000..8321d41b9aef63d9a481ca0d0f4d20249ec4fbe6 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/Ace.py @@ -0,0 +1,293 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +ACE object handling according to WebDAV ACP specification. 
+""" + + +from webdav.acp.Principal import Principal +from webdav.acp.GrantDeny import GrantDeny +from webdav import Constants +from webdav.Connection import WebdavError + + +__version__ = "$LastChangedRevision$" + + +class ACE(object): + """ + This class provides functionality for handling ACEs + + @ivar principal: A principal (user or group) + @type principal: L{Principal} object + @ivar invert: Flag indicating whether ACE should invert the principal. + @type invert: C{bool} + @ivar grantDenies: Grant or deny clauses for privileges + @type grantDenies: C{list} of L{GrantDeny} objects + @ivar protected: Flag indicating whether ACE is protected. + @type protected: C{bool} + @ivar inherited: URL indicating the source from where the ACE is inherited. + @type inherited: C{string} + """ + + # restrict instance variables + __slots__ = ('principal', 'invert', 'grantDenies', 'protected', 'inherited') + + def __init__(self, domroot=None, principal=None, grantDenies=None): + """ + Constructor should be called with either no parameters (create blank ACE), + one parameter (a DOM tree or principal), or two parameters (principal and + sequence of GrantDenies). + + @param domroot: A DOM tree (default: None). + @type domroot: L{webdav.WebdavResponse.Element} object + @param principal: A principal (user or group), (default: None). + @type principal: L{Principal} object + @param grantDenies: Grant and deny clauses for privileges (default: None). + @type grantDenies: sequence of L{GrantDeny} objects + + @raise WebdavError: When non-valid parameters are passed a L{WebdavError} is raised. 
+ """ + self.principal = Principal() + self.protected = None + self.inherited = None + self.invert = None + self.grantDenies = [] + + if domroot: + self.principal = Principal(domroot=domroot.find(Constants.TAG_PRINCIPAL, Constants.NS_DAV)) + self.inherited = domroot.find(Constants.TAG_INHERITED, Constants.NS_DAV) + if self.inherited: + self.inherited = self.inherited.children[0].textof() + if domroot.find(Constants.TAG_PROTECTED, Constants.NS_DAV): + self.protected = 1 + for child in domroot.children: + if child.ns == Constants.NS_DAV \ + and (child.name == Constants.TAG_GRANT or child.name == Constants.TAG_DENY): + self.grantDenies.append(GrantDeny(domroot=child)) + elif isinstance(principal, Principal): + newPrincipal = Principal() + newPrincipal.copy(principal) + self.principal = newPrincipal + if (isinstance(grantDenies, list) or isinstance(grantDenies, tuple)): + self.addGrantDenies(grantDenies) + elif domroot == None and grantDenies == None: + # no param ==> blank ACE + pass + else: + # This shouldn't happen, someone screwed up with the params ... 
+ raise WebdavError('non-valid parameters handed to ACE constructor') + + def __cmp__(self, other): + if not isinstance(other, ACE): + return 1 + if self.principal == other.principal \ + and self.invert == other.invert \ + and self.protected == other.protected \ + and self.inherited == other.inherited: + equal = 1 + for grantDeny in self.grantDenies: + inList = 0 + for otherGrantDeny in other.grantDenies: + if grantDeny == otherGrantDeny: + inList = 1 + if inList == 0: + equal = 0 + return not equal + else: + return 1 + + def __repr__(self): + repr = '<class ACE: ' + if self.invert: + repr += 'inverted principal, ' % (self.invert) + if self.principal: + repr += 'principal: %s, ' % (self.principal) + if self.protected: + repr += 'protected, ' + if self.inherited: + repr += 'inherited href: %s, ' % (self.inherited) + first = 1 + repr += 'grantDenies: [' + for grantDeny in self.grantDenies: + if first: + repr += '%s' % grantDeny + first = 0 + else: + repr += ', %s' % grantDeny + return '%s]>' % (repr) + + def copy(self, other): + '''Copy an ACE object. + + @param other: Another ACE to copy. + @type other: L{ACE} object + + @raise WebdavError: When an object that is not an L{ACE} is passed + a L{WebdavError} is raised. + ''' + if not isinstance(other, ACE): + raise WebdavError('Non-ACE object passed to copy method: %s.' % other.__class__) + self.invert = other.invert + self.protected = other.protected + self.inherited = other.inherited + self.principal = Principal() + if other.principal: + self.principal.copy(other.principal) + if other.grantDenies: + self.addGrantDenies(other.grantDenies) + + def isValid(self): + """ + Returns true/false (1/0) whether necessarry props + principal and grantDenies are set and whether the ACE contains one + grant or deny clauses. + + @return: Validity of ACE. + @rtype: C{bool} + """ + return self.principal and len(self.grantDenies) == 1 + + def isGrant(self): + ''' + Returns true/false (1/0) if ACE contains only grant clauses. 
+ + @return: Value whether the ACE is of grant type. + @rtype: C{bool} + ''' + if self.isMixed() or len(self.grantDenies) < 1: + return 0 + else: + return self.grantDenies[0].isGrant() + + def isDeny(self): + ''' + Returns true/false (1/0) if ACE contains only deny clauses. + + @return: Value whether the ACE is of deny type. + @rtype: C{bool} + ''' + if self.isMixed() or len(self.grantDenies) < 1: + return 0 + else: + return self.grantDenies[0].isDeny() + + def isMixed(self): + ''' + Returns true/false (1/0) if ACE contains both types (grant and deny) of clauses. + + @return: Value whether the ACE is of mixed (grant and deny) type. + @rtype: C{bool} + ''' + mixed = 0 + if len(self.grantDenies): + first = self.grantDenies[0].grantDeny + for grantDeny in self.grantDenies: + if grantDeny.grantDeny != first: + mixed = 1 + return mixed + + def toXML(self, defaultNameSpace=None): + """ + Returns ACE content as a string of valid XML as described in WebDAV ACP. + + @param defaultNameSpace: Name space (default: None). + @type defaultNameSpace: C(string) + """ + assert self.isValid(), "ACE is not initialized or does not contain valid content!" + + ACE = 'D:' + Constants.TAG_ACE + res = self.principal.toXML(self.invert) + for grantDeny in self.grantDenies: + res += grantDeny.toXML() + if self.protected: + res += '<D:protected/>' + if self.inherited: + res += '<D:inherited><D:href>%s</D:href></D:inherited>' % (self.inherited) + return '<%s>%s</%s>' % (ACE, res, ACE) + + def setPrincipal(self, principal): + ''' + Sets the passed principal on the ACE. + + @param principal: A principal. + @type principal: L{Principal} object + ''' + self.principal = Principal() + self.principal.copy(principal) + + def setInherited(self, href): + ''' + Sets the passed URL on the ACE to denote from where it is inherited. + + @param href: A URL. 
+ @type href: C{string} + ''' + self.inherited = href + + def addGrantDeny(self, grantDeny): + ''' + Adds the passed GrantDeny object to list if it's not in it, yet. + + @param grantDeny: A grant or deny clause. + @type grantDeny: L{GrantDeny} object + ''' + # only add it if it's not in the list, yet ... + inList = 0 + for element in self.grantDenies: + if element == grantDeny: + inList = 1 + if not inList: + newGrantDeny = GrantDeny() + newGrantDeny.copy(grantDeny) + self.grantDenies.append(newGrantDeny) + + def addGrantDenies(self, grantDenies): + '''Adds the list of passed grant/deny objects to list. + + @param grantDenies: Grant or deny clauses. + @type grantDenies: sequence of L{GrantDeny} objects + ''' + map(lambda grantDeny: self.addGrantDeny(grantDeny), grantDenies) + + def delGrantDeny(self, grantDeny): + '''Deletes the passed GrantDeny object from list. + + @param grantDeny: A grant or deny clause. + @type grantDeny: L{GrantDeny} object + + @raise WebdavError: A L{WebdavError} is raised if the clause to be + deleted is not present. + ''' + # only add it if it's not in the list, yet ... + count = 0 + index = 0 + for element in self.grantDenies: + count += 1 + if element == grantDeny: + index = count + if index: + self.grantDenies.pop(index - 1) + else: + raise WebdavError('GrantDeny to be deleted not in list: %s.' % grantDeny) + + def delGrantDenies(self, grantDenies): + '''Deletes the list of passed grant/deny objects from list. + + @param grantDenies: Grant or deny clauses. 
+ @type grantDenies: sequence of L{GrantDeny} objects + ''' + map(lambda grantDeny: self.delGrantDeny(grantDeny), grantDenies) diff --git a/LTA/LTAIngest/dav/webdav/acp/AceHandler.py b/LTA/LTAIngest/dav/webdav/acp/AceHandler.py new file mode 100644 index 0000000000000000000000000000000000000000..955a561c50d9dd12652948f67967b02442977476 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/AceHandler.py @@ -0,0 +1,182 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Handling of WebDAV Access Protocol Extensions and ACL preparation for UI. +""" + + +from webdav import Constants +from webdav.WebdavClient import ResourceStorer +from webdav.Connection import WebdavError + + +__version__ = "$LastChangedRevision$" + + +def extractSupportedPrivilegeSet(userPrivileges): + """ + Returns a dictionary of supported privileges. + + @param userPrivileges: A DOM tree. + @type userPrivileges: L{webdav.WebdavResponse.Element} object + + @raise WebdavError: When unknown elements appear in the + C{DAV:supported-privilege} appear a L{WebdavError} is raised. + + @return: A dictionary with privilege names as keys and privilege descriptions as values. 
def extractSupportedPrivilegeSet(userPrivileges):
    """
    Return a dictionary of supported privileges.

    @param userPrivileges: A DOM tree.
    @type userPrivileges: L{webdav.WebdavResponse.Element} object

    @raise WebdavError: When unknown elements appear within a
        C{DAV:supported-privilege} element.

    @return: A dictionary with privilege names as keys and privilege
        descriptions as values.
    @rtype: C{dictionary}
    """
    result = {}
    for element in userPrivileges.children:
        if element.name == Constants.TAG_SUPPORTED_PRIVILEGE:
            privName = ''
            privDescription = ''
            for privilege in element.children:
                if privilege.name == Constants.TAG_PRIVILEGE:
                    privName = privilege.children[0].name
                elif privilege.name == Constants.TAG_DESCRIPTION:
                    privDescription = privilege.textof()
                else:
                    raise WebdavError('Unknown element in DAV:supported-privilege: ' + privilege.name)
                # As soon as a name/description pair is complete, store and reset it.
                if privName and privDescription:
                    result[privName] = privDescription
                    privName = ''
                    privDescription = ''
        else:
            raise WebdavError('Invalid element tag in DAV:supported-privilege-set: ' + element.name)
    return result


def _insertAclDisplaynames(acl):
    """
    Modifies the ACL by adding the human readable names (DAV:displayname
    property) of each principal found in the ACL.

    This should be done with the REPORT method, but it is not supported by
    Jakarta Slide, yet.  (As of Aug. 1, 2003 in CVS repository.)  So it is
    done the harder way: one property read per principal URL.

    @param acl: An ACL object for which the displaynames should be retrieved.
    @type acl: L{ACL} object
    """
    ## Redundant code kept for the REPORT-method way of doing it:
    ## property = '''<D:prop><D:displayname/></D:prop>'''
    ## return self.getReport(REPORT_ACL_PRINCIPAL_PROP_SET, property)
    for ace in acl.aces:
        if not ace.principal.property:
            principalConnection = ResourceStorer(ace.principal.principalURL)
            ace.principal.displayname = principalConnection.readProperty(
                Constants.NS_DAV, Constants.PROP_DISPLAY_NAME)


def prepareAcls(acls):
    """
    Return all ACLs describing the behaviour of the resource, modified to
    carry everything the UI needs for display.

    @param acls: ACL objects keyed by resource path.
    @type acls: C{dict} of L{ACL} objects

    @return: (non-valid) ACLs containing both grant and deny clauses in an
        ACE; displaynames are added to the principals where needed.
    @rtype: C{dict} of L{ACL} objects
    """
    for path in acls.keys():
        acls[path] = acls[path].joinGrantDeny()
        _insertAclDisplaynames(acls[path])
    return acls


def prepareAcl(acl):
    """
    Return one ACL describing the behaviour of the resource, modified to
    carry everything the UI needs for display.

    @param acl: An ACL object.
    @type acl: L{ACL} object

    @return: A (non-valid) ACL containing both grant and deny clauses in an
        ACE; displaynames are added to the principals where needed.
    @rtype: L{ACL} object
    """
    acl = acl.joinGrantDeny()
    _insertAclDisplaynames(acl)
    return acl


def refineAclForSet(acl):
    """
    Refine an ACL composed in the UI so it forms a well accepted ACL that the
    WebDAV ACL method can set.

    @param acl: An ACL object to be refined.
    @type acl: L{ACL} object

    @return: A valid ACL containing only grant or deny clauses per ACE;
        inherited and protected ACEs are stripped out.
    @rtype: L{ACL} object
    """
    acl = acl.splitGrantDeny()
    acl = acl.stripAces()
    return acl


##~ unsupported or unfinished methods:
##~
##~ def report(self, report, request=None, lockToken=None):
##~     """
##~     This method implements the WebDAV ACP method: REPORT for given report
##~     types.
##~
##~     Parameters:
##~
##~     'report' -- Report type as a string.
##~
##~     'request' -- XML content of the request for the report (defaults to None).
##~
##~     'lockToken' -- Lock token to be set (defaults to None).
##~     """
##~     raise WebdavError('Reports are not supported by our Jakarta Slide, yet (as of Aug. 1, 2003 in CVS).')
##~
##~     headers = createCondition(lockToken)
##~     headers['Content-Type'] = XML_CONTENT_TYPE
##~     body = '<D:%s xmlns:D="DAV:">%s</D:%s>' % (report, request, report)
##~     print "Body: ", body
##~     response = self.connection._request('REPORT', self.path, body, headers)
##~     return response
##~     ## TODO: parse DAV:error response
##~
##~
##~ def getAllAcls(self):
##~     """
##~     Returns a dictionary of ACL resources with respective ACL objects
##~     that apply to the given resource.
##~
##~     ### This method needs to be extended for inherited ACLs when Tamino
##~     support tells me (Guy) how to get to them.
##~     """
##~     acls = {self.path: self.getAcl()}
##~     for ace in acls[self.path].aces:
##~         if ace.inherited:
##~             if not ace.inherited in acls:
##~                 acls[ace.inherited] = self.getAcl()
##~
##~     # append some more stuff here to acls for possible inherited ACLs
##~     return acls
+""" + + +from webdav.acp.Ace import ACE +from webdav import Constants +from webdav.Connection import WebdavError +from davlib import XML_DOC_HEADER + + +__version__ = "$LastChangedRevision$" + + +class ACL(object): + """ + This class provides access to Access Control List funcionality + as specified in the WebDAV ACP. + + @ivar aces: ACEs in ACL + @type aces: C{list} of L{ACE} objects + @ivar withInherited: Flag indicating whether ACL contains inherited ACEs. + @type withInherited: C{bool} + """ + + # restrict instance variables + __slots__ = ('aces', 'withInherited') + + def __init__(self, domroot=None, aces=None): + """ + Constructor should be called with either no parameters (create blank ACE), + or one parameter (a DOM tree or ACE list). + + @param domroot: A DOM tree (default: None). + @type domroot: L{webdav.WebdavResponse.Element} object + @param aces: ACE objects (default: None) + @type aces: C{list} of L{ACE} objects + + @raise WebdavError: When non-valid parameters are passed a L{WebdavError} is raised. + """ + self.withInherited = None + self.aces = [] + + if domroot: + for child in domroot.children: + if child.name == Constants.TAG_ACE and child.ns == Constants.NS_DAV: + self.addAce(ACE(child)) + else: + # This shouldn't happen, someone screwed up with the params ... + raise WebdavError('Non-ACE tag handed to ACL constructor: ' + child.ns + child.name) + elif isinstance(aces, list) or isinstance(aces, tuple): + self.addAces(aces) + elif domroot == None and aces == None: + # no param ==> blank object + pass + else: + # This shouldn't happen, someone screwed up with the params ... 
+ raise WebdavError('non-valid parameters handed to ACL constructor') + + def __cmp__(self, other): + if not isinstance(other, ACL): + return 1 + if self.withInherited == other.withInherited: + equal = 1 + for ace in self.aces: + inList = 0 + for otherAce in other.aces: + if ace == otherAce: + inList = 1 + if inList == 0: + equal = 0 + return not equal + else: + return 1 + + def __repr__(self): + repr = '<class ACL: ' + if self.withInherited: + repr += 'with inherited, ' + first = 1 + repr += 'aces: [' + for ace in self.aces: + if first: + repr += '%s' % ace + first = 0 + else: + repr += ', %s' % ace + return '%s]>' % (repr) + + def copy(self, other): + '''Copy an ACL object. + + @param other: Another ACL to copy. + @type other: L{ACL} object + + @raise WebdavError: When an object that is not an L{ACL} is passed + a L{WebdavError} is raised. + ''' + if not isinstance(other, ACL): + raise WebdavError('Non-ACL object passed to copy method: %s' % other.__class__) + self.withInherited = other.withInherited + if other.aces: + self.addAces(other.aces) + + def toXML(self): + """ + Returns ACL content as a string of valid XML as described in WebDAV ACP. + """ + aclTag = 'D:' + Constants.TAG_ACL + return XML_DOC_HEADER +\ + '<' + aclTag + ' xmlns:D="DAV:">' + reduce(lambda xml, ace: xml + ace.toXML() + '\n', [''] + self.aces) +\ + '</' + aclTag + '>' + + def addAce(self, ace): + ''' + Adds the passed ACE object to list if it's not in it, yet. + + @param ace: An ACE. + @type ace: L{ACE} object + ''' + newAce = ACE() + newAce.copy(ace) + # only add it if it's not in the list, yet ... + inList = 0 + for element in self.aces: + if element == ace: + inList = 1 + if not inList: + self.aces.append(newAce) + + def addAces(self, aces): + '''Adds the list of passed ACE objects to list. + + @param aces: ACEs + @type aces: sequence of L{ACE} objects + ''' + for ace in aces: + self.addAce(ace) + + def delAce(self, ace): + '''Deletes the passed ACE object from list. 
+ + @param ace: An ACE. + @type ace: L{ACE} object + + @raise WebdavError: When the ACE to be deleted is not within the ACL + a L{WebdavError} is raised. + ''' + # find where it is and delete it ... + count = 0 + index = 0 + for element in self.aces: + count += 1 + if element == ace: + index = count + if index: + self.aces.pop(index - 1) + else: + raise WebdavError('ACE to be deleted not in list: %s.' % ace) + + def delAces(self, aces): + '''Deletes the list of passed ACE objects from list. + + @param aces: ACEs + @type aces: sequence of L{ACE} objects + ''' + for ace in aces: + self.delAce(ace) + + def delPrincipalsAces(self, principal): + """ + Deletes all ACEs in ACL by given principal. + + @param principal: A principal. + @type principal: L{Principal} object + """ + # find where it is and delete it ... + index = 0 + while index < len(self.aces): + if self.aces[index].principal.principalURL == principal.principalURL: + self.aces.pop(index) + else: + index += 1 + + def joinGrantDeny(self): + """ + Returns a "refined" ACL of the ACL for ease of use in the UI. + The purpose is to post the user an ACE that can contain both, granted + and denied, privileges. So possible pairs of grant and deny ACEs are joined + to return them in one ACE. This resulting ACE then of course IS NOT valid + for setting ACLs anymore. They will have to be reconverted to yield valid + ACLs for the ACL method. + + @return: A (non-valid) ACL that contains both grant and deny clauses in an ACE. 
+ @rtype: L{ACL} object + """ + joinedAces = {} + for ace in self.aces: + if not ace.principal.principalURL is None: + principalKey = ace.principal.principalURL + elif not ace.principal.property is None: + principalKey = ace.principal.property + else: + principalKey = None + if ace.inherited: + principalKey = ace.inherited + ":" + principalKey + if principalKey in joinedAces: + joinedAces[principalKey].addGrantDenies(ace.grantDenies) + else: + joinedAces[principalKey] = ACE() + joinedAces[principalKey].copy(ace) + newAcl = ACL() + newAcl.addAces(joinedAces.values()) + return newAcl + + def splitGrantDeny(self): + """ + Returns a "refined" ACL of the ACL for ease of use in the UI. + The purpose is to post the user an ACE that can contain both, granted + and denied, privileges. So possible joined grant and deny clauses in ACEs + splitted to return them in separate ACEs. This resulting ACE then is valid + for setting ACLs again. This method is to be seen in conjunction with the + method joinGrantDeny as it reverts its effect. + + @return: A valid ACL that contains only ACEs with either grant or deny clauses. 
+ @rtype: L{ACL} object + """ + acesGrant = {} + acesDeny = {} + for ace in self.aces: + for grantDeny in ace.grantDenies: + if grantDeny.isGrant(): + if ace.principal.principalURL in acesGrant: + ace.addGrantDeny(grantDeny) + else: + acesGrant[ace.principal.principalURL] = ACE() + acesGrant[ace.principal.principalURL].copy(ace) + acesGrant[ace.principal.principalURL].grantDenies = [] + acesGrant[ace.principal.principalURL].addGrantDeny(grantDeny) + else: + if ace.principal.principalURL in acesDeny: + ace.addGrantDeny(grantDeny) + else: + acesDeny[ace.principal.principalURL] = ACE() + acesDeny[ace.principal.principalURL].copy(ace) + acesDeny[ace.principal.principalURL].grantDenies = [] + acesDeny[ace.principal.principalURL].addGrantDeny(grantDeny) + newAcl = ACL() + newAcl.addAces(acesGrant.values()) + newAcl.addAces(acesDeny.values()) + return newAcl + + def isValid(self): + """ + Returns true (1) if all contained ACE objects are valid, + otherwise false (0) is returned. + + @return: Validity of ACL. + @rtype: C{bool} + """ + valid = 1 + if len(self.aces): + for ace in self.aces: + if not ace.isValid(): + valid = 0 + return valid + + def stripAces(self, inherited=True, protected=True): + """ + Returns an ACL object with all ACEs stripped that are inherited + and/or protected. + + @param inherited: Flag to indicate whether inherited ACEs should + be stripped (default: True). + @type inherited: C{bool} + @param protected: Flag to indicate whether protected ACEs should + be stripped (default: True). + @type protected: C{bool} + + @return: An ACL without the stripped ACEs. 
+ @rtype: L{ACL} object + """ + newAcl = ACL() + if len(self.aces): + for ace in self.aces: + keep = 1 + if inherited and ace.inherited: + keep = 0 + elif protected and ace.protected: + keep = 0 + if keep: + newAcl.addAce(ace) + return newAcl diff --git a/LTA/LTAIngest/dav/webdav/acp/GrantDeny.py b/LTA/LTAIngest/dav/webdav/acp/GrantDeny.py new file mode 100644 index 0000000000000000000000000000000000000000..8d7230cf8c070e38662b6da5386f190d9007bac4 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/GrantDeny.py @@ -0,0 +1,242 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Handling of grant and deny clauses in ACEs according to WebDAV ACP specification. +""" + + +from webdav.acp.Privilege import Privilege +from webdav import Constants +from webdav.Connection import WebdavError + + +__version__ = "$LastChangedRevision$" + + +class GrantDeny(object): + """ + This class provides functionality for handling + grant and deny clauses in ACEs. + + @ivar grantDeny: Flag indicating whether clause grants or denies. + @type grantDeny: C{bool} + @ivar privileges: Privileges to be granted or denied. + @type privileges: C{list} of L{Privilege} objects + """ + + def __init__(self, domroot=None): + """ + Constructor should be called with either no parameters + (create blank GrantDeny), or one parameter (a DOM tree). + + @param domroot: A DOM tree (default: None). 
+ @type domroot: L{webdav.WebdavResponse.Element} object + + @raise WebdavError: When non-valid parameters are passed a L{WebdavError} is raised. + """ + self.grantDeny = 0 # 0: deny, 1: grant + self.privileges = [] + + if domroot: + self.grantDeny = (domroot.name == Constants.TAG_GRANT) + for child in domroot.children: + if child.name == Constants.TAG_PRIVILEGE and child.ns == Constants.NS_DAV: + self.privileges.append(Privilege(domroot=child)) + else: + # This shouldn't happen, someone screwed up with the params ... + raise WebdavError('Non-privilege tag handed to GrantDeny constructor: %s' \ + % child.name) + elif domroot == None: + # no param ==> blank object + pass + else: + # This shouldn't happen, someone screwed up with the params ... + raise WebdavError('Non-valid parameters handed to GrantDeny constructor.') + + def __cmp__(self, other): + """ Compares two GrantDeny instances. """ + if not isinstance(other, GrantDeny): + return 1 + if self.grantDeny == other.grantDeny: + equal = 1 + for priv in self.privileges: + inList = 0 + for otherPriv in other.privileges: + if priv == otherPriv: + inList = 1 + if inList == 0: + equal = 0 + return not equal + else: + return 1 + + def __repr__(self): + """ Returns the representation of an instance. """ + representation = '<class GrantDeny: ' + if self.grantDeny: + representation += 'grant privileges: [' + else: + representation += 'deny privileges: [' + first = 1 + for priv in self.privileges: + if first: + representation += '%s' % priv + first = 0 + else: + representation += ', %s' % priv + return '%s]>' % (representation) + + def copy(self, other): + """ + Copy a GrantDeny object. + + @param other: Another grant or deny clause to copy. + @type other: L{GrantDeny} object + + @raise WebdavError: When an object that is not an L{GrantDeny} is passed + a L{WebdavError} is raised. 
+ """ + if not isinstance(other, GrantDeny): + raise WebdavError('Non-GrantDeny object passed to copy method: %s' \ + % other) + self.grantDeny = other.grantDeny + if other.privileges: + self.addPrivileges(other.privileges) + + def isGrant(self): + """ + Returns whether the set of privileges is of type "grant" + indicating true or false. + + @return: Value whether the clause is of grant type. + @rtype: C{bool} + """ + return self.grantDeny + + def isDeny(self): + """ + Returns whether the set of privileges is of type "deny" + indicating true or false. + + @return: Value whether the clause is of deny type. + @rtype: C{bool} + """ + return not self.grantDeny + + def setGrantDeny(self, grantDeny): + """ + Sets the set of privileges to given value for grantDeny. + + @param grantDeny: Grant/deny value for clause (grant: True/1, deny: False/0). + @type grantDeny: C{bool} + """ + if grantDeny == 0 or grantDeny == 1: + self.grantDeny = grantDeny + + def setGrant(self): + """ Sets the set of privileges to type "grant". """ + self.grantDeny = 1 + + def setDeny(self): + """ Sets the set of privileges to type "deny". """ + self.grantDeny = 0 + + def isAll(self): + """ + Checks whether the privileges contained are equal + to aggregate DAV:all privilege. + + @return: Value whether all un-aggregated privileges are present. + @rtype: C{bool} + """ + if len(self.privileges) == len(Constants.TAMINO_PRIVILEGES): + return 1 + elif len(self.privileges) == 1 and self.privileges[0].name == Constants.TAG_ALL: + return 1 + return 0 + + def addPrivilege(self, privilege): + """ + Adds the passed privilege to list if it's not in it, yet. + + @param privilege: A privilege. 
+ @type privilege: L{Privilege} object + """ + inList = False + for priv in self.privileges: + if priv == privilege: + inList = True + if not inList: + newPrivilege = Privilege() + newPrivilege.copy(privilege) + self.privileges.append(newPrivilege) + + def addPrivileges(self, privileges): + """ + Adds the list of passed privileges to list. + + @param privileges: Several privileges. + @type privileges: sequence of L{Privilege} objects + """ + for priv in privileges: + self.addPrivilege(priv) + + def delPrivilege(self, privilege): + """ + Deletes the passed privilege from list if it's in it. + + @param privilege: A privilege. + @type privilege: L{Privilege} object + + @raise WebdavError: A L{WebdavError} is raised if the privilege to be + deleted is not present. + """ + count = 0 + index = 0 + for priv in self.privileges: + count += 1 + if priv == privilege: + index = count + if index: + self.privileges.pop(index - 1) + else: + raise WebdavError('Privilege to be deleted not in list: %s' % privilege) + + def delPrivileges(self, privileges): + """ + Deletes the list of passed privileges from list. + + @param privileges: Several privileges. + @type privileges: sequence of L{Privilege} objects + """ + for priv in privileges: + self.delPrivilege(priv) + + def toXML(self): + """ + Returns string of GrantDeny content to valid XML as described in WebDAV ACP. + """ + assert self.privileges, "GrantDeny object is not initialized or does not contain content!" 
+ + if self.isGrant(): + tag = 'D:' + Constants.TAG_GRANT + else: + tag = 'D:' + Constants.TAG_DENY + + res = '' + for privilege in self.privileges: + res += privilege.toXML() + return '<%s>%s</%s>' % (tag, res, tag) diff --git a/LTA/LTAIngest/dav/webdav/acp/Principal.py b/LTA/LTAIngest/dav/webdav/acp/Principal.py new file mode 100644 index 0000000000000000000000000000000000000000..a0d5ec97e0f24cc688276c2c22eda01e21dc3684 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/Principal.py @@ -0,0 +1,189 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Handling of principals for ACEs according to WebDAV ACP specification. +""" + + +from webdav import Constants +from webdav.Connection import WebdavError + + +__version__ = "$LastChangedRevision$" + + +class Principal(object): + """ + This class provides functionality for handling + principals according to the WebDAV ACP. + + @ivar displayname: Name of the principal for output + @type displayname: C{string} + @ivar principalURL: URL under which the principal can be referenced on the server. + @type principalURL: C{string} + @ivar property: Information on type of a pseudo/jproperty principal, e. g. + DAV:owner, DAV:authenticated, etc. + @type property: C{string} + + @cvar _TAG_LIST_PRINCIPALS: List of allowed XML tags within a principal declaration. 
+ @type _TAG_LIST_PRINCIPALS: C{tuple} of C{string}s + @cvar _TAG_LIST_STATUS: List of XML tags for the status of a pseudo principal. + @type _TAG_LIST_STATUS: C{tuple} of C{string}s + """ + + # some local constants for this class to make things easier/more readable: + _TAG_LIST_PRINCIPALS = (Constants.TAG_HREF, # directly by URL + Constants.TAG_ALL, Constants.TAG_AUTHENTICATED, Constants.TAG_UNAUTHENTICATED, + # by log-in status + Constants.TAG_PROPERTY, # for property info, e. g. 'owner' + Constants.TAG_SELF, # only if the resource is the principal itself + Constants.TAG_PROP) # contains property info like 'displayname' + _TAG_LIST_STATUS = (Constants.TAG_ALL, Constants.TAG_AUTHENTICATED, Constants.TAG_UNAUTHENTICATED) + + # restrict instance variables + __slots__ = ('displayname', 'principalURL', 'property') + + def __init__(self, domroot=None, displayname=None, principalURL=None): + """ + Constructor should be called with either no parameters (create blank Principal), + one parameter (a DOM tree), or two parameters (displayname and URL or property tag). + + @param domroot: A DOM tree (default: None). + @type domroot: L{webdav.WebdavResponse.Element} object + @param displayname: The display name of a principal (default: None). + @type displayname: C{string} + @param principalURL: The URL representing a principal (default: None). + @type principalURL: C{string} + + @raise WebdavError: When non-valid parameters or sets of parameters are + passed a L{WebdavError} is raised. 
+ """ + self.displayname = None + self.principalURL = None + self.property = None + + if domroot: + for child in domroot.children: + if child.ns == Constants.NS_DAV and (child.name in self._TAG_LIST_PRINCIPALS): + if child.name == Constants.TAG_PROP: + self.displayname = \ + child.find(Constants.PROP_DISPLAY_NAME, Constants.NS_DAV) + elif child.name == Constants.TAG_HREF: + self.principalURL = child.textof() + if self.principalURL and self.property in self._TAG_LIST_STATUS: + raise WebdavError('Principal cannot contain a URL and "%s"' % (self.property)) + elif child.name == Constants.TAG_PROPERTY: + if child.count() == 1: + if self.property: + raise WebdavError('Property for principal has already been set: old "%s", new "%s"' \ + % (self.property, child.pop().name)) + elif self.principalURL: + raise WebdavError('Principal cannot contain a URL and "%s"' % (self.property)) + else: + self.property = child.pop().name + else: + raise WebdavError("There should be only one value in the property for a principal, we have: %s" \ + % child.name) + else: + if self.property: + raise WebdavError('Property for principal has already been set: old "%s", new "%s"' \ + % (self.property, child.name)) + else: + self.property = child.name + if self.principalURL and self.property in self._TAG_LIST_STATUS: + raise WebdavError('Principal cannot contain a URL and "%s"' % (self.property)) + else: # This shouldn't happen, something's wrong with the DOM tree + raise WebdavError('Non-valid tag in principal DOM tree for constructor: %s' % child.name) + elif displayname == None or principalURL == None: + if displayname: + self.displayname = displayname + if principalURL: + self.principalURL = principalURL + else: + # This shouldn't happen, someone screwed up with the params ... 
+ raise WebdavError('Non-valid parameters handed to Principal constructor.') + + def __cmp__(self, other): + if not isinstance(other, Principal): + return 1 + if self.displayname == other.displayname \ + and self.principalURL == other.principalURL \ + and self.property == other.property: + return 0 + else: + return 1 + + def __repr__(self): + return '<class Principal: displayname: "%s", principalURL: "%s", property: "%s">' \ + % (self.displayname, self.principalURL, self.property) + + def copy(self, other): + """Copy Principal object. + + @param other: Another principal to copy. + @type other: L{Principal} object + + @raise WebdavError: When an object that is not a L{Principal} is passed + a L{WebdavError} is raised. + """ + if not isinstance(other, Principal): + raise WebdavError('Non-Principal object passed to copy method: ' % other.__class__) + self.displayname = other.displayname + self.principalURL = other.principalURL + self.property = other.property + + def isValid(self): + """ + Checks whether necessarry props for principal are set. + + @return: Validity of principal. + @rtype: C{bool} + """ + return (self.displayname and + (self.principalURL or self.property) and + not (self.principalURL and self.property)) + + def toXML(self, invert=False, displayname=False, defaultNameSpace=None): + """Returns string of Principal content in valid XML as described in WebDAV ACP. + + @param defaultNameSpace: Name space (default: None). + @type defaultNameSpace: C(string) + @param invert: True if principal should be inverted (default: False). + @type invert: C{bool} + @param displayname: True if displayname should be in output (default: False). + @type displayname: C{bool} + """ + # this check is needed for setting principals only: + # assert self.isValid(), "principal is not initialized or does not contain valid content!" 
+ + PRINCIPAL = 'D:' + Constants.TAG_PRINCIPAL + res = '' + if self.principalURL: + res += '<D:%s>%s</D:%s>' % (Constants.TAG_HREF, self.principalURL, Constants.TAG_HREF) + elif self.property in self._TAG_LIST_STATUS \ + or self.property == Constants.TAG_SELF: + res += '<D:%s/>' % (self.property) + elif self.property: + res += '<D:%s><D:%s/></D:%s>' \ + % (Constants.TAG_PROPERTY, self.property, Constants.TAG_PROPERTY) + if self.displayname and displayname: + res += '<D:%s><D:%s>%s</D:%s></D:%s>' \ + % (Constants.TAG_PROP, Constants.PROP_DISPLAY_NAME, + self.displayname, + Constants.PROP_DISPLAY_NAME, Constants.TAG_PROP) + if invert: + res = '<D:invert>%s</D:invert>' % (res) + return '<%s>%s</%s>' % (PRINCIPAL, res, PRINCIPAL) diff --git a/LTA/LTAIngest/dav/webdav/acp/Privilege.py b/LTA/LTAIngest/dav/webdav/acp/Privilege.py new file mode 100644 index 0000000000000000000000000000000000000000..2e2a8dac00d2f0ee559a4261ecce3efcfc9e61ef --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/Privilege.py @@ -0,0 +1,118 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Handling for privileges for grant and deny clauses in ACEs +according to WebDAV ACP specification. +""" + + +from webdav import Constants +from webdav.Connection import WebdavError + + +__version__ = "$LastChangedRevision$" + + +class Privilege(object): + """This class provides functionality for handling privileges for ACEs. 
+ + @ivar name: Name of the privilege. + @type name: C{string} + + @cvar _TAG_LIST_PRIVILEGES: List of allowed XML tags for privileges. + @type _TAG_LIST_PRIVILEGES: C{tuple} of C{string}s + @cvar _TAG_LIST_PRIVILEGES_TAMINO: List of special Tamino XML tags for privileges. + @type _TAG_LIST_PRIVILEGES: C{tuple} of C{string}s + """ + _TAG_LIST_PRIVILEGES = (Constants.TAG_READ, Constants.TAG_WRITE, Constants.TAG_WRITE_PROPERTIES, + Constants.TAG_WRITE_CONTENT, Constants.TAG_UNLOCK, Constants.TAG_READ_ACL, + Constants.TAG_READ_CURRENT_USER_PRIVILEGE_SET, + Constants.TAG_WRITE_ACL, Constants.TAG_ALL, + Constants.TAG_BIND, Constants.TAG_UNBIND) + _TAG_LIST_PRIVILEGES_TAMINO = (Constants.TAG_TAMINO_SECURITY) + + + def __init__(self, privilege=None, domroot=None): + """ + Constructor should be called with either no parameters (create blank Privilege), + one parameter (a DOM tree or privilege name to inicialize it directly). + + @param domroot: A DOM tree (default: None). + @type domroot: L{webdav.WebdavResponse.Element} object + @param privilege: The valid name of a privilege (default: None). + @type privilege: C{string} + + @raise WebdavError: When non-valid parameters or sets of parameters are + passed a L{WebdavError} is raised. + """ + self.name = None + + if domroot: + if len(domroot.children) != 1: + raise WebdavError('Wrong number of elements for Privilege constructor, we have: %i' \ + % (len(domroot.children))) + else: + child = domroot.children[0] + if child.ns == Constants.NS_DAV and child.name in self._TAG_LIST_PRIVILEGES or \ + child.ns == Constants.NS_TAMINO and child.name in self._TAG_LIST_PRIVILEGES_TAMINO: + self.name = child.name + else: + raise WebdavError('Not a valid privilege tag, we have: %s%s' \ + % (child.ns, child.name)) + elif privilege: + if privilege in self._TAG_LIST_PRIVILEGES: + self.name = privilege + else: + raise WebdavError('Not a valid privilege tag, we have: %s.' 
% str(privilege)) + + def __cmp__(self, other): + """ Compares two Privilege instances. """ + if not isinstance(other, Privilege): + return 1 + if self.name != other.name: + return 1 + else: + return 0 + + def __repr__(self): + """ Returns the string representation of an instance. """ + return '<class Privilege: name: "%s">' % (self.name) + + def copy(self, other): + """ + Copy Privilege object. + + @param other: Another privilege to copy. + @type other: L{Privilege} object + + @raise WebdavError: When an object that is not a L{Privilege} is passed + a L{WebdavError} is raised. + """ + if not isinstance(other, Privilege): + raise WebdavError('Non-Privilege object passed to copy method: %s' % other.__class__) + self.name = other.name + + def toXML(self): + """ + Returns privilege content as string in valid XML as described in WebDAV ACP. + + @param defaultNameSpace: Name space (default: None). + @type defaultNameSpace: C(string) + """ + assert self.name != None, "privilege is not initialized or does not contain valid content!" + + privilege = 'D:' + Constants.TAG_PRIVILEGE + return '<%s><D:%s/></%s>' % (privilege, self.name, privilege) diff --git a/LTA/LTAIngest/dav/webdav/acp/__init__.py b/LTA/LTAIngest/dav/webdav/acp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82d0c5dfc688578f3fe59c7f55bad9d5c0bb7551 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/acp/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from webdav.acp.Acl import ACL +from webdav.acp.Ace import ACE +from webdav.acp.GrantDeny import GrantDeny +from webdav.acp.Privilege import Privilege +from webdav.acp.Principal import Principal + +__version__ = "$LastChangedRevision$" diff --git a/LTA/LTAIngest/dav/webdav/logger.py b/LTA/LTAIngest/dav/webdav/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..d2538ef10e71517d316b392b7e3bf68e1c34eea6 --- /dev/null +++ b/LTA/LTAIngest/dav/webdav/logger.py @@ -0,0 +1,51 @@ +# Copyright 2008 German Aerospace Center (DLR) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""" +Module provides access to a configured logger instance. +The logger writes C{sys.stdout}. +""" + + +import logging +import sys + + +__version__ = "$LastChangedRevision$"[11:-2] + + +_defaultLoggerName = "webdavLogger" +_fileLogFormat = "%(asctime)s: %(levelname)s: %(message)s" + + +def getDefaultLogger(handler=None): + """ + Returns a configured logger object. + + @return: Logger instance. 
+ @rtype: C{logging.Logger} + """ + + myLogger = logging.getLogger(_defaultLoggerName) + if len(myLogger.handlers) == 0: + myLogger.level = logging.DEBUG + formatter = logging.Formatter(_fileLogFormat) + if handler is None: + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(formatter) + myLogger.addHandler(stdoutHandler) + else: + myLogger.addHandler(handler) + return myLogger diff --git a/LTA/LTAIngest/dav/wsrt_webdavlib.py b/LTA/LTAIngest/dav/wsrt_webdavlib.py new file mode 100644 index 0000000000000000000000000000000000000000..d6d530e10a3bf0f3c77638f7ac50da356449b92f --- /dev/null +++ b/LTA/LTAIngest/dav/wsrt_webdavlib.py @@ -0,0 +1,245 @@ +import davlib + +class wsrt_webdavlib(): + def __init__(self, logger): + ## No inputs and outputs + self.server = 'localhost' + self.davdir = '' + self.session_cookies = None + self.logger = logger + + ## Help text + self.helptext = """ + Library containing webdav communication functions.""" + + ## separate module that shares the webdav interaction of several export modules + # the contents of the config should probably become part of this class in the future... 
+ def getConfig(self, name=None): ## The format of .serverrc is the same as the .nftprc of the FTP module + ## Find the config file + import os.path, re + r_field = re.compile(r'(?s)([^\n:]+): (.*?)(?=\n[^ \t]|\Z)') + cwd = os.path.realpath(os.path.dirname(__file__)) + '/' + config = cwd + '.serverrc' + self.logger.info('using server cofiguration in: ' + config) + if os.path.exists(config): + s = open(config).read() + else: + raise Exception('Server config not found') + + ## Parse the config file + conf = {} + s = s.replace('\r\n', '\n') + s = s.replace('\r', '\n') + for item in s.split('\n\n'): + meta = dict(r_field.findall(item.strip())) + if meta.has_key('name'): + fname = meta['name'] + del meta['name'] + conf[fname] = meta + else: + raise Exception('Server config must include a name') + + if name is not None: + return conf[name] + else: return conf + + def login(self): + import binascii + meta = self.getConfig(self.server) + auth = binascii.b2a_base64(meta['username'] + ':' + meta['password']) + self.dav = davlib.DAV(meta['host'], meta['port'], protocol='http') + self.davdir = (meta['remotedir']) + logindir = self.davdir + '/' ## N.B. 
"Magic" extra '/' needed in some cases, or we get a 302 +# self.dav.set_debuglevel(1) # for testing + self.dav.connect() + response = self.dav.get(logindir, {'Authorization': 'Basic %s' % auth.strip()}) + self.session_cookies = response.getheader('Set-Cookie') + self.logger.debug('Status = %s, Reason = %s, Version = %s' % (response.status, response.reason, response.version)) + if not (response.version >= 10): + raise Exception('Unknown protocol version:' + str(response.version)) + if not (response.status == 200 and response.reason == 'OK'): +## or response.status == 302 and response.reason == 'Moved Temporarily'): + self.logger.error('Got unrecognised answer with Status = %s, Reason = %s' % (response.status, response.reason)) + raise Exception('Problem logging in to WebDAV default repository: ' + meta['host']+ ':' + str(meta['port']) + self.davdir) + response.close() + self.dav.close() + + def setserver(self, server): + self.server = server + self.session_cookies = None + + def get(self, davURL): + if not self.session_cookies: + self.login() + self.dav.connect() + response = self.dav.get(self.davdir + davURL, {'Cookie':self.session_cookies}) + self.logger.debug('Status = %s, Reason = %s, Version = %s' % (response.status, response.reason, response.version)) + if not (response.version >= 10): + raise Exception('Unknown protocol version:' + str(response.version)) + if not (response.status == 404 or response.status == 200): ##404 == does not exist, 200 == exists + raise Exception('Unknown status response:' + str(response.status) + ':' + str(response.reason)) + response.close() + self.dav.close() + return response.status + + def mkdir(self, davdir): + if not self.session_cookies: + self.login() + self.dav.connect() + response = self.dav.mkcol(self.davdir + davdir, {'Cookie':self.session_cookies}) + self.logger.debug('Status = %s, Reason = %s, Version = %s' % (response.status, response.reason, response.version)) + if not (response.version >= 10): + raise 
Exception('Unknown protocol version:' + str(response.version)) + if not (response.status == 405 or response.status == 201): ##405 == exists, 201 == created + raise Exception('Unknown status response:' + str(response.status) + ':' + str(response.reason)) + response.close() + self.dav.close() + return response.status + + def storbinary(self, targetfile, binfile): + if not self.session_cookies: + self.login() + self.dav.connect() + response = self.dav.put(self.davdir + targetfile, binfile, None, None, {'Cookie':self.session_cookies}) + self.logger.debug('Status = %s, Reason = %s, Version = %s' % (response.status, response.reason, response.version)) + if not (response.version >= 10): + raise Exception("Unknown protocol version:" + str(response.version)) + if not (response.status == 200 or response.status == 201 or response.status == 204): ##200 == exists, 201 == created, 204 = no content + raise Exception('Unexpected status response:' + str(response.status) + ':' + str(response.reason)) + response.close() + self.dav.close() + return response.status + + def getbinary(self, sourcefile, binfile): + if not self.session_cookies: + self.login() + self.dav.connect() + response = self.dav.get(self.davdir + sourcefile, {'Cookie':self.session_cookies}) + self.logger.debug('Status = %s, Reason = %s, Version = %s' % (response.status, response.reason, response.version)) + if not (response.version >= 10): + raise Exception("Unknown protocol version:" + str(response.version)) + if not (response.status == 200 or response.status == 404): ##200 == exists, 404 == does not exist + raise Exception('Unexpected status response:' + str(response.status) + ':' + str(response.reason)) + binfile.write(response.read()) + response.close() + self.dav.close() + return response.status + + def upload(self, target, davroot): + """target is a (source/root dir, destination dir/filename) tuple, + the file to be sent is in target[0] + '/' + target[1], + davpath is the base path in webdav, where target[1] 
will be put, + target[1] can contain directories, those will be created + if necessary.""" + import os.path + path, filename = os.path.split(target[1]) + path = path + if self.get(davroot + path) == 404: + self.logger.info('Creating folder %s' % path) + temppath = davroot + self.mkdir(temppath) + for folder in path.split('/'): + temppath += '/' + folder + self.mkdir(temppath) + binfile = open(target[0] + '/' + target[1], 'rb') + self.logger.info('Storing %s' % target[0] + '/' + target[1]) + try: + ## does this raise an exception if something's wrong ? + result = self.storbinary(davroot + path + '/' + filename, binfile) + except Exception, e: + self.logger.error('Unable to store file %s' % target[1]) + binfile.close() + raise Exception('WebDAV transfer failed with error: ' + str(e)) + binfile.close() + + ## input looks something like this: + ##<?xml version="1.0" encoding="UTF-8"?> + ##<D:multistatus xmlns:D="DAV:"> + ## <D:response> + ## <D:href>/repository/mom2/R06B/006/207939/inspection_files</D:href> + ## <D:propstat> + ## <D:prop> + ## <D:resourcetype> + ## <D:collection /> + ## </D:resourcetype> + ## </D:prop> + ## <D:status>HTTP/1.1 200 OK</D:status> + ## </D:propstat> + ## </D:response> + ##</D:multistatus> + def parse_propfind(self, response): +# print response ## for testing + from xml.dom import minidom, Node + doc = minidom.parseString(response) + files = [] + dirs = [] + if doc.documentElement.nodeName == 'D:multistatus': + for node in doc.documentElement.childNodes: + if node.nodeName == 'D:response': + collection = False + name = '' + status = '' + for responsenode in node.childNodes: + if responsenode.nodeName == 'D:href': + name = responsenode.childNodes[0].nodeValue + elif responsenode.nodeName == 'D:propstat': + for propstatnode in responsenode.childNodes: + if propstatnode.nodeName == 'D:prop': + for propnode in propstatnode.childNodes: + if propnode.nodeName == 'D:resourcetype': + for resourcetypenode in propnode.childNodes: + if 
resourcetypenode.nodeName == 'D:collection': + collection = True + elif propstatnode.nodeName == 'D:status': + status = propstatnode.childNodes[0].nodeValue + if name and collection and status == "HTTP/1.1 200 OK": + dirs.append(name.replace(self.davdir, '', 1)) +## old repository elif name and status == "HTTP/1.1 404 Not Found": ## files do not have resourcetype + elif name and status == "HTTP/1.1 200 OK": ## files do not have resourcetype + files.append(name.replace(self.davdir, '', 1)) + return (files, dirs) + + def propfind(self, remotepath): + """Actually only tries to find the resourcetype, not every property. + As this will tell if its a directory. """ + if not self.session_cookies: + self.login() + self.dav.connect() + response = self.dav.propfind(self.davdir + remotepath, '<propfind xmlns="DAV:"><prop><resourcetype/></prop></propfind>', 1, {'Cookie':self.session_cookies}) + if (response.status == 207): ## 207 is multistatus + lines = response.read() + response.close() + self.dav.close() + return lines + else: + self.dav.close() + raise Exception("Propfind can't find the directory: " + remotepath) + + + ## for example davroot can be /repository/mom2/R06B/006/207939/inspection_files + def download(self, davroot, targetpath): + """davroot is what to retrieve, targetpath is where to put it.""" + import os + if self.get(davroot) == 404: + self.logger.info('Folder does not exist %s' % davroot) + return [] + dirs = [davroot] + files = [] + count = 0 + while dirs: + for d in dirs: + f, dirs = self.parse_propfind(self.propfind(d)) + files.extend(f) +## old repository dirs.remove(d) + + for f in files: + self.logger.debug('Retrieving file %s' % f) + path, filename = os.path.split(f.replace(davroot, '', 1)) + if not os.path.exists(targetpath + path): + os.makedirs(targetpath + path) + binfile = open(targetpath + path + '/' + filename, 'wb+') + if self.getbinary(f, binfile) == 404: + self.logger.debug('file does not exist in repository: ' + str(f)) + binfile.close() + 
+ return files diff --git a/LTA/LTAIngest/do_ltacp.py b/LTA/LTAIngest/do_ltacp.py new file mode 100755 index 0000000000000000000000000000000000000000..94db0b66df30ec0c14c3f17245c617ffea457c68 --- /dev/null +++ b/LTA/LTAIngest/do_ltacp.py @@ -0,0 +1,147 @@ +#!/usr/bin/python +# script to automatically generate NDPPP script + +import os, optparse, subprocess, sys, signal, time + +##----------------------input options---------------------------- +parser = optparse.OptionParser() +parser.add_option("-O", "--Observation", dest="Observation", + help="Observation name (L2010_12345)") +parser.add_option("-T", "--Type", dest="Type", choices = ["lse", "lce"], + help="Where to look for Observation (lse or lce)") +parser.add_option("-r", "--rundir", dest="Rundir", + help="Runtime directory (/home/renting/ltacp)") +parser.add_option("-d", "--debug", dest="Debug", default = False, + help="Verbose (more detailed output)") +parser.add_option("-S", "--Subdir", dest="Subdir", + help="Non-default sub directory (e.g. /data/scratch/pizzo instead of /data1 or /data/scratch)") +parser.add_option("-E", "--Exclude", dest="Exclude", default = [], + help="Nodes to exclude (e.g. 
1,14,19)") +parser.add_option("-L", "--Location", dest="Location", default = "/data4", + help="Where to store the files") +parser.add_option("-H", "--Host", dest="Host", choices = ["lexar001", "lexar002"], + help="Host on which to store the files") + +(options, args) = parser.parse_args() +if not options.Observation: + parser.error("Observation not set") +if not (options.Type == "lse" or options.Type == "lce"): + parser.error("Not a valid type: " + str(options.Type)) +if not options.Location: + parser.error("Location not set") +if not options.Host: + parser.error("Host not set") +if not options.Rundir: + parser.error("Runtime directory not set") + +print options + +exclude = [] +try: + if len(options.Exclude) > 0: + inputs = options.Exclude.split(',') + for i in inputs: + exclude.append(int(i)) +except ValueError: + print "Exclude values can not be parsed" + exit(-1) + +obs = options.Observation +node = options.Type +debug = options.Debug +lexar = options.Host +location = options.Location + +runtime_location = options.Rundir + +print "Processing started for " + obs + " on " + node + " nodes." 
+if debug: print "excluded nodes for searching: " + str(exclude) + +if node == "lse": + nodes = range(1,25) + locations = ["/data1", "/data2", "/data3", "/data4"] +else: + nodes = range(1,73) + locations = ['/data/scratch'] +if options.Subdir: + locations = [options.Subdir] + if debug: print "Using non-standard directory: " + options.Subdir + +##-----------------find the files ------------------------------- +def find_files(): + for n in nodes: + if n in exclude: + if debug: print "Skipping: " + str(n) + continue + else: + for location in locations: + command = ["ssh", "-T", "%s%03i" % (node, n), 'python %s/find_files.py %s' % (runtime_location, location + '/' + obs)] + file_finder = subprocess.Popen(command, stdout=subprocess.PIPE) + if file_finder.returncode: + print "error: %i" % (file_finder.return_code) + else: + output = file_finder.communicate()[0] + file_list = output.split() + for f in file_list: + files.append((n, location, f)) + sys.stdout.write('.') + sys.stdout.flush() + +files = [] +find_files() +if debug: print files +print "\nFound " + str(len(files)) + " datasets to process" +server = 0 + +#-------------------spawn------------------------ +def spawn(command): + try: + pid = os.fork() + except OSError, e: + print 'Unable to fork:' + str(e) + os._exit(1) + if pid == 0: ## the new client + os.system(command) + os._exit(0) + else: ## parent process + server = pid + +print "Starting server on " + lexar +if lexar == "lexar001": + command = "ssh -T lexar001.offline.lofar 'java -cp %s/ltacp.jar nl.astron.ltacp.server.LtaCpServer 10.178.1.1 2011 8 50 > /data4/ltacp-server.out'" % runtime_location + if debug: print command + spawn(command) +if debug: print "Server: " + str(server) +time.sleep(5) ##wait a few seconds for the server to start + +#------------processing the files----------------------- +print "processing the files" + +checksums = [] +for f in files: + command = "ssh -T %s%03i 'java -jar %s/ltacp.jar %s 2011 %s/%s/%s.tar %s/%s/%s'" % (node, f[0], 
runtime_location, lexar, location, obs, f[2], f[1], obs, f[2]) + if debug: print command + comm = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if comm.returncode: + print "error: %i" % (comm.return_code) + else: + output = comm.communicate()[0] + lines = output.split() + for l in lines: + if l[0:11] == "<checksums>": + checksums.append("%s: %s" % (f[2], l)) + sys.stdout.write('.') + sys.stdout.flush() + +outfile = open("%s/logs/%s.log" % (runtime_location, obs), 'w+') +for c in checksums: + outfile.write(c + "\n") +outfile.close() + +print "Finished with processing files" + +#os.kill(server, signal.SIGTERM) +print "Stopped server" +os.system("scp %s/logs/%s.log %s:%s/%s/" % (runtime_location, obs, lexar, location, obs)) + +print "Done" diff --git a/LTA/LTAIngest/doc/LTA-SIP.xsd b/LTA/LTAIngest/doc/LTA-SIP.xsd new file mode 100644 index 0000000000000000000000000000000000000000..993aed9c258c1b4d8b6e662d266cf218b03b45b4 --- /dev/null +++ b/LTA/LTAIngest/doc/LTA-SIP.xsd @@ -0,0 +1,1210 @@ +<?xml version="1.0" encoding="UTF-8"?> +<xs:schema targetNamespace="http://www.astron.nl/SIP-Lofar" version="2.6.0" xmlns="http://www.astron.nl/SIP-Lofar" xmlns:xs="http://www.w3.org/2001/XMLSchema"> + <xs:annotation> + <xs:documentation> + XML Schema for data model Submission Information Package LOFAR Long Term Archive + + The basic layout of a SIP is to have the Project that owns the data described, the DataProduct + that is being ingested, and the process (Observation or PipelineRun) that generated it. + If the generating process is a PipelineRun, then it will usually have input DataProducts. These + will recursively be described in the relatedDataProduct entries, with the processes that generated + them. These relatedDataProducts do not need to have been ingested into the archive themselves. 
It is + sufficient that there are described, with their related Obsrvation/Pipelines in this document to + be able to recreate the full provenance of the DataProduct. + </xs:documentation> + </xs:annotation> + <xs:annotation> + <xs:documentation>============================Generic Types============================ + + Below are generic types that are used at various places within the document. It basically contains descriptions + of units like Frequency, Length, Time, astronomical entities like Equinox, Pointing and Angle + and some useful container types like lists and indentifiers. + </xs:documentation> + </xs:annotation> + <xs:simpleType name="FrequencyUnit"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Hz"/> + <xs:enumeration value="kHz"/> + <xs:enumeration value="MHz"/> + <xs:enumeration value="GHz"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Frequency"> + <xs:simpleContent> + <xs:extension base="xs:double"> + <xs:attribute name="units" type="FrequencyUnit" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="LengthUnit"> + <xs:restriction base="xs:string"> + <xs:enumeration value="m"/> + <xs:enumeration value="km"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Length"> + <xs:simpleContent> + <xs:extension base="xs:double"> + <xs:attribute name="units" type="LengthUnit" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="TimeUnit"> + <xs:restriction base="xs:string"> + <xs:enumeration value="s"/> + <xs:enumeration value="ms"/> + <xs:enumeration value="us"/> + <xs:enumeration value="ns"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Time"> + <xs:simpleContent> + <xs:extension base="xs:double"> + <xs:attribute name="units" type="TimeUnit" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="AngleUnit"> + <xs:restriction base="xs:string"> + 
<xs:enumeration value="radians"/> + <xs:enumeration value="degrees"/> + <xs:enumeration value="arcsec"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Angle"> + <xs:simpleContent> + <xs:extension base="xs:double"> + <xs:attribute name="units" type="AngleUnit" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="PixelUnit"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Jy/beam"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Pixel"> + <xs:simpleContent> + <xs:extension base="xs:double"> + <xs:attribute name="units" type="PixelUnit" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="ListOfDouble"> + <xs:list itemType="xs:double"/> + </xs:simpleType> + <xs:simpleType name="ListOfString"> + <xs:list itemType="xs:string"/> + </xs:simpleType> + <xs:simpleType name="ListOfSubbands"> + <xs:list itemType="xs:unsignedShort"/> + </xs:simpleType> + <xs:complexType name="ListOfFrequencies"> + <xs:sequence> + <xs:element name="frequencies" type="ListOfDouble"/> + <xs:element name="unit" type="FrequencyUnit"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>Generic identifier type. 
Currently two sources are supported: MoM and SAS + </xs:documentation> + </xs:annotation> + <xs:complexType name="IdentifierType"> + <xs:sequence> + <xs:element name="source" type="xs:string"/> + <xs:element name="identifier" type="xs:string"/> + <xs:element name="name" minOccurs="0" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:simpleType name="EquinoxType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="B1950"/> + <xs:enumeration value="J2000"/> + <xs:enumeration value="SUN"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>Pointing, either RA/DEC or AZ/EL, can't be coded as a choice due to Eclipse parser limitations.</xs:documentation> + </xs:annotation> + <xs:complexType name="Pointing"> + <xs:sequence> + <xs:choice> + <xs:element name="rightAscension" type="Angle"/> + <xs:element name="azimuth" type="Angle"/> + </xs:choice> + <xs:choice> + <xs:element name="declination" type="Angle"/> + <xs:element name="altitude" type="Angle"/> + </xs:choice> + <xs:element name="equinox" type="EquinoxType"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Stations============================ + + Below is information related to Stations and AntennaFields. Stations have one or more AntennaFields, each of which is + a single sensing element within the array for most observation types. AntennaFields are the end points for baselines in + Interferometry mode, the array elements in Beam Formed mode, etc. Only Transient Buffer Mode reads the individual antennas. 
+ </xs:documentation> + </xs:annotation> + <xs:annotation> + <xs:documentation>Usually the coordinates of a station will by in x,y,z in ITRF, but we also have the option to use coordinates on a sphere.</xs:documentation> + </xs:annotation> + <xs:complexType name="Coordinates"> + <xs:sequence> + <xs:element name="coordinateSystem"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="WGS84"/> + <xs:enumeration value="ITRF2000"/> + <xs:enumeration value="ITRF2005"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:choice> + <xs:sequence> + <xs:element name="x" type="Length"/> + <xs:element name="y" type="Length"/> + <xs:element name="z" type="Length"/> + </xs:sequence> + <xs:sequence> + <xs:element name="radius" type="Length"/> + <xs:element name="longitude" type="Angle"/> + <xs:element name="latitude" type="Angle"/> + </xs:sequence> + </xs:choice> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation> + AntennaFields per station are currently either 2 (HBA/LBA) or 3 for core stations (HBA0/HBA1/LBA). + When the signals from HBA0 and HBA1 are combined, they show up as HBA in the SIP, not HBA0+HBA1. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType name="AntennaFieldType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="HBA0"/> + <xs:enumeration value="HBA1"/> + <xs:enumeration value="HBA"/> + <xs:enumeration value="LBA"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>Currently Superterp is not a separate type here.</xs:documentation> + </xs:annotation> + <xs:simpleType name="StationTypeType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Core"/> + <xs:enumeration value="Remote"/> + <xs:enumeration value="International"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="AntennaField"> + <xs:sequence> + <xs:element name="name" type="AntennaFieldType"/> + <xs:element name="location" type="Coordinates"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="Stations"> + <xs:sequence> + <xs:element maxOccurs="unbounded" name="station" type="Station"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>Currently only one (LBA/HBA/HBA0/HBA1) or two (HBA0+HBA1) antennafields can be active at the same time.</xs:documentation> + </xs:annotation> + <xs:complexType name="Station"> + <xs:sequence> + <xs:element name="name" type="xs:string"/> + <xs:element name="stationType" type="StationTypeType"/> + <xs:element minOccurs="1" maxOccurs="2" name="antennaField" type="AntennaField"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Process============================ + + Below is the generic Process class. Currently it has two subclasses: Observation and PipelineRun. + All processes are currently run in Tier0, specified in MoM/Scheduler and controlled by SAS/MAC. 
+ processIdentifier: MoM Id + observationId: SAS VIC Tree Id (Note that it's not called observationId-entifier) + strategyName/Description: Strategy template for observing/processing, known within SAS as the default template + </xs:documentation> + </xs:annotation> + <xs:annotation> + <xs:documentation>Currently only one relation type is defined (GroupID), but others are likely to be added, like Target-Calibrator, Slice. + Please note that this also can apply to Sub-Array Pointings.</xs:documentation> + </xs:annotation> + <xs:simpleType name="ProcessRelationType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="GroupID"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="ProcessRelation"> + <xs:sequence> + <xs:element name="relationType" type="ProcessRelationType"/> + <xs:element name="identifier" type="IdentifierType"/> + <xs:element minOccurs="0" name="name" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="ProcessRelations"> + <xs:sequence> + <xs:element minOccurs="0" maxOccurs="unbounded" name="relation" type="ProcessRelation"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="Process"> + <xs:sequence> + <xs:element name="processIdentifier" type="IdentifierType"/> + <xs:element name="observationId" type="IdentifierType"/> + <xs:element minOccurs="0" name="parset" type="IdentifierType"/> + <xs:element name="strategyName" type="xs:string"/> + <xs:element name="strategyDescription" type="xs:string"/> + <xs:element name="startTime" type="xs:dateTime"/> + <xs:element name="duration" type="xs:duration"/> + <xs:element name="relations" type="ProcessRelations"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Observation============================ + + Below is information related to the Observation process. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType name="FilterSelectionType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="10-70 MHz"/> + <xs:enumeration value="10-90 MHz"/> + <xs:enumeration value="30-70 MHz"/> + <xs:enumeration value="30-90 MHz"/> + <xs:enumeration value="110-190 MHz"/> + <xs:enumeration value="170-230 MHz"/> + <xs:enumeration value="210-250 MHz"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="ClockType"> + <xs:simpleContent> + <xs:restriction base="Frequency"> + <xs:enumeration value="160"/> + <xs:enumeration value="200"/> + <xs:attribute fixed="MHz" name="units" type="FrequencyUnit" use="required"/> + </xs:restriction> + </xs:simpleContent> + </xs:complexType> + <xs:simpleType name="AntennaSetType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="HBA Zero"/> + <xs:enumeration value="HBA One"/> + <xs:enumeration value="HBA Dual"/> + <xs:enumeration value="HBA Joined"/> + <xs:enumeration value="LBA Outer"/> + <xs:enumeration value="LBA Inner"/> + <xs:enumeration value="LBA Sparse Even"/> + <xs:enumeration value="LBA Sparse Odd"/> + <xs:enumeration value="LBA X"/> + <xs:enumeration value="LBA Y"/> + <xs:enumeration value="HBA Zero Inner"/> + <xs:enumeration value="HBA One Inner"/> + <xs:enumeration value="HBA Dual Inner"/> + <xs:enumeration value="HBA Joined Inner"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="StationSelectionType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Single"/> + <xs:enumeration value="Core"/> + <xs:enumeration value="Dutch"/> + <xs:enumeration value="International"/> + <xs:enumeration value="Custom"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="ObservingModeType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Interferometer"/> + <xs:enumeration value="Beam Observation"/> + <xs:enumeration value="TBB (standalone)"/> + <xs:enumeration value="TBB (piggyback)"/> + <xs:enumeration value="Direct Data 
Storage"/> + <xs:enumeration value="Non Standard"/> + <xs:enumeration value="Unknown"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="TimeSystemType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="UTC"/> + <xs:enumeration value="LST"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>Observation is one of the core classes of the SIP. It describes one of the main datagenerating processes. + The big difference with a PipelineRun process is in that an Observation has no input dataproducts as it is a + direct measurement of the physical process. + + Notes:Is unsignedShort enough for numberOftransientBufferBoardEvents? + Backward compatibility with the BlueGene: With old BG observations channelWidth and channelsPerSubband are set at Observation level and not at RealTimeProcess level. + For BG frequencyDownsamplingFactor and numberOfCollapsedChannels are set. + For Cobalt observations the reverse is true: channelWidth and channelsPerSubband are only set at RealTimeProcess level and not at Observation level. + For Cobalt frequencyDownsamplingFactor en numberOfCollapsedChannels are not set. 
+ </xs:documentation> + </xs:annotation> + <xs:complexType name="Observation"> + <xs:complexContent> + <xs:extension base="Process"> + <xs:sequence> + <xs:element name="observingMode" type="ObservingModeType"/> + <xs:element minOccurs="0" name="observationDescription" type="xs:string"/> + <xs:element name="instrumentFilter" type="FilterSelectionType"/> + <xs:element name="clock" type="ClockType"/> + <xs:element name="stationSelection" type="StationSelectionType"/> + <xs:element name="antennaSet" type="AntennaSetType"/> + <xs:element name="timeSystem" type="TimeSystemType"/> + <xs:element minOccurs="0" name="channelWidth" type="Frequency"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="channelsPerSubband" type="xs:unsignedShort"/><!--BlueGene compatibility--> + <xs:element name="numberOfStations" type="xs:unsignedByte"/> + <xs:element name="stations" type="Stations"/> + <xs:element name="numberOfSubArrayPointings" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="subArrayPointings" type="SubArrayPointings"/> + <xs:element name="numberOftransientBufferBoardEvents" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="transientBufferBoardEvents" type="TransientBufferBoardEvents"/> + <xs:element name="numberOfCorrelatedDataProducts" type="xs:unsignedShort"/> + <xs:element name="numberOfBeamFormedDataProducts" type="xs:unsignedShort"/> + <xs:element name="numberOfBitsPerSample" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="DirectDataMeasurement"> + <xs:complexContent> + <xs:extension base="Process"> + <xs:sequence> + <xs:element name="observingMode" type="ObservingModeType"/> + <xs:element name="station" type="Station"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>===================Generic/Unspecified====================== + + Please note that the difference between Generic and 
Unspecified is that the first describes a non standard process, while the second describes an unknown + process. The latter is mostly used when there are partial errors during the ingest of data into the archive. + </xs:documentation> + </xs:annotation> + <xs:complexType name="GenericMeasurement"> + <xs:complexContent> + <xs:extension base="Process"> + <xs:sequence> + <xs:element name="observingMode" type="ObservingModeType"/> + <xs:element name="description" type="xs:string"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="UnspecifiedProcess"> + <xs:complexContent> + <xs:extension base="Process"> + <xs:sequence> + <xs:element name="observingMode" type="ObservingModeType"/> + <xs:element name="description" type="xs:string"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Online Processing============================ + + This describes the various types of realtime/online processing that can happen after the data is sent from the stations to + the central processing. It's still part of the Observation. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType name="ProcessingType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Correlator"/> + <xs:enumeration value="Coherent Stokes"/> + <xs:enumeration value="Incoherent Stokes"/> + <xs:enumeration value="Fly's Eye"/> + <xs:enumeration value="Non Standard"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="MeasurementType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Test"/> + <xs:enumeration value="Tune Up"/> + <xs:enumeration value="Calibration"/> + <xs:enumeration value="Target"/> + <xs:enumeration value="All Sky"/> + <xs:enumeration value="Miscellaneous"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="Processing"> + <xs:sequence> + <xs:element minOccurs="0" name="correlator" type="Correlator"/> + <xs:element minOccurs="0" name="coherentStokes" type="CoherentStokes"/> + <xs:element minOccurs="0" name="incoherentStokes" type="IncoherentStokes"/> + <xs:element minOccurs="0" name="flysEye" type="FlysEye"/> + <xs:element minOccurs="0" name="nonStandard" type="NonStandard"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="RealTimeProcess"> + <xs:sequence> + <xs:element name="processingType" type="ProcessingType"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="Correlator"> + <xs:complexContent> + <xs:extension base="RealTimeProcess"> + <xs:sequence> + <xs:element name="integrationInterval" type="Time"/> + <xs:element minOccurs="0" name="channelWidth" type="Frequency"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="channelsPerSubband" type="xs:unsignedShort"/><!--BlueGene compatibility--> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>The CoherentStokes and IncoherentStokes do further processing on the data after the polyphase filter + on the BlueGene. 
The numberOfColapsedChannels is what is actually written to disk, the frequencyDownsamplingFactor is thus + Observation:channelsPerSubband divided by the numberOfcolapsedChannels. + There is also downsampling in time from the rawSamplingTime coming out of the polyphasefilter, usually in nanoseconds, using the timeDownsamplingFactor to + get to the samplingTime. The timeDownsamplingFactor can be quite large, with the resulting samplingtime in the miliseconds. + + Also note that within the same Observation, these settings can be different for CoherentStokes and IncoherentStokes. if both types are being generated. + </xs:documentation> + </xs:annotation> + <xs:complexType name="CoherentStokes"> + <xs:complexContent> + <xs:extension base="RealTimeProcess"> + <xs:sequence> + <xs:element name="rawSamplingTime" type="Time"/> + <xs:element name="timeDownsamplingFactor" type="xs:unsignedInt"/> + <xs:element name="samplingTime" type="Time"/> + <xs:element minOccurs="0" name="frequencyDownsamplingFactor" type="xs:unsignedShort"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="numberOfCollapsedChannels" type="xs:unsignedShort"/><!--BlueGene compatibility--> + <xs:element name="stokes" type="PolarizationType" maxOccurs="4"/> + <xs:element name="numberOfStations" type="xs:unsignedByte"/> + <xs:element name="stations" type="Stations"/> + <xs:element minOccurs="0" name="channelWidth" type="Frequency"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="channelsPerSubband" type="xs:unsignedShort"/><!--BlueGene compatibility--> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="IncoherentStokes"> + <xs:complexContent> + <xs:extension base="RealTimeProcess"> + <xs:sequence> + <xs:element name="rawSamplingTime" type="Time"/> + <xs:element name="timeDownsamplingFactor" type="xs:unsignedInt"/> + <xs:element name="samplingTime" type="Time"/> + <xs:element minOccurs="0" name="frequencyDownsamplingFactor" 
type="xs:unsignedShort"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="numberOfCollapsedChannels" type="xs:unsignedShort"/><!--BlueGene compatibility--> + <xs:element name="stokes" type="PolarizationType" maxOccurs="4"/> + <xs:element name="numberOfStations" type="xs:unsignedByte"/> + <xs:element name="stations" type="Stations"/> + <xs:element minOccurs="0" name="channelWidth" type="Frequency"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="channelsPerSubband" type="xs:unsignedShort"/><!--BlueGene compatibility--> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="FlysEye"> + <xs:complexContent> + <xs:extension base="RealTimeProcess"> + <xs:sequence> + <xs:element name="rawSamplingTime" type="Time"/> + <xs:element name="timeDownsamplingFactor" type="xs:unsignedInt"/> + <xs:element name="samplingTime" type="Time"/> + <xs:element name="stokes" type="PolarizationType" maxOccurs="4"/> + <xs:element minOccurs="0" name="channelWidth" type="Frequency"/><!--BlueGene compatibility--> + <xs:element minOccurs="0" name="channelsPerSubband" type="xs:unsignedShort"/><!--BlueGene compatibility--> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="NonStandard"> + <xs:complexContent> + <xs:extension base="RealTimeProcess"> + <xs:sequence> + <xs:element name="channelWidth" type="Frequency"/> + <xs:element name="channelsPerSubband" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="TransientBufferBoardEvents"> + <xs:sequence> + <xs:element maxOccurs="unbounded" name="transientBufferBoardEvent" type="TransientBufferBoardEvent"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="TransientBufferBoardEvent"> + <xs:sequence> + <xs:element name="eventSource" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="SubArrayPointings"> + 
<xs:sequence> + <xs:element maxOccurs="unbounded" name="subArrayPointing" type="SubArrayPointing"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>SubArrayPointing is one of the core classes of the SIP. It contains important information on + what direction the telescope is pointing and what object was the target as well as the length of time the patch + of sky was was observed. + + See the XML standard for the format of xs::duration. + </xs:documentation> + </xs:annotation> + <xs:complexType name="SubArrayPointing"> + <xs:sequence> + <xs:element name="pointing" type="Pointing"/> + <xs:element name="beamNumber" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="measurementDescription" type="xs:string"/> + <xs:element name="subArrayPointingIdentifier" type="IdentifierType"/> + <xs:element name="measurementType" type="MeasurementType"/> + <xs:element name="targetName" type="xs:string"/> + <xs:element name="startTime" type="xs:dateTime"/> + <xs:element name="duration" type="xs:duration"/> + <xs:element name="numberOfProcessing" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="processing" type="Processing"/> + <xs:element name="numberOfCorrelatedDataProducts" type="xs:unsignedShort"/> + <xs:element name="numberOfBeamFormedDataProducts" type="xs:unsignedShort"/> + <xs:element name="relations" type="ProcessRelations"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="DataSources"> + <xs:annotation> + <xs:documentation>============================Pipeline============================ + + This section describes the various pipelines. 
+ </xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="dataProductIdentifier" type="IdentifierType" maxOccurs="unbounded"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="PipelineRun"> + <xs:complexContent> + <xs:extension base="Process"> + <xs:sequence> + <xs:element name="pipelineName" type="xs:string"/> + <xs:element name="pipelineVersion" type="xs:string"/> + <xs:element name="sourceData" type="DataSources"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>This definition might not be entirely finished as the ImagingPipeline is still being worked on.</xs:documentation> + </xs:annotation> + <xs:complexType name="ImagingPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"> + <xs:sequence> + <xs:element minOccurs="0" name="frequencyIntegrationStep" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="timeIntegrationStep" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="skyModelDatabase" type="xs:string"/> + <xs:element minOccurs="0" name="demixing" type="xs:boolean"/> + <xs:element name="imagerIntegrationTime" type="Time"/> + <xs:element name="numberOfMajorCycles" type="xs:unsignedShort"/> + <xs:element name="numberOfInstrumentModels" type="xs:unsignedShort"/> + <xs:element name="numberOfCorrelatedDataProducts" type="xs:unsignedShort"/> + <xs:element name="numberOfSkyImages" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="CalibrationPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"> + <xs:sequence> + <xs:element minOccurs="0" name="frequencyIntegrationStep" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="timeIntegrationStep" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="flagAutoCorrelations" type="xs:boolean"/> + <xs:element minOccurs="0" name="demixing" type="xs:boolean"/> + <xs:element 
name="skyModelDatabase" type="xs:string"/> + <xs:element name="numberOfInstrumentModels" type="xs:unsignedShort"/> + <xs:element name="numberOfCorrelatedDataProducts" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="AveragingPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"> + <xs:sequence> + <xs:element name="frequencyIntegrationStep" type="xs:unsignedShort"/> + <xs:element name="timeIntegrationStep" type="xs:unsignedShort"/> + <xs:element name="flagAutoCorrelations" type="xs:boolean"/> + <xs:element name="demixing" type="xs:boolean"/> + <xs:element name="numberOfCorrelatedDataProducts" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>Pulsar pipeline. Which pulsars are selected for processing is a complex system</xs:documentation> + </xs:annotation> + <xs:simpleType name="PulsarSelectionType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Pulsars in observation specs, file or SAP"/><!--Default--> + <xs:enumeration value="Pulsars in observation specs"/><!--parset--> + <xs:enumeration value="Pulsar specified in dataproduct"/><!--meta--> + <xs:enumeration value="Brightest known pulsar in SAP"/><!--sapfind--> + <xs:enumeration value="Three brightest known pulsars in SAP"/><!--sapfind3--> + <xs:enumeration value="Brightest known pulsar in TAB"/><!--tabfind--> + <xs:enumeration value="Pulsars in observation specs, file and brightest in SAP and TAB"/><!--tabfind+--> + <xs:enumeration value="Specified pulsar list"/><!--given list, comma separated--> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="PulsarPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"> + <xs:sequence> + <xs:element name="pulsarSelection" type="PulsarSelectionType"/> + <xs:element name="pulsars" type="ListOfString"/> + <xs:element name="doSinglePulseAnalysis" 
type="xs:boolean"/><!--single-pulse--> + <xs:element name="convertRawTo8bit" type="xs:boolean"/><!--raw-to-8bit--> + <xs:element name="subintegrationLength" type="Time"/><!--tsubint--> + <xs:element name="skipRFIExcision" type="xs:boolean"/><!--norfi--> + <xs:element name="skipDataFolding" type="xs:boolean"/><!--nofold--> + <xs:element name="skipOptimizePulsarProfile" type="xs:boolean"/><!--nopdmp--> + <xs:element name="skipConvertRawIntoFoldedPSRFITS" type="xs:boolean"/><!--skip-dspsr--> + <xs:element name="runRotationalRAdioTransientsAnalysis" type="xs:boolean"/><!--rrats RRAT capitals on purpose--> + <xs:element name="skipDynamicSpectrum" type="xs:boolean"/><!--skip-dynamic-spectrum--> + <xs:element name="skipPreFold" type="xs:boolean"/><!--skip-prefold--> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="CosmicRayPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"/> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="LongBaselinePipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"> + <xs:sequence> + <xs:element name="subbandsPerSubbandGroup" type="xs:unsignedShort"/> + <xs:element name="subbandGroupsPerMS" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="GenericPipeline"> + <xs:complexContent> + <xs:extension base="PipelineRun"/> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================DataProduct============================ + + This section describes the dataproducts. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType name="DataProductType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Correlator data"/> + <xs:enumeration value="Beam Formed data"/> + <xs:enumeration value="Transient Buffer Board data"/> + <xs:enumeration value="Sky Image"/> + <xs:enumeration value="Pixel Map"/> + <xs:enumeration value="Direct Data Storage data"/> + <xs:enumeration value="Dynamic Spectra data"/> + <xs:enumeration value="Instrument Model"/> + <xs:enumeration value="Sky Model"/> + <xs:enumeration value="Pulsar pipeline output"/> + <xs:enumeration value="Pulsar pipeline summary output"/> + <xs:enumeration value="Non Standard"/> + <xs:enumeration value="Unknown"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="ChecksumAlgorithm"> + <xs:restriction base="xs:string"> + <xs:enumeration value="MD5"/> + <xs:enumeration value="Adler32"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="ChecksumType"> + <xs:sequence> + <xs:element name="algorithm" type="ChecksumAlgorithm"/> + <xs:element name="value" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>We plan to support three types of file formats currently in the LTA.</xs:documentation> + </xs:annotation> + <xs:simpleType name="FileFormatType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="FITS"/> + <xs:enumeration value="AIPS++/CASA"/> + <xs:enumeration value="HDF5"/> + <xs:enumeration value="PULP"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>This very well defined yet! 
type probably needs to be an enumeration</xs:documentation> + </xs:annotation> + <xs:complexType name="TBBTrigger"> + <xs:sequence> + <xs:element name="type" type="xs:string"/> + <xs:element name="value" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>From AIPS++/CASA: None=0,I=1,Q=2,U=3,V=4,RR=5,RL=6,LR=7,LL=8,XX=9,XY=10,YX=11,YY=12</xs:documentation> + </xs:annotation> + <xs:simpleType name="PolarizationType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="None"/> + <xs:enumeration value="I"/> + <xs:enumeration value="Q"/> + <xs:enumeration value="U"/> + <xs:enumeration value="V"/> + <xs:enumeration value="RR"/> + <xs:enumeration value="RL"/> + <xs:enumeration value="LR"/> + <xs:enumeration value="LL"/> + <xs:enumeration value="XX"/> + <xs:enumeration value="XY"/> + <xs:enumeration value="YX"/> + <xs:enumeration value="YY"/> + <xs:enumeration value="Xre"/> + <xs:enumeration value="Xim"/> + <xs:enumeration value="Yre"/> + <xs:enumeration value="Yim"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>Base class of the various DataProducts</xs:documentation> + </xs:annotation> + <xs:complexType name="DataProduct"> + <xs:sequence> + <xs:element name="dataProductType" type="DataProductType"/> + <xs:element name="dataProductIdentifier" type="IdentifierType"/> + <xs:element minOccurs="0" name="storageTicket" type="xs:string"/> + <xs:element name="size" type="xs:unsignedLong"/><!--Bytes--> + <xs:element maxOccurs="unbounded" minOccurs="0" name="checksum" type="ChecksumType"/> + <xs:element name="fileName" type="xs:string"/> + <xs:element name="fileFormat" type="FileFormatType"/> + <xs:element name="processIdentifier" type="IdentifierType"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>======================Interferometer=========================== + + subArrayPointingIdentifier (also known as MeasurementIdentifier within MoM) is not an optional parameter 
as the + LTA catalog needs it because it can't otherwise find this information because of limits in the PipelineRun model. + See the XML standard for the format of xs:duration. + </xs:documentation> + </xs:annotation> + <xs:complexType name="CorrelatedDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="subArrayPointingIdentifier" type="IdentifierType"/> + <xs:element name="subband" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="stationSubband" type="xs:unsignedShort"/> + <xs:element name="startTime" type="xs:dateTime"/> + <xs:element name="duration" type="xs:duration"/> + <xs:element name="integrationInterval" type="Time"/> + <xs:element name="centralFrequency" type="Frequency"/> + <xs:element name="channelWidth" type="Frequency"/> + <xs:element name="channelsPerSubband" type="xs:unsignedShort"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>This currently describes the ParmDB. No fields are defined because in the model it functions + as a link between Calibrator and Target observations and doesn't contain any searchable metadata at the moment. + </xs:documentation> + </xs:annotation> + <xs:complexType name="InstrumentModelDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"/> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>This currently describes the SourceDB. No fields are defined because in the model it functions + as a link between Calibrator and Target observations and doesn't contain any searchable metadata at the moment. 
+ </xs:documentation> + </xs:annotation> + <xs:complexType name="SkyModelDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"/> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>=====================TransientBufferBoard===================== + + The timeStamp contains the exact number of seconds from Observation::startTime + </xs:documentation> + </xs:annotation> + <xs:complexType name="TransientBufferBoardDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="numberOfSamples" type="xs:unsignedInt"/> + <xs:element name="timeStamp" type="xs:unsignedInt"/> + <xs:element name="triggerParameters" type="TBBTrigger"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>=====================BeamFormed===================== + + This section describes BeamFormed dataproducts. The important part is that this is where all the individual ArrayBeams are + described, the actual BeamFormedDataProduct is basically just a container. 
+ </xs:documentation> + </xs:annotation> + <xs:complexType name="ArrayBeams"> + <xs:sequence> + <xs:element maxOccurs="unbounded" name="arrayBeam" type="ArrayBeam"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>SamplingTime is the duration of a single sample usually in ms or ns.</xs:documentation> + </xs:annotation> + <xs:complexType name="ArrayBeam"> + <xs:sequence> + <xs:element name="subArrayPointingIdentifier" type="IdentifierType"/> + <xs:element name="beamNumber" type="xs:unsignedShort"/> + <xs:element name="dispersionMeasure" type="xs:double"/> + <xs:element name="numberOfSubbands" type="xs:unsignedShort"/> + <xs:element name="stationSubbands" type="ListOfSubbands"/> + <xs:element name="samplingTime" type="Time"/> + <xs:element name="centralFrequencies" type="ListOfFrequencies"/> + <xs:element name="channelWidth" type="Frequency"/> + <xs:element name="channelsPerSubband" type="xs:unsignedShort"/> + <xs:element name="stokes" type="PolarizationType" maxOccurs="4"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>The pointing is the actual pointing of the ArrayBeam + The offset is the difference in the pointing of the ArrayBeam and the SubArrayPointing + </xs:documentation> + </xs:annotation> + <xs:complexType name="CoherentStokesBeam"> + <xs:complexContent> + <xs:extension base="ArrayBeam"> + <xs:sequence> + <xs:element name="pointing" type="Pointing"/> + <xs:element name="offset" type="Pointing"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="IncoherentStokesBeam"> + <xs:complexContent> + <xs:extension base="ArrayBeam"> + <xs:sequence/> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="FlysEyeBeam"> + <xs:complexContent> + <xs:extension base="ArrayBeam"> + <xs:sequence> + <xs:element name="station" type="Station"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType 
name="BeamFormedDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="numberOfBeams" type="xs:unsignedShort"/> + <xs:element minOccurs="0" name="beams" type="ArrayBeams"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>=====================Pulsar Pipeline Dataproducts===================== + + PULP is a name for the output of the pulsar pipeline, it contains a list of small files. + It also contains pointers back to the observation/SAP/beam that the raw data came out of. + </xs:documentation> + </xs:annotation> + <xs:simpleType name="PulsarPipelineDataType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="CoherentStokes"/> + <xs:enumeration value="IncoherentStokes"/> + <xs:enumeration value="ComplexVoltages"/> + <xs:enumeration value="SummaryCoherentStokes"/> + <xs:enumeration value="SummaryIncoherentStokes"/> + <xs:enumeration value="SummaryComplexVoltages"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="PulpSummaryDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="fileContent" type="ListOfString"/> + <xs:element name="dataType" type="PulsarPipelineDataType"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="PulpDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="fileContent" type="ListOfString"/> + <xs:element name="dataType" type="PulsarPipelineDataType"/> + <xs:element name="arrayBeam" type="ArrayBeam"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>===================Generic/Unspecified====================== + + Please note that the difference between Generic and Unspecified is that the first describes a non standard dataproduct, while the second describes an unknown + 
dataproduct. The latter is mostly used when there are partial errors during the ingest of data into the archive. + </xs:documentation> + </xs:annotation> + <xs:complexType name="GenericDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"/> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="UnspecifiedDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"/> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>===================Images====================== + + Below are structures related to image type dataproducts. The main dataproduct here is the SkyImage, which contains three levels + of data. At the top there is the general information, below that there are coordinate types and those have one or more axes. + </xs:documentation> + </xs:annotation> + <xs:complexType name="Axis"> + <xs:sequence> + <xs:element name="number" type="xs:unsignedShort"/> + <xs:element name="name" type="xs:string"/> + <xs:element name="units" type="xs:string"/> + <xs:element name="length" type="xs:unsignedInt"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="LinearAxis"> + <xs:complexContent> + <xs:extension base="Axis"> + <xs:sequence> + <xs:element name="increment" type="xs:double"/> + <xs:element name="referencePixel" type="xs:double"/> + <xs:element name="referenceValue" type="xs:double"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="TabularAxis"> + <xs:complexContent> + <xs:extension base="Axis"> + <xs:sequence/> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="Coordinate"> + <xs:sequence/> + </xs:complexType> + <xs:simpleType name="RaDecSystem"> + <xs:restriction base="xs:string"> + <xs:enumeration value="ICRS"/> + <xs:enumeration value="FK5"/> + <xs:enumeration value="FK4"/> + <xs:enumeration value="FK4-NO-E"/> + <xs:enumeration value="GAPPT"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType 
name="LocationFrame"> + <xs:restriction base="xs:string"> + <xs:enumeration value="GEOCENTER"/> + <xs:enumeration value="BARYCENTER"/> + <xs:enumeration value="HELIOCENTER"/> + <xs:enumeration value="TOPOCENTER"/> + <xs:enumeration value="LSRK"/> + <xs:enumeration value="LSRD"/> + <xs:enumeration value="GALACTIC"/> + <xs:enumeration value="LOCAL_GROUP"/> + <xs:enumeration value="RELOCATABLE"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>The DirectionCoordinate defines the RA and DEC axes and their projection on the celestial sphere.</xs:documentation> + </xs:annotation> + <xs:complexType name="DirectionCoordinate"> + <xs:complexContent> + <xs:extension base="Coordinate"> + <xs:sequence> + <xs:element minOccurs="2" maxOccurs="2" name="directionLinearAxis" type="LinearAxis"/> + <xs:element name="PC0_0" type="xs:double"/> + <xs:element name="PC0_1" type="xs:double"/> + <xs:element name="PC1_0" type="xs:double"/> + <xs:element name="PC1_1" type="xs:double"/> + <xs:element name="equinox" type="xs:string"/> + <xs:element name="raDecSystem" type="RaDecSystem"/> + <xs:element name="projection" type="xs:string"/> + <xs:element name="projectionParameters" type="ListOfDouble"/> + <xs:element name="longitudePole" type="Angle"/> + <xs:element name="latitudePole" type="Angle"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:simpleType name="SpectralQuantityType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="Frequency"/> + <xs:enumeration value="Energy"/> + <xs:enumeration value="Wavenumber"/> + <xs:enumeration value="VelocityRadio"/> + <xs:enumeration value="VelocityOptical"/> + <xs:enumeration value="VelocityAppRadial"/> + <xs:enumeration value="Redshift"/> + <xs:enumeration value="WaveLengthVacuum"/> + <xs:enumeration value="WaveLengthAir"/> + <xs:enumeration value="BetaFactor"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="SpectralQuantity"> + <xs:sequence> + <xs:element 
name="type" type="SpectralQuantityType"/> + <xs:element name="value" type="xs:double"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="SpectralCoordinate"> + <xs:complexContent> + <xs:extension base="Coordinate"> + <xs:sequence> + <xs:choice> + <xs:element name="spectralLinearAxis" type="LinearAxis"/> + <xs:element name="spectralTabularAxis" type="TabularAxis"/> + </xs:choice> + <xs:element name="spectralQuantity" type="SpectralQuantity"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="TimeCoordinate"> + <xs:complexContent> + <xs:extension base="Coordinate"> + <xs:sequence> + <xs:choice> + <xs:element name="timeLinearAxis" type="LinearAxis"/> + <xs:element name="timeTabularAxis" type="TabularAxis"/> + </xs:choice> + <xs:element name="equinox" type="EquinoxType"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:complexType name="PolarizationCoordinate"> + <xs:complexContent> + <xs:extension base="Coordinate"> + <xs:sequence> + <xs:element name="polarizationTabularAxis" type="TabularAxis"/> + <xs:element maxOccurs="4" name="polarization" type="PolarizationType"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>PixelMap is a generic base class. Currently only SkyImage is derived from it, but in the future others will be, like DynamicSpectra + The limit of 999 is based on the limits in the FITS standard. In practice it's not likely to be reached. 
+ </xs:documentation> + </xs:annotation> + <xs:complexType name="PixelMapDataProduct"> + <xs:complexContent> + <xs:extension base="DataProduct"> + <xs:sequence> + <xs:element name="numberOfAxes" type="xs:unsignedShort"/> + <xs:element name="numberOfCoordinates" type="xs:unsignedShort"/> + <xs:element maxOccurs="999" name="coordinate" type="Coordinate"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>The SkyImage has two pointings: One for the actual image, encoded in the axes of + the DirectionCoordinate, and an observationPointing, which is the direction the telescope was actually facing. This need not + be the same although it often will be. + + Usually a SkyImage will have one DirectionCoordinate, one PolarizationCoordinate and one SpectralCoordinate. + </xs:documentation> + </xs:annotation> + <xs:complexType name="SkyImageDataProduct"> + <xs:complexContent> + <xs:extension base="PixelMapDataProduct"> + <xs:sequence> + <xs:element name="locationFrame" type="LocationFrame"/> + <xs:element name="timeFrame" type="xs:string"/> + <xs:element name="observationPointing" type="Pointing"/> + <xs:element name="restoringBeamMajor" type="Angle"/> + <xs:element name="restoringBeamMinor" type="Angle"/> + <xs:element name="rmsNoise" type="Pixel"/> + </xs:sequence> + </xs:extension> + </xs:complexContent> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Parset============================ + + This section describes the Parset. It's an optional section, given that not all processes that will be run to generate data might have + been run from SAS/MAC in the future. At the moment practically all processes should have a parset as we only run stuff on CEP/Tier 0. 
+ </xs:documentation> + </xs:annotation> + <xs:complexType name="Parset"> + <xs:sequence> + <xs:element name="identifier" type="IdentifierType"/> + <xs:element name="contents" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================Project============================ + + This section describes Project information. + </xs:documentation> + </xs:annotation> + <xs:simpleType name="Telescope"> + <xs:restriction base="xs:string"> + <xs:enumeration value="LOFAR"/> + </xs:restriction> + </xs:simpleType> + <xs:annotation> + <xs:documentation>The assumption is that all processes leading to the dataproduct are in the same project or public. + A dataproduct that would be created from non-public data from different projects is not modelled in the archive.</xs:documentation> + </xs:annotation> + <xs:complexType name="Project"> + <xs:sequence> + <xs:element name="projectCode" type="xs:string"/> + <xs:element name="primaryInvestigator" type="xs:string"/> + <xs:element name="coInvestigator" type="xs:string" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="contactAuthor" type="xs:string"/> + <xs:element name="telescope" type="Telescope"/> + <xs:element name="projectDescription" type="xs:string"/> + </xs:sequence> + </xs:complexType> + <xs:annotation> + <xs:documentation>============================LTASip root element============================ + + This is the root of the LTA SIP. It should have at least one Observation or PipelineRun, describing the process that generated the + dataProduct. 
+ </xs:documentation> + </xs:annotation> + <xs:element name="ltaSip" type="LTASip"/> + <xs:complexType name="LTASip"> + <xs:sequence> + <xs:element name="sipGeneratorVersion" type="xs:string"/> + <xs:element name="project" type="Project"/> + <xs:element name="dataProduct" type="DataProduct"/> + <xs:element maxOccurs="unbounded" minOccurs="0" name="observation" type="Observation"/> + <xs:element maxOccurs="unbounded" minOccurs="0" name="pipelineRun" type="PipelineRun"/> + <xs:element maxOccurs="unbounded" minOccurs="0" name="unspecifiedProcess" type="UnspecifiedProcess"/> + <xs:element name="relatedDataProduct" type="DataProduct" minOccurs="0" maxOccurs="unbounded"/> + <xs:element maxOccurs="unbounded" minOccurs="0" name="parset" type="Parset"/> + </xs:sequence> + </xs:complexType> +</xs:schema> diff --git a/LTA/LTAIngest/example.job b/LTA/LTAIngest/example.job new file mode 100644 index 0000000000000000000000000000000000000000..b0c9108bd02c2b1eaf867c678ec1f3e850666505 --- /dev/null +++ b/LTA/LTAIngest/example.job @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<exportjob exportID="A_114_114_12600_352399_L6509_SAP000_SB000_uv.MS"> + <scriptname>IngestPipeline</scriptname> + <input name="DataProduct">L6509_SAP000_SB000_uv.MS</input> + <input name="Project">test-lofar</input> + <input name="JobId">A_114_114_12600_352399_L6509_SAP000_SB000_uv.MS</input> + <input name="MomId">352399</input> + <input name="ObservationId">6509</input> + <input name="Subband">0</input> + <input name="Location">lexar002:/data2/L6509/L6509_SAP000_SB000_uv.MS</input> + <repository> + <server>webdav_lofartest_repository2</server> + <resultdir>/mom2/test-lofar/19104/ArchiveLogs</resultdir> + </repository> +</exportjob> diff --git a/LTA/LTAIngest/find_files.py b/LTA/LTAIngest/find_files.py new file mode 100644 index 0000000000000000000000000000000000000000..33d84cc7844a394a69961e37321f1a8b5656d41a --- /dev/null +++ b/LTA/LTAIngest/find_files.py @@ -0,0 +1,10 @@ +import os, sys +path = 
sys.argv[1] +if os.path.isdir(path): + files = os.listdir(path) + for f in files: + if f[-5:] == '.dppp' or f[-3:] == '.MS' or f[-4:] == '.dp3': + print f +else: + exit(1) +exit(0) \ No newline at end of file diff --git a/LTA/LTAIngest/fpconst-0.7.0/PKG-INFO b/LTA/LTAIngest/fpconst-0.7.0/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..b623f6fbdbeceee0cb1d4510c410dfc3952ccf85 --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/PKG-INFO @@ -0,0 +1,31 @@ +Metadata-Version: 1.0 +Name: fpconst +Version: 0.7.0 +Summary: Utilities for handling IEEE 754 floating point special values +Home-page: http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/ +Author: Gregory Warnes +Author-email: gregory_r_warnes@groton.pfizer.com +License: UNKNOWN +Description: Utilities for handling IEEE 754 floating point special values + + This python module implements constants and functions for working with + IEEE754 double-precision special values. It provides constants for + Not-a-Number (NaN), Positive Infinity (PosInf), and Negative Infinity + (NegInf), as well as functions to test for these values. + + The code is implemented in pure python by taking advantage of the + 'struct' standard module. Care has been taken to generate proper + results on both big-endian and little-endian machines. Some efficiency + could be gained by translating the core routines into C. + + See <http://babbage.cs.qc.edu/courses/cs341/IEEE-754references.html> + for reference material on the IEEE 754 floating point standard. + + Further information on this package is available at + <http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/>. + + Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> + Date:: 2003-04-08 + Copyright: (c) 2003, Pfizer, Inc. 
+ +Platform: UNKNOWN diff --git a/LTA/LTAIngest/fpconst-0.7.0/README b/LTA/LTAIngest/fpconst-0.7.0/README new file mode 100644 index 0000000000000000000000000000000000000000..3c27c7c8e5d85a6460b309c2ed8ab4c5cd610e4c --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/README @@ -0,0 +1,53 @@ +------------------------------------------------------------------ +fpconst: A Python module for handling IEEE 754 floating point special + values +------------------------------------------------------------------ +Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> +Date:: 2003-04-08 +Version 0.6.0 +Copyright: (c) 2003 Pfizer, Inc +------------------------------------------------------------------ + +This module provides 'fpconst' a reference implementation of Python +Enhancement Protocol (PEP) 754, "IEEE 754 Floating Point Special +Values". See the file pep-0754.txt or +http://www.python.org/peps/pep-0754.html for the text of PEP 754. + +INSTALLATION + + This is a standard Python source package. Consequently, simply unpack + the source distribution, change into the diretory, and run + + $ Python setup.py install + +LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + Neither the name of Pfizer, Inc. nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------------- diff --git a/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/PKG-INFO b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..b623f6fbdbeceee0cb1d4510c410dfc3952ccf85 --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/PKG-INFO @@ -0,0 +1,31 @@ +Metadata-Version: 1.0 +Name: fpconst +Version: 0.7.0 +Summary: Utilities for handling IEEE 754 floating point special values +Home-page: http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/ +Author: Gregory Warnes +Author-email: gregory_r_warnes@groton.pfizer.com +License: UNKNOWN +Description: Utilities for handling IEEE 754 floating point special values + + This python module implements constants and functions for working with + IEEE754 double-precision special values. It provides constants for + Not-a-Number (NaN), Positive Infinity (PosInf), and Negative Infinity + (NegInf), as well as functions to test for these values. + + The code is implemented in pure python by taking advantage of the + 'struct' standard module. 
Care has been taken to generate proper + results on both big-endian and little-endian machines. Some efficiency + could be gained by translating the core routines into C. + + See <http://babbage.cs.qc.edu/courses/cs341/IEEE-754references.html> + for reference material on the IEEE 754 floating point standard. + + Further information on this package is available at + <http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/>. + + Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> + Date:: 2003-04-08 + Copyright: (c) 2003, Pfizer, Inc. + +Platform: UNKNOWN diff --git a/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/README b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/README new file mode 100644 index 0000000000000000000000000000000000000000..3c27c7c8e5d85a6460b309c2ed8ab4c5cd610e4c --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/README @@ -0,0 +1,53 @@ +------------------------------------------------------------------ +fpconst: A Python module for handling IEEE 754 floating point special + values +------------------------------------------------------------------ +Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> +Date:: 2003-04-08 +Version 0.6.0 +Copyright: (c) 2003 Pfizer, Inc +------------------------------------------------------------------ + +This module provides 'fpconst' a reference implementation of Python +Enhancement Protocol (PEP) 754, "IEEE 754 Floating Point Special +Values". See the file pep-0754.txt or +http://www.python.org/peps/pep-0754.html for the text of PEP 754. + +INSTALLATION + + This is a standard Python source package. 
Consequently, simply unpack + the source distribution, change into the diretory, and run + + $ Python setup.py install + +LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + Neither the name of Pfizer, Inc. nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +---------------------------------------------------------------------------- diff --git a/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/fpconst.py b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/fpconst.py new file mode 100644 index 0000000000000000000000000000000000000000..286eb81187262a4700574e7c3cf260a8f6795844 --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/fpconst.py @@ -0,0 +1,163 @@ +"""Utilities for handling IEEE 754 floating point special values + +This python module implements constants and functions for working with +IEEE754 double-precision special values. It provides constants for +Not-a-Number (NaN), Positive Infinity (PosInf), and Negative Infinity +(NegInf), as well as functions to test for these values. + +The code is implemented in pure python by taking advantage of the +'struct' standard module. Care has been taken to generate proper +results on both big-endian and little-endian machines. Some efficiency +could be gained by translating the core routines into C. + +See <http://babbage.cs.qc.edu/courses/cs341/IEEE-754references.html> +for reference material on the IEEE 754 floating point standard. + +Further information on this package is available at +<http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/>. + +Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> +Date:: 2003-04-08 +Copyright: (c) 2003, Pfizer, Inc. 
+""" + +__version__ = "0.7.0" +ident = "$Id$" + +import struct, operator + +# check endianess +_big_endian = struct.pack('i',1)[0] != '\x01' + +# and define appropriate constants +if(_big_endian): + NaN = struct.unpack('d', '\x7F\xF8\x00\x00\x00\x00\x00\x00')[0] + PosInf = struct.unpack('d', '\x7F\xF0\x00\x00\x00\x00\x00\x00')[0] + NegInf = -PosInf +else: + NaN = struct.unpack('d', '\x00\x00\x00\x00\x00\x00\xf8\xff')[0] + PosInf = struct.unpack('d', '\x00\x00\x00\x00\x00\x00\xf0\x7f')[0] + NegInf = -PosInf + +def _double_as_bytes(dval): + "Use struct.unpack to decode a double precision float into eight bytes" + tmp = list(struct.unpack('8B',struct.pack('d', dval))) + if not _big_endian: + tmp.reverse() + return tmp + +## +## Functions to extract components of the IEEE 754 floating point format +## + +def _sign(dval): + "Extract the sign bit from a double-precision floating point value" + bb = _double_as_bytes(dval) + return bb[0] >> 7 & 0x01 + +def _exponent(dval): + """Extract the exponentent bits from a double-precision floating + point value. + + Note that for normalized values, the exponent bits have an offset + of 1023. As a consequence, the actual exponentent is obtained + by subtracting 1023 from the value returned by this function + """ + bb = _double_as_bytes(dval) + return (bb[0] << 4 | bb[1] >> 4) & 0x7ff + +def _mantissa(dval): + """Extract the _mantissa bits from a double-precision floating + point value.""" + + bb = _double_as_bytes(dval) + mantissa = bb[1] & 0x0f << 48 + mantissa += bb[2] << 40 + mantissa += bb[3] << 32 + mantissa += bb[4] + return mantissa + +def _zero_mantissa(dval): + """Determine whether the mantissa bits of the given double are all + zero.""" + bb = _double_as_bytes(dval) + return ((bb[1] & 0x0f) | reduce(operator.or_, bb[2:])) == 0 + +## +## Functions to test for IEEE 754 special values +## + +def isNaN(value): + "Determine if the argument is a IEEE 754 NaN (Not a Number) value." 
+ return (_exponent(value)==0x7ff and not _zero_mantissa(value)) + +def isInf(value): + """Determine if the argument is an infinite IEEE 754 value (positive + or negative inifinity)""" + return (_exponent(value)==0x7ff and _zero_mantissa(value)) + +def isFinite(value): + """Determine if the argument is an finite IEEE 754 value (i.e., is + not NaN, positive or negative inifinity)""" + return (_exponent(value)!=0x7ff) + +def isPosInf(value): + "Determine if the argument is a IEEE 754 positive infinity value" + return (_sign(value)==0 and _exponent(value)==0x7ff and \ + _zero_mantissa(value)) + +def isNegInf(value): + "Determine if the argument is a IEEE 754 negative infinity value" + return (_sign(value)==1 and _exponent(value)==0x7ff and \ + _zero_mantissa(value)) + +## +## Functions to test public functions. +## + +def test_isNaN(): + assert( not isNaN(PosInf) ) + assert( not isNaN(NegInf) ) + assert( isNaN(NaN ) ) + assert( not isNaN( 1.0) ) + assert( not isNaN( -1.0) ) + +def test_isInf(): + assert( isInf(PosInf) ) + assert( isInf(NegInf) ) + assert( not isInf(NaN ) ) + assert( not isInf( 1.0) ) + assert( not isInf( -1.0) ) + +def test_isFinite(): + assert( not isFinite(PosInf) ) + assert( not isFinite(NegInf) ) + assert( not isFinite(NaN ) ) + assert( isFinite( 1.0) ) + assert( isFinite( -1.0) ) + +def test_isPosInf(): + assert( isPosInf(PosInf) ) + assert( not isPosInf(NegInf) ) + assert( not isPosInf(NaN ) ) + assert( not isPosInf( 1.0) ) + assert( not isPosInf( -1.0) ) + +def test_isNegInf(): + assert( not isNegInf(PosInf) ) + assert( isNegInf(NegInf) ) + assert( not isNegInf(NaN ) ) + assert( not isNegInf( 1.0) ) + assert( not isNegInf( -1.0) ) + +# overall test +def test(): + test_isNaN() + test_isInf() + test_isFinite() + test_isPosInf() + test_isNegInf() + +if __name__ == "__main__": + test() + diff --git a/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/setup.py b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/setup.py new file mode 100644 index 
0000000000000000000000000000000000000000..ba197ba9782098f553693cdb6886034f4ba5426e --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/fpconst-0.7.0/setup.py @@ -0,0 +1,16 @@ +from distutils.core import setup + +url="http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/" + +import fpconst + +setup(name="fpconst", + version=fpconst.__version__, + description="Utilities for handling IEEE 754 floating point special values", + author="Gregory Warnes", + author_email="gregory_r_warnes@groton.pfizer.com", + url = url, + long_description=fpconst.__doc__, + py_modules=['fpconst'] + ) + diff --git a/LTA/LTAIngest/fpconst-0.7.0/fpconst.py b/LTA/LTAIngest/fpconst-0.7.0/fpconst.py new file mode 100644 index 0000000000000000000000000000000000000000..286eb81187262a4700574e7c3cf260a8f6795844 --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/fpconst.py @@ -0,0 +1,163 @@ +"""Utilities for handling IEEE 754 floating point special values + +This python module implements constants and functions for working with +IEEE754 double-precision special values. It provides constants for +Not-a-Number (NaN), Positive Infinity (PosInf), and Negative Infinity +(NegInf), as well as functions to test for these values. + +The code is implemented in pure python by taking advantage of the +'struct' standard module. Care has been taken to generate proper +results on both big-endian and little-endian machines. Some efficiency +could be gained by translating the core routines into C. + +See <http://babbage.cs.qc.edu/courses/cs341/IEEE-754references.html> +for reference material on the IEEE 754 floating point standard. + +Further information on this package is available at +<http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/>. + +Author: Gregory R. Warnes <gregory_r_warnes@groton.pfizer.com> +Date:: 2003-04-08 +Copyright: (c) 2003, Pfizer, Inc. 
+""" + +__version__ = "0.7.0" +ident = "$Id$" + +import struct, operator + +# check endianess +_big_endian = struct.pack('i',1)[0] != '\x01' + +# and define appropriate constants +if(_big_endian): + NaN = struct.unpack('d', '\x7F\xF8\x00\x00\x00\x00\x00\x00')[0] + PosInf = struct.unpack('d', '\x7F\xF0\x00\x00\x00\x00\x00\x00')[0] + NegInf = -PosInf +else: + NaN = struct.unpack('d', '\x00\x00\x00\x00\x00\x00\xf8\xff')[0] + PosInf = struct.unpack('d', '\x00\x00\x00\x00\x00\x00\xf0\x7f')[0] + NegInf = -PosInf + +def _double_as_bytes(dval): + "Use struct.unpack to decode a double precision float into eight bytes" + tmp = list(struct.unpack('8B',struct.pack('d', dval))) + if not _big_endian: + tmp.reverse() + return tmp + +## +## Functions to extract components of the IEEE 754 floating point format +## + +def _sign(dval): + "Extract the sign bit from a double-precision floating point value" + bb = _double_as_bytes(dval) + return bb[0] >> 7 & 0x01 + +def _exponent(dval): + """Extract the exponentent bits from a double-precision floating + point value. + + Note that for normalized values, the exponent bits have an offset + of 1023. As a consequence, the actual exponentent is obtained + by subtracting 1023 from the value returned by this function + """ + bb = _double_as_bytes(dval) + return (bb[0] << 4 | bb[1] >> 4) & 0x7ff + +def _mantissa(dval): + """Extract the _mantissa bits from a double-precision floating + point value.""" + + bb = _double_as_bytes(dval) + mantissa = bb[1] & 0x0f << 48 + mantissa += bb[2] << 40 + mantissa += bb[3] << 32 + mantissa += bb[4] + return mantissa + +def _zero_mantissa(dval): + """Determine whether the mantissa bits of the given double are all + zero.""" + bb = _double_as_bytes(dval) + return ((bb[1] & 0x0f) | reduce(operator.or_, bb[2:])) == 0 + +## +## Functions to test for IEEE 754 special values +## + +def isNaN(value): + "Determine if the argument is a IEEE 754 NaN (Not a Number) value." 
+ return (_exponent(value)==0x7ff and not _zero_mantissa(value)) + +def isInf(value): + """Determine if the argument is an infinite IEEE 754 value (positive + or negative inifinity)""" + return (_exponent(value)==0x7ff and _zero_mantissa(value)) + +def isFinite(value): + """Determine if the argument is an finite IEEE 754 value (i.e., is + not NaN, positive or negative inifinity)""" + return (_exponent(value)!=0x7ff) + +def isPosInf(value): + "Determine if the argument is a IEEE 754 positive infinity value" + return (_sign(value)==0 and _exponent(value)==0x7ff and \ + _zero_mantissa(value)) + +def isNegInf(value): + "Determine if the argument is a IEEE 754 negative infinity value" + return (_sign(value)==1 and _exponent(value)==0x7ff and \ + _zero_mantissa(value)) + +## +## Functions to test public functions. +## + +def test_isNaN(): + assert( not isNaN(PosInf) ) + assert( not isNaN(NegInf) ) + assert( isNaN(NaN ) ) + assert( not isNaN( 1.0) ) + assert( not isNaN( -1.0) ) + +def test_isInf(): + assert( isInf(PosInf) ) + assert( isInf(NegInf) ) + assert( not isInf(NaN ) ) + assert( not isInf( 1.0) ) + assert( not isInf( -1.0) ) + +def test_isFinite(): + assert( not isFinite(PosInf) ) + assert( not isFinite(NegInf) ) + assert( not isFinite(NaN ) ) + assert( isFinite( 1.0) ) + assert( isFinite( -1.0) ) + +def test_isPosInf(): + assert( isPosInf(PosInf) ) + assert( not isPosInf(NegInf) ) + assert( not isPosInf(NaN ) ) + assert( not isPosInf( 1.0) ) + assert( not isPosInf( -1.0) ) + +def test_isNegInf(): + assert( not isNegInf(PosInf) ) + assert( isNegInf(NegInf) ) + assert( not isNegInf(NaN ) ) + assert( not isNegInf( 1.0) ) + assert( not isNegInf( -1.0) ) + +# overall test +def test(): + test_isNaN() + test_isInf() + test_isFinite() + test_isPosInf() + test_isNegInf() + +if __name__ == "__main__": + test() + diff --git a/LTA/LTAIngest/fpconst-0.7.0/setup.py b/LTA/LTAIngest/fpconst-0.7.0/setup.py new file mode 100644 index 
0000000000000000000000000000000000000000..ba197ba9782098f553693cdb6886034f4ba5426e --- /dev/null +++ b/LTA/LTAIngest/fpconst-0.7.0/setup.py @@ -0,0 +1,16 @@ +from distutils.core import setup + +url="http://www.analytics.washington.edu/statcomp/projects/rzope/fpconst/" + +import fpconst + +setup(name="fpconst", + version=fpconst.__version__, + description="Utilities for handling IEEE 754 floating point special values", + author="Gregory Warnes", + author_email="gregory_r_warnes@groton.pfizer.com", + url = url, + long_description=fpconst.__doc__, + py_modules=['fpconst'] + ) + diff --git a/LTA/LTAIngest/h5_check.py b/LTA/LTAIngest/h5_check.py new file mode 100644 index 0000000000000000000000000000000000000000..ac5e2fe4b6936d590fa6cbd71fac824a611e017c --- /dev/null +++ b/LTA/LTAIngest/h5_check.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# Script that prints all BF-raw files of a h5 meta data file of +# a BF observation +# +# File: bfexternalfiles.py +# Author: Sven Duscha (duscha_at_astron.nl) +# Date: 2012-06-17 +# Last change: 2012-06-17 + + +import sys +import DAL + + +filename=sys.argv[1] +fh=DAL.BF_File(filename) + + +for sapNr in range(0, fh.nofSubArrayPointings().value): + if fh.subArrayPointing(sapNr).exists(): + sap=fh.subArrayPointing(sapNr) + for beamNr in range(0, sap.nofBeams().value): + if sap.beam(beamNr).exists(): + beam=sap.beam(beamNr) + for stokesNr in range(0, beam.nofStokes().value): + if beam.stokes(stokesNr).exists(): + list=beam.stokes(stokesNr).externalFiles() +# print list + for i in range(0, list.size()): + print list[i] diff --git a/LTA/LTAIngest/ingest_config.py b/LTA/LTAIngest/ingest_config.py new file mode 100644 index 0000000000000000000000000000000000000000..06471051bd886b0f717e7c4e2e19a7af243ef456 --- /dev/null +++ b/LTA/LTAIngest/ingest_config.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +import xmlrpclib, socket, os, logging, logging.handlers, time +import sitecustomize +try: + #from wsrt_webdavlib import wsrt_webdavlib # just in 
here so we can test it exists + import SOAPpy, mom_http #Old deprecated stuff, should be replaced? +except: + SOAPpy = None + +exportClient = None +momClient = None +momServer = None +ltaClient = None + +## Determine host +host = socket.gethostname() +## Using netifaces module or something similar would be nicer, but it doesn't want to install in a custom dir +## So we use this hack +if 'lexar' in host: + host = host + '.offline.lofar' +if 'gridftp01.target.rug.nl' in host: + host = 'lotar2.staging.lofar' +if 'gridftp02.target.rug.nl' in host: + host = 'lotar4.staging.lofar' + +ipaddress = socket.gethostbyname(host) +ltacpport = 8802 + +## Ingest Master server settings +masterAddress = '10.178.1.2' +masterPort = 2012 +masterAuth = 'lta' +maxMasterTalkerQueue=0 + +def isOpen(ip,port): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, int(port))) + s.shutdown(2) + return True + except: + return False + +#We might need a more elegant solution for this, but for now the convenience of having one config for everything is nice +if ipaddress == masterAddress: + if not isOpen(masterAddress, masterPort): #No master is running on this machine + master = True + else: + master = False +else: + master = False + +## Make directories if needed, should include trailing / +#default (local) logroot +logroot = '/tmp/ingest/' + +#special cases for production lexar and lotart systems +if 'lexar' in host: + logroot = '/log/ingest/' +if 'lotar' in host: + logroot = '/home/lofarlocal/log/ingest' + +jobsdir = logroot + '/jobs/' +retrydir = logroot + '/jobs_retry/' +faileddir = logroot + '/log/failed/' +logdir = logroot + '/log/' +donedir = logroot + '/log/done/' + +if not os.path.exists(jobsdir): + os.makedirs(jobsdir) +if not os.path.exists(retrydir): + os.makedirs(retrydir) +if not os.path.exists(faileddir): + os.makedirs(faileddir) +if not os.path.exists(logdir): + os.makedirs(logdir) +if not os.path.exists(donedir): + os.makedirs(donedir) + +## Set logger 
+#use WatchedFileHandler which creates a new log file if the OS moves it with logrotate +log_handler = logging.handlers.WatchedFileHandler(logdir + host + '_' + ('master' if master else 'slave') + '.log') +formatter = logging.Formatter("%(asctime)-15s %(levelname)s %(message)s") +formatter.converter = time.gmtime +log_handler.setFormatter(formatter) +logger = logging.getLogger('Master' if master else 'Slave') +logger.addHandler(log_handler) +logger.setLevel(logging.DEBUG) +logger.info('--------- Logger initialized ---------') + +## Now we can check if the legacy WSRT code loaded and report it to the logger +if not SOAPpy: + logger.critical('SOAPpy, http_login or wsrt_webdavlib not found, set your PYTHONPATH !') + exit(2) + +## Ingest Slave setting, might become dependent on host name +parallelJobs = 20 +maxSlaveTalkerQueue = 80 + +pipelineRetry = 3 + +## LTA Catalog settings +# application53 (de ingest xml-rpc services) +# +# lofar-ingest.target.rug.nl:9443 +# lofar-ingest-test.target.rug.nl:19443 +# +LTAurl = 'https://@lofar-ingest.target.rug.nl:9443/' +try: + ltaClient = xmlrpclib.ServerProxy(LTAurl) +except: + ltaClient = None + logger.excepion('Configuration failed on LTA client') + raise +ltaRetry = 5 + +## MoM client settings +exportLogin = 'https://lcs029.control.lofar:8443/export/systemprograms/login.jsp' +exportStatus = 'https://lcs029.control.lofar:8443/export/interface/pipeline/setStatus.do' +exportLogout = 'https://lcs029.control.lofar:8443/export/processLogout.do' + +momURLlogin = 'https://lcs029.control.lofar:8443/useradministration/user/systemlogin.do' +momURLgetSIP = 'https://lcs029.control.lofar:8443/mom3/interface/importXML2.do' +momURLlogout = 'https://lcs029.control.lofar:8443/useradministration/user/logout.do' +momRetry = 3 + +srmRetry = 2 +if 'lexar' in host: + srmInit = '/globalhome/ingest/service/bin/init.sh' +if 'lotar' in host: + srmInit = '/home/lofarlocal/ltacp/bin/init.sh' + +try: + exportClient= mom_http.client(exportLogin, 
exportStatus, exportLogout) + exportClient.username = '' #fill in during build/install + exportClient.password = '' #fill in during build/install + momClient = mom_http.client(momURLlogin, momURLgetSIP, momURLlogout) + momClient.username = '' #fill in during build/install + momClient.password = '' #fill in during build/install +except: + momClient = None + exportClient = None + logger.exception('Configuration failed on MoM export') + raise + +## Initialize MoM listener +if master: + try: + momPort = 2010 + ## move to xmlrpc?: momServer = xmlrpclib.Server(ipaddress:momPort) + momServer = SOAPpy.SOAPServer((ipaddress, momPort), ) + except: + momServer = None + logger.exception('Configuration failed on MoM listener') + raise + +# Used by Master to send error and (re) start messages +mailCommand = '-c renting@astron.nl,holties@astron.nl,wjvriend@astro.rug.nl,observer@astron.nl sciencesupport@astron.nl' +# Used by Slave to send SendStatus warnings +mailSlCommand = '-c holties@astron.nl,wjvriend@astro.rug.nl renting@astron.nl' + +logger.info("Configuration complete") diff --git a/LTA/LTAIngest/ingest_config_test.py b/LTA/LTAIngest/ingest_config_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a56eb174899d746837031047eb16767bff4b7b --- /dev/null +++ b/LTA/LTAIngest/ingest_config_test.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +import xmlrpclib, socket, os, logging, logging.handlers, time +try: + #from wsrt_webdavlib import wsrt_webdavlib # just in here so we can test it exists + import SOAPpy, mom_http #Old deprecated stuff, should be replaced? 
+except: + SOAPpy = None + +exportClient = None +momClient = None +momServer = None +ltaClient = None + +## Determine host +host = socket.gethostname() +## Using netifaces module or something similar would be nicer, but it doesn't want to install in a custom dir +## So we use this hack +if 'lexar' in host: + host = host + '.offline.lofar' +if 'gridftp01.target.rug.nl' in host: + host = 'lotar2.staging.lofar' +if 'gridftp02.target.rug.nl' in host: + host = 'lotar4.staging.lofar' + +ipaddress = socket.gethostbyname(host) +ltacpport = 8801 + +## Ingest Master server settings +masterAddress = '10.178.1.2' +masterPort = 2011 +masterAuth = 'lta' +maxMasterTalkerQueue=0 + +def isOpen(ip,port): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, int(port))) + s.shutdown(2) + return True + except: + return False + +#We might need a more elegant solution for this, but for now the convenience of having one config for everything is nice +if ipaddress == masterAddress: + if not isOpen(masterAddress, masterPort): #No master is running on this machine + master = True + else: + master = False +else: + master = False + +## Make directories if needed, should include trailing / +jobsdir = '/tmp/ingest/jobs/' +retrydir = '/tmp/ingest/jobs_retry/' +faileddir = '/tmp/ingest/log/failed/' +logdir = '/tmp/ingest/log/' +donedir = '/tmp/ingest/log/done/' + +if not os.path.exists(jobsdir): + os.makedirs(jobsdir) +if not os.path.exists(retrydir): + os.makedirs(retrydir) +if not os.path.exists(faileddir): + os.makedirs(faileddir) +if not os.path.exists(logdir): + os.makedirs(logdir) +if not os.path.exists(donedir): + os.makedirs(donedir) + +## Set logger +#use WatchedFileHandler which creates a new log file if the OS moves it with logrotate +log_handler = logging.handlers.WatchedFileHandler(logdir + host + '_' + ('master' if master else 'slave') + '.log') +formatter = logging.Formatter("%(asctime)-15s %(levelname)s %(message)s") +formatter.converter = time.gmtime 
+log_handler.setFormatter(formatter) +logger = logging.getLogger('Master' if master else 'Slave') +logger.addHandler(log_handler) +logger.setLevel(logging.DEBUG) +logger.info('--------- Logger initialized ---------') + +## Now we can check if the legacy WSRT code loaded and report it to the logger +if not SOAPpy: + logger.critical('SOAPpy, http_login or wsrt_webdavlib not found, set your PYTHONPATH !') + exit(2) + +## Ingest Slave setting, might become dependent on host name +parallelJobs = 20 +maxSlaveTalkerQueue = 80 + +pipelineRetry = 3 + +## LTA Catalog settings +# application53 (de ingest xml-rpc services) +# +# lofar-ingest.target.rug.nl:9443 +# lofar-ingest-test.target.rug.nl:19443 +# +LTAurl = 'https://user:pass@lofar-ingest-test.target.rug.nl:19443' #fill in during build/install +try: + ltaClient = xmlrpclib.ServerProxy(LTAurl) +except: + ltaClient = None + logger.excepion('Configuration failed on LTA client') + raise +ltaRetry = 20 + +## MoM client settings +exportLogin = 'https://lcs028.control.lofar:8443/export/systemprograms/login.jsp' +exportStatus = 'https://lcs028.control.lofar:8443/export/interface/pipeline/setStatus.do' +exportLogout = 'https://lcs028.control.lofar:8443/export/processLogout.do' + +momURLlogin = 'https://lcs028.control.lofar:8443/useradministration/user/systemlogin.do' +momURLgetSIP = 'https://lcs028.control.lofar:8443/mom3/interface/importXML2.do' +momURLlogout = 'https://lcs028.control.lofar:8443/useradministration/user/logout.do' +momRetry = 2 + +srmRetry = 2 +if 'lexar' in host: + srmInit = '/home/renting/grid/init.sh' +if 'lotar' in host: + srmInit = '/home/lofarlocal/ltacp/bin/init.sh' + +try: + exportClient= mom_http.client(exportLogin, exportStatus, exportLogout) + exportClient.username = '' #fill in during build/install + exportClient.password = '' #fill in during build/install + momClient = mom_http.client(momURLlogin, momURLgetSIP, momURLlogout) + momClient.username = '' #fill in during build/install + momClient.password 
= '' #fill in during build/install +except: + momClient = None + exportClient = None + logger.exception('Configuration failed on MoM export') + raise + +## Initialize MoM listener +if master: + try: + momPort = 2009 + ## move to xmlrpc?: momServer = xmlrpclib.Server(ipaddress:momPort) + momServer = SOAPpy.SOAPServer((ipaddress, momPort), ) + except: + momServer = None + logger.exception('Configuration failed on MoM listener') + raise + +# Used by Master to send error and (re) start messages +mailCommand = ' renting@astron.nl' +# Used by Slave to send SendStatus warnings +mailSlCommand = ' renting@astron.nl' + +logger.info("Configuration complete") diff --git a/LTA/LTAIngest/ingestpipeline.py b/LTA/LTAIngest/ingestpipeline.py new file mode 100755 index 0000000000000000000000000000000000000000..fa584497c096557db9ae797e3f98e2c4c55003d1 --- /dev/null +++ b/LTA/LTAIngest/ingestpipeline.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python +import logging, os, time, xmlrpclib, subprocess, random, unspecifiedSIP +from lxml import etree +from cStringIO import StringIO +from job_group import corr_type, bf_type, img_type, unspec_type, pulp_type + +IngestStarted = 10 +## 20 not used +IngestSIPComplete = 30 +IngestSuccessful = 40 +IngestFailed = -10 +Removed = -20 + +PipelineJobFailedError = 1 +PipelineNoSourceError = 2 +PipelineAlreadyInLTAError = 3 +PipelineNoProjectInLTAError = 4 +#---------------------- Custom Exception ---------------------------------------- + +class PipelineError(Exception): + def __init__(self, message, source, type = PipelineJobFailedError): + Exception.__init__(self, message) + self.type = type + self.source = source + +#---------------------- IngestPipeline ------------------------------------------ +class IngestPipeline(): + def __init__(self, logdir, job, momClient, ltaClient, ltacphost, ltacpport, mailCommand, momRetry, ltaRetry, srmRetry, srmInit): + self.logdir = logdir + self.job = job + self.momClient = momClient + self.ltaClient = ltaClient + 
self.ltacphost = ltacphost + self.ltacpport = ltacpport + self.mailCommand = mailCommand + + self.Project = job['Project'] + self.DataProduct = job['DataProduct'] + self.FileType = unspec_type + if 'sky' in self.DataProduct or 'FITS' in self.DataProduct: #Not for FITS and HDF5 Images + self.FileName = self.DataProduct + self.FileType = img_type + elif '.tar' in self.DataProduct: + self.FileName = self.DataProduct + else: + self.FileName = job['DataProduct'] + '.tar' + if 'uv' in self.DataProduct: ## hacks needs a better solution + self.FileType = corr_type + if 'bf' in self.DataProduct: + if 'h5' in self.DataProduct: + self.FileType = bf_type + else: + self.FileType = pulp_type + if 'summary' in self.DataProduct: + self.FileType = pulp_type + self.JobId = job['JobId'] + self.MomId = int(job['MomId']) + self.ObsId = int(job['ObservationId']) + self.HostLocation = job['Location'].split(':')[0] + self.Location = job['Location'].split(':')[1] + pos = self.Location.find(self.DataProduct) + if pos > 0: ## trick to support tar files with different names + self.LocationDir = self.Location[:pos] + if self.DataProduct[-3:] == '.h5' and 'bf' in self.DataProduct: #Temporary hack, should use h5_check.py + self.Source = self.DataProduct + ' ' + self.DataProduct[:-3] + '.raw' + else: + self.Source = self.DataProduct + else: + self.LocationDir = self.Location + self.Source = job['Source'] + self.ExportID = job['ExportID'] + self.Type = job["Type"] + + self.ticket = '' + self.FileSize = '-1' + self.MD5Checksum = '' + self.Adler32Checksum = '' + self.ChecksumResult = False + self.SIP = '' + self.tempPrimary = '' + self.tempSecondary = '' + self.PrimaryUri = '' + self.SecondaryUri = '' + self.srmInit = srmInit + self.momRetry = momRetry + self.ltaRetry = ltaRetry + self.srmRetry = srmRetry + self.status = IngestStarted + + ## Set logger + logging.basicConfig(filename=logdir + self.ExportID + '.log', level=logging.DEBUG, format="%(asctime)-15s %(levelname)s %(message)s") + self.logger 
=logging.getLogger() + self.logger.info('--------- Job logger initialized ---------') + + def GetStorageTicket(self): + try: + start = time.time() + result = self.ltaClient.GetStorageTicket(self.Project, self.FileName, self.FileSize, self.MomId, self.JobId, self.ObsId, True, self.Type) + self.logger.debug("GetStorageTicket for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + error = result['error'] + if error: + self.logger.error(error) ## StorageTicket with mom ID "8948214" and ID source "MoM" already exists + if 'StorageTicket with mom ID "%i"' % (self.MomId) in error: + if 'existing_ticket_id' in result and 'existing_ticket_state' in result: + self.logger.warning("Got a Tier 1 GetStorageTicket error for an incomplete storage ticket %s with status %s" % (result['existing_ticket_id'],result['existing_ticket_state'])) + if result['existing_ticket_state'] < IngestSuccessful: + try: + self.ticket = result['existing_ticket_id'] + self.logger.warning("trying to repair status of StorageTicket %s" % self.ticket) + self.RetryRun(self.SendStatus, self.ltaRetry, 'Resetting LTA status', IngestFailed) + except Exception as e: + self.logger.exception('ResettingStatus IngestFailed failed for %s' % self.ticket) + raise Exception ('Had to reset state for %s' % self.ticket) + else: + self.logger.warning("Tried to ingest a file that was already there %s" % self.JobId) + raise PipelineError('Got Tier 1 GetStorageTicket error: Dataproduct already in LTA for %s' % (self.JobId), 'GetStorageTicket', PipelineAlreadyInLTAError) + else: + raise Exception('Got a Tier 1 GetStorageTicket error I can''t interpret: %s' % result) + if 'no storage resources defined for project' in error or "project does not exists" in error: + raise PipelineError('Got Tier 1 GetStorageTicket error for project not known in LTA: %s' % error, 'GetStorageTicket', 
PipelineNoProjectInLTAError) + raise Exception('Got Tier 1 GetStorageTicket error: %s' % error) + else: + self.ticket = result['ticket'] + self.tempPrimary = result['primary_uri'] + self.tempSecondary = result['secondary_uri'] + self.PrimaryUri = result['primary_uri_rnd'] + if 'secondary_uri_rnd' in result.keys(): + self.SecondaryUri = result['secondary_uri_rnd'] + self.logger.debug('got tempURIs %s %s, random URIs %s %s and ticket %s' % (self.tempPrimary, self.tempSecondary, self.PrimaryUri, self.SecondaryUri, self.ticket)) + + +#(renting)lexar002> java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar lexar002 8803 srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofartest/ops/projects/L6512_SAP002_SB079_uv.MS.tar L6512_SAP002_SB079_uv.MS +#2012-04-10 14:18:17,974 DEBUG client.LtaCp:58 - Creating the socket +#2012-04-10 14:18:17,984 DEBUG client.LtaCp:81 - Writing the request header +#2012-04-10 14:18:17,985 DEBUG client.LtaCp:107 - Transfering data via lexar002:8803 to srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofartest/ops/projects/L6512_SAP002_SB079_uv.MS.tar +#2012-04-10 14:18:17,995 DEBUG client.LtaCp:122 - Starting to stream data +#... 
+#2012-04-10 14:18:38,352 INFO client.LtaCp:156 - Transfered 100% of 346 MB at 105 MB/s +#2012-04-10 14:18:38,353 DEBUG client.LtaCp:182 - Flushing the stream +#2012-04-10 14:18:38,353 DEBUG client.LtaCp:237 - Transfered 346960383 bytes +#2012-04-10 14:18:38,354 DEBUG client.LtaCp:242 - Closing the socket +#2012-04-10 14:18:38,354 DEBUG client.LtaCp:256 - Retrieving the checksums +#2012-04-10 14:18:51,161 INFO client.LtaCp:270 - Adler32 checksum for lexar002: 6367d2e1 +#2012-04-10 14:18:51,161 INFO client.LtaCp:272 - Checksums from server: <size>347074560</size><checksums><checksum><algorithm>MD5</algorithm><value>ae28093ed958e5aaf7f7cf5ff4188f37</value></checksum><checksum><algorithm>Adler32</algorithm><value>6367d2e1</value></checksum></checksums> +#2012-04-10 14:18:51,162 INFO client.LtaCp:276 - Transfered 346 MB in 20s at 17 MB/s average speed + + + def ParseLTAcpLog(self, log): + for l in log: + if 'Checksums from server:' in l: + if not '</checksums>' in l: + self.logger.debug('checksums incomplete %s' % l) + return False + checksums = l.split()[8] + pos = checksums.find('<value>') + self.MD5Checksum = checksums[pos+7:pos+39] + self.Adler32Checksum = checksums[pos+105:pos+113] + pos = checksums.find('<size>') + try: + self.FileSize = str(int(checksums[pos+6:checksums.find('</size>')])) #XML-RPC doesn't allow bigger than 32bit int + except ValueError: + self.logger.debug("No valid size found") + return False + return True + + def TransferFile(self): + self.logger.debug('Starting file transfer') + if self.PrimaryUri: + cmd = ["ssh", "-T", "ingest@" +self.HostLocation, "cd %s;java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar %s %s %s %s" % (self.LocationDir, self.ltacphost, self.ltacpport, self.PrimaryUri, self.Source)] + else: + cmd = ["ssh", "-T", "ingest@" + self.HostLocation, "cd %s;java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar %s %s %s/%s %s" % (self.LocationDir, self.ltacphost, self.ltacpport, self.tempPrimary, self.FileName, self.Source)] + ## 
SecondaryUri handling not implemented + self.logger.debug(cmd) + start = time.time() + p = subprocess.Popen(cmd, stdin=open('/dev/null'), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logs = p.communicate() + self.logger.debug("File transfer for %s took %ds" % (self.JobId, time.time() - start)) +## time.sleep(10) +## logs = ("hoeba","bla") + log = logs[0].split('\n') +## log = ["2012-04-10 14:18:51,161 INFO client.LtaCp:272 - Checksums from server: <size>347074560</size><checksums><checksum><algorithm>MD5</algorithm><value>ae28093ed958e5aaf7f7cf5ff4188f37</value></checksum><checksum><algorithm>Adler32</algorithm><value>6367d2e1</value></checksum></checksums>",""] + self.logger.debug('Shell command for %s exited with code %s' % (self.JobId, p.returncode)) + self.logger.debug('STD ERR of TransferFile command for %s:\n%s' % (self.JobId, logs[1])) + self.logger.debug(log) + if (not 'No such file or directory.' in logs[1]) and (not 'does not exist' in logs[0]): + if not self.ParseLTAcpLog(log): + self.logger.error("Parsing ltacp result failed for %s" % self.JobId) + raise Exception('File transfer failed of %s' % self.JobId) + else: + self.CheckChecksums() + else: # need to communicate that LTA transaction is to be rolled back but ingest not to be set to "hold" + #os.system('echo "Dataproduct for %s not found on %s.\nConsidering dataproduct to be non existent"|mailx -s "Warning: Dataproduct not found on CEP host" ' % (self.JobId, self.HostLocation) + self.mailCommand) + #self.logger.warn('Sent Warning: Dataproduct not found on CEP host to ' + self.mailCommand) + raise PipelineError('Dataproduct for %s not found on %s'% (self.JobId, self.HostLocation), 'TransferFile', PipelineNoSourceError) + self.logger.debug('Finished file transfer of %s' % self.JobId) + + def CheckChecksums(self): + if self.MD5Checksum and self.Adler32Checksum and self.FileSize: + try: + self.logger.debug('Valid checksums found for %s with filesize %s' % (self.JobId, self.FileSize)) + except: + 
self.logger.debug('Valid checksums found for %s' % (self.JobId)) + else: + self.logger.debug('Valid checksums not found for %s' % self.JobId) + raise Exception('No valid checkums found for %s' % self.JobId) + + def SendChecksums(self): + if self.PrimaryUri: + uris = {'primary_uri':self.PrimaryUri, 'secondary_uri':self.SecondaryUri} + else: + uris = '' + try: + start = time.time() + result = self.ltaClient.SendChecksums(self.Project, self.ticket, self.FileSize, {'MD5':self.MD5Checksum,'Adler32':self.Adler32Checksum}, uris) + self.logger.debug("SendChecksums for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + error = result['error'] + if not error: + self.PrimaryUri = result['primary_uri'] + self.SecondaryUri = result['secondary_uri'] + if error: + self.logger.error('Got an error back in SendChecksums for %s: %s' % (self.JobId, error)) + raise Exception('Got Tier 1 SendChecksums error for %s: %s' % (self.JobId, error)) + self.logger.debug('got URIs %s %s' % (self.PrimaryUri, self.SecondaryUri)) + + def SendStatus(self, state): + try: + start = time.time() + result = self.ltaClient.UpdateUriState(self.Project, self.ticket, self.PrimaryUri, state) + self.logger.debug("UpdateUriState for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + except Exception as e: + self.logger.error('Received unknown exception in SendStatus for %s: %s' % (self.JobId, str(e))) + raise + if result['result'] == 'ok': + self.logger.debug('Status update for %s to %s was successful: %s' % (self.PrimaryUri, state, result)) + else: + self.logger.error(result['error']) + if "No DataProduct found for StorageTicket" in result['error']: + self.logger.error('Database error, no dataproduct found for %s ' % self.JobId) + raise 
PipelineError('Database error, no dataproduct found for %s ' % self.JobId, 'SetStatus', PipelineJobFailedError) + else: + self.logger.error('Got Tier 1 SendStatus error for %s: %s' % (self.JobId, result['error'])) + raise Exception('Got Tier 1 SendStatus error for %s: %s' % (self.JobId, result['error'])) + +## Not needed right now +## def RenameFile(self): +## self.logger.debug('Rename file') + + def CheckSIP(self): + ##might do more than validate in the future + try: + start = time.time() + f = open('doc/LTA-SIP.xsd') + xml = etree.parse(f) + schema = etree.XMLSchema(xml) + sip = StringIO(self.SIP) + xml = etree.parse(sip) + result = schema.validate(xml) + self.logger.debug("CheckSIP for %s took %ds" % (self.JobId, time.time() - start)) + return result + except Exception as e: + self.logger.error('CheckSIP failed: ' + str(e)) + return False + + def GetSIP(self): + if self.Type == "MoM": + try: + start = time.time() + sip = self.momClient.getSIP(self.MomId, self.ticket, self.FileName, self.PrimaryUri, self.FileSize, self.MD5Checksum, self.Adler32Checksum) + self.SIP = sip.replace('<stationType>Europe</stationType>','<stationType>International</stationType>') + self.logger.debug("GetSIP for %s took %ds" % (self.JobId, time.time() - start)) + except: + self.logger.exception('Getting SIP from MoM failed') + raise + self.logger.debug('SIP received for %s from MoM with size %d: %s' % (self.JobId, len(self.SIP), self.SIP[0:400])) + else: + self.SIP = unspecifiedSIP.makeSIP(self.Project, self.ObsId, self.MomId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) + self.FileType = unspec_type + if not self.CheckSIP(): + self.logger.debug('Got a malformed SIP from MoM: %s' % self.SIP[0:50]) + try: + self.SIP = unspecifiedSIP.makeSIP(self.Project, self.ObsId, self.MomId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) + self.FileType = unspec_type + except Exception as e: + 
self.logger.error('GetSIP failed: ' + str(e)) + raise + self.logger.debug('Unspecified SIP created for %s: %s' % (self.JobId, self.SIP[0:400])) + ###raise Exception('Got a malformed SIP from MoM: %s' % self.SIP[0:50]) + + def SendSIP(self): + try: + start = time.time() + result = self.ltaClient.TransmitSIP(self.SIP, self.ticket) + self.logger.debug("TransmitSIP for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise Exception('XML-RPC failed') + if result['result'] == 'ok': + self.logger.debug('Successfully sent SIP for %s' % self.JobId) + else: + self.logger.error(result['error']) + if "Exception in TransmitSIP, could not use SIP" in result['error']: + self.logger.error('Invalid SIP according to LTA catalog for %s' % self.JobId) + raise PipelineError('Invalid SIP according to LTA catalog for %s' % self.JobId, 'SendSIP', PipelineJobFailedError) + else: + raise Exception('Got Tier 1 TransmitSIP error for %s: %s' % (self.JobId, result['error'])) + + def RollBack(self): + self.logger.debug('Rolling back file transfer for %s' % self.JobId) + try: + if self.PrimaryUri: + cmd = ["bash", "-c", "source %s;srmrm %s" % (self.srmInit, self.PrimaryUri)] + else: + cmd = ["bash", "-c", "source %s;srmrm %s/%s" % (self.srmInit, self.tempPrimary, self.FileName)] + ## SecondaryUri handling not implemented + self.logger.debug(cmd) + start = time.time() + p = subprocess.Popen(cmd, stdin=open('/dev/null'), stdout=subprocess.PIPE) + log = p.communicate()[0].split('\n') + self.logger.debug("RollBack for %s took %ds" % (self.JobId, time.time() - start)) + self.logger.debug(log) + except: + self.logger.exception('Roll back failed for %s' % self.JobId) + + def RetryRun(self, func, times, errortext, *args): + error = '' + retry = 0 + while (retry < times): + try: + func(*args) + except PipelineError as pe: + ## function raised PipelineError itself. 
Assume retries not useful + raise + except Exception as e: + error += '\n' + str(e) + else: + if retry: + self.logger.debug(errortext + ' was tried %s times on %s before it succeeded. Got the following errors: %s' % (retry, self.JobId, error)) + else: + self.logger.debug(errortext + ' ran without a problem on %s' % self.JobId) + error = '' + break + retry += 1 + if retry < times: + time.sleep(random.randint(30, 60) * retry) + if error: + raise PipelineError(errortext + ' tried %s times but failed on %s. Got the following errors: %s' % (retry, self.JobId, error), func.__name__) + + def run(self): + try: + self.logger.debug("Ingest Pipeline started for %s" % self.JobId) + start = time.time() + self.RetryRun(self.GetStorageTicket, self.ltaRetry, 'Getting storage ticket') + self.RetryRun(self.TransferFile, self.srmRetry , 'Transfering file') + self.RetryRun(self.SendChecksums, self.ltaRetry, 'Sending Checksums') +# self.RenameFile() + self.RetryRun(self.GetSIP, self.momRetry, 'Get SIP from MoM') + self.RetryRun(self.SendSIP, self.ltaRetry, 'Sending SIP') + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestSuccessful) + self.logger.debug("Ingest Pipeline finished for %s in %d" % (self.JobId, time.time() - start)) + except PipelineError as pe: + self.logger.debug('Encountered PipelineError for %s' % (self.JobId)) + ## roll back transfer if necessary + if self.PrimaryUri or self.tempPrimary: + if not (pe.type == PipelineNoSourceError): + self.RollBack() + ## notify LTA the ingest has failed + ## ...but catch exceptions as we do not want to raise a new type of error + try: + if self.ticket: + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestFailed) + except Exception as e: + os.system('echo "Received unknown exception in SendStatus for %s to %s while handling another error:\n%s\n\nCheck LTA catalog and SRM!\n%s"|mailx -s "Warning: LTA catalog status update failed" ' % (self.JobId, IngestFailed, str(e), self.PrimaryUri) + 
self.mailCommand) + self.logger.error('Sent Mail: LTA catalog status update failed to ' + self.mailCommand) + self.logger.exception('SendStatus IngestFailed failed') + if pe.type == PipelineJobFailedError: + self.logger.debug('Encountered PipelineJobFailedError') + raise + elif pe.type == PipelineNoSourceError: + self.logger.debug('Encountered PipelineNoSourceError') + ## do not raise as it is not possible to continue trying to ingest the source file + elif pe.type == PipelineAlreadyInLTAError: + self.logger.debug('Encountered PipelineAlreadyInLTAError for %s' % (self.JobId)) + ## Do not raise as further attempts will generate the same result + elif pe.type == PipelineNoProjectInLTAError: + self.logger.debug('Encountered PipelineNoProjectInLTAError for %s' % (self.JobId)) + raise + elif pe.source == "SendStatus": + os.system('echo "Received unknown exception in SendStatus for %s to %s:\n%s\n\nCheck LTA catalog and SRM!\n%s"|mailx -s "Warning: LTA catalog status update failed" ' % (self.JobId, IngestFailed, str(e), self.PrimaryUri) + self.mailCommand) + self.logger.error('Sent Mail: LTA catalog status update failed to ' + self.mailCommand) + self.logger.error('SendStatus IngestFailed failed') + else: + self.logger.warn('Encountered unexpected PipelineErrorType: %s' % pe.type) + raise + except: + self.logger.debug('Encountered unexpected error for %s' % (self.JobId)) + if self.PrimaryUri or self.tempPrimary: + self.RollBack() + if self.ticket: + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestFailed) + raise + +#----------------------------------------------------------------- selfstarter - +if __name__ == '__main__': + standalone = IngestPipeline() + standalone.main() diff --git a/LTA/LTAIngest/ingestpipeline_test.py b/LTA/LTAIngest/ingestpipeline_test.py new file mode 100755 index 0000000000000000000000000000000000000000..b5cf4cf29bba3db580e27e16b2ab90376a1fbac8 --- /dev/null +++ b/LTA/LTAIngest/ingestpipeline_test.py @@ -0,0 +1,414 @@ 
+#!/usr/bin/env python +import logging, os, time, xmlrpclib, subprocess, random, unspecifiedSIP +from lxml import etree +from cStringIO import StringIO +from job_group import corr_type, bf_type, img_type, unspec_type, pulp_type + +IngestStarted = 10 +## 20 not used +IngestSIPComplete = 30 +IngestSuccessful = 40 +IngestFailed = -10 +Removed = -20 + +PipelineJobFailedError = 1 +PipelineNoSourceError = 2 +PipelineAlreadyInLTAError = 3 +PipelineNoProjectInLTAError = 4 +#---------------------- Custom Exception ---------------------------------------- + +class PipelineError(Exception): + def __init__(self, message, source, type = PipelineJobFailedError): + Exception.__init__(self, message) + self.type = type + self.source = source + +#---------------------- IngestPipeline ------------------------------------------ +class IngestPipeline(): + def __init__(self, logdir, job, momClient, ltaClient, ltacphost, ltacpport, mailCommand, momRetry, ltaRetry, srmRetry, srmInit): + self.logdir = logdir + self.job = job + self.momClient = momClient + self.ltaClient = ltaClient + self.ltacphost = ltacphost + self.ltacpport = ltacpport + self.mailCommand = mailCommand + + self.Project = job['Project'] + self.DataProduct = job['DataProduct'] + self.FileType = unspec_type + if 'sky' in self.DataProduct or 'FITS' in self.DataProduct: #Not for FITS and HDF5 Images + self.FileName = self.DataProduct + self.FileType = img_type + elif '.tar' in self.DataProduct: + self.FileName = self.DataProduct + else: + self.FileName = job['DataProduct'] + '.tar' + if 'uv' in self.DataProduct: ## hacks needs a better solution + self.FileType = corr_type + if 'bf' in self.DataProduct: + if 'h5' in self.DataProduct: + self.FileType = bf_type + else: + self.FileType = pulp_type + if 'summary' in self.DataProduct: + self.FileType = pulp_type + self.JobId = job['JobId'] + self.MomId = int(job['MomId']) + self.ObsId = int(job['ObservationId']) + self.HostLocation = job['Location'].split(':')[0] + self.Location = 
job['Location'].split(':')[1] + pos = self.Location.find(self.DataProduct) + if pos > 0: ## trick to support tar files with different names + self.LocationDir = self.Location[:pos] + if self.DataProduct[-3:] == '.h5' and 'bf' in self.DataProduct: #Temporary hack, should use h5_check.py + self.Source = self.DataProduct + ' ' + self.DataProduct[:-3] + '.raw' + else: + self.Source = self.DataProduct + else: + self.LocationDir = self.Location + self.Source = job['Source'] + self.ExportID = job['ExportID'] + self.Type = job["Type"] + + self.ticket = '' + self.FileSize = '-1' + self.MD5Checksum = '' + self.Adler32Checksum = '' + self.ChecksumResult = False + self.SIP = '' + self.tempPrimary = '' + self.tempSecondary = '' + self.PrimaryUri = '' + self.SecondaryUri = '' + self.srmInit = srmInit + self.momRetry = momRetry + self.ltaRetry = ltaRetry + self.srmRetry = srmRetry + self.status = IngestStarted + + ## Set logger + logging.basicConfig(filename=logdir + self.ExportID + '.log', level=logging.DEBUG, format="%(asctime)-15s %(levelname)s %(message)s") + self.logger =logging.getLogger() + self.logger.info('--------- Job logger initialized ---------') + + def GetStorageTicket(self): + try: + start = time.time() + result = self.ltaClient.GetStorageTicket(self.Project, self.FileName, self.FileSize, self.MomId, self.JobId, self.ObsId, True, self.Type) + self.logger.debug("GetStorageTicket for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + error = result['error'] + if error: + self.logger.error(error) ## StorageTicket with mom ID "8948214" and ID source "MoM" already exists + if 'StorageTicket with mom ID "%i"' % (self.MomId) in error: + if 'existing_ticket_id' in result and 'existing_ticket_state' in result: + self.logger.warning("Got a Tier 1 GetStorageTicket error for an incomplete storage ticket %s with status %s" % 
(result['existing_ticket_id'],result['existing_ticket_state'])) + if result['existing_ticket_state'] < IngestSuccessful: + try: + self.ticket = result['existing_ticket_id'] + self.logger.warning("trying to repair status of StorageTicket %s" % self.ticket) + self.RetryRun(self.SendStatus, self.ltaRetry, 'Resetting LTA status', IngestFailed) + except Exception as e: + self.logger.exception('ResettingStatus IngestFailed failed for %s' % self.ticket) + raise Exception ('Had to reset state for %s' % self.ticket) + else: + self.logger.warning("Tried to ingest a file that was already there %s" % self.JobId) + raise PipelineError('Got Tier 1 GetStorageTicket error: Dataproduct already in LTA for %s' % (self.JobId), 'GetStorageTicket', PipelineAlreadyInLTAError) + else: + raise Exception('Got a Tier 1 GetStorageTicket error I can''t interpret: %s' % result) + if 'no storage resources defined for project' in error or "project does not exists" in error: + raise PipelineError('Got Tier 1 GetStorageTicket error for project not known in LTA: %s' % error, 'GetStorageTicket', PipelineNoProjectInLTAError) + raise Exception('Got Tier 1 GetStorageTicket error: %s' % error) + else: + self.ticket = result['ticket'] + self.tempPrimary = result['primary_uri'] + self.tempSecondary = result['secondary_uri'] + self.PrimaryUri = result['primary_uri_rnd'] + if 'secondary_uri_rnd' in result.keys(): + self.SecondaryUri = result['secondary_uri_rnd'] + self.logger.debug('got tempURIs %s %s, random URIs %s %s and ticket %s' % (self.tempPrimary, self.tempSecondary, self.PrimaryUri, self.SecondaryUri, self.ticket)) + + +#(renting)lexar002> java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar lexar002 8803 srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofartest/ops/projects/L6512_SAP002_SB079_uv.MS.tar L6512_SAP002_SB079_uv.MS +#2012-04-10 14:18:17,974 DEBUG client.LtaCp:58 - Creating the socket +#2012-04-10 14:18:17,984 DEBUG client.LtaCp:81 - Writing the request header +#2012-04-10 
14:18:17,985 DEBUG client.LtaCp:107 - Transfering data via lexar002:8803 to srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofartest/ops/projects/L6512_SAP002_SB079_uv.MS.tar +#2012-04-10 14:18:17,995 DEBUG client.LtaCp:122 - Starting to stream data +#... +#2012-04-10 14:18:38,352 INFO client.LtaCp:156 - Transfered 100% of 346 MB at 105 MB/s +#2012-04-10 14:18:38,353 DEBUG client.LtaCp:182 - Flushing the stream +#2012-04-10 14:18:38,353 DEBUG client.LtaCp:237 - Transfered 346960383 bytes +#2012-04-10 14:18:38,354 DEBUG client.LtaCp:242 - Closing the socket +#2012-04-10 14:18:38,354 DEBUG client.LtaCp:256 - Retrieving the checksums +#2012-04-10 14:18:51,161 INFO client.LtaCp:270 - Adler32 checksum for lexar002: 6367d2e1 +#2012-04-10 14:18:51,161 INFO client.LtaCp:272 - Checksums from server: <size>347074560</size><checksums><checksum><algorithm>MD5</algorithm><value>ae28093ed958e5aaf7f7cf5ff4188f37</value></checksum><checksum><algorithm>Adler32</algorithm><value>6367d2e1</value></checksum></checksums> +#2012-04-10 14:18:51,162 INFO client.LtaCp:276 - Transfered 346 MB in 20s at 17 MB/s average speed + + + def ParseLTAcpLog(self, log): + for l in log: + if 'Checksums from server:' in l: + if not '</checksums>' in l: + self.logger.debug('checksums incomplete %s' % l) + return False + checksums = l.split()[8] + pos = checksums.find('<value>') + self.MD5Checksum = checksums[pos+7:pos+39] + self.Adler32Checksum = checksums[pos+105:pos+113] + pos = checksums.find('<size>') + try: + self.FileSize = str(int(checksums[pos+6:checksums.find('</size>')])) #XML-RPC doesn't allow bigger than 32bit int + except ValueError: + self.logger.debug("No valid size found") + return False + return True + + def TransferFile(self): + self.logger.debug('Starting file transfer') + if self.PrimaryUri: + cmd = ["ssh", "-T", self.HostLocation, "cd %s;java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar %s %s %s %s" % (self.LocationDir, self.ltacphost, self.ltacpport, self.PrimaryUri, 
self.Source)] + else: + cmd = ["ssh", "-T", self.HostLocation, "cd %s;java -Xmx256m -jar /globalhome/ingest/ltacp/ltacp.jar %s %s %s/%s %s" % (self.LocationDir, self.ltacphost, self.ltacpport, self.tempPrimary, self.FileName, self.Source)] + ## SecondaryUri handling not implemented + self.logger.debug(cmd) + start = time.time() +# p = subprocess.Popen(cmd, stdin=open('/dev/null'), stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# logs = p.communicate() + self.logger.debug("File transfer for %s took %ds" % (self.JobId, time.time() - start)) + time.sleep(10) + logs = ("hoeba","bla") + log = logs[0].split('\n') + log = ["2012-04-10 14:18:51,161 INFO client.LtaCp:272 - Checksums from server: <size>347074560</size><checksums><checksum><algorithm>MD5</algorithm><value>ae28093ed958e5aaf7f7cf5ff4188f37</value></checksum><checksum><algorithm>Adler32</algorithm><value>6367d2e1</value></checksum></checksums>",""] +# self.logger.debug('Shell command for %s exited with code %s' % (self.JobId, p.returncode)) + self.logger.debug('STD ERR of TransferFile command for %s:\n%s' % (self.JobId, logs[1])) + self.logger.debug(log) + if (not 'No such file or directory.' 
in logs[1]) and (not 'does not exist' in logs[0]): + if not self.ParseLTAcpLog(log): + self.logger.error("Parsing ltacp result failed for %s" % self.JobId) + raise Exception('File transfer failed of %s' % self.JobId) + else: + self.CheckChecksums() + else: # need to communicate that LTA transaction is to be rolled back but ingest not to be set to "hold" + #os.system('echo "Dataproduct for %s not found on %s.\nConsidering dataproduct to be non existent"|mailx -s "Warning: Dataproduct not found on CEP host" ' % (self.JobId, self.HostLocation) + self.mailCommand) + #self.logger.warn('Sent Warning: Dataproduct not found on CEP host to ' + self.mailCommand) + raise PipelineError('Dataproduct for %s not found on %s'% (self.JobId, self.HostLocation), 'TransferFile', PipelineNoSourceError) + self.logger.debug('Finished file transfer of %s' % self.JobId) + + def CheckChecksums(self): + if self.MD5Checksum and self.Adler32Checksum and self.FileSize: + try: + self.logger.debug('Valid checksums found for %s with filesize %s' % (self.JobId, self.FileSize)) + except: + self.logger.debug('Valid checksums found for %s' % (self.JobId)) + else: + self.logger.debug('Valid checksums not found for %s' % self.JobId) + raise Exception('No valid checkums found for %s' % self.JobId) + + def SendChecksums(self): + if self.PrimaryUri: + uris = {'primary_uri':self.PrimaryUri, 'secondary_uri':self.SecondaryUri} + else: + uris = '' + try: + start = time.time() + result = self.ltaClient.SendChecksums(self.Project, self.ticket, self.FileSize, {'MD5':self.MD5Checksum,'Adler32':self.Adler32Checksum}, uris) + self.logger.debug("SendChecksums for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + error = result['error'] + if not error: + self.PrimaryUri = result['primary_uri'] + self.SecondaryUri = result['secondary_uri'] + if error: + self.logger.error('Got an error back 
in SendChecksums for %s: %s' % (self.JobId, error)) + raise Exception('Got Tier 1 SendChecksums error for %s: %s' % (self.JobId, error)) + self.logger.debug('got URIs %s %s' % (self.PrimaryUri, self.SecondaryUri)) + + def SendStatus(self, state): + try: + start = time.time() + result = self.ltaClient.UpdateUriState(self.Project, self.ticket, self.PrimaryUri, state) + self.logger.debug("UpdateUriState for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise + except Exception as e: + self.logger.error('Received unknown exception in SendStatus for %s: %s' % (self.JobId, str(e))) + raise + if result['result'] == 'ok': + self.logger.debug('Status update for %s to %s was successful: %s' % (self.PrimaryUri, state, result)) + else: + self.logger.error(result['error']) + if "No DataProduct found for StorageTicket" in result['error']: + self.logger.error('Database error, no dataproduct found for %s ' % self.JobId) + raise PipelineError('Database error, no dataproduct found for %s ' % self.JobId, 'SetStatus', PipelineJobFailedError) + else: + self.logger.error('Got Tier 1 SendStatus error for %s: %s' % (self.JobId, result['error'])) + raise Exception('Got Tier 1 SendStatus error for %s: %s' % (self.JobId, result['error'])) + +## Not needed right now +## def RenameFile(self): +## self.logger.debug('Rename file') + + def CheckSIP(self): + ##might do more than validate in the future + try: + start = time.time() + f = open('doc/LTA-SIP.xsd') + xml = etree.parse(f) + schema = etree.XMLSchema(xml) + sip = StringIO(self.SIP) + xml = etree.parse(sip) + result = schema.validate(xml) + self.logger.debug("CheckSIP for %s took %ds" % (self.JobId, time.time() - start)) + return result + except Exception as e: + self.logger.error('CheckSIP failed: ' + str(e)) + return False + + def GetSIP(self): + if self.Type == "MoM": + try: + start = time.time() + sip = 
self.momClient.getSIP(self.MomId, self.ticket, self.FileName, self.PrimaryUri, self.FileSize, self.MD5Checksum, self.Adler32Checksum) + self.SIP = sip.replace('<stationType>Europe</stationType>','<stationType>International</stationType>') + self.logger.debug("GetSIP for %s took %ds" % (self.JobId, time.time() - start)) + except: + self.logger.exception('Getting SIP from MoM failed') + raise + self.logger.debug('SIP received for %s from MoM with size %d: %s' % (self.JobId, len(self.SIP), self.SIP[0:400])) + else: + self.SIP = unspecifiedSIP.makeSIP(self.Project, self.ObsId, self.MomId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) + self.FileType = unspec_type + if not self.CheckSIP(): + self.logger.debug('Got a malformed SIP from MoM: %s' % self.SIP[0:50]) + try: + self.SIP = unspecifiedSIP.makeSIP(self.Project, self.ObsId, self.MomId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) + self.FileType = unspec_type + except Exception as e: + self.logger.error('GetSIP failed: ' + str(e)) + raise + self.logger.debug('Unspecified SIP created for %s: %s' % (self.JobId, self.SIP[0:400])) + ###raise Exception('Got a malformed SIP from MoM: %s' % self.SIP[0:50]) + + def SendSIP(self): + try: + start = time.time() + result = self.ltaClient.TransmitSIP(self.SIP, self.ticket) + self.logger.debug("TransmitSIP for %s took %ds" % (self.JobId, time.time() - start)) + except xmlrpclib.Fault as err: + self.logger.error('Received XML-RPC Fault: %s %s' % (err.faultCode, err.faultString)) + raise Exception('XML-RPC failed') + if result['result'] == 'ok': + self.logger.debug('Successfully sent SIP for %s' % self.JobId) + else: + self.logger.error(result['error']) + if "Exception in TransmitSIP, could not use SIP" in result['error']: + self.logger.error('Invalid SIP according to LTA catalog for %s' % self.JobId) + raise PipelineError('Invalid SIP according to LTA catalog for %s' % self.JobId, 
'SendSIP', PipelineJobFailedError) + else: + raise Exception('Got Tier 1 TransmitSIP error for %s: %s' % (self.JobId, result['error'])) + + def RollBack(self): + self.logger.debug('Rolling back file transfer for %s' % self.JobId) + try: + if self.PrimaryUri: + cmd = ["bash", "-c", "source %s;srmrm %s" % (self.srmInit, self.PrimaryUri)] + else: + cmd = ["bash", "-c", "source %s;srmrm %s/%s" % (self.srmInit, self.tempPrimary, self.FileName)] + ## SecondaryUri handling not implemented + self.logger.debug(cmd) + start = time.time() +# p = subprocess.Popen(cmd, stdin=open('/dev/null'), stdout=subprocess.PIPE) +# log = p.communicate()[0].split('\n') + self.logger.debug("RollBack for %s took %ds" % (self.JobId, time.time() - start)) +# self.logger.debug(log) + except: + self.logger.exception('Roll back failed for %s' % self.JobId) + + def RetryRun(self, func, times, errortext, *args): + error = '' + retry = 0 + while (retry < times): + try: + func(*args) + except PipelineError as pe: + ## function raised PipelineError itself. Assume retries not useful + raise + except Exception as e: + error += '\n' + str(e) + else: + if retry: + self.logger.debug(errortext + ' was tried %s times on %s before it succeeded. Got the following errors: %s' % (retry, self.JobId, error)) + else: + self.logger.debug(errortext + ' ran without a problem on %s' % self.JobId) + error = '' + break + retry += 1 + if retry < times: + time.sleep(random.randint(30, 60) * retry) + if error: + raise PipelineError(errortext + ' tried %s times but failed on %s. 
Got the following errors: %s' % (retry, self.JobId, error), func.__name__) + + def run(self): + try: + self.logger.debug("Ingest Pipeline started for %s" % self.JobId) + start = time.time() + self.RetryRun(self.GetStorageTicket, self.ltaRetry, 'Getting storage ticket') + self.RetryRun(self.TransferFile, self.srmRetry , 'Transfering file') + self.RetryRun(self.SendChecksums, self.ltaRetry, 'Sending Checksums') +# self.RenameFile() + self.RetryRun(self.GetSIP, self.momRetry, 'Get SIP from MoM') + self.RetryRun(self.SendSIP, self.ltaRetry, 'Sending SIP') + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestSuccessful) + self.logger.debug("Ingest Pipeline finished for %s in %d" % (self.JobId, time.time() - start)) + except PipelineError as pe: + self.logger.debug('Encountered PipelineError for %s' % (self.JobId)) + ## roll back transfer if necessary + if self.PrimaryUri or self.tempPrimary: + if not (pe.type == PipelineNoSourceError): + self.RollBack() + ## notify LTA the ingest has failed + ## ...but catch exceptions as we do not want to raise a new type of error + try: + if self.ticket: + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestFailed) + except Exception as e: + os.system('echo "Received unknown exception in SendStatus for %s to %s while handling another error:\n%s\n\nCheck LTA catalog and SRM!\n%s"|mailx -s "Warning: LTA catalog status update failed" ' % (self.JobId, IngestFailed, str(e), self.PrimaryUri) + self.mailCommand) + self.logger.error('Sent Mail: LTA catalog status update failed to ' + self.mailCommand) + self.logger.exception('SendStatus IngestFailed failed') + if pe.type == PipelineJobFailedError: + self.logger.debug('Encountered PipelineJobFailedError') + raise + elif pe.type == PipelineNoSourceError: + self.logger.debug('Encountered PipelineNoSourceError') + ## do not raise as it is not possible to continue trying to ingest the source file + elif pe.type == PipelineAlreadyInLTAError: + 
self.logger.debug('Encountered PipelineAlreadyInLTAError for %s' % (self.JobId)) + ## Do not raise as further attempts will generate the same result + elif pe.type == PipelineNoProjectInLTAError: + self.logger.debug('Encountered PipelineNoProjectInLTAError for %s' % (self.JobId)) + raise + elif pe.source == "SendStatus": + os.system('echo "Received unknown exception in SendStatus for %s to %s:\n%s\n\nCheck LTA catalog and SRM!\n%s"|mailx -s "Warning: LTA catalog status update failed" ' % (self.JobId, IngestFailed, str(e), self.PrimaryUri) + self.mailCommand) + self.logger.error('Sent Mail: LTA catalog status update failed to ' + self.mailCommand) + self.logger.error('SendStatus IngestFailed failed') + else: + self.logger.warn('Encountered unexpected PipelineErrorType: %s' % pe.type) + raise + except: + self.logger.debug('Encountered unexpected error for %s' % (self.JobId)) + if self.PrimaryUri or self.tempPrimary: + self.RollBack() + if self.ticket: + self.RetryRun(self.SendStatus, self.ltaRetry, 'Setting LTA status', IngestFailed) + raise + +#----------------------------------------------------------------- selfstarter - +if __name__ == '__main__': + standalone = IngestPipeline() + standalone.main() diff --git a/LTA/LTAIngest/job_group.py b/LTA/LTAIngest/job_group.py new file mode 100755 index 0000000000000000000000000000000000000000..6db0326d8fa38433569689c281e1339231fbc16b --- /dev/null +++ b/LTA/LTAIngest/job_group.py @@ -0,0 +1,270 @@ +#!/usr/bin/python +#This is a class for managing a group of jobs, usually a MoM export, but it can also be from other sources + +try: + import MySQLdb, datetime, os +except: + import datetime, os +from job_parser import JobRetry, JobError, JobHold, JobScheduled, JobProducing, JobProduced +import job_parser as parser + +corr_type = 0 +bf_type = 1 +img_type = 2 +unspec_type = 3 +pulp_type = 4 + +class job_group(): + """Class to keep track of a collection of jobs with the same lofar_export.exports.id in MoM + or the equivalent in 
the manual ingest script.""" + def __init__(self, logger, Id, Type, mailCommand): + self.logger = logger + self.Id = Id + self.jobs = {} + self.scheduled = {'Total':0} ##(, 'xxxxx' : 0) for each obsId + self.active = {'Total':0} + self.inactive = {'Total':0} + self.retry = {'Total':0} + self.failed = {'Total':0} + self.done = {'Total':0} + + self.corr = 0 + self.bf = 0 + self.img = 0 + self.unspec = 0 + self.pulp = 0 + self.Type = Type + self.parser = parser.parser(logger) + self.mailCommand = mailCommand + self.get_db_info() + self.logger.info('New job_group %i initialzed' % self.Id) + + def read_old_jobs(self, faileddir, donedir): + done_files = [] + failed_files = [] + if self.Type == 'MoM': + Dir = '/A_%s/' % self.Id + else: ## tier0-ingest + Dir = '/B_%s/' % self.Id + if os.path.isdir(donedir + Dir): + done_files = os.listdir(donedir + Dir) + for f in done_files: + job = self.parser.parse(donedir + Dir + f) + job['filename'] = f + if job['Status'] == JobScheduled: + job['Status'] = JobProduced + self.add_job(job) + self.update_job(job, JobScheduled, JobProduced, None) + if os.path.isdir(faileddir + Dir): + failed_files = os.listdir(faileddir + Dir) + for f in failed_files: + job = self.parser.parse(faileddir + Dir + f) + job['filename'] = f + if job['Status'] == JobScheduled: + job['Status'] = JobError + self.add_job(job) + self.update_job(job, JobScheduled, JobError, None) + self.logger.info('New job_group %i has read %i old jobs' % (self.Id, len(done_files) + len(failed_files))) + + def add_job(self, job): + self.jobs[job['ExportID']] = job + self.scheduled['Total'] += 1 + if job['ObservationId'] in self.scheduled: + self.scheduled[job['ObservationId']] += 1 + else: + self.scheduled[job['ObservationId']] = 1 + self.active[job['ObservationId']] = 0 + self.inactive[job['ObservationId']] = 0 + self.retry[job['ObservationId']] = 0 + self.failed[job['ObservationId']] = 0 + self.done[job['ObservationId']] = 0 + + def update_file_type(self, fileType): + if fileType 
== corr_type: self.corr += 1 + if fileType == bf_type: self.bf += 1 + if fileType == img_type: self.img += 1 + if fileType == unspec_type: self.unspec += 1 + if fileType == pulp_type: self.pulp += 1 + + def update_job(self, job, old_status, new_status, fileType): + self.jobs[job['ExportID']] = job + if old_status == JobScheduled: + self.scheduled[job['ObservationId']] -= 1 + self.scheduled['Total'] -= 1 + if old_status == JobProducing: + self.active[job['ObservationId']] -= 1 + self.active['Total'] -= 1 + if old_status == JobHold: + self.inactive[job['ObservationId']] -= 1 + self.inactive['Total'] -= 1 + if old_status == JobRetry: + self.retry[job['ObservationId']] -= 1 + self.retry['Total'] -= 1 + if old_status == JobError: + self.failed[job['ObservationId']] -= 1 + self.failed['Total'] -= 1 + if old_status == JobProduced: + self.done[job['ObservationId']] -= 1 + self.done['Total'] -= 1 + ##some of these status transitions should not happen. + if new_status == JobScheduled: + self.scheduled[job['ObservationId']] += 1 + self.scheduled['Total'] += 1 + if new_status == JobProducing: + self.active[job['ObservationId']] += 1 + self.active['Total'] += 1 + if new_status == JobHold: + self.inactive[job['ObservationId']] += 1 + self.inactive['Total'] += 1 + if new_status == JobRetry: + self.retry[job['ObservationId']] += 1 + self.retry['Total'] += 1 + if new_status == JobError: + self.failed[job['ObservationId']] += 1 + self.failed['Total'] += 1 + self.update_file_type(fileType) + if new_status == JobProduced: + self.done[job['ObservationId']] += 1 + self.done['Total'] += 1 + self.update_file_type(fileType) + + def check_finished(self): + total = len(self.jobs) + finished = (self.scheduled['Total'] == 0) and (self.active['Total'] == 0) and (self.inactive['Total'] == 0) and (self.retry['Total'] == 0) + if finished and (total == self.failed['Total'] + self.done['Total']): ## sanity check, somewhat redundant + self.logger.info('job_group %i is finished, total of %i files' % 
(self.Id, total)) + return True + return False + + def get_destination(self): # hack to support Target slaves, requires Target to be in the Resource name for the project + if 'Target' in self.job_info['ltalocation']: + return 'lotar' + return 'lexar' + + def get_db_info(self): + now = datetime.datetime.today().replace(microsecond=0) + self.job_info = {'id': self.Id, 'start_time': now, 'update_time': now, + 'user': "unknown", 'status': "unknown", 'name': 'unknown', + 'project': 'unknown', 'ltalocation': "unknown", 'eid': "unknown"} + if self.Type == 'MoM': + try: + ## should be read from config file + #m = "lofartest_sas099_mom3_two" + m = "lofar_mom3" + #e = "lofartest_sas099_export" + e = "lofar_export" + db = MySQLdb.connect(host="mysql1.control.lofar", user="momreadonly", passwd="daub673(ming", db=m) + c = db.cursor() + c.execute("SELECT a.id,toexportdate as started,a.status_date as last_update,a.exportername,b.name as state,d.name,e.name as projectname,g.name as location,a.data_location,d.mom2id FROM %s.exports AS a, %s.status AS b, %s.mom_references AS c, %s.mom2object AS d, %s.mom2object AS e, %s.resource AS f, %s.resourcetype AS g WHERE status_id=b.id AND a.mom_reference_id = c.id AND c.mom_id=d.mom2id AND d.ownerprojectid = e.id AND e.id=f.projectid AND f.resourcetypeid > 1 AND f.resourcetypeid = g.id and a.id = %i;" % (e,e,e,m,m,m,m,self.Id)) + db_job = c.fetchone() + ## (1137L, datetime.datetime(2014, 2, 6, 8, 30, 34), datetime.datetime(2014, 2, 8, 18, 41, 2), 'toribio', 'running', 'Orion', 'LC1_055', 'Lofar Storage (J\xfclich)', 'LC1_055/358795', 358795L) + self.job_info = {'id': db_job[0], 'start_time': db_job[1], 'update_time': db_job[2], + 'user': db_job[3].rjust(8), 'status': db_job[4], 'name': db_job[5].rjust(9), + 'project': db_job[6], 'ltalocation': db_job[7], 'eid': db_job[8]} + except Exception as e: + self.logger.warning('Caught an exception trying to talk to the Mom database: %s' % str(e)) + else: + try: + if len(self.jobs): #get_db_info now 
also happens in __init__ + job = self.jobs.values()[0] + self.job_info['name'] = job['Source'] + self.job_info['project'] = job['Project'] + except: + self.logger.warning('Caught an exception trying to create job info: %s' % str(e)) + self.job_info['run_time'] = now - self.job_info['update_time'] + self.job_info['duration'] = now - self.job_info['start_time'] + + def make_sub_report(self, Input, name): + if Input['Total'] > 0: + message = "Total %(name)s: %(total)i\nObsId : #files\n" % {'name': name, 'total': Input['Total']} + for (k,v) in Input.iteritems(): + if k == 'Total': continue + message += "L%(ObservationId)s: %(count)i\n" % {'ObservationId': k, 'count': v} + message += "\n" + else: + message = "" + return message + + def make_report(self): + header = """=== Report on your ingest Job "%(name)s" (%(id)i) === + +Status: %(status)s +User: %(user)s +Project: %(project)s +Start: %(start_time)s +Last update: %(update_time)s +Time in queue: %(duration)s +Time since last update: %(run_time)s +Stored at: %(ltalocation)s""" % {'name': self.job_info['name'], 'id': self.job_info['id'], 'status': self.job_info['status'], + 'user': self.job_info['user'], 'project': self.job_info['project'], 'start_time': self.job_info['start_time'].isoformat(), + 'update_time': self.job_info['update_time'].isoformat(), 'duration': str(self.job_info['duration']), + 'run_time': str(self.job_info['run_time']), 'ltalocation': self.job_info['ltalocation'].decode('latin1').encode('utf-8')} + + summary = """\n\n=== Summary === +Total Files Success: %(done)i +- Interferometer: %(corr)i +- Beamformed: %(bf)i +- SkyImages: %(img)i +- Unspecified: %(unspec)i +- Pulsar Pipeline: %(pulp)i + +Total Failed: %(failed)i""" % {'done': self.done['Total'], 'corr': self.corr, 'bf': self.bf, 'img': self.img, 'unspec': self.unspec, 'pulp': self.pulp, 'failed': self.failed['Total']} + + error_list = {} + failed_files = "\n\n==== Failed files: =====\n" + ##L169235_SB180_uv.dppp.MS, locus071: ssh connection 
failed + for j in self.jobs.values(): + if j['Status'] == JobError: + failed_files += "%(file_name)s, %(host)s: %(error)s\n" % {'file_name': j['DataProduct'], 'host': j['Location'].split(':')[0], 'error': j['errors']} + for e in j['errors']: + if e in error_list: + error_list[e] += 1 + else: + error_list[e] = 1 + + errors = "\n" ##"- ssh connection failed: 20" + for (k, v) in error_list.iteritems(): + errors += "- %s: %i\n" % (k, v) + + details = """\n\n===== Details =====\n""" + details += self.make_sub_report(self.scheduled, "files scheduled") + details += self.make_sub_report(self.active, "files running") + details += self.make_sub_report(self.inactive, "files on hold") + details += self.make_sub_report(self.retry, "files on retry") + details += self.make_sub_report(self.failed, "files failed") + details += self.make_sub_report(self.done, "files success") + + + message = header + summary + errors + failed_files + details + return message + + def send_mail(self): + self.get_db_info() + message = self.make_report() + os.system('echo "%s"|mailx -s "Ingest job of %s, %s(%i) has ended" ' % (message, self.job_info['user'], self.job_info['name'], self.Id) + self.mailCommand) + self.logger.info('job_group %i sent an email to %s' % (self.Id, self.mailCommand)) + + +## Stand alone execution code ------------------------------------------ +if __name__ == '__main__': + + ##This test code might need updating. 
+ import logging + logging.basicConfig() + l = logging.getLogger() + l.setLevel(10) + Id = 476 + standalone = job_group(l, Id, 'MoM', 'renting@astron.nl') + jobs = [] + for i in range(2): + k = {'Status': JobScheduled, 'ExportID': 'jA_%s_%s_%i_L12345_SAP000_SB000_uv.MS' % (Id, Id, i), 'ObservationId': 12345, 'MoMId': 654321, 'DataProduct': 'L12345_SAP000_SB000_uv.MS', 'Location': 'locus123:/data/L12345/L12345_SAP000_SB000_uv.MS', 'errors': []} + jobs.append(k) + for job in jobs: + standalone.add_job(job) + for job in jobs: + standalone.update_job(job, JobScheduled, JobProducing, None) + for job in jobs: + standalone.update_job(job, JobProducing, JobProduced, corr_type) + standalone.get_db_info() + message = standalone.make_report() + print(message) diff --git a/LTA/LTAIngest/job_parser.py b/LTA/LTAIngest/job_parser.py new file mode 100755 index 0000000000000000000000000000000000000000..016f278aacec8be5d44131e43ec6a05a269be1f4 --- /dev/null +++ b/LTA/LTAIngest/job_parser.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +##Only internal in the Ingest +JobRetry = -2 +##Below are hardcoded defines for communicating with MoM! +JobError = -1 +JobHold = 0 +JobScheduled = 1 +JobProducing = 2 +JobProduced = 3 + +## Job should probably be refactored into a class at some point. +## Right now it's just a big dict. 
+ +##------------------ Job keys -------------------------- +## job['Status'] : JobRetry, JobError, JobHold, JobScheduled, JobProducing, JobProduced +## job['ExportID'] : nodeName == 'exportjob' +## job['scriptname'] : nodeName == 'scriptname' +## job['repository'] : ('server','resultdir') in nodeName == 'repository' +## job['Location'] : <input name="Location">locus029:/data/L202708/L202708_SB243_uv.dppp.MS</input> +## job['host'] : job['Location'].split(':')[0] +## job['filename'] : SOAP call, filename argument in new_job +## Project = job['Project'] : <input name="Project">LC1_055</input> +## DataProduct = job['DataProduct'] : <input name="DataProduct">L202708_SB243_uv.dppp.MS</input> +## FileName = job['DataProduct'] (+ '.tar') +## JobId = job['JobId'] : <input name="JobId">A_1134_1134_3767569_10318605_L202708_SB243_uv.dppp.MS</input> +## MomId = int(job['MomId']) : <input name="MomId">10318605</input> +## ObsId = int(job['ObservationId']) : <input name="ObservationId">202708</input> +## unused : <input name="Subband">-1</input> +## Source = job['Source'] : <input name="Source">L201198_red</input> +## Source = self.DataProduct + ' ' + self.DataProduct[:-3] + '.raw' +## Source = self.DataProduct +## Type = job["Type"] : <input name="Type">tier0-ingest</input> +## Type = "MoM" +## HostLocation = job['Location'].split(':')[0] +## Location = job['Location'].split(':')[1] +## jobfile.split('_')[1] == exportId +## suffix = self.job['filename'].split('.')[-1] +## job['retry'] = 0 +## job['job_group'] = self.job['ExportID'].split('_')[1] +## job['errors'] = [] +## job['destination'] = self.job_groups[jg].get_destination() +## +## N.B. 
HostLocation == job['host'] + +class parser(): + def __init__(self, logger): + self.logger = logger + + ## Code to generate results --------------------------------------------- + def parse(self, job): + self.job = {} + try: + from xml.dom import minidom, Node + doc = minidom.parse(job) + if doc.documentElement.nodeName == 'exportjob': + self.job['ExportID'] = str(doc.documentElement.attributes.get('exportID').nodeValue) + for node in doc.documentElement.childNodes: + if node.nodeName == 'scriptname': + value = node.childNodes[0].nodeValue + self.job['scriptname'] = value + elif node.nodeName == 'repository': + for itemnode in node.childNodes: + if itemnode.nodeName == 'server': + name = itemnode.childNodes[0].nodeValue + elif itemnode.nodeName == 'resultdir': + res = itemnode.childNodes[0].nodeValue + if res and name: + self.job['repository'] = (name, res) + elif node.nodeName == 'inputlist': + name = "'" + node.attributes.get('name').nodeValue + "'" + exec(eval("'self.job[%s] = []' % (name)")) + for itemnode in node.childNodes: + if itemnode.nodeName == 'listitem': + value = itemnode.childNodes[0].nodeValue + exec(eval("'self.job[%s].append(%s)' % (name, value)")) + elif node.nodeName == 'input': + name = "'" + node.attributes.get('name').nodeValue + "'" + value = node.childNodes[0].nodeValue + if value == 'True' or value == 'False': + exec(eval("'self.job[%s] = %s' % (name, value)")) + else: + value = "'''" + value + "'''" ## tripple quotes because a value could be "8 O'clock" for example + exec(eval("'self.job[%s] = %s' % (name, value)")) + if self.job['ExportID']: ## we need an export ID to identify the job + if self.job['ObservationId'][0] == 'L': + self.job['ObservationId'] = self.job['ObservationId'][1:] + test = int(self.job['ObservationId']) ## check if it can be converted to an int + test = int(self.job['MomId']) ## check if it can be converted to an int + self.job['host'] = self.job['Location'].split(':')[0] + self.job['Status'] = JobScheduled + 
self.job['retry'] = 0 + self.job['job_group'] = int(self.job['ExportID'].split('_')[1]) + self.job['errors'] = [] + if not "Type" in self.job: + self.job["Type"] = "MoM" + return self.job + except: + self.logger.exception('Failed importing job: ' + job) + self.job['Status'] = JobError + return self.job diff --git a/LTA/LTAIngest/master.py b/LTA/LTAIngest/master.py new file mode 100755 index 0000000000000000000000000000000000000000..a88048e0d4d8594cc2d8224c8f7522f1c208f3dc --- /dev/null +++ b/LTA/LTAIngest/master.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python +from multiprocessing import Process, Queue, Manager, Value +from multiprocessing.managers import SyncManager +from Queue import Empty +from job_group import job_group +import job_parser as parser +from job_parser import JobRetry, JobError, JobHold, JobScheduled, JobProducing, JobProduced +import os, time, sys, shutil + + +##------------------ Listener for incomming jobs -------------------------- +class jobListener(Process): + """This listens for messages with new jobs using the (SOAP) server from the config. 
+ It writes the file to disk but does not parse it, as it would take too long and keep + it from receiving the next message.""" + def __init__(self, logger, queue, jobsdir, server): + logger.info('Initializing Incomming job Listener') + self.logger = logger + self.jobs = queue + self.jobsdir = jobsdir + self.server = server + self.server.registerFunction(self.new_job, 'urn:pipeline.export') + super(jobListener, self).__init__() + logger.info('Incomming job Listener initialized') + + def new_job(self, fileName, fileContent): + self.logger.debug("Received a new job: %s" % fileName) + try: + if fileContent: + f = open(self.jobsdir + fileName, 'w') + f.write(fileContent) ## Save the job so we remember it is in the queue + f.close() + except: + self.logger.exception('Problem writing job: %s' % fileName) + if fileContent: + self.jobs.put(fileName) + elif os.path.exists(self.jobsdir + fileName): ## We've put it back into the queue. + self.jobs.put(fileName) + else: ## MoM can send alive messages with no fileContent to check if the ingest is running + self.logger("The received job was empty: %s" % fileName) + + def run(self): + self.server.socket.settimeout(60) + self.logger.info('Incomming job Listener started') + while True: + self.server.handle_request() + +##--------------------- MoM talker ----------------------------- + +class momTalker(Process): + """This sends messages of status updates to MoM. Currently has a maxTalkQueue to prevent + MoM from getting confused and messages being sent out of order. 
Needs to be improved.""" + def __init__(self, logger, client, count, maxTalkQueue): + self.logger = logger + self.jobs = Queue(maxTalkQueue) + self.exportClient = client + self.retryCount = count + super(momTalker, self).__init__() + logger.info('momTalker initialzed') + + def getQueue(self): + """Other worker processes only talk to the queue.""" + return self.jobs + + ## This function also exists in the slave, should be refactored at some point + def communicateJob(self, job): + """function to write to log and communicate with GUI""" + if job['Status'] == JobRetry: self.logger.info('Job:' + str(job['ExportID']) + ' will be retried') + elif job['Status'] == JobError: self.logger.info('Job:' + str(job['ExportID']) + ' Failed') + elif job['Status'] == JobHold: self.logger.info('Job:' + str(job['ExportID']) + ' is on Hold') + elif job['Status'] == JobScheduled: self.logger.info('Job:' + str(job['ExportID']) + ' Scheduled') + elif job['Status'] == JobProducing: self.logger.info('Job:' + str(job['ExportID']) + ' Started') + elif job['Status'] == JobProduced: self.logger.info('Job:' + str(job['ExportID']) + ' Produced') + try: + if job['Status'] == JobRetry: + self.logger.info('Job:' + str(job['ExportID']) + ' retry state not communicated to MoM') + return + if not job['Type'] == 'MoM': + self.logger.info('Job:' + str(job['ExportID']) + ' not communicated to MoM') + return + (status, message) = self.exportClient.setStatus(str(job['ExportID']), str(job['Status'])) + if status: ## we retry, because the exportClient does not do an internal retry, but only reports the problem + self.logger.warning("Problem communicating with MoM, retrying " + str(job['ExportID']) + ": " + message) + count = 1 + while (status and (count < self.retryCount)): + time.sleep(60 * count) + (status, message) = self.exportClient.setStatus(str(job['ExportID']), str(job['Status'])) + count += 1 + if status: + self.logger.warning(message) + self.logger.info(message) + except: + 
self.logger.exception('Could not update job %s status to %s.' % (str(job['ExportID']), str(job['Status']))) + + def run(self): + self.logger.info('momTalker started') + while True: + try: + job = self.jobs.get(True, 10) + self.communicateJob(job) + except Empty: pass + +##--------------------- Job handler -------------------------- +class jobHandler(Process): + def __init__(self, logger, incomming_jobs, scheduled_jobs, job_done_msg, update_job_msg, + jobsdir, faileddir, donedir, + talker, masterAddress, masterPort, masterAuth, mailCommand, parallelJobs): + logger.info('Initializing jobHandler') + self.masterAddress = masterAddress + self.masterPort = masterPort + self.masterAuth = masterAuth + self.mailCommand = mailCommand + self.scheduled = scheduled_jobs + self.job_done_msg = job_done_msg + self.update_job_msg = update_job_msg + self.active = dict() + self.logger = logger + self.jobsdir = jobsdir + self.faileddir = faileddir + self.donedir = donedir + self.parser = parser.parser(logger) + self.talker = talker + self.parallelJobs = parallelJobs + super(jobHandler, self).__init__() + self.logger.info('jobHandler initialzed') + + def update_file(self, job): + """Ugly function to interact with the file system. + The suffix is used to remember how often we tried to process this file. + We use one directory per job_group or otherwise the lists get too long. 
+ Finished files go to the donedir, files that failed to much to the faileddir.""" + jobname = job['filename'] + if job['Type'] == 'MoM': + Dir = 'A_%s/' % job['job_group'] + else: ## tier0-ingest + Dir = 'B_%s/' % job['job_group'] + if job['Status'] == JobProduced: + old = self.jobsdir + jobname + new = self.donedir + Dir + jobname + if not os.path.isdir(self.donedir + Dir): + os.mkdir(self.donedir + Dir) + else: + suffix = jobname.split('.')[-1] + if suffix.isdigit(): + new_jobname = jobname[:-len(suffix)] + str(int(suffix) + 1) + else: + new_jobname = jobname + '.1' + old = self.jobsdir + jobname + if job['Status'] == JobRetry: + new = self.jobsdir + new_jobname + else: + new = self.faileddir + Dir + new_jobname + if not os.path.isdir(self.faileddir + Dir): + os.mkdir(self.faileddir + Dir) + job['filename'] = new_jobname + self.logger.debug('Moving %s to %s' % (old, new)) + shutil.move(old, new) + + def job_done(self, job): + """Remove the job from the active list, put it back in the queue if it needs to be + tried again, and update the location/filename to reflect it's state.""" + self.active.pop(job['ExportID']) + self.update_file(job) + if job['Status'] == JobRetry: + self.scheduled.put(job) ## We need to try it again + self.logger.debug("Job %s no longer active because of state %s" % (job['ExportID'], job['Status'])) + + def run(self): + ## ====== Waiting for slaves ====== + self.logger.info('Waiting for slaves to connect') + class manager(SyncManager): pass + manager.register('number') + manager.register('get') + self.manager = manager(address=(self.masterAddress, self.masterPort), authkey=self.masterAuth) + self.manager.connect() + nr_of_slaves = int(str(self.manager.number())) + while nr_of_slaves < 1: # There are no slaves + time.sleep(10) + nr_of_slaves = int(str(self.manager.number())) + time.sleep(10) #Let's wait a few seconds for any more slaves. Currently all slaves need to connect in 10 seconds. 
+ nr_of_slaves = int(str(self.manager.number())) + self.logger.info('Slaves found: %d' % nr_of_slaves) + os.system('echo "The LTA Ingest has been restarted."|mailx -s "LTA Ingest restarted" ' + self.mailCommand) + + ## ======= Main loop ====== + first = True + while True: + sleep = True + try: ## See if any jobs have finished + job = self.job_done_msg.get_nowait() + sleep = False ## found a job + self.logger.debug("Job's done: %s (%i)" % (job['ExportID'], len(self.active))) + self.job_done(job) + self.logger.debug("Job done handled: %s (%i)" % (job['ExportID'], len(self.active))) + first = True + except Empty: pass + if len(self.active) < (self.parallelJobs * nr_of_slaves): + try: ## See if there is anything scheduled that we can start doing + job = self.scheduled.get_nowait() + sleep = False ## found a job + self.update_job_msg.put((job, job['Status'], JobProducing, None)) + job['Status'] = JobProducing + self.active[job['ExportID']] = job + self.manager.get(None, job['destination']).put(job) ## sends it to the slave with the shortest queue of the possible destinations + self.logger.debug("Job's started: %s (%i)" % (job['ExportID'], len(self.active))) + first = True + except Empty: pass + if sleep: ##nothing to do, time for a nap. 
+ #self.emptyQueue.value = False + #self.logger.debug("sleeping, queue: %s" % len(self.active)) + if first: + self.logger.debug("sleeping, queue: %s" % len(self.active)) + self.logger.debug("Active exports: %s" % str(self.active.keys())) + first = False + time.sleep(10) + + +## Queue Handler ---------------------------------------------------------- +class queueHandler(Process): + """This schedules jobs in the queue if the can be parsed and makes sure that the job_group + administration is up-to-date so we know when to send a mail to the user.""" + def __init__(self, logger, talker, incomming_jobs, scheduled_jobs, update_job_msg, jobsdir, faileddir, donedir, mailCommand): + self.logger = logger + self.talker = talker + self.incomming = incomming_jobs ## FIFO queue of filenames + self.scheduled = scheduled_jobs ## FIFO queue of jobs + self.update_job_msg = update_job_msg ## FIFO queue with job status updates + self.jobsdir = jobsdir ## where to find the jobs + self.job_groups = dict() + self.faileddir = faileddir + self.donedir = donedir + self.mailCommand = mailCommand + self.parser = parser.parser(logger) + super(queueHandler, self).__init__() + self.logger.info('queueHandler initialzed') + + def update_job(self, job, old_status, new_status, fileType): + """This function does the job_group management. Please note that + old_status and new_status should be used, not job['Status'] to avoid race conditions.""" + jg = job['job_group'] + if not self.job_groups.has_key(jg): ## should only happen on JobScheduled, but let's be safe. 
+ self.job_groups[jg] = job_group(self.logger, jg, job['Type'], self.mailCommand) + self.job_groups[jg].read_old_jobs(self.faileddir, self.donedir) + if new_status == JobScheduled: + self.job_groups[jg].add_job(job) + else: + self.job_groups[jg].update_job(job, old_status, new_status, fileType) + if self.job_groups[jg].check_finished(): + self.job_groups[jg].send_mail() + self.job_groups.pop(jg) + + def newJob(self, fileName): + """Read filename and add to the queue of scheduled jobs if it is a valid file.""" + self.logger.info("Processing job: %s" % fileName) + job = self.parser.parse(self.jobsdir + fileName) + job['filename'] = fileName + if job['Status'] == JobScheduled: + self.update_job(job, None, JobScheduled, None) + job['destination'] = self.job_groups[job['job_group']].get_destination() + self.scheduled.put(job) +# self.talker.put(job) ## Tell MoM we've done something + else: + self.logger.warning('Parsing ' + self.jobsdir + fileName + ' failed') + + def run(self): + while True: + sleep = True + try: + msg = self.update_job_msg.get_nowait() + sleep = False + self.update_job(msg[0], msg[1], msg[2], msg[3]) + except Empty: pass + try: + fileName = self.incomming.get_nowait() + sleep = False + self.newJob(fileName) + except Empty: pass + if sleep: ## nothing to do, time for a nap. + time.sleep(10) + + +## Startup ---------------------------------------------------------- +## This class reads the existing queues from disk in parallel to the main threads. +## The goal is to start processing as soon as some jobs have been read. 
+class startup(Process): + def __init__(self, logger, incomming_jobs, jobsdir, mailCommand): + logger.info('Initializing Master Startup') + self.jobs = incomming_jobs + self.logger = logger + self.jobsdir = jobsdir + self.mailCommand = mailCommand + super(startup, self).__init__() + logger.info('Master Startup initialzed') + + def run(self): + existingJobs = os.listdir(self.jobsdir) + existingJobs.sort() + self.logger.info('Found %d existing jobs' % len(existingJobs)) + for e in existingJobs: + self.jobs.put(e) + self.logger.info('Master Startup finished') + self.logger.info('Currently %s jobs in input queue' % self.jobs.qsize()) + os.system('echo "The LTA Ingest has been restarted. %d existing jobs still found in queue."|mailx -s "LTA Ingest existing jobs in queue" ' % len(existingJobs) + self.mailCommand) + + +## LTA Master ---------------------------------------------------------- +class ltaMaster(): + """Reads the config, starts the threads and talks to the slaves""" + def __init__(self, config): + self.incomming_jobs = Queue() ##FIFO queue of filenames + self.scheduled_jobs = Queue() ##FIFO queue of jobs + self.update_job_msg = Queue() + self.job_done_msg = Queue() + self.slaves = {} + configFile = config + try: + self.readConfig(configFile) + except Exception as e: + print ('\n%s' % e) + print('The Configuration is incomplete, exiting') + exit(2) + self.logger.info('Master initialized') + + def readConfig(self, configFile): + exec(eval("'from %s import *' % configFile")) + self.host = host + self.jobsdir = jobsdir + self.faileddir = faileddir + self.logdir = logdir + self.donedir = donedir + self.logger = logger + self.ltaClient = ltaClient + self.pipelineRetry = pipelineRetry + self.exportClient = exportClient + self.momRetry = momRetry + self.momServer = momServer + self.masterAddress = masterAddress + self.masterPort = masterPort + self.masterAuth = masterAuth + self.maxTalkQueue = maxMasterTalkerQueue + self.mailCommand = mailCommand + self.parallelJobs = 
parallelJobs + if momServer == None: #specific check on the master, this is no problem on the slave + raise Exception('No MoM to listen to!') + + def add_slave(self, slave): + self.slaves[slave] = Queue() + return self.slaves[slave] + + def slave_size(self): + return len(self.slaves) + + ##Gives you the shortest slave queue unless you ask for a specific one. + def get_slave(self, source, destination): + if source: ## this code was developed for use on lse nodes/staging area, not really used. + return self.slaves[source] + else: + result = None + length = sys.maxint + for k in self.slaves.keys(): + if destination in k:# subselection of slaves based on destination, bit of a hack right now: choice between: lexar,lotar + size = self.slaves[k].qsize() + if length > size: + result = self.slaves[k] + length = size + self.logger.debug('found slave %s' % k) + return result + + def remove_slave(self, slave): + q = self.slaves.pop(slave, None) + if q and not q.empty(): + self.logger.warning('Lingering items were left by %s' % slave) + + def slave_done(self, job, result, fileType): + if result: + job['errors'].append(result) + self.update_job_msg.put((job, JobProducing, job['Status'], fileType)) + self.job_done_msg.put(job) + self.logger.debug('Slave reported done with %s, status %s' % (job['ExportID'], job['Status'])) + + def serve(self): + class manager(SyncManager): pass + manager.register('add_slave', self.add_slave) + manager.register('number', self.slave_size) + manager.register('get', self.get_slave) + manager.register('remove_slave', self.remove_slave) + manager.register('slave_done', self.slave_done) + self.manager = manager(address=(self.masterAddress, self.masterPort), authkey=self.masterAuth) + + self.momTalker = momTalker(self.logger, self.exportClient, self.momRetry, self.maxTalkQueue) + self.momTalker.start() + talker = self.momTalker.getQueue() + + self.startup = startup(self.logger, self.incomming_jobs, self.jobsdir, self.mailCommand) + self.startup.start() + 
+ self.queueHandler = queueHandler(self.logger, talker, self.incomming_jobs, self.scheduled_jobs, + self.update_job_msg, + self.jobsdir, self.faileddir, self.donedir, self.mailCommand) + self.queueHandler.start() + + self.jobHandler = jobHandler(self.logger, self.incomming_jobs, self.scheduled_jobs, self.job_done_msg, self.update_job_msg, + self.jobsdir, self.faileddir, self.donedir, talker, + self.masterAddress, self.masterPort, self.masterAuth, + self.mailCommand, self.parallelJobs) + self.jobHandler.start() + + self.jobListener = jobListener(self.logger, self.incomming_jobs, self.jobsdir, self.momServer) + self.jobListener.start() + + #This doesn't work??: self.manager.start(), we use serve_forever() instead + self.logger.info('Manager has been started') + self.manager.get_server().serve_forever() ## We would need a custom serve_forever to be able to stop. + +## Stand alone execution code ------------------------------------------ +if __name__ == '__main__': + usage = """Usage: + master.py <config> + config: Something like 'ingest_config' (without the .py)""" + + if len(sys.argv) < 2: + print usage + exit(1) + config = sys.argv[1] + standalone = ltaMaster(config) + standalone.serve() diff --git a/LTA/LTAIngest/md5adler/README b/LTA/LTAIngest/md5adler/README new file mode 100644 index 0000000000000000000000000000000000000000..69f1d24188bb172e5e6b2c01d48a5c688b47214e --- /dev/null +++ b/LTA/LTAIngest/md5adler/README @@ -0,0 +1,6 @@ +Crude, but effective md5sum / adler32 binary. The a32 command calculates +the adler32 sum of just one file. It does not really do any decent CLI +argument checking yet, intended just for testing purposes. +The md5a32 binary works just like the regular md5sum binary, but if you +specify a file (or files) as it's argument, it will also display the +adler32 sum (as last argument). 
diff --git a/LTA/LTAIngest/md5adler/a32 b/LTA/LTAIngest/md5adler/a32 new file mode 100755 index 0000000000000000000000000000000000000000..84ccddb252929cb377a26637dc60b4bad0142ea0 Binary files /dev/null and b/LTA/LTAIngest/md5adler/a32 differ diff --git a/LTA/LTAIngest/md5adler/a32.c b/LTA/LTAIngest/md5adler/a32.c new file mode 100644 index 0000000000000000000000000000000000000000..a362544b682e53df243e93f67c80995a0f3afe74 --- /dev/null +++ b/LTA/LTAIngest/md5adler/a32.c @@ -0,0 +1,85 @@ +# include <stdio.h> +#include <stdlib.h> +#include "md5.h" + +unsigned int adlercount = 1; + +int main(int argc, char *argv[]) { + FILE *fp; + char res[16]; + fp=fopen(argv[1],"rb"); + if (fp==(FILE *)NULL) { + fprintf (stderr,"failed: can't open %s", argv[1]); + exit(-1); + } + parse_stream(fp,res); + fprintf(stderr, "ADLER32 %x\n",adlercount); + fclose(fp); +} + + +void process_block(void *buffer, size_t size) { + adlercount=adler32(adlercount,buffer,size); +} + +void process_bytes (void *buffer, size_t sum) { + adlercount=adler32(adlercount,buffer, sum); +} + +int parse_stream (FILE *stream, void *resblock) +{ + char buffer[BLOCKSIZE + 72]; + size_t sum; + + + /* Iterate over full file contents. */ + while (1) + { + /* We read the file in blocks of BLOCKSIZE bytes. One call of the + computation function processes the whole buffer so that with the + next round of the loop another block can be read. */ + size_t n; + sum = 0; + + /* Read block. Take care for partial reads. */ + while (1) + { + n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); + + sum += n; + + if (sum == BLOCKSIZE) + break; + + if (n == 0) + { + /* Check for the error flag IFF N == 0, so that we don't + exit the loop after a partial read due to e.g., EAGAIN + or EWOULDBLOCK. */ + if (ferror (stream)) + return 1; + goto process_partial_block; + } + + /* We've read at least one byte, so ignore errors. But always + check for EOF, since feof may be true even though N > 0. 
+ Otherwise, we could end up calling fread after EOF. */ + if (feof (stream)) + goto process_partial_block; + } + + /* Process buffer with BLOCKSIZE bytes. Note that + BLOCKSIZE % 64 == 0 + */ + process_block (buffer, BLOCKSIZE); + } + +process_partial_block: + + /* Process any remaining bytes. */ + if (sum > 0) + process_bytes (buffer, sum); + + return 0; +} + diff --git a/LTA/LTAIngest/md5adler/adler32.c b/LTA/LTAIngest/md5adler/adler32.c new file mode 100644 index 0000000000000000000000000000000000000000..6c5894add0691c1a64c088d95a1878dbdf1e22fa --- /dev/null +++ b/LTA/LTAIngest/md5adler/adler32.c @@ -0,0 +1,152 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2004 Mark Adler + * Minor adaptations by Henk Kloepping + */ + +/* on a 64 bit box: */ +#define uInt unsigned int /* 16 bits or more */ +#define uLong unsigned int /* 32 bits or more */ +#define Byte unsigned char /* 8 bits or more */ +#define Bytef unsigned char /* 8 bits or more */ +#define Z_NULL (Bytef) 0 /* empty buffer */ +#define z_off_t unsigned long /* offset in buffer */ + +#define BASE 65521UL /* largest prime smaller than 65536 */ +#define NMAX 5552 +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ + +#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;} +#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); +#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); +#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); +#define DO16(buf) DO8(buf,0); DO8(buf,8); + +/* use NO_DIVIDE if your processor does not do division in hardware */ +#ifdef NO_DIVIDE +# define MOD(a) \ + do { \ + if (a >= (BASE << 16)) a -= (BASE << 16); \ + if (a >= (BASE << 15)) a -= (BASE << 15); \ + if (a >= (BASE << 14)) a -= (BASE << 14); \ + if (a >= (BASE << 13)) a -= (BASE << 13); \ + if (a >= (BASE << 12)) a -= (BASE << 12); \ + if (a >= (BASE << 11)) a -= (BASE << 11); \ + if (a >= (BASE << 10)) a -= (BASE << 10); \ + if (a >= (BASE << 9)) a -= (BASE << 9); \ + if (a >= (BASE 
<< 8)) a -= (BASE << 8); \ + if (a >= (BASE << 7)) a -= (BASE << 7); \ + if (a >= (BASE << 6)) a -= (BASE << 6); \ + if (a >= (BASE << 5)) a -= (BASE << 5); \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +# define MOD4(a) \ + do { \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +#else +# define MOD(a) a %= BASE +# define MOD4(a) a %= BASE +#endif + +/* ========================================================================= */ +uLong adler32(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned long sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) { + adler += buf[0]; + if (adler >= BASE) + adler -= BASE; + sum2 += adler; + if (sum2 >= BASE) + sum2 -= BASE; + return adler | (sum2 << 16); + } + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == Z_NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) { + while (len--) { + adler += *buf++; + sum2 += adler; + } + if (adler >= BASE) + adler -= BASE; + MOD4(sum2); /* only added so many BASE's */ + return adler | (sum2 << 16); + } + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; /* NMAX is divisible by 16 */ + do { + DO16(buf); /* 16 sums unrolled */ + buf += 16; + } while (--n); + MOD(adler); + MOD(sum2); + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + if (len) { /* avoid modulos if none remaining */ + while (len >= 16) { + len 
-= 16; + DO16(buf); + buf += 16; + } + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + } + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +/* ========================================================================= */ +uLong adler32_combine(adler1, adler2, len2) + uLong adler1; + uLong adler2; + z_off_t len2; +{ + unsigned long sum1; + unsigned long sum2; + unsigned rem; + + /* the derivation of this formula is left as an exercise for the reader */ + rem = (unsigned)(len2 % BASE); + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; + MOD(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 > BASE) sum1 -= BASE; + if (sum1 > BASE) sum1 -= BASE; + if (sum2 > (BASE << 1)) sum2 -= (BASE << 1); + if (sum2 > BASE) sum2 -= BASE; + return sum1 | (sum2 << 16); +} diff --git a/LTA/LTAIngest/md5adler/adler32/adler32.c b/LTA/LTAIngest/md5adler/adler32/adler32.c new file mode 100644 index 0000000000000000000000000000000000000000..007ba26277c8470d897faa87b7b9fb4b5d15e606 --- /dev/null +++ b/LTA/LTAIngest/md5adler/adler32/adler32.c @@ -0,0 +1,149 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#define ZLIB_INTERNAL +#include "zlib.h" + +#define BASE 65521UL /* largest prime smaller than 65536 */ +#define NMAX 5552 +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ + +#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;} +#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); +#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); +#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); +#define DO16(buf) DO8(buf,0); DO8(buf,8); + +/* use NO_DIVIDE if your processor does not do division in hardware */ +#ifdef NO_DIVIDE +# define MOD(a) \ + do { \ + if (a >= (BASE << 16)) a -= (BASE << 16); \ + 
if (a >= (BASE << 15)) a -= (BASE << 15); \ + if (a >= (BASE << 14)) a -= (BASE << 14); \ + if (a >= (BASE << 13)) a -= (BASE << 13); \ + if (a >= (BASE << 12)) a -= (BASE << 12); \ + if (a >= (BASE << 11)) a -= (BASE << 11); \ + if (a >= (BASE << 10)) a -= (BASE << 10); \ + if (a >= (BASE << 9)) a -= (BASE << 9); \ + if (a >= (BASE << 8)) a -= (BASE << 8); \ + if (a >= (BASE << 7)) a -= (BASE << 7); \ + if (a >= (BASE << 6)) a -= (BASE << 6); \ + if (a >= (BASE << 5)) a -= (BASE << 5); \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +# define MOD4(a) \ + do { \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +#else +# define MOD(a) a %= BASE +# define MOD4(a) a %= BASE +#endif + +/* ========================================================================= */ +uLong ZEXPORT adler32(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned long sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) { + adler += buf[0]; + if (adler >= BASE) + adler -= BASE; + sum2 += adler; + if (sum2 >= BASE) + sum2 -= BASE; + return adler | (sum2 << 16); + } + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == Z_NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) { + while (len--) { + adler += *buf++; + sum2 += adler; + } + if (adler >= BASE) + adler -= BASE; + MOD4(sum2); /* only added so many BASE's */ + return adler | (sum2 << 16); + } + + /* do length NMAX blocks -- requires just one modulo 
operation */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; /* NMAX is divisible by 16 */ + do { + DO16(buf); /* 16 sums unrolled */ + buf += 16; + } while (--n); + MOD(adler); + MOD(sum2); + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + if (len) { /* avoid modulos if none remaining */ + while (len >= 16) { + len -= 16; + DO16(buf); + buf += 16; + } + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + } + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +/* ========================================================================= */ +uLong ZEXPORT adler32_combine(adler1, adler2, len2) + uLong adler1; + uLong adler2; + z_off_t len2; +{ + unsigned long sum1; + unsigned long sum2; + unsigned rem; + + /* the derivation of this formula is left as an exercise for the reader */ + rem = (unsigned)(len2 % BASE); + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; + MOD(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 > BASE) sum1 -= BASE; + if (sum1 > BASE) sum1 -= BASE; + if (sum2 > (BASE << 1)) sum2 -= (BASE << 1); + if (sum2 > BASE) sum2 -= BASE; + return sum1 | (sum2 << 16); +} diff --git a/LTA/LTAIngest/md5adler/adler32/zlib.h b/LTA/LTAIngest/md5adler/adler32/zlib.h new file mode 100644 index 0000000000000000000000000000000000000000..022817927ce3d6b1abe5ac57bff70e7de5291ae0 --- /dev/null +++ b/LTA/LTAIngest/md5adler/adler32/zlib.h @@ -0,0 +1,1357 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. 
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.3" +#define ZLIB_VERNUM 0x1230 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. This version of the library supports only one compression method + (deflation) but other algorithms will be added later and will have the same + stream interface. + + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. 
+ + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never + crash even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to 
and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. +*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). 
+ + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce some + output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). 
If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumualte before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In particular + avail_in is zero after the call if enough output space has been provided + before the call.) Flushing may degrade compression for some compression + algorithms and so it should be used only when necessary. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd. 
+ + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + the value returned by deflateBound (see below). If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. 
If next_in is not Z_NULL and avail_in is large enough (the exact + value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller. msg is set to null if there is no error + message. inflateInit does not perform any decompression apart from reading + the zlib header if present: this will be done by inflate(). (So next_in and + avail_in may be modified, but next_out and avail_out are unchanged.) +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). 
+ + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. 
+ + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster approach + may be used for the single inflate() call. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. 
Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. 
The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), + no header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. 
In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as + Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy + parameter only affects the compression ratio but not the correctness of the + compressed output even if it is not set appropriately. Z_FIXED prevents the + use of dynamic Huffman codes, allowing for a simpler decoder for special + applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid + method). msg is set to null if there is no error message. deflateInit2 does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. This function must be called + immediately after deflateInit, deflateInit2 or deflateReset, before any + call of deflate. The compressor and decompressor must use exactly the same + dictionary (see inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. 
+ + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size in + deflate or deflate2. Thus the strings most likely to be useful should be + put at the end of the dictionary, not at the front. In addition, the + current implementation of deflate will use at most the window size minus + 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if the compression method is bsort). deflateSetDictionary does not + perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory. 
+ + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different + strategy. If the compression level is changed, the input available so far + is compressed with the old level (and may be flushed); the new level will + take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. 
This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() + or deflateInit2(). This would be used to allocate an output buffer + for deflation in a single pass, and so would be called before deflate(). +*/ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the + bits leftover from a previous deflate stream when appending to it. As such, + this function can only be used for raw deflate, and must be used before the + first deflate() call after a deflateInit2() or deflateReset(). bits must be + less than or equal to 16, and that many of the least significant bits of + value will be inserted in the output. + + deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). 
The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. 
inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is + a crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg + is set to null if there is no error message. inflateInit2 does not perform + any decompression apart from reading the zlib header if present: this will + be done by inflate(). (So next_in and avail_in may be modified, but next_out + and avail_out are unchanged.) +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. 
+ The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called + immediately after inflateInit2() or inflateReset() and before any call of + inflate() to set the dictionary. The application must insure that the + dictionary that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a full flush point (see above the + description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR + if no more input was provided, Z_DATA_ERROR if no flush point has been found, + or Z_STREAM_ERROR if the stream structure was inconsistent. In the success + case, the application may save the current current value of total_in which + indicates where valid compressed data was found. In the error case, the + application may repeatedly call inflateSync, providing more input each time, + until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream. 
+ + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. + The stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. 
Note that Z_BLOCK can be used to + force inflate() to return immediately after header processing is complete + and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When + any of extra, name, or comment are not Z_NULL and the respective field is + not present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. 
The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the paramaters are invalid, Z_MEM_ERROR if the internal state could not + be allocated, or Z_VERSION_ERROR if the version of the library does not + match the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is more efficient than inflate() for + file i/o applications in that it avoids copying between the output and the + sliding window by simply making the window itself the output buffer. This + function trusts the application to not change the output buffer passed by + the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free + the allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer. 
+ This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects + only the raw deflate stream to decompress. This is different from the + normal behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. 
If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() is passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format + error in the deflate stream (in which case strm->msg is set to indicate the + nature of the error), or Z_STREAM_ERROR if the stream was not properly + initialized. In the case of Z_BUF_ERROR, an input or output error can be + distinguished using strm->next_in which will be Z_NULL only if in() returned + an error. If strm->next is not Z_NULL, then the Z_BUF_ERROR was due to + out() returning non-zero. (in() will always be called before out(), so + strm->next_in is assured to be defined if out() returns non-zero.) Note + that inflateBack() cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options. 
+ + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + + + /* utility functions */ + +/* + The following utility functions are implemented on top of the + basic stream-oriented functions. To simplify the interface, some + default options are assumed (compression level and memory usage, + standard memory allocation functions). The source code of these + utility functions can easily be modified if you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. 
sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least the value returned + by compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before + a compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. 
(The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the compressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. +*/ + + +typedef voidp gzFile; + +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); +/* + Opens a gzip (.gz) file for reading or writing. The mode parameter + is as in fopen ("rb" or "wb") but can also include a compression level + ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for + Huffman only compression as in "wb1h", or 'R' for run-length encoding + as in "wb1R". (See the description of deflateInit2 for more information + about the strategy parameter.) + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. + + gzopen returns NULL if the file could not be opened or if there was + insufficient memory to allocate the (de)compression state; errno + can be checked to distinguish the two cases (if errno is zero, the + zlib error is Z_MEM_ERROR). */ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen() associates a gzFile with the file descriptor fd. File + descriptors are obtained from calls like open, dup, creat, pipe or + fileno (in the file has been previously opened with fopen). + The mode parameter is as in gzopen. + The next call of gzclose on the returned gzFile will also close the + file descriptor fd, just like fclose(fdopen(fd), mode) closes the file + descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). 
+ gzdopen returns NULL if there was insufficient memory to allocate + the (de)compression state. +*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. See the description + of deflateInit2 for the meaning of these parameters. + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. + If the input file was not in gzip format, gzread copies the given number + of bytes into the buffer. + gzread returns the number of uncompressed bytes actually read (0 for + end of file, -1 for error). */ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes actually written + (0 in case of error). +*/ + +ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the args to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written (0 in case of error). The number of + uncompressed bytes written is limited to 4095. The caller should assure that + this limit is not exceeded. If it is exceeded, then gzprintf() will return + return an error (0) with nothing written. In this case, there may also be a + buffer overflow with unpredictable consequences, which is possible only if + zlib was compiled with the insecure functions sprintf() or vsprintf() + because the secure snprintf() or vsnprintf() functions were not available. +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character. 
+ gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or + a newline character is read and transferred to buf, or an end-of-file + condition is encountered. The string is then terminated with a null + character. + gzgets returns buf, or Z_NULL in case of error. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. + gzputc returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte + or -1 in case of end of file or error. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read again later. + Only one character of push-back is allowed. gzungetc() returns the + character pushed, or -1 on failure. gzungetc() will fail if a + character has been pushed but not read yet, or if c is -1. The pushed + character will be discarded if the stream is repositioned with gzseek() + or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); +/* + Sets the starting position for the next gzread or gzwrite on the + given compressed file. The offset represents a number of bytes in the + uncompressed data stream. 
The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); +/* + Returns the starting position for the next gzread or gzwrite on the + given compressed file. This position represents a number of bytes in the + uncompressed data stream. + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns 1 when EOF has previously been detected reading the given + input stream, otherwise zero. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns 1 if file is being read directly without decompression, otherwise + zero. +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. 
If an + error occurred in the file system and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the + compression library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is NULL, this function returns + the required initial value for the checksum. + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); +/* + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is NULL, this function returns the required initial + value for the for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application. 
+ Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + +/* + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, sizeof(z_stream)) + 
+ +#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; /* hack for buggy compilers */ +#endif + +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); +ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZLIB_H */ diff --git a/LTA/LTAIngest/md5adler/foo b/LTA/LTAIngest/md5adler/foo new file mode 100644 index 0000000000000000000000000000000000000000..121eab03772e6fb6cc03880c3424ffdee4b03df5 --- /dev/null +++ b/LTA/LTAIngest/md5adler/foo @@ -0,0 +1 @@ +Wikipedia \ No newline at end of file diff --git a/LTA/LTAIngest/md5adler/makefile b/LTA/LTAIngest/md5adler/makefile new file mode 100644 index 0000000000000000000000000000000000000000..1881dac90294337979667819652efac72b55ea0a --- /dev/null +++ b/LTA/LTAIngest/md5adler/makefile @@ -0,0 +1,18 @@ +#CFLAGS=-O3 -mfpmath=sse,387 -march=nocona + +CFLAGS=-O3 + +all: md5a32 a32 + +md5a32: md5a32.o adler32.o + +md5a32.o: md5.h + +adler32: adler32.c adler32.h + gcc -c adler32.c -o adler32.o + +a32: a32.o adler32.o + +clean: + rm -f adler32.o md5a32.o a32.o md5a32 a32 + diff --git a/LTA/LTAIngest/md5adler/md5.h b/LTA/LTAIngest/md5adler/md5.h new file mode 100644 index 0000000000000000000000000000000000000000..a45ba7ae955b4dc3f444389aba21165ccb804ae4 --- /dev/null +++ b/LTA/LTAIngest/md5adler/md5.h @@ -0,0 +1,62 @@ +/* + ********************************************************************** + ** md5.h -- Header file for implementation of MD5 ** + ** RSA Data Security, Inc. 
MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version ** + ** Revised (for MD5): RLR 4/27/91 ** + ** -- G modified to have y&~z instead of y&z ** + ** -- FF, GG, HH modified to add in last register done ** + ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 ** + ** -- distinct additive constant for each step ** + ** -- round 4 added, working mod 7 ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. 
** + ********************************************************************** + */ + +#define BLOCKSIZE 4069 +/* typedef a 32 bit type */ +/* typedef unsigned long int UINT4; */ +typedef unsigned int UINT4; + +/* Data structure for MD5 (Message Digest) computation */ +typedef struct { + unsigned int adler32; /* adler 32 crc */ + UINT4 i[2]; /* number of _bits_ handled mod 2^64 */ + UINT4 buf[4]; /* scratch buffer */ + unsigned char in[64]; /* input buffer */ + unsigned char digest[16]; /* actual digest after MD5Final call */ +} MD5_CTX; + +void MD5Init (); +void MD5Update (); +void MD5Final (); + +/* + ********************************************************************** + ** End of md5.h ** + ******************************* (cut) ******************************** + */ diff --git a/LTA/LTAIngest/md5adler/md5a32 b/LTA/LTAIngest/md5adler/md5a32 new file mode 100755 index 0000000000000000000000000000000000000000..80e129514344734235beaa0b47f9ac464fe9e820 Binary files /dev/null and b/LTA/LTAIngest/md5adler/md5a32 differ diff --git a/LTA/LTAIngest/md5adler/md5a32.c b/LTA/LTAIngest/md5adler/md5a32.c new file mode 100644 index 0000000000000000000000000000000000000000..cd3ca5891dee408880f9929e2497e38b594b45e7 --- /dev/null +++ b/LTA/LTAIngest/md5adler/md5a32.c @@ -0,0 +1,496 @@ +/* + ********************************************************************** + ** md5.c ** + ** RSA Data Security, Inc. MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. 
MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. ** + ********************************************************************** + */ + +/* -- include the following line if the md5.h header file is separate -- */ +#include <stdio.h> +#include "md5.h" + +/* forward declaration */ +static void Transform (); + +static unsigned char PADDING[64] = { + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* F, G and H are basic MD5 functions: selection, majority, parity */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, 
c, d, x, s, ac) \ + {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) \ + {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) \ + {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) \ + {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +void MD5Init (mdContext) +MD5_CTX *mdContext; +{ + mdContext->i[0] = mdContext->i[1] = (UINT4)0; + + /* Load magic initialization constants. + */ + mdContext->buf[0] = (UINT4)0x67452301; + mdContext->buf[1] = (UINT4)0xefcdab89; + mdContext->buf[2] = (UINT4)0x98badcfe; + mdContext->buf[3] = (UINT4)0x10325476; + + /* Set adler32 init value to one + */ + mdContext->adler32 = 1; +} + +A32MD5Update(mdContext, inBuf, inLen) +MD5_CTX *mdContext; +unsigned char *inBuf; +unsigned int inLen; +{ + /* calc adler32, then do regular MD5Update + */ + mdContext->adler32=adler32(mdContext->adler32,inBuf,inLen); + MD5Update(mdContext,inBuf,inLen); +} + +void MD5Update (mdContext, inBuf, inLen) +MD5_CTX *mdContext; +unsigned char *inBuf; +unsigned int inLen; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* update number of bits */ + if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0]) + mdContext->i[1]++; + mdContext->i[0] += ((UINT4)inLen << 3); + mdContext->i[1] += ((UINT4)inLen >> 29); + + while (inLen--) { + /* add new character to buffer, increment mdi */ + mdContext->in[mdi++] = *inBuf++; + + /* transform if necessary */ + if (mdi == 0x40) { + for (i = 0, ii = 0; i < 16; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + 
((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + mdi = 0; + } + } + +} + +void MD5Final (mdContext) +MD5_CTX *mdContext; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + unsigned int padLen; + + /* save number of bits */ + in[14] = mdContext->i[0]; + in[15] = mdContext->i[1]; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* pad out to 56 mod 64 */ + padLen = (mdi < 56) ? (56 - mdi) : (120 - mdi); + MD5Update (mdContext, PADDING, padLen); + + /* append length in bits and transform */ + for (i = 0, ii = 0; i < 14; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + ((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + + /* store buffer in digest */ + for (i = 0, ii = 0; i < 4; i++, ii += 4) { + mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF); + mdContext->digest[ii+1] = + (unsigned char)((mdContext->buf[i] >> 8) & 0xFF); + mdContext->digest[ii+2] = + (unsigned char)((mdContext->buf[i] >> 16) & 0xFF); + mdContext->digest[ii+3] = + (unsigned char)((mdContext->buf[i] >> 24) & 0xFF); + } +} + +/* Basic MD5 step. Transform buf based on in. 
+ */ +static void Transform (buf, in) +UINT4 *buf; +UINT4 *in; +{ + UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF ( a, b, c, d, in[ 0], S11, 3614090360); /* 1 */ + FF ( d, a, b, c, in[ 1], S12, 3905402710); /* 2 */ + FF ( c, d, a, b, in[ 2], S13, 606105819); /* 3 */ + FF ( b, c, d, a, in[ 3], S14, 3250441966); /* 4 */ + FF ( a, b, c, d, in[ 4], S11, 4118548399); /* 5 */ + FF ( d, a, b, c, in[ 5], S12, 1200080426); /* 6 */ + FF ( c, d, a, b, in[ 6], S13, 2821735955); /* 7 */ + FF ( b, c, d, a, in[ 7], S14, 4249261313); /* 8 */ + FF ( a, b, c, d, in[ 8], S11, 1770035416); /* 9 */ + FF ( d, a, b, c, in[ 9], S12, 2336552879); /* 10 */ + FF ( c, d, a, b, in[10], S13, 4294925233); /* 11 */ + FF ( b, c, d, a, in[11], S14, 2304563134); /* 12 */ + FF ( a, b, c, d, in[12], S11, 1804603682); /* 13 */ + FF ( d, a, b, c, in[13], S12, 4254626195); /* 14 */ + FF ( c, d, a, b, in[14], S13, 2792965006); /* 15 */ + FF ( b, c, d, a, in[15], S14, 1236535329); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG ( a, b, c, d, in[ 1], S21, 4129170786); /* 17 */ + GG ( d, a, b, c, in[ 6], S22, 3225465664); /* 18 */ + GG ( c, d, a, b, in[11], S23, 643717713); /* 19 */ + GG ( b, c, d, a, in[ 0], S24, 3921069994); /* 20 */ + GG ( a, b, c, d, in[ 5], S21, 3593408605); /* 21 */ + GG ( d, a, b, c, in[10], S22, 38016083); /* 22 */ + GG ( c, d, a, b, in[15], S23, 3634488961); /* 23 */ + GG ( b, c, d, a, in[ 4], S24, 3889429448); /* 24 */ + GG ( a, b, c, d, in[ 9], S21, 568446438); /* 25 */ + GG ( d, a, b, c, in[14], S22, 3275163606); /* 26 */ + GG ( c, d, a, b, in[ 3], S23, 4107603335); /* 27 */ + GG ( b, c, d, a, in[ 8], S24, 1163531501); /* 28 */ + GG ( a, b, c, d, in[13], S21, 2850285829); /* 29 */ + GG ( d, a, b, c, in[ 2], S22, 4243563512); /* 30 */ + GG ( c, d, a, b, in[ 7], S23, 1735328473); /* 31 */ + GG ( b, c, d, a, in[12], S24, 2368359562); /* 32 */ + 
+ /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH ( a, b, c, d, in[ 5], S31, 4294588738); /* 33 */ + HH ( d, a, b, c, in[ 8], S32, 2272392833); /* 34 */ + HH ( c, d, a, b, in[11], S33, 1839030562); /* 35 */ + HH ( b, c, d, a, in[14], S34, 4259657740); /* 36 */ + HH ( a, b, c, d, in[ 1], S31, 2763975236); /* 37 */ + HH ( d, a, b, c, in[ 4], S32, 1272893353); /* 38 */ + HH ( c, d, a, b, in[ 7], S33, 4139469664); /* 39 */ + HH ( b, c, d, a, in[10], S34, 3200236656); /* 40 */ + HH ( a, b, c, d, in[13], S31, 681279174); /* 41 */ + HH ( d, a, b, c, in[ 0], S32, 3936430074); /* 42 */ + HH ( c, d, a, b, in[ 3], S33, 3572445317); /* 43 */ + HH ( b, c, d, a, in[ 6], S34, 76029189); /* 44 */ + HH ( a, b, c, d, in[ 9], S31, 3654602809); /* 45 */ + HH ( d, a, b, c, in[12], S32, 3873151461); /* 46 */ + HH ( c, d, a, b, in[15], S33, 530742520); /* 47 */ + HH ( b, c, d, a, in[ 2], S34, 3299628645); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II ( a, b, c, d, in[ 0], S41, 4096336452); /* 49 */ + II ( d, a, b, c, in[ 7], S42, 1126891415); /* 50 */ + II ( c, d, a, b, in[14], S43, 2878612391); /* 51 */ + II ( b, c, d, a, in[ 5], S44, 4237533241); /* 52 */ + II ( a, b, c, d, in[12], S41, 1700485571); /* 53 */ + II ( d, a, b, c, in[ 3], S42, 2399980690); /* 54 */ + II ( c, d, a, b, in[10], S43, 4293915773); /* 55 */ + II ( b, c, d, a, in[ 1], S44, 2240044497); /* 56 */ + II ( a, b, c, d, in[ 8], S41, 1873313359); /* 57 */ + II ( d, a, b, c, in[15], S42, 4264355552); /* 58 */ + II ( c, d, a, b, in[ 6], S43, 2734768916); /* 59 */ + II ( b, c, d, a, in[13], S44, 1309151649); /* 60 */ + II ( a, b, c, d, in[ 4], S41, 4149444226); /* 61 */ + II ( d, a, b, c, in[11], S42, 3174756917); /* 62 */ + II ( c, d, a, b, in[ 2], S43, 718787259); /* 63 */ + II ( b, c, d, a, in[ 9], S44, 3951481745); /* 64 */ + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +/* + 
********************************************************************** + ** End of md5.c ** + ******************************* (cut) ******************************** + */ + +/* + ********************************************************************** + ** md5driver.c -- sample routines to test ** + ** RSA Data Security, Inc. MD5 message digest algorithm. ** + ** Created: 2/16/90 RLR ** + ** Updated: 1/91 SRD ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. ** + ********************************************************************** + */ + +#include <stdio.h> +#include <sys/types.h> +#include <time.h> +#include <string.h> +/* -- include the following file if the file md5.h is separate -- */ +/* #include "md5.h" */ + +/* Prints message digest buffer in mdContext as 32 hexadecimal digits. + Order is from low-order byte to high-order byte of digest. + Each byte is printed with high-order hexadecimal digit first. + */ +static void MDPrint (mdContext) +MD5_CTX *mdContext; +{ + int i; + + for (i = 0; i < 16; i++) + printf ("%02x", mdContext->digest[i]); +} + +/* size of test block */ +#define TEST_BLOCK_SIZE 1000 + +/* number of blocks to process */ +#define TEST_BLOCKS 10000 + +/* number of test bytes = TEST_BLOCK_SIZE * TEST_BLOCKS */ +static long TEST_BYTES = (long)TEST_BLOCK_SIZE * (long)TEST_BLOCKS; + +/* A time trial routine, to measure the speed of MD5. 
+ Measures wall time required to digest TEST_BLOCKS * TEST_BLOCK_SIZE + characters. + */ +static void MDTimeTrial () +{ + MD5_CTX mdContext; + time_t endTime, startTime; + unsigned char data[TEST_BLOCK_SIZE]; + unsigned int i; + + /* initialize test data */ + for (i = 0; i < TEST_BLOCK_SIZE; i++) + data[i] = (unsigned char)(i & 0xFF); + + /* start timer */ + printf ("MD5 time trial. Processing %ld characters...\n", TEST_BYTES); + time (&startTime); + + /* digest data in TEST_BLOCK_SIZE byte blocks */ + MD5Init (&mdContext); + for (i = TEST_BLOCKS; i > 0; i--) + MD5Update (&mdContext, data, TEST_BLOCK_SIZE); + MD5Final (&mdContext); + + /* stop timer, get time difference */ + time (&endTime); + MDPrint (&mdContext); + printf (" is digest of test input.\n"); + printf + ("Seconds to process test input: %ld\n", (long)(endTime-startTime)); + printf + ("Characters processed per second: %ld\n", + TEST_BYTES/(endTime-startTime)); +} + +/* Computes the message digest for string inString. + Prints out message digest, a space, the string (in quotes) and a + carriage return. + */ +static void MDString (inString) +char *inString; +{ + MD5_CTX mdContext; + unsigned int len = strlen (inString); + + MD5Init (&mdContext); + MD5Update (&mdContext, inString, len); + MD5Final (&mdContext); + MDPrint (&mdContext); + printf (" \"%s\"\n\n", inString); +} + +/* Computes the message digest for a specified file. + Prints out message digest, a space, the file name, and a carriage + return. 
+ */ +static void MDFile (filename) +char *filename; +{ + FILE *inFile = fopen (filename, "rb"); + MD5_CTX mdContext; + int bytes; + unsigned char data[BLOCKSIZE]; + + if (inFile == NULL) { + printf ("%s can't be opened.\n", filename); + return; + } + + MD5Init (&mdContext); + while ((bytes = fread (data, 1, BLOCKSIZE, inFile)) != 0) + A32MD5Update (&mdContext, data, bytes); + + MD5Final (&mdContext); + MDPrint (&mdContext); + printf (" %s %x\n", filename, mdContext.adler32); + fclose (inFile); +} + +/* Writes the message digest of the data from stdin onto stdout, + followed by a carriage return. + */ +static void MDFilter () +{ + MD5_CTX mdContext; + int bytes; + unsigned char data[16]; + + MD5Init (&mdContext); + while ((bytes = fread (data, 1, 16, stdin)) != 0) + MD5Update (&mdContext, data, bytes); + MD5Final (&mdContext); + MDPrint (&mdContext); + printf ("\n"); +} + +/* Runs a standard suite of test data. + */ +static void MDTestSuite () +{ + printf ("MD5 test suite results:\n\n"); + MDString (""); + MDString ("a"); + MDString ("abc"); + MDString ("message digest"); + MDString ("abcdefghijklmnopqrstuvwxyz"); + MDString + ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"); + MDString + ("1234567890123456789012345678901234567890\ +1234567890123456789012345678901234567890"); + /* Contents of file foo are "abc" */ + MDFile ("/etc/passwd"); +} + +int main (argc, argv) +int argc; +char *argv[]; +{ + int i; + + /* For each command line argument in turn: + ** filename -- prints message digest and name of file + ** -sstring -- prints message digest and contents of string + ** -t -- prints time trial statistics for 1M characters + ** -x -- execute a standard suite of test data + ** (no args) -- writes messages digest of stdin onto stdout + */ + if (argc == 1) + MDFilter (); + else + for (i = 1; i < argc; i++) + if (argv[i][0] == '-' && argv[i][1] == 's') + MDString (argv[i] + 2); + else if (strcmp (argv[i], "-t") == 0) + MDTimeTrial (); + else if 
(strcmp (argv[i], "-x") == 0) + MDTestSuite (); + else MDFile (argv[i]); + + return(0); +} + +/* + ********************************************************************** + ** End of md5.c ** + ******************************* (cut) ******************************** + */ diff --git a/LTA/LTAIngest/mechanize-0.2.5/COPYING.txt b/LTA/LTAIngest/mechanize-0.2.5/COPYING.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8bf1ba319eba5043b3fda8aeadc6bc2ce4b326 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/COPYING.txt @@ -0,0 +1,101 @@ +All the code with the exception of _gzip.py is covered under either +the BSD-style license immediately below, or (at your choice) the ZPL +2.1. The code in _gzip.py is taken from the effbot.org library, and +falls under the effbot.org license (also BSD-style) that appears at +the end of this file. + + +Copyright (c) 2002-2010 John J. Lee <jjl@pobox.com> +Copyright (c) 1997-1999 Gisle Aas +Copyright (c) 1997-1999 Johnny Lee +Copyright (c) 2003 Andy Lester + + +BSD-style License +================== + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +Neither the name of the contributors nor the names of their employers +may be used to endorse or promote products derived from this software +without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + + + +ZPL 2.1 +================== + +Zope Public License (ZPL) Version 2.1 + +A copyright notice accompanies this license document that identifies the copyright holders. + +This license has been certified as open source. It has also been designated as GPL compatible by the Free Software Foundation (FSF). + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + 1. Redistributions in source code must retain the accompanying copyright notice, this list of conditions, and the following disclaimer. + 2. Redistributions in binary form must reproduce the accompanying copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. + 3. Names of the copyright holders must not be used to endorse or promote products derived from this software without prior written permission from the copyright holders. + 4. The right to distribute this software or to use it for any purpose does not give you the right to use Servicemarks (sm) or Trademarks (tm) of the copyright holders. Use of them is covered by separate agreement with the copyright holders. 
+ 5. If any files are modified, you must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. + +Disclaimer + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + + + +-------------------------------------------------------------------- +The effbot.org Library is + +Copyright (c) 1999-2004 by Secret Labs AB +Copyright (c) 1999-2004 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its +associated documentation, you agree that you have read, understood, +and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all +copies, and that both that copyright notice and this permission notice +appear in supporting documentation, and that the name of Secret Labs +AB or the author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +-------------------------------------------------------------------- diff --git a/LTA/LTAIngest/mechanize-0.2.5/INSTALL.txt b/LTA/LTAIngest/mechanize-0.2.5/INSTALL.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0d839df9c6cceae8683120aa816bcbc2f7211cf --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/INSTALL.txt @@ -0,0 +1,19 @@ +To install mechanize: + +See the web page for the version of Python required (included here as +docs/html/index.html). + +To install the package, run the following command: + + python setup.py install + + +Alternatively, just copy the whole mechanize directory into a directory on +your Python path (e.g. unix: /usr/local/lib/python2.7/site-packages, +Windows: C:\Python27\Lib\site-packages). Only copy the mechanize directory +that's inside the distributed tarball / zip archive, not the entire +mechanize-x.x.x directory! + + +John J. 
Lee <jjl@pobox.com> +July 2010 diff --git a/LTA/LTAIngest/mechanize-0.2.5/MANIFEST.in b/LTA/LTAIngest/mechanize-0.2.5/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..0f1edb682b7fe8476952efda1cee845556a5ec3d --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/MANIFEST.in @@ -0,0 +1,12 @@ +include COPYING.txt +include INSTALL.txt +include MANIFEST.in +include README.txt +include *.py +recursive-include examples *.py +recursive-include examples/forms *.dat *.txt *.html *.cgi *.py +recursive-include test/functional_tests_golden output +recursive-include test/test_form_data *.html +recursive-include test *.py *.doctest *.special_doctest +recursive-include test-tools *.py *.cgi +recursive-include docs *.txt *.html *.css *.js diff --git a/LTA/LTAIngest/mechanize-0.2.5/PKG-INFO b/LTA/LTAIngest/mechanize-0.2.5/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..d4fd759ef21020a02b0110d1de9c10b88d8110d4 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/PKG-INFO @@ -0,0 +1,61 @@ +Metadata-Version: 1.0 +Name: mechanize +Version: 0.2.5 +Summary: Stateful programmatic web browsing. +Home-page: http://wwwsearch.sourceforge.net/mechanize/ +Author: John J. Lee +Author-email: jjl@pobox.com +License: BSD +Download-URL: http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.tar.gz +Description: Stateful programmatic web browsing, after Andy Lester's Perl module + WWW::Mechanize. + + mechanize.Browser implements the urllib2.OpenerDirector interface. Browser + objects have state, including navigation history, HTML form state, cookies, + etc. The set of features and URL schemes handled by Browser objects is + configurable. The library also provides an API that is mostly compatible with + urllib2: your urllib2 program will likely still work if you replace "urllib2" + with "mechanize" everywhere. 
+ + Features include: ftp:, http: and file: URL schemes, browser history, hyperlink + and HTML form support, HTTP cookies, HTTP-EQUIV and Refresh, Referer [sic] + header, robots.txt, redirections, proxies, and Basic and Digest HTTP + authentication. + + Much of the code originally derived from Perl code by Gisle Aas (libwww-perl), + Johnny Lee (MSIE Cookie support) and last but not least Andy Lester + (WWW::Mechanize). urllib2 was written by Jeremy Hylton. + + +Platform: any +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: License :: OSI Approved :: Zope Public License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.4 +Classifier: Programming Language :: Python :: 2.5 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Topic :: Internet +Classifier: Topic :: Internet :: File Transfer Protocol (FTP) +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Internet :: WWW/HTTP :: Browsers +Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Software Development :: Testing +Classifier: Topic :: Software Development :: Testing :: Traffic Generation +Classifier: Topic :: System :: Archiving :: Mirroring +Classifier: Topic :: System :: Networking :: Monitoring +Classifier: Topic :: System :: Systems Administration +Classifier: Topic :: Text Processing 
+Classifier: Topic :: Text Processing :: Markup +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML diff --git a/LTA/LTAIngest/mechanize-0.2.5/README.txt b/LTA/LTAIngest/mechanize-0.2.5/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..70a064746909feb2c73e34c4459879d89ce182ce --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/README.txt @@ -0,0 +1,7 @@ +See INSTALL.txt for installation instructions. + +See docs/html/index.html and docstrings for documentation. + +If you have a git working tree rather than a release, you'll only have +the markdown source, e.g. mechanize/index.txt; release.py is used to +build the HTML docs. diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/development.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/development.txt new file mode 100644 index 0000000000000000000000000000000000000000..92c8f9661effc14af3d78e72d37d7c929fd753bd --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/development.txt @@ -0,0 +1,47 @@ +% mechanize -- Development + +git repository +-------------- + +The [git](http://git-scm.com/) repository is +[here](http://github.com/jjlee/mechanize). To check it out: + + `git clone git://github.com/jjlee/mechanize.git` + +There is also [another +repository](http://github.com/jjlee/mechanize-build-tools), which is only +useful for making mechanize releases: + + `git clone git://github.com/jjlee/mechanize-build-tools.git` + + +Old repository +-------------- + +The [old SVN repository](http://codespeak.net/svn/wwwsearch/) may be useful for +viewing ClientForm history. ClientForm used to be a dependency of mechanize, +but has been merged into mechanize as of release 0.2.0; the history wasn't +imported. To check out: + + `svn co http://codespeak.net/svn/wwwsearch/` + + +Bug tracker +----------- + +The bug tracker is [here on github](http://github.com/jjlee/mechanize/issues). 
+It's equally acceptable to file bugs on the tracker or post about them to the +[mailing list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). +Feel free to send patches too! + + +Mailing list +------------ + +There is a [mailing +list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/doc.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/doc.txt new file mode 100644 index 0000000000000000000000000000000000000000..93c40ad30d60ee752c9e709b4a36a14a1bd004fc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/doc.txt @@ -0,0 +1,524 @@ +% mechanize -- Documentation + +<span class="docwarning">This documentation is in need of +reorganisation!</span> + +This page is the old ClientCookie documentation. It deals with operation on +the level of `urllib2 Handler` objects, and also with adding headers, +debugging, and cookie handling. See the [front page](./) for more typical use. + + +Examples +-------- + +~~~~{.python} +import mechanize +response = mechanize.urlopen("http://example.com/") +~~~~ + +This function behaves identically to `urllib2.urlopen()`, except that it deals +with cookies automatically. 
+ +Here is a more complicated example, involving `Request` objects (useful if you +want to pass `Request`s around, add headers to them, etc.): + +~~~~{.python} +import mechanize +request = mechanize.Request("http://example.com/") +# note we're using the urlopen from mechanize, not urllib2 +response = mechanize.urlopen(request) +# let's say this next request requires a cookie that was set +# in response +request2 = mechanize.Request("http://example.com/spam.html") +response2 = mechanize.urlopen(request2) + +print response2.geturl() +print response2.info() # headers +print response2.read() # body (readline and readlines work too) +~~~~ + +In these examples, the workings are hidden inside the `mechanize.urlopen()` +function, which is an extension of `urllib2.urlopen()`. Redirects, proxies and +cookies are handled automatically by this function (note that you may need a +bit of configuration to get your proxies correctly set up: see `urllib2` +documentation). + +There is also a `urlretrieve()` function, which works like +`urllib.urlretrieve()`. + +An example at a slightly lower level shows how the module processes cookies +more clearly: + +~~~~{.python} +# Don't copy this blindly! You probably want to follow the examples +# above, not this one. +import mechanize + +# Build an opener that *doesn't* automatically call .add_cookie_header() +# and .extract_cookies(), so we can do it manually without interference. 
+class NullCookieProcessor(mechanize.HTTPCookieProcessor): + def http_request(self, request): return request + def http_response(self, request, response): return response +opener = mechanize.build_opener(NullCookieProcessor) + +request = mechanize.Request("http://example.com/") +response = mechanize.urlopen(request) +cj = mechanize.CookieJar() +cj.extract_cookies(response, request) +# let's say this next request requires a cookie that was set in response +request2 = mechanize.Request("http://example.com/spam.html") +cj.add_cookie_header(request2) +response2 = mechanize.urlopen(request2) +~~~~ + +The `CookieJar` class does all the work. There are essentially two operations: +`.extract_cookies()` extracts HTTP cookies from `Set-Cookie` (the original +[Netscape cookie standard](http://curl.haxx.se/rfc/cookie_spec.html)) and +`Set-Cookie2` ([RFC 2965](http://www.ietf.org/rfc/rfc2965.txt)) headers from a +response if and only if they should be set given the request, and +`.add_cookie_header()` adds `Cookie` headers if and only if they are +appropriate for a particular HTTP request. Incoming cookies are checked for +acceptability based on the host name, etc. Cookies are only set on outgoing +requests if they match the request's host name, path, etc. + +**Note that if you're using `mechanize.urlopen()` (or if you're using +`mechanize.HTTPCookieProcessor` by some other means), you don't need to call +`.extract_cookies()` or `.add_cookie_header()` yourself**. If, on the other +hand, you want to use mechanize to provide cookie handling for an HTTP client +other than mechanize itself, you will need to use this pair of methods. You +can make your own `request` and `response` objects, which must support the +interfaces described in the docstrings of `.extract_cookies()` and +`.add_cookie_header()`. + +There are also some `CookieJar` subclasses which can store cookies in files and +databases. 
`FileCookieJar` is the abstract class for `CookieJar`s that can +store cookies in disk files. `LWPCookieJar` saves cookies in a format +compatible with the libwww-perl library. This class is convenient if you want +to store cookies in a human-readable file: + +~~~~{.python} +import mechanize +cj = mechanize.LWPCookieJar() +cj.revert("cookie3.txt") +opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) +r = opener.open("http://foobar.com/") +cj.save("cookie3.txt") +~~~~ + +The `.revert()` method discards all existing cookies held by the `CookieJar` +(it won't lose any existing cookies if the load fails). The `.load()` method, +on the other hand, adds the loaded cookies to existing cookies held in the +`CookieJar` (old cookies are kept unless overwritten by newly loaded ones). + +`MozillaCookieJar` can load and save to the Mozilla/Netscape/lynx-compatible +`'cookies.txt'` format. This format loses some information (unusual and +nonstandard cookie attributes such as comment, and also information specific to +RFC 2965 cookies). The subclass `MSIECookieJar` can load (but not save) from +Microsoft Internet Explorer's cookie files on Windows. + + +Important note +-------------- + +Only use names you can import directly from the `mechanize` package, and that +don't start with a single underscore. Everything else is subject to change or +disappearance without notice. + + +Cooperating with Browsers +------------------------- + +**Firefox since version 3 persists cookies in an sqlite database, which is not +supported by MozillaCookieJar.** + +The subclass `MozillaCookieJar` differs from `CookieJar` only in storing +cookies using a different, Firefox 2/Mozilla/Netscape-compatible, file format +known as "cookies.txt". The lynx browser also uses this format. This file +format can't store RFC 2965 cookies, so they are downgraded to Netscape cookies +on saving. `LWPCookieJar` itself uses a libwww-perl specific format +(\`Set-Cookie3') -- see the example above. 
Python and your browser should be +able to share a cookies file (note that the file location here will differ on +non-unix OSes): + +**WARNING:** you may want to back up your browser's cookies file if you use +`MozillaCookieJar` to save cookies. I *think* it works, but there have been +bugs in the past! + +~~~~{.python} +import os, mechanize +cookies = mechanize.MozillaCookieJar() +cookies.load(os.path.join(os.environ["HOME"], "/.netscape/cookies.txt")) +# see also the save and revert methods +~~~~ + +Note that cookies saved while Mozilla is running will get clobbered by Mozilla +-- see `MozillaCookieJar.__doc__`. + +`MSIECookieJar` does the same for Microsoft Internet Explorer (MSIE) 5.x and +6.x on Windows, but does not allow saving cookies in this format. In future, +the Windows API calls might be used to load and save (though the index has to +be read directly, since there is no API for that, AFAIK; there's also an +unfinished `MSIEDBCookieJar`, which uses (reads and writes) the Windows MSIE +cookie database directly, rather than storing copies of cookies as +`MSIECookieJar` does). + +~~~~{.python} +import mechanize +cj = mechanize.MSIECookieJar(delayload=True) +cj.load_from_registry() # finds cookie index file from registry +~~~~ + +A true `delayload` argument speeds things up. + +On Windows 9x (win 95, win 98, win ME), you need to supply a username to the +`.load_from_registry()` method: + +~~~~{.python} +cj.load_from_registry(username="jbloggs") +~~~~ + +Konqueror/Safari and Opera use different file formats, which aren't yet +supported. + + +Saving cookies in a file +------------------------ + +If you have no need to co-operate with a browser, the most convenient way to +save cookies on disk between sessions in human-readable form is to use +`LWPCookieJar`. This class uses a libwww-perl specific format +(\`Set-Cookie3'). Unlike `MozilliaCookieJar`, this file format doesn't lose +information. 
+ + +Supplying a CookieJar +--------------------- + +You might want to do this to [use your browser's +cookies](#cooperating-with-browsers), to customize `CookieJar`'s behaviour by +passing constructor arguments, or to be able to get at the cookies it will hold +(for example, for saving cookies between sessions and for debugging). + +If you're using the higher-level `urllib2`-like interface (`urlopen()`, etc), +you'll have to let it know what `CookieJar` it should use: + +~~~~{.python} +import mechanize +cookies = mechanize.CookieJar() +# build_opener() adds standard handlers (such as HTTPHandler and +# HTTPCookieProcessor) by default. The cookie processor we supply +# will replace the default one. +opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies)) + +r = opener.open("http://example.com/") # GET +r = opener.open("http://example.com/", data) # POST +~~~~ + +The `urlopen()` function uses a global `OpenerDirector` instance to do its +work, so if you want to use `urlopen()` with your own `CookieJar`, install the +`OpenerDirector` you built with `build_opener()` using the +`mechanize.install_opener()` function, then proceed as usual: + +~~~~{.python} +mechanize.install_opener(opener) +r = mechanize.urlopen("http://example.com/") +~~~~ + +Of course, everyone using `urlopen` is using the same global `CookieJar` +instance! + +<a name="policy" /> + +You can set a policy object (must satisfy the interface defined by +`mechanize.CookiePolicy`), which determines which cookies are allowed to be set +and returned. Use the `policy` argument to the `CookieJar` constructor, or use +the `.set\_policy()` method. 
The default implementation has some useful +switches: + +~~~~{.python} +from mechanize import CookieJar, DefaultCookiePolicy as Policy +cookies = CookieJar() +# turn on RFC 2965 cookies, be more strict about domains when setting and +# returning Netscape cookies, and block some domains from setting cookies +# or having them returned (read the DefaultCookiePolicy docstring for the +# domain matching rules here) +policy = Policy(rfc2965=True, strict_ns_domain=Policy.DomainStrict, + blocked_domains=["ads.net", ".ads.net"]) +cookies.set_policy(policy) +~~~~ + + +Additional Handlers +------------------- + +The following handlers are provided in addition to those provided by `urllib2`: + +`HTTPRobotRulesProcessor` +: WWW Robots (also called wanderers or spiders) are programs that traverse many + pages in the World Wide Web by recursively retrieving linked pages. This + kind of program can place significant loads on web servers, so there is a + [standard](http://www.robotstxt.org/wc/norobots.html) for a `robots.txt` + file by which web site operators can request robots to keep out of their + site, or out of particular areas of it. This handler uses the standard + Python library's `robotparser` module. It raises + `mechanize.RobotExclusionError` (subclass of `mechanize.HTTPError`) if an + attempt is made to open a URL prohibited by `robots.txt`. + +`HTTPEquivProcessor` +: The `<META HTTP-EQUIV>` tag is a way of including data in HTML to be treated + as if it were part of the HTTP headers. mechanize can automatically read + these tags and add the `HTTP-EQUIV` headers to the response object's real + HTTP headers. The HTML is left unchanged. + +`HTTPRefreshProcessor` +: The `Refresh` HTTP header is a non-standard header which is widely used. It + requests that the user-agent follow a URL after a specified time delay. + mechanize can treat these headers (which may have been set in `<META + HTTP-EQUIV>` tags) as if they were 302 redirections. 
Exactly when and how + `Refresh` headers are handled is configurable using the constructor + arguments. + +`HTTPRefererProcessor` +: The `Referer` HTTP header lets the server know which URL you've just visited. + Some servers use this header as state information, and don't like it if + this is not present. It's a chore to add this header by hand every time + you make a request. This adds it automatically. **NOTE**: this only makes + sense if you use each handler for a single chain of HTTP requests (so, for + example, if you use a single HTTPRefererProcessor to fetch a series of URLs + extracted from a single page, **this will break**). + [mechanize.Browser](../mechanize/) does this properly. + +Example: + +~~~~{.python} +import mechanize +cookies = mechanize.CookieJar() + +opener = mechanize.build_opener(mechanize.HTTPRefererProcessor, + mechanize.HTTPEquivProcessor, + mechanize.HTTPRefreshProcessor, + ) +opener.open("http://www.rhubarb.com/") +~~~~ + + +Seekable responses +------------------ + +Response objects returned from (or raised as exceptions by) +`mechanize.SeekableResponseOpener`, `mechanize.UserAgent` (if +`.set_seekable_responses(True)` has been called) and `mechanize.Browser()` have +`.seek()`, `.get_data()` and `.set_data()` methods: + +~~~~{.python} +import mechanize +opener = mechanize.OpenerFactory(mechanize.SeekableResponseOpener).build_opener() +response = opener.open("http://example.com/") +# same return value as .read(), but without affecting seek position +total_nr_bytes = len(response.get_data()) +assert len(response.read()) == total_nr_bytes +assert len(response.read()) == 0 # we've already read the data +response.seek(0) +assert len(response.read()) == total_nr_bytes +response.set_data("blah\n") +assert response.get_data() == "blah\n" +... +~~~~ + +This caching behaviour can be avoided by using `mechanize.OpenerDirector`. It +can also be avoided with `mechanize.UserAgent`. 
Note that `HTTPEquivProcessor` +and `HTTPResponseDebugProcessor` require seekable responses and so are not +compatible with `mechanize.OpenerDirector` and `mechanize.UserAgent`. + +~~~~{.python} +import mechanize +ua = mechanize.UserAgent() +ua.set_seekable_responses(False) +ua.set_handle_equiv(False) +ua.set_debug_responses(False) +~~~~ + +Note that if you turn on features that use seekable responses (currently: +HTTP-EQUIV handling and response body debug printing), returned responses *may* +be seekable as a side-effect of these features. However, this is not +guaranteed (currently, in these cases, returned response objects are seekable, +but raised respose objects — `mechanize.HTTPError` instances — are not +seekable). This applies regardless of whether you use `mechanize.UserAgent` or +`mechanize.OpenerDirector`. If you explicitly request seekable responses by +calling `.set_seekable_responses(True)` on a `mechanize.UserAgent` instance, or +by using `mechanize.Browser` or `mechanize.SeekableResponseOpener`, which +always return seekable responses, then both returned and raised responses are +guaranteed to be seekable. + +Handlers should call `response = mechanize.seek_wrapped_response(response)` if +they require the `.seek()`, `.get_data()` or `.set_data()` methods. + + +Request object lifetime +----------------------- + +Note that handlers may create new `Request` instances (for example when +performing redirects) rather than adding headers to existing `Request` objects. + + +Adding headers +-------------- + +Adding headers is done like so: + +~~~~{.python} +import mechanize +req = mechanize.Request("http://foobar.com/") +req.add_header("Referer", "http://wwwsearch.sourceforge.net/mechanize/") +r = mechanize.urlopen(req) +~~~~ + +You can also use the `headers` argument to the `mechanize.Request` constructor. + +mechanize adds some headers to `Request` objects automatically -- see the next +section for details. 
+ + +Automatically-added headers +--------------------------- + +`OpenerDirector` automatically adds a `User-Agent` header to every `Request`. + +To change this and/or add similar headers, use your own `OpenerDirector`: + +~~~~{.python} +import mechanize +cookies = mechanize.CookieJar() +opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies)) +opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"), + ("From", "responsible.person@example.com")] +~~~~ + +Again, to use `urlopen()`, install your `OpenerDirector` globally: + +~~~~{.python} +mechanize.install_opener(opener) +r = mechanize.urlopen("http://example.com/") +~~~~ + +Also, a few standard headers (`Content-Length`, `Content-Type` and `Host`) are +added when the `Request` is passed to `urlopen()` (or `OpenerDirector.open()`). +You shouldn't need to change these headers, but since this is done by +`AbstractHTTPHandler`, you can change the way it works by passing a subclass of +that handler to `build_opener()` (or, as always, by constructing an opener +yourself and calling `.add_handler()`). + + + +Initiating unverifiable transactions +------------------------------------ + +This section is only of interest for correct handling of third-party HTTP +cookies. See [below](#note-about-cookie-standards) for an explanation of +'third-party'. + +First, some terminology. + +An *unverifiable request* (defined fully by ([RFC +2965](http://www.ietf.org/rfc/rfc2965.txt)) is one whose URL the user did not +have the option to approve. For example, a transaction is unverifiable if the +request is for an image in an HTML document, and the user had no option to +approve the fetching of the image from a particular URL. + +The *request-host of the origin transaction* (defined fully by RFC 2965) is the +host name or IP address of the original request that was initiated by the user. 
+For example, if the request is for an image in an HTML document, this is the +request-host of the request for the page containing the image. + +**mechanize knows that redirected transactions are unverifiable, and will +handle that on its own (ie. you don't need to think about the origin +request-host or verifiability yourself).** + +If you want to initiate an unverifiable transaction yourself (which you should +if, for example, you're downloading the images from a page, and 'the user' +hasn't explicitly OKed those URLs): + +~~~~{.python} +request = Request(origin_req_host="www.example.com", unverifiable=True) +~~~~ + + +RFC 2965 support +---------------- + +Support for the RFC 2965 protocol is switched off by default, because few +browsers implement it, so the RFC 2965 protocol is essentially never seen on +the internet. To switch it on, see [here](#policy). + + +Parsing HTTP dates +------------------ + +A function named `str2time` is provided by the package, which may be useful for +parsing dates in HTTP headers. `str2time` is intended to be liberal, since +HTTP date/time formats are poorly standardised in practice. There is no need +to use this function in normal operations: `CookieJar` instances keep track of +cookie lifetimes automatically. This function will stay around in some form, +though the supported date/time formats may change. + + +Dealing with bad HTML +--------------------- + +XXX Intro + +XXX Test me + + +Note about cookie standards +--------------------------- + +There are several standards relevant to HTTP cookies. + +The Netscape protocol is the only standard supported by most web browsers +(including Internet Explorer and Firefox). This is a *de facto* standard +defined by the behaviour of popular browsers, and neither the +[cookie\_spec.html](http://curl.haxx.se/rfc/cookie_spec.html) document that was +published by Netscape, nor the RFCs that were published later, describe the +Netscape protocol accurately or completely. 
Netscape protocol cookies are also +known as V0 cookies, to distinguish them from RFC 2109 or RFC 2965 cookies, +which have a version cookie-attribute with a value of 1. + +[RFC 2109](http://www.ietf.org/rfc/rfc2109.txt) was introduced to fix some +problems identified with the Netscape protocol, while still keeping the same +HTTP headers (`Cookie` and `Set-Cookie`). The most prominent of these problems +is the 'third-party' cookie issue, which was an accidental feature of the +Netscape protocol. Some features defined by RFC2109 (such as the port and +max-age cookie attributes) are now part of the de facto Netscape protocol, but +the RFC was never implemented fully by browsers, because of differences in +behaviour between the Netscape and Internet Explorer browsers of the time. + +[RFC 2965](http://www.ietf.org/rfc/rfc2965.txt) attempted to fix the +compatibility problem by introducing two new headers, `Set-Cookie2` and +`Cookie2`. Unlike the `Cookie` header, `Cookie2` does *not* carry cookies to +the server -- rather, it simply advertises to the server that RFC 2965 is +understood. `Set-Cookie2` *does* carry cookies, from server to client: the new +header means that both IE and Netscape ignore these cookies. This preserves +backwards compatibility, but popular browsers did not implement the RFC, so it +was never widely adopted. One confusing point to note about RFC 2965 is that +it uses the same value (1) of the Version attribute in HTTP headers as does RFC +2109. See also [RFC 2964](http://www.ietf.org/rfc/rfc2964.txt), which +discusses use of the protocol. + +Because Netscape cookies are so poorly specified, the general philosophy of the +module's Netscape protocol implementation is to start with RFC 2965 and open +holes where required for Netscape protocol-compatibility. RFC 2965 cookies are +*always* treated as RFC 2965 requires, of course. 
+ +There is more information about the history of HTTP cookies in [this paper by +David Kristol](http://arxiv.org/abs/cs.SE/0105018). + +Recently (2011), [an IETF effort has +started](http://tools.ietf.org/html/draft-ietf-httpstate-cookie) to specify the +syntax and semantics of the `Cookie` and `Set-Cookie` headers as they are +actually used on the internet. + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/documentation.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/documentation.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e0793d413b0fdb1191683c293d61f663618a91d --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/documentation.txt @@ -0,0 +1,132 @@ +% mechanize -- Documentation + +Full API documentation is in the docstrings and the documentation of +[`urllib2`](http://docs.python.org/release/2.6/library/urllib2.html). The +documentation in these web pages is in need of reorganisation at the moment, +after the merge of ClientCookie and ClientForm into mechanize. + + +Tests and examples +------------------ + +### Examples ### + +The [front page](./) has some introductory examples. + +The `examples` directory in the source packages contains a couple of silly, +but working, scripts to demonstrate basic use of the module. + +See also the [forms examples](./forms.html) (these examples use the forms API +independently of `mechanize.Browser`). + + +### Tests ### + +To run the tests: + + python test.py + +There are some tests that try to fetch URLs from the internet. To include +those in the test run: + + python test.py discover --tag internet + + +The `urllib2` interface +----------------------- + +mechanize exports the complete interface of `urllib2`. See the [`urllib2` +documentation](http://docs.python.org/release/2.6/library/urllib2.html). 
For +example: + +~~~~{.python} +import mechanize +response = mechanize.urlopen("http://www.example.com/") +print response.read() +~~~~ + + +Compatibility +------------- + +These notes explain the relationship between mechanize, ClientCookie, +ClientForm, `cookielib` and `urllib2`, and which to use when. If you're just +using mechanize, and not any of those other libraries, you can ignore this +section. + + #. mechanize works with Python 2.4, Python 2.5, Python 2.6, and Python 2.7. + + #. When using mechanize, anything you would normally import from `urllib2` + should be imported from `mechanize` instead. + + #. Use of mechanize classes with `urllib2` (and vice-versa) is no longer + supported. However, existing classes implementing the `urllib2 Handler` + interface are likely to work unchanged with mechanize. + + #. mechanize now only imports `urllib2.URLError` and `urllib2.HTTPError` from + `urllib2`. The rest is forked. I intend to merge fixes from Python trunk + frequently. + + #. ClientForm is no longer maintained as a separate package. The code is + now part of mechanize, and its interface is now exported through module + mechanize (since mechanize 0.2.0). Old code can simply be changed to + `import mechanize as ClientForm` and should continue to work. + + #. ClientCookie is no longer maintained as a separate package. The code is + now part of mechanize, and its interface is now exported through module + mechanize (since mechanize 0.1.0). Old code can simply be changed to + `import mechanize as ClientCookie` and should continue to work. + + #. The cookie handling parts of mechanize are in Python 2.4 standard library + as module `cookielib` and extensions to module `urllib2`. mechanize does + not currently use `cookielib`, due to the presence of thread + synchronisation code in `cookielib` that is not present in the mechanize + fork of `cookielib`. + +API differences between mechanize and `urllib2`: + + #. mechanize provides additional features. + + #. 
`mechanize.urlopen` differs in its behaviour: it handles cookies, whereas + `urllib2.urlopen` does not. To make a `urlopen` function with the + `urllib2` behaviour: + +~~~~{.python} +import mechanize +handler_classes = [mechanize.ProxyHandler, + mechanize.UnknownHandler, + mechanize.HTTPHandler, + mechanize.HTTPDefaultErrorHandler, + mechanize.HTTPRedirectHandler, + mechanize.FTPHandler, + mechanize.FileHandler, + mechanize.HTTPErrorProcessor] +opener = mechanize.OpenerDirector() +for handler_class in handler_classes: + opener.add_handler(handler_class()) +urlopen = opener.open +~~~~ + + #. Since Python 2.6, `urllib2` uses a `.timeout` attribute on `Request` + objects internally. However, `urllib2.Request` has no timeout constructor + argument, and `urllib2.urlopen()` ignores this parameter. + `mechanize.Request` has a `timeout` constructor argument which is used to + set the attribute of the same name, and `mechanize.urlopen()` does not + ignore the timeout attribute. + + +UserAgent vs UserAgentBase +-------------------------- + +`mechanize.UserAgent` is a trivial subclass of `mechanize.UserAgentBase`, +adding just one method, `.set_seekable_responses()` (see the [documentation on +seekable responses](./doc.html#seekable-responses)). + +The reason for the extra class is that `mechanize.Browser` depends on seekable +response objects (because response objects are used to implement the browser +history). 
+ + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/download.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/download.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3c3c72380eed5948c208d659a12db4706e0bffd --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/download.txt @@ -0,0 +1,55 @@ +% mechanize -- Download + +There is more than one way to obtain mechanize: + +_Note re Windows and Mac support: currently the tests are only routinely run on +[Ubuntu](http://www.ubuntu.com/) 9.10 ("karmic"). However, as far as I know, +mechanize works fine on Windows and Mac platforms._ + + +easy_install +------------ + + #. Install [EasyInstall](http://peak.telecommunity.com/DevCenter/EasyInstall) + + #. `easy_install mechanize` + +Easy install will automatically download the latest source code release and +install it. + + +Source code release +------------------- + + #. Download the source from one of the links below + + #. Unpack the source distribution and change directory to the resulting +top-level directory. + + #. `python setup.py install` + + +This is a stable release. + + * [`mechanize-0.2.5.tar.gz`](http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.tar.gz) + + * [`mechanize-0.2.5.zip`](http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.zip) + + * [Older versions.](./src/) Note: these are hosted on sourceforge, which at the time of writing (2011-03-31) is returning invalid HTTP responses -- you can also find old releases on [PyPI](http://pypi.python.org/)) + +All the documentation (these web pages, docstrings, and [the +changelog](./ChangeLog.txt)) is included in the distribution. + + +git repository +-------------- + +The [git](http://git-scm.com/) repository is +[here](http://github.com/jjlee/mechanize). To check it out: + + #. 
<p>`git clone git://github.com/jjlee/mechanize.git`</p> + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/faq.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/faq.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdfab475c35ca411d5cc60ec23778bed99164793 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/faq.txt @@ -0,0 +1,368 @@ +% mechanize -- FAQ + +<div class="expanded"> + + + * <span class="q">Which version of Python do I need?</span> + + Python 2.4, 2.5, 2.6, or 2.7. Python 3 is not yet supported. + + * <span class="q">Does mechanize depend on BeautifulSoup?</span> + + No. mechanize offers a few classes that make use of BeautifulSoup, but +these classes are not required to use mechanize. mechanize bundles +BeautifulSoup version 2, so that module is no longer required. A future +version of mechanize will support BeautifulSoup version 3, at which point +mechanize will likely no longer bundle the module. + + * <span class="q">Does mechanize depend on ClientForm?</span> + + No, ClientForm is now part of mechanize. + + * <span class="q">Which license?</span> + + mechanize is dual-licensed: you may pick either the [BSD +license](http://www.opensource.org/licenses/bsd-license.php), or the [ZPL +2.1](http://www.zope.org/Resources/ZPL) (both are included in the +distribution). + + +Usage +----- + + * <span class="q">I'm not getting the HTML page I expected to see.</span> + + [Debugging tips](hints.html) + + * <span class="q">`Browser` doesn't have all of the forms/links I see in the +HTML. Why not?</span> + + Perhaps the default parser can't cope with invalid HTML. 
Try using the +included BeautifulSoup 2 parser instead: + +~~~~{.python} +import mechanize + +browser = mechanize.Browser(factory=mechanize.RobustFactory()) +browser.open("http://example.com/") +print browser.forms +~~~~ + + Alternatively, you can process the HTML (and headers) arbitrarily: + +~~~~{.python} +browser = mechanize.Browser() +browser.open("http://example.com/") +html = browser.response().get_data().replace("<br/>", "<br />") +response = mechanize.make_response( + html, [("Content-Type", "text/html")], + "http://example.com/", 200, "OK") +browser.set_response(response) +~~~~ + + * <span class="q">Is JavaScript supported?</span> + + No, sorry. See [FAQs](#change-value) [below](#script). + + * <span class="q">My HTTP response data is truncated.</span> + + `mechanize.Browser's` response objects support the `.seek()` method, and +can still be used after `.close()` has been called. Response data is not +fetched until it is needed, so navigation away from a URL before fetching all +of the response will truncate it. Call `response.get_data()` before navigation +if you don't want that to happen. + + * <a name="xhtml" /><span class="q">I'm *sure* this page is HTML, why does `mechanize.Browser` +think otherwise?</span> + +~~~~{.python} +b = mechanize.Browser( + # mechanize's XHTML support needs work, so is currently switched off. If + # we want to get our work done, we have to turn it on by supplying a + # mechanize.Factory (with XHTML support turned on): + factory=mechanize.DefaultFactory(i_want_broken_xhtml_support=True) + ) +~~~~ + + * <span class="q">Why don't timeouts work for me?</span> + + Timeouts are ignored with versions of Python earlier than 2.6. +Timeouts do not apply to DNS lookups. + + * <span class="q">Is there any example code?</span> + + Look in the `examples/` directory. Note that the examples on the [forms + page](./forms.html) are executable as-is. Contributions of example code + would be very welcome! 
+ + +Cookies +------- + + * <span class="q">Doesn't the standard Python library module, `Cookie`, do + this?</span> + + No: module `Cookie` does the server end of the job. It doesn't know when + to accept cookies from a server or when to send them back. Part of + mechanize has been contributed back to the standard library as module + `cookielib` (there are a few differences, notably that `cookielib` contains + thread synchronization code; mechanize does not use `cookielib`). + + * <span class="q">Which HTTP cookie protocols does mechanize support?</span> + + Netscape and [RFC 2965](http://www.ietf.org/rfc/rfc2965.txt). RFC 2965 + handling is switched off by default. + + * <span class="q">What about RFC 2109?</span> + + RFC 2109 cookies are currently parsed as Netscape cookies, and treated + by default as RFC 2965 cookies thereafter if RFC 2965 handling is enabled, + or as Netscape cookies otherwise. + + + * <span class="q">Why don't I have any cookies?</span> + + See [here](hints.html#cookies). + + * <span class="q">My response claims to be empty, but I know it's not!</span> + + Did you call `response.read()` (e.g., in a debug statement), then forget + that all the data has already been read? In that case, you may want to use + `mechanize.response_seek_wrapper`. `mechanize.Browser` always returns + [seekable responses](doc.html#seekable-responses), so it's not necessary to + use this explicitly in that case. + + * <span class="q">What's the difference between the `.load()` and `.revert()` + methods of `CookieJar`?</span> + + `.load()` *appends* cookies from a file. `.revert()` discards all + existing cookies held by the `CookieJar` first (but it won't lose any + existing cookies if the loading fails). + + * <span class="q">Is it threadsafe?</span> + + No. As far as I know, you can use mechanize in threaded code, but it + provides no synchronisation: you have to provide that yourself. 
+ + * <span class="q">How do I do <X\></span> + + Refer to the API documentation in docstrings. + + +Forms +----- + + * <span class="q">Doesn't the standard Python library module, `cgi`, do this?</span> + + No: the `cgi` module does the server end of the job. It doesn't know + how to parse or fill in a form or how to send it back to the server. + + * <span class="q">How do I figure out what control names and values to use?</span> + + `print form` is usually all you need. In your code, things like the + `HTMLForm.items` attribute of `HTMLForm` instances can be useful to inspect + forms at runtime. Note that it's possible to use item labels instead of + item names, which can be useful — use the `by_label` arguments to the + various methods, and the `.get_value_by_label()` / `.set_value_by_label()` + methods on `ListControl`. + + * <span class="q">What do those `'*'` characters mean in the string + representations of list controls?</span> + + A `*` next to an item means that item is selected. + + * <span class="q">What do those parentheses (round brackets) mean in the string + representations of list controls?</span> + + Parentheses `(foo)` around an item mean that item is disabled. + + * <span class="q">Why doesn't <some control\> turn up in the data returned by + `.click*()` when that control has non-`None` value?</span> + + Either the control is disabled, or it is not successful for some other + reason. 'Successful' (see [HTML 4 + specification](http://www.w3.org/TR/REC-html40/interact/forms.html#h-17.13.2)) + means that the control will cause data to get sent to the server. + + * <span class="q">Why does mechanize not follow the HTML 4.0 / RFC 1866 + standards for `RADIO` and multiple-selection `SELECT` controls?</span> + + Because by default, it follows browser behaviour when setting the + initially-selected items in list controls that have no items explicitly + selected in the HTML. 
Use the `select_default` argument to `ParseResponse` + if you want to follow the RFC 1866 rules instead. Note that browser + behaviour violates the HTML 4.01 specification in the case of `RADIO` + controls. + + * <span class="q">Why does `.click()`ing on a button not work for me?</span> + + * Clicking on a `RESET` button doesn't do anything, by design - this is a + library for web automation, not an interactive browser. Even in an + interactive browser, clicking on `RESET` sends nothing to the server, + so there is little point in having `.click()` do anything special here. + + * Clicking on a `BUTTON TYPE=BUTTON` doesn't do anything either, also by + design. This time, the reason is that that `BUTTON` is only in the + HTML standard so that one can attach JavaScript callbacks to its + events. Their execution may result in information getting sent back to + the server. mechanize, however, knows nothing about these callbacks, + so it can't do anything useful with a click on a `BUTTON` whose type is + `BUTTON`. + + * Generally, JavaScript may be messing things up in all kinds of ways. + See the answer to the next question. + + * <a name="change-value" /><span class="q">How do I change `INPUT +TYPE=HIDDEN` field values (for example, to emulate the effect of JavaScript +code)?</span> + + As with any control, set the control's `readonly` attribute false. + +~~~~{.python} +form.find_control("foo").readonly = False # allow changing .value of control foo +form.set_all_readonly(False) # allow changing the .value of all controls +~~~~ + + * <span class="q">I'm having trouble debugging my code.</span> + + See [here](hints.html) for few relevant tips. + + * <span class="q">I have a control containing a list of integers. 
How do I + select the one whose value is nearest to the one I want?</span> + +~~~~{.python} +import bisect +def closest_int_value(form, ctrl_name, value): + values = map(int, [item.name for item in form.find_control(ctrl_name).items]) + return str(values[bisect.bisect(values, value) - 1]) + +form["distance"] = [closest_int_value(form, "distance", 23)] +~~~~ + + +General +------- + + * <a name="sniffing" /><span class="q">I want to see what my web browser is + doing, but standard network sniffers like + [wireshark](http://www.wireshark.org/) or netcat (nc) don't work for HTTPS. + How do I sniff HTTPS traffic?</span> + + Three good options: + + * Mozilla plugin: [LiveHTTPHeaders](http://livehttpheaders.mozdev.org/). + + * [ieHTTPHeaders](http://www.blunck.info/iehttpheaders.html) does + the same for MSIE. + + * Use [`lynx`](http://lynx.browser.org/) `-trace`, and filter out + the junk with a script. + + * <a name="script" /><span class="q">JavaScript is messing up my + web-scraping. What do I do?</span> + + JavaScript is used in web pages for many purposes -- for example: creating + content that was not present in the page at load time, submitting or + filling in parts of forms in response to user actions, setting cookies, + etc. mechanize does not provide any support for JavaScript. + + If you come across this in a page you want to automate, you have four + options. Here they are, roughly in order of simplicity. + + * Figure out what the JavaScript is doing and emulate it in your Python + code: for example, by manually adding cookies to your `CookieJar` + instance, calling methods on `HTMLForm`s, calling `urlopen`, etc. See + [above](#change-value) re forms. + + * Use Java's [HtmlUnit](http://htmlunit.sourceforge.net/) or + [HttpUnit](http://httpunit.sourceforge.net) from Jython, since they + know some JavaScript. + + * Instead of using mechanize, automate a browser instead. 
For example + use MS Internet Explorer via its COM automation interfaces, using the + [Python for Windows + extensions](http://starship.python.net/crew/mhammond/), aka pywin32, + aka win32all (e.g. [simple + function](http://vsbabu.org/mt/archives/2003/06/13/ie_automation.html), + [pamie](http://pamie.sourceforge.net/); [pywin32 chapter from the + O'Reilly + book](http://www.oreilly.com/catalog/pythonwin32/chapter/ch12.html)) or + [ctypes](http://python.net/crew/theller/ctypes/) + ([example](http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/305273)). + [This](http://www.brunningonline.net/simon/blog/archives/winGuiAuto.py.html) + kind of thing may also come in useful on Windows for cases where the + automation API is lacking. For Firefox, there is + [PyXPCOM](https://developer.mozilla.org/en/PyXPCOM). + + * Get ambitious and automatically delegate the work to an appropriate + interpreter (Mozilla's JavaScript interpreter, for instance). This is + what HtmlUnit and httpunit do. I did a spike along these lines some + years ago, but I think it would (still) be quite a lot of work to do + well. + + * <span class="q">Misc links</span> + + * <a name="parsing" />The following libraries can be useful for dealing + with bad HTML: [lxml.html](http://codespeak.net/lxml/lxmlhtml.html), + [html5lib](http://code.google.com/p/html5lib/), [BeautifulSoup + 3](http://www.crummy.com/software/BeautifulSoup/CHANGELOG.html), + [mxTidy](http://www.egenix.com/files/python/mxTidy.html) and + [mu-Tidylib](http://utidylib.berlios.de/). + + * [Selenium](http://www.openqa.org/selenium/): In-browser web functional + testing. If you need to test websites against real browsers, this is a + standard way to do it. + + * O'Reilly book: [Spidering + Hacks](http://oreilly.com/catalog/9780596005771). Very Perl-oriented. 
+ + * Standard extensions for web development with Firefox, which are also + handy if you're scraping the web: [Web + Developer](http://chrispederick.com/work/webdeveloper/) (amongst other + things, this can display HTML form information), + [Firebug](http://getfirebug.com/). + + * Similar functionality for IE6 and IE7: [Internet Explorer Developer + Toolbar](http://www.google.co.uk/search?q=internet+explorer+developer+toolbar&btnI=I'm+Feeling+Lucky) + (IE8 comes with something equivalent built-in, as does Google Chrome). + + * [Open source functional testing + tools](http://www.opensourcetesting.org/functional.php). + + * [A HOWTO on web + scraping](http://www.rexx.com/~dkuhlman/quixote_htmlscraping.html) from + Dave Kuhlman. + + * <span class="q">Will any of this code make its way into the Python standard + library?</span> + + The request / response processing extensions to `urllib2` from mechanize + have been merged into `urllib2` for Python 2.4. The cookie processing has + been added, as module `cookielib`. There are other features that would be + appropriate additions to `urllib2`, but since Python 2 is heading into + bugfix-only mode, and I'm not using Python 3, they're unlikely to be added. 
+ + * <span class="q">Where can I find out about the relevant standards?</span> + + * [HTML 4.01 Specification](http://www.w3.org/TR/html401/) + + * [Draft HTML 5 Specification](http://dev.w3.org/html5/spec/) + + * [RFC 1866](http://www.ietf.org/rfc/rfc1866.txt) - the HTML 2.0 + standard (you don't want to read this) + + * [RFC 1867](http://www.ietf.org/rfc/rfc1867.txt) - Form-based file + upload + + * [RFC 2616](http://www.ietf.org/rfc/rfc2616.txt) - HTTP 1.1 + Specification + + * [RFC 3986](http://www.ietf.org/rfc/rfc3986.txt) - URIs + + * [RFC 3987](http://www.ietf.org/rfc/rfc3987.txt) - IRIs + +</div> + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/forms.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/forms.txt new file mode 100644 index 0000000000000000000000000000000000000000..d50f868e9eaa204c1ea0712fb1677ae44620b06c --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/forms.txt @@ -0,0 +1,310 @@ +% mechanize -- Forms + +<span class="docwarning">This documentation is in need of reorganisation!</span> + +This page is the old ClientForm documentation. ClientForm is now part of +mechanize, but the documentation hasn't been fully updated to reflect that: +what's here is correct, but not well-integrated with the rest of the +documentation. This page deals with HTML form handling: parsing HTML forms, +filling them in and returning the completed forms to the server. See the +[front page](./) for how to obtain form objects from a `mechanize.Browser`. 
+ +Simple working example (`examples/forms/simple.py` in the source distribution): + +~~~~{.python} +import sys + +from mechanize import ParseResponse, urlopen, urljoin + +if len(sys.argv) == 1: + uri = "http://wwwsearch.sourceforge.net/" +else: + uri = sys.argv[1] + +response = urlopen(urljoin(uri, "mechanize/example.html")) +forms = ParseResponse(response, backwards_compat=False) +form = forms[0] +print form +form["comments"] = "Thanks, Gisle" + +# form.click() returns a mechanize.Request object +# (see HTMLForm.click.__doc__ if you want to use only the forms support, and +# not the rest of mechanize) +print urlopen(form.click()).read() + +~~~~ + +A more complicated working example (from `examples/forms/example.py` in the +source distribution): + +~~~~{.python} +import sys + +import mechanize + +if len(sys.argv) == 1: + uri = "http://wwwsearch.sourceforge.net/" +else: + uri = sys.argv[1] + +request = mechanize.Request(mechanize.urljoin(uri, "mechanize/example.html")) +response = mechanize.urlopen(request) +forms = mechanize.ParseResponse(response, backwards_compat=False) +response.close() +## f = open("example.html") +## forms = mechanize.ParseFile(f, "http://example.com/example.html", +## backwards_compat=False) +## f.close() +form = forms[0] +print form # very useful! + +# A 'control' is a graphical HTML form widget: a text entry box, a +# dropdown 'select' list, a checkbox, etc. + +# Indexing allows setting and retrieval of control values +original_text = form["comments"] # a string, NOT a Control instance +form["comments"] = "Blah." + +# Controls that represent lists (checkbox, select and radio lists) are +# ListControl instances. Their values are sequences of list item names. 
+# They come in two flavours: single- and multiple-selection: +form["favorite_cheese"] = ["brie"] # single +form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi +# equivalent, but more flexible: +form.set_value(["parmesan", "leicester", "cheddar"], name="cheeses") + +# Add files to FILE controls with .add_file(). Only call this multiple +# times if the server is expecting multiple files. +# add a file, default value for MIME type, no filename sent to server +form.add_file(open("data.dat")) +# add a second file, explicitly giving MIME type, and telling the server +# what the filename is +form.add_file(open("data.txt"), "text/plain", "data.txt") + +# All Controls may be disabled (equivalent of greyed-out in browser)... +control = form.find_control("comments") +print control.disabled +# ...or readonly +print control.readonly +# readonly and disabled attributes can be assigned to +control.disabled = False +# convenience method, used here to make all controls writable (unless +# they're disabled): +form.set_all_readonly(False) + +# A couple of notes about list controls and HTML: + +# 1. List controls correspond to either a single SELECT element, or +# multiple INPUT elements. Items correspond to either OPTION or INPUT +# elements. For example, this is a SELECT control, named "control1": + +# <select name="control1"> +# <option>foo</option> +# <option value="1">bar</option> +# </select> + +# and this is a CHECKBOX control, named "control2": + +# <input type="checkbox" name="control2" value="foo" id="cbe1"> +# <input type="checkbox" name="control2" value="bar" id="cbe2"> + +# You know the latter is a single control because all the name attributes +# are the same. + +# 2. Item names are the strings that go to make up the value that should +# be returned to the server. These strings come from various different +# pieces of text in the HTML. 
The HTML standard and the mechanize +# docstrings explain in detail, but playing around with an HTML file, +# ParseFile() and 'print form' is very useful to understand this! + +# You can get the Control instances from inside the form... +control = form.find_control("cheeses", type="select") +print control.name, control.value, control.type +control.value = ["mascarpone", "curd"] +# ...and the Item instances from inside the Control +item = control.get("curd") +print item.name, item.selected, item.id, item.attrs +item.selected = False + +# Controls may be referred to by label: +# find control with label that has a *substring* "Cheeses" +# (e.g., a label "Please select a cheese" would match). +control = form.find_control(label="select a cheese") + +# You can explicitly say that you're referring to a ListControl: +# set value of "cheeses" ListControl +form.set_value(["gouda"], name="cheeses", kind="list") +# equivalent: +form.find_control(name="cheeses", kind="list").value = ["gouda"] +# the first example is also almost equivalent to the following (but +# insists that the control be a ListControl -- so it will skip any +# non-list controls that come before the control we want) +form["cheeses"] = ["gouda"] +# The kind argument can also take values "multilist", "singlelist", "text", +# "clickable" and "file": +# find first control that will accept text, and scribble in it +form.set_value("rhubarb rhubarb", kind="text", nr=0) +# find, and set the value of, the first single-selection list control +form.set_value(["spam"], kind="singlelist", nr=0) + +# You can find controls with a general predicate function: +def control_has_caerphilly(control): + for item in control.items: + if item.name == "caerphilly": return True +form.find_control(kind="list", predicate=control_has_caerphilly) + +# HTMLForm.controls is a list of all controls in the form +for control in form.controls: + if control.value == "inquisition": sys.exit() + +# Control.items is a list of all Item instances in 
the control +for item in form.find_control("cheeses").items: + print item.name + +# To remove items from a list control, remove it from .items: +cheeses = form.find_control("cheeses") +curd = cheeses.get("curd") +del cheeses.items[cheeses.items.index(curd)] +# To add items to a list container, instantiate an Item with its control +# and attributes: +# Note that you are responsible for getting the attributes correct here, +# and these are not quite identical to the original HTML, due to +# defaulting rules and a few special attributes (e.g. Items that represent +# OPTIONs have a special "contents" key in their .attrs dict). In future +# there will be an explicitly supported way of using the parsing logic to +# add items and controls from HTML strings without knowing these details. +mechanize.Item(cheeses, {"contents": "mascarpone", + "value": "mascarpone"}) + +# You can specify list items by label using set/get_value_by_label() and +# the label argument of the .get() method. Sometimes labels are easier to +# maintain than names, sometimes the other way around. +form.set_value_by_label(["Mozzarella", "Caerphilly"], "cheeses") + +# Which items are present, selected, and successful? +# is the "parmesan" item of the "cheeses" control successful (selected +# and not disabled)? +print "parmesan" in form["cheeses"] +# is the "parmesan" item of the "cheeses" control selected? +print "parmesan" in [ + item.name for item in form.find_control("cheeses").items if item.selected] +# does cheeses control have a "caerphilly" item? 
+print "caerphilly" in [item.name for item in form.find_control("cheeses").items] + +# Sometimes one wants to set or clear individual items in a list, rather +# than setting the whole .value: +# select the item named "gorgonzola" in the first control named "cheeses" +form.find_control("cheeses").get("gorgonzola").selected = True +# You can be more specific: +# deselect "edam" in third CHECKBOX control +form.find_control(type="checkbox", nr=2).get("edam").selected = False +# deselect item labelled "Mozzarella" in control with id "chz" +form.find_control(id="chz").get(label="Mozzarella").selected = False + +# Often, a single checkbox (a CHECKBOX control with a single item) is +# present. In that case, the name of the single item isn't of much +# interest, so it's a good idea to check and uncheck the box without +# using the item name: +form.find_control("smelly").items[0].selected = True # check +form.find_control("smelly").items[0].selected = False # uncheck + +# Items may be disabled (selecting or de-selecting a disabled item is +# not allowed): +control = form.find_control("cheeses") +print control.get("emmenthal").disabled +control.get("emmenthal").disabled = True +# enable all items in control +control.set_all_items_disabled(False) + +request2 = form.click() # mechanize.Request object +try: + response2 = mechanize.urlopen(request2) +except mechanize.HTTPError, response2: + pass + +print response2.geturl() +# headers +for name, value in response2.info().items(): + if name != "date": + print "%s: %s" % (name.title(), value) +print response2.read() # body +response2.close() + +~~~~ + +All of the standard control types are supported: `TEXT`, `PASSWORD`, `HIDDEN`, +`TEXTAREA`, `ISINDEX`, `RESET`, `BUTTON` (`INPUT TYPE=BUTTON` and the various +`BUTTON` types), `SUBMIT`, `IMAGE`, `RADIO`, `CHECKBOX`, `SELECT`/`OPTION` and +`FILE` (for file upload). Both standard form encodings +(`application/x-www-form-urlencoded` and `multipart/form-data`) are supported. 
+ +The module is designed for testing and automation of web interfaces, not for +implementing interactive user agents. + +***Security note*: Remember that any passwords you store in `HTMLForm` +instances will be saved to disk in the clear if, for example, you +[pickle](http://docs.python.org/library/pickle.html) them.** + + +Parsers +------- + +There are two parsers. + +TODO: more! + +See also the FAQ entries on [XHTML](faq.html#xhtml) and [parsing bad +HTML](./faq.html#parsing). + + +Backwards-compatibility mode +---------------------------- + +mechanize (and ClientForm 0.2) includes three minor backwards-incompatible +interface changes from ClientForm version 0.1. + +To make upgrading from ClientForm 0.1 easier, and to allow me to stop +supporting version ClientForm 0.1 sooner, there is support for operating in a +backwards-compatible mode, under which code written for ClientForm 0.1 should +work without modification. This is done on a per-`HTMLForm` basis via the +`.backwards_compat` attribute, but for convenience the `ParseResponse()` and +`ParseFile()` factory functions accept `backwards_compat` arguments. These +backwards-compatibility features will be removed soon. The default is to +operate in backwards-compatible mode. To run with backwards compatible mode +turned ***OFF*** (**strongly recommended**): + +~~~~{.python} +from mechanize import ParseResponse, urlopen +forms = ParseResponse(urlopen("http://example.com/"), backwards_compat=False) +# ... +~~~~ + +The backwards-incompatible changes are: + + * Ambiguous specification of controls or items now results in AmbiguityError. + If you want the old behaviour, explicitly pass `nr=0` to indicate you want + the first matching control or item. + + * Item label matching is now done by substring, not by strict string-equality + (but note leading and trailing space is always stripped). (Control label + matching is always done by substring.) + + * Handling of disabled list items has changed. 
First, note that handling of + disabled list items in ClientForm 0.1 (and in ClientForm 0.2's + backwards-compatibility mode!) is buggy: disabled items are successful + (ie. disabled item names are sent back to the server). As a result, there + was no distinction to be made between successful items and selected items. + In ClientForm 0.2, the bug is fixed, so this is no longer the case, and it + is important to note that list controls' `.value` attribute contains only + the *successful* item names; items that are *selected* but not successful + (because disabled) are not included in `.value`. Second, disabled list + items may no longer be deselected: AttributeError is raised in ClientForm + 0.2, whereas deselection was allowed in ClientForm 0.1. The bug in + ClientForm 0.1 and in ClientForm 0.2's backwards-compatibility mode will + not be fixed, to preserve compatibility and to encourage people to upgrade + to the new ClientForm 0.2 `backwards_compat=False` behaviour. + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/hints.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/hints.txt new file mode 100644 index 0000000000000000000000000000000000000000..35e1db0bc0bb245f497b98060feea276be95ad16 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/hints.txt @@ -0,0 +1,154 @@ +% mechanize -- Hints + +Hints for debugging programs that use mechanize. + +Cookies +------- + +A common mistake is to use `mechanize.urlopen()`, *and* the +`.extract_cookies()` and `.add_cookie_header()` methods on a cookie object +themselves. If you use `mechanize.urlopen()` (or `OpenerDirector.open()`), the +module handles extraction and adding of cookies by itself, so you should not +call `.extract_cookies()` or `.add_cookie_header()`. + +Are you sure the server is sending you any cookies in the first place? 
Maybe +the server is keeping track of state in some other way (`HIDDEN` HTML form +entries (possibly in a separate page referenced by a frame), URL-encoded +session keys, IP address, HTTP `Referer` headers)? Perhaps some embedded +script in the HTML is setting cookies (see below)? Turn on +[logging](#logging). + +When you `.save()` to or `.load()`/`.revert()` from a file, single-session +cookies will expire unless you explicitly request otherwise with the +`ignore_discard` argument. This may be your problem if you find cookies are +going away after saving and loading. + +~~~~{.python} +import mechanize +cj = mechanize.LWPCookieJar() +opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) +mechanize.install_opener(opener) +r = mechanize.urlopen("http://foobar.com/") +cj.save("/some/file", ignore_discard=True, ignore_expires=True) +~~~~ + +JavaScript code can set cookies; mechanize does not support this. See [the +FAQ](faq.html#script). + + +General +------- + +Enable [logging](#logging). + +Sometimes, a server wants particular HTTP headers set to the values it expects. +For example, the `User-Agent` header may need to be [set](./doc.html#headers) +to a value like that of a popular browser. + +Check that the browser is able to do manually what you're trying to achieve +programmatically. Make sure that what you do manually is *exactly* the same as +what you're trying to do from Python -- you may simply be hitting a server bug +that only gets revealed if you view pages in a particular order, for example. + +Try comparing the headers and data that your program sends with those that a +browser sends. Often this will give you the clue you need. There are [browser +addons](faq.html#sniffing) available that allow you to see what the browser +sends and receives even if HTTPS is in use. 
+ +If nothing is obviously wrong with the requests your program is sending and +you're out of ideas, you can reliably locate the problem by copying the headers +that a browser sends, and then changing headers until your program stops +working again. Temporarily switch to explicitly sending individual HTTP +headers (by calling `.add_header()`, or by using `httplib` directly). Start by +sending exactly the headers that Firefox or IE send. You may need to make sure +that a valid session ID is sent -- the one you got from your browser may no +longer be valid. If that works, you can begin the tedious process of changing +your headers and data until they match what your original code was sending. +You should end up with a minimal set of changes. If you think that reveals a +bug in mechanize, please [report it](support.html). + + +Logging +------- + +To enable logging to stdout: + +~~~~{.python} +import sys, logging +logger = logging.getLogger("mechanize") +logger.addHandler(logging.StreamHandler(sys.stdout)) +logger.setLevel(logging.DEBUG) +~~~~ + +You can reduce the amount of information shown by setting the level to +`logging.INFO` instead of `logging.DEBUG`, or by only enabling logging for one +of the following logger names instead of `"mechanize"`: + + * `"mechanize"`: Everything. + + * `"mechanize.cookies"`: Why particular cookies are accepted or rejected and why +they are or are not returned. Requires logging enabled at the `DEBUG` level. + + * `"mechanize.http_responses"`: HTTP response body data. + + * `"mechanize.http_redirects"`: HTTP redirect information. 
+ + +HTTP headers +------------ + +An example showing how to enable printing of HTTP headers to stdout, logging of +HTTP response bodies, and logging of information about redirections: + +~~~~{.python} +import sys, logging +import mechanize + +logger = logging.getLogger("mechanize") +logger.addHandler(logging.StreamHandler(sys.stdout)) +logger.setLevel(logging.DEBUG) + +browser = mechanize.Browser() +browser.set_debug_http(True) +browser.set_debug_responses(True) +browser.set_debug_redirects(True) +response = browser.open("http://python.org/") +~~~~ + +Alternatively, you can examine request and response objects to see what's going +on. Note that requests may involve "sub-requests" in cases such as +redirection, in which case you will not see everything that's going on just by +examining the original request and final response. It's often useful to [use +the `.get_data()` method](./doc.html#seekable-responses) on responses during +debugging. + +### Handlers ### + +**This section is not relevant if you use `mechanize.Browser`.** + +An example showing how to enable printing of HTTP headers to stdout, at the +`HTTPHandler` level: + +~~~~{.python} +import mechanize +hh = mechanize.HTTPHandler() # you might want HTTPSHandler, too +hh.set_http_debuglevel(1) +opener = mechanize.build_opener(hh) +response = opener.open(url) +~~~~ + +The following handlers are available: + +**NOTE**: as well as having these handlers in your `OpenerDirector` (for +example, by passing them to `build_opener()`) you have to [turn on +logging](#logging) at the `INFO` level or lower in order to see any output. 
+
+`HTTPRedirectDebugProcessor`: logs information about redirections
+
+`HTTPResponseDebugProcessor`: logs HTTP response bodies (including those that
+are read during redirections)
+
+
+<!-- Local Variables: -->
+<!-- fill-column:79 -->
+<!-- End: -->
diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/ChangeLog.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/html/ChangeLog.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d41559fb666f3915b3f2b7d82ec5334e70744173
--- /dev/null
+++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/ChangeLog.txt
@@ -0,0 +1,553 @@
+This isn't really in proper GNU ChangeLog format, it just happens to
+look that way.
+
+2011-03-31 John J Lee <jjl@pobox.com>
+ * 0.2.5 release.
+ * This is essentially a no-changes release to fix easy_install
+ breakage caused by a SourceForge issue
+ * Sourceforge is returning invalid HTTP responses, make download
+ links point to PyPI instead
+ * Include cookietest.cgi in source distribution
+ * Note new IETF cookie standardisation effort
+
+2010-10-28 John J Lee <jjl@pobox.com>
+ * 0.2.4 release.
+ * Fix IndexError on empty Content-type header value. (GH-18)
+ * Fall back to another encoding if an unknown one is declared.
+ Fixes traceback on unknown encoding in Content-type header.
+ (GH-30)
+
+2010-10-16 John J Lee <jjl@pobox.com>
+ * 0.2.3 release.
+ * Fix str(ParseError()) traceback. (GH-25)
+ * Add equality methods to mechanize.Cookie . (GH-29)
+
+2010-07-17 John J Lee <jjl@pobox.com>
+ * 0.2.2 release.
+ * Officially support Python 2.7 (no changes were required)
+ * Fix TypeError on .open()ing ftp: URL (only affects Python 2.4
+ and 2.5)
+ * Don't include HTTPSHandler in __all__ if it's not available
+
+2010-05-16 John J Lee <jjl@pobox.com>
+ * 0.2.1 release.
+ * API change: Change argument order of
+ HTTPRedirectHandler.redirect_request() to match urllib2.
+ * Fix failure to use bundled BeautifulSoup for forms. 
(GH-15) + * Fix default cookie path where request path has query containing + / character. (http://bugs.python.org/issue3704) + * Fix failure to raise on click for nonexistent label. (GH-16) + * Documentation fixes. + +2010-04-22 John J Lee <jjl@pobox.com> + * 0.2.0 release. + * Behaviour change: merged upstream urllib2 change (allegedly a + "bug fix") to return a response for all 2** HTTP responses (e.g. + 206 Partial Content). Previously, only 200 caused a response + object to be returned. All other HTTP response codes resulted + in a response object being raised as an exception. + * Behaviour change: Use of mechanize classes with `urllib2` (and + vice-versa) is no longer supported. However, existing classes + implementing the urllib2 Handler interface are likely to work + unchanged with mechanize. Removed RequestUpgradeProcessor, + ResponseUpgradeProcessor, SeekableProcessor. + * ClientForm has been merged into mechanize. This means that + mechanize has no dependencies other than Python itself. The + ClientForm API is still available -- to switch from ClientForm to + mechanize, just s/ClientForm/mechanize in your source code, and + ensure any use of the module logging logger named "ClientForm" is + updated to use the new logger name "mechanize.forms". I probably + won't do further standalone releases of ClientForm. + * Stop monkey-patching Python stdlib. + * Merge fixes from urllib2 trunk + * Close file objects on .read() failure in .retrieve() + * Fix a python 2.4 bug due to buggy urllib.splithost + * Fix Python 2.4 syntax error in _firefox3cookiejar + * Fix __init__.py typo that hid mechanize.seek_wrapped_response and + mechanize.str2time. Fixes + http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=465206 + * Fix an obvious bug with experimental firefox 3 cookiejar support. + It's still experimental and not properly tested. + * Change documentation to not require a .port attribute on request + objects, since that's unused. 
+ * Doc fixes + * Added mechanize.urljoin (RFC 3986 compliant function for joining + a base URI with a URI reference) + * Merge of ClientForm (see above). + * Moved to git (from SVN) http://github.com/jjlee/mechanize + * Created an issue tracker http://github.com/jjlee/mechanize/issues + * Docs are now in markdown format (thanks John Gabriele). + * Website rearranged. The old website has been archived at + http://wwwsearch.sourceforge.net/old/ . The new website is + essentially just the mechanize pages, rearranged and cleaned up a + bit. + * Source code rearranged for easier merging with upstream urllib2 + * Fully automated release process. + * New test runner. Single test suite; tests create their own HTTP + server fixtures (server fixtures are cached where possible for + speed). + +2009-02-07 John J Lee <jjl@pobox.com> + * 0.1.11 release. + * Fix quadratic performance in number of .read() calls (and add an + automated performance test). + +2008-12-03 John J Lee <jjl@pobox.com> + * 0.1.10 release. + * Add support for Python 2.6: Raise URLError on file: URL errors, + not IOError (port of upstream urllib2 fix). Add support for + Python 2.6's per-connection timeouts: Add timeout arguments to + urlopen(), Request constructor, .open(), and .open_novisit(). + * Drop support for Python 2.3 + * Add Content-length header to Request object (httplib bug that + prevented doing that was fixed in Python 2.4). There's no + change is what is actually sent over the wire here, just in what + headers get added to the Request object. + * Fix AttributeError on .retrieve() with a Request (as opposed to + URL string) argument + * Don't change CookieJar state in .make_cookies(). + * Fix AttributeError in case where .make_cookies() or + .cookies_for_request() is called before other methods like + .extract_cookies() or .make_cookie_header() + * Fixes affecting version cookie-attribute + (http://bugs.python.org/issue3924). 
+ * Silence module logging's "no handlers could be found for logger + mechanize" warning in a way that doesn't clobber attempts to set + log level sometimes + * Don't use private attribute of request in request upgrade + handler (what was I thinking??) + * Don't call setup() on import of setup.py + * Add new public function effective_request_host + * Add .get_policy() method to CookieJar + * Add method CookieJar.cookies_for_request() + * Fix documented interface required of requests and responses (and + add some tests for this!) + * Allow either .is_unverifiable() or .unverifiable on request + objects (preferring the former) + * Note that there's a new functional test for digest auth, which + fails when run against the sourceforge site (which is the + default). It looks like this reflects the fact that digest auth + has been fairly broken since it was introduced in urllib2. I + don't plan to fix this myself. + +2008-09-24 John J Lee <jjl@pobox.com> + * 0.1.9 release. + * Fix ImportError if sqlite3 not available + * Fix a couple of functional tests not to wait 5 seconds each + +2008-09-13 John J Lee <jjl@pobox.com> + * 0.1.8 release. + * Close sockets. This only affects Python 2.5 (and later) - + earlier versions of Python were unaffected. See + http://bugs.python.org/issue1627441 + * Make title parsing follow Firefox behaviour wrt child + elements (previously the behaviour differed between Factory and + RobustFactory). + * Fix BeautifulSoup RobustLinksFactory (hence RobustFactory) link + text parsing for case of link text containing tags (Titus Brown) + * Fix issue where more tags after <title> caused default parser to + raise an exception + * Handle missing cookie max-age value. Previously, a warning was + emitted in this case. + * Fix thoroughly broken digest auth (still need functional + test!) (trebor74hr@...) 
+ * Handle cookies containing embedded tabs in mozilla format files + * Remove an assertion about mozilla format cookies file + contents (raise LoadError instead) + * Fix MechanizeRobotFileParser.set_opener() + * Fix selection of global form using .select_form() (Titus Brown) + * Log skipped Refreshes + * Stop tests from clobbering files that happen to be lying around + in cwd (!) + * Use SO_REUSEADDR for local test server. + * Raise exception if local test server fails to start. + * Tests no longer (accidentally) depend on third-party coverage + module + * The usual docs and test fixes. + * Add convenience method Browser.open_local_file(filename) + * Add experimental support for Firefox 3 cookie jars + ("cookies.sqlite"). Requires Python 2.5 + * Fix a _gzip.py NameError (gzip support is experimental) + +2007-05-31 John J Lee <jjl@pobox.com> + * 0.1.7b release. + * Sub-requests should not usually be visiting, so make it so. In + fact the visible behaviour wasn't really broken here, since + .back() skips over None responses (which is odd in itself, but + won't be changed until after stable release is branched). + However, this patch does change visible behaviour in that it + creates new Request objects for sub-requests (e.g. basic auth + retries) where previously we just mutated the existing Request + object. + * Changes to sort out abuse of by SeekableProcessor and + ResponseUpgradeProcessor (latter shouldn't have been public in + the first place) and resulting confusing / unclear / broken + behaviour. Deprecate SeekableProcessor and + ResponseUpgradeProcessor. Add SeekableResponseOpener. Remove + SeekableProcessor and ResponseUpgradeProcessor from Browser. + Move UserAgentBase.add_referer_header() to Browser (it was on by + default, breaking UserAgent, and should never really have been + there). + * Fix HTTP proxy support: r29110 meant that Request.get_selector() + didn't take into account the change to .__r_host + (Thanks tgates@...). 
+ * Redirected robots.txt fetch no longer results in another + attempted robots.txt fetch to check the redirection is allowed! + * Fix exception raised by RFC 3986 implementation with + urljoin(base, '/..') + * Fix two multiple-response-wrapping bugs. + * Add missing import in tests (caused failure on Windows). + * Set svn:eol-style to native for all text files in SVN. + * Add some tests for upgrade_response(). + * Add a functional test for 302 + 404 case. + * Add an -l option to run the functional tests against a local + twisted.web2-based server (you need Twisted installed for this + to work). This is much faster than running against + wwwsearch.sourceforge.net + * Add -u switch to skip unittests (and only run the doctests). + +2007-01-07 John J Lee <jjl@pobox.com> + + * 0.1.6b release + * Add mechanize.ParseError class, document it as part of the + mechanize.Factory interface, and raise it from all Factory + implementations. This is backwards-compatible, since the new + exception derives from the old exceptions. + * Bug fix: Truncation when there is no full .read() before + navigating to the next page, and an old response is read after + navigation. This happened e.g. with r = br.open(); + r.readline(); br.open(url); r.read(); br.back() . + * Bug fix: when .back() caused a reload, it was returning the old + response, not the .reload()ed one. + * Bug fix: .back() was not returning a copy of the response, which + presumably would cause seek position problems. + * Bug fix: base tag without href attribute would override document + URL with a None value, causing a crash (thanks Nathan Eror). + * Fix .set_response() to close current response first. + * Fix non-idempotent behaviour of Factory.forms() / .links() . + Previously, if for example you got a ParseError during execution + of .forms(), you could call it again and have it not raise an + exception, because it started out where it left off! + * Add a missing copy.copy() to RobustFactory . 
+ * Fix redirection to 'URIs' that contain characters that are not + allowed in URIs (thanks Riko Wichmann). Also, Request + constructor now logs a module logging warning about any such bad + URIs. + * Add .global_form() method to Browser to support form controls + whose HTML elements are not descendants of any FORM element. + * Add a new method .visit_response() . This creates a new history + entry from a response object, rather than just changing the + current visited response. This is useful e.g. when you want to + use Browser features in a handler. + * Misc minor bug fixes. + +2006-10-25 John J Lee <jjl@pobox.com> + + * 0.1.5b release: Update setuptools dependencies to depend on + ClientForm>=0.2.5 (for an important bug fix affecting fragments + in URLs). There are no other changes in this release -- this + release was done purely so that people upgrading to the latest + version of mechanize will get the latest ClientForm too. + +2006-10-14 John J Lee <jjl@pobox.com> + * 0.1.4b release: (skipped a version deliberately for obscure + reasons) + * Improved auth & proxies support. + * Follow RFC 3986. + * Add a .set_cookie() method to Browser . + * Add Browser.open_novisit() and Request.visit to allow fetching + files without affecting Browser state. + * UserAgent and Browser are now subclasses of UserAgentBase. + UserAgent's only role in life above what UserAgentBase does is + to provide the .set_seekable_responses() method (it lives there + because Browser depends on seekable responses, because that's + how browser history is implemented). + * Bundle BeautifulSoup 2.1.1. No more dependency pain! Note that + BeautifulSoup is, and always was, optional, and that mechanize + will eventually switch to BeautifulSoup version 3, at which + point it may well stop bundling BeautifulSoup. Note also that + the module is only used internally, and is not available as a + public attribute of the package. 
If you dare, you can import it + ("from mechanize import _beautifulsoup"), but beware that it + will go away later, and that the API of BeautifulSoup will + change when the upgrade to 3 happens. Also, BeautifulSoup + support (mainly RobustFactory) is still a little experimental + and buggy. + * Fix HTTP-EQUIV with no content attribute case (thanks Pratik + Dam). + * Fix bug with quoted META Refresh URL (thanks Nilton Volpato). + * Fix crash with </base> tag (yajdbgr02@...). + * Somebody found a server that (incorrectly) depends on HTTP + header case, so follow the Title-Case convention. Note that the + Request headers interface(s), which were (somewhat oddly -- this + is an inheritance from urllib2 that should really be fixed in a + better way than it is currently) always case-sensitive still + are; the only thing that changed is what actually eventually + gets sent over the wire. + * Use mechanize (not urllib) to open robots.txt. Don't consult + RobotFileParser instance about non-HTTP URLs. + * Fix OpenerDirector.retrieve(), which was very broken (thanks + Duncan Booth). + * Crash in a much more obvious way if trying to use OpenerDirector + after .close() . + * .reload() on .back() if necessary (necessary iff response was + not fully .read() on first .open()ing ) * Strip fragments before + retrieving URLs (fixed Request.get_selector() to strip fragment) + * Fix catching HTTPError subclasses while still preserving all + their response behaviour + * Correct over-enthusiastic documented guarantees of + closeable_response . + * Fix assumption that httplib.HTTPMessage treats dict-style + __setitem__ as append rather than set (where on earth did I get + that from?). + * Expose History in mechanize/__init__.py (though interface is + still experimental). 
+ * Lots of other "internals" bugs fixed (thanks to reports / + patches from Benji York especially, also Titus Brown, Duncan + Booth, and me ;-), where I'm not 100% sure exactly when they + were introduced, so not listing them here in detail. + * Numerous other minor fixes. + * Some code cleanup. + +2006-05-21 John J Lee <jjl@pobox.com> + * 0.1.2b release: + * mechanize now exports the whole urllib2 interface. + * Pull in bugfixed auth/proxy support code from Python 2.5. + * Bugfix: strip leading and trailing whitespace from link URLs + * Fix .any_response() / .any_request() methods to have ordering. + consistent with rest of handlers rather than coming before all + of them. + * Tell cookie-handling code about new TLDs. + * Remove Browser.set_seekable_responses() (they always are + anyway). + * Show in web page examples how to munge responses and how to do + proxy/auth. + * Rename 0.1.* changes document 0.1.0-changes.txt --> + 0.1-changes.txt. + * In 0.1 changes document, note change of logger name from + "ClientCookie" to "mechanize" + * Add something about response objects to changes document + * Improve Browser.__str__ + * Accept regexp strings as well as regexp objects when finding + links. + * Add crappy gzip transfer encoding support. This is off by + default and warns if you turn it on (hopefully will get better + later :-). + * A bit of internal cleanup following merge with pullparser / + ClientCookie. + +2006-05-06 John J Lee <jjl@pobox.com> + * 0.1.1a release: + * Merge ClientCookie and pullparser with mechanize. + * Response object fixes. + * Remove accidental dependency on BeautifulSoup introduced in + 0.1.0a (the BeautifulSoup support is still here, but + BeautifulSoup is not required to use mechanize). + +2006-05-03 John J Lee <jjl@pobox.com> + * 0.1.0a release: + * Stop trying to record precise dates in changelog, since that's + silly ;-) + * A fair number of interface changes: see 0.1.0-changes.txt. 
+ * Depend on recent ClientCookie with copy.copy()able response + objects. + * Don't do broken XHTML handling by default (need to review code + before switching this back on, e.g. should use a real XML parser + for first-try at parsing). To get the old behaviour, pass + i_want_broken_xhtml_support=True to mechanize.DefaultFactory / + .RobustFactory constructor. + * Numerous small bug fixes. + * Documentation & setup.py fixes. + * Don't use cookielib, to avoid having to work around Python 2.4 + RFC 2109 bug, and to avoid my braindead thread synchronisation + code in cookielib :-((((( (I haven't encountered specific + breakage due to latter, but since it's braindead I may as well + avoid it). + +2005-11-30 John J Lee <jjl@pobox.com> + * Fixed setuptools support. + * Release 0.0.11a. + +2005-11-19 John J Lee <jjl@pobox.com> + * Release 0.0.10a. + +2005-11-17 John J Lee <jjl@pobox.com> + * Fix set_handle_referer. + +2005-11-12 John J Lee <jjl@pobox.com> + * Fix history (Gary Poster). + * Close responses on reload (Gary Poster). + * Don't depend on SSL support (Gary Poster). + +2005-10-31 John J Lee <jjl@pobox.com> + * Add setuptools support. + +2005-10-30 John J Lee <jjl@pobox.com> + * Don't mask AttributeError exception messages from ClientForm. + * Document intent of .links() vs. .get_links_iter(); Rename + LinksFactory method. + * Remove pullparser import dependency. + * Remove Browser.urltags (now an argument to LinksFactory). + * Document Browser constructor as taking keyword args only (and + change positional arg spec). + * Cleanup of lazy parsing (may fix bugs, not sure...). + +2005-10-28 John J Lee <jjl@pobox.com> + * Support ClientForm backwards_compat switch. + +2005-08-28 John J Lee <jjl@pobox.com> + * Apply optimisation patch (Stephan Richter). + +2005-08-15 John J Lee <jjl@pobox.com> + * Close responses (ie. close the file handles but leave response + still .read()able &c., thanks to the response objects we're + using) (aurel@nexedi.com). 
+ +2005-08-14 John J Lee <jjl@pobox.com> + * Add missing argument to UserAgent's _add_referer_header stub. + * Doc and comment improvements. + +2005-06-28 John J Lee <jjl@pobox.com> + * Allow specifying parser class for equiv handling. + * Ensure correct default constructor args are passed to + HTTPRefererProcessor. + * Allow configuring details of Refresh handling. + * Switch to tolerant parser. + +2005-06-11 John J Lee <jjl@pobox.com> + * Do .seek(0) after link parsing in a finally block. + * Regard text/xhtml as HTML. + * Fix 2.4-compatibility bugs. + * Fix spelling of '_equiv' feature string. + +2005-05-30 John J Lee <jjl@pobox.com> + * Turn on Referer, Refresh and HTTP-Equiv handling by default. + +2005-05-08 John J Lee <jjl@pobox.com> + * Fix .reload() to not update history (thanks to Titus Brown). + * Use cookielib where available + +2005-03-01 John J Lee <jjl@pobox.com> + * Fix referer bugs: Don't send URL fragments; Don't add in Referer + header in redirected request unless original request had a + Referer header. + +2005-02-19 John J Lee <jjl@pobox.com> + * Allow supplying own mechanize.FormsFactory, so eg. can use + ClientForm.XHTMLFormParser. Also allow supplying own Request + class, and use sensible defaults for this. Now depends on + ClientForm 0.1.17. Side effect is that, since we use the + correct Request class by default, there's (I hope) no need for + using RequestUpgradeProcessor in Browser._add_referer_header() + :-) + +2005-01-30 John J Lee <jjl@pobox.com> + * Released 0.0.9a. + +2005-01-05 John J Lee <jjl@pobox.com> + * Fix examples (scraped sites have changed). + * Fix .set_*() method boolean arguments. + * The .response attribute is now a method, .response() + * Don't depend on BaseProcessor (no longer exists). + +2004-05-18 John J Lee <jjl@pobox.com> + * Released 0.0.8a: + * Added robots.txt observance, controlled by + * BASE element has attribute 'href', not 'uri'! 
(patch from Jochen + Knuth) + * Fixed several bugs in handling of Referer header. + * Link.__eq__ now returns False instead of raising AttributeError + on comparison with non-Link (patch from Jim Jewett) + * Removed dependencies on HTTPS support in Python and on + ClientCookie.HTTPRobotRulesProcessor + +2004-01-18 John J Lee <jjl@pobox.com> + * Added robots.txt observance, controlled by + UserAgent.set_handle_robots(). This is now on by default. + * Removed set_persistent_headers() method -- just use .addheaders, + as in base class. + +2004-01-09 John J Lee <jjl@pobox.com> + * Removed unnecessary dependence on SSL support in Python. Thanks + to Krzysztof Kowalczyk for bug report. + * Released 0.0.7a. + +2004-01-06 John J Lee <jjl@pobox.com> + * Link instances may now be passed to .click_link() and + .follow_link(). + * Added a new example program, pypi.py. + +2004-01-05 John J Lee <jjl@pobox.com> + * Released 0.0.5a. + * If <title> tag was missing, links and forms would not be parsed. + Also, base element (giving base URI) was ignored. Now parse + title lazily, and get base URI while parsing links. Also, fixed + ClientForm to take note of base element. Thanks to Phillip J. + Eby for bug report. + * Released 0.0.6a. + +2004-01-04 John J Lee <jjl@pobox.com> + * Fixed _useragent._replace_handler() to update self.handlers + correctly. + * Updated required pullparser version check. + * Visiting a URL now deselects form (sets self.form to None). + * Only first Content-Type header is now checked by + ._viewing_html(), if there are more than one. + * Stopped using getheaders from ClientCookie -- don't need it, + since depend on Python 2.2, which has .getheaders() method on + responses. Improved comments. + * .open() now resets .response to None. Also rearranged .open() a + bit so instance remains in consistent state on failure. + * .geturl() now checks for non-None .response, and raises Browser. 
+
+ * .back() now checks for non-None .response, and doesn't attempt
+ to parse if it's None.
+ * .reload() no longer adds new history item.
+ * Documented tag argument to .find_link().
+ * Fixed a few places where non-keyword arguments for .find_link()
+ were silently ignored. Now raises ValueError.
+
+2004-01-02 John J Lee <jjl@pobox.com>
+ * Use response_seek_wrapper instead of seek_wrapper, which broke
+ use of responses after they're closed.
+ * (Fixed response_seek_wrapper in ClientCookie.)
+ * Fixed adding of Referer header. Thanks to Per Cederqvist for
+ bug report.
+ * Released 0.0.4a.
+ * Updated required ClientCookie version check.
+
+2003-12-30 John J Lee <jjl@pobox.com>
+ * Added support for character encodings (for matching link text).
+ * Released 0.0.3a.
+
+2003-12-28 John J Lee <jjl@pobox.com>
+ * Attribute lookups are no longer forwarded to .response --
+ you have to do it explicitly.
+ * Added .geturl() method, which just delegates to .response.
+ * Big rehash of UserAgent, which was broken. Added a test.
+ * Discovered that zip() doesn't raise an exception when its
+ arguments are of different length, so several tests could pass
+ when they should have failed. Fixed.
+ * Fixed <A/> case in ._parse_html().
+ * Released 0.0.2a.
+
+2003-12-27 John J Lee <jjl@pobox.com>
+ * Added and improved docstrings.
+ * Browser.form is now a public attribute. Also documented
+ Browser's public attributes.
+ * Added base_url and absolute_url attributes to Link.
+ * Tidied up .open(). Relative URL Request objects are no longer
+ converted to absolute URLs -- they should probably be absolute
+ in the first place anyway.
+ * Added proper Referer handling (the handler in ClientCookie is a
+ hack that only covers a restricted case).
+ * Added click_link method, for symmetry with .click() / .submit()
+ methods (which latter apply to forms). Of these methods,
+ .click/.click_link() returns a request, and .submit/
+ .follow_link() actually .open()s the request. 
+ * Updated broken example code. + +2003-12-24 John J Lee <jjl@pobox.com> + * Modified setup.py so can easily register with PyPI. + +2003-12-22 John J Lee <jjl@pobox.com> + * Released 0.0.1a. diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/development.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/development.html new file mode 100644 index 0000000000000000000000000000000000000000..27444cea760c01c5a6f84d8e51b53bb8301a45d1 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/development.html @@ -0,0 +1,87 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. Lee <jjl@pobox.com>"> +<meta name="date" content="2010-04-18"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Development</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. 
Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<a href="support.html"> +Support +</a> +</li> +<li> +<span class="thispage"> +Development +</span> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Development</h1> + + + +<div id="TOC"><ul> +<li><a href="#git-repository">git repository</a></li> +<li><a href="#old-repository">Old repository</a></li> +<li><a href="#bug-tracker">Bug tracker</a></li> +<li><a href="#mailing-list">Mailing list</a></li> +</ul></div> + +<div id="git-repository"> +<h2>git repository</h2> +<p>The <a href="http://git-scm.com/">git</a> repository is <a href="http://github.com/jjlee/mechanize">here</a>. To check it out:</p> +<p><code>git clone git://github.com/jjlee/mechanize.git</code></p> +<p>There is also <a href="http://github.com/jjlee/mechanize-build-tools">another repository</a>, which is only useful for making mechanize releases:</p> +<p><code>git clone git://github.com/jjlee/mechanize-build-tools.git</code></p> +</div> +<div id="old-repository"> +<h2>Old repository</h2> +<p>The <a href="http://codespeak.net/svn/wwwsearch/">old SVN repository</a> may be useful for viewing ClientForm history. ClientForm used to be a dependency of mechanize, but has been merged into mechanize as of release 0.2.0; the history wasn’t imported. To check out:</p> +<p><code>svn co http://codespeak.net/svn/wwwsearch/</code></p> +</div> +<div id="bug-tracker"> +<h2>Bug tracker</h2> +<p>The bug tracker is <a href="http://github.com/jjlee/mechanize/issues">here on github</a>. It’s equally acceptable to file bugs on the tracker or post about them to the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing list</a>. 
Feel free to send patches too!</p> +</div> +<div id="mailing-list"> +<h2>Mailing list</h2> +<p>There is a <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing list</a>.</p> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, April 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/doc.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/doc.html new file mode 100644 index 0000000000000000000000000000000000000000..29cd8f68c63a0639d8c9e59f1b71986748c16932 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/doc.html @@ -0,0 +1,247 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. Lee <jjl@pobox.com>"> +<meta name="date" content="2011-03-31"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Documentation</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. 
Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Documentation</h1> + +<ul id="subnav"> +<li> +<a href="support.html"> +Support +</a> +<ul> +<li> +<a href="documentation.html"> +Documentation +</a> +<ul> +<li> +<a href="faq.html"> +FAQ +</a> +</li> +<li> +<span class="thispage"> +Handlers etc. +</span> +</li> +<li> +<a href="forms.html"> +Forms +</a> +</li> +<li> +<a href="hints.html"> +Hints +</a> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li><a href="#examples">Examples</a></li> +<li><a href="#important-note">Important note</a></li> +<li><a href="#cooperating-with-browsers">Cooperating with Browsers</a></li> +<li><a href="#saving-cookies-in-a-file">Saving cookies in a file</a></li> +<li><a href="#supplying-a-cookiejar">Supplying a CookieJar</a></li> +<li><a href="#additional-handlers">Additional Handlers</a></li> +<li><a href="#seekable-responses">Seekable responses</a></li> +<li><a href="#request-object-lifetime">Request object lifetime</a></li> +<li><a href="#adding-headers">Adding headers</a></li> +<li><a href="#automatically-added-headers">Automatically-added headers</a></li> +<li><a href="#initiating-unverifiable-transactions">Initiating unverifiable transactions</a></li> +<li><a href="#rfc-2965-support">RFC 2965 support</a></li> +<li><a href="#parsing-http-dates">Parsing HTTP dates</a></li> +<li><a href="#dealing-with-bad-html">Dealing with bad HTML</a></li> +<li><a href="#note-about-cookie-standards">Note about cookie standards</a></li> +</ul></div> + +<p><span class="docwarning">This documentation is in need of reorganisation!</span></p> +<p>This page is the old 
ClientCookie documentation. It deals with operation on the level of <code>urllib2 Handler</code> objects, and also with adding headers, debugging, and cookie handling. See the <a href="./">front page</a> for more typical use.</p> +<div id="examples"> +<h2>Examples</h2> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>response = mechanize.urlopen(<span class="st">"http://example.com/"</span>)<br></pre> +<p>This function behaves identically to <code>urllib2.urlopen()</code>, except that it deals with cookies automatically.</p> +<p>Here is a more complicated example, involving <code>Request</code> objects (useful if you want to pass <code>Request</code>s around, add headers to them, etc.):</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>request = mechanize.Request(<span class="st">"http://example.com/"</span>)<br><span class="co"># note we're using the urlopen from mechanize, not urllib2</span><br>response = mechanize.urlopen(request)<br><span class="co"># let's say this next request requires a cookie that was set</span><br><span class="co"># in response</span><br>request2 = mechanize.Request(<span class="st">"http://example.com/spam.html"</span>)<br>response2 = mechanize.urlopen(request2)<br><br><span class="kw">print</span> response2.geturl()<br><span class="kw">print</span> response2.info() <span class="co"># headers</span><br><span class="kw">print</span> response2.read() <span class="co"># body (readline and readlines work too)</span><br></pre> +<p>In these examples, the workings are hidden inside the <code>mechanize.urlopen()</code> function, which is an extension of <code>urllib2.urlopen()</code>. 
Redirects, proxies and cookies are handled automatically by this function (note that you may need a bit of configuration to get your proxies correctly set up: see <code>urllib2</code> documentation).</p> +<p>There is also a <code>urlretrieve()</code> function, which works like <code>urllib.urlretrieve()</code>.</p> +<p>An example at a slightly lower level shows how the module processes cookies more clearly:</p> +<pre class="sourceCode python"><span class="co"># Don't copy this blindly! You probably want to follow the examples</span><br><span class="co"># above, not this one.</span><br><span class="ch">import</span> mechanize<br><br><span class="co"># Build an opener that *doesn't* automatically call .add_cookie_header()</span><br><span class="co"># and .extract_cookies(), so we can do it manually without interference.</span><br><span class="kw">class</span> NullCookieProcessor(mechanize.HTTPCookieProcessor):<br><span class="kw">def</span> http_request(<span class="ot">self</span>, request): <span class="kw">return</span> request<br><span class="kw">def</span> http_response(<span class="ot">self</span>, request, response): <span class="kw">return</span> response<br>opener = mechanize.build_opener(NullCookieProcessor)<br><br>request = mechanize.Request(<span class="st">"http://example.com/"</span>)<br>response = mechanize.urlopen(request)<br>cj = mechanize.CookieJar()<br>cj.extract_cookies(response, request)<br><span class="co"># let's say this next request requires a cookie that was set in response</span><br>request2 = mechanize.Request(<span class="st">"http://example.com/spam.html"</span>)<br>cj.add_cookie_header(request2)<br>response2 = mechanize.urlopen(request2)<br></pre> +<p>The <code>CookieJar</code> class does all the work. 
There are essentially two operations: <code>.extract_cookies()</code> extracts HTTP cookies from <code>Set-Cookie</code> (the original <a href="http://curl.haxx.se/rfc/cookie_spec.html">Netscape cookie standard</a>) and <code>Set-Cookie2</code> (<a href="http://www.ietf.org/rfc/rfc2965.txt">RFC 2965</a>) headers from a response if and only if they should be set given the request, and <code>.add_cookie_header()</code> adds <code>Cookie</code> headers if and only if they are appropriate for a particular HTTP request. Incoming cookies are checked for acceptability based on the host name, etc. Cookies are only set on outgoing requests if they match the request’s host name, path, etc.</p> +<p><strong>Note that if you’re using <code>mechanize.urlopen()</code> (or if you’re using <code>mechanize.HTTPCookieProcessor</code> by some other means), you don’t need to call <code>.extract_cookies()</code> or <code>.add_cookie_header()</code> yourself</strong>. If, on the other hand, you want to use mechanize to provide cookie handling for an HTTP client other than mechanize itself, you will need to use this pair of methods. You can make your own <code>request</code> and <code>response</code> objects, which must support the interfaces described in the docstrings of <code>.extract_cookies()</code> and <code>.add_cookie_header()</code>.</p> +<p>There are also some <code>CookieJar</code> subclasses which can store cookies in files and databases. <code>FileCookieJar</code> is the abstract class for <code>CookieJar</code>s that can store cookies in disk files. <code>LWPCookieJar</code> saves cookies in a format compatible with the libwww-perl library. 
This class is convenient if you want to store cookies in a human-readable file:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cj = mechanize.LWPCookieJar()<br>cj.revert(<span class="st">"cookie3.txt"</span>)<br>opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))<br>r = opener.<span class="dt">open</span>(<span class="st">"http://foobar.com/"</span>)<br>cj.save(<span class="st">"cookie3.txt"</span>)<br></pre> +<p>The <code>.revert()</code> method discards all existing cookies held by the <code>CookieJar</code> (it won’t lose any existing cookies if the load fails). The <code>.load()</code> method, on the other hand, adds the loaded cookies to existing cookies held in the <code>CookieJar</code> (old cookies are kept unless overwritten by newly loaded ones).</p> +<p><code>MozillaCookieJar</code> can load and save to the Mozilla/Netscape/lynx-compatible <code>'cookies.txt'</code> format. This format loses some information (unusual and nonstandard cookie attributes such as comment, and also information specific to RFC 2965 cookies). The subclass <code>MSIECookieJar</code> can load (but not save) from Microsoft Internet Explorer’s cookie files on Windows.</p> +</div> +<div id="important-note"> +<h2>Important note</h2> +<p>Only use names you can import directly from the <code>mechanize</code> package, and that don’t start with a single underscore. Everything else is subject to change or disappearance without notice.</p> +</div> +<div id="cooperating-with-browsers"> +<h2>Cooperating with Browsers</h2> +<p><strong>Firefox since version 3 persists cookies in an sqlite database, which is not supported by MozillaCookieJar.</strong></p> +<p>The subclass <code>MozillaCookieJar</code> differs from <code>CookieJar</code> only in storing cookies using a different, Firefox 2/Mozilla/Netscape-compatible, file format known as “cookies.txt”. The lynx browser also uses this format. 
This file format can’t store RFC 2965 cookies, so they are downgraded to Netscape cookies on saving. <code>LWPCookieJar</code> itself uses a libwww-perl specific format (`Set-Cookie3’) — see the example above. Python and your browser should be able to share a cookies file (note that the file location here will differ on non-unix OSes):</p> +<p><strong>WARNING:</strong> you may want to back up your browser’s cookies file if you use <code>MozillaCookieJar</code> to save cookies. I <em>think</em> it works, but there have been bugs in the past!</p> +<pre class="sourceCode python"><span class="ch">import</span> os, mechanize<br>cookies = mechanize.MozillaCookieJar()<br>cookies.load(os.path.join(os.environ[<span class="st">"HOME"</span>], <span class="st">"/.netscape/cookies.txt"</span>))<br><span class="co"># see also the save and revert methods</span><br></pre> +<p>Note that cookies saved while Mozilla is running will get clobbered by Mozilla — see <code>MozillaCookieJar.__doc__</code>.</p> +<p><code>MSIECookieJar</code> does the same for Microsoft Internet Explorer (MSIE) 5.x and 6.x on Windows, but does not allow saving cookies in this format. 
In future, the Windows API calls might be used to load and save (though the index has to be read directly, since there is no API for that, AFAIK; there’s also an unfinished <code>MSIEDBCookieJar</code>, which uses (reads and writes) the Windows MSIE cookie database directly, rather than storing copies of cookies as <code>MSIECookieJar</code> does).</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cj = mechanize.MSIECookieJar(delayload=<span class="ot">True</span>)<br>cj.load_from_registry() <span class="co"># finds cookie index file from registry</span><br></pre> +<p>A true <code>delayload</code> argument speeds things up.</p> +<p>On Windows 9x (win 95, win 98, win ME), you need to supply a username to the <code>.load_from_registry()</code> method:</p> +<pre class="sourceCode python">cj.load_from_registry(username=<span class="st">"jbloggs"</span>)<br></pre> +<p>Konqueror/Safari and Opera use different file formats, which aren’t yet supported.</p> +</div> +<div id="saving-cookies-in-a-file"> +<h2>Saving cookies in a file</h2> +<p>If you have no need to co-operate with a browser, the most convenient way to save cookies on disk between sessions in human-readable form is to use <code>LWPCookieJar</code>. This class uses a libwww-perl specific format (`Set-Cookie3’). 
Unlike <code>MozilliaCookieJar</code>, this file format doesn’t lose information.</p> +</div> +<div id="supplying-a-cookiejar"> +<h2>Supplying a CookieJar</h2> +<p>You might want to do this to <a href="#cooperating-with-browsers">use your browser’s cookies</a>, to customize <code>CookieJar</code>’s behaviour by passing constructor arguments, or to be able to get at the cookies it will hold (for example, for saving cookies between sessions and for debugging).</p> +<p>If you’re using the higher-level <code>urllib2</code>-like interface (<code>urlopen()</code>, etc), you’ll have to let it know what <code>CookieJar</code> it should use:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cookies = mechanize.CookieJar()<br><span class="co"># build_opener() adds standard handlers (such as HTTPHandler and</span><br><span class="co"># HTTPCookieProcessor) by default. The cookie processor we supply</span><br><span class="co"># will replace the default one.</span><br>opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))<br><br>r = opener.<span class="dt">open</span>(<span class="st">"http://example.com/"</span>) <span class="co"># GET</span><br>r = opener.<span class="dt">open</span>(<span class="st">"http://example.com/"</span>, data) <span class="co"># POST</span><br></pre> +<p>The <code>urlopen()</code> function uses a global <code>OpenerDirector</code> instance to do its work, so if you want to use <code>urlopen()</code> with your own <code>CookieJar</code>, install the <code>OpenerDirector</code> you built with <code>build_opener()</code> using the <code>mechanize.install_opener()</code> function, then proceed as usual:</p> +<pre class="sourceCode python">mechanize.install_opener(opener)<br>r = mechanize.urlopen(<span class="st">"http://example.com/"</span>)<br></pre> +<p>Of course, everyone using <code>urlopen</code> is using the same global <code>CookieJar</code> instance!</p> +<a name="policy"></a><p>You can set a policy 
object (must satisfy the interface defined by <code>mechanize.CookiePolicy</code>), which determines which cookies are allowed to be set and returned. Use the <code>policy</code> argument to the <code>CookieJar</code> constructor, or use the <code>.set\_policy()</code> method. The default implementation has some useful switches:</p> +<pre class="sourceCode python"><span class="ch">from</span> mechanize <span class="ch">import</span> CookieJar, DefaultCookiePolicy <span class="ch">as</span> Policy<br>cookies = CookieJar()<br><span class="co"># turn on RFC 2965 cookies, be more strict about domains when setting and</span><br><span class="co"># returning Netscape cookies, and block some domains from setting cookies</span><br><span class="co"># or having them returned (read the DefaultCookiePolicy docstring for the</span><br><span class="co"># domain matching rules here)</span><br>policy = Policy(rfc2965=<span class="ot">True</span>, strict_ns_domain=Policy.DomainStrict,<br> blocked_domains=[<span class="st">"ads.net"</span>, <span class="st">".ads.net"</span>])<br>cookies.set_policy(policy)<br></pre> +</div> +<div id="additional-handlers"> +<h2>Additional Handlers</h2> +<p>The following handlers are provided in addition to those provided by <code>urllib2</code>:</p> +<dl> +<dt><code>HTTPRobotRulesProcessor</code></dt> +<dd><p>WWW Robots (also called wanderers or spiders) are programs that traverse many pages in the World Wide Web by recursively retrieving linked pages. This kind of program can place significant loads on web servers, so there is a <a href="http://www.robotstxt.org/wc/norobots.html">standard</a> for a <code>robots.txt</code> file by which web site operators can request robots to keep out of their site, or out of particular areas of it. This handler uses the standard Python library’s <code>robotparser</code> module. 
It raises <code>mechanize.RobotExclusionError</code> (subclass of <code>mechanize.HTTPError</code>) if an attempt is made to open a URL prohibited by <code>robots.txt</code>.</p></dd> +<dt><code>HTTPEquivProcessor</code></dt> +<dd><p>The <code><META HTTP-EQUIV></code> tag is a way of including data in HTML to be treated as if it were part of the HTTP headers. mechanize can automatically read these tags and add the <code>HTTP-EQUIV</code> headers to the response object’s real HTTP headers. The HTML is left unchanged.</p></dd> +<dt><code>HTTPRefreshProcessor</code></dt> +<dd><p>The <code>Refresh</code> HTTP header is a non-standard header which is widely used. It requests that the user-agent follow a URL after a specified time delay. mechanize can treat these headers (which may have been set in <code><META HTTP-EQUIV></code> tags) as if they were 302 redirections. Exactly when and how <code>Refresh</code> headers are handled is configurable using the constructor arguments.</p></dd> +<dt><code>HTTPRefererProcessor</code></dt> +<dd><p>The <code>Referer</code> HTTP header lets the server know which URL you’ve just visited. Some servers use this header as state information, and don’t like it if this is not present. It’s a chore to add this header by hand every time you make a request. This adds it automatically. <strong>NOTE</strong>: this only makes sense if you use each handler for a single chain of HTTP requests (so, for example, if you use a single HTTPRefererProcessor to fetch a series of URLs extracted from a single page, <strong>this will break</strong>). 
<a href="../mechanize/">mechanize.Browser</a> does this properly.</p></dd> +</dl> +<p>Example:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cookies = mechanize.CookieJar()<br><br>opener = mechanize.build_opener(mechanize.HTTPRefererProcessor,<br> mechanize.HTTPEquivProcessor,<br> mechanize.HTTPRefreshProcessor,<br> )<br>opener.<span class="dt">open</span>(<span class="st">"http://www.rhubarb.com/"</span>)<br></pre> +</div> +<div id="seekable-responses"> +<h2>Seekable responses</h2> +<p>Response objects returned from (or raised as exceptions by) <code>mechanize.SeekableResponseOpener</code>, <code>mechanize.UserAgent</code> (if <code>.set_seekable_responses(True)</code> has been called) and <code>mechanize.Browser()</code> have <code>.seek()</code>, <code>.get_data()</code> and <code>.set_data()</code> methods:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>opener = mechanize.OpenerFactory(mechanize.SeekableResponseOpener).build_opener()<br>response = opener.<span class="dt">open</span>(<span class="st">"http://example.com/"</span>)<br><span class="co"># same return value as .read(), but without affecting seek position</span><br>total_nr_bytes = <span class="dt">len</span>(response.get_data())<br>assert <span class="dt">len</span>(response.read()) == total_nr_bytes<br>assert <span class="dt">len</span>(response.read()) == <span class="dv">0</span> <span class="co"># we've already read the data</span><br>response.seek(<span class="dv">0</span>)<br>assert <span class="dt">len</span>(response.read()) == total_nr_bytes<br>response.set_data(<span class="st">"blah</span><span class="ch">\n</span><span class="st">"</span>)<br>assert response.get_data() == <span class="st">"blah</span><span class="ch">\n</span><span class="st">"</span><br>...<br></pre> +<p>This caching behaviour can be avoided by using <code>mechanize.OpenerDirector</code>. It can also be avoided with <code>mechanize.UserAgent</code>. 
Note that <code>HTTPEquivProcessor</code> and <code>HTTPResponseDebugProcessor</code> require seekable responses and so are not compatible with <code>mechanize.OpenerDirector</code> and <code>mechanize.UserAgent</code>.</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>ua = mechanize.UserAgent()<br>ua.set_seekable_responses(<span class="ot">False</span>)<br>ua.set_handle_equiv(<span class="ot">False</span>)<br>ua.set_debug_responses(<span class="ot">False</span>)<br></pre> +<p>Note that if you turn on features that use seekable responses (currently: HTTP-EQUIV handling and response body debug printing), returned responses <em>may</em> be seekable as a side-effect of these features. However, this is not guaranteed (currently, in these cases, returned response objects are seekable, but raised respose objects — <code>mechanize.HTTPError</code> instances — are not seekable). This applies regardless of whether you use <code>mechanize.UserAgent</code> or <code>mechanize.OpenerDirector</code>. 
If you explicitly request seekable responses by calling <code>.set_seekable_responses(True)</code> on a <code>mechanize.UserAgent</code> instance, or by using <code>mechanize.Browser</code> or <code>mechanize.SeekableResponseOpener</code>, which always return seekable responses, then both returned and raised responses are guaranteed to be seekable.</p> +<p>Handlers should call <code>response = mechanize.seek_wrapped_response(response)</code> if they require the <code>.seek()</code>, <code>.get_data()</code> or <code>.set_data()</code> methods.</p> +</div> +<div id="request-object-lifetime"> +<h2>Request object lifetime</h2> +<p>Note that handlers may create new <code>Request</code> instances (for example when performing redirects) rather than adding headers to existing <code>Request</code> objects.</p> +</div> +<div id="adding-headers"> +<h2>Adding headers</h2> +<p>Adding headers is done like so:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>req = mechanize.Request(<span class="st">"http://foobar.com/"</span>)<br>req.add_header(<span class="st">"Referer"</span>, <span class="st">"http://wwwsearch.sourceforge.net/mechanize/"</span>)<br>r = mechanize.urlopen(req)<br></pre> +<p>You can also use the <code>headers</code> argument to the <code>mechanize.Request</code> constructor.</p> +<p>mechanize adds some headers to <code>Request</code> objects automatically — see the next section for details.</p> +</div> +<div id="automatically-added-headers"> +<h2>Automatically-added headers</h2> +<p><code>OpenerDirector</code> automatically adds a <code>User-Agent</code> header to every <code>Request</code>.</p> +<p>To change this and/or add similar headers, use your own <code>OpenerDirector</code>:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cookies = mechanize.CookieJar()<br>opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))<br>opener.addheaders = [(<span class="st">"User-agent"</span>, 
<span class="st">"Mozilla/5.0 (compatible; MyProgram/0.1)"</span>),<br> (<span class="st">"From"</span>, <span class="st">"responsible.person@example.com"</span>)]<br></pre> +<p>Again, to use <code>urlopen()</code>, install your <code>OpenerDirector</code> globally:</p> +<pre class="sourceCode python">mechanize.install_opener(opener)<br>r = mechanize.urlopen(<span class="st">"http://example.com/"</span>)<br></pre> +<p>Also, a few standard headers (<code>Content-Length</code>, <code>Content-Type</code> and <code>Host</code>) are added when the <code>Request</code> is passed to <code>urlopen()</code> (or <code>OpenerDirector.open()</code>). You shouldn’t need to change these headers, but since this is done by <code>AbstractHTTPHandler</code>, you can change the way it works by passing a subclass of that handler to <code>build_opener()</code> (or, as always, by constructing an opener yourself and calling <code>.add_handler()</code>).</p> +</div> +<div id="initiating-unverifiable-transactions"> +<h2>Initiating unverifiable transactions</h2> +<p>This section is only of interest for correct handling of third-party HTTP cookies. See <a href="#note-about-cookie-standards">below</a> for an explanation of ‘third-party’.</p> +<p>First, some terminology.</p> +<p>An <em>unverifiable request</em> (defined fully by (<a href="http://www.ietf.org/rfc/rfc2965.txt">RFC 2965</a>) is one whose URL the user did not have the option to approve. For example, a transaction is unverifiable if the request is for an image in an HTML document, and the user had no option to approve the fetching of the image from a particular URL.</p> +<p>The <em>request-host of the origin transaction</em> (defined fully by RFC 2965) is the host name or IP address of the original request that was initiated by the user. 
For example, if the request is for an image in an HTML document, this is the request-host of the request for the page containing the image.</p> +<p><strong>mechanize knows that redirected transactions are unverifiable, and will handle that on its own (ie. you don’t need to think about the origin request-host or verifiability yourself).</strong></p> +<p>If you want to initiate an unverifiable transaction yourself (which you should if, for example, you’re downloading the images from a page, and ‘the user’ hasn’t explicitly OKed those URLs):</p> +<pre class="sourceCode python">request = Request(origin_req_host=<span class="st">"www.example.com"</span>, unverifiable=<span class="ot">True</span>)<br></pre> +</div> +<div id="rfc-2965-support"> +<h2>RFC 2965 support</h2> +<p>Support for the RFC 2965 protocol is switched off by default, because few browsers implement it, so the RFC 2965 protocol is essentially never seen on the internet. To switch it on, see <a href="#policy">here</a>.</p> +</div> +<div id="parsing-http-dates"> +<h2>Parsing HTTP dates</h2> +<p>A function named <code>str2time</code> is provided by the package, which may be useful for parsing dates in HTTP headers. <code>str2time</code> is intended to be liberal, since HTTP date/time formats are poorly standardised in practice. There is no need to use this function in normal operations: <code>CookieJar</code> instances keep track of cookie lifetimes automatically. This function will stay around in some form, though the supported date/time formats may change.</p> +</div> +<div id="dealing-with-bad-html"> +<h2>Dealing with bad HTML</h2> +<p>XXX Intro</p> +<p>XXX Test me</p> +</div> +<div id="note-about-cookie-standards"> +<h2>Note about cookie standards</h2> +<p>There are several standards relevant to HTTP cookies.</p> +<p>The Netscape protocol is the only standard supported by most web browsers (including Internet Explorer and Firefox). 
This is a <em>de facto</em> standard defined by the behaviour of popular browsers, and neither the <a href="http://curl.haxx.se/rfc/cookie_spec.html">cookie_spec.html</a> document that was published by Netscape, nor the RFCs that were published later, describe the Netscape protocol accurately or completely. Netscape protocol cookies are also known as V0 cookies, to distinguish them from RFC 2109 or RFC 2965 cookies, which have a version cookie-attribute with a value of 1.</p> +<p><a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> was introduced to fix some problems identified with the Netscape protocol, while still keeping the same HTTP headers (<code>Cookie</code> and <code>Set-Cookie</code>). The most prominent of these problems is the ‘third-party’ cookie issue, which was an accidental feature of the Netscape protocol. Some features defined by RFC2109 (such as the port and max-age cookie attributes) are now part of the de facto Netscape protocol, but the RFC was never implemented fully by browsers, because of differences in behaviour between the Netscape and Internet Explorer browsers of the time.</p> +<p><a href="http://www.ietf.org/rfc/rfc2965.txt">RFC 2965</a> attempted to fix the compatibility problem by introducing two new headers, <code>Set-Cookie2</code> and <code>Cookie2</code>. Unlike the <code>Cookie</code> header, <code>Cookie2</code> does <em>not</em> carry cookies to the server — rather, it simply advertises to the server that RFC 2965 is understood. <code>Set-Cookie2</code> <em>does</em> carry cookies, from server to client: the new header means that both IE and Netscape ignore these cookies. This preserves backwards compatibility, but popular browsers did not implement the RFC, so it was never widely adopted. One confusing point to note about RFC 2965 is that it uses the same value (1) of the Version attribute in HTTP headers as does RFC 2109. 
See also <a href="http://www.ietf.org/rfc/rfc2964.txt">RFC 2964</a>, which discusses use of the protocol.</p> +<p>Because Netscape cookies are so poorly specified, the general philosophy of the module’s Netscape protocol implementation is to start with RFC 2965 and open holes where required for Netscape protocol-compatibility. RFC 2965 cookies are <em>always</em> treated as RFC 2965 requires, of course.</p> +<p>There is more information about the history of HTTP cookies in <a href="http://arxiv.org/abs/cs.SE/0105018">this paper by David Kristol</a>.</p> +<p>Recently (2011), <a href="http://tools.ietf.org/html/draft-ietf-httpstate-cookie">an IETF effort has started</a> to specify the syntax and semantics of the <code>Cookie</code> and <code>Set-Cookie</code> headers as they are actually used on the internet.</p> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, March 2011. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/documentation.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/documentation.html new file mode 100644 index 0000000000000000000000000000000000000000..07c9e391241b681bc76b48f12cdcd14e5cbbfbec --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/documentation.html @@ -0,0 +1,160 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. 
Lee <jjl@pobox.com>"> +<meta name="date" content="2010-07-17"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Documentation</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Documentation</h1> + +<ul id="subnav"> +<li> +<a href="support.html"> +Support +</a> +<ul> +<li> +<span class="thispage"> +Documentation +</span> +<ul> +<li> +<a href="faq.html"> +FAQ +</a> +</li> +<li> +<a href="doc.html"> +Handlers etc. 
+</a> +</li> +<li> +<a href="forms.html"> +Forms +</a> +</li> +<li> +<a href="hints.html"> +Hints +</a> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li> +<a href="#tests-and-examples">Tests and examples</a><ul> +<li><a href="#examples">Examples</a></li> +<li><a href="#tests">Tests</a></li> +</ul> +</li> +<li><a href="#the-urllib2-interface">The <code>urllib2</code> interface</a></li> +<li><a href="#compatibility">Compatibility</a></li> +<li><a href="#useragent-vs-useragentbase">UserAgent vs UserAgentBase</a></li> +</ul></div> + +<p>Full API documentation is in the docstrings and the documentation of <a href="http://docs.python.org/release/2.6/library/urllib2.html"><code>urllib2</code></a>. The documentation in these web pages is in need of reorganisation at the moment, after the merge of ClientCookie and ClientForm into mechanize.</p> +<div id="tests-and-examples"> +<h2>Tests and examples</h2> +<div id="examples"> +<h3>Examples</h3> +<p>The <a href="./">front page</a> has some introductory examples.</p> +<p>The <code>examples</code> directory in the source packages contains a couple of silly, but working, scripts to demonstrate basic use of the module.</p> +<p>See also the <a href="./forms.html">forms examples</a> (these examples use the forms API independently of <code>mechanize.Browser</code>).</p> +</div> +<div id="tests"> +<h3>Tests</h3> +<p>To run the tests:</p> +<pre>python test.py +</pre> +<p>There are some tests that try to fetch URLs from the internet. To include those in the test run:</p> +<pre>python test.py discover --tag internet +</pre> +</div> +</div> +<div id="the-urllib2-interface"> +<h2>The <code>urllib2</code> interface</h2> +<p>mechanize exports the complete interface of <code>urllib2</code>. See the <a href="http://docs.python.org/release/2.6/library/urllib2.html"><code>urllib2</code> documentation</a>. 
For example:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>response = mechanize.urlopen(<span class="st">"http://www.example.com/"</span>)<br><span class="kw">print</span> response.read()<br></pre> +</div> +<div id="compatibility"> +<h2>Compatibility</h2> +<p>These notes explain the relationship between mechanize, ClientCookie, ClientForm, <code>cookielib</code> and <code>urllib2</code>, and which to use when. If you’re just using mechanize, and not any of those other libraries, you can ignore this section.</p> +<ol> +<li><p>mechanize works with Python 2.4, Python 2.5, Python 2.6, and Python 2.7.</p></li> +<li><p>When using mechanize, anything you would normally import from <code>urllib2</code> should be imported from <code>mechanize</code> instead.</p></li> +<li><p>Use of mechanize classes with <code>urllib2</code> (and vice-versa) is no longer supported. However, existing classes implementing the <code>urllib2 Handler</code> interface are likely to work unchanged with mechanize.</p></li> +<li><p>mechanize now only imports <code>urllib2.URLError</code> and <code>urllib2.HTTPError</code> from <code>urllib2</code>. The rest is forked. I intend to merge fixes from Python trunk frequently.</p></li> +<li><p>ClientForm is no longer maintained as a separate package. The code is now part of mechanize, and its interface is now exported through module mechanize (since mechanize 0.2.0). Old code can simply be changed to <code>import mechanize as ClientForm</code> and should continue to work.</p></li> +<li><p>ClientCookie is no longer maintained as a separate package. The code is now part of mechanize, and its interface is now exported through module mechanize (since mechanize 0.1.0). 
Old code can simply be changed to <code>import mechanize as ClientCookie</code> and should continue to work.</p></li> +<li><p>The cookie handling parts of mechanize are in Python 2.4 standard library as module <code>cookielib</code> and extensions to module <code>urllib2</code>. mechanize does not currently use <code>cookielib</code>, due to the presence of thread synchronisation code in <code>cookielib</code> that is not present in the mechanize fork of <code>cookielib</code>.</p></li> +</ol> +<p>API differences between mechanize and <code>urllib2</code>:</p> +<ol> +<li><p>mechanize provides additional features.</p></li> +<li><p><code>mechanize.urlopen</code> differs in its behaviour: it handles cookies, whereas <code>urllib2.urlopen</code> does not. To make a <code>urlopen</code> function with the <code>urllib2</code> behaviour:</p></li> +</ol> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>handler_classes = [mechanize.ProxyHandler,<br> mechanize.UnknownHandler,<br> mechanize.HTTPHandler,<br> mechanize.HTTPDefaultErrorHandler,<br> mechanize.HTTPRedirectHandler,<br> mechanize.FTPHandler,<br> mechanize.FileHandler,<br> mechanize.HTTPErrorProcessor]<br>opener = mechanize.OpenerDirector()<br><span class="kw">for</span> handler_class in handler_classes:<br> opener.add_handler(handler_class())<br>urlopen = opener.<span class="dt">open</span><br></pre> +<ol><li>Since Python 2.6, <code>urllib2</code> uses a <code>.timeout</code> attribute on <code>Request</code> objects internally. However, <code>urllib2.Request</code> has no timeout constructor argument, and <code>urllib2.urlopen()</code> ignores this parameter. 
<code>mechanize.Request</code> has a <code>timeout</code> constructor argument which is used to set the attribute of the same name, and <code>mechanize.urlopen()</code> does not ignore the timeout attribute.</li></ol> +</div> +<div id="useragent-vs-useragentbase"> +<h2>UserAgent vs UserAgentBase</h2> +<p><code>mechanize.UserAgent</code> is a trivial subclass of <code>mechanize.UserAgentBase</code>, adding just one method, <code>.set_seekable_responses()</code> (see the <a href="./doc.html#seekable-responses">documentation on seekable responses</a>).</p> +<p>The reason for the extra class is that <code>mechanize.Browser</code> depends on seekable response objects (because response objects are used to implement the browser history).</p> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, July 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/download.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/download.html new file mode 100644 index 0000000000000000000000000000000000000000..7fc7ebde72ae8c2c48062d878a49b30ac15cfc39 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/download.html @@ -0,0 +1,99 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. 
Lee <jjl@pobox.com>"> +<meta name="date" content="2011-03-31"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Download</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<span class="thispage"> +Download +</span> +</li> +<li> +<a href="support.html"> +Support +</a> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Download</h1> + + + +<div id="TOC"><ul> +<li><a href="#easy_install">easy_install</a></li> +<li><a href="#source-code-release">Source code release</a></li> +<li><a href="#git-repository">git repository</a></li> +</ul></div> + +<p>There is more than one way to obtain mechanize:</p> +<p><em>Note re Windows and Mac support: currently the tests are only routinely run on <a href="http://www.ubuntu.com/">Ubuntu</a> 9.10 (“karmic”). 
However, as far as I know, mechanize works fine on Windows and Mac platforms.</em></p> +<div id="easy_install"> +<h2>easy_install</h2> +<ol> +<li><p>Install <a href="http://peak.telecommunity.com/DevCenter/EasyInstall">EasyInstall</a></p></li> +<li><p><code>easy_install mechanize</code></p></li> +</ol> +<p>Easy install will automatically download the latest source code release and install it.</p> +</div> +<div id="source-code-release"> +<h2>Source code release</h2> +<ol> +<li><p>Download the source from one of the links below</p></li> +<li><p>Unpack the source distribution and change directory to the resulting top-level directory.</p></li> +<li><p><code>python setup.py install</code></p></li> +</ol> +<p>This is a stable release.</p> +<ul> +<li><p><a href="http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.tar.gz"><code>mechanize-0.2.5.tar.gz</code></a></p></li> +<li><p><a href="http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.zip"><code>mechanize-0.2.5.zip</code></a></p></li> +<li><p><a href="./src/">Older versions.</a> Note: these are hosted on sourceforge, which at the time of writing (2011–03–31) is returning invalid HTTP responses — you can also find old releases on <a href="http://pypi.python.org/">PyPI</a>)</p></li> +</ul> +<p>All the documentation (these web pages, docstrings, and <a href="./ChangeLog.txt">the changelog</a>) is included in the distribution.</p> +</div> +<div id="git-repository"> +<h2>git repository</h2> +<p>The <a href="http://git-scm.com/">git</a> repository is <a href="http://github.com/jjlee/mechanize">here</a>. To check it out:</p> +<ol><li> +<p><code>git clone git://github.com/jjlee/mechanize.git</code></p> + +</li></ol> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, March 2011. 
+ +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/faq.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/faq.html new file mode 100644 index 0000000000000000000000000000000000000000..5d222ceb09440991147bbb6364f10b6aa63a47f5 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/faq.html @@ -0,0 +1,307 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. Lee <jjl@pobox.com>"> +<meta name="date" content="2010-10-16"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — FAQ</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. 
Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — FAQ</h1> + +<ul id="subnav"> +<li> +<a href="support.html"> +Support +</a> +<ul> +<li> +<a href="documentation.html"> +Documentation +</a> +<ul> +<li> +<span class="thispage"> +FAQ +</span> +</li> +<li> +<a href="doc.html"> +Handlers etc. +</a> +</li> +<li> +<a href="forms.html"> +Forms +</a> +</li> +<li> +<a href="hints.html"> +Hints +</a> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li><a href="#usage">Usage</a></li> +<li><a href="#cookies">Cookies</a></li> +<li><a href="#forms">Forms</a></li> +<li><a href="#general">General</a></li> +</ul></div> + +<div class="expanded"> + +<ul> +<li> +<p class="q">Which version of Python do I need?</p> +<p>Python 2.4, 2.5, 2.6, or 2.7. Python 3 is not yet supported.</p> +</li> +<li> +<p class="q">Does mechanize depend on BeautifulSoup?</p> +<p>No. mechanize offers a few classes that make use of BeautifulSoup, but these classes are not required to use mechanize. mechanize bundles BeautifulSoup version 2, so that module is no longer required. 
A future version of mechanize will support BeautifulSoup version 3, at which point mechanize will likely no longer bundle the module.</p> +</li> +<li> +<p class="q">Does mechanize depend on ClientForm?</p> +<p>No, ClientForm is now part of mechanize.</p> +</li> +<li> +<p class="q">Which license?</p> +<p>mechanize is dual-licensed: you may pick either the <a href="http://www.opensource.org/licenses/bsd-license.php">BSD license</a>, or the <a href="http://www.zope.org/Resources/ZPL">ZPL 2.1</a> (both are included in the distribution).</p> +</li> +</ul> +<div id="usage"> +<h2>Usage</h2> +<ul> +<li> +<p class="q">I’m not getting the HTML page I expected to see.</p> +<p><a href="hints.html">Debugging tips</a></p> +</li> +<li> +<p class="q"><code>Browser</code> doesn’t have all of the forms/links I see in the HTML. Why not?</p> +<p>Perhaps the default parser can’t cope with invalid HTML. Try using the included BeautifulSoup 2 parser instead:</p> +</li> +</ul> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br><br>browser = mechanize.Browser(factory=mechanize.RobustFactory())<br>browser.<span class="dt">open</span>(<span class="st">"http://example.com/"</span>)<br><span class="kw">print</span> browser.forms<br></pre> +<pre>Alternatively, you can process the HTML (and headers) arbitrarily: +</pre> +<pre class="sourceCode python">browser = mechanize.Browser()<br>browser.<span class="dt">open</span>(<span class="st">"http://example.com/"</span>)<br>html = browser.response().get_data().replace(<span class="st">"<br/>"</span>, <span class="st">"<br />"</span>)<br>response = mechanize.make_response(<br> html, [(<span class="st">"Content-Type"</span>, <span class="st">"text/html"</span>)],<br><span class="st">"http://example.com/"</span>, <span class="dv">200</span>, <span class="st">"OK"</span>)<br>browser.set_response(response)<br></pre> +<ul> +<li> +<p class="q">Is JavaScript supported?</p> +<p>No, sorry. 
See <a href="#change-value">FAQs</a> <a href="#script">below</a>.</p> +</li> +<li> +<p class="q">My HTTP response data is truncated.</p> +<p><code>mechanize.Browser's</code> response objects support the <code>.seek()</code> method, and can still be used after <code>.close()</code> has been called. Response data is not fetched until it is needed, so navigation away from a URL before fetching all of the response will truncate it. Call <code>response.get_data()</code> before navigation if you don’t want that to happen.</p> +</li> +<li><p class="q"><a name="xhtml"></a>I’m <em>sure</em> this page is HTML, why does <code>mechanize.Browser</code> think otherwise?</p></li> +</ul> +<pre class="sourceCode python">b = mechanize.Browser(<br><span class="co"># mechanize's XHTML support needs work, so is currently switched off. If</span><br><span class="co"># we want to get our work done, we have to turn it on by supplying a</span><br><span class="co"># mechanize.Factory (with XHTML support turned on):</span><br> factory=mechanize.DefaultFactory(i_want_broken_xhtml_support=<span class="ot">True</span>)<br> )<br></pre> +<ul> +<li> +<p class="q">Why don’t timeouts work for me?</p> +<p>Timeouts are ignored with with versions of Python earlier than 2.6. Timeouts do not apply to DNS lookups.</p> +</li> +<li> +<p class="q">Is there any example code?</p> +<p>Look in the <code>examples/</code> directory. Note that the examples on the <a href="./forms.html">forms page</a> are executable as-is. Contributions of example code would be very welcome!</p> +</li> +</ul> +</div> +<div id="cookies"> +<h2>Cookies</h2> +<ul> +<li> +<p class="q">Doesn’t the standard Python library module, <code>Cookie</code>, do this?</p> +<p>No: module <code>Cookie</code> does the server end of the job. It doesn’t know when to accept cookies from a server or when to send them back. 
Part of mechanize has been contributed back to the standard library as module <code>cookielib</code> (there are a few differences, notably that <code>cookielib</code> contains thread synchronization code; mechanize does not use <code>cookielib</code>).</p> +</li> +<li> +<p class="q">Which HTTP cookie protocols does mechanize support?</p> +<p>Netscape and <a href="http://www.ietf.org/rfc/rfc2965.txt">RFC 2965</a>. RFC 2965 handling is switched off by default.</p> +</li> +<li> +<p class="q">What about RFC 2109?</p> +<p>RFC 2109 cookies are currently parsed as Netscape cookies, and treated by default as RFC 2965 cookies thereafter if RFC 2965 handling is enabled, or as Netscape cookies otherwise.</p> +</li> +<li> +<p class="q">Why don’t I have any cookies?</p> +<p>See <a href="hints.html#cookies">here</a>.</p> +</li> +<li> +<p class="q">My response claims to be empty, but I know it’s not!</p> +<p>Did you call <code>response.read()</code> (e.g., in a debug statement), then forget that all the data has already been read? In that case, you may want to use <code>mechanize.response_seek_wrapper</code>. <code>mechanize.Browser</code> always returns <a href="doc.html#seekable-responses">seekable responses</a>, so it’s not necessary to use this explicitly in that case.</p> +</li> +<li> +<p class="q">What’s the difference between the <code>.load()</code> and <code>.revert()</code> methods of <code>CookieJar</code>?</p> +<p><code>.load()</code> <em>appends</em> cookies from a file. <code>.revert()</code> discards all existing cookies held by the <code>CookieJar</code> first (but it won’t lose any existing cookies if the loading fails).</p> +</li> +<li> +<p class="q">Is it threadsafe?</p> +<p>No. 
As far as I know, you can use mechanize in threaded code, but it provides no synchronisation: you have to provide that yourself.</p> +</li> +<li> +<p class="q">How do I do <X></p> +<p>Refer to the API documentation in docstrings.</p> +</li> +</ul> +</div> +<div id="forms"> +<h2>Forms</h2> +<ul> +<li> +<p class="q">Doesn’t the standard Python library module, <code>cgi</code>, do this?</p> +<p>No: the <code>cgi</code> module does the server end of the job. It doesn’t know how to parse or fill in a form or how to send it back to the server.</p> +</li> +<li> +<p class="q">How do I figure out what control names and values to use?</p> +<p><code>print form</code> is usually all you need. In your code, things like the <code>HTMLForm.items</code> attribute of <code>HTMLForm</code> instances can be useful to inspect forms at runtime. Note that it’s possible to use item labels instead of item names, which can be useful — use the <code>by_label</code> arguments to the various methods, and the <code>.get_value_by_label()</code> / <code>.set_value_by_label()</code> methods on <code>ListControl</code>.</p> +</li> +<li> +<p class="q">What do those <code>'*'</code> characters mean in the string representations of list controls?</p> +<p>A <code>*</code> next to an item means that item is selected.</p> +</li> +<li> +<p class="q">What do those parentheses (round brackets) mean in the string representations of list controls?</p> +<p>Parentheses <code>(foo)</code> around an item mean that item is disabled.</p> +</li> +<li> +<p class="q">Why doesn’t <some control> turn up in the data returned by <code>.click*()</code> when that control has non-<code>None</code> value?</p> +<p>Either the control is disabled, or it is not successful for some other reason. 
‘Successful’ (see <a href="http://www.w3.org/TR/REC-html40/interact/forms.html#h-17.13.2">HTML 4 specification</a>) means that the control will cause data to get sent to the server.</p> +</li> +<li> +<p class="q">Why does mechanize not follow the HTML 4.0 / RFC 1866 standards for <code>RADIO</code> and multiple-selection <code>SELECT</code> controls?</p> +<p>Because by default, it follows browser behaviour when setting the initially-selected items in list controls that have no items explicitly selected in the HTML. Use the <code>select_default</code> argument to <code>ParseResponse</code> if you want to follow the RFC 1866 rules instead. Note that browser behaviour violates the HTML 4.01 specification in the case of <code>RADIO</code> controls.</p> +</li> +<li> +<p class="q">Why does <code>.click()</code>ing on a button not work for me?</p> +<ul> +<li><p>Clicking on a <code>RESET</code> button doesn’t do anything, by design - this is a library for web automation, not an interactive browser. Even in an interactive browser, clicking on <code>RESET</code> sends nothing to the server, so there is little point in having <code>.click()</code> do anything special here.</p></li> +<li><p>Clicking on a <code>BUTTON TYPE=BUTTON</code> doesn’t do anything either, also by design. This time, the reason is that that <code>BUTTON</code> is only in the HTML standard so that one can attach JavaScript callbacks to its events. Their execution may result in information getting sent back to the server. mechanize, however, knows nothing about these callbacks, so it can’t do anything useful with a click on a <code>BUTTON</code> whose type is <code>BUTTON</code>.</p></li> +<li><p>Generally, JavaScript may be messing things up in all kinds of ways. 
See the answer to the next question.</p></li> +</ul> +</li> +<li> +<p class="q"><a name="change-value"></a>How do I change <code>INPUT TYPE=HIDDEN</code> field values (for example, to emulate the effect of JavaScript code)?</p> +<p>As with any control, set the control’s <code>readonly</code> attribute false.</p> +</li> +</ul> +<pre class="sourceCode python">form.find_control(<span class="st">"foo"</span>).readonly = <span class="ot">False</span> <span class="co"># allow changing .value of control foo</span><br>form.set_all_readonly(<span class="ot">False</span>) <span class="co"># allow changing the .value of all controls</span><br></pre> +<ul> +<li> +<p class="q">I’m having trouble debugging my code.</p> +<p>See <a href="hints.html">here</a> for few relevant tips.</p> +</li> +<li><p class="q">I have a control containing a list of integers. How do I select the one whose value is nearest to the one I want?</p></li> +</ul> +<pre class="sourceCode python"><span class="ch">import</span> bisect<br><span class="kw">def</span> closest_int_value(form, ctrl_name, value):<br> values = <span class="dt">map</span>(<span class="dt">int</span>, [item.name <span class="kw">for</span> item in form.find_control(ctrl_name).items])<br><span class="kw">return</span> <span class="dt">str</span>(values[bisect.bisect(values, value) - <span class="dv">1</span>])<br><br>form[<span class="st">"distance"</span>] = [closest_int_value(form, <span class="st">"distance"</span>, <span class="dv">23</span>)]<br></pre> +</div> +<div id="general"> +<h2>General</h2> +<ul> +<li> +<p class="q"><a name="sniffing"></a>I want to see what my web browser is doing, but standard network sniffers like <a href="http://www.wireshark.org/">wireshark</a> or netcat (nc) don’t work for HTTPS. 
How do I sniff HTTPS traffic?</p> +<p>Three good options:</p> +<ul> +<li><p>Mozilla plugin: <a href="http://livehttpheaders.mozdev.org/">LiveHTTPHeaders</a>.</p></li> +<li><p><a href="http://www.blunck.info/iehttpheaders.html">ieHTTPHeaders</a> does the same for MSIE.</p></li> +<li><p>Use <a href="http://lynx.browser.org/"><code>lynx</code></a> <code>-trace</code>, and filter out the junk with a script.</p></li> +</ul> +</li> +<li> +<p class="q"><a name="script"></a>JavaScript is messing up my web-scraping. What do I do?</p> +<p>JavaScript is used in web pages for many purposes — for example: creating content that was not present in the page at load time, submitting or filling in parts of forms in response to user actions, setting cookies, etc. mechanize does not provide any support for JavaScript.</p> +<p>If you come across this in a page you want to automate, you have four options. Here they are, roughly in order of simplicity.</p> +<ul> +<li><p>Figure out what the JavaScript is doing and emulate it in your Python code: for example, by manually adding cookies to your <code>CookieJar</code> instance, calling methods on <code>HTMLForm</code>s, calling <code>urlopen</code>, etc. See <a href="#change-value">above</a> re forms.</p></li> +<li><p>Use Java’s <a href="http://htmlunit.sourceforge.net/">HtmlUnit</a> or <a href="http://httpunit.sourceforge.net">HttpUnit</a> from Jython, since they know some JavaScript.</p></li> +<li><p>Instead of using mechanize, automate a browser instead. For example use MS Internet Explorer via its COM automation interfaces, using the <a href="http://starship.python.net/crew/mhammond/">Python for Windows extensions</a>, aka pywin32, aka win32all (e.g. 
<a href="http://vsbabu.org/mt/archives/2003/06/13/ie_automation.html">simple function</a>, <a href="http://pamie.sourceforge.net/">pamie</a>; <a href="http://www.oreilly.com/catalog/pythonwin32/chapter/ch12.html">pywin32 chapter from the O’Reilly book</a>) or <a href="http://python.net/crew/theller/ctypes/">ctypes</a> (<a href="http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/305273">example</a>). <a href="http://www.brunningonline.net/simon/blog/archives/winGuiAuto.py.html">This</a> kind of thing may also come in useful on Windows for cases where the automation API is lacking. For Firefox, there is <a href="https://developer.mozilla.org/en/PyXPCOM">PyXPCOM</a>.</p></li> +<li><p>Get ambitious and automatically delegate the work to an appropriate interpreter (Mozilla’s JavaScript interpreter, for instance). This is what HtmlUnit and httpunit do. I did a spike along these lines some years ago, but I think it would (still) be quite a lot of work to do well.</p></li> +</ul> +</li> +<li> +<p class="q">Misc links</p> +<ul> +<li><p><a name="parsing"></a>The following libraries can be useful for dealing with bad HTML: <a href="http://codespeak.net/lxml/lxmlhtml.html">lxml.html</a>, <a href="http://code.google.com/p/html5lib/">html5lib</a>, <a href="http://www.crummy.com/software/BeautifulSoup/CHANGELOG.html">BeautifulSoup 3</a>, <a href="http://www.egenix.com/files/python/mxTidy.html">mxTidy</a> and <a href="http://utidylib.berlios.de/">mu-Tidylib</a>.</p></li> +<li><p><a href="http://www.openqa.org/selenium/">Selenium</a>: In-browser web functional testing. If you need to test websites against real browsers, this is a standard way to do it.</p></li> +<li><p>O’Reilly book: <a href="http://oreilly.com/catalog/9780596005771">Spidering Hacks</a>. 
Very Perl-oriented.</p></li> +<li><p>Standard extensions for web development with Firefox, which are also handy if you’re scraping the web: <a href="http://chrispederick.com/work/webdeveloper/">Web Developer</a> (amongst other things, this can display HTML form information), <a href="http://getfirebug.com/">Firebug</a>.</p></li> +<li><p>Similar functionality for IE6 and IE7: <a href="http://www.google.co.uk/search?q=internet+explorer+developer+toolbar&btnI=I'm+Feeling+Lucky">Internet Explorer Developer Toolbar</a> (IE8 comes with something equivalent built-in, as does Google Chrome).</p></li> +<li><p><a href="http://www.opensourcetesting.org/functional.php">Open source functional testing tools</a>.</p></li> +<li><p><a href="http://www.rexx.com/~dkuhlman/quixote_htmlscraping.html">A HOWTO on web scraping</a> from Dave Kuhlman.</p></li> +</ul> +</li> +<li> +<p class="q">Will any of this code make its way into the Python standard library?</p> +<p>The request / response processing extensions to <code>urllib2</code> from mechanize have been merged into <code>urllib2</code> for Python 2.4. The cookie processing has been added, as module <code>cookielib</code>. 
There are other features that would be appropriate additions to <code>urllib2</code>, but since Python 2 is heading into bugfix-only mode, and I’m not using Python 3, they’re unlikely to be added.</p> +</li> +<li> +<p class="q">Where can I find out about the relevant standards?</p> +<ul> +<li><p><a href="http://www.w3.org/TR/html401/">HTML 4.01 Specification</a></p></li> +<li><p><a href="http://dev.w3.org/html5/spec/">Draft HTML 5 Specification</a></p></li> +<li><p><a href="http://www.ietf.org/rfc/rfc1866.txt">RFC 1866</a> - the HTML 2.0 standard (you don’t want to read this)</p></li> +<li><p><a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a> - Form-based file upload</p></li> +<li><p><a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a> - HTTP 1.1 Specification</p></li> +<li><p><a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a> - URIs</p></li> +<li><p><a href="http://www.ietf.org/rfc/rfc3987.txt">RFC 3987</a> - IRIs</p></li> +</ul> +</li> +</ul> +</div> + +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, October 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/forms.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/forms.html new file mode 100644 index 0000000000000000000000000000000000000000..5e12926223303c07fe3c51d374369cfe0ab9e65b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/forms.html @@ -0,0 +1,131 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. 
Lee <jjl@pobox.com>"> +<meta name="date" content="2010-04-22"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Forms</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Forms</h1> + +<ul id="subnav"> +<li> +<a href="support.html"> +Support +</a> +<ul> +<li> +<a href="documentation.html"> +Documentation +</a> +<ul> +<li> +<a href="faq.html"> +FAQ +</a> +</li> +<li> +<a href="doc.html"> +Handlers etc. 
+</a> +</li> +<li> +<span class="thispage"> +Forms +</span> +</li> +<li> +<a href="hints.html"> +Hints +</a> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li><a href="#parsers">Parsers</a></li> +<li><a href="#backwards-compatibility-mode">Backwards-compatibility mode</a></li> +</ul></div> + +<p><span class="docwarning">This documentation is in need of reorganisation!</span></p> +<p>This page is the old ClientForm documentation. ClientForm is now part of mechanize, but the documentation hasn’t been fully updated to reflect that: what’s here is correct, but not well-integrated with the rest of the documentation. This page deals with HTML form handling: parsing HTML forms, filling them in and returning the completed forms to the server. See the <a href="./">front page</a> for how to obtain form objects from a <code>mechanize.Browser</code>.</p> +<p>Simple working example (<code>examples/forms/simple.py</code> in the source distribution):</p> +<pre class="sourceCode python"><span class="ch">import</span> sys<br><br><span class="ch">from</span> mechanize <span class="ch">import</span> ParseResponse, urlopen, urljoin<br><br><span class="kw">if</span> <span class="dt">len</span>(sys.argv) == <span class="dv">1</span>:<br> uri = <span class="st">"http://wwwsearch.sourceforge.net/"</span><br><span class="kw">else</span>:<br> uri = sys.argv[<span class="dv">1</span>]<br><br>response = urlopen(urljoin(uri, <span class="st">"mechanize/example.html"</span>))<br>forms = ParseResponse(response, backwards_compat=<span class="ot">False</span>)<br>form = forms[<span class="dv">0</span>]<br><span class="kw">print</span> form<br>form[<span class="st">"comments"</span>] = <span class="st">"Thanks, Gisle"</span><br><br><span class="co"># form.click() returns a mechanize.Request object</span><br><span class="co"># (see HTMLForm.click.__doc__ if you want to use only the forms support, and</span><br><span 
class="co"># not the rest of mechanize)</span><br><span class="kw">print</span> urlopen(form.click()).read()<br></pre> +<p>A more complicated working example (from <code>examples/forms/example.py</code> in the source distribution):</p> +<pre class="sourceCode python"><span class="ch">import</span> sys<br><br><span class="ch">import</span> mechanize<br><br><span class="kw">if</span> <span class="dt">len</span>(sys.argv) == <span class="dv">1</span>:<br> uri = <span class="st">"http://wwwsearch.sourceforge.net/"</span><br><span class="kw">else</span>:<br> uri = sys.argv[<span class="dv">1</span>]<br><br>request = mechanize.Request(mechanize.urljoin(uri, <span class="st">"mechanize/example.html"</span>))<br>response = mechanize.urlopen(request)<br>forms = mechanize.ParseResponse(response, backwards_compat=<span class="ot">False</span>)<br>response.close()<br><span class="co">## f = open("example.html")</span><br><span class="co">## forms = mechanize.ParseFile(f, "http://example.com/example.html",</span><br><span class="co">## backwards_compat=False)</span><br><span class="co">## f.close()</span><br>form = forms[<span class="dv">0</span>]<br><span class="kw">print</span> form <span class="co"># very useful!</span><br><br><span class="co"># A 'control' is a graphical HTML form widget: a text entry box, a</span><br><span class="co"># dropdown 'select' list, a checkbox, etc.</span><br><br><span class="co"># Indexing allows setting and retrieval of control values</span><br>original_text = form[<span class="st">"comments"</span>] <span class="co"># a string, NOT a Control instance</span><br>form[<span class="st">"comments"</span>] = <span class="st">"Blah."</span><br><br><span class="co"># Controls that represent lists (checkbox, select and radio lists) are</span><br><span class="co"># ListControl instances. 
Their values are sequences of list item names.</span><br><span class="co"># They come in two flavours: single- and multiple-selection:</span><br>form[<span class="st">"favorite_cheese"</span>] = [<span class="st">"brie"</span>] <span class="co"># single</span><br>form[<span class="st">"cheeses"</span>] = [<span class="st">"parmesan"</span>, <span class="st">"leicester"</span>, <span class="st">"cheddar"</span>] <span class="co"># multi</span><br><span class="co"># equivalent, but more flexible:</span><br>form.set_value([<span class="st">"parmesan"</span>, <span class="st">"leicester"</span>, <span class="st">"cheddar"</span>], name=<span class="st">"cheeses"</span>)<br><br><span class="co"># Add files to FILE controls with .add_file(). Only call this multiple</span><br><span class="co"># times if the server is expecting multiple files.</span><br><span class="co"># add a file, default value for MIME type, no filename sent to server</span><br>form.add_file(<span class="dt">open</span>(<span class="st">"data.dat"</span>))<br><span class="co"># add a second file, explicitly giving MIME type, and telling the server</span><br><span class="co"># what the filename is</span><br>form.add_file(<span class="dt">open</span>(<span class="st">"data.txt"</span>), <span class="st">"text/plain"</span>, <span class="st">"data.txt"</span>)<br><br><span class="co"># All Controls may be disabled (equivalent of greyed-out in browser)...</span><br>control = form.find_control(<span class="st">"comments"</span>)<br><span class="kw">print</span> control.disabled<br><span class="co"># ...or readonly</span><br><span class="kw">print</span> control.readonly<br><span class="co"># readonly and disabled attributes can be assigned to</span><br>control.disabled = <span class="ot">False</span><br><span class="co"># convenience method, used here to make all controls writable (unless</span><br><span class="co"># they're disabled):</span><br>form.set_all_readonly(<span 
class="ot">False</span>)<br><br><span class="co"># A couple of notes about list controls and HTML:</span><br><br><span class="co"># 1. List controls correspond to either a single SELECT element, or</span><br><span class="co"># multiple INPUT elements. Items correspond to either OPTION or INPUT</span><br><span class="co"># elements. For example, this is a SELECT control, named "control1":</span><br><br><span class="co"># <select name="control1"></span><br><span class="co"># <option>foo</option></span><br><span class="co"># <option value="1">bar</option></span><br><span class="co"># </select></span><br><br><span class="co"># and this is a CHECKBOX control, named "control2":</span><br><br><span class="co"># <input type="checkbox" name="control2" value="foo" id="cbe1"></span><br><span class="co"># <input type="checkbox" name="control2" value="bar" id="cbe2"></span><br><br><span class="co"># You know the latter is a single control because all the name attributes</span><br><span class="co"># are the same.</span><br><br><span class="co"># 2. Item names are the strings that go to make up the value that should</span><br><span class="co"># be returned to the server. These strings come from various different</span><br><span class="co"># pieces of text in the HTML. 
The HTML standard and the mechanize</span><br><span class="co"># docstrings explain in detail, but playing around with an HTML file,</span><br><span class="co"># ParseFile() and 'print form' is very useful to understand this!</span><br><br><span class="co"># You can get the Control instances from inside the form...</span><br>control = form.find_control(<span class="st">"cheeses"</span>, <span class="dt">type</span>=<span class="st">"select"</span>)<br><span class="kw">print</span> control.name, control.value, control.<span class="dt">type</span><br>control.value = [<span class="st">"mascarpone"</span>, <span class="st">"curd"</span>]<br><span class="co"># ...and the Item instances from inside the Control</span><br>item = control.get(<span class="st">"curd"</span>)<br><span class="kw">print</span> item.name, item.selected, item.<span class="dt">id</span>, item.attrs<br>item.selected = <span class="ot">False</span><br><br><span class="co"># Controls may be referred to by label:</span><br><span class="co"># find control with label that has a *substring* "Cheeses"</span><br><span class="co"># (e.g., a label "Please select a cheese" would match).</span><br>control = form.find_control(label=<span class="st">"select a cheese"</span>)<br><br><span class="co"># You can explicitly say that you're referring to a ListControl:</span><br><span class="co"># set value of "cheeses" ListControl</span><br>form.set_value([<span class="st">"gouda"</span>], name=<span class="st">"cheeses"</span>, kind=<span class="st">"list"</span>)<br><span class="co"># equivalent:</span><br>form.find_control(name=<span class="st">"cheeses"</span>, kind=<span class="st">"list"</span>).value = [<span class="st">"gouda"</span>]<br><span class="co"># the first example is also almost equivalent to the following (but</span><br><span class="co"># insists that the control be a ListControl -- so it will skip any</span><br><span class="co"># non-list controls that come before the control we 
want)</span><br>form[<span class="st">"cheeses"</span>] = [<span class="st">"gouda"</span>]<br><span class="co"># The kind argument can also take values "multilist", "singlelist", "text",</span><br><span class="co"># "clickable" and "file":</span><br><span class="co"># find first control that will accept text, and scribble in it</span><br>form.set_value(<span class="st">"rhubarb rhubarb"</span>, kind=<span class="st">"text"</span>, nr=<span class="dv">0</span>)<br><span class="co"># find, and set the value of, the first single-selection list control</span><br>form.set_value([<span class="st">"spam"</span>], kind=<span class="st">"singlelist"</span>, nr=<span class="dv">0</span>)<br><br><span class="co"># You can find controls with a general predicate function:</span><br><span class="kw">def</span> control_has_caerphilly(control):<br><span class="kw">for</span> item in control.items:<br><span class="kw">if</span> item.name == <span class="st">"caerphilly"</span>: <span class="kw">return</span> <span class="ot">True</span><br>form.find_control(kind=<span class="st">"list"</span>, predicate=control_has_caerphilly)<br><br><span class="co"># HTMLForm.controls is a list of all controls in the form</span><br><span class="kw">for</span> control in form.controls:<br><span class="kw">if</span> control.value == <span class="st">"inquisition"</span>: sys.exit()<br><br><span class="co"># Control.items is a list of all Item instances in the control</span><br><span class="kw">for</span> item in form.find_control(<span class="st">"cheeses"</span>).items:<br><span class="kw">print</span> item.name<br><br><span class="co"># To remove items from a list control, remove it from .items:</span><br>cheeses = form.find_control(<span class="st">"cheeses"</span>)<br>curd = cheeses.get(<span class="st">"curd"</span>)<br><span class="kw">del</span> cheeses.items[cheeses.items.index(curd)]<br><span class="co"># To add items to a list container, instantiate an Item with its 
control</span><br><span class="co"># and attributes:</span><br><span class="co"># Note that you are responsible for getting the attributes correct here,</span><br><span class="co"># and these are not quite identical to the original HTML, due to</span><br><span class="co"># defaulting rules and a few special attributes (e.g. Items that represent</span><br><span class="co"># OPTIONs have a special "contents" key in their .attrs dict). In future</span><br><span class="co"># there will be an explicitly supported way of using the parsing logic to</span><br><span class="co"># add items and controls from HTML strings without knowing these details.</span><br>mechanize.Item(cheeses, {<span class="st">"contents"</span>: <span class="st">"mascarpone"</span>,<br><span class="st">"value"</span>: <span class="st">"mascarpone"</span>})<br><br><span class="co"># You can specify list items by label using set/get_value_by_label() and</span><br><span class="co"># the label argument of the .get() method. Sometimes labels are easier to</span><br><span class="co"># maintain than names, sometimes the other way around.</span><br>form.set_value_by_label([<span class="st">"Mozzarella"</span>, <span class="st">"Caerphilly"</span>], <span class="st">"cheeses"</span>)<br><br><span class="co"># Which items are present, selected, and successful?</span><br><span class="co"># is the "parmesan" item of the "cheeses" control successful (selected</span><br><span class="co"># and not disabled)?</span><br><span class="kw">print</span> <span class="st">"parmesan"</span> in form[<span class="st">"cheeses"</span>]<br><span class="co"># is the "parmesan" item of the "cheeses" control selected?</span><br><span class="kw">print</span> <span class="st">"parmesan"</span> in [<br> item.name <span class="kw">for</span> item in form.find_control(<span class="st">"cheeses"</span>).items <span class="kw">if</span> item.selected]<br><span class="co"># does cheeses control have a "caerphilly" item?</span><br><span 
class="kw">print</span> <span class="st">"caerphilly"</span> in [item.name <span class="kw">for</span> item in form.find_control(<span class="st">"cheeses"</span>).items]<br><br><span class="co"># Sometimes one wants to set or clear individual items in a list, rather</span><br><span class="co"># than setting the whole .value:</span><br><span class="co"># select the item named "gorgonzola" in the first control named "cheeses"</span><br>form.find_control(<span class="st">"cheeses"</span>).get(<span class="st">"gorgonzola"</span>).selected = <span class="ot">True</span><br><span class="co"># You can be more specific:</span><br><span class="co"># deselect "edam" in third CHECKBOX control</span><br>form.find_control(<span class="dt">type</span>=<span class="st">"checkbox"</span>, nr=<span class="dv">2</span>).get(<span class="st">"edam"</span>).selected = <span class="ot">False</span><br><span class="co"># deselect item labelled "Mozzarella" in control with id "chz"</span><br>form.find_control(<span class="dt">id</span>=<span class="st">"chz"</span>).get(label=<span class="st">"Mozzarella"</span>).selected = <span class="ot">False</span><br><br><span class="co"># Often, a single checkbox (a CHECKBOX control with a single item) is</span><br><span class="co"># present. 
In that case, the name of the single item isn't of much</span><br><span class="co"># interest, so it's a good idea to check and uncheck the box without</span><br><span class="co"># using the item name:</span><br>form.find_control(<span class="st">"smelly"</span>).items[<span class="dv">0</span>].selected = <span class="ot">True</span> <span class="co"># check</span><br>form.find_control(<span class="st">"smelly"</span>).items[<span class="dv">0</span>].selected = <span class="ot">False</span> <span class="co"># uncheck</span><br><br><span class="co"># Items may be disabled (selecting or de-selecting a disabled item is</span><br><span class="co"># not allowed):</span><br>control = form.find_control(<span class="st">"cheeses"</span>)<br><span class="kw">print</span> control.get(<span class="st">"emmenthal"</span>).disabled<br>control.get(<span class="st">"emmenthal"</span>).disabled = <span class="ot">True</span><br><span class="co"># enable all items in control</span><br>control.set_all_items_disabled(<span class="ot">False</span>)<br><br>request2 = form.click() <span class="co"># mechanize.Request object</span><br><span class="kw">try</span>:<br> response2 = mechanize.urlopen(request2)<br><span class="kw">except</span> mechanize.HTTPError, response2:<br><span class="kw">pass</span><br><br><span class="kw">print</span> response2.geturl()<br><span class="co"># headers</span><br><span class="kw">for</span> name, value in response2.info().items():<br><span class="kw">if</span> name != <span class="st">"date"</span>:<br><span class="kw">print</span> <span class="st">"</span>%s<span class="st">: </span>%s<span class="st">"</span> % (name.title(), value)<br><span class="kw">print</span> response2.read() <span class="co"># body</span><br>response2.close()<br></pre> +<p>All of the standard control types are supported: <code>TEXT</code>, <code>PASSWORD</code>, <code>HIDDEN</code>, <code>TEXTAREA</code>, <code>ISINDEX</code>, <code>RESET</code>, <code>BUTTON</code> 
(<code>INPUT TYPE=BUTTON</code> and the various <code>BUTTON</code> types), <code>SUBMIT</code>, <code>IMAGE</code>, <code>RADIO</code>, <code>CHECKBOX</code>, <code>SELECT</code>/<code>OPTION</code> and <code>FILE</code> (for file upload). Both standard form encodings (<code>application/x-www-form-urlencoded</code> and <code>multipart/form-data</code>) are supported.</p> +<p>The module is designed for testing and automation of web interfaces, not for implementing interactive user agents.</p> +<p><strong><em>Security note</em>: Remember that any passwords you store in <code>HTMLForm</code> instances will be saved to disk in the clear if, for example, you <a href="http://docs.python.org/library/pickle.html">pickle</a> them.</strong></p> +<div id="parsers"> +<h2>Parsers</h2> +<p>There are two parsers.</p> +<p>TODO: more!</p> +<p>See also the FAQ entries on <a href="faq.html#xhtml">XHTML</a> and <a href="./faq.html#parsing">parsing bad HTML</a>.</p> +</div> +<div id="backwards-compatibility-mode"> +<h2>Backwards-compatibility mode</h2> +<p>mechanize (and ClientForm 0.2) includes three minor backwards-incompatible interface changes from ClientForm version 0.1.</p> +<p>To make upgrading from ClientForm 0.1 easier, and to allow me to stop supporting version ClientForm 0.1 sooner, there is support for operating in a backwards-compatible mode, under which code written for ClientForm 0.1 should work without modification. This is done on a per-<code>HTMLForm</code> basis via the <code>.backwards_compat</code> attribute, but for convenience the <code>ParseResponse()</code> and <code>ParseFile()</code> factory functions accept <code>backwards_compat</code> arguments. These backwards-compatibility features will be removed soon. The default is to operate in backwards-compatible mode. 
To run with backwards compatible mode turned <em><strong>OFF</strong></em> (<strong>strongly recommended</strong>):</p> +<pre class="sourceCode python"><span class="ch">from</span> mechanize <span class="ch">import</span> ParseResponse, urlopen<br>forms = ParseResponse(urlopen(<span class="st">"http://example.com/"</span>), backwards_compat=<span class="ot">False</span>)<br><span class="co"># ...</span><br></pre> +<p>The backwards-incompatible changes are:</p> +<ul> +<li><p>Ambiguous specification of controls or items now results in AmbiguityError. If you want the old behaviour, explicitly pass <code>nr=0</code> to indicate you want the first matching control or item.</p></li> +<li><p>Item label matching is now done by substring, not by strict string-equality (but note leading and trailing space is always stripped). (Control label matching is always done by substring.)</p></li> +<li><p>Handling of disabled list items has changed. First, note that handling of disabled list items in ClientForm 0.1 (and in ClientForm 0.2’s backwards-compatibility mode!) is buggy: disabled items are successful (ie. disabled item names are sent back to the server). As a result, there was no distinction to be made between successful items and selected items. In ClientForm 0.2, the bug is fixed, so this is no longer the case, and it is important to note that list controls’ <code>.value</code> attribute contains only the <em>successful</em> item names; items that are <em>selected</em> but not successful (because disabled) are not included in <code>.value</code>. Second, disabled list items may no longer be deselected: AttributeError is raised in ClientForm 0.2, whereas deselection was allowed in ClientForm 0.1. 
The bug in ClientForm 0.1 and in ClientForm 0.2’s backwards-compatibility mode will not be fixed, to preserve compatibility and to encourage people to upgrade to the new ClientForm 0.2 <code>backwards_compat=False</code> behaviour.</p></li> +</ul> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, April 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/hints.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/hints.html new file mode 100644 index 0000000000000000000000000000000000000000..292e50f8a84b70246e32400d76646fe2395f8178 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/hints.html @@ -0,0 +1,153 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. 
Lee <jjl@pobox.com>"> +<meta name="date" content="2010-04-22"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Hints</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Hints</h1> + +<ul id="subnav"> +<li> +<a href="support.html"> +Support +</a> +<ul> +<li> +<a href="documentation.html"> +Documentation +</a> +<ul> +<li> +<a href="faq.html"> +FAQ +</a> +</li> +<li> +<a href="doc.html"> +Handlers etc. 
+</a> +</li> +<li> +<a href="forms.html"> +Forms +</a> +</li> +<li> +<span class="thispage"> +Hints +</span> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li><a href="#cookies">Cookies</a></li> +<li><a href="#general">General</a></li> +<li><a href="#logging">Logging</a></li> +<li> +<a href="#http-headers">HTTP headers</a><ul><li><a href="#handlers">Handlers</a></li></ul> +</li> +</ul></div> + +<p>Hints for debugging programs that use mechanize.</p> +<div id="cookies"> +<h2>Cookies</h2> +<p>A common mistake is to use <code>mechanize.urlopen()</code>, <em>and</em> the <code>.extract_cookies()</code> and <code>.add_cookie_header()</code> methods on a cookie object themselves. If you use <code>mechanize.urlopen()</code> (or <code>OpenerDirector.open()</code>), the module handles extraction and adding of cookies by itself, so you should not call <code>.extract_cookies()</code> or <code>.add_cookie_header()</code>.</p> +<p>Are you sure the server is sending you any cookies in the first place? Maybe the server is keeping track of state in some other way (<code>HIDDEN</code> HTML form entries (possibly in a separate page referenced by a frame), URL-encoded session keys, IP address, HTTP <code>Referer</code> headers)? Perhaps some embedded script in the HTML is setting cookies (see below)? Turn on <a href="#logging">logging</a>.</p> +<p>When you <code>.save()</code> to or <code>.load()</code>/<code>.revert()</code> from a file, single-session cookies will expire unless you explicitly request otherwise with the <code>ignore_discard</code> argument. 
This may be your problem if you find cookies are going away after saving and loading.</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>cj = mechanize.LWPCookieJar()<br>opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))<br>mechanize.install_opener(opener)<br>r = mechanize.urlopen(<span class="st">"http://foobar.com/"</span>)<br>cj.save(<span class="st">"/some/file"</span>, ignore_discard=<span class="ot">True</span>, ignore_expires=<span class="ot">True</span>)<br></pre> +<p>JavaScript code can set cookies; mechanize does not support this. See <a href="faq.html#script">the FAQ</a>.</p> +</div> +<div id="general"> +<h2>General</h2> +<p>Enable <a href="#logging">logging</a>.</p> +<p>Sometimes, a server wants particular HTTP headers set to the values it expects. For example, the <code>User-Agent</code> header may need to be <a href="./doc.html#headers">set</a> to a value like that of a popular browser.</p> +<p>Check that the browser is able to do manually what you’re trying to achieve programatically. Make sure that what you do manually is <em>exactly</em> the same as what you’re trying to do from Python — you may simply be hitting a server bug that only gets revealed if you view pages in a particular order, for example.</p> +<p>Try comparing the headers and data that your program sends with those that a browser sends. Often this will give you the clue you need. There are <a href="faq.html#sniffing">browser addons</a> available that allow you to see what the browser sends and receives even if HTTPS is in use.</p> +<p>If nothing is obviously wrong with the requests your program is sending and you’re out of ideas, you can reliably locate the problem by copying the headers that a browser sends, and then changing headers until your program stops working again. Temporarily switch to explicitly sending individual HTTP headers (by calling <code>.add_header()</code>, or by using <code>httplib</code> directly). 
Start by sending exactly the headers that Firefox or IE send. You may need to make sure that a valid session ID is sent — the one you got from your browser may no longer be valid. If that works, you can begin the tedious process of changing your headers and data until they match what your original code was sending. You should end up with a minimal set of changes. If you think that reveals a bug in mechanize, please <a href="support.html">report it</a>.</p> +</div> +<div id="logging"> +<h2>Logging</h2> +<p>To enable logging to stdout:</p> +<pre class="sourceCode python"><span class="ch">import</span> sys, logging<br>logger = logging.getLogger(<span class="st">"mechanize"</span>)<br>logger.addHandler(logging.StreamHandler(sys.stdout))<br>logger.setLevel(logging.DEBUG)<br></pre> +<p>You can reduce the amount of information shown by setting the level to <code>logging.INFO</code> instead of <code>logging.DEBUG</code>, or by only enabling logging for one of the following logger names instead of <code>"mechanize"</code>:</p> +<ul> +<li><p><code>"mechanize"</code>: Everything.</p></li> +<li><p><code>"mechanize.cookies"</code>: Why particular cookies are accepted or rejected and why they are or are not returned. 
Requires logging enabled at the <code>DEBUG</code> level.</p></li> +<li><p><code>"mechanize.http_responses"</code>: HTTP response body data.</p></li> +<li><p><code>"mechanize.http_redirects"</code>: HTTP redirect information.</p></li> +</ul> +</div> +<div id="http-headers"> +<h2>HTTP headers</h2> +<p>An example showing how to enable printing of HTTP headers to stdout, logging of HTTP response bodies, and logging of information about redirections:</p> +<pre class="sourceCode python"><span class="ch">import</span> sys, logging<br><span class="ch">import</span> mechanize<br><br>logger = logging.getLogger(<span class="st">"mechanize"</span>)<br>logger.addHandler(logging.StreamHandler(sys.stdout))<br>logger.setLevel(logging.DEBUG)<br><br>browser = mechanize.Browser()<br>browser.set_debug_http(<span class="ot">True</span>)<br>browser.set_debug_responses(<span class="ot">True</span>)<br>browser.set_debug_redirects(<span class="ot">True</span>)<br>response = browser.<span class="dt">open</span>(<span class="st">"http://python.org/"</span>)<br></pre> +<p>Alternatively, you can examine request and response objects to see what’s going on. Note that requests may involve “sub-requests” in cases such as redirection, in which case you will not see everything that’s going on just by examining the original request and final response. 
It’s often useful to <a href="./doc.html#seekable-responses">use the <code>.get_data()</code> method</a> on responses during debugging.</p> +<div id="handlers"> +<h3>Handlers</h3> +<p><strong>This section is not relevant if you use <code>mechanize.Browser</code>.</strong></p> +<p>An example showing how to enable printing of HTTP headers to stdout, at the <code>HTTPHandler</code> level:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>hh = mechanize.HTTPHandler() <span class="co"># you might want HTTPSHandler, too</span><br>hh.set_http_debuglevel(<span class="dv">1</span>)<br>opener = mechanize.build_opener(hh)<br>response = opener.<span class="dt">open</span>(url)<br></pre> +<p>The following handlers are available:</p> +<p><strong>NOTE</strong>: as well as having these handlers in your <code>OpenerDirector</code> (for example, by passing them to <code>build_opener()</code>) you have to <a href="#logging">turn on logging</a> at the <code>INFO</code> level or lower in order to see any output.</p> +<p><code>HTTPRedirectDebugProcessor</code>: logs information about redirections</p> +<p><code>HTTPResponseDebugProcessor</code>: logs HTTP response bodies (including those that are read during redirections)</p> +</div> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, April 2010. 
+ +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/index.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/index.html new file mode 100644 index 0000000000000000000000000000000000000000..6c500597a3f002a7c102758795afdb0457b5befb --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/index.html @@ -0,0 +1,117 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. Lee <jjl@pobox.com>"> +<meta name="date" content="2010-04-22"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. 
Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<span class="thispage"> +Home +</span> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<a href="support.html"> +Support +</a> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize</h1> + + + +<div id="TOC"><ul> +<li><a href="#examples">Examples</a></li> +<li><a href="#credits">Credits</a></li> +<li><a href="#see-also">See also</a></li> +</ul></div> + +<p>Stateful programmatic web browsing in Python, after Andy Lester’s Perl module <a href="http://search.cpan.org/dist/WWW-Mechanize/"><code>WWW::Mechanize</code></a>.</p> +<ul> +<li> +<p><code>mechanize.Browser</code> and <code>mechanize.UserAgentBase</code> implement the interface of <code>urllib2.OpenerDirector</code>, so:</p> +<ul> +<li><p>any URL can be opened, not just <code>http:</code></p></li> +<li><p><code>mechanize.UserAgentBase</code> offers easy dynamic configuration of user-agent features like protocol, cookie, redirection and <code>robots.txt</code> handling, without having to make a new <code>OpenerDirector</code> each time, e.g. by calling <code>build_opener()</code>.</p></li> +</ul> +</li> +<li><p>Easy HTML form filling.</p></li> +<li><p>Convenient link parsing and following.</p></li> +<li><p>Browser history (<code>.back()</code> and <code>.reload()</code> methods).</p></li> +<li><p>The <code>Referer</code> HTTP header is added properly (optional).</p></li> +<li><p>Automatic observance of <a href="http://www.robotstxt.org/wc/norobots.html"><code>robots.txt</code></a>.</p></li> +<li><p>Automatic handling of HTTP-Equiv and Refresh.</p></li> +</ul> +<div id="examples"> +<h2>Examples</h2> +<p>The examples below are written for a website that does not exist (<code>example.com</code>), so cannot be run. 
There are also some <a href="documentation.html#examples">working examples</a> that you can run.</p> +<pre class="sourceCode python"><span class="ch">import</span> re<br><span class="ch">import</span> mechanize<br><br>br = mechanize.Browser()<br>br.<span class="dt">open</span>(<span class="st">"http://www.example.com/"</span>)<br><span class="co"># follow second link with element text matching regular expression</span><br>response1 = br.follow_link(text_regex=<span class="st">r"cheese\s*shop"</span>, nr=<span class="dv">1</span>)<br>assert br.viewing_html()<br><span class="kw">print</span> br.title()<br><span class="kw">print</span> response1.geturl()<br><span class="kw">print</span> response1.info() <span class="co"># headers</span><br><span class="kw">print</span> response1.read() <span class="co"># body</span><br><br>br.select_form(name=<span class="st">"order"</span>)<br><span class="co"># Browser passes through unknown attributes (including methods)</span><br><span class="co"># to the selected HTMLForm.</span><br>br[<span class="st">"cheeses"</span>] = [<span class="st">"mozzarella"</span>, <span class="st">"caerphilly"</span>] <span class="co"># (the method here is __setitem__)</span><br><span class="co"># Submit current form. 
Browser calls .close() on the current response on</span><br><span class="co"># navigation, so this closes response1</span><br>response2 = br.submit()<br><br><span class="co"># print currently selected form (don't call .submit() on this, use br.submit())</span><br><span class="kw">print</span> br.form<br><br>response3 = br.back() <span class="co"># back to cheese shop (same data as response1)</span><br><span class="co"># the history mechanism returns cached response objects</span><br><span class="co"># we can still use the response, even though it was .close()d</span><br>response3.get_data() <span class="co"># like .seek(0) followed by .read()</span><br>response4 = br.<span class="dt">reload</span>() <span class="co"># fetches from server</span><br><br><span class="kw">for</span> form in br.forms():<br><span class="kw">print</span> form<br><span class="co"># .links() optionally accepts the keyword args of .follow_/.find_link()</span><br><span class="kw">for</span> link in br.links(url_regex=<span class="st">"python.org"</span>):<br><span class="kw">print</span> link<br> br.follow_link(link) <span class="co"># takes EITHER Link instance OR keyword args</span><br> br.back()<br></pre> +<p>You may control the browser’s policy by using the methods of <code>mechanize.Browser</code>’s base class, <code>mechanize.UserAgent</code>. For example:</p> +<pre class="sourceCode python">br = mechanize.Browser()<br><span class="co"># Explicitly configure proxies (Browser will attempt to set good defaults).</span><br><span class="co"># Note the userinfo ("joe:password@") and port number (":3128") are optional.</span><br>br.set_proxies({<span class="st">"http"</span>: <span class="st">"joe:password@myproxy.example.com:3128"</span>,<br><span class="st">"ftp"</span>: <span class="st">"proxy.example.com"</span>,<br> })<br><span class="co"># Add HTTP Basic/Digest auth username and password for HTTP proxy access.</span><br><span class="co"># (equivalent to using "joe:password@..." 
form above)</span><br>br.add_proxy_password(<span class="st">"joe"</span>, <span class="st">"password"</span>)<br><span class="co"># Add HTTP Basic/Digest auth username and password for website access.</span><br>br.add_password(<span class="st">"http://example.com/protected/"</span>, <span class="st">"joe"</span>, <span class="st">"password"</span>)<br><span class="co"># Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML).</span><br>br.set_handle_equiv(<span class="ot">False</span>)<br><span class="co"># Ignore robots.txt. Do not do this without thought and consideration.</span><br>br.set_handle_robots(<span class="ot">False</span>)<br><span class="co"># Don't add Referer (sic) header</span><br>br.set_handle_referer(<span class="ot">False</span>)<br><span class="co"># Don't handle Refresh redirections</span><br>br.set_handle_refresh(<span class="ot">False</span>)<br><span class="co"># Don't handle cookies</span><br>br.set_cookiejar()<br><span class="co"># Supply your own mechanize.CookieJar (NOTE: cookie handling is ON by</span><br><span class="co"># default: no need to do this unless you have some reason to use a</span><br><span class="co"># particular cookiejar)</span><br>br.set_cookiejar(cj)<br><span class="co"># Log information about HTTP redirects and Refreshes.</span><br>br.set_debug_redirects(<span class="ot">True</span>)<br><span class="co"># Log HTTP response bodies (ie. 
the HTML, most of the time).</span><br>br.set_debug_responses(<span class="ot">True</span>)<br><span class="co"># Print HTTP headers.</span><br>br.set_debug_http(<span class="ot">True</span>)<br><br><span class="co"># To make sure you're seeing all debug output:</span><br>logger = logging.getLogger(<span class="st">"mechanize"</span>)<br>logger.addHandler(logging.StreamHandler(sys.stdout))<br>logger.setLevel(logging.INFO)<br><br><span class="co"># Sometimes it's useful to process bad headers or bad HTML:</span><br>response = br.response() <span class="co"># this is a copy of response</span><br>headers = response.info() <span class="co"># currently, this is a mimetools.Message</span><br>headers[<span class="st">"Content-type"</span>] = <span class="st">"text/html; charset=utf-8"</span><br>response.set_data(response.get_data().replace(<span class="st">"<!---"</span>, <span class="st">"<!--"</span>))<br>br.set_response(response)<br></pre> +<p>mechanize exports the complete interface of <code>urllib2</code>:</p> +<pre class="sourceCode python"><span class="ch">import</span> mechanize<br>response = mechanize.urlopen(<span class="st">"http://www.example.com/"</span>)<br><span class="kw">print</span> response.read()<br></pre> +<p>When using mechanize, anything you would normally import from <code>urllib2</code> should be imported from mechanize instead.</p> +</div> +<div id="credits"> +<h2>Credits</h2> +<p>Much of the code was originally derived from the work of the following people:</p> +<ul> +<li><p>Gisle Aas — <a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a></p></li> +<li><p>Jeremy Hylton (and many others) — <a href="http://docs.python.org/release/2.6/library/urllib2.html">urllib2</a></p></li> +<li><p>Andy Lester — <a href="http://search.cpan.org/dist/WWW-Mechanize/">WWW::Mechanize</a></p></li> +<li><p>Johnny Lee (coincidentally-named) — MSIE CookieJar Perl code from which mechanize’s support for that is derived.</p></li> +</ul> +<p>Also:</p> +<ul> 
+<li><p>Gary Poster and Benji York at Zope Corporation — contributed significant changes to the HTML forms code</p></li> +<li><p>Ronald Tschalar — provided help with Netscape cookies</p></li> +</ul> +<p>Thanks also to the many people who have contributed <a href="support.html">bug reports and patches</a>.</p> +</div> +<div id="see-also"> +<h2>See also</h2> +<p>There are several wrappers around mechanize designed for functional testing of web applications:</p> +<ul> +<li><p><a href="http://cheeseshop.python.org/pypi?:action=display&name=zope.testbrowser"><code>zope.testbrowser</code></a></p></li> +<li><p><a href="http://twill.idyll.org/">twill</a></p></li> +</ul> +<p>See <a href="faq.html">the FAQ</a> page for other links to related software.</p> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, April 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/html/support.html b/LTA/LTAIngest/mechanize-0.2.5/docs/html/support.html new file mode 100644 index 0000000000000000000000000000000000000000..20f6447f1da09c5b9ccedc2bff86fa84451c0cb1 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/html/support.html @@ -0,0 +1,117 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<!--This file was generated by pandoc: do not edit--><head> + +<meta name="author" content="John J. 
Lee <jjl@pobox.com>"> +<meta name="date" content="2010-04-22"> +<meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> +<meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> +<style type="text/css" media="screen">@import "../styles/style.css";</style> +<!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--><!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style> +<!--<![endif]--><!--max-width--><!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--><title>mechanize — Support</title> +</head> +<body> +<div id="sf"> +<a href="http://sourceforge.net/"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=48205&type=10" width="80" height="15" alt="SourceForge.net. Fast, secure and Free Open Source software downloads"></a> +</div> +<div id="content"> + +<ul id="nav"> +<li> +<a href="./"> +Home +</a> +</li> +<li> +<a href="download.html"> +Download +</a> +</li> +<li> +<span class="thispage"> +Support +</span> +</li> +<li> +<a href="development.html"> +Development +</a> +</li> +</ul> +<div id="main"> + +<h1 class="title">mechanize — Support</h1> + +<ul id="subnav"> +<li> +<span class="thispage"> +Support +</span> +<ul> +<li> +<a href="documentation.html"> +Documentation +</a> +<ul> +<li> +<a href="faq.html"> +FAQ +</a> +</li> +<li> +<a href="doc.html"> +Handlers etc. 
+</a> +</li> +<li> +<a href="forms.html"> +Forms +</a> +</li> +<li> +<a href="hints.html"> +Hints +</a> +</li> +</ul> +</li> +<li> +<a href="ChangeLog.txt"> +Changelog +</a> +</li> +</ul> +</li> +</ul> +<div id="TOC"><ul> +<li><a href="#documentation">Documentation</a></li> +<li><a href="#bug-tracker">Bug tracker</a></li> +<li><a href="#contact">Contact</a></li> +</ul></div> + +<div id="documentation"> +<h2>Documentation</h2> +<p>See links at right. <a href="documentation.html">Start here</a>.</p> +</div> +<div id="bug-tracker"> +<h2>Bug tracker</h2> +<p>The bug tracker is <a href="http://github.com/jjlee/mechanize/issues">here on github</a>. It’s equally acceptable to file bugs on the tracker or post about them to the mailing list.</p> +</div> +<div id="contact"> +<h2>Contact</h2> +<p>There is a <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing list</a>.</p> +</div> + +<p>I prefer questions and comments to be sent to +the <a href="http://lists.sourceforge.net/lists/listinfo/wwwsearch-general">mailing +list</a> rather than direct to me.</p> + +<p><a href="mailto:jjl@pobox.com">John J. Lee</a>, April 2010. + +</p> +<hr> +</div> +</div> +</body> +</html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/index.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/index.txt new file mode 100644 index 0000000000000000000000000000000000000000..a258d13bb257e5c65d01d2f8d52c995f152f805e --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/index.txt @@ -0,0 +1,180 @@ +% mechanize + +Stateful programmatic web browsing in Python, after Andy Lester's Perl +module [`WWW::Mechanize`](http://search.cpan.org/dist/WWW-Mechanize/). 
+ + * `mechanize.Browser` and `mechanize.UserAgentBase` implement the + interface of `urllib2.OpenerDirector`, so: + + * any URL can be opened, not just `http:` + + * `mechanize.UserAgentBase` offers easy dynamic configuration of + user-agent features like protocol, cookie, redirection and + `robots.txt` handling, without having to make a new + `OpenerDirector` each time, e.g. by calling `build_opener()`. + + * Easy HTML form filling. + + * Convenient link parsing and following. + + * Browser history (`.back()` and `.reload()` methods). + + * The `Referer` HTTP header is added properly (optional). + + * Automatic observance of + [`robots.txt`](http://www.robotstxt.org/wc/norobots.html). + + * Automatic handling of HTTP-Equiv and Refresh. + + +Examples +-------- + +The examples below are written for a website that does not exist +(`example.com`), so cannot be run. There are also some [working +examples](documentation.html#examples) that you can run. + +~~~~{.python} +import re +import mechanize + +br = mechanize.Browser() +br.open("http://www.example.com/") +# follow second link with element text matching regular expression +response1 = br.follow_link(text_regex=r"cheese\s*shop", nr=1) +assert br.viewing_html() +print br.title() +print response1.geturl() +print response1.info() # headers +print response1.read() # body + +br.select_form(name="order") +# Browser passes through unknown attributes (including methods) +# to the selected HTMLForm. +br["cheeses"] = ["mozzarella", "caerphilly"] # (the method here is __setitem__) +# Submit current form. 
Browser calls .close() on the current response on +# navigation, so this closes response1 +response2 = br.submit() + +# print currently selected form (don't call .submit() on this, use br.submit()) +print br.form + +response3 = br.back() # back to cheese shop (same data as response1) +# the history mechanism returns cached response objects +# we can still use the response, even though it was .close()d +response3.get_data() # like .seek(0) followed by .read() +response4 = br.reload() # fetches from server + +for form in br.forms(): + print form +# .links() optionally accepts the keyword args of .follow_/.find_link() +for link in br.links(url_regex="python.org"): + print link + br.follow_link(link) # takes EITHER Link instance OR keyword args + br.back() +~~~~ + +You may control the browser's policy by using the methods of +`mechanize.Browser`'s base class, `mechanize.UserAgent`. For example: + +~~~~{.python} +br = mechanize.Browser() +# Explicitly configure proxies (Browser will attempt to set good defaults). +# Note the userinfo ("joe:password@") and port number (":3128") are optional. +br.set_proxies({"http": "joe:password@myproxy.example.com:3128", + "ftp": "proxy.example.com", + }) +# Add HTTP Basic/Digest auth username and password for HTTP proxy access. +# (equivalent to using "joe:password@..." form above) +br.add_proxy_password("joe", "password") +# Add HTTP Basic/Digest auth username and password for website access. +br.add_password("http://example.com/protected/", "joe", "password") +# Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML). +br.set_handle_equiv(False) +# Ignore robots.txt. Do not do this without thought and consideration. 
+br.set_handle_robots(False) +# Don't add Referer (sic) header +br.set_handle_referer(False) +# Don't handle Refresh redirections +br.set_handle_refresh(False) +# Don't handle cookies +br.set_cookiejar() +# Supply your own mechanize.CookieJar (NOTE: cookie handling is ON by +# default: no need to do this unless you have some reason to use a +# particular cookiejar) +br.set_cookiejar(cj) +# Log information about HTTP redirects and Refreshes. +br.set_debug_redirects(True) +# Log HTTP response bodies (ie. the HTML, most of the time). +br.set_debug_responses(True) +# Print HTTP headers. +br.set_debug_http(True) + +# To make sure you're seeing all debug output: +logger = logging.getLogger("mechanize") +logger.addHandler(logging.StreamHandler(sys.stdout)) +logger.setLevel(logging.INFO) + +# Sometimes it's useful to process bad headers or bad HTML: +response = br.response() # this is a copy of response +headers = response.info() # currently, this is a mimetools.Message +headers["Content-type"] = "text/html; charset=utf-8" +response.set_data(response.get_data().replace("<!---", "<!--")) +br.set_response(response) +~~~~ + +mechanize exports the complete interface of `urllib2`: + +~~~~{.python} +import mechanize +response = mechanize.urlopen("http://www.example.com/") +print response.read() +~~~~ + +When using mechanize, anything you would normally import from `urllib2` should +be imported from mechanize instead. + + +Credits +------- + +Much of the code was originally derived from the work of the following people: + + * Gisle Aas -- [libwww-perl](http://search.cpan.org/dist/libwww-perl/) + + * Jeremy Hylton (and many others) -- +[urllib2](http://docs.python.org/release/2.6/library/urllib2.html) + + * Andy Lester -- [WWW::Mechanize](http://search.cpan.org/dist/WWW-Mechanize/) + + * Johnny Lee (coincidentally-named) -- MSIE CookieJar Perl code from which +mechanize's support for that is derived. 
+ +Also: + + * Gary Poster and Benji York at Zope Corporation -- contributed significant +changes to the HTML forms code + + * Ronald Tschalar -- provided help with Netscape cookies + +Thanks also to the many people who have contributed [bug reports and +patches](support.html). + + +See also +-------- + +There are several wrappers around mechanize designed for functional testing of +web applications: + + * [`zope.testbrowser`](http://cheeseshop.python.org/pypi?:action=display&name=zope.testbrowser) + + * [twill](http://twill.idyll.org/) + +See [the FAQ](faq.html) page for other links to related +software. + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/styles/ie6.js b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/ie6.js new file mode 100644 index 0000000000000000000000000000000000000000..c6ad705c06f5c3dfff9fe52f0587e9502493222c --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/ie6.js @@ -0,0 +1,11 @@ +function add_style_element(relative_ref) { + var head = document.getElementsByTagName("head")[0]; + var css = document.createElement("link"); + css.type = "text/css"; + css.rel = "stylesheet"; + css.href = relative_ref; + css.media = "screen"; + head.appendChild(css); +} +/* enable max-width workaround */ +add_style_element("/styles/maxwidth.css"); diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/styles/maxwidth.css b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/maxwidth.css new file mode 100644 index 0000000000000000000000000000000000000000..bb736421d99bc728871e537f914365b6355b23ff --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/maxwidth.css @@ -0,0 +1,9 @@ +/* min-/max-width work-alike */ +#content { + padding:10px; + width: expression(document.documentElement.clientWidth < 398 ? "400px" : document.documentElement.clientWidth > 752 ? 
"750px" : "auto");/*novalidate*/ + margin: 0 50px; + padding-left: 40px; + background-color:#FFF; +/* background: #fff url('/images/gridbg.gif');*/ +} diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/styles/style.css b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/style.css new file mode 100644 index 0000000000000000000000000000000000000000..54eb12f39160e963991149f8af86a7c2a27dc997 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/styles/style.css @@ -0,0 +1,195 @@ +body { + /* for IE6; text size for non-IE browsers is in .html files */ + font-size:87.5%; +} +body,div,dl,dt,dd,ul,ol,li,h1,h2,h3,h4,h5,h6,pre,form,fieldset,p,blockquote,th,td { + margin:0; + padding:0; +} +#sf a img { + float:right; + border:none; +} +#content { + padding-top:18px; + padding-bottom:18px; + padding-left:10px; + padding-right:10px; + max-width:750px; + min-width:400px; + margin:0 50px; + padding-left:40px; + /* there's a half-abandoned attempt to stick to baseline grid here: +seems too easy for that to get messed up by font-weight variations +&c.; due to choice of 14/21 px grid? 
*/ + /* background:#fff url('../images/gridbg.gif'); */ +} +#main { + clear:both; +} +#nav { + float:right; + display:inline; + padding-top:1.429em; + padding-right:1.5em; + list-style-type:none; + padding-bottom:0px; + border-bottom:1px solid black; +} +#nav li { + float:left; + /* display:inline; */ + _height:0;/*novalidate*/ + margin-left:0.6em; + padding-bottom:0; + padding-top:0; +} +#nav a, #nav .thispage { + padding:0 11px 0 11px; + /* display:inline; */ + font:bold 1.143em verdana, arial, helvetica, sans-serif; + line-height:1.3125em; + margin:1.3125em 0; + overflow:auto; +} +#subnav li, #TOC li { + list-style-type:none; + margin-left:10px; +} +ol { + font-weight:bold; + font-style:italic; + font-family:verdana, arial, helvetica, sans-serif; +} +ol p { + font-weight:normal; + font-style:normal; + font-family:times, serif; +} +li p { + margin:0; +} +p.q { + font-weight:bold; + font-style:italic; + font-family:verdana, arial, helvetica, sans-serif; + margin-top:1.5em; +} +#subnav a, #subnav .thispage, +#TOC a, #TOC .thispage { + font-family:lucida, sans-serif; + font-weight:600; +} +#subnav, #TOC { + float:right; + clear:right; + width:162px; + padding:10px 20px; + background-color:#eee; + margin-left:3em; + margin-bottom:3em; + border-left:thick solid #2d4706; +} +#TOC { + border-left:none; +} +h1 { + padding-left:10px; + padding-top:20px; + font:2em verdana, arial, helvetica, sans-serif; + line-height:1.5em; + margin:1.5em 0; + color:#c4c496; +} +h2 { + font:bold 1.143em verdana, arial, helvetica, sans-serif; + line-height:1.313em; + margin:1.313em 0; +} +h3 { + font:bold 1em verdana, arial, helvetica, sans-serif; + line-height:1.5em; + margin:1.5em 0; + font-style:italic; +} +p { + font:normal 1em times, serif; + line-height:1.5em; + margin:1.5em 0; +} +pre { + font-family:"Courier New", Courier, monospace; + line-height:1.5em; + margin-top:1.5em; + margin-bottom:1.5em; + margin-left:10px; +} +code { + font-family:"Courier New", Courier, monospace; + 
line-height:1.3em; /* Avoid breaking baseline grid in firefox :-( */ + margin:1.5em 0; +} +ul { + border-bottom:1.5em; +} +li { + margin-left:2em; +} +dt { + line-height:1.5em; + margin:1.5em 0; +} +dd { + line-height:1.5em; + margin-top:1.5em; + margin-bottom:1.5em; + margin-left:2em; +} +.expanded li { + margin-left:2em; + margin-top:1.5em; + margin-bottom:1.5em; +} +.expanded li li { + margin-left:2em; + margin-top:0; + margin-bottom:0; +} +a { + color:#2d4706; +} +a, .thispage { + text-decoration:none; + font-weight:bold; +} +a:link { + color:#2d4706; +} +a:visited { + color:#41680d; +} +a:hover { + color:#000; +} +.warning { + background-color:#ffffaa; +} +.docwarning { + background-color:#f3ecd2; +} + +pre.sourceCode { } +pre.sourceCode span.Normal { } +pre.sourceCode span.Keyword { color: #007020; font-weight: bold; } +pre.sourceCode span.DataType { color: #902000; } +pre.sourceCode span.DecVal { color: #40a070; } +pre.sourceCode span.BaseN { color: #40a070; } +pre.sourceCode span.Float { color: #40a070; } +pre.sourceCode span.Char { color: #4070a0; } +pre.sourceCode span.String { color: #4070a0; } +pre.sourceCode span.Comment { color: #60a0b0; font-style: italic; } +pre.sourceCode span.Others { color: #007020; } +pre.sourceCode span.Alert { color: red; font-weight: bold; } +pre.sourceCode span.Function { color: #06287e; } +pre.sourceCode span.RegionMarker { } +pre.sourceCode span.Error { color: red; font-weight: bold; } diff --git a/LTA/LTAIngest/mechanize-0.2.5/docs/support.txt b/LTA/LTAIngest/mechanize-0.2.5/docs/support.txt new file mode 100644 index 0000000000000000000000000000000000000000..41670de393bec4fecea1ed3ec5a75318a19dff6d --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/docs/support.txt @@ -0,0 +1,26 @@ +% mechanize -- Support + +Documentation +------------- + +See links at right. [Start here](documentation.html). + + +Bug tracker +----------- + +The bug tracker is [here on github](http://github.com/jjlee/mechanize/issues). 
+It's equally acceptable to file bugs on the tracker or post about them to the +mailing list. + + +Contact +------- + +There is a [mailing +list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). + + +<!-- Local Variables: --> +<!-- fill-column:79 --> +<!-- End: --> diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.dat b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.dat new file mode 100644 index 0000000000000000000000000000000000000000..d9ca6b0eb920d4e96fc6112208e04f18de76c347 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.dat @@ -0,0 +1 @@ +Let's pretend this is a binary file. diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.txt b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfc5b73207b8a3204e917f8d5c8d1aaec9fd341b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/data.txt @@ -0,0 +1,3 @@ +Text, text, text. + +Blah. diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/echo.cgi b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/echo.cgi new file mode 100755 index 0000000000000000000000000000000000000000..2cbfb3f325614b97e9934ffccdab4dbe00b25c01 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/echo.cgi @@ -0,0 +1,23 @@ +#!/usr/bin/python +# -*-python-*- + +print "Content-Type: text/html\n" +import sys, os, string, cgi + +from types import ListType + +print "<html><head><title>Form submission parameters</title></head>" +form = cgi.FieldStorage() +print "<p>Received parameters:</p>" +print "<pre>" +for k in form.keys(): + v = form[k] + if isinstance(v, ListType): + vs = [] + for item in v: + vs.append(item.value) + text = string.join(vs, ", ") + else: + text = v.value + print "%s: %s" % (cgi.escape(k), cgi.escape(text)) +print "</pre></html>" diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.html b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.html new file mode 100644 
index 0000000000000000000000000000000000000000..c1878f3e6f6d8bb9942576695acee1acf9f293a5 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.html @@ -0,0 +1,54 @@ +<html> +<head> +<title>Example</title> +</head> +<body> + +<!--Note that echo.cgi doesn't show file upload data--> +<form action="/cgi-bin/echo.cgi" + method="POST" enctype="multipart/form-data"> + <input type="textarea" name="blah"></input> + + <input type="text" name="comments"></input> + + <input type="checkbox" name="eggs" value="spam"></input> + +<label for="chz">Please select a cheese</label> + <select multiple name="cheeses" id="chz"> + <option value="mozz">Mozzarella</option> + <option value="caerphilly">Caerphilly</option> + <option>gouda</option> + <option>gorgonzola</option> + <option>parmesan</option> + <option>leicester</option> + <option>cheddar</option> + <option>mascarpone</option> + <option>curd</option> + <option>limburger</option> + <option>emmenthal</option> + </select> + + <input type="checkbox" name="apples" value="pears"></input> + + <input type="checkbox" name="whocares" value="edam"></input> + <input type="checkbox" name="whocares" value="gouda"></input> + + <input type="radio" name="spam" value="spam"></input> + <input type="radio" name="spam" value="rhubarb"></input> + + <input type="radio" name="smelly"></input> + +<label for="fchz" value="What's your favourite cheese?" 
/> + <select single name="favorite_cheese" id="fchz"> + <option>cheddar</option> + <option>brie</option> + <option>leicester</option> + <option>jahlsberg</option> + </select> + + <input type="file"></input> +</form> + + +</body> +</html> diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.py b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.py new file mode 100755 index 0000000000000000000000000000000000000000..6c2f82e57e9692e245e7bb060080bedf36b01632 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/example.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python + +import sys + +import mechanize + +if len(sys.argv) == 1: + uri = "http://wwwsearch.sourceforge.net/" +else: + uri = sys.argv[1] + +request = mechanize.Request(mechanize.urljoin(uri, "mechanize/example.html")) +response = mechanize.urlopen(request) +forms = mechanize.ParseResponse(response, backwards_compat=False) +response.close() +## f = open("example.html") +## forms = mechanize.ParseFile(f, "http://example.com/example.html", +## backwards_compat=False) +## f.close() +form = forms[0] +print form # very useful! + +# A 'control' is a graphical HTML form widget: a text entry box, a +# dropdown 'select' list, a checkbox, etc. + +# Indexing allows setting and retrieval of control values +original_text = form["comments"] # a string, NOT a Control instance +form["comments"] = "Blah." + +# Controls that represent lists (checkbox, select and radio lists) are +# ListControl instances. Their values are sequences of list item names. +# They come in two flavours: single- and multiple-selection: +form["favorite_cheese"] = ["brie"] # single +form["cheeses"] = ["parmesan", "leicester", "cheddar"] # multi +# equivalent, but more flexible: +form.set_value(["parmesan", "leicester", "cheddar"], name="cheeses") + +# Add files to FILE controls with .add_file(). Only call this multiple +# times if the server is expecting multiple files. 
+# add a file, default value for MIME type, no filename sent to server +form.add_file(open("data.dat")) +# add a second file, explicitly giving MIME type, and telling the server +# what the filename is +form.add_file(open("data.txt"), "text/plain", "data.txt") + +# All Controls may be disabled (equivalent of greyed-out in browser)... +control = form.find_control("comments") +print control.disabled +# ...or readonly +print control.readonly +# readonly and disabled attributes can be assigned to +control.disabled = False +# convenience method, used here to make all controls writable (unless +# they're disabled): +form.set_all_readonly(False) + +# A couple of notes about list controls and HTML: + +# 1. List controls correspond to either a single SELECT element, or +# multiple INPUT elements. Items correspond to either OPTION or INPUT +# elements. For example, this is a SELECT control, named "control1": + +# <select name="control1"> +# <option>foo</option> +# <option value="1">bar</option> +# </select> + +# and this is a CHECKBOX control, named "control2": + +# <input type="checkbox" name="control2" value="foo" id="cbe1"> +# <input type="checkbox" name="control2" value="bar" id="cbe2"> + +# You know the latter is a single control because all the name attributes +# are the same. + +# 2. Item names are the strings that go to make up the value that should +# be returned to the server. These strings come from various different +# pieces of text in the HTML. The HTML standard and the mechanize +# docstrings explain in detail, but playing around with an HTML file, +# ParseFile() and 'print form' is very useful to understand this! + +# You can get the Control instances from inside the form... 
+control = form.find_control("cheeses", type="select") +print control.name, control.value, control.type +control.value = ["mascarpone", "curd"] +# ...and the Item instances from inside the Control +item = control.get("curd") +print item.name, item.selected, item.id, item.attrs +item.selected = False + +# Controls may be referred to by label: +# find control with label that has a *substring* "Cheeses" +# (e.g., a label "Please select a cheese" would match). +control = form.find_control(label="select a cheese") + +# You can explicitly say that you're referring to a ListControl: +# set value of "cheeses" ListControl +form.set_value(["gouda"], name="cheeses", kind="list") +# equivalent: +form.find_control(name="cheeses", kind="list").value = ["gouda"] +# the first example is also almost equivalent to the following (but +# insists that the control be a ListControl -- so it will skip any +# non-list controls that come before the control we want) +form["cheeses"] = ["gouda"] +# The kind argument can also take values "multilist", "singlelist", "text", +# "clickable" and "file": +# find first control that will accept text, and scribble in it +form.set_value("rhubarb rhubarb", kind="text", nr=0) +# find, and set the value of, the first single-selection list control +form.set_value(["spam"], kind="singlelist", nr=0) + +# You can find controls with a general predicate function: +def control_has_caerphilly(control): + for item in control.items: + if item.name == "caerphilly": return True +form.find_control(kind="list", predicate=control_has_caerphilly) + +# HTMLForm.controls is a list of all controls in the form +for control in form.controls: + if control.value == "inquisition": sys.exit() + +# Control.items is a list of all Item instances in the control +for item in form.find_control("cheeses").items: + print item.name + +# To remove items from a list control, remove it from .items: +cheeses = form.find_control("cheeses") +curd = cheeses.get("curd") +del 
cheeses.items[cheeses.items.index(curd)] +# To add items to a list container, instantiate an Item with its control +# and attributes: +# Note that you are responsible for getting the attributes correct here, +# and these are not quite identical to the original HTML, due to +# defaulting rules and a few special attributes (e.g. Items that represent +# OPTIONs have a special "contents" key in their .attrs dict). In future +# there will be an explicitly supported way of using the parsing logic to +# add items and controls from HTML strings without knowing these details. +mechanize.Item(cheeses, {"contents": "mascarpone", + "value": "mascarpone"}) + +# You can specify list items by label using set/get_value_by_label() and +# the label argument of the .get() method. Sometimes labels are easier to +# maintain than names, sometimes the other way around. +form.set_value_by_label(["Mozzarella", "Caerphilly"], "cheeses") + +# Which items are present, selected, and successful? +# is the "parmesan" item of the "cheeses" control successful (selected +# and not disabled)? +print "parmesan" in form["cheeses"] +# is the "parmesan" item of the "cheeses" control selected? +print "parmesan" in [ + item.name for item in form.find_control("cheeses").items if item.selected] +# does cheeses control have a "caerphilly" item? 
+print "caerphilly" in [item.name for item in form.find_control("cheeses").items] + +# Sometimes one wants to set or clear individual items in a list, rather +# than setting the whole .value: +# select the item named "gorgonzola" in the first control named "cheeses" +form.find_control("cheeses").get("gorgonzola").selected = True +# You can be more specific: +# deselect "edam" in third CHECKBOX control +form.find_control(type="checkbox", nr=2).get("edam").selected = False +# deselect item labelled "Mozzarella" in control with id "chz" +form.find_control(id="chz").get(label="Mozzarella").selected = False + +# Often, a single checkbox (a CHECKBOX control with a single item) is +# present. In that case, the name of the single item isn't of much +# interest, so it's a good idea to check and uncheck the box without +# using the item name: +form.find_control("smelly").items[0].selected = True # check +form.find_control("smelly").items[0].selected = False # uncheck + +# Items may be disabled (selecting or de-selecting a disabled item is +# not allowed): +control = form.find_control("cheeses") +print control.get("emmenthal").disabled +control.get("emmenthal").disabled = True +# enable all items in control +control.set_all_items_disabled(False) + +request2 = form.click() # mechanize.Request object +try: + response2 = mechanize.urlopen(request2) +except mechanize.HTTPError, response2: + pass + +print response2.geturl() +# headers +for name, value in response2.info().items(): + if name != "date": + print "%s: %s" % (name.title(), value) +print response2.read() # body +response2.close() diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/forms/simple.py b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/simple.py new file mode 100755 index 0000000000000000000000000000000000000000..581651df8170aaa13eb24fbb6f16af65bae28825 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/forms/simple.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +import sys + +from mechanize import 
ParseResponse, urlopen, urljoin + +if len(sys.argv) == 1: + uri = "http://wwwsearch.sourceforge.net/" +else: + uri = sys.argv[1] + +response = urlopen(urljoin(uri, "mechanize/example.html")) +forms = ParseResponse(response, backwards_compat=False) +form = forms[0] +print form +form["comments"] = "Thanks, Gisle" + +# form.click() returns a mechanize.Request object +# (see HTMLForm.click.__doc__ if you want to use only the forms support, and +# not the rest of mechanize) +print urlopen(form.click()).read() diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/hack21.py b/LTA/LTAIngest/mechanize-0.2.5/examples/hack21.py new file mode 100644 index 0000000000000000000000000000000000000000..785de0352283f6d49c4f55f38949af8bde1bd346 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/hack21.py @@ -0,0 +1,60 @@ +#/usr/bin/env python + +# Port of Hack 21 from the O'Reilly book "Spidering Hacks" by Tara +# Calishain and Kevin Hemenway. Of course, there's no need to explicitly +# catch exceptions in Python, unlike checking error return values in Perl, +# but I've left those in for the sake of a direct port. + +import sys, os, re +from urllib2 import HTTPError + +import mechanize +assert mechanize.__version__ >= (0, 0, 6, "a") + +mech = mechanize.Browser() +# Addition 2005-01-05: Be naughty, since robots.txt asks not to +# access /search now. We're not madly searching for everything, so +# I don't feel too guilty. 
+mech.set_handle_robots(False) +#mech.set_debug_http(True) + +# Get the starting search page +try: + mech.open("http://search.cpan.org") +except HTTPError, e: + sys.exit("%d: %s" % (e.code, e.msg)) + +# Select the form, fill the fields, and submit +mech.select_form(nr=0) +mech["query"] = "Lester" +mech["mode"] = ["author"] +try: + mech.submit() +except HTTPError, e: + sys.exit("post failed: %d: %s" % (e.code, e.msg)) + +# Find the link for "Andy" +try: + mech.follow_link(text_regex=re.compile("Andy")) +except HTTPError, e: + sys.exit("post failed: %d: %s" % (e.code, e.msg)) + +# Get all the tarballs +urls = [link.absolute_url for link in + mech.links(url_regex=re.compile(r"\.tar\.gz$"))] +print "Found", len(urls), "tarballs to download" + +if "--all" not in sys.argv[1:]: + urls = urls[:1] + +for url in urls: + filename = os.path.basename(url) + f = open(filename, "wb") + print "%s -->" % filename, + r = mech.open(url) + while 1: + data = r.read(1024) + if not data: break + f.write(data) + f.close() + print os.stat(filename).st_size, "bytes" diff --git a/LTA/LTAIngest/mechanize-0.2.5/examples/pypi.py b/LTA/LTAIngest/mechanize-0.2.5/examples/pypi.py new file mode 100644 index 0000000000000000000000000000000000000000..5763dbd987d9bd88a470069a70331ca396cc6fc8 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/examples/pypi.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +# Search PyPI, the Python Package Index, and retrieve latest mechanize tarball. + +# This is just to demonstrate mechanize: You should use easy_install to do +# this, not this silly script. 
+ +import sys +import os +import urlparse + +import mechanize + + +def download_mechanize(): + browser = mechanize.Browser(factory=mechanize.RobustFactory()) + browser.set_handle_robots(False) + + browser.open("http://pypi.python.org/pypi") + browser.follow_link(text="Package Index", nr=0) + browser.select_form(name="searchform") + browser.form["term"] = "mechanize" + browser.submit() + browser.follow_link(text_regex="mechanize-?(.*)") + link = browser.find_link(text_regex=r"\.tar\.gz") + filename = os.path.basename(urlparse.urlsplit(link.url)[2]) + if os.path.exists(filename): + sys.exit("%s already exists, not grabbing" % filename) + browser.retrieve(link.url, filename) + + +if __name__ == "__main__": + download_mechanize() diff --git a/LTA/LTAIngest/mechanize-0.2.5/ez_setup.py b/LTA/LTAIngest/mechanize-0.2.5/ez_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1ff1d3e7a6839f4f441b38407f4e897193af3184 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/ez_setup.py @@ -0,0 +1,284 @@ +#!python +"""Bootstrap setuptools installation + +If you want to use setuptools in your package's setup.py, just include this +file in the same directory with it, and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +If you want to require a specific version of setuptools, set a download +mirror, or use an alternate download directory, you can do so by supplying +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. 
+""" +import sys +DEFAULT_VERSION = "0.6c11" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] + +md5_data = { + 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', + 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', + 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', + 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', + 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', + 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', + 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', + 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', + 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', + 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', + 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090', + 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4', + 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7', + 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5', + 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de', + 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b', + 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2', + 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086', + 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', + 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', + 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', + 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', + 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 
'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', +} + +import sys, os +try: from hashlib import md5 +except ImportError: from md5 import md5 + +def _validate_md5(egg_name, data): + if egg_name in md5_data: + digest = md5(data).hexdigest() + if digest != md5_data[egg_name]: + print >>sys.stderr, ( + "md5 validation of %s failed! (Possible download problem?)" + % egg_name + ) + sys.exit(2) + return data + +def use_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + download_delay=15 +): + """Automatically find/download setuptools and make it available on sys.path + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end with + a '/'). `to_dir` is the directory where setuptools will be downloaded, if + it is not already available. If `download_delay` is specified, it should + be the number of seconds that will be paused before initiating a download, + should one be required. 
If an older version of setuptools is installed, + this routine will print a message to ``sys.stderr`` and raise SystemExit in + an attempt to abort the calling script. + """ + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + def do_download(): + egg = download_setuptools(version, download_base, to_dir, download_delay) + sys.path.insert(0, egg) + import setuptools; setuptools.bootstrap_install_from = egg + try: + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>="+version); return + except pkg_resources.VersionConflict, e: + if was_imported: + print >>sys.stderr, ( + "The required version of setuptools (>=%s) is not available, and\n" + "can't be installed while this script is running. Please install\n" + " a more recent version first, using 'easy_install -U setuptools'." + "\n\n(Currently using %r)" + ) % (version, e.args[0]) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() + +def download_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + delay = 15 +): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download attempt. 
+ """ + import urllib2, shutil + egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) + url = download_base + egg_name + saveto = os.path.join(to_dir, egg_name) + src = dst = None + if not os.path.exists(saveto): # Avoid repeated downloads + try: + from distutils import log + if delay: + log.warn(""" +--------------------------------------------------------------------------- +This script requires setuptools version %s to run (even to display +help). I will attempt to download it for you (from +%s), but +you may need to enable firewall access for this script first. +I will start the download in %d seconds. + +(Note: if this machine does not have network access, please obtain the file + + %s + +and place it in this directory before rerunning this script.) +---------------------------------------------------------------------------""", + version, download_base, delay, url + ); from time import sleep; sleep(delay) + log.warn("Downloading %s", url) + src = urllib2.urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. + data = _validate_md5(egg_name, src.read()) + dst = open(saveto,"wb"); dst.write(data) + finally: + if src: src.close() + if dst: dst.close() + return os.path.realpath(saveto) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def main(argv, version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + try: + import setuptools + except ImportError: + egg = None + try: + egg = download_setuptools(version, delay=0) + sys.path.insert(0,egg) + from setuptools.command.easy_install import main + return main(list(argv)+[egg]) # we're done here + finally: + if egg and os.path.exists(egg): + os.unlink(egg) + else: + if setuptools.__version__ == '0.0.1': + print >>sys.stderr, ( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." 
+ ) + sys.exit(2) + + req = "setuptools>="+version + import pkg_resources + try: + pkg_resources.require(req) + except pkg_resources.VersionConflict: + try: + from setuptools.command.easy_install import main + except ImportError: + from easy_install import main + main(list(argv)+[download_setuptools(delay=0)]) + sys.exit(0) # try to force an exit + else: + if argv: + from setuptools.command.easy_install import main + main(argv) + else: + print "Setuptools version",version,"or greater has been installed." + print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' + +def update_md5(filenames): + """Update our built-in md5 registry""" + + import re + + for name in filenames: + base = os.path.basename(name) + f = open(name,'rb') + md5_data[base] = md5(f.read()).hexdigest() + f.close() + + data = [" %r: %r,\n" % it for it in md5_data.items()] + data.sort() + repl = "".join(data) + + import inspect + srcfile = inspect.getsourcefile(sys.modules[__name__]) + f = open(srcfile, 'rb'); src = f.read(); f.close() + + match = re.search("\nmd5_data = {\n([^}]+)}", src) + if not match: + print >>sys.stderr, "Internal error!" + sys.exit(2) + + src = src[:match.start(1)] + repl + src[match.end(1):] + f = open(srcfile,'w') + f.write(src) + f.close() + + +if __name__=='__main__': + if len(sys.argv)>2 and sys.argv[1]=='--md5update': + update_md5(sys.argv[2:]) + else: + main(sys.argv[1:]) + + + + + + diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/PKG-INFO b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..d4fd759ef21020a02b0110d1de9c10b88d8110d4 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/PKG-INFO @@ -0,0 +1,61 @@ +Metadata-Version: 1.0 +Name: mechanize +Version: 0.2.5 +Summary: Stateful programmatic web browsing. +Home-page: http://wwwsearch.sourceforge.net/mechanize/ +Author: John J. 
Lee +Author-email: jjl@pobox.com +License: BSD +Download-URL: http://pypi.python.org/packages/source/m/mechanize/mechanize-0.2.5.tar.gz +Description: Stateful programmatic web browsing, after Andy Lester's Perl module + WWW::Mechanize. + + mechanize.Browser implements the urllib2.OpenerDirector interface. Browser + objects have state, including navigation history, HTML form state, cookies, + etc. The set of features and URL schemes handled by Browser objects is + configurable. The library also provides an API that is mostly compatible with + urllib2: your urllib2 program will likely still work if you replace "urllib2" + with "mechanize" everywhere. + + Features include: ftp:, http: and file: URL schemes, browser history, hyperlink + and HTML form support, HTTP cookies, HTTP-EQUIV and Refresh, Referer [sic] + header, robots.txt, redirections, proxies, and Basic and Digest HTTP + authentication. + + Much of the code originally derived from Perl code by Gisle Aas (libwww-perl), + Johnny Lee (MSIE Cookie support) and last but not least Andy Lester + (WWW::Mechanize). urllib2 was written by Jeremy Hylton. 
+ + +Platform: any +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: License :: OSI Approved :: Zope Public License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.4 +Classifier: Programming Language :: Python :: 2.5 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Topic :: Internet +Classifier: Topic :: Internet :: File Transfer Protocol (FTP) +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Internet :: WWW/HTTP :: Browsers +Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management +Classifier: Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Software Development :: Testing +Classifier: Topic :: Software Development :: Testing :: Traffic Generation +Classifier: Topic :: System :: Archiving :: Mirroring +Classifier: Topic :: System :: Networking :: Monitoring +Classifier: Topic :: System :: Systems Administration +Classifier: Topic :: Text Processing +Classifier: Topic :: Text Processing :: Markup +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/SOURCES.txt b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e4e4a2ccb8380381175b32fd5fce6cb735bd46f --- /dev/null +++ 
b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/SOURCES.txt @@ -0,0 +1,129 @@ +COPYING.txt +INSTALL.txt +MANIFEST.in +README.txt +ez_setup.py +release.py +setup.cfg +setup.py +test.py +docs/development.txt +docs/doc.txt +docs/documentation.txt +docs/download.txt +docs/faq.txt +docs/forms.txt +docs/hints.txt +docs/index.txt +docs/support.txt +docs/html/ChangeLog.txt +docs/html/development.html +docs/html/doc.html +docs/html/documentation.html +docs/html/download.html +docs/html/faq.html +docs/html/forms.html +docs/html/hints.html +docs/html/index.html +docs/html/support.html +docs/styles/ie6.js +docs/styles/maxwidth.css +docs/styles/style.css +examples/hack21.py +examples/pypi.py +examples/forms/data.dat +examples/forms/data.txt +examples/forms/echo.cgi +examples/forms/example.html +examples/forms/example.py +examples/forms/simple.py +mechanize/__init__.py +mechanize/_auth.py +mechanize/_beautifulsoup.py +mechanize/_clientcookie.py +mechanize/_debug.py +mechanize/_firefox3cookiejar.py +mechanize/_form.py +mechanize/_gzip.py +mechanize/_headersutil.py +mechanize/_html.py +mechanize/_http.py +mechanize/_lwpcookiejar.py +mechanize/_markupbase.py +mechanize/_mechanize.py +mechanize/_mozillacookiejar.py +mechanize/_msiecookiejar.py +mechanize/_opener.py +mechanize/_pullparser.py +mechanize/_request.py +mechanize/_response.py +mechanize/_rfc3986.py +mechanize/_sgmllib_copy.py +mechanize/_sockettimeout.py +mechanize/_testcase.py +mechanize/_urllib2.py +mechanize/_urllib2_fork.py +mechanize/_useragent.py +mechanize/_util.py +mechanize/_version.py +mechanize.egg-info/PKG-INFO +mechanize.egg-info/SOURCES.txt +mechanize.egg-info/dependency_links.txt +mechanize.egg-info/top_level.txt +mechanize.egg-info/zip-safe +test/__init__.py +test/test_api.py +test/test_browser.doctest +test/test_browser.py +test/test_cookie.py +test/test_cookies.py +test/test_date.py +test/test_form.py +test/test_form_mutation.py +test/test_forms.doctest +test/test_functional.py +test/test_headers.py 
+test/test_history.doctest +test/test_html.doctest +test/test_html.py +test/test_import.py +test/test_opener.doctest +test/test_opener.py +test/test_password_manager.special_doctest +test/test_performance.py +test/test_pickle.py +test/test_pullparser.py +test/test_request.doctest +test/test_response.doctest +test/test_response.py +test/test_rfc3986.doctest +test/test_robotfileparser.doctest +test/test_unittest.py +test/test_urllib2.py +test/test_urllib2_localnet.py +test/test_useragent.py +test-tools/cookietest.cgi +test-tools/doctest.py +test-tools/functools_copy.py +test-tools/linecache_copy.py +test-tools/testprogram.py +test-tools/twisted-ftpserver.py +test-tools/twisted-localserver.py +test-tools/unittest/__init__.py +test-tools/unittest/__main__.py +test-tools/unittest/case.py +test-tools/unittest/loader.py +test-tools/unittest/main.py +test-tools/unittest/result.py +test-tools/unittest/runner.py +test-tools/unittest/suite.py +test-tools/unittest/util.py +test/functional_tests_golden/FormsExamplesTests.test_example/output +test/functional_tests_golden/FormsExamplesTests.test_simple/output +test/test_form_data/Auth.html +test/test_form_data/FullSearch.html +test/test_form_data/GeneralSearch.html +test/test_form_data/MarkedRecords.html +test/test_form_data/MarkedResults.html +test/test_form_data/Results.html +test/test_form_data/SearchType.html \ No newline at end of file diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/dependency_links.txt b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/top_level.txt b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/top_level.txt new file mode 100644 index 
0000000000000000000000000000000000000000..c02c0deb6986572f0f9bd8609f581c47cc42e0ba --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/top_level.txt @@ -0,0 +1 @@ +mechanize diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/zip-safe b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize.egg-info/zip-safe @@ -0,0 +1 @@ + diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/__init__.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c4429be394a39a739b8ceae874a79013a297c3d4 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/__init__.py @@ -0,0 +1,211 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 'DefaultFactory', + 'FTPHandler', + 'Factory', + 'FileCookieJar', + 'FileHandler', + 'FormNotFoundError', + 'FormsFactory', + 'HTTPBasicAuthHandler', + 'HTTPCookieProcessor', + 'HTTPDefaultErrorHandler', + 'HTTPDigestAuthHandler', + 'HTTPEquivProcessor', + 'HTTPError', + 'HTTPErrorProcessor', + 'HTTPHandler', + 'HTTPPasswordMgr', + 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPProxyPasswordMgr', + 'HTTPRedirectDebugProcessor', + 'HTTPRedirectHandler', + 'HTTPRefererProcessor', + 'HTTPRefreshProcessor', + 'HTTPResponseDebugProcessor', + 'HTTPRobotRulesProcessor', + 'HTTPSClientCertMgr', + 'HeadParser', + 'History', + 'LWPCookieJar', + 'Link', + 'LinkNotFoundError', + 'LinksFactory', + 'LoadError', + 'MSIECookieJar', + 'MozillaCookieJar', + 'OpenerDirector', + 'OpenerFactory', + 'ParseError', + 'ProxyBasicAuthHandler', + 'ProxyDigestAuthHandler', + 'ProxyHandler', + 'Request', + 'RobotExclusionError', 
+ 'RobustFactory', + 'RobustFormsFactory', + 'RobustLinksFactory', + 'RobustTitleFactory', + 'SeekableResponseOpener', + 'TitleFactory', + 'URLError', + 'USE_BARE_EXCEPT', + 'UnknownHandler', + 'UserAgent', + 'UserAgentBase', + 'XHTMLCompatibleHeadParser', + '__version__', + 'build_opener', + 'install_opener', + 'lwp_cookie_str', + 'make_response', + 'request_host', + 'response_seek_wrapper', # XXX deprecate in public interface? + 'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper() + 'str2time', + 'urlopen', + 'urlretrieve', + 'urljoin', + + # ClientForm API + 'AmbiguityError', + 'ControlNotFoundError', + 'FormParser', + 'ItemCountError', + 'ItemNotFoundError', + 'LocateError', + 'Missing', + 'ParseFile', + 'ParseFileEx', + 'ParseResponse', + 'ParseResponseEx', + 'ParseString', + 'XHTMLCompatibleFormParser', + # deprecated + 'CheckboxControl', + 'Control', + 'FileControl', + 'HTMLForm', + 'HiddenControl', + 'IgnoreControl', + 'ImageControl', + 'IsindexControl', + 'Item', + 'Label', + 'ListControl', + 'PasswordControl', + 'RadioControl', + 'ScalarControl', + 'SelectControl', + 'SubmitButtonControl', + 'SubmitControl', + 'TextControl', + 'TextareaControl', + ] + +import logging +import sys + +from _version import __version__ + +# high-level stateful browser-style interface +from _mechanize import \ + Browser, History, \ + BrowserStateError, LinkNotFoundError, FormNotFoundError + +# configurable URL-opener interface +from _useragent import UserAgentBase, UserAgent +from _html import \ + Link, \ + Factory, DefaultFactory, RobustFactory, \ + FormsFactory, LinksFactory, TitleFactory, \ + RobustFormsFactory, RobustLinksFactory, RobustTitleFactory + +# urllib2 work-alike interface. This is a superset of the urllib2 interface. 
+from _urllib2 import * +import _urllib2 +if hasattr(_urllib2, "HTTPSHandler"): + __all__.append("HTTPSHandler") +del _urllib2 + +# misc +from _http import HeadParser +from _http import XHTMLCompatibleHeadParser +from _opener import ContentTooShortError, OpenerFactory, urlretrieve +from _response import \ + response_seek_wrapper, seek_wrapped_response, make_response +from _rfc3986 import urljoin +from _util import http2time as str2time + +# cookies +from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \ + CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \ + effective_request_host +from _lwpcookiejar import LWPCookieJar, lwp_cookie_str +# 2.4 raises SyntaxError due to generator / try/finally use +if sys.version_info[:2] > (2,4): + try: + import sqlite3 + except ImportError: + pass + else: + from _firefox3cookiejar import Firefox3CookieJar +from _mozillacookiejar import MozillaCookieJar +from _msiecookiejar import MSIECookieJar + +# forms +from _form import ( + AmbiguityError, + ControlNotFoundError, + FormParser, + ItemCountError, + ItemNotFoundError, + LocateError, + Missing, + ParseError, + ParseFile, + ParseFileEx, + ParseResponse, + ParseResponseEx, + ParseString, + XHTMLCompatibleFormParser, + # deprecated + CheckboxControl, + Control, + FileControl, + HTMLForm, + HiddenControl, + IgnoreControl, + ImageControl, + IsindexControl, + Item, + Label, + ListControl, + PasswordControl, + RadioControl, + ScalarControl, + SelectControl, + SubmitButtonControl, + SubmitControl, + TextControl, + TextareaControl, + ) + +# If you hate the idea of turning bugs into warnings, do: +# import mechanize; mechanize.USE_BARE_EXCEPT = False +USE_BARE_EXCEPT = True + +logger = logging.getLogger("mechanize") +if logger.level is logging.NOTSET: + logger.setLevel(logging.CRITICAL) +del logger diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_auth.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_auth.py new file mode 100644 index 
0000000000000000000000000000000000000000..900e201ea6029d502a17cdab9e043c95b49056e2 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_auth.py @@ -0,0 +1,68 @@ +"""HTTP Authentication and Proxy support. + + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +from _urllib2_fork import HTTPPasswordMgr + + +# TODO: stop deriving from HTTPPasswordMgr +class HTTPProxyPasswordMgr(HTTPPasswordMgr): + # has default realm and host/port + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if uri is None or isinstance(uri, basestring): + uris = [uri] + else: + uris = uri + passwd_by_domain = self.passwd.setdefault(realm, {}) + for uri in uris: + for default_port in True, False: + reduced_uri = self.reduce_uri(uri, default_port) + passwd_by_domain[reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + attempts = [(realm, authuri), (None, authuri)] + # bleh, want default realm to take precedence over default + # URI/authority, hence this outer loop + for default_uri in False, True: + for realm, authuri in attempts: + authinfo_by_domain = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri, authinfo in authinfo_by_domain.iteritems(): + if uri is None and not default_uri: + continue + if self.is_suburi(uri, reduced_authuri): + return authinfo + user, password = None, None + + if user is not None: + break + return user, password + + def reduce_uri(self, uri, default_port=True): + if uri is None: + return None + return HTTPPasswordMgr.reduce_uri(self, uri, default_port) + + def is_suburi(self, base, test): + if base is None: + # default to the proxy's host/port + hostport, path = test + base = (hostport, "/") + return 
HTTPPasswordMgr.is_suburi(self, base, test) + + +class HTTPSClientCertMgr(HTTPPasswordMgr): + # implementation inheritance: this is not a proper subclass + def add_key_cert(self, uri, key_file, cert_file): + self.add_password(None, uri, key_file, cert_file) + def find_key_cert(self, authuri): + return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_beautifulsoup.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_beautifulsoup.py new file mode 100644 index 0000000000000000000000000000000000000000..a157ef279cc49f46d7eb41463e8a6dfddba3cc4d --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_beautifulsoup.py @@ -0,0 +1,1077 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +v2.1.1 +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance +into a tree representation. It provides methods and Pythonic idioms +that make it easy to search and modify the tree. + +A well-formed XML/HTML document will yield a well-formed data +structure. An ill-formed XML/HTML document will yield a +correspondingly ill-formed data structure. If your document is only +locally well-formed, you can use this library to find and process the +well-formed part of it. The BeautifulSoup class has heuristics for +obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup has no external dependencies. It works with Python 2.2 +and up. + +Beautiful Soup defines classes for four different parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. + + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML + that trips up BeautifulSoup. 
class NullType(object):
    """A 'black hole' singleton type.

    Its one instance, Null, is None-like but absorbs any message --
    calls, attribute access, indexing -- and returns itself, so chained
    navigation expressions never raise.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...     "and what you get still") is Null
    True
    """

    def __new__(cls):
        return Null

    def __call__(self, *args, **kwargs):
        return Null

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __setattr__(self, attr, value):
        pass

    def __setitem__(self, item, value):
        pass

    def __len__(self):
        return 0

    def __iter__(self):
        # Without an explicit empty iterator, ``for x in Null`` would
        # spin forever (every attribute lookup succeeds, returning Null).
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"

Null = object.__new__(NullType)


class PageElement:
    """Navigational state for one node of the page (tag or text)."""

    def setup(self, parent=Null, previous=Null):
        """Link this element to its parent and document-order neighbours."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Return the first matching item appearing after this element."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Return every matching item appearing after this element."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Return the nearest later sibling that matches."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Return every later sibling that matches."""
        return self._fetch(name, attrs, text, limit,
                           self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Return the first matching item appearing before this element."""
        return self._first(self.fetchPrevious, name, attrs, text)

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Return every matching item appearing before this element."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)
    firstPrevious = findPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Return the nearest earlier sibling that matches."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Return every earlier sibling that matches."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Return the nearest enclosing tag that matches, or Null."""
        parents = self.fetchParents(name, attrs, 1)
        if parents:
            return parents[0]
        return Null
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Return every enclosing tag that matches."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    # The methods below do the real work for the find*/fetch* family.

    def _first(self, method, name, attrs, text):
        """Return the first result of method, or Null if there is none."""
        found = method(name, attrs, text, 1)
        if found:
            return found[0]
        return Null

    def _fetch(self, name, attrs, text, limit, generator):
        """Walk a generator, collecting elements that match the criteria."""
        # A bare string or list for attrs is shorthand for a class match.
        if not hasattr(attrs, 'items'):
            attrs = {'class': attrs}

        results = []
        for candidate in generator():
            hit = None
            if isinstance(candidate, Tag):
                if not text:
                    if not name or self._matches(candidate, name):
                        attrs_ok = True
                        for attr, wanted in attrs.items():
                            if not self._matches(candidate.get(attr), wanted):
                                attrs_ok = False
                                break
                        if attrs_ok:
                            hit = candidate
            elif text:
                if self._matches(candidate, text):
                    hit = candidate
            if hit:
                results.append(hit)
                if limit and len(results) >= limit:
                    break
        return results

    # Generators usable from both NavigableTexts and Tags.  Note that
    # each yields its falsy terminator (Null/None) as its final item,
    # exactly as _fetch expects.

    def nextGenerator(self):
        node = self
        while node:
            node = node.next
            yield node

    def nextSiblingGenerator(self):
        node = self
        while node:
            node = node.nextSibling
            yield node

    def previousGenerator(self):
        node = self
        while node:
            node = node.previous
            yield node

    def previousSiblingGenerator(self):
        node = self
        while node:
            node = node.previousSibling
            yield node

    def parentGenerator(self):
        node = self
        while node:
            node = node.parent
            yield node

    def _matches(self, chunk, howToMatch):
        """Decide whether chunk satisfies the criterion howToMatch.

        howToMatch may be a callable, a compiled regexp, a list, a
        map, or a plain string.
        """
        # A list of items matches if it contains a matching text element.
        if isList(chunk) and not isinstance(chunk, Tag):
            for item in chunk:
                if isinstance(item, NavigableText) and \
                   self._matches(item, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if isinstance(chunk, Tag):
            # Custom match methods take the tag itself; every other
            # kind of criterion matches against the tag's name.
            chunk = chunk.name
        # From here on, chunk is compared as a string.
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # A compiled regular expression object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        # A plain string.
        return str(howToMatch) == chunk
class NavigableText(PageElement):
    """Base for text nodes that share the PageElement navigation API."""

    def __getattr__(self, attr):
        # For backwards compatibility, text.string gives you the text.
        if attr == 'string':
            return self
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))


class NavigableString(str, NavigableText):
    pass


class NavigableUnicodeString(unicode, NavigableText):
    pass


class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        """Basic constructor.

        name -- the tag name; attrs -- list of (key, value) pairs;
        parent/previous -- tree position (see PageElement.setup).
        """
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Return the value of the 'key' attribute, or default if the
        tag does not have that attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute, raising
        KeyError if it is absent."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is truthy even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets (or replaces) the 'key' attribute."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        # NOTE: the original also re-assigned via _getAttrMap() here,
        # which was redundant -- attrMap[key] is already set above.

    def __delitem__(self, key):
        """Deleting tag[key] deletes every 'key' attribute for the tag.

        Bad HTML can define the same attribute several times, so all
        occurrences are removed.  FIX: the original removed items from
        self.attrs while iterating over it, which silently skipped the
        second of two adjacent duplicates; rebuilding the list removes
        them all.
        """
        self.attrs = [item for item in self.attrs if item[0] != key]
        self._getAttrMap()
        if self.attrMap.has_key(key):
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method; e.g. tag('a') lists all A tags within it."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.fooTag and tag.foo are shorthand for tag.first('foo')."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3:
            return self.first(tag[:-3])
        elif tag.find('__') != 0:
            return self.first(tag)
        # NOTE(review): dunder lookups fall through and implicitly
        # return None rather than raising AttributeError; kept for
        # backward compatibility with existing callers.

    def __eq__(self, other):
        """True iff other has the same name, the same attributes (in
        the same order), and the same contents, recursively."""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') or self.name != other.name \
           or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """True iff this tag is not identical to other (see __eq__)."""
        return not self == other

    def __repr__(self):
        """Render this tag as a string."""
        return str(self)

    def __unicode__(self):
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Return a str or unicode rendering of this tag and contents.

        NOTE: since Python's HTML parser consumes whitespace, this is
        not certain to reproduce the whitespace of the original input.
        """
        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        if showStructureIndent != None:
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
        contents = self.renderContents(indentIncrement,
                                       needUnicode=needUnicode)
        if showStructureIndent:
            space = '\n%s' % (' ' * showStructureIndent)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent != None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode == False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        """Render with one tag per line and structural indentation."""
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Render the contents of this tag as a (possibly Unicode)
        string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or \
               type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent != None:
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    # Soup methods

    def firstText(self, text, recursive=True):
        """Return the first piece of text matching 'text' (a string,
        compiled regexp, or callable taking a string)."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Return all pieces of text matching 'text' (a string,
        compiled regexp, or callable taking a string)."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this Tag matching the given
        criteria, or Null when nothing matches."""
        found = self.fetch(name, attrs, recursive, text, 1)
        if found:
            return found[0]
        return Null
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Return a list of Tag objects matching the given criteria.

        The tag name and each value in the 'attrs' map may be a string,
        a list of strings, a compiled regexp, or a callable taking a
        string and returning whether it 'matches'.
        """
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    # Utility methods

    def isSelfClosing(self):
        """True iff this is a self-closing tag per the HTML standard.

        TODO: this is specific to BeautifulSoup and its subclasses, but
        it is used by __str__.
        """
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Append the given tag to the contents of this tag."""
        self.contents.append(tag)

    # Private methods

    def _getAttrMap(self):
        """Build (once) and return the dict view of self.attrs.

        FIX: uses getattr with a default -- the original's bare
        getattr(self, 'attrMap') fell through to Tag.__getattr__ and
        performed an accidental subtree search (self.first('attrMap'))
        the first time the map was built.
        """
        if not getattr(self, 'attrMap', None):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    # Generator methods

    def childGenerator(self):
        # Index-based so that mutation during iteration behaves as
        # before.  The trailing ``raise StopIteration`` was dropped:
        # falling off the end is identical in Python 2 and avoids the
        # PEP 479 RuntimeError on Python 3.7+.
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        # Depth-first walk without recursion: the stack holds
        # (tag, resume-index) pairs so iteration continues after a
        # child's subtree has been visited.
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i + 1))
                        stack.append((a, 0))
                        break


def isList(l):
    """True if l is list-like: iterable, or an actual list/tuple.

    Works with all 2.x versions of Python.
    """
    return hasattr(l, '__iter__') \
           or (type(l) in (types.ListType, types.TupleType))


def buildTagMap(default, *args):
    """Turn a mix of maps, lists, and scalars into a single map.

    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out of
    lists and partial maps; list items and scalars map to default.
    """
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # It's a map: merge it.
            for k, v in portion.items():
                built[k] = v
        elif isList(portion):
            # It's a list: map each item to the default.
            for k in portion:
                built[k] = default
        else:
            # It's a scalar: map it to the default.
            built[portion] = default
    return built
class BeautifulStoneSoup(Tag, SGMLParser):
    """The basic parse-and-fetch engine.

    Assumes nothing about tag behaviour except that a tag cannot be
    closed without closing every tag it encloses: since this class
    defines no SELF_CLOSING_TAGS, "<foo><bar></foo>" is always read as
    "<foo><bar></bar></foo>".

    Useful for XML, made-up markup languages, or whenever
    BeautifulSoup's HTML heuristics guess counter to what you expect.
    """

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # As a public service, MS smart quotes and similar characters are
    # silently replaced with their HTML or ASCII equivalents.
    MS_CHARS = {'\x80': '€',
                '\x81': ' ',
                '\x82': '‚',
                '\x83': 'ƒ',
                '\x84': '„',
                '\x85': '…',
                '\x86': '†',
                '\x87': '‡',
                '\x88': '⁁',
                '\x89': '%',
                '\x8A': 'Š',
                '\x8B': '<',
                '\x8C': 'Œ',
                '\x8D': '?',
                '\x8E': 'Z',
                '\x8F': '?',
                '\x90': '?',
                '\x91': '‘',
                '\x92': '’',
                '\x93': '“',
                '\x94': '”',
                '\x95': '•',
                '\x96': '–',
                '\x97': '—',
                '\x98': '˜',
                '\x99': '™',
                '\x9a': 'š',
                '\x9b': '>',
                '\x9c': 'œ',
                '\x9d': '?',
                '\x9e': 'z',
                '\x9f': 'Ÿ'}

    # (regexp, replacement-callable) pairs applied by feed(); see
    # __init__ for why.
    PARSER_MASSAGE = [
        # "<br/>" -> "<br />": sgmllib needs the space.
        (re.compile('(<[^<>]*)/>'),
         lambda match: match.group(1) + ' />'),
        # "<! --Comment-->" -> "<!--Comment-->": drop stray whitespace.
        (re.compile('<!\s+([^<>]*)>'),
         lambda match: '<!' + match.group(1) + '>'),
        # Replace Windows-1252 "smart" characters.
        (re.compile("([\x80-\x9f])"),
         lambda match: BeautifulStoneSoup.MS_CHARS.get(match.group(1))),
        ]

    ROOT_TAG_NAME = '[document]'

    def __init__(self, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Initialise the root tag and feed any text to the parser.

        avoidParserProblems: sgmllib chokes on, or loses data from,
        some technically-invalid HTML -- most commonly "<br/>" (no
        space before the slash) and "<! --Comment-->" (extraneous
        whitespace in a declaration).  By default the input is scrubbed
        with PARSER_MASSAGE first.  Pass False for better performance
        when your input is known clean, or pass your own list of
        (regexp, replacement) pairs to customise the scrubbing.

        initialTextIsEverything: if true (the default) parsing is
        finalised immediately via done(); pass False when you intend
        to keep calling feed().
        """
        Tag.__init__(self, self.ROOT_TAG_NAME)
        if avoidParserProblems \
           and not isList(avoidParserProblems):
            avoidParserProblems = self.PARSER_MASSAGE
        self.avoidParserProblems = avoidParserProblems
        SGMLParser.__init__(self)
        self.quoteStack = []
        self.hidden = 1
        self.reset()
        if hasattr(text, 'read'):
            # A file-like object: slurp it.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()

    def __getattr__(self, methodName):
        """Route start_/end_/do_ lookups to SGMLParser and everything
        else (except dunder names) to Tag."""
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
           or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def feed(self, text):
        # Scrub the input before sgmllib ever sees it.
        if self.avoidParserProblems:
            for pattern, massage in self.avoidParserProblems:
                text = pattern.sub(massage, text)
        SGMLParser.feed(self, text)

    def done(self):
        """Finish parsing: flush pending data and close every tag left
        open on the stack."""
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def popTag(self):
        tag = self.tagStack.pop()
        # A tag with a single text child gets a 'string' shortcut, so
        # soup.tag.string means soup.tag.contents[0].
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableText):
            self.currentTag.string = self.currentTag.contents[0]
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self):
        """Flush accumulated character data into a Navigable* node."""
        currentData = ''.join(self.currentData)
        if currentData:
            # Collapse pure whitespace to one newline or one space.
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            nodeClass = NavigableString
            if type(currentData) == types.UnicodeType:
                nodeClass = NavigableUnicodeString
            node = nodeClass(currentData)
            node.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = node
            self.previous = node
            self.currentTag.contents.append(node)
        self.currentData = []

    def _popToTag(self, name, inclusivePop=True):
        """Pop the tag stack up to -- and, unless inclusivePop is
        false, including -- the most recent tag of the given name."""
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack) - 1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):
        """Pop up to the previous tag of this type, unless one of this
        tag's nesting-reset triggers -- or, for generic nesting
        triggers, another generic trigger -- comes between this tag and
        the previous tag of this type.

        Examples:
          <p>Foo<b>Bar<p> pops to 'p', not 'b'.
          <p>Foo<table>Bar<p> pops to 'table', not 'p'.
          <p>Foo<table><tr>Bar<p> pops to 'tr', not 'p'.
          <li><ul><li> *<li>* pops to 'ul', not the first 'li'.
          <tr><table><tr> *<tr>* pops to 'table', not the first 'tr'.
          <td><tr><td> *<td>* pops to 'tr', not the first 'td'.
        """
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack) - 1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                # Non-nestable tags pop to the top or to their last
                # occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):
                # A nesting-reset trigger: pop up to, but not
                # including, that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        if self.quoteStack:
            # Inside a quoted section (e.g. <script>): not a real tag,
            # so re-emit it as literal character data.
            rendered = ''.join([' %s="%s"' % (key, val)
                                for key, val in attrs])
            self.handle_data('<%s%s>' % (name, rendered))
            return
        self.endData()
        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
            self._smartPop(name)
        tag = Tag(name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or name in self.SELF_CLOSING_TAGS:
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Until the matching end tag, treat everything as data.
            self.quoteStack.append(name)
            self.literal = 1

    def unknown_endtag(self, name):
        if self.quoteStack and self.quoteStack[-1] != name:
            # Inside a quoted section: not a real end tag.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        self.currentData.append(data)

    def handle_pi(self, text):
        "Pass processing instructions straight through."
        self.handle_data("<?%s>" % text)

    def handle_comment(self, text):
        "Pass comments straight through."
        self.handle_data("<!--%s-->" % text)

    def handle_charref(self, ref):
        "Pass character references straight through."
        self.handle_data('&#%s;' % ref)

    def handle_entityref(self, ref):
        "Pass entity references straight through."
        self.handle_data('&%s;' % ref)

    def handle_decl(self, data):
        "Pass DOCTYPEs and the like straight through."
        self.handle_data('<!%s>' % data)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data; treat a CDATA
        declaration as regular data."""
        j = None
        if self.rawdata[i:i + 9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to the end of the input.
                k = len(self.rawdata)
            self.handle_data(self.rawdata[i + 9:k])
            j = k + 3
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
class BeautifulSoup(BeautifulStoneSoup):
    """Parser tuned for run-of-the-mill HTML.

    Knows these facts about HTML:

    * Some tags (SELF_CLOSING_TAGS) have no closing tag and are closed
      as soon as they are encountered.

    * Text inside QUOTE_TAGS (e.g. 'script') may contain markup that is
      not part of the document; it is kept as text (fetch and parse it
      explicitly if you do want it parsed).

    * Tag nesting rules vary.  Most tags cannot nest at all: a second
      <p> implicitly closes the previous <p>.  Some (e.g. <blockquote>)
      nest arbitrarily.  Some nest only until a reset tag intervenes:
      a <tr> closes the previous <tr> within the same <table> but not
      one in an outer table.

    Differing assumptions about nesting are the major source of
    problems with this class.  If it refuses to nest a tag your page
    author does nest, try ICantBelieveItsBeautifulSoup before writing
    your own subclass.
    """

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br', 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    # Inline tags that the HTML standard allows -- and authors commonly
    # use -- nested within a tag of the same type.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub',
                            'sup', 'center']

    # Block tags with the same property.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # Lists can contain other lists, with restrictions.
    NESTABLE_LIST_TAGS = {'ol': [],
                          'ul': [],
                          'li': ['ul', 'ol'],
                          'dl': [],
                          'dd': ['dl'],
                          'dt': ['dl']}

    # Tables can contain other tables, with restrictions.
    NESTABLE_TABLE_TAGS = {'table': [],
                           'tr': ['table', 'tbody', 'tfoot', 'thead'],
                           'td': ['tr'],
                           'th': ['tr'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # Encountering one of these pops every tag up to the previous tag
    # of the same type.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS,
                                NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
    """BeautifulSoup variant for valid-but-bizarre nesting.

    BeautifulSoup assumes authors forget to close tags like 'b' rather
    than genuinely nesting them, so for "<b>Foo<b>Bar</b></b>" it
    implicitly closes the first 'b' at the second one and then closes
    two more at '</b></b>' -- three closes for two opens, which can
    throw off the rest of the document structure.  This subclass
    instead treats the inline and block tags listed below as truly
    nestable.  Use it only when the input really does nest such tags;
    otherwise BeautifulSoup's assumption is the right one.  If neither
    fits, subclass and supply your own NESTABLE_TAGS.
    """

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
        ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
         'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
         'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)


class BeautifulSOAP(BeautifulStoneSoup):
    """Stone soup that promotes single-string children to attributes.

    <foo><bar>baz</bar></foo> becomes
    <foo bar="baz"><bar>baz</bar></foo>, so fooTag['bar'] works as well
    as fooTag.barTag.string -- handy for SOAP-style documents that use
    subelements where attributes would do.  Note that the input tree is
    modified in place, so don't print the modified version out.
    """

    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            # Promote only when the parent has no attribute of the
            # same name already.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableText) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisitude, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulStoneSoup(sys.stdin.read()) + print soup.prettify() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_clientcookie.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_clientcookie.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed4c878271a11bf564b0a64b377fd8a7fbe6a2a --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_clientcookie.py @@ -0,0 +1,1725 @@ +"""HTTP cookie handling for web clients. + +This module originally developed from my port of Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +Comments to John J Lee <jjl@pobox.com>. + + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import sys, re, copy, time, urllib, types, logging +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = "80" + +from _headersutil import split_header_words, parse_ns_headers +from _util import isstringlike +import _rfc3986 + +debug = logging.getLogger("mechanize.cookies").debug + + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + import mechanize, warnings + if not mechanize.USE_BARE_EXCEPT: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + return not (IPV4_RE.search(text) or + text == "" or + text[0] == "." or text[-1] == ".") + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) 
Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) + + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = A.lower() + B = B.lower() + if A == B: + return True + if not is_HDN(A): + return False + i = A.rfind(B) + has_form_nb = not (i == -1 or i == 0) + return ( + has_form_nb and + B.startswith(".") and + is_HDN(B[1:]) + ) + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + return not IPV4_RE.search(text) + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. + + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. 
+ + """ + url = request.get_full_url() + host = _rfc3986.urlsplit(url)[1] + if host is None: + host = request.get_header("Host", "") + # remove port, if present + return cut_port_re.sub("", host, 1) + +def request_host_lc(request): + return request_host(request).lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name).""" + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def eff_request_host_lc(request): + req_host, erhn = eff_request_host(request) + return req_host.lower(), erhn.lower() + +def effective_request_host(request): + """Return the effective request-host, as defined by RFC 2965.""" + return eff_request_host(request)[1] + +def request_path(request): + """Return path component of request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + path = escape_path(_rfc3986.urlsplit(url)[2]) + if not path.startswith("/"): + path = "/" + path + return path + +def request_port(request): + host = request.get_host() + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +def request_is_unverifiable(request): + try: + return request.is_unverifiable() + except AttributeError: + if hasattr(request, "unverifiable"): + return request.unverifiable + else: + raise + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 
+HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + if isinstance(path, types.UnicodeType): + path = path.encode("utf-8") + path = urllib.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. + + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = h.find(".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = b.find(".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. 
+ + """ + req_host = request_host_lc(request) + # the origin request's request-host was stuffed into request by + # _urllib2_support.AbstractHTTPHandler + return not domain_match(req_host, reach(request.origin_req_host)) + + +try: + all +except NameError: + # python 2.4 + def all(iterable): + for x in iterable: + if not x: + return False + return True + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. + + version: integer; + name: string; + value: string (may be None); + port: string; None indicates no attribute was supplied (e.g. "Port", rather + than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list + string (e.g. "80,8080") + port_specified: boolean; true if a value was supplied with the Port + cookie-attribute + domain: string; + domain_specified: boolean; true if Domain was explicitly set + domain_initial_dot: boolean; true if Domain as set in HTTP header by server + started with a dot (yes, this really is necessary!) 
+ path: string; + path_specified: boolean; true if Path was explicitly set + secure: boolean; true if should only be returned over secure connection + expires: integer; seconds since epoch (RFC 2965 cookies should calculate + this value from the Max-Age attribute) + discard: boolean, true if this is a session cookie; (if no expires value, + this should be true) + comment: string; + comment_url: string; + rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not + Set-Cookie2:) header, but had a version cookie-attribute of 1 + rest: mapping of other cookie-attributes + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. + + """ + + + _attrs = ("version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + "rfc2109", "_rest") + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest, + rfc2109=False, + ): + + if version is not None: version = int(version) + if expires is not None: expires = int(expires) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = domain.lower() + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). Needed for the returned $Domain + # value. 
+ self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + self.rfc2109 = rfc2109 + + self._rest = copy.copy(rest) + + def has_nonstandard_attr(self, name): + return self._rest.has_key(name) + def get_nonstandard_attr(self, name, default=None): + return self._rest.get(name, default) + def set_nonstandard_attr(self, name, value): + self._rest[name] = value + def nonstandard_attr_keys(self): + return self._rest.keys() + + def is_expired(self, now=None): + if now is None: now = time.time() + return (self.expires is not None) and (self.expires <= now) + + def __eq__(self, other): + return all(getattr(self, a) == getattr(other, a) for a in self._attrs) + + def __ne__(self, other): + return not (self == other) + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.value is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.name + return "<Cookie %s for %s>" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ["version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + ]: + attr = getattr(self, name) + args.append("%s=%s" % (name, repr(attr))) + args.append("rest=%s" % repr(self._rest)) + args.append("rfc2109=%s" % repr(self.rfc2109)) + return "Cookie(%s)" % ", ".join(args) + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + May also modify cookies. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. 
+ + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. + + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + rfc2109_as_netscape: + WARNING: This argument will change or go away if is not accepted into + the Python standard library in this form! + If true, treat RFC 2109 cookies as though they were Netscape cookies. The + default is for this attribute to be None, which means treat 2109 cookies + as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is, + by default), and as Netscape cookies otherwise. + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). 
+ The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. + + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is + switched off by default. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import mechanize + class MyCookiePolicy(mechanize.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not mechanize.DefaultCookiePolicy.set_ok( + self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. 
There are also some strictness switches that allow + you to tighten up the rather loose Netscape protocol rules a little bit (at + the cost of blocking some benign cookies). + + A domain blacklist and whitelist is provided (both off by default). Only + domains not in the blacklist and present in the whitelist (if the whitelist + is active) participate in cookie setting and returning. Use the + blocked_domains constructor argument, and blocked_domains and + set_blocked_domains methods (and the corresponding argument and methods for + allowed_domains). If you set a whitelist, you can turn it off again by + setting it to None. + + Domains in block or allow lists that do not start with a dot must + string-compare equal. For example, "acme.com" matches a blacklist entry of + "acme.com", but "www.acme.com" does not. Domains that do start with a dot + are matched by more specific domains too. For example, both "www.acme.com" + and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does + not). IP addresses are an exception, and must match exactly. For example, + if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is + blocked, but 193.168.1.2 is not. + + Additional Public Attributes: + + General strictness switches + + strict_domain: don't allow sites to set two-component domains with + country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. + This is far from perfect and isn't guaranteed to work! 
+ + RFC 2965 protocol strictness switches + + strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable + transactions (usually, an unverifiable transaction is one resulting from + a redirect or an image hosted on another site); if this is false, cookies + are NEVER blocked on the basis of verifiability + + Netscape protocol strictness switches + + strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions + even to Netscape cookies + strict_ns_domain: flags indicating how strict to be with domain-matching + rules for Netscape cookies: + DomainStrictNoDots: when setting cookies, host prefix must not contain a + dot (e.g. www.foo.bar.com can't set a cookie for .bar.com, because + www.foo contains a dot) + DomainStrictNonDomain: cookies that did not explicitly specify a Domain + cookie-attribute can only be returned to a domain that string-compares + equal to the domain that set the cookie (e.g. rockets.acme.com won't + be returned cookies from acme.com that had no Domain cookie-attribute) + DomainRFC2965Match: when setting cookies, require a full RFC 2965 + domain-match + DomainLiberal and DomainStrict are the most useful combinations of the + above flags, for convenience + strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that + have names starting with '$' + strict_ns_set_path: don't allow setting cookies whose path doesn't + path-match request URI + + """ + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + # WARNING: this argument will change or go away if is not + # accepted into the Python standard library in this form! + # default, ie. 
treat 2109 as netscape iff not rfc2965 + rfc2109_as_netscape=None, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """ + Constructor arguments should be used as keyword arguments only. + + blocked_domains: sequence of domain names that we never accept cookies + from, nor return cookies to + allowed_domains: if not None, this is a sequence of the only domains + for which we accept and return cookies + + For other arguments, see CookiePolicy.__doc__ and + DefaultCookiePolicy.__doc__.. + + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of 
allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). + + """ + debug(" - checking cookie %s", cookie) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_countrycode_domain(self, cookie, request): + """Return False if explicit cookie domain is not acceptable. + + Called by set_ok_domain, for convenience of overriding by + subclasses. + + """ + if cookie.domain_specified and self.strict_domain: + domain = cookie.domain + # since domain was specified, we know that: + assert domain.startswith(".") + if domain.count(".") == 2: + # domain like .foo.bar + i = domain.rfind(".") + tld = domain[i+1:] + sld = domain[1:i] + if (sld.lower() in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int", + "aero", "biz", "cat", "coop", "info", "jobs", "mobi", + "museum", "name", "pro", "travel", + ] and + len(tld) == 2): + # domain like .co.uk + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if not self.set_ok_countrycode_domain(cookie, request): + debug(" country-code second level domain %s", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + 
debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to return). + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. 
+ debug(" - checking cookie %s", cookie) + + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not 
string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of domain. This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + + # Munge req_host and erhn to always start with a dot, so as to err on + # the side of letting cookies through. + dotted_req_host, dotted_erhn = eff_request_host_lc(request) + if not dotted_req_host.startswith("."): + dotted_req_host = "."+dotted_req_host + if not dotted_erhn.startswith("."): + dotted_erhn = "."+dotted_erhn + if not (dotted_req_host.endswith(domain) or + dotted_erhn.endswith(domain)): + #debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + 
try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + get_policy() + set_policy(policy) + cookies_for_request(request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def get_policy(self): + return self._policy + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def cookies_for_request(self, request): + """Return a list of cookies to be returned to server. + + The returned list of cookie instances is sorted in the order they + should appear in the Cookie: header for return to the server. + + See add_cookie_header.__doc__ for the interface required of the + request argument. + + New in version 0.1.10 + + """ + self._policy._now = self._now = int(time.time()) + cookies = self._cookies_for_request(request) + # add cookies in order of most specific (i.e. 
longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + # this method still exists (alongside cookies_for_request) because it + # is part of an implied protected interface for subclasses of cookiejar + # XXX document that implied interface, or provide another way of + # implementing cookiejars than subclassing + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + The $Version attribute is also added when appropriate (currently only + once per request). + + >>> jar = CookieJar() + >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False, + ... "example.com", False, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([ns_cookie]) + ['foo="bar"'] + >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False, + ... ".example.com", True, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([rfc2965_cookie]) + ['$Version=1', 'foo=bar', '$Domain="example.com"'] + + """ + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (mechanize.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_host, is_unverifiable, get_type, + has_header, get_header, header_items and add_unredirected_header, as + documented by urllib2. 
+ """ + debug("add_cookie_header") + cookies = self.cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header("Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if self._policy.rfc2965 and not self._policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1 and not request.has_header("Cookie2"): + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). 
+ max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + if v is None: + debug(" missing value for max-age attribute") + bad_cookie = True + break + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host_lc(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but 
has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def _make_cookies(self, response, request): + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + 
ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interface required of the + response and request arguments. + + """ + self._policy._now = self._now = int(time.time()) + return [cookie for cookie in self._make_cookies(response, request) + if cookie.expires is None or not cookie.expires <= self._now] + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+ + cookie: mechanize.Cookie instance + """ + c = self._cookies + if not c.has_key(cookie.domain): c[cookie.domain] = {} + c2 = c[cookie.domain] + if not c2.has_key(cookie.path): c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request. + + Look for allowable Set-Cookie: and Set-Cookie2: headers in the response + object passed as argument. Any of these headers that are found are + used to update the state of the object (subject to the policy.set_ok + method's approval). + + The response object (usually be the result of a call to + mechanize.urlopen, or similar) should support an info method, which + returns a mimetools.Message object (in fact, the 'mimetools.Message + object' may be any object that provides a getheaders method). + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_type, get_host, and is_unverifiable, as + documented by mechanize, and the port attribute (the port number). The + request is used to set default values for cookie-attributes as well as + for checking that the cookie is OK to be set. + + """ + debug("extract_cookies: %s", response.info()) + self._policy._now = self._now = int(time.time()) + + for cookie in self._make_cookies(response, request): + if cookie.expires is not None and cookie.expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(cookie.domain, cookie.path, cookie.name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + cookie.domain, cookie.path, cookie.name) + elif self._policy.set_ok(cookie, request): + debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. 
+ + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Discards all cookies held by object which had either no Max-Age or + Expires cookie-attribute or an explicit Discard cookie-attribute, or + which otherwise have ended up with a true discard attribute. For + interactive browsers, the end of a session usually corresponds to + closing the browser window. + + Note that the save method won't save session cookies anyway, unless you + ask otherwise by passing a true ignore_discard argument. + + """ + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). 
+ + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. 
+ + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. 
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_debug.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_debug.py new file mode 100644 index 0000000000000000000000000000000000000000..8243969990ddf98865bbcf8bcd910819cc18dfb4 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from _response import response_seek_wrapper +from _urllib2_fork import BaseHandler + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_firefox3cookiejar.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_firefox3cookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..a64d70f35d43af4492db93174f55fb74a104fa92 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_firefox3cookiejar.py @@ -0,0 +1,248 @@ +"""Firefox 3 "cookies.sqlite" cookie persistence. 
+ +Copyright 2008 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging +import time + +from _clientcookie import CookieJar, Cookie, MappingIterator +from _util import isstringlike, experimental +debug = logging.getLogger("mechanize.cookies").debug + + +class Firefox3CookieJar(CookieJar): + + """Firefox 3 cookie jar. + + The cookies are stored in Firefox 3's "cookies.sqlite" format. + + Constructor arguments: + + filename: filename of cookies.sqlite (typically found at the top level + of a firefox profile directory) + autoconnect: as a convenience, connect to the SQLite cookies database at + Firefox3CookieJar construction time (default True) + policy: an object satisfying the mechanize.CookiePolicy interface + + Note that this is NOT a FileCookieJar, and there are no .load(), + .save() or .restore() methods. The database is in sync with the + cookiejar object's state after each public method call. + + Following Firefox's own behaviour, session cookies are never saved to + the database. + + The file is created, and an sqlite database written to it, if it does + not already exist. The moz_cookies database table is created if it does + not already exist. 
+ """ + + # XXX + # handle DatabaseError exceptions + # add a FileCookieJar (explicit .save() / .revert() / .load() methods) + + def __init__(self, filename, autoconnect=True, policy=None): + experimental("Firefox3CookieJar is experimental code") + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self._conn = None + if autoconnect: + self.connect() + + def connect(self): + import sqlite3 # not available in Python 2.4 stdlib + self._conn = sqlite3.connect(self.filename) + self._conn.isolation_level = "DEFERRED" + self._create_table_if_necessary() + + def close(self): + self._conn.close() + + def _transaction(self, func): + try: + cur = self._conn.cursor() + try: + result = func(cur) + finally: + cur.close() + except: + self._conn.rollback() + raise + else: + self._conn.commit() + return result + + def _execute(self, query, params=()): + return self._transaction(lambda cur: cur.execute(query, params)) + + def _query(self, query, params=()): + # XXX should we bother with a transaction? 
+ cur = self._conn.cursor() + try: + cur.execute(query, params) + return cur.fetchall() + finally: + cur.close() + + def _create_table_if_necessary(self): + self._execute("""\ +CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT, + value TEXT, host TEXT, path TEXT,expiry INTEGER, + lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""") + + def _cookie_from_row(self, row): + (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) = row + + version = 0 + domain = domain.encode("ascii", "ignore") + path = path.encode("ascii", "ignore") + name = name.encode("ascii", "ignore") + value = value.encode("ascii", "ignore") + secure = bool(secure) + + # last_accessed isn't a cookie attribute, so isn't added to rest + rest = {} + if http_only: + rest["HttpOnly"] = None + + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + domain_specified = initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + return Cookie(version, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + rest) + + def clear(self, domain=None, path=None, name=None): + CookieJar.clear(self, domain, path, name) + where_parts = [] + sql_params = [] + if domain is not None: + where_parts.append("host = ?") + sql_params.append(domain) + if path is not None: + where_parts.append("path = ?") + sql_params.append(path) + if name is not None: + where_parts.append("name = ?") + sql_params.append(name) + where = " AND ".join(where_parts) + if where: + where = " WHERE " + where + def clear(cur): + cur.execute("DELETE FROM moz_cookies%s" % where, + tuple(sql_params)) + self._transaction(clear) + + def _row_from_cookie(self, cookie, cur): + expires = cookie.expires + if cookie.discard: + expires = "" + + domain = unicode(cookie.domain) + path = unicode(cookie.path) + name = unicode(cookie.name) + value = unicode(cookie.value) 
+ secure = bool(int(cookie.secure)) + + if value is None: + value = name + name = "" + + last_accessed = int(time.time()) + http_only = cookie.has_nonstandard_attr("HttpOnly") + + query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""") + pk = query.fetchone()[0] + if pk is None: + pk = 1 + + return (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) + + def set_cookie(self, cookie): + if cookie.discard: + CookieJar.set_cookie(self, cookie) + return + + def set_cookie(cur): + # XXX + # is this RFC 2965-correct? + # could this do an UPDATE instead? + row = self._row_from_cookie(cookie, cur) + name, unused, domain, path = row[1:5] + cur.execute("""\ +DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""", + (domain, path, name)) + cur.execute("""\ +INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +""", row) + self._transaction(set_cookie) + + def __iter__(self): + # session (non-persistent) cookies + for cookie in MappingIterator(self._cookies): + yield cookie + # persistent cookies + for row in self._query("""\ +SELECT * FROM moz_cookies ORDER BY name, path, host"""): + yield self._cookie_from_row(row) + + def _cookies_for_request(self, request): + session_cookies = CookieJar._cookies_for_request(self, request) + def get_cookies(cur): + query = cur.execute("SELECT host from moz_cookies") + domains = [row[0] for row in query.fetchall()] + cookies = [] + for domain in domains: + cookies += self._persistent_cookies_for_domain(domain, + request, cur) + return cookies + persistent_coookies = self._transaction(get_cookies) + return session_cookies + persistent_coookies + + def _persistent_cookies_for_domain(self, domain, request, cur): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + query = cur.execute("""\ +SELECT * from moz_cookies WHERE host = ? 
ORDER BY path""", + (domain,)) + cookies = [self._cookie_from_row(row) for row in query.fetchall()] + last_path = None + r = [] + for cookie in cookies: + if (cookie.path != last_path and + not self._policy.path_return_ok(cookie.path, request)): + last_path = cookie.path + continue + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + r.append(cookie) + return r diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_form.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_form.py new file mode 100644 index 0000000000000000000000000000000000000000..d45bdfc395e266f14912d0934326dbbdf7a5a832 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_form.py @@ -0,0 +1,3280 @@ +"""HTML form handling for web clients. + +HTML form handling for web clients: useful for parsing HTML forms, filling them +in and returning the completed forms to the server. This code developed from a +port of Gisle Aas' Perl module HTML::Form, from the libwww-perl library, but +the interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2007 John J. Lee <jjl@pobox.com> +Copyright 2005 Gary Poster +Copyright 2005 Zope Corporation +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +# TODO: +# Clean up post the merge into mechanize +# * Remove code that was duplicated in ClientForm and mechanize +# * Remove weird import stuff +# * Remove pre-Python 2.4 compatibility cruft +# * Clean up tests +# * Later release: Remove the ClientForm 0.1 backwards-compatibility switch +# Remove parser testing hack +# Clean action URI +# Switch to unicode throughout +# See Wichert Akkerman's 2004-01-22 message to c.l.py. +# Apply recommendations from google code project CURLIES +# Apply recommendations from HTML 5 spec +# Add charset parameter to Content-type headers? How to find value?? +# Functional tests to add: +# Single and multiple file upload +# File upload with missing name (check standards) +# mailto: submission & enctype text/plain?? + +# Replace by_label etc. with moniker / selector concept. Allows, e.g., a +# choice between selection by value / id / label / element contents. Or +# choice between matching labels exactly or by substring. etc. + + +__all__ = ['AmbiguityError', 'CheckboxControl', 'Control', + 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm', + 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl', + 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label', + 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile', + 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl', + 'RadioControl', 'ScalarControl', 'SelectControl', + 'SubmitButtonControl', 'SubmitControl', 'TextControl', + 'TextareaControl', 'XHTMLCompatibleFormParser'] + +import HTMLParser +from cStringIO import StringIO +import inspect +import logging +import random +import re +import sys +import urllib +import urlparse +import warnings + +import _beautifulsoup +import _request + +# from Python itself, for backwards compatibility of raised exceptions +import sgmllib +# bundled copy of sgmllib +import _sgmllib_copy + + +VERSION = "0.2.11" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +DEFAULT_ENCODING = 
"latin-1" + +_logger = logging.getLogger("mechanize.forms") +OPTIMIZATION_HACK = True + +def debug(msg, *args, **kwds): + if OPTIMIZATION_HACK: + return + + caller_name = inspect.stack()[1][3] + extended_msg = '%%s %s' % msg + extended_args = (caller_name,)+args + _logger.debug(extended_msg, *extended_args, **kwds) + +def _show_debug_messages(): + global OPTIMIZATION_HACK + OPTIMIZATION_HACK = False + _logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.DEBUG) + _logger.addHandler(handler) + + +def deprecation(message, stack_offset=0): + warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset) + + +class Missing: pass + +_compress_re = re.compile(r"\s+") +def compress_text(text): return _compress_re.sub(" ", text.strip()) + +def normalize_line_endings(text): + return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text) + + +def unescape(data, entities, encoding=DEFAULT_ENCODING): + if data is None or "&" not in data: + return data + + def replace_entities(match, entities=entities, encoding=encoding): + ent = match.group() + if ent[1] == "#": + return unescape_charref(ent[2:-1], encoding) + + repl = entities.get(ent) + if repl is not None: + if type(repl) != type(""): + try: + repl = repl.encode(encoding) + except UnicodeError: + repl = ent + else: + repl = ent + + return repl + + return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) + +def unescape_charref(data, encoding): + name, base = data, 10 + if name.startswith("x"): + name, base= name[1:], 16 + uc = unichr(int(name, base)) + if encoding is None: + return uc + else: + try: + repl = uc.encode(encoding) + except UnicodeError: + repl = "&#%s;" % data + return repl + +def get_entitydefs(): + import htmlentitydefs + from codecs import latin_1_decode + entitydefs = {} + try: + htmlentitydefs.name2codepoint + except AttributeError: + entitydefs = {} + for name, char in htmlentitydefs.entitydefs.items(): + uc = latin_1_decode(char)[0] + if 
uc.startswith("&#") and uc.endswith(";"): + uc = unescape_charref(uc[2:-1], None) + entitydefs["&%s;" % name] = uc + else: + for name, codepoint in htmlentitydefs.name2codepoint.items(): + entitydefs["&%s;" % name] = unichr(codepoint) + return entitydefs + + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +def choose_boundary(): + """Return a string usable as a multipart boundary.""" + # follow IE and firefox + nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2]) + return "-"*27 + nonce + +# This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is a bit nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! 
+ + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + lines = value.split("\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = "".join(lines) + # 2.2 urllib2 doesn't normalize header case + self._http_hdrs.append((key.capitalize(), value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + lines[i].strip() + value = "\r\n".join(lines) + "\r\n" + line = key.title() + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-Type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class LocateError(ValueError): pass +class AmbiguityError(LocateError): pass +class ControlNotFoundError(LocateError): pass +class ItemNotFoundError(LocateError): pass + +class ItemCountError(ValueError): pass + +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +# TODO: move to _html +class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError): + + def __init__(self, *args, **kwds): + Exception.__init__(self, *args, **kwds) + + def __str__(self): + return Exception.__str__(self) + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # thanks to Moshe Zadka for an example of sgmllib/htmllib usage + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + if entitydefs is None: + entitydefs = get_entitydefs() + self._entitydefs = entitydefs + self._encoding = encoding + + self.base = None + self.forms = 
[] + self.labels = [] + self._current_label = None + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + # forms[0] will contain all controls that are outside of any form + # self._global_form is an alias for self.forms[0] + self._global_form = None + self.start_form([]) + self.end_form() + self._current_form = self._global_form = self.forms[0] + + def do_base(self, attrs): + debug("%s", attrs) + for key, value in attrs: + if key == "href": + self.base = self.unescape_attr_if_required(value) + + def end_body(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is not self._global_form: + self.end_form() + + def start_form(self, attrs): + debug("%s", attrs) + if self._current_form is not self._global_form: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = self.unescape_attr_if_required(value) + elif key == "action": + action = self.unescape_attr_if_required(value) + elif key == "method": + method = self.unescape_attr_if_required(value.upper()) + elif key == "enctype": + enctype = self.unescape_attr_if_required(value.lower()) + d[key] = self.unescape_attr_if_required(value) + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is self._global_form: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = self._global_form + + def start_select(self, attrs): + debug("%s", attrs) + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + 
self._select = d + self._add_label(d) + + self._append_select_control({"__select": d}) + + def end_select(self): + debug("") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._optgroup = d + + def end_optgroup(self): + debug("") + if self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + debug("") + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = self._option.get("contents", "").strip() + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + debug("%s", attrs) + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + debug("%s", attrs) + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select 
is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + self._add_label(d) + + self._textarea = d + + def end_textarea(self): + debug("") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def start_label(self, attrs): + debug("%s", attrs) + if self._current_label: + self.end_label() + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + taken = bool(d.get("for")) # empty id is invalid + d["__text"] = "" + d["__taken"] = taken + if taken: + self.labels.append(d) + self._current_label = d + + def end_label(self): + debug("") + label = self._current_label + if label is None: + # something is ugly in the HTML, but we're ignoring it + return + self._current_label = None + # if it is staying around, it is True in all cases + del label["__taken"] + + def _add_label(self, d): + #debug("%s", d) + if self._current_label is not None: + if not self._current_label["__taken"]: + self._current_label["__taken"] = True + d["__label"] = self._current_label + + def handle_data(self, data): + debug("%s", data) + + if self._option is not None: + # self._option is a dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + data = normalize_line_endings(data) + # not if within option or textarea + elif self._current_label is not None: + map = self._current_label + key = "__text" + else: + return + + if data and not map.has_key(key): + # according to + # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line 
break + # immediately after start tags or immediately before end tags must + # be ignored, but real browsers only ignore a line break after a + # start tag, so we'll do that. + if data[0:2] == "\r\n": + data = data[2:] + elif data[0:1] in ["\n", "\r"]: + data = data[1:] + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # e.g. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + self._add_label(d) + controls.append((type, name, d)) + + def do_input(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + self._add_label(d) + controls.append((type, name, d)) + + def do_isindex(self, attrs): + debug("%s", attrs) + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + self._add_label(d) + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = 
self.unescape_attr(val) + else: + # e.g. "__select" -- yuck! + escaped_attrs[key] = self.unescape_attrs(val) + return escaped_attrs + + def unknown_entityref(self, ref): self.handle_data("&%s;" % ref) + def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) + + +class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + HTMLParser.HTMLParser.feed(self, data) + except HTMLParser.HTMLParseError, exc: + raise ParseError(exc) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, "start_" + tag) + except AttributeError: + try: + method = getattr(self, "do_" + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, "end_" + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + def close(self): + HTMLParser.HTMLParser.close(self) + self.end_body() + + +class _AbstractSgmllibParser(_AbstractFormParser): + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + # we override this attr to decode hex charrefs + entity_or_charref = re.compile( + '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)') + def convert_entityref(self, name): + return unescape("&%s;" % name, self._entitydefs, self._encoding) + def convert_charref(self, name): + return unescape_charref("%s" % name, self._encoding) + def unescape_attr_if_required(self, name): + return name # sgmllib already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + +class FormParser(_AbstractSgmllibParser, _sgmllib_copy.SGMLParser): + """Good for tolerance of incorrect HTML, bad for XHTML.""" + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _sgmllib_copy.SGMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + _sgmllib_copy.SGMLParser.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + _sgmllib_copy.SGMLParser.close(self) + self.end_body() + + +class _AbstractBSFormParser(_AbstractSgmllibParser): + + bs_base_class = None + + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _AbstractFormParser.__init__(self, entitydefs, encoding) + self.bs_base_class.__init__(self) + + def handle_data(self, data): + _AbstractFormParser.handle_data(self, data) + self.bs_base_class.handle_data(self, data) + + def feed(self, data): + try: + self.bs_base_class.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + self.bs_base_class.close(self) + 
self.end_body() + + +class RobustFormParser(_AbstractBSFormParser, _beautifulsoup.BeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML.""" + + bs_base_class = _beautifulsoup.BeautifulSoup + + +class NestingRobustFormParser(_AbstractBSFormParser, + _beautifulsoup.ICantBelieveItsBeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML. + + Different from RobustFormParser in that it more often guesses nesting + above missing end tags (see BeautifulSoup docs). + """ + + bs_base_class = _beautifulsoup.ICantBelieveItsBeautifulSoup + + +#FormParser = XHTMLCompatibleFormParser # testing hack +#FormParser = RobustFormParser # testing hack + + +def ParseResponseEx(response, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseResponse, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(response, response.geturl(), + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseFileEx(file, base_uri, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseFile, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. 
The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(file, base_uri, + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseString(text, base_uri, *args, **kwds): + fh = StringIO(text) + return ParseFileEx(fh, base_uri, *args, **kwds) + +def ParseResponse(response, *args, **kwds): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of mechanize.urlopen can be conveniently passed to this + function as the response parameter. + + mechanize.ParseError is raised on parse errors. + + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + request_class: class to return from .click() method (default is + mechanize.Request) + entitydefs: mapping like {"&": "&", ...} containing HTML entity + definitions (a sensible default is used) + encoding: character encoding used for encoding numeric character references + when matching link text. mechanize does not attempt to find the encoding + in a META HTTP-EQUIV attribute in the document itself (mechanize, for + example, does do that and will pass the correct value to mechanize using + this parameter). + + backwards_compat: boolean that determines whether the returned HTMLForm + objects are backwards-compatible with old code. If backwards_compat is + true: + + - ClientForm 0.1 code will continue to work as before. + + - Label searches that do not specify a nr (number or count) will always + get the first match, even if other controls match. 
If + backwards_compat is False, label searches that have ambiguous results + will raise an AmbiguityError. + + - Item label matching is done by strict string comparison rather than + substring matching. + + - De-selecting individual list items is allowed even if the Item is + disabled. + + The backwards_compat argument will be removed in a future release. + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. mechanize.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, mechanize.FormParser (uses + bundled copy of sgmllib.SGMLParser) (the default) works better for ordinary + grubby HTML. Note that HTMLParser is only available in Python 2.2 and + later. You can pass your own class in here as a hack to work around bad + HTML, but at your own risk: there is no well-defined interface. + + """ + return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:] + +def ParseFile(file, base_uri, *args, **kwds): + """Parse HTML and return a list of HTMLForm instances. + + mechanize.ParseError is raised on parse errors. + + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. 
+ + """ + return _ParseFileEx(file, base_uri, *args, **kwds)[1:] + +def _ParseFileEx(file, base_uri, + select_default=False, + ignore_errors=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + backwards_compat=True, + encoding=DEFAULT_ENCODING, + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + if backwards_compat: + deprecation("operating in backwards-compatibility mode", 1) + fp = form_parser_class(entitydefs, encoding) + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + fp.close() + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + labels = [] # Label(label) for label in fp.labels] + id_to_labels = {} + for l in fp.labels: + label = Label(l) + labels.append(label) + for_id = l["for"] + coll = id_to_labels.get(for_id) + if coll is None: + id_to_labels[for_id] = [label] + else: + coll.append(label) + forms = [] + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = _urljoin(base_uri, action) + # would be nice to make HTMLForm class (form builder) pluggable + form = HTMLForm( + action, method, enctype, name, attrs, request_class, + forms, labels, id_to_labels, backwards_compat) + form._urlparse = _urlparse + form._urlunparse = _urlunparse + for ii in range(len(controls)): + type, name, attrs = controls[ii] + # index=ii*10 allows ImageControl to return multiple ordered pairs + form.new_control( + type, name, attrs, select_default=select_default, index=ii*10) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Label: + def __init__(self, attrs): + self.id = attrs.get("for") + self._text = attrs.get("__text").strip() + self._ctext = compress_text(self._text) + self.attrs = attrs + self._backwards_compat = False # maintained by 
    def __getattr__(self, name):
        # "text" resolves to one of the two stored renderings depending on
        # the form's backwards-compatibility mode; everything else falls
        # through to the class.
        if name == "text":
            if self._backwards_compat:
                return self._text
            else:
                return self._ctext
        return getattr(Label, name)

    def __setattr__(self, name, value):
        if name == "text":
            # don't see any need for this, so make it read-only
            raise AttributeError("text attribute is read-only")
        self.__dict__[name] = value

    def __str__(self):
        return "<Label(id=%r, text=%r)>" % (self.id, self.text)


def _get_label(attrs):
    # Return a Label built from the parser-injected "__label" pseudo-attribute,
    # or None when the element had no associated label text.
    text = attrs.get("__label")
    if text is not None:
        return Label(text)
    else:
        return None

class Control:
    """An HTML form control.

    An HTMLForm contains a sequence of Controls.  The Controls in an HTMLForm
    are accessed using the HTMLForm.find_control method or the
    HTMLForm.controls attribute.

    Control instances are usually constructed using the ParseFile /
    ParseResponse functions.  If you use those functions, you can ignore the
    rest of this paragraph.  A Control is only properly initialised after the
    fixup method has been called.  In fact, this is only strictly necessary for
    ListControl instances.  This is necessary because ListControls are built up
    from ListControls each containing only a single item, and their initial
    value(s) can only be known after the sequence is complete.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by 'greying out' a control.  If the disabled
    attribute is true, the Control will raise AttributeError if an attempt is
    made to change its value.  In addition, the control will not be considered
    'successful' as defined by the W3C HTML 4 standard -- ie. it will
    contribute no data to the return value of the HTMLForm.click* methods.  To
    enable a control, set the disabled attribute to a false value.

    If the readonly attribute is true, the Control will raise AttributeError if
    an attempt is made to change its value.  To make a control writable, set
    the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those that
    may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are raised:
    TypeError, AttributeError (if the value attribute should not be assigned
    to, because the control is disabled, for example) and ValueError.

    If the name or value attributes are None, or the value is an empty list, or
    if the control is disabled, the control is not successful.

    Public attributes:

    type: string describing type of control (see the keys of the
     HTMLForm.type2class dictionary for the allowable values) (readonly)
    name: name of control (readonly)
    value: current value of control (subclasses may allow a single value, a
     sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """
    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        self._form = form
        form.controls.append(self)

    def fixup(self):
        # no-op by default; ListControl overrides this to pick defaults once
        # all component items have been merged in
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    def __getattr__(self, name): raise NotImplementedError()
    def __setattr__(self, name, value): raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        return [(k, v) for (i, k, v) in self._totally_ordered_pairs()]

    def _totally_ordered_pairs(self):
        """Return list of (key, value, index) tuples.

        Like pairs, but allows preserving correct ordering even where several
        controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw, name, value):
        """Write data for a subitem of this control to a MimeWriter."""
        # called by HTMLForm
        mw2 = mw.nextpart()
        mw2.addheader("Content-Disposition",
                      'form-data; name="%s"' % name, 1)
        f = mw2.startbody(prefix=0)
        f.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        If the control was surrounded by a <label> tag, that will be the first
        label; all other labels, connected by 'for' and 'id', are in the order
        that appear in the HTML.

        """
        res = []
        if self._label:
            res.append(self._label)
        if self.id:
            res.extend(self._form._id_to_labels.get(self.id, ()))
        return res


#---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """
    def __init__(self, type, name, attrs, index=None):
        self._index = index
        self._label = _get_label(attrs)
        # write through __dict__ to bypass this class's own __setattr__
        # guards on the read-only "type" and "name" attributes
        self.__dict__["type"] = type.lower()
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        self.disabled = attrs.has_key("disabled")
        self.readonly = attrs.has_key("readonly")
        self.id = attrs.get("id")

        self.attrs = attrs.copy()

        self._clicked = False

        self._urlparse = urlparse.urlparse
        self._urlunparse = urlparse.urlunparse

    def __getattr__(self, name):
        if name == "value":
            return self.__dict__["_value"]
        else:
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))

    def __setattr__(self, name, value):
        if name == "value":
            if not isstringlike(value):
                raise TypeError("must assign a string")
            elif self.readonly:
                raise AttributeError("control '%s' is readonly" % self.name)
            elif self.disabled:
                raise AttributeError("control '%s' is disabled" % self.name)
            self.__dict__["_value"] = value
        elif name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        else:
            self.__dict__[name] = value

    def _totally_ordered_pairs(self):
        name = self.name
        value = self.value
        # unnamed, valueless or disabled controls are not "successful"
        if name is None or value is None or self.disabled:
            return []
        return [(self._index, name, value)]

    def clear(self):
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        name = self.name
        value = self.value
        if name is None: name = "<None>"
        if value is None: value = "<None>"

        infos = []
        if self.disabled: infos.append("disabled")
        if self.readonly: infos.append("readonly")
        info = ", ".join(infos)
        if info: info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)

#---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/HIDDEN
    TEXTAREA

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # hidden fields are not user-editable, so mark them readonly
        if self.type == "hidden": self.readonly = True
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind): return kind == "text"

#---------------------------------------------------
class FileControl(ScalarControl):
    """File upload with INPUT TYPE=FILE.

    The value attribute of a FileControl is always None.  Use add_file instead.

    Additional public method: add_file

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        self._value = None
        # list of (file_object, content_type, filename) tuples, in add order
        self._upload_data = []

    def is_of_kind(self, kind): return kind == "file"

    def clear(self):
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self._upload_data = []

    def __setattr__(self, name, value):
        if name in ("value", "name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        else:
            self.__dict__[name] = value

    def add_file(self, file_object, content_type=None, filename=None):
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if content_type is not None and not isstringlike(content_type):
            raise TypeError("content type must be None or string-like")
        if filename is not None and not isstringlike(filename):
            raise TypeError("filename must be None or string-like")
        if content_type is None:
            content_type = "application/octet-stream"
        self._upload_data.append((file_object, content_type, filename))

    def _totally_ordered_pairs(self):
        # XXX should it be successful even if unnamed?
        if self.name is None or self.disabled:
            return []
        return [(self._index, self.name, "")]

    # If enctype is application/x-www-form-urlencoded and there's a FILE
    # control present, what should be sent?  Strictly, it should be 'name=data'
    # (see HTML 4.01 spec., section 17.13.2), but code sends "name=" ATM.  What
    # about multiple file upload?
    def _write_mime_data(self, mw, _name, _value):
        # called by HTMLForm
        # assert _name == self.name and _value == ''
        if len(self._upload_data) < 2:
            # zero or one file: emit a single form-data part; with no file
            # added, an empty octet-stream body is sent
            if len(self._upload_data) == 0:
                file_object = StringIO()
                content_type = "application/octet-stream"
                filename = ""
            else:
                file_object, content_type, filename = self._upload_data[0]
                if filename is None:
                    filename = ""
            mw2 = mw.nextpart()
            fn_part = '; filename="%s"' % filename
            disp = 'form-data; name="%s"%s' % (self.name, fn_part)
            mw2.addheader("Content-Disposition", disp, prefix=1)
            fh = mw2.startbody(content_type, prefix=0)
            fh.write(file_object.read())
        else:
            # multiple files: nest them inside a multipart/mixed body
            mw2 = mw.nextpart()
            disp = 'form-data; name="%s"' % self.name
            mw2.addheader("Content-Disposition", disp, prefix=1)
            fh = mw2.startmultipartbody("mixed", prefix=0)
            for file_object, content_type, filename in self._upload_data:
                mw3 = mw2.nextpart()
                if filename is None:
                    filename = ""
                fn_part = '; filename="%s"' % filename
                disp = "file%s" % fn_part
                mw3.addheader("Content-Disposition", disp, prefix=1)
                fh2 = mw3.startbody(content_type, prefix=0)
                fh2.write(file_object.read())
            mw2.lastpart()

    def __str__(self):
        name = self.name
        if name is None: name = "<None>"

        if not self._upload_data:
            value = "<No files added>"
        else:
            value = []
            for file, ctype, filename in self._upload_data:
                if filename is None:
                    value.append("<Unnamed file>")
                else:
                    value.append(filename)
            value = ", ".join(value)

        info = []
        if self.disabled: info.append("disabled")
        if self.readonly: info.append("readonly")
        info = ", ".join(info)
        if info: info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)


#---------------------------------------------------
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't really
    part of regular HTML forms at all, and predates it.  You're only allowed
    one ISINDEX per HTML document.  ISINDEX and regular form submission are
    mutually exclusive -- either submit a form, or the ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances it returns.  You can set the ISINDEX's value, as with
    any other control (but note that ISINDEX controls have no name, so you'll
    need to use the type argument of set_value!).  When you submit the form,
    the ISINDEX will not be successful (ie., no data will get returned to the
    server as a result of its presence), unless you click on the ISINDEX
    control, in which case the ISINDEX gets submitted instead of the form:

    form.set_value("my isindex value", type="isindex")
    mechanize.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit one
    by hand, do it like so:

    url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
    result = mechanize.urlopen(url)

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind): return kind in ["text", "clickable"]

    def _totally_ordered_pairs(self):
        # never successful as part of ordinary form submission; see _click
        return []

    def _click(self, form, coord, return_type, request_class=_request.Request):
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        parts = self._urlparse(form.action)
        rest, (query, frag) = parts[:-2], parts[-2:]
        parts = rest + (urllib.quote_plus(self.value), None)
        url = self._urlunparse(parts)
        req_data = url, None, []

        if return_type == "pairs":
            return []
        elif return_type == "request_data":
            return req_data
        else:
            return request_class(url)

    def __str__(self):
        value = self.value
        if value is None: value = "<None>"

        infos = []
        if self.disabled: infos.append("disabled")
        if self.readonly: infos.append("readonly")
        info = ", ".join(infos)
        if info: info = " (%s)" % info

        return "<%s(%s)%s>" % (self.__class__.__name__, value, info)


#---------------------------------------------------
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    These controls are always unsuccessful, in the terminology of HTML 4 (ie.
    they never require any information to be returned to the server).

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.
+ + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + self._value = None + + def is_of_kind(self, kind): return False + + def __setattr__(self, name, value): + if name == "value": + raise AttributeError( + "control '%s' is ignored, hence read-only" % self.name) + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + +#--------------------------------------------------- +# ListControls + +# helpers and subsidiary classes + +class Item: + def __init__(self, control, attrs, index=None): + label = _get_label(attrs) + self.__dict__.update({ + "name": attrs["value"], + "_labels": label and [label] or [], + "attrs": attrs, + "_control": control, + "disabled": attrs.has_key("disabled"), + "_selected": False, + "id": attrs.get("id"), + "_index": index, + }) + control.items.append(self) + + def get_labels(self): + """Return all labels (Label instances) for this item. + + For items that represent radio buttons or checkboxes, if the item was + surrounded by a <label> tag, that will be the first label; all other + labels, connected by 'for' and 'id', are in the order that appear in + the HTML. + + For items that represent select options, if the option had a label + attribute, that will be the first label. If the option has contents + (text within the option tags) and it is not the same as the label + attribute (if any), that will be a label. There is nothing in the + spec to my knowledge that makes an option with an id unable to be the + target of a label's for attribute, so those are included, if any, for + the sake of consistency and completeness. 
+ + """ + res = [] + res.extend(self._labels) + if self.id: + res.extend(self._control._form._id_to_labels.get(self.id, ())) + return res + + def __getattr__(self, name): + if name=="selected": + return self._selected + raise AttributeError(name) + + def __setattr__(self, name, value): + if name == "selected": + self._control._set_selected_state(self, value) + elif name == "disabled": + self.__dict__["disabled"] = bool(value) + else: + raise AttributeError(name) + + def __str__(self): + res = self.name + if self.selected: + res = "*" + res + if self.disabled: + res = "(%s)" % res + return res + + def __repr__(self): + # XXX appending the attrs without distinguishing them from name and id + # is silly + attrs = [("name", self.name), ("id", self.id)]+self.attrs.items() + return "<%s %s>" % ( + self.__class__.__name__, + " ".join(["%s=%r" % (k, v) for k, v in attrs]) + ) + +def disambiguate(items, nr, **kwds): + msgs = [] + for key, value in kwds.items(): + msgs.append("%s=%r" % (key, value)) + msg = " ".join(msgs) + if not items: + raise ItemNotFoundError(msg) + if nr is None: + if len(items) > 1: + raise AmbiguityError(msg) + nr = 0 + if len(items) <= nr: + raise ItemNotFoundError(msg) + return items[nr] + +class ListControl(Control): + """Control representing a sequence of items. + + The value attribute of a ListControl represents the successful list items + in the control. The successful list items are those that are selected and + not disabled. + + ListControl implements both list controls that take a length-1 value + (single-selection) and those that take length >1 values + (multiple-selection). + + ListControls accept sequence values only. Some controls only accept + sequences of length 0 or 1 (RADIO, and single-selection SELECT). + In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. 
+ + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the label argument to .get(), and the .set_value_by_label(), + .get_value_by_label() methods. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + """ + + # ListControls are built up by the parser from their component items by + # creating one ListControl per item, consolidating them into a single + # master ListControl held by the HTMLForm: + + # -User calls form.new_control(...) + # -Form creates Control, and calls control.add_to_form(self). + # -Control looks for a Control with the same name and type in the form, + # and if it finds one, merges itself with that control by calling + # control.merge_control(self). The first Control added to the form, of + # a particular name and type, is the only one that survives in the + # form. + # -Form calls control.fixup for all its controls. ListControls in the + # form know they can now safely pick their default values. 
+ + # To create a ListControl without an HTMLForm, use: + + # control.merge_control(new_control) + + # (actually, it's much easier just to use ParseFile) + + _label = None + + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False, index=None): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = type.lower() + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + self._closed = False + + # As Controls are merged in with .merge_control(), self.attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + self.items = [] + self._form = None + + self._select_default = select_default + self._clicked = False + + def clear(self): + self.value = [] + + def is_of_kind(self, kind): + if kind == "list": + return True + elif kind == "multilist": + return bool(self.multiple) + elif kind == "singlelist": + return not self.multiple + else: + return False + + def get_items(self, name=None, label=None, id=None, + exclude_disabled=False): + """Return matching items by name or label. 
+ + For argument docs, see the docstring for .get() + + """ + if name is not None and not isstringlike(name): + raise TypeError("item name must be string-like") + if label is not None and not isstringlike(label): + raise TypeError("item label must be string-like") + if id is not None and not isstringlike(id): + raise TypeError("item id must be string-like") + items = [] # order is important + compat = self._form.backwards_compat + for o in self.items: + if exclude_disabled and o.disabled: + continue + if name is not None and o.name != name: + continue + if label is not None: + for l in o.get_labels(): + if ((compat and l.text == label) or + (not compat and l.text.find(label) > -1)): + break + else: + continue + if id is not None and o.id != id: + continue + items.append(o) + return items + + def get(self, name=None, label=None, id=None, nr=None, + exclude_disabled=False): + """Return item by name or label, disambiguating if necessary with nr. + + All arguments must be passed by name, with the exception of 'name', + which may be used as a positional argument. + + If name is specified, then the item must have the indicated name. + + If label is specified, then the item must have a label whose + whitespace-compressed, stripped, text substring-matches the indicated + label string (e.g. label="please choose" will match + " Do please choose an item "). + + If id is specified, then the item must have the indicated id. + + nr is an optional 0-based index of the items matching the query. + + If nr is the default None value and more than item is found, raises + AmbiguityError (unless the HTMLForm instance's backwards_compat + attribute is true). + + If no item is found, or if items are found but nr is specified and not + found, raises ItemNotFoundError. + + Optionally excludes disabled items. 
+ + """ + if nr is None and self._form.backwards_compat: + nr = 0 # :-/ + items = self.get_items(name, label, id, exclude_disabled) + return disambiguate(items, nr, name=name, label=label, id=id) + + def _get(self, name, by_label=False, nr=None, exclude_disabled=False): + # strictly for use by deprecated methods + if by_label: + name, label = None, name + else: + name, label = name, None + return self.get(name, label, nr, exclude_disabled) + + def toggle(self, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, toggle the matching item's selection. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. + + """ + deprecation( + "item = control.get(...); item.selected = not item.selected") + o = self._get(name, by_label, nr) + self._set_selected_state(o, not o.selected) + + def set(self, selected, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, set the matching item's selection to the bool value of selected. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. 
+ + """ + deprecation( + "control.get(...).selected = <boolean>") + self._set_selected_state(self._get(name, by_label, nr), selected) + + def _set_selected_state(self, item, action): + # action: + # bool False: off + # bool True: on + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + action == bool(action) + compat = self._form.backwards_compat + if not compat and item.disabled: + raise AttributeError("item is disabled") + else: + if compat and item.disabled and action: + raise AttributeError("item is disabled") + if self.multiple: + item.__dict__["_selected"] = action + else: + if not action: + item.__dict__["_selected"] = False + else: + for o in self.items: + o.__dict__["_selected"] = False + item.__dict__["_selected"] = True + + def toggle_single(self, by_label=None): + """Deprecated: toggle the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. + + """ + deprecation( + "control.items[0].selected = not control.items[0].selected") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + item = self.items[0] + self._set_selected_state(item, not item.selected) + + def set_single(self, selected, by_label=None): + """Deprecated: set the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. 
+ + """ + deprecation( + "control.items[0].selected = <boolean>") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + self._set_selected_state(self.items[0], selected) + + def get_item_disabled(self, name, by_label=False, nr=None): + """Get disabled state of named list item in a ListControl.""" + deprecation( + "control.get(...).disabled") + return self._get(name, by_label, nr).disabled + + def set_item_disabled(self, disabled, name, by_label=False, nr=None): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + deprecation( + "control.get(...).disabled = <boolean>") + self._get(name, by_label, nr).disabled = disabled + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for o in self.items: + o.disabled = disabled + + def get_item_attrs(self, name, by_label=False, nr=None): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. 
+ + """ + deprecation( + "control.get(...).attrs") + return self._get(name, by_label, nr).attrs + + def close_control(self): + self._closed = True + + def add_to_form(self, form): + assert self._form is None or form == self._form, ( + "can't add control to more than one form") + self._form = form + if self.name is None: + # always count nameless elements as separate controls + Control.add_to_form(self, form) + else: + for ii in range(len(form.controls)-1, -1, -1): + control = form.controls[ii] + if control.name == self.name and control.type == self.type: + if control._closed: + Control.add_to_form(self, form) + else: + control.merge_control(self) + break + else: + Control.add_to_form(self, form) + + def merge_control(self, control): + assert bool(control.multiple) == bool(self.multiple) + # usually, isinstance(control, self.__class__) + self.items.extend(control.items) + + def fixup(self): + """ + ListControls are built up from component list items (which are also + ListControls) during parsing. This method should be called after all + items have been added. See ListControl.__doc__ for the reason this is + required. + + """ + # Need to set default selection where no item was indicated as being + # selected by the HTML: + + # CHECKBOX: + # Nothing should be selected. + # SELECT/single, SELECT/multiple and RADIO: + # RFC 1866 (HTML 2.0): says first item should be selected. + # W3C HTML 4.01 Specification: says that client behaviour is + # undefined in this case. For RADIO, exactly one must be selected, + # though which one is undefined. + # Both Netscape and Microsoft Internet Explorer (IE) choose first + # item for SELECT/single. However, both IE5 and Mozilla (both 1.0 + # and Firebird 0.6) leave all items unselected for RADIO and + # SELECT/multiple. + + # Since both Netscape and IE all choose the first item for + # SELECT/single, we do the same. 
OTOH, both Netscape and IE + # leave SELECT/multiple with nothing selected, in violation of RFC 1866 + # (but not in violation of the W3C HTML 4 standard); the same is true + # of RADIO (which *is* in violation of the HTML 4 standard). We follow + # RFC 1866 if the _select_default attribute is set, and Netscape and IE + # otherwise. RFC 1866 and HTML 4 are always violated insofar as you + # can deselect all items in a RadioControl. + + for o in self.items: + # set items' controls to self, now that we've merged + o.__dict__["_control"] = self + + def __getattr__(self, name): + if name == "value": + compat = self._form.backwards_compat + if self.name is None: + return [] + return [o.name for o in self.items if o.selected and + (not o.disabled or compat)] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._set_value(value) + elif name in ("name", "type", "multiple"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _set_value(self, value): + if value is None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + if not value: + compat = self._form.backwards_compat + for o in self.items: + if not o.disabled or compat: + o.selected = False + elif self.multiple: + self._multiple_set_value(value) + elif len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + else: + self._single_set_value(value) + + def _get_items(self, name, target=1): + all_items = self.get_items(name) + items = [o for o in all_items if not o.disabled] + if len(items) < target: + if len(all_items) < target: + raise ItemNotFoundError( + "insufficient items with name %r" % name) + else: + 
raise AttributeError( + "insufficient non-disabled items with name %s" % name) + on = [] + off = [] + for o in items: + if o.selected: + on.append(o) + else: + off.append(o) + return on, off + + def _single_set_value(self, value): + assert len(value) == 1 + on, off = self._get_items(value[0]) + assert len(on) <= 1 + if not on: + off[0].selected = True + + def _multiple_set_value(self, value): + compat = self._form.backwards_compat + turn_on = [] # transactional-ish + turn_off = [item for item in self.items if + item.selected and (not item.disabled or compat)] + names = {} + for nn in value: + if nn in names.keys(): + names[nn] += 1 + else: + names[nn] = 1 + for name, count in names.items(): + on, off = self._get_items(name, count) + for i in range(count): + if on: + item = on[0] + del on[0] + del turn_off[turn_off.index(item)] + else: + item = off[0] + del off[0] + turn_on.append(item) + for item in turn_off: + item.selected = False + for item in turn_on: + item.selected = True + + def set_value_by_label(self, value): + """Set the value of control by item labels. + + value is expected to be an iterable of strings that are substrings of + the item labels that should be selected. Before substring matching is + performed, the original label text is whitespace-compressed + (consecutive whitespace characters are converted to a single space + character) and leading and trailing whitespace is stripped. Ambiguous + labels are accepted without complaint if the form's backwards_compat is + True; otherwise, it will not complain as long as all ambiguous labels + share the same item name (e.g. OPTION value). 
+ + """ + if isstringlike(value): + raise TypeError(value) + if not self.multiple and len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + items = [] + for nn in value: + found = self.get_items(label=nn) + if len(found) > 1: + if not self._form.backwards_compat: + # ambiguous labels are fine as long as item names (e.g. + # OPTION values) are same + opt_name = found[0].name + if [o for o in found[1:] if o.name != opt_name]: + raise AmbiguityError(nn) + else: + # OK, we'll guess :-( Assume first available item. + found = found[:1] + for o in found: + # For the multiple-item case, we could try to be smarter, + # saving them up and trying to resolve, but that's too much. + if self._form.backwards_compat or o not in items: + items.append(o) + break + else: # all of them are used + raise ItemNotFoundError(nn) + # now we have all the items that should be on + # let's just turn everything off and then back on. + self.value = [] + for o in items: + o.selected = True + + def get_value_by_label(self): + """Return the value of the control as given by normalized labels.""" + res = [] + compat = self._form.backwards_compat + for o in self.items: + if (not o.disabled or compat) and o.selected: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + + def possible_items(self, by_label=False): + """Deprecated: return the names or labels of all possible items. + + Includes disabled items, which may be misleading for some use cases. 
+ + """ + deprecation( + "[item.name for item in self.items]") + if by_label: + res = [] + for o in self.items: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + return [o.name for o in self.items] + + def _totally_ordered_pairs(self): + if self.disabled or self.name is None: + return [] + else: + return [(o._index, self.name, o.name) for o in self.items + if o.selected and not o.disabled] + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + display = [str(o) for o in self.items] + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, ", ".join(display), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False, index=None): + attrs.setdefault("value", "on") + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True, index=index) + self.__dict__["multiple"] = False + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("checked") + + def fixup(self): + ListControl.fixup(self) + found = [o for o in self.items if o.selected and not o.disabled] + if not found: + if self._select_default: + for o in self.items: + if not o.disabled: + o.selected = True + break + else: + # Ensure only one item selected. Choose the last one, + # following IE and Firefox. 
            for o in found[:-1]:
                o.selected = False

    def get_labels(self):
        # the radio *items* carry the labels; the control as a whole has none
        return []

class CheckboxControl(ListControl):
    """
    Covers:

    INPUT/CHECKBOX

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # browsers default a value-less checkbox's value to "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        # checkboxes may have any number of items selected at once
        self.__dict__["multiple"] = True
        o = Item(self, attrs, index)
        o.__dict__["_selected"] = attrs.has_key("checked")

    def get_labels(self):
        # the checkbox *items* carry the labels; the control itself has none
        return []


class SelectControl(ListControl):
    """
    Covers:

    SELECT (and OPTION)


    OPTION 'values', in HTML parlance, are Item 'names' in mechanize parlance.

    SELECT control values and labels are subject to some messy defaulting
    rules.  For example, if the HTML representation of the control is:

    <SELECT name=year>
      <OPTION value=0 label="2002">current year</OPTION>
      <OPTION value=1>2001</OPTION>
      <OPTION>2000</OPTION>
    </SELECT>

    The items, in order, have labels "2002", "2001" and "2000", whereas their
    names (the OPTION values) are "0", "1" and "2000" respectively.  Note that
    the value of the last OPTION in this example defaults to its contents, as
    specified by RFC 1866, as do the labels of the second and third OPTIONs.

    The OPTION labels are sometimes more meaningful than the OPTION values,
    which can make for more maintainable code.

    Additional read-only public attribute: attrs

    The attrs attribute is a dictionary of the original HTML attributes of the
    SELECT element.  Other ListControls do not have this attribute, because in
    other cases the control as a whole does not correspond to any single HTML
    element.  control.get(...).attrs may be used as usual to get at the HTML
    attributes of the HTML elements corresponding to individual list items
    (for SELECT controls, these are OPTION elements).

    Another special case is that the Item.attrs dictionaries have a special
    key "contents" which does not correspond to any real HTML attribute, but
    rather contains the contents of the OPTION element:

    <OPTION>this bit</OPTION>

    """
    # HTML attributes here are treated slightly differently from other list
    # controls:
    # -The SELECT HTML attributes dictionary is stuffed into the OPTION
    #  HTML attributes dictionary under the "__select" key.
    # -The content of each OPTION element is stored under the special
    #  "contents" key of the dictionary.
    # After all this, the dictionary is passed to the SelectControl
    # constructor as the attrs argument, as usual.  However:
    # -The first SelectControl constructed when building up a SELECT control
    #  has a constructor attrs argument containing only the __select key -- so
    #  this SelectControl represents an empty SELECT control.
    # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
    #  the __select dictionary containing the SELECT HTML-attributes.

    def __init__(self, type, name, attrs, select_default=False, index=None):
        # fish out the SELECT HTML attributes from the OPTION HTML attributes
        # dictionary
        self.attrs = attrs["__select"].copy()
        self.__dict__["_label"] = _get_label(self.attrs)
        self.__dict__["id"] = self.attrs.get("id")
        self.__dict__["multiple"] = self.attrs.has_key("multiple")
        # the majority of the contents, label, and value dance already happened
        contents = attrs.get("contents")
        attrs = attrs.copy()
        del attrs["__select"]

        ListControl.__init__(self, type, name, self.attrs, select_default,
                             called_as_base_class=True, index=index)
        self.disabled = self.attrs.has_key("disabled")
        self.readonly = self.attrs.has_key("readonly")
        if attrs.has_key("value"):
            # otherwise it is a marker 'select started' token
            o = Item(self, attrs, index)
            o.__dict__["_selected"] = attrs.has_key("selected")
            # add 'label' label and contents label, if different.  If both are
            # provided, the 'label' label is used for display in HTML
            # 4.0-compliant browsers (and any lower spec? not sure) while the
            # contents are used for display in older or less-compliant
            # browsers.  We make label objects for both, if the values are
            # different.
            label = attrs.get("label")
            if label:
                o._labels.append(Label({"__text": label}))
                if contents and contents != label:
                    o._labels.append(Label({"__text": contents}))
            elif contents:
                o._labels.append(Label({"__text": contents}))

    def fixup(self):
        ListControl.fixup(self)
        # Firefox doesn't exclude disabled items from those considered here
        # (i.e. from 'found', for both branches of the if below).  Note that
        # IE6 doesn't support the disabled attribute on OPTIONs at all.
        found = [o for o in self.items if o.selected]
        if not found:
            if not self.multiple or self._select_default:
                for o in self.items:
                    if not o.disabled:
                        # temporarily clear the control-level disabled flag so
                        # the Item.selected setter does not refuse the change
                        was_disabled = self.disabled
                        self.disabled = False
                        try:
                            o.selected = True
                        finally:
                            # NOTE(review): saves self.disabled but restores
                            # into o.disabled -- looks asymmetric; confirm
                            # against upstream mechanize before changing
                            o.disabled = was_disabled
                        break
        elif not self.multiple:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for o in found[:-1]:
                o.selected = False


#---------------------------------------------------
class SubmitControl(ScalarControl):
    """
    Covers:

    INPUT/SUBMIT
    BUTTON/SUBMIT

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
        # blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't seem
        # to define this.
        if self.value is None: self.value = ""
        # submit buttons are not user-editable
        self.readonly = True

    def get_labels(self):
        # the button's value doubles as its label, ahead of any HTML labels
        res = []
        if self.value:
            res.append(Label({"__text": self.value}))
        res.extend(ScalarControl.get_labels(self))
        return res

    def is_of_kind(self, kind): return kind == "clickable"

    def _click(self, form, coord, return_type, request_class=_request.Request):
        # mark this control as the clicked one only for the duration of the
        # click, so _totally_ordered_pairs includes it in the submission
        self._clicked = coord
        r = form._switch_click(return_type, request_class)
        self._clicked = False
        return r

    def _totally_ordered_pairs(self):
        # only the clicked submit button is "successful"
        if not self._clicked:
            return []
        return ScalarControl._totally_ordered_pairs(self)


#---------------------------------------------------
class ImageControl(SubmitControl):
    """
    Covers:

    INPUT/IMAGE

    Coordinates are specified using one of the HTMLForm.click* methods.

    """
    def __init__(self, type, name, attrs, index=None):
        SubmitControl.__init__(self, type, name, attrs, index)
        self.readonly = False

    def _totally_ordered_pairs(self):
        clicked = self._clicked
        if self.disabled or not clicked:
            return []
        name = self.name
        if name is None: return []
        # image inputs submit the click coordinates as name.x / name.y
        pairs = [
            (self._index, "%s.x" % name, str(clicked[0])),
            (self._index+1, "%s.y" % name, str(clicked[1])),
            ]
        value = self._value
        if value:
            pairs.append((self._index+2, name, value))
        return pairs

    get_labels = ScalarControl.get_labels

# aliases, just to make str(control) and str(form) clearer
class PasswordControl(TextControl): pass
class HiddenControl(TextControl): pass
class TextareaControl(TextControl): pass
class SubmitButtonControl(SubmitControl): pass


def is_listcontrol(control): return control.is_of_kind("list")


class HTMLForm:
    """Represents a single HTML <form> ... </form> element.

    A form consists of a sequence of controls that usually have names, and
    which can take on various values.  The values of the various types of
    controls represent variously: text, zero-or-one-of-many or many-of-many
    choices, and files to be uploaded.
    Some controls can be clicked on to
    submit the form, and clickable controls' values sometimes include the
    coordinates of the click.

    Forms can be filled in with data to be returned to the server, and then
    submitted, using the click method to generate a request object suitable
    for passing to mechanize.urlopen (or the click_request_data or click_pairs
    methods for integration with third-party code).

    import mechanize
    forms = mechanize.ParseFile(html, base_uri)
    form = forms[0]

    form["query"] = "Python"
    form.find_control("nr_results").get("lots").selected = True

    response = mechanize.urlopen(form.click())

    Usually, HTMLForm instances are not created directly.  Instead, the
    ParseFile or ParseResponse factory functions are used.  If you do
    construct HTMLForm objects yourself, however, note that an HTMLForm
    instance is only properly initialised after the fixup method has been
    called (ParseFile and ParseResponse do this for you).  See
    ListControl.__doc__ for the reason this is required.

    Indexing a form (form["control_name"]) returns the named Control's value
    attribute.  Assignment to a form index (form["control_name"] = something)
    is equivalent to assignment to the named Control's value attribute.  If
    you need to be more specific than just supplying the control's name, use
    the set_value and get_value methods.

    ListControl values are lists of item names (specifically, the names of the
    items that are selected and not disabled, and hence are "successful" --
    ie. cause data to be returned to the server).  The list item's name is the
    value of the corresponding HTML element's "value" attribute.

    Example:

      <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
      <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>

    defines a CHECKBOX control with name "cheeses" which has two items, named
    "leicester" and "cheddar".

    Another example:

      <SELECT name="more_cheeses">
        <OPTION>1</OPTION>
        <OPTION value="2" label="CHEDDAR">cheddar</OPTION>
      </SELECT>

    defines a SELECT control with name "more_cheeses" which has two items,
    named "1" and "2" (because the OPTION element's value HTML attribute
    defaults to the element contents -- see SelectControl.__doc__ for more on
    these defaulting rules).

    To select, deselect or otherwise manipulate individual list items, use the
    HTMLForm.find_control() and ListControl.get() methods.  To set the whole
    value, do as for any other control: use indexing or the set_/get_value
    methods.

    Example:

    # select *only* the item named "cheddar"
    form["cheeses"] = ["cheddar"]
    # select "cheddar", leave other items unaffected
    form.find_control("cheeses").get("cheddar").selected = True

    Some controls (RADIO and SELECT without the multiple attribute) can only
    have zero or one items selected at a time.  Some controls (CHECKBOX and
    SELECT with the multiple attribute) can have multiple items selected at a
    time.  To set the whole value of a ListControl, assign a sequence to a
    form index:

    form["cheeses"] = ["cheddar", "leicester"]

    If the ListControl is not multiple-selection, the assigned list must be of
    length one.

    To check if a control has an item, if an item is selected, or if an item
    is successful (selected and not disabled), respectively:

    "cheddar" in [item.name for item in form.find_control("cheeses").items]
    "cheddar" in [item.name for item in form.find_control("cheeses").items and
                  item.selected]
    "cheddar" in form["cheeses"]  # (or "cheddar" in form.get_value("cheeses"))

    Note that some list items may be disabled (see below).

    Note the following mistake:

    form[control_name] = control_value
    assert form[control_name] == control_value  # not necessarily true

    The reason for this is that form[control_name] always gives the list items
    in the order they were listed in the HTML.

    List items (hence list values, too) can be referred to in terms of list
    item labels rather than list item names using the appropriate label
    arguments.  Note that each item may have several labels.

    The question of default values of OPTION contents, labels and values is
    somewhat complicated: see SelectControl.__doc__ and
    ListControl.get_item_attrs.__doc__ if you think you need to know.

    Controls can be disabled or readonly.  In either case, the control's value
    cannot be changed until you clear those flags (see example below).
    Disabled is the state typically represented by browsers by 'greying out'
    a control.  Disabled controls are not 'successful' -- they don't cause
    data to get returned to the server.  Readonly controls usually appear in
    browsers as read-only text boxes.  Readonly controls are successful.
    List items can also be disabled.  Attempts to select or deselect disabled
    items fail with AttributeError.

    If a lot of controls are readonly, it can be useful to do this:

    form.set_all_readonly(False)

    To clear a control's value attribute, so that it is not successful (until
    a value is subsequently set):

    form.clear("cheeses")

    More examples:

    control = form.find_control("cheeses")
    control.disabled = False
    control.readonly = False
    control.get("gruyere").disabled = True
    control.items[0].selected = True

    See the various Control classes for further documentation.  Many methods
    take name, type, kind, id, label and nr arguments to specify the control
    to be operated on: see HTMLForm.find_control.__doc__.

    ControlNotFoundError (subclass of ValueError) is raised if the specified
    control can't be found.  This includes occasions where a non-ListControl
    is found, but the method (set, for example) requires a ListControl.
    ItemNotFoundError (subclass of ValueError) is raised if a list item can't
    be found.  ItemCountError (subclass of ValueError) is raised if an attempt
    is made to select more than one item and the control doesn't allow that,
    or set/get_single are called and the control contains more than one item.
    AttributeError is raised if a control or item is readonly or disabled and
    an attempt is made to alter its value.

    Security note: Remember that any passwords you store in HTMLForm
    instances will be saved to disk in the clear if you pickle them (directly
    or indirectly).  The simplest solution to this is to avoid pickling
    HTMLForm objects.  You could also pickle before filling in any password,
    or just set the password to "" before pickling.


    Public attributes:

    action: full (absolute URI) form action
    method: "GET" or "POST"
    enctype: form transfer encoding MIME type
    name: name of form (None if no name was specified)
    attrs: dictionary mapping original HTML form attributes to their values

    controls: list of Control instances; do not alter this list
     (instead, call form.new_control to make a Control and add it to the
     form, or control.add_to_form if you already have a Control instance)



    Methods for form filling:
    -------------------------

    Most of the these methods have very similar arguments.  See
    HTMLForm.find_control.__doc__ for details of the name, type, kind, label
    and nr arguments.

    def find_control(self,
                     name=None, type=None, kind=None, id=None, predicate=None,
                     nr=None, label=None)

    get_value(name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None)
    set_value(value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None)

    clear_all()
    clear(name=None, type=None, kind=None, id=None, nr=None, label=None)

    set_all_readonly(readonly)


    Method applying only to FileControls:

    add_file(file_object,
             content_type="application/octet-stream", filename=None,
             name=None, id=None, nr=None, label=None)


    Methods applying only to clickable controls:

    click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
    click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
                       label=None)
    click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)

    """

    # maps lowercased control type (from the HTML) to the Control subclass
    # that models it; unknown types fall back to TextControl / IgnoreControl
    # in new_control
    type2class = {
        "text": TextControl,
        "password": PasswordControl,
        "hidden": HiddenControl,
        "textarea": TextareaControl,

        "isindex": IsindexControl,

        "file": FileControl,

        "button": IgnoreControl,
        "buttonbutton": IgnoreControl,
        "reset": IgnoreControl,
        "resetbutton": IgnoreControl,

        "submit": SubmitControl,
        "submitbutton": SubmitButtonControl,
        "image": ImageControl,

        "radio": RadioControl,
        "checkbox": CheckboxControl,
        "select": SelectControl,
        }

#---------------------------------------------------
# Initialisation.  Use ParseResponse / ParseFile instead.

    def __init__(self, action, method="GET",
                 enctype="application/x-www-form-urlencoded",
                 name=None, attrs=None,
                 request_class=_request.Request,
                 forms=None, labels=None, id_to_labels=None,
                 backwards_compat=True):
        """
        In the usual case, use ParseResponse (or ParseFile) to create new
        HTMLForm objects.

        action: full (absolute URI) form action
        method: "GET" or "POST"
        enctype: form transfer encoding MIME type
        name: name of form
        attrs: dictionary mapping original HTML form attributes to their
         values

        """
        self.action = action
        self.method = method
        self.enctype = enctype
        self.name = name
        if attrs is not None:
            self.attrs = attrs.copy()
        else:
            self.attrs = {}
        self.controls = []
        self._request_class = request_class

        # these attributes are used by zope.testbrowser
        self._forms = forms  # this is a semi-public API!
        self._labels = labels  # this is a semi-public API!
        self._id_to_labels = id_to_labels  # this is a semi-public API!

        self.backwards_compat = backwards_compat  # note __setattr__

        self._urlunparse = urlparse.urlunparse
        self._urlparse = urlparse.urlparse

    def __getattr__(self, name):
        # backwards_compat is stored under a private name and managed by
        # __setattr__ below
        if name == "backwards_compat":
            return self._backwards_compat
        return getattr(HTMLForm, name)

    def __setattr__(self, name, value):
        # yuck
        if name == "backwards_compat":
            name = "_backwards_compat"
            value = bool(value)
            # propagate the compat flag to every Label of every list item
            for cc in self.controls:
                try:
                    items = cc.items
                except AttributeError:
                    continue
                else:
                    for ii in items:
                        for ll in ii.get_labels():
                            ll._backwards_compat = value
        self.__dict__[name] = value

    def new_control(self, type, name, attrs,
                    ignore_unknown=False, select_default=False, index=None):
        """Adds a new control to the form.

        This is usually called by ParseFile and ParseResponse.  Don't call it
        yourself unless you're building your own Control instances.

        Note that controls representing lists of items are built up from
        controls holding only a single list item.  See ListControl.__doc__
        for further information.

        type: type of control (see Control.__doc__ for a list)
        attrs: HTML attributes of control
        ignore_unknown: if true, use a dummy Control instance for controls of
         unknown type; otherwise, use a TextControl
        select_default: for RADIO and multiple-selection SELECT controls, pick
         the first item as the default if no 'selected' HTML attribute is
         present (this defaulting happens when the HTMLForm.fixup method is
         called)
        index: index of corresponding element in HTML (see
         MoreFormTests.test_interspersed_controls for motivation)

        """
        type = type.lower()
        klass = self.type2class.get(type)
        if klass is None:
            if ignore_unknown:
                klass = IgnoreControl
            else:
                klass = TextControl

        a = attrs.copy()
        if issubclass(klass, ListControl):
            control = klass(type, name, a, select_default, index)
        else:
            control = klass(type, name, a, index)

        if type == "select" and len(attrs) == 1:
            # a bare "select started" marker token: close the most recently
            # opened SELECT control, if any
            for ii in range(len(self.controls)-1, -1, -1):
                ctl = self.controls[ii]
                if ctl.type == "select":
                    ctl.close_control()
                    break

        control.add_to_form(self)
        control._urlparse = self._urlparse
        control._urlunparse = self._urlunparse

    def fixup(self):
        """Normalise form after all controls have been added.

        This is usually called by ParseFile and ParseResponse.  Don't call it
        yourself unless you're building your own Control instances.

        This method should only be called once, after all controls have been
        added to the form.

        """
        for control in self.controls:
            control.fixup()
        # re-assign to push the compat flag down to all item labels
        # (see __setattr__)
        self.backwards_compat = self._backwards_compat

#---------------------------------------------------
    def __str__(self):
        header = "%s%s %s %s" % (
            (self.name and self.name+" " or ""),
            self.method, self.action, self.enctype)
        rep = [header]
        for control in self.controls:
            rep.append("  %s" % str(control))
        return "<%s>" % "\n".join(rep)

#---------------------------------------------------
# Form-filling methods.

    def __getitem__(self, name):
        return self.find_control(name).value
    def __contains__(self, name):
        return bool(self.find_control(name))
    def __setitem__(self, name, value):
        control = self.find_control(name)
        try:
            control.value = value
        except AttributeError, e:
            # readonly/disabled controls raise AttributeError; surface it as
            # a ValueError to match the mapping-style interface
            raise ValueError(str(e))

    def get_value(self,
                  name=None, type=None, kind=None, id=None, nr=None,
                  by_label=False,  # by_label is deprecated
                  label=None):
        """Return value of control.

        If only name and value arguments are supplied, equivalent to

        form[name]

        """
        if by_label:
            deprecation("form.get_value_by_label(...)")
        c = self.find_control(name, type, kind, id, label=label, nr=nr)
        if by_label:
            try:
                meth = c.get_value_by_label
            except AttributeError:
                raise NotImplementedError(
                    "control '%s' does not yet support by_label" % c.name)
            else:
                return meth()
        else:
            return c.value
    def set_value(self, value,
                  name=None, type=None, kind=None, id=None, nr=None,
                  by_label=False,  # by_label is deprecated
                  label=None):
        """Set value of control.

        If only name and value arguments are supplied, equivalent to

        form[name] = value

        """
        if by_label:
            # NOTE(review): the deprecation message names
            # get_value_by_label; presumably meant set_value_by_label --
            # confirm against upstream before changing the string
            deprecation("form.get_value_by_label(...)")
        c = self.find_control(name, type, kind, id, label=label, nr=nr)
        if by_label:
            try:
                meth = c.set_value_by_label
            except AttributeError:
                raise NotImplementedError(
                    "control '%s' does not yet support by_label" % c.name)
            else:
                meth(value)
        else:
            c.value = value
    def get_value_by_label(
        self, name=None, type=None, kind=None, id=None, label=None, nr=None):
        """Return the value of a control, as given by its items' labels.

        All arguments should be passed by name.

        """
        c = self.find_control(name, type, kind, id, label=label, nr=nr)
        return c.get_value_by_label()

    def set_value_by_label(
        self, value,
        name=None, type=None, kind=None, id=None, label=None, nr=None):
        """Set the value of a control, as given by its items' labels.

        All arguments should be passed by name.

        """
        c = self.find_control(name, type, kind, id, label=label, nr=nr)
        c.set_value_by_label(value)

    def set_all_readonly(self, readonly):
        # set the readonly flag on every control in the form
        for control in self.controls:
            control.readonly = bool(readonly)

    def clear_all(self):
        """Clear the value attributes of all controls in the form.

        See HTMLForm.clear.__doc__.

        """
        for control in self.controls:
            control.clear()

    def clear(self,
              name=None, type=None, kind=None, id=None, nr=None, label=None):
        """Clear the value attribute of a control.

        As a result, the affected control will not be successful until a
        value is subsequently set.  AttributeError is raised on readonly
        controls.

        """
        c = self.find_control(name, type, kind, id, label=label, nr=nr)
        c.clear()


#---------------------------------------------------
# Form-filling methods applying only to ListControls.

    def possible_items(self,  # deprecated
                       name=None, type=None, kind=None, id=None,
                       nr=None, by_label=False, label=None):
        """Return a list of all values that the specified control can take."""
        c = self._find_list_control(name, type, kind, id, label, nr)
        return c.possible_items(by_label)

    def set(self, selected, item_name,  # deprecated
            name=None, type=None, kind=None, id=None, nr=None,
            by_label=False, label=None):
        """Select / deselect named list item.

        selected: boolean selected state

        """
        self._find_list_control(name, type, kind, id, label, nr).set(
            selected, item_name, by_label)
    def toggle(self, item_name,  # deprecated
               name=None, type=None, kind=None, id=None, nr=None,
               by_label=False, label=None):
        """Toggle selected state of named list item."""
        self._find_list_control(name, type, kind, id, label, nr).toggle(
            item_name, by_label)

    def set_single(self, selected,  # deprecated
                   name=None, type=None, kind=None, id=None,
                   nr=None, by_label=None, label=None):
        """Select / deselect list item in a control having only one item.

        If the control has multiple list items, ItemCountError is raised.

        This is just a convenience method, so you don't need to know the
        item's name -- the item name in these single-item controls is usually
        something meaningless like "1" or "on".

        For example, if a checkbox has a single item named "on", the
        following two calls are equivalent:

        control.toggle("on")
        control.toggle_single()

        """  # by_label ignored and deprecated
        self._find_list_control(
            name, type, kind, id, label, nr).set_single(selected)
    def toggle_single(self, name=None, type=None, kind=None, id=None,
                      nr=None, by_label=None, label=None):  # deprecated
        """Toggle selected state of list item in control having only one item.

        The rest is as for HTMLForm.set_single.__doc__.

        """  # by_label ignored and deprecated
        self._find_list_control(name, type, kind, id, label,
                                nr).toggle_single()

#---------------------------------------------------
# Form-filling method applying only to FileControls.

    def add_file(self, file_object, content_type=None, filename=None,
                 name=None, id=None, nr=None, label=None):
        """Add a file to be uploaded.

        file_object: file-like object (with read method) from which to read
         data to upload
        content_type: MIME content type of data to upload
        filename: filename to pass to server

        If filename is None, no filename is sent to the server.

        If content_type is None, the content type is guessed based on the
        filename and the data from read from the file object.

        XXX
        At the moment, guessed content type is always
        application/octet-stream.  Use sndhdr, imghdr modules.  Should also
        try to guess HTML, XML, and plain text.

        Note the following useful HTML attributes of file upload controls
        (see HTML 4.01 spec, section 17):

        accept: comma-separated list of content types that the server will
         handle correctly; you can use this to filter out non-conforming
         files
        size: XXX IIRC, this is indicative of whether form wants multiple or
         single files
        maxlength: XXX hint of max content length in bytes?

        """
        self.find_control(name, "file", id=id, label=label, nr=nr).add_file(
            file_object, content_type, filename)

#---------------------------------------------------
# Form submission methods, applying only to clickable controls.

    def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),
              request_class=_request.Request,
              label=None):
        """Return request that would result from clicking on a control.

        The request object is a mechanize.Request instance, which you can
        pass to mechanize.urlopen.

        Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
        IMAGEs) can be clicked.

        Will click on the first clickable control, subject to the name, type
        and nr arguments (as for find_control).  If no name, type, id or
        number is specified and there are no clickable controls, a request
        will be returned for the form in its current, un-clicked, state.

        IndexError is raised if any of name, type, id or nr is specified but
        no matching control is found.  ValueError is raised if the HTMLForm
        has an enctype attribute that is not recognised.

        You can optionally specify a coordinate to click at, which only makes
        a difference if you clicked on an image.

        """
        return self._click(name, type, id, label, nr, coord, "request",
                           self._request_class)

    def click_request_data(self,
                           name=None, type=None, id=None,
                           nr=0, coord=(1,1),
                           request_class=_request.Request,
                           label=None):
        """As for click method, but return a tuple (url, data, headers).

        You can use this data to send a request to the server.  This is
        useful if you're using httplib or urllib rather than mechanize.
        Otherwise, use the click method.

        # Untested.  Have to subclass to add headers, I think -- so use
        # mechanize instead!
        import urllib
        url, data, hdrs = form.click_request_data()
        r = urllib.urlopen(url, data)

        # Untested.  I don't know of any reason to use httplib -- you can get
        # just as much control with mechanize.
        import httplib, urlparse
        url, data, hdrs = form.click_request_data()
        tup = urlparse(url)
        host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
        conn = httplib.HTTPConnection(host)
        if data:
            httplib.request("POST", path, data, hdrs)
        else:
            httplib.request("GET", path, headers=hdrs)
        r = conn.getresponse()

        """
        return self._click(name, type, id, label, nr, coord, "request_data",
                           self._request_class)

    def click_pairs(self, name=None, type=None, id=None,
                    nr=0, coord=(1,1),
                    label=None):
        """As for click_request_data, but returns a list of (key, value)
        pairs.

        You can use this list as an argument to urllib.urlencode.  This is
        usually only useful if you're using httplib or urllib rather than
        mechanize.  It may also be useful if you want to manually tweak the
        keys and/or values, but this should not be necessary.  Otherwise, use
        the click method.

        Note that this method is only useful for forms of MIME type
        x-www-form-urlencoded.  In particular, it does not return the
        information required for file upload.  If you need file upload and
        are not using mechanize, use click_request_data.

        """
        return self._click(name, type, id, label, nr, coord, "pairs",
                           self._request_class)

#---------------------------------------------------

    def find_control(self,
                     name=None, type=None, kind=None, id=None,
                     predicate=None, nr=None,
                     label=None):
        """Locate and return some specific control within the form.

        At least one of the name, type, kind, predicate and nr arguments must
        be supplied.
        If no matching control is found, ControlNotFoundError is
        raised.

        If name is specified, then the control must have the indicated name.

        If type is specified then the control must have the specified type
        (in addition to the types possible for <input> HTML tags: "text",
        "password", "hidden", "submit", "image", "button", "radio",
        "checkbox", "file" we also have "reset", "buttonbutton",
        "resetbutton", "submitbutton", "textarea", "select" and "isindex").

        If kind is specified, then the control must fall into the specified
        group, each of which satisfies a particular interface.  The types are
        "text", "list", "multilist", "singlelist", "clickable" and "file".

        If id is specified, then the control must have the indicated id.

        If predicate is specified, then the control must match that function.
        The predicate function is passed the control as its single argument,
        and should return a boolean value indicating whether the control
        matched.

        nr, if supplied, is the sequence number of the control (where 0 is
        the first).  Note that control 0 is the first control matching all
        the other arguments (if supplied); it is not necessarily the first
        control in the form.  If no nr is supplied, AmbiguityError is raised
        if multiple controls match the other arguments (unless the
        .backwards-compat attribute is true).

        If label is specified, then the control must have this label.  Note
        that radio controls and checkboxes never have labels: their items do.

        """
        # require at least one selection argument, so a bare call fails fast
        if ((name is None) and (type is None) and (kind is None) and
            (id is None) and (label is None) and (predicate is None) and
            (nr is None)):
            raise ValueError(
                "at least one argument must be supplied to specify control")
        return self._find_control(name, type, kind, id, label, predicate, nr)

#---------------------------------------------------
# Private methods.

    def _find_list_control(self,
                           name=None, type=None, kind=None, id=None,
                           label=None, nr=None):
        # as find_control, but restricted (via the is_listcontrol predicate)
        # to list controls
        if ((name is None) and (type is None) and (kind is None) and
            (id is None) and (label is None) and (nr is None)):
            raise ValueError(
                "at least one argument must be supplied to specify control")

        return self._find_control(name, type, kind, id, label,
                                  is_listcontrol, nr)

    def _find_control(self, name, type, kind, id, label, predicate, nr):
        # Workhorse behind find_control: validates argument types, then scans
        # self.controls in document order applying each supplied filter.
        if ((name is not None) and (name is not Missing) and
            not isstringlike(name)):
            raise TypeError("control name must be string-like")
        if (type is not None) and not isstringlike(type):
            raise TypeError("control type must be string-like")
        if (kind is not None) and not isstringlike(kind):
            raise TypeError("control kind must be string-like")
        if (id is not None) and not isstringlike(id):
            raise TypeError("control id must be string-like")
        if (label is not None) and not isstringlike(label):
            raise TypeError("control label must be string-like")
        if (predicate is not None) and not callable(predicate):
            raise TypeError("control predicate must be callable")
        if (nr is not None) and nr < 0:
            raise ValueError("control number must be a positive integer")

        orig_nr = nr
        found = None
        ambiguous = False
        # backwards-compat mode behaves as if nr=0 was passed: first match
        # wins, no ambiguity checking
        if nr is None and self.backwards_compat:
            nr = 0

        for control in self.controls:
            if ((name is not None and name != control.name) and
                (name is not Missing or control.name is not None)):
                continue
            if type is not None and type != control.type:
                continue
            if kind is not None and not control.is_of_kind(kind):
                continue
            if id is not None and id != control.id:
                continue
            if predicate and not predicate(control):
                continue
            if label:
                # substring match against any of the control's label texts
                for l in control.get_labels():
                    if l.text.find(label) > -1:
                        break
                else:
                    continue
            if nr is not None:
                if nr == 0:
                    return control  # early exit: unambiguous due to nr
                nr -= 1
                continue
            if found:
                ambiguous = True
                break
            found = control

        if found and not ambiguous:
            return found

        # build a human-readable description of the failed query for the
        # exception message
        description = []
        if name is not None: description.append("name %s" % repr(name))
        if type is not None: description.append("type '%s'" % type)
        if kind is not None: description.append("kind '%s'" % kind)
        if id is not None: description.append("id '%s'" % id)
        if label is not None: description.append("label '%s'" % label)
        if predicate is not None:
            description.append("predicate %s" % predicate)
        if orig_nr: description.append("nr %d" % orig_nr)
        description = ", ".join(description)

        if ambiguous:
            raise AmbiguityError("more than one control matching "+description)
        elif not found:
            raise ControlNotFoundError("no control matching "+description)
        assert False  # unreachable: every case above returns or raises

    def _click(self, name, type, id, label, nr, coord, return_type,
               request_class=_request.Request):
        try:
            control = self._find_control(
                name, type, "clickable", id, label, None, nr)
        except ControlNotFoundError:
            if ((name is not None) or (type is not None) or (id is not None) or
                (label is not None) or (nr != 0)):
                raise
            # no clickable controls, but no control was explicitly requested,
            # so return state without clicking any control
            return self._switch_click(return_type, request_class)
        else:
            return control._click(self, coord, return_type, request_class)

    def _pairs(self):
        """Return sequence of (key, value) pairs suitable for urlencoding."""
        return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]

    def _pairs_and_controls(self):
        """Return sequence of (index, key, value, control_index)
        of totally ordered pairs suitable for urlencoding.

        control_index is the index of the control in self.controls
        """
        pairs = []
        for control_index in range(len(self.controls)):
            control = self.controls[control_index]
            for ii, key, val in control._totally_ordered_pairs():
                pairs.append((ii, key, val, control_index))

        # stable sort by ONLY first item in tuple
        # NOTE(review): list.sort() compares whole tuples; the per-control
        # indices appear to make the first item decisive here -- confirm
        pairs.sort()

        return pairs

    def _request_data(self):
        """Return a tuple (url, data, headers)."""
        method = self.method.upper()
        #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action)
        parts = self._urlparse(self.action)
        # split off the final (query, fragment) components of the action URL
        rest, (query, frag) = parts[:-2], parts[-2:]

        if method == "GET":
            if self.enctype != "application/x-www-form-urlencoded":
                raise ValueError(
                    "unknown GET form encoding type '%s'" % self.enctype)
            # GET: form data replaces the query string; fragment is dropped
            parts = rest + (urllib.urlencode(self._pairs()), None)
            uri = self._urlunparse(parts)
            return uri, None, []
        elif method == "POST":
            parts = rest + (query, None)
            uri = self._urlunparse(parts)
            if self.enctype == "application/x-www-form-urlencoded":
                return (uri, urllib.urlencode(self._pairs()),
                        [("Content-Type", self.enctype)])
            elif self.enctype == "multipart/form-data":
                # each control writes its own MIME part (needed for file
                # upload)
                data = StringIO()
                http_hdrs = []
                mw = MimeWriter(data, http_hdrs)
                mw.startmultipartbody("form-data", add_to_http_hdrs=True,
                                      prefix=0)
                for ii, k, v, control_index in self._pairs_and_controls():
                    self.controls[control_index]._write_mime_data(mw, k, v)
                mw.lastpart()
                return uri, data.getvalue(), http_hdrs
            else:
                raise ValueError(
                    "unknown POST form encoding type '%s'" % self.enctype)
        else:
            raise ValueError("Unknown method '%s'" % method)

    def _switch_click(self, return_type, request_class=_request.Request):
        # This is called by HTMLForm and clickable Controls to hide switching
        # on return_type.
+ if return_type == "pairs": + return self._pairs() + elif return_type == "request_data": + return self._request_data() + else: + req_data = self._request_data() + req = request_class(req_data[0], req_data[1]) + for key, val in req_data[2]: + add_hdr = req.add_header + if key.lower() == "content-type": + try: + add_hdr = req.add_unredirected_header + except AttributeError: + # pre-2.4 and not using ClientCookie + pass + add_hdr(key, val) + return req diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_gzip.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_gzip.py new file mode 100644 index 0000000000000000000000000000000000000000..7e9d6a0ce7de0fe11f5333e6fedefee2c9c1e95f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_gzip.py @@ -0,0 +1,105 @@ +from cStringIO import StringIO + +import _response +import _urllib2_fork + + +# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library +class GzipConsumer: + + def __init__(self, consumer): + self.__consumer = consumer + self.__decoder = None + self.__data = "" + + def __getattr__(self, key): + return getattr(self.__consumer, key) + + def feed(self, data): + if self.__decoder is None: + # check if we have a full gzip header + data = self.__data + data + try: + i = 10 + flag = ord(data[3]) + if flag & 4: # extra + x = ord(data[i]) + 256*ord(data[i+1]) + i = i + 2 + x + if flag & 8: # filename + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 16: # comment + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 2: # crc + i = i + 2 + if len(data) < i: + raise IndexError("not enough data") + if data[:3] != "\x1f\x8b\x08": + raise IOError("invalid gzip data") + data = data[i:] + except IndexError: + self.__data = data + return # need more data + import zlib + self.__data = "" + self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS) + data = self.__decoder.decompress(data) + if data: + self.__consumer.feed(data) + + def close(self): + if self.__decoder: + data = self.__decoder.flush() + if 
data: + self.__consumer.feed(data) + self.__consumer.close() + + +# -------------------------------------------------------------------- + +# the rest of this module is John Lee's stupid code, not +# Fredrik's nice code :-) + +class stupid_gzip_consumer: + def __init__(self): self.data = [] + def feed(self, data): self.data.append(data) + +class stupid_gzip_wrapper(_response.closeable_response): + def __init__(self, response): + self._response = response + + c = stupid_gzip_consumer() + gzc = GzipConsumer(c) + gzc.feed(response.read()) + self.__data = StringIO("".join(c.data)) + + def read(self, size=-1): + return self.__data.read(size) + def readline(self, size=-1): + return self.__data.readline(size) + def readlines(self, sizehint=-1): + return self.__data.readlines(sizehint) + + def __getattr__(self, name): + # delegate unknown methods/attributes + return getattr(self._response, name) + +class HTTPGzipProcessor(_urllib2_fork.BaseHandler): + handler_order = 200 # response processing before HTTPEquivProcessor + + def http_request(self, request): + request.add_header("Accept-Encoding", "gzip") + return request + + def http_response(self, request, response): + # post-process response + enc_hdrs = response.info().getheaders("Content-encoding") + for enc_hdr in enc_hdrs: + if ("gzip" in enc_hdr) or ("compress" in enc_hdr): + return stupid_gzip_wrapper(response) + return response + + https_response = http_response diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_headersutil.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_headersutil.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c78e93ba0319eef0bc6e33a34223776e1e0f3f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_headersutil.py @@ -0,0 +1,241 @@ +"""Utility functions for HTTP header value parsing and construction. + +Copyright 1997-1998, Gisle Aas +Copyright 2002-2006, John J. 
Lee + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import os, re +from types import StringType +from types import UnicodeType +STRING_TYPES = StringType, UnicodeType + +from _util import http2time +import _rfc3986 + + +def is_html_file_extension(url, allow_xhtml): + ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1] + html_exts = [".htm", ".html"] + if allow_xhtml: + html_exts += [".xhtml"] + return ext in html_exts + + +def is_html(ct_headers, url, allow_xhtml=False): + """ + ct_headers: Sequence of Content-Type headers + url: Response URL + + """ + if not ct_headers: + return is_html_file_extension(url, allow_xhtml) + headers = split_header_words(ct_headers) + if len(headers) < 1: + return is_html_file_extension(url, allow_xhtml) + first_header = headers[0] + first_parameter = first_header[0] + ct = first_parameter[0] + html_types = ["text/html"] + if allow_xhtml: + html_types += [ + "text/xhtml", "text/xml", + "application/xml", "application/xhtml+xml", + ] + return ct in html_types + + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +token_re = re.compile(r"^\s*([^=\s;,]+)") +quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +value_re = re.compile(r"^\s*=\s*([^\s;,]*)") +escape_re = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". 
+ + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). + + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1*<any CHAR except CTLs or separators> + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = <any TEXT except <">> + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. Syntactically incorrect + headers will not necessarily be parsed as you would want. + + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert type(header_values) not in STRING_TYPES + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = token_re.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = quoted_value_re.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = escape_re.sub(r"\1", value) + else: + m = value_re.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = value.rstrip() + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif text.lstrip().startswith(","): + # concatenated headers, as per RFC 2616 section 4.2 + text = text.lstrip()[1:] + if pairs: 
result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + (orig_text, text, pairs)) + text = non_junk + if pairs: result.append(pairs) + return result + +join_escape_re = re.compile(r"([\"\\])") +def join_header_words(lists): + """Do the inverse of the conversion done by split_header_words. + + Takes a list of lists of (key, value) pairs and produces a single header + value. Attribute values are quoted if needed. + + >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) + 'text/plain; charset="iso-8859/1"' + >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) + 'text/plain, charset="iso-8859/1"' + + """ + headers = [] + for pairs in lists: + attr = [] + for k, v in pairs: + if v is not None: + if not re.search(r"^\w+$", v): + v = join_escape_re.sub(r"\\\1", v) # escape " and \ + v = '"%s"' % v + if k is None: # Netscape cookies may have no name + k = v + else: + k = "%s=%s" % (k, v) + attr.append(k) + if attr: headers.append("; ".join(attr)) + return ", ".join(headers) + +def strip_quotes(text): + if text.startswith('"'): + text = text[1:] + if text.endswith('"'): + text = text[:-1] + return text + +def parse_ns_headers(ns_headers): + """Ad-hoc parser for Netscape protocol cookie-attributes. + + The old Netscape cookie format for Set-Cookie can for instance contain + an unquoted "," in the expires field, so we have to use this ad-hoc + parser instead of split_header_words. + + XXX This may not make the best possible effort to parse all the crap + that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient + parser is probably better, so could do worse than following that if + this ever gives any trouble. + + Currently, this is also used for parsing RFC 2109 cookies. 
+ + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "version", "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + params = re.split(r";\s*", ns_header) + for ii in range(len(params)): + param = params[ii] + param = param.rstrip() + if param == "": continue + if "=" not in param: + k, v = param, None + else: + k, v = re.split(r"\s*=\s*", param, 1) + k = k.lstrip() + if ii != 0: + lc = k.lower() + if lc in known_attrs: + k = lc + if k == "version": + # This is an RFC 2109 cookie. + v = strip_quotes(v) + version_set = True + if k == "expires": + # convert expires date to seconds since epoch + v = http2time(strip_quotes(v)) # None if invalid + pairs.append((k, v)) + + if pairs: + if not version_set: + pairs.append(("version", "0")) + result.append(pairs) + + return result + + +def _test(): + import doctest, _headersutil + return doctest.testmod(_headersutil) + +if __name__ == "__main__": + _test() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_html.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_html.py new file mode 100644 index 0000000000000000000000000000000000000000..1a4e2c0281571b8c7bbc7da8dcb377835adbb091 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_html.py @@ -0,0 +1,629 @@ +"""HTML handling. + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
+ +""" + +import codecs +import copy +import htmlentitydefs +import re + +import _sgmllib_copy as sgmllib + +import _beautifulsoup +import _form +from _headersutil import split_header_words, is_html as _is_html +import _request +import _rfc3986 + +DEFAULT_ENCODING = "latin-1" + +COMPRESS_RE = re.compile(r"\s+") + + +class CachingGeneratorFunction(object): + """Caching wrapper around a no-arguments iterable.""" + + def __init__(self, iterable): + self._cache = [] + # wrap iterable to make it non-restartable (otherwise, repeated + # __call__ would give incorrect results) + self._iterator = iter(iterable) + + def __call__(self): + cache = self._cache + for item in cache: + yield item + for item in self._iterator: + cache.append(item) + yield item + + +class EncodingFinder: + def __init__(self, default_encoding): + self._default_encoding = default_encoding + def encoding(self, response): + # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV + # headers may be in the response. HTTP-EQUIV headers come last, + # so try in order from first to last. 
+ for ct in response.info().getheaders("content-type"): + for k, v in split_header_words([ct])[0]: + if k == "charset": + encoding = v + try: + codecs.lookup(v) + except LookupError: + continue + else: + return encoding + return self._default_encoding + + +class ResponseTypeFinder: + def __init__(self, allow_xhtml): + self._allow_xhtml = allow_xhtml + def is_html(self, response, encoding): + ct_hdrs = response.info().getheaders("content-type") + url = response.geturl() + # XXX encoding + return _is_html(ct_hdrs, url, self._allow_xhtml) + + +class Args(object): + + # idea for this argument-processing trick is from Peter Otten + + def __init__(self, args_map): + self.__dict__["dictionary"] = dict(args_map) + + def __getattr__(self, key): + try: + return self.dictionary[key] + except KeyError: + return getattr(self.__class__, key) + + def __setattr__(self, key, value): + if key == "dictionary": + raise AttributeError() + self.dictionary[key] = value + + +def form_parser_args( + select_default=False, + form_parser_class=None, + request_class=None, + backwards_compat=False, + ): + return Args(locals()) + + +class Link: + def __init__(self, base_url, url, text, tag, attrs): + assert None not in [url, tag, attrs] + self.base_url = base_url + self.absolute_url = _rfc3986.urljoin(base_url, url) + self.url, self.text, self.tag, self.attrs = url, text, tag, attrs + def __cmp__(self, other): + try: + for name in "url", "text", "tag", "attrs": + if getattr(self, name) != getattr(other, name): + return -1 + except AttributeError: + return -1 + return 0 + def __repr__(self): + return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % ( + self.base_url, self.url, self.text, self.tag, self.attrs) + + +class LinksFactory: + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + import _pullparser + if link_parser_class is None: + link_parser_class = _pullparser.TolerantPullParser + self.link_parser_class = link_parser_class + self.link_class = 
link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._response = None + self._encoding = None + + def set_response(self, response, base_url, encoding): + self._response = response + self._encoding = encoding + self._base_url = base_url + + def links(self): + """Return an iterator that provides links of the document.""" + response = self._response + encoding = self._encoding + base_url = self._base_url + p = self.link_parser_class(response, encoding=encoding) + + try: + for token in p.tags(*(self.urltags.keys()+["base"])): + if token.type == "endtag": + continue + if token.data == "base": + base_href = dict(token.attrs).get("href") + if base_href is not None: + base_url = base_href + continue + attrs = dict(token.attrs) + tag = token.data + text = None + # XXX use attr_encoding for ref'd doc if that doc does not + # provide one by other means + #attr_encoding = attrs.get("charset") + url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? + if not url: + # Probably an <A NAME="blah"> link or <AREA NOHREF...>. + # For our purposes a link is something with a URL, so + # ignore this. + continue + + url = _rfc3986.clean_url(url, encoding) + if tag == "a": + if token.type != "startendtag": + # hmm, this'd break if end tag is missing + text = p.get_compressed_text(("endtag", tag)) + # but this doesn't work for e.g. + # <a href="blah"><b>Andy</b></a> + #text = p.get_compressed_text() + + yield Link(base_url, url, text, tag, token.attrs) + except sgmllib.SGMLParseError, exc: + raise _form.ParseError(exc) + +class FormsFactory: + + """Makes a sequence of objects satisfying HTMLForm interface. + + After calling .forms(), the .global_form attribute is a form object + containing all controls not a descendant of any FORM element. + + For constructor argument docs, see ParseResponse argument docs. 
+ """ + + def __init__(self, + select_default=False, + form_parser_class=None, + request_class=None, + backwards_compat=False, + ): + self.select_default = select_default + if form_parser_class is None: + form_parser_class = _form.FormParser + self.form_parser_class = form_parser_class + if request_class is None: + request_class = _request.Request + self.request_class = request_class + self.backwards_compat = backwards_compat + self._response = None + self.encoding = None + self.global_form = None + + def set_response(self, response, encoding): + self._response = response + self.encoding = encoding + self.global_form = None + + def forms(self): + encoding = self.encoding + forms = _form.ParseResponseEx( + self._response, + select_default=self.select_default, + form_parser_class=self.form_parser_class, + request_class=self.request_class, + encoding=encoding, + _urljoin=_rfc3986.urljoin, + _urlparse=_rfc3986.urlsplit, + _urlunparse=_rfc3986.urlunsplit, + ) + self.global_form = forms[0] + return forms[1:] + +class TitleFactory: + def __init__(self): + self._response = self._encoding = None + + def set_response(self, response, encoding): + self._response = response + self._encoding = encoding + + def _get_title_text(self, parser): + import _pullparser + text = [] + tok = None + while 1: + try: + tok = parser.get_token() + except _pullparser.NoMoreTokensError: + break + if tok.type == "data": + text.append(str(tok)) + elif tok.type == "entityref": + t = unescape("&%s;" % tok.data, + parser._entitydefs, parser.encoding) + text.append(t) + elif tok.type == "charref": + t = unescape_charref(tok.data, parser.encoding) + text.append(t) + elif tok.type in ["starttag", "endtag", "startendtag"]: + tag_name = tok.data + if tok.type == "endtag" and tag_name == "title": + break + text.append(str(tok)) + return COMPRESS_RE.sub(" ", "".join(text).strip()) + + def title(self): + import _pullparser + p = _pullparser.TolerantPullParser( + self._response, encoding=self._encoding) + try: 
+ try: + p.get_tag("title") + except _pullparser.NoMoreTokensError: + return None + else: + return self._get_title_text(p) + except sgmllib.SGMLParseError, exc: + raise _form.ParseError(exc) + + +def unescape(data, entities, encoding): + if data is None or "&" not in data: + return data + + def replace_entities(match): + ent = match.group() + if ent[1] == "#": + return unescape_charref(ent[2:-1], encoding) + + repl = entities.get(ent[1:-1]) + if repl is not None: + repl = unichr(repl) + if type(repl) != type(""): + try: + repl = repl.encode(encoding) + except UnicodeError: + repl = ent + else: + repl = ent + return repl + + return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) + +def unescape_charref(data, encoding): + name, base = data, 10 + if name.startswith("x"): + name, base= name[1:], 16 + uc = unichr(int(name, base)) + if encoding is None: + return uc + else: + try: + repl = uc.encode(encoding) + except UnicodeError: + repl = "&#%s;" % data + return repl + + +class MechanizeBs(_beautifulsoup.BeautifulSoup): + _entitydefs = htmlentitydefs.name2codepoint + # don't want the magic Microsoft-char workaround + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda(x):x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda(x):'<!' 
+ x.group(1) + '>') + ] + + def __init__(self, encoding, text=None, avoidParserProblems=True, + initialTextIsEverything=True): + self._encoding = encoding + _beautifulsoup.BeautifulSoup.__init__( + self, text, avoidParserProblems, initialTextIsEverything) + + def handle_charref(self, ref): + t = unescape("&#%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def handle_entityref(self, ref): + t = unescape("&%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + val = unescape(val, self._entitydefs, self._encoding) + escaped_attrs.append((key, val)) + return escaped_attrs + +class RobustLinksFactory: + + compress_re = COMPRESS_RE + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + if link_parser_class is None: + link_parser_class = MechanizeBs + self.link_parser_class = link_parser_class + self.link_class = link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._bs = None + self._encoding = None + self._base_url = None + + def set_soup(self, soup, base_url, encoding): + self._bs = soup + self._base_url = base_url + self._encoding = encoding + + def links(self): + bs = self._bs + base_url = self._base_url + encoding = self._encoding + for ch in bs.recursiveChildGenerator(): + if (isinstance(ch, _beautifulsoup.Tag) and + ch.name in self.urltags.keys()+["base"]): + link = ch + attrs = bs.unescape_attrs(link.attrs) + attrs_dict = dict(attrs) + if link.name == "base": + base_href = attrs_dict.get("href") + if base_href is not None: + base_url = base_href + continue + url_attr = self.urltags[link.name] + url = attrs_dict.get(url_attr) + if not url: + continue + url = _rfc3986.clean_url(url, encoding) + text = link.fetchText(lambda t: True) + if not text: + # follow _pullparser's weird behaviour rigidly + if link.name == "a": + 
text = "" + else: + text = None + else: + text = self.compress_re.sub(" ", " ".join(text).strip()) + yield Link(base_url, url, text, link.name, attrs) + + +class RobustFormsFactory(FormsFactory): + def __init__(self, *args, **kwds): + args = form_parser_args(*args, **kwds) + if args.form_parser_class is None: + args.form_parser_class = _form.RobustFormParser + FormsFactory.__init__(self, **args.dictionary) + + def set_response(self, response, encoding): + self._response = response + self.encoding = encoding + + +class RobustTitleFactory: + def __init__(self): + self._bs = self._encoding = None + + def set_soup(self, soup, encoding): + self._bs = soup + self._encoding = encoding + + def title(self): + title = self._bs.first("title") + if title == _beautifulsoup.Null: + return None + else: + inner_html = "".join([str(node) for node in title.contents]) + return COMPRESS_RE.sub(" ", inner_html.strip()) + + +class Factory: + """Factory for forms, links, etc. + + This interface may expand in future. + + Public methods: + + set_request_class(request_class) + set_response(response) + forms() + links() + + Public attributes: + + Note that accessing these attributes may raise ParseError. + + encoding: string specifying the encoding of response if it contains a text + document (this value is left unspecified for documents that do not have + an encoding, e.g. an image file) + is_html: true if response contains an HTML document (XHTML may be + regarded as HTML too) + title: page title, or None if no title or not HTML + global_form: form object containing all controls that are not descendants + of any FORM element, or None if the forms_factory does not support + supplying a global form + + """ + + LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"] + + def __init__(self, forms_factory, links_factory, title_factory, + encoding_finder=EncodingFinder(DEFAULT_ENCODING), + response_type_finder=ResponseTypeFinder(allow_xhtml=False), + ): + """ + + Pass keyword arguments only. 
+ + default_encoding: character encoding to use if encoding cannot be + determined (or guessed) from the response. You should turn on + HTTP-EQUIV handling if you want the best chance of getting this right + without resorting to this default. The default value of this + parameter (currently latin-1) may change in future. + + """ + self._forms_factory = forms_factory + self._links_factory = links_factory + self._title_factory = title_factory + self._encoding_finder = encoding_finder + self._response_type_finder = response_type_finder + + self.set_response(None) + + def set_request_class(self, request_class): + """Set request class (mechanize.Request by default). + + HTMLForm instances returned by .forms() will return instances of this + class when .click()ed. + + """ + self._forms_factory.request_class = request_class + + def set_response(self, response): + """Set response. + + The response must either be None or implement the same interface as + objects returned by mechanize.urlopen(). + + """ + self._response = response + self._forms_genf = self._links_genf = None + self._get_title = None + for name in self.LAZY_ATTRS: + try: + delattr(self, name) + except AttributeError: + pass + + def __getattr__(self, name): + if name not in self.LAZY_ATTRS: + return getattr(self.__class__, name) + + if name == "encoding": + self.encoding = self._encoding_finder.encoding( + copy.copy(self._response)) + return self.encoding + elif name == "is_html": + self.is_html = self._response_type_finder.is_html( + copy.copy(self._response), self.encoding) + return self.is_html + elif name == "title": + if self.is_html: + self.title = self._title_factory.title() + else: + self.title = None + return self.title + elif name == "global_form": + self.forms() + return self.global_form + + def forms(self): + """Return iterable over HTMLForm-like objects. + + Raises mechanize.ParseError on failure. 
+ """ + # this implementation sets .global_form as a side-effect, for benefit + # of __getattr__ impl + if self._forms_genf is None: + try: + self._forms_genf = CachingGeneratorFunction( + self._forms_factory.forms()) + except: # XXXX define exception! + self.set_response(self._response) + raise + self.global_form = getattr( + self._forms_factory, "global_form", None) + return self._forms_genf() + + def links(self): + """Return iterable over mechanize.Link-like objects. + + Raises mechanize.ParseError on failure. + """ + if self._links_genf is None: + try: + self._links_genf = CachingGeneratorFunction( + self._links_factory.links()) + except: # XXXX define exception! + self.set_response(self._response) + raise + return self._links_genf() + +class DefaultFactory(Factory): + """Based on sgmllib.""" + def __init__(self, i_want_broken_xhtml_support=False): + Factory.__init__( + self, + forms_factory=FormsFactory(), + links_factory=LinksFactory(), + title_factory=TitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + + def set_response(self, response): + Factory.set_response(self, response) + if response is not None: + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_response( + copy.copy(response), response.geturl(), self.encoding) + self._title_factory.set_response( + copy.copy(response), self.encoding) + +class RobustFactory(Factory): + """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is + DefaultFactory. 
+ + """ + def __init__(self, i_want_broken_xhtml_support=False, + soup_class=None): + Factory.__init__( + self, + forms_factory=RobustFormsFactory(), + links_factory=RobustLinksFactory(), + title_factory=RobustTitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + if soup_class is None: + soup_class = MechanizeBs + self._soup_class = soup_class + + def set_response(self, response): + Factory.set_response(self, response) + if response is not None: + data = response.read() + soup = self._soup_class(self.encoding, data) + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_soup( + soup, response.geturl(), self.encoding) + self._title_factory.set_soup(soup, self.encoding) diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_http.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_http.py new file mode 100644 index 0000000000000000000000000000000000000000..657973519dedccbcdfe86715d25fea4f7359ebbc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_http.py @@ -0,0 +1,447 @@ +"""HTTP related handlers. + +Note that some other HTTP handlers live in more specific modules: _auth.py, +_gzip.py, etc. + + +Copyright 2002-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import HTMLParser +from cStringIO import StringIO +import htmlentitydefs +import logging +import robotparser +import socket +import time + +import _sgmllib_copy as sgmllib +from _urllib2_fork import HTTPError, BaseHandler + +from _headersutil import is_html +from _html import unescape, unescape_charref +from _request import Request +from _response import response_seek_wrapper +import _rfc3986 +import _sockettimeout + +debug = logging.getLogger("mechanize").debug +debug_robots = logging.getLogger("mechanize.robots").debug + +# monkeypatch urllib2.HTTPError to show URL +## import urllib2 +## def urllib2_str(self): +## return 'HTTP Error %s: %s (%s)' % ( +## self.code, self.msg, self.geturl()) +## urllib2.HTTPError.__str__ = urllib2_str + + +CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes +DEFAULT_ENCODING = 'latin-1' + +# XXX would self.reset() work, instead of raising this exception? +class EndOfHeadError(Exception): pass +class AbstractHeadParser: + # only these elements are allowed in or before HEAD of document + head_elems = ("html", "head", + "title", "base", + "script", "style", "meta", "link", "object") + _entitydefs = htmlentitydefs.name2codepoint + _encoding = DEFAULT_ENCODING + + def __init__(self): + self.http_equiv = [] + + def start_meta(self, attrs): + http_equiv = content = None + for key, value in attrs: + if key == "http-equiv": + http_equiv = self.unescape_attr_if_required(value) + elif key == "content": + content = self.unescape_attr_if_required(value) + if http_equiv is not None and content is not None: + self.http_equiv.append((http_equiv, content)) + + def end_head(self): + raise EndOfHeadError() + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return 
unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + escaped_attrs[key] = self.unescape_attr(val) + return escaped_attrs + + def unknown_entityref(self, ref): + self.handle_data("&%s;" % ref) + + def unknown_charref(self, ref): + self.handle_data("&#%s;" % ref) + + +class XHTMLCompatibleHeadParser(AbstractHeadParser, + HTMLParser.HTMLParser): + def __init__(self): + HTMLParser.HTMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + +class HeadParser(AbstractHeadParser, sgmllib.SGMLParser): + + def _not_called(self): + assert False + + def __init__(self): + sgmllib.SGMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, method, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + if tag == "meta": + method(attrs) + + def unknown_starttag(self, tag, attrs): + self.handle_starttag(tag, self._not_called, attrs) + + def handle_endtag(self, tag, method): + if tag in self.head_elems: + method() + else: + raise EndOfHeadError() + + def unescape_attr_if_required(self, name): + return self.unescape_attr(name) + +def parse_head(fileobj, parser): + """Return a list of key, value pairs.""" + while 1: + data = fileobj.read(CHUNK) + try: + parser.feed(data) + except EndOfHeadError: + break + if len(data) != CHUNK: + # this should only happen if there is no HTML body, or if + # CHUNK is big + break + return parser.http_equiv + +class HTTPEquivProcessor(BaseHandler): + """Append META HTTP-EQUIV headers to regular HTTP headers.""" + + handler_order = 300 # before handlers that look at HTTP headers + + def __init__(self, head_parser_class=HeadParser, + i_want_broken_xhtml_support=False, + ): + self.head_parser_class = head_parser_class + self._allow_xhtml = i_want_broken_xhtml_support + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + http_message = response.info() + url = response.geturl() + ct_hdrs = http_message.getheaders("content-type") + if is_html(ct_hdrs, url, self._allow_xhtml): + try: + try: + html_headers = parse_head(response, + self.head_parser_class()) + finally: + response.seek(0) + except (HTMLParser.HTMLParseError, + sgmllib.SGMLParseError): + pass + else: + for hdr, val in html_headers: + # add a header + 
http_message.dict[hdr.lower()] = val + text = hdr + ": " + val + for line in text.split("\n"): + http_message.headers.append(line + "\n") + return response + + https_response = http_response + + +class MechanizeRobotFileParser(robotparser.RobotFileParser): + + def __init__(self, url='', opener=None): + robotparser.RobotFileParser.__init__(self, url) + self._opener = opener + self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT + + def set_opener(self, opener=None): + import _opener + if opener is None: + opener = _opener.OpenerDirector() + self._opener = opener + + def set_timeout(self, timeout): + self._timeout = timeout + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + if self._opener is None: + self.set_opener() + req = Request(self.url, unverifiable=True, visit=False, + timeout=self._timeout) + try: + f = self._opener.open(req) + except HTTPError, f: + pass + except (IOError, socket.error, OSError), exc: + debug_robots("ignoring error opening %r: %s" % + (self.url, exc)) + return + lines = [] + line = f.readline() + while line: + lines.append(line.strip()) + line = f.readline() + status = f.code + if status == 401 or status == 403: + self.disallow_all = True + debug_robots("disallow all") + elif status >= 400: + self.allow_all = True + debug_robots("allow all") + elif status == 200 and lines: + debug_robots("parse lines") + self.parse(lines) + +class RobotExclusionError(HTTPError): + def __init__(self, request, *args): + apply(HTTPError.__init__, (self,)+args) + self.request = request + +class HTTPRobotRulesProcessor(BaseHandler): + # before redirections, after everything else + handler_order = 800 + + try: + from httplib import HTTPMessage + except: + from mimetools import Message + http_response_class = Message + else: + http_response_class = HTTPMessage + + def __init__(self, rfp_class=MechanizeRobotFileParser): + self.rfp_class = rfp_class + self.rfp = None + self._host = None + + def http_request(self, request): + scheme 
= request.get_type() + if scheme not in ["http", "https"]: + # robots exclusion only applies to HTTP + return request + + if request.get_selector() == "/robots.txt": + # /robots.txt is always OK to fetch + return request + + host = request.get_host() + + # robots.txt requests don't need to be allowed by robots.txt :-) + origin_req = getattr(request, "_origin_req", None) + if (origin_req is not None and + origin_req.get_selector() == "/robots.txt" and + origin_req.get_host() == host + ): + return request + + if host != self._host: + self.rfp = self.rfp_class() + try: + self.rfp.set_opener(self.parent) + except AttributeError: + debug("%r instance does not support set_opener" % + self.rfp.__class__) + self.rfp.set_url(scheme+"://"+host+"/robots.txt") + self.rfp.set_timeout(request.timeout) + self.rfp.read() + self._host = host + + ua = request.get_header("User-agent", "") + if self.rfp.can_fetch(ua, request.get_full_url()): + return request + else: + # XXX This should really have raised URLError. Too late now... + msg = "request disallowed by robots.txt" + raise RobotExclusionError( + request, + request.get_full_url(), + 403, msg, + self.http_response_class(StringIO()), StringIO(msg)) + + https_request = http_request + +class HTTPRefererProcessor(BaseHandler): + """Add Referer header to requests. + + This only makes sense if you use each RefererProcessor for a single + chain of requests only (so, for example, if you use a single + HTTPRefererProcessor to fetch a series of URLs extracted from a single + page, this will break). + + There's a proper implementation of this in mechanize.Browser. 
+ + """ + def __init__(self): + self.referer = None + + def http_request(self, request): + if ((self.referer is not None) and + not request.has_header("Referer")): + request.add_unredirected_header("Referer", self.referer) + return request + + def http_response(self, request, response): + self.referer = response.geturl() + return response + + https_request = http_request + https_response = http_response + + +def clean_refresh_url(url): + # e.g. Firefox 1.5 does (something like) this + if ((url.startswith('"') and url.endswith('"')) or + (url.startswith("'") and url.endswith("'"))): + url = url[1:-1] + return _rfc3986.clean_url(url, "latin-1") # XXX encoding + +def parse_refresh_header(refresh): + """ + >>> parse_refresh_header("1; url=http://example.com/") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1; url='http://example.com/'") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1") + (1.0, None) + >>> parse_refresh_header("blah") # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ValueError: invalid literal for float(): blah + + """ + + ii = refresh.find(";") + if ii != -1: + pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:] + jj = newurl_spec.find("=") + key = None + if jj != -1: + key, newurl = newurl_spec[:jj], newurl_spec[jj+1:] + newurl = clean_refresh_url(newurl) + if key is None or key.strip().lower() != "url": + raise ValueError() + else: + pause, newurl = float(refresh), None + return pause, newurl + +class HTTPRefreshProcessor(BaseHandler): + """Perform HTTP Refresh redirections. + + Note that if a non-200 HTTP code has occurred (for example, a 30x + redirect), this processor will do nothing. + + By default, only zero-time Refresh headers are redirected. Use the + max_time attribute / constructor argument to allow Refresh with longer + pauses. 
Use the honor_time attribute / constructor argument to control + whether the requested pause is honoured (with a time.sleep()) or + skipped in favour of immediate redirection. + + Public attributes: + + max_time: see above + honor_time: see above + + """ + handler_order = 1000 + + def __init__(self, max_time=0, honor_time=True): + self.max_time = max_time + self.honor_time = honor_time + self._sleep = time.sleep + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code == 200 and hdrs.has_key("refresh"): + refresh = hdrs.getheaders("refresh")[0] + try: + pause, newurl = parse_refresh_header(refresh) + except ValueError: + debug("bad Refresh header: %r" % refresh) + return response + + if newurl is None: + newurl = response.geturl() + if (self.max_time is None) or (pause <= self.max_time): + if pause > 1E-3 and self.honor_time: + self._sleep(pause) + hdrs["location"] = newurl + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, + "refresh", msg, hdrs) + else: + debug("Refresh header ignored: %r" % refresh) + + return response + + https_response = http_response diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_lwpcookiejar.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_lwpcookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..f8d49cf2d4af25f239ec7c96863de1adcc36831b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_lwpcookiejar.py @@ -0,0 +1,185 @@ +"""Load / save to libwww-perl (LWP) format files. + +Actually, the format is slightly extended from that used by LWP's +(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information +not recorded by LWP. + +It uses the version string "2.0", though really there isn't an LWP Cookies +2.0 format. This indicates that there is extra information in here +(domain_dot and port_spec) while still being compatible with libwww-perl, +I hope. 
+ +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import time, re, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +from _headersutil import join_header_words, split_header_words +from _util import iso2time, time2isoz + +debug = logging.getLogger("mechanize").debug + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in an the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. + + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + if cookie.rfc2109: h.append(("rfc2109", None)) + + keys = cookie.nonstandard_attr_keys() + keys.sort() + for k in keys: + h.append((k, str(cookie.get_nonstandard_attr(k)))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +class LWPCookieJar(FileCookieJar): + """ + The LWPCookieJar saves a sequence of"Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl libary, not known + to be compatible with any browser, but which is easy to read and + doesn't lose information about RFC 2965 cookies. 
+ + Additional methods + + as_lwp_str(ignore_discard=True, ignore_expired=True) + + """ + + magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + + def as_lwp_str(self, ignore_discard=True, ignore_expires=True): + """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. + + ignore_discard and ignore_expires: see docstring for FileCookieJar.save + + """ + now = time.time() + r = [] + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return "\n".join(r+[""]) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving LWP cookies file") + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. 
+ f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise LoadError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard", "rfc2109") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest, + h("rfc2109"), + ) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise LoadError("invalid Set-Cookie3 format file %s" % filename) + diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_markupbase.py 
b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_markupbase.py new file mode 100644 index 0000000000000000000000000000000000000000..ae9c2a875f22b2dc5a8cd879668a82c1854dfc48 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_markupbase.py @@ -0,0 +1,393 @@ +# Taken from Python 2.6.4 for use by _sgmllib.py +"""Shared support for scanning document type declarations in HTML and XHTML. + +This module is used as a foundation for the HTMLParser and sgmllib +modules (indirectly, for htmllib as well). It has no documented +public API and should not be used directly. + +""" + +import re + +_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match +_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match +_commentclose = re.compile(r'--\s*>') +_markedsectionclose = re.compile(r']\s*]\s*>') + +# An analysis of the MS-Word extensions is available at +# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf + +_msmarkedsectionclose = re.compile(r']\s*>') + +del re + + +class ParserBase: + """Parser base class which provides some common support methods used + by the SGML/HTML and XHTML parsers.""" + + def __init__(self): + if self.__class__ is ParserBase: + raise RuntimeError( + "markupbase.ParserBase must be subclassed") + + def error(self, message): + raise NotImplementedError( + "subclasses of ParserBase must override error()") + + def reset(self): + self.lineno = 1 + self.offset = 0 + + def getpos(self): + """Return current line number and offset.""" + return self.lineno, self.offset + + # Internal -- update line number and offset. This should be + # called for each piece of data exactly once, in order -- in other + # words the concatenation of all the input strings to this + # function should be exactly the entire input. 
+ def updatepos(self, i, j): + if i >= j: + return j + rawdata = self.rawdata + nlines = rawdata.count("\n", i, j) + if nlines: + self.lineno = self.lineno + nlines + pos = rawdata.rindex("\n", i, j) # Should not fail + self.offset = j-(pos+1) + else: + self.offset = self.offset + j-i + return j + + _decl_otherchars = '' + + # Internal -- parse declaration (for use by subclasses). + def parse_declaration(self, i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + # ISO 8879:1986, however, has more complex + # declaration syntax for elements in <!...>, including: + # --comment-- + # [marked section] + # name in the following list: ENTITY, DOCTYPE, ELEMENT, + # ATTLIST, NOTATION, SHORTREF, USEMAP, + # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM + rawdata = self.rawdata + j = i + 2 + assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" + if rawdata[j:j+1] == ">": + # the empty comment <!> + return j + 1 + if rawdata[j:j+1] in ("-", ""): + # Start of comment followed by buffer boundary, + # or just a buffer boundary. + return -1 + # A simple, practical version could look like: ((name|stringlit) S*) + '>' + n = len(rawdata) + if rawdata[j:j+2] == '--': #comment + # Locate --.*-- as the body of the comment + return self.parse_comment(i) + elif rawdata[j] == '[': #marked section + # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section + # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA + # Note that this is extended by Microsoft Office "Save as Web" function + # to include [if...] and [endif]. 
+ return self.parse_marked_section(i) + else: #all other declaration elements + decltype, j = self._scan_name(j, i) + if j < 0: + return j + if decltype == "doctype": + self._decl_otherchars = '' + while j < n: + c = rawdata[j] + if c == ">": + # end of declaration syntax + data = rawdata[i+2:j] + if decltype == "doctype": + self.handle_decl(data) + else: + self.unknown_decl(data) + return j + 1 + if c in "\"'": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 # incomplete + j = m.end() + elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + name, j = self._scan_name(j, i) + elif c in self._decl_otherchars: + j = j + 1 + elif c == "[": + # this could be handled in a separate doctype parser + if decltype == "doctype": + j = self._parse_doctype_subset(j + 1, i) + elif decltype in ("attlist", "linktype", "link", "element"): + # must tolerate []'d groups in a content model in an element declaration + # also in data attribute specifications of attlist declaration + # also link type declaration subsets in linktype declarations + # also link attribute specification lists in link declarations + self.error("unsupported '[' char in %s declaration" % decltype) + else: + self.error("unexpected '[' char in declaration") + else: + self.error( + "unexpected %r char in declaration" % rawdata[j]) + if j < 0: + return j + return -1 # incomplete + + # Internal -- parse a marked section + # Override this to handle MS-word extension syntax <![if word]>content<![endif]> + def parse_marked_section(self, i, report=1): + rawdata= self.rawdata + assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" + sectName, j = self._scan_name( i+3, i ) + if j < 0: + return j + if sectName in ("temp", "cdata", "ignore", "include", "rcdata"): + # look for standard ]]> ending + match= _markedsectionclose.search(rawdata, i+3) + elif sectName in ("if", "else", "endif"): + # look for MS Office ]> ending + match= _msmarkedsectionclose.search(rawdata, i+3) + 
else: + self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) + if not match: + return -1 + if report: + j = match.start(0) + self.unknown_decl(rawdata[i+3: j]) + return match.end(0) + + # Internal -- parse comment, return length or -1 if not terminated + def parse_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+4] != '<!--': + self.error('unexpected call to parse_comment()') + match = _commentclose.search(rawdata, i+4) + if not match: + return -1 + if report: + j = match.start(0) + self.handle_comment(rawdata[i+4: j]) + return match.end(0) + + # Internal -- scan past the internal subset in a <!DOCTYPE declaration, + # returning the index just past any whitespace following the trailing ']'. + def _parse_doctype_subset(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + j = i + while j < n: + c = rawdata[j] + if c == "<": + s = rawdata[j:j+2] + if s == "<": + # end of buffer; incomplete + return -1 + if s != "<!": + self.updatepos(declstartpos, j + 1) + self.error("unexpected char in internal subset (in %r)" % s) + if (j + 2) == n: + # end of buffer; incomplete + return -1 + if (j + 4) > n: + # end of buffer; incomplete + return -1 + if rawdata[j:j+4] == "<!--": + j = self.parse_comment(j, report=0) + if j < 0: + return j + continue + name, j = self._scan_name(j + 2, declstartpos) + if j == -1: + return -1 + if name not in ("attlist", "element", "entity", "notation"): + self.updatepos(declstartpos, j + 2) + self.error( + "unknown declaration %r in internal subset" % name) + # handle the individual names + meth = getattr(self, "_parse_doctype_" + name) + j = meth(j, declstartpos) + if j < 0: + return j + elif c == "%": + # parameter entity reference + if (j + 1) == n: + # end of buffer; incomplete + return -1 + s, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + if rawdata[j] == ";": + j = j + 1 + elif c == "]": + j = j + 1 + while j < n and rawdata[j].isspace(): + j = j + 1 + if j < n: + 
if rawdata[j] == ">": + return j + self.updatepos(declstartpos, j) + self.error("unexpected char after internal subset") + else: + return -1 + elif c.isspace(): + j = j + 1 + else: + self.updatepos(declstartpos, j) + self.error("unexpected char %r in internal subset" % c) + # end of buffer reached + return -1 + + # Internal -- scan past <!ELEMENT declarations + def _parse_doctype_element(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j == -1: + return -1 + # style content model; just skip until '>' + rawdata = self.rawdata + if '>' in rawdata[j:]: + return rawdata.find(">", j) + 1 + return -1 + + # Internal -- scan past <!ATTLIST declarations + def _parse_doctype_attlist(self, i, declstartpos): + rawdata = self.rawdata + name, j = self._scan_name(i, declstartpos) + c = rawdata[j:j+1] + if c == "": + return -1 + if c == ">": + return j + 1 + while 1: + # scan a series of attribute descriptions; simplified: + # name type [value] [#constraint] + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if c == "": + return -1 + if c == "(": + # an enumerated type; look for ')' + if ")" in rawdata[j:]: + j = rawdata.find(")", j) + 1 + else: + return -1 + while rawdata[j:j+1].isspace(): + j = j + 1 + if not rawdata[j:]: + # end of buffer, incomplete + return -1 + else: + name, j = self._scan_name(j, declstartpos) + c = rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 + c = rawdata[j:j+1] + if not c: + return -1 + if c == "#": + if rawdata[j:] == "#": + # end of buffer + return -1 + name, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if not c: + return -1 + if c == '>': + # all done + return j + 1 + + # Internal -- scan past <!NOTATION declarations + def _parse_doctype_notation(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j < 0: + return j + rawdata = 
self.rawdata + while 1: + c = rawdata[j:j+1] + if not c: + # end of buffer; incomplete + return -1 + if c == '>': + return j + 1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 + j = m.end() + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan past <!ENTITY declarations + def _parse_doctype_entity(self, i, declstartpos): + rawdata = self.rawdata + if rawdata[i:i+1] == "%": + j = i + 1 + while 1: + c = rawdata[j:j+1] + if not c: + return -1 + if c.isspace(): + j = j + 1 + else: + break + else: + j = i + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + while 1: + c = self.rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 # incomplete + elif c == ">": + return j + 1 + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan a name token and the new position and the token, or + # return -1 if we've reached the end of the buffer. + def _scan_name(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + if i == n: + return None, -1 + m = _declname_match(rawdata, i) + if m: + s = m.group() + name = s.strip() + if (i + len(s)) == n: + return None, -1 # end of buffer + return name.lower(), m.end() + else: + self.updatepos(declstartpos, i) + self.error("expected name token at %r" + % rawdata[declstartpos:declstartpos+20]) + + # To be overridden -- handlers for unknown objects + def unknown_decl(self, data): + pass diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mechanize.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mechanize.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce71a6cea102cb919dec37c4ade52a0ac1bce5f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mechanize.py @@ -0,0 +1,669 @@ +"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. + +Copyright 2003-2006 John J. 
Lee <jjl@pobox.com> +Copyright 2003 Andy Lester (original Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import copy, re, os, urllib, urllib2 + +from _html import DefaultFactory +import _response +import _request +import _rfc3986 +import _sockettimeout +import _urllib2_fork +from _useragent import UserAgentBase + +class BrowserStateError(Exception): pass +class LinkNotFoundError(Exception): pass +class FormNotFoundError(Exception): pass + + +def sanepathname2url(path): + urlpath = urllib.pathname2url(path) + if os.name == "nt" and urlpath.startswith("///"): + urlpath = urlpath[2:] + # XXX don't ask me about the mac... + return urlpath + + +class History: + """ + + Though this will become public, the implied interface is not yet stable. + + """ + def __init__(self): + self._history = [] # LIFO + def add(self, request, response): + self._history.append((request, response)) + def back(self, n, _response): + response = _response # XXX move Browser._response into this class? + while n > 0 or response is None: + try: + request, response = self._history.pop() + except IndexError: + raise BrowserStateError("already at start of history") + n -= 1 + return request, response + def clear(self): + del self._history[:] + def close(self): + for request, response in self._history: + if response is not None: + response.close() + del self._history[:] + + +class HTTPRefererProcessor(_urllib2_fork.BaseHandler): + def http_request(self, request): + # See RFC 2616 14.36. The only times we know the source of the + # request URI has a URI associated with it are redirect, and + # Browser.click() / Browser.submit() / Browser.follow_link(). + # Otherwise, it's the user's job to add any Referer header before + # .open()ing. 
+ if hasattr(request, "redirect_dict"): + request = self.parent._add_referer_header( + request, origin_request=False) + return request + + https_request = http_request + + +class Browser(UserAgentBase): + """Browser-like class with support for history, forms and links. + + BrowserStateError is raised whenever the browser is in the wrong state to + complete the requested operation - e.g., when .back() is called when the + browser history is empty, or when .follow_link() is called when the current + response does not contain HTML data. + + Public attributes: + + request: current request (mechanize.Request) + form: currently selected form (see .select_form()) + + """ + + handler_classes = copy.copy(UserAgentBase.handler_classes) + handler_classes["_referer"] = HTTPRefererProcessor + default_features = copy.copy(UserAgentBase.default_features) + default_features.append("_referer") + + def __init__(self, + factory=None, + history=None, + request_class=None, + ): + """ + + Only named arguments should be passed to this constructor. + + factory: object implementing the mechanize.Factory interface. + history: object implementing the mechanize.History interface. Note + this interface is still experimental and may change in future. + request_class: Request class to use. Defaults to mechanize.Request + + The Factory and History objects passed in are 'owned' by the Browser, + so they should not be shared across Browsers. In particular, + factory.set_response() should not be called except by the owning + Browser itself. + + Note that the supplied factory's request_class is overridden by this + constructor, to ensure only one Request class is used. 
+ + """ + self._handle_referer = True + + if history is None: + history = History() + self._history = history + + if request_class is None: + request_class = _request.Request + + if factory is None: + factory = DefaultFactory() + factory.set_request_class(request_class) + self._factory = factory + self.request_class = request_class + + self.request = None + self._set_response(None, False) + + # do this last to avoid __getattr__ problems + UserAgentBase.__init__(self) + + def close(self): + UserAgentBase.close(self) + if self._response is not None: + self._response.close() + if self._history is not None: + self._history.close() + self._history = None + + # make use after .close easy to spot + self.form = None + self.request = self._response = None + self.request = self.response = self.set_response = None + self.geturl = self.reload = self.back = None + self.clear_history = self.set_cookie = self.links = self.forms = None + self.viewing_html = self.encoding = self.title = None + self.select_form = self.click = self.submit = self.click_link = None + self.follow_link = self.find_link = None + + def set_handle_referer(self, handle): + """Set whether to add Referer header to each request.""" + self._set_handler("_referer", handle) + self._handle_referer = bool(handle) + + def _add_referer_header(self, request, origin_request=True): + if self.request is None: + return request + scheme = request.get_type() + original_scheme = self.request.get_type() + if scheme not in ["http", "https"]: + return request + if not origin_request and not self.request.has_header("Referer"): + return request + + if (self._handle_referer and + original_scheme in ["http", "https"] and + not (original_scheme == "https" and scheme != "https")): + # strip URL fragment (RFC 2616 14.36) + parts = _rfc3986.urlsplit(self.request.get_full_url()) + parts = parts[:-1]+(None,) + referer = _rfc3986.urlunsplit(parts) + request.add_unredirected_header("Referer", referer) + return request + + def 
open_novisit(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + """Open a URL without visiting it. + + Browser state (including request, response, history, forms and links) + is left unchanged by calling this function. + + The interface is the same as for .open(). + + This is useful for things like fetching images. + + See also .retrieve(). + + """ + return self._mech_open(url, data, visit=False, timeout=timeout) + + def open(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return self._mech_open(url, data, timeout=timeout) + + def _mech_open(self, url, data=None, update_history=True, visit=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + try: + url.get_full_url + except AttributeError: + # string URL -- convert to absolute URL if required + scheme, authority = _rfc3986.urlsplit(url)[:2] + if scheme is None: + # relative URL + if self._response is None: + raise BrowserStateError( + "can't fetch relative reference: " + "not viewing any document") + url = _rfc3986.urljoin(self._response.geturl(), url) + + request = self._request(url, data, visit, timeout) + visit = request.visit + if visit is None: + visit = True + + if visit: + self._visit_request(request, update_history) + + success = True + try: + response = UserAgentBase.open(self, request, data) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error +## except (IOError, socket.error, OSError), error: +## # Yes, urllib2 really does raise all these :-(( +## # See test_urllib2.py for examples of socket.gaierror and OSError, +## # plus note that FTPHandler raises IOError. +## # XXX I don't seem to have an example of exactly socket.error being +## # raised, only socket.gaierror... +## # I don't want to start fixing these here, though, since this is a +## # subclass of OpenerDirector, and it would break old code. Even in +## # Python core, a fix would need some backwards-compat. 
hack to be +## # acceptable. +## raise + + if visit: + self._set_response(response, False) + response = copy.copy(self._response) + elif response is not None: + response = _response.upgrade_response(response) + + if not success: + raise response + return response + + def __str__(self): + text = [] + text.append("<%s " % self.__class__.__name__) + if self._response: + text.append("visiting %s" % self._response.geturl()) + else: + text.append("(not visiting a URL)") + if self.form: + text.append("\n selected form:\n %s\n" % str(self.form)) + text.append(">") + return "".join(text) + + def response(self): + """Return a copy of the current response. + + The returned object has the same interface as the object returned by + .open() (or mechanize.urlopen()). + + """ + return copy.copy(self._response) + + def open_local_file(self, filename): + path = sanepathname2url(os.path.abspath(filename)) + url = 'file://'+path + return self.open(url) + + def set_response(self, response): + """Replace current response with (a copy of) response. + + response may be None. + + This is intended mostly for HTML-preprocessing. + """ + self._set_response(response, True) + + def _set_response(self, response, close_current): + # sanity check, necessary but far from sufficient + if not (response is None or + (hasattr(response, "info") and hasattr(response, "geturl") and + hasattr(response, "read") + ) + ): + raise ValueError("not a response object") + + self.form = None + if response is not None: + response = _response.upgrade_response(response) + if close_current and self._response is not None: + self._response.close() + self._response = response + self._factory.set_response(response) + + def visit_response(self, response, request=None): + """Visit the response, as if it had been .open()ed. + + Unlike .set_response(), this updates history rather than replacing the + current response. 
+ """ + if request is None: + request = _request.Request(response.geturl()) + self._visit_request(request, True) + self._set_response(response, False) + + def _visit_request(self, request, update_history): + if self._response is not None: + self._response.close() + if self.request is not None and update_history: + self._history.add(self.request, self._response) + self._response = None + # we want self.request to be assigned even if UserAgentBase.open + # fails + self.request = request + + def geturl(self): + """Get URL of current document.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._response.geturl() + + def reload(self): + """Reload current document, and return response object.""" + if self.request is None: + raise BrowserStateError("no URL has yet been .open()ed") + if self._response is not None: + self._response.close() + return self._mech_open(self.request, update_history=False) + + def back(self, n=1): + """Go back n steps in history, and return response object. + + n: go back this number of steps (default 1 step) + + """ + if self._response is not None: + self._response.close() + self.request, response = self._history.back(n, self._response) + self.set_response(response) + if not response.read_complete: + return self.reload() + return copy.copy(response) + + def clear_history(self): + self._history.clear() + + def set_cookie(self, cookie_string): + """Request to set a cookie. + + Note that it is NOT necessary to call this method under ordinary + circumstances: cookie handling is normally entirely automatic. The + intended use case is rather to simulate the setting of a cookie by + client script in a web page (e.g. JavaScript). In that case, use of + this method is necessary because mechanize currently does not support + JavaScript, VBScript, etc. + + The cookie is added in the same way as if it had arrived with the + current response, as a result of the current request. 
This means that, + for example, if it is not appropriate to set the cookie based on the + current request, no cookie will be set. + + The cookie will be returned automatically with subsequent responses + made by the Browser instance whenever that's appropriate. + + cookie_string should be a valid value of the Set-Cookie header. + + For example: + + browser.set_cookie( + "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") + + Currently, this method does not allow for adding RFC 2986 cookies. + This limitation will be lifted if anybody requests it. + + """ + if self._response is None: + raise BrowserStateError("not viewing any document") + if self.request.get_type() not in ["http", "https"]: + raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " + "transactions") + cookiejar = self._ua_handlers["_cookies"].cookiejar + response = self.response() # copy + headers = response.info() + headers["Set-cookie"] = cookie_string + cookiejar.extract_cookies(response, self.request) + + def links(self, **kwds): + """Return iterable over links (mechanize.Link objects).""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + links = self._factory.links() + if kwds: + return self._filter_links(links, **kwds) + else: + return links + + def forms(self): + """Return iterable over forms. + + The returned form objects implement the mechanize.HTMLForm interface. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.forms() + + def global_form(self): + """Return the global form object, or None if the factory implementation + did not supply one. + + The "global" form object contains all controls that are not descendants + of any FORM element. + + The returned form object implements the mechanize.HTMLForm interface. + + This is a separate method since the global form is not regarded as part + of the sequence of forms in the document -- mostly for + backwards-compatibility. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.global_form + + def viewing_html(self): + """Return whether the current response contains HTML data.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.is_html + + def encoding(self): + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.encoding + + def title(self): + r"""Return title, or None if there is no title element in the document. + + Treatment of any tag children of attempts to follow Firefox and IE + (currently, tags are preserved). + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.title + + def select_form(self, name=None, predicate=None, nr=None): + """Select an HTML form for input. + + This is a bit like giving a form the "input focus" in a browser. + + If a form is selected, the Browser object supports the HTMLForm + interface, so you can call methods like .set_value(), .set(), and + .click(). + + Another way to select a form is to assign to the .form attribute. The + form assigned should be one of the objects returned by the .forms() + method. + + At least one of the name, predicate and nr arguments must be supplied. + If no matching form is found, mechanize.FormNotFoundError is raised. + + If name is specified, then the form must have the indicated name. + + If predicate is specified, then the form must match that function. The + predicate function is passed the HTMLForm as its single argument, and + should return a boolean value indicating whether the form matched. + + nr, if supplied, is the sequence number of the form (where 0 is the + first). Note that control 0 is the first form matching all the other + arguments (if supplied); it is not necessarily the first control in the + form. 
The "global form" (consisting of all form controls not contained + in any FORM element) is considered not to be part of this sequence and + to have no name, so will not be matched unless both name and nr are + None. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if (name is None) and (predicate is None) and (nr is None): + raise ValueError( + "at least one argument must be supplied to specify form") + + global_form = self._factory.global_form + if nr is None and name is None and \ + predicate is not None and predicate(global_form): + self.form = global_form + return + + orig_nr = nr + for form in self.forms(): + if name is not None and name != form.name: + continue + if predicate is not None and not predicate(form): + continue + if nr: + nr -= 1 + continue + self.form = form + break # success + else: + # failure + description = [] + if name is not None: description.append("name '%s'" % name) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr is not None: description.append("nr %d" % orig_nr) + description = ", ".join(description) + raise FormNotFoundError("no form matching "+description) + + def click(self, *args, **kwds): + """See mechanize.HTMLForm.click for documentation.""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + request = self.form.click(*args, **kwds) + return self._add_referer_header(request) + + def submit(self, *args, **kwds): + """Submit current form. + + Arguments are as for mechanize.HTMLForm.click(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click(*args, **kwds)) + + def click_link(self, link=None, **kwds): + """Find a link and return a Request object for it. + + Arguments are as for .find_link(), except that a link may be supplied + as the first argument. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if not link: + link = self.find_link(**kwds) + else: + if kwds: + raise ValueError( + "either pass a Link, or keyword arguments, not both") + request = self.request_class(link.absolute_url) + return self._add_referer_header(request) + + def follow_link(self, link=None, **kwds): + """Find a link and .open() it. + + Arguments are as for .click_link(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click_link(link, **kwds)) + + def find_link(self, **kwds): + """Find a link in current page. + + Links are returned as mechanize.Link objects. + + # Return third link that .search()-matches the regexp "python" + # (by ".search()-matches", I mean that the regular expression method + # .search() is used, rather than .match()). + find_link(text_regex=re.compile("python"), nr=2) + + # Return first http link in the current page that points to somewhere + # on python.org whose link text (after tags have been removed) is + # exactly "monty python". + find_link(text="monty python", + url_regex=re.compile("http.*python.org")) + + # Return first link with exactly three HTML attributes. + find_link(predicate=lambda link: len(link.attrs) == 3) + + Links include anchors (<a>), image maps (<area>), and frames (<frame>, + <iframe>). + + All arguments must be passed by keyword, not position. Zero or more + arguments may be supplied. In order to find a link, all arguments + supplied must match. + + If a matching link is not found, mechanize.LinkNotFoundError is raised. + + text: link text between link tags: e.g. <a href="blah">this bit</a> (as + returned by pullparser.get_compressed_text(), ie. 
without tags but + with opening tags "textified" as per the pullparser docs) must compare + equal to this argument, if supplied + text_regex: link text between tag (as defined above) must match the + regular expression object or regular expression string passed as this + argument, if supplied + name, name_regex: as for text and text_regex, but matched against the + name HTML attribute of the link tag + url, url_regex: as for text and text_regex, but matched against the + URL of the link tag (note this matches against Link.url, which is a + relative or absolute URL according to how it was written in the HTML) + tag: element name of opening tag, e.g. "a" + predicate: a function taking a Link object as its single argument, + returning a boolean result, indicating whether the links + nr: matches the nth link that matches all other criteria (default 0) + + """ + try: + return self._filter_links(self._factory.links(), **kwds).next() + except StopIteration: + raise LinkNotFoundError() + + def __getattr__(self, name): + # pass through _form.HTMLForm methods and attributes + form = self.__dict__.get("form") + if form is None: + raise AttributeError( + "%s instance has no attribute %s (perhaps you forgot to " + ".select_form()?)" % (self.__class__, name)) + return getattr(form, name) + + def _filter_links(self, links, + text=None, text_regex=None, + name=None, name_regex=None, + url=None, url_regex=None, + tag=None, + predicate=None, + nr=0 + ): + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + + orig_nr = nr + + for link in links: + if url is not None and url != link.url: + continue + if url_regex is not None and not re.search(url_regex, link.url): + continue + if (text is not None and + (link.text is None or text != link.text)): + continue + if (text_regex is not None and + (link.text is None or not re.search(text_regex, link.text))): + continue + if name is not None and name != dict(link.attrs).get("name"): + continue + if name_regex is not 
None: + link_name = dict(link.attrs).get("name") + if link_name is None or not re.search(name_regex, link_name): + continue + if tag is not None and tag != link.tag: + continue + if predicate is not None and not predicate(link): + continue + if nr: + nr -= 1 + continue + yield link + nr = orig_nr diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mozillacookiejar.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mozillacookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..51e81bb62d414d2336e7a801ead9402de965955b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_mozillacookiejar.py @@ -0,0 +1,161 @@ +"""Mozilla / Netscape cookie loading / saving. + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import re, time, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +debug = logging.getLogger("ClientCookie").debug + + +class MozillaCookieJar(FileCookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Mozilla/Netscape + `cookies.txt' format. lynx uses this file format, too. + + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). + + Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to + Netscape cookies on saving. 
+ + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). + + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = "#( Netscape)? HTTP Cookie File" + header = """\ + # Netscape HTTP Cookie File + # http://www.netscape.com/newsref/std/cookie_spec.html + # This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not re.search(self.magic_re, magic): + f.close() + raise LoadError( + "%s does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if line.endswith("\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? 
+ if (line.strip().startswith("#") or + line.strip().startswith("$") or + line.strip() == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t", 6) + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + if domain_specified != initial_dot: + raise LoadError("domain and domain specified flag don't " + "match in %s: %s" % (filename, line)) + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except: + reraise_unmasked_exceptions((IOError, LoadError)) + raise LoadError("invalid Netscape format file %s: %s" % + (filename, line)) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving Netscape cookies.txt file") + f.write(self.header) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if cookie.domain.startswith("."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas cookielib regards it as a + # cookie with 
no value. + name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + "\t".join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value])+ + "\n") + finally: + f.close() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_msiecookiejar.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_msiecookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..8af11c0e4ceae55d2c7394a92effc5f551c392f7 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_msiecookiejar.py @@ -0,0 +1,388 @@ +"""Microsoft Internet Explorer cookie loading on Windows. + +Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code) +Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +# XXX names and comments are not great here + +import os, re, time, struct, logging +if os.name == "nt": + import _winreg + +from _clientcookie import FileCookieJar, CookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError + +debug = logging.getLogger("mechanize").debug + + +def regload(path, leaf): + key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0, + _winreg.KEY_ALL_ACCESS) + try: + value = _winreg.QueryValueEx(key, leaf)[0] + except WindowsError: + value = None + return value + +WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME + +def epoch_time_offset_from_win32_filetime(filetime): + """Convert from win32 filetime to seconds-since-epoch value. + + MSIE stores create and expire times as Win32 FILETIME, which is 64 + bits of 100 nanosecond intervals since Jan 01 1601. + + mechanize expects time in 32-bit value expressed in seconds since the + epoch (Jan 01 1970). 
+ + """ + if filetime < WIN32_EPOCH: + raise ValueError("filetime (%d) is before epoch (%d)" % + (filetime, WIN32_EPOCH)) + + return divmod((filetime - WIN32_EPOCH), 10000000L)[0] + +def binary_to_char(c): return "%02X" % ord(c) +def binary_to_str(d): return "".join(map(binary_to_char, list(d))) + +class MSIEBase: + magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*") + padding = "\x0d\xf0\xad\x0b" + + msie_domain_re = re.compile(r"^([^/]+)(/.*)$") + cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?" + "(.+\@[\x21-\xFF]+\.txt)") + + # path under HKEY_CURRENT_USER from which to get location of index.dat + reg_path = r"software\microsoft\windows" \ + r"\currentversion\explorer\shell folders" + reg_key = "Cookies" + + def __init__(self): + self._delayload_domains = {} + + def _delayload_domain(self, domain): + # if necessary, lazily load cookies for this domain + delayload_info = self._delayload_domains.get(domain) + if delayload_info is not None: + cookie_file, ignore_discard, ignore_expires = delayload_info + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", cookie_file) + else: + del self._delayload_domains[domain] + + def _load_cookies_from_file(self, filename): + debug("Loading MSIE cookies file: %s", filename) + cookies = [] + + cookies_fh = open(filename) + + try: + while 1: + key = cookies_fh.readline() + if key == "": break + + rl = cookies_fh.readline + def getlong(rl=rl): return long(rl().rstrip()) + def getstr(rl=rl): return rl().rstrip() + + key = key.rstrip() + value = getstr() + domain_path = getstr() + flags = getlong() # 0x2000 bit is for secure I think + lo_expire = getlong() + hi_expire = getlong() + lo_create = getlong() + hi_create = getlong() + sep = getstr() + + if "" in (key, value, domain_path, flags, hi_expire, lo_expire, + hi_create, lo_create, sep) or (sep != "*"): + break + + m = self.msie_domain_re.search(domain_path) + if 
m: + domain = m.group(1) + path = m.group(2) + + cookies.append({"KEY": key, "VALUE": value, + "DOMAIN": domain, "PATH": path, + "FLAGS": flags, "HIXP": hi_expire, + "LOXP": lo_expire, "HICREATE": hi_create, + "LOCREATE": lo_create}) + finally: + cookies_fh.close() + + return cookies + + def load_cookie_data(self, filename, + ignore_discard=False, ignore_expires=False): + """Load cookies from file containing actual cookie data. + + Old cookies are kept unless overwritten by newly loaded ones. + + You should not call this method if the delayload attribute is set. + + I think each of these files contain all cookies for one user, domain, + and path. + + filename: file containing cookies -- usually found in a file like + C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt + + """ + now = int(time.time()) + + cookie_data = self._load_cookies_from_file(filename) + + for cookie in cookie_data: + flags = cookie["FLAGS"] + secure = ((flags & 0x2000) != 0) + filetime = (cookie["HIXP"] << 32) + cookie["LOXP"] + expires = epoch_time_offset_from_win32_filetime(filetime) + if expires < now: + discard = True + else: + discard = False + domain = cookie["DOMAIN"] + initial_dot = domain.startswith(".") + if initial_dot: + domain_specified = True + else: + # MSIE 5 does not record whether the domain cookie-attribute + # was specified. + # Assuming it wasn't is conservative, because with strict + # domain matching this will match less frequently; with regular + # Netscape tail-matching, this will match at exactly the same + # times that domain_specified = True would. It also means we + # don't have to prepend a dot to achieve consistency with our + # own & Mozilla's domain-munging scheme. + domain_specified = False + + # assume path_specified is false + # XXX is there other stuff in here? -- e.g. comment, commentURL? 
+ c = Cookie(0, + cookie["KEY"], cookie["VALUE"], + None, False, + domain, domain_specified, initial_dot, + cookie["PATH"], False, + secure, + expires, + discard, + None, + None, + {"flags": flags}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + CookieJar.set_cookie(self, c) + + def load_from_registry(self, ignore_discard=False, ignore_expires=False, + username=None): + """ + username: only required on win9x + + """ + cookies_dir = regload(self.reg_path, self.reg_key) + filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT")) + self.load(filename, ignore_discard, ignore_expires, username) + + def _really_load(self, index, filename, ignore_discard, ignore_expires, + username): + now = int(time.time()) + + if username is None: + username = os.environ['USERNAME'].lower() + + cookie_dir = os.path.dirname(filename) + + data = index.read(256) + if len(data) != 256: + raise LoadError("%s file is too short" % filename) + + # Cookies' index.dat file starts with 32 bytes of signature + # followed by an offset to the first record, stored as a little- + # endian DWORD. + sig, size, data = data[:32], data[32:36], data[36:] + size = struct.unpack("<L", size)[0] + + # check that sig is valid + if not self.magic_re.match(sig) or size != 0x4000: + raise LoadError("%s ['%s' %s] does not seem to contain cookies" % + (str(filename), sig, size)) + + # skip to start of first record + index.seek(size, 0) + + sector = 128 # size of sector in bytes + + while 1: + data = "" + + # Cookies are usually in two contiguous sectors, so read in two + # sectors and adjust if not a Cookie. + to_read = 2 * sector + d = index.read(to_read) + if len(d) != to_read: + break + data = data + d + + # Each record starts with a 4-byte signature and a count + # (little-endian DWORD) of sectors for the record. 
+ sig, size, data = data[:4], data[4:8], data[8:] + size = struct.unpack("<L", size)[0] + + to_read = (size - 2) * sector + +## from urllib import quote +## print "data", quote(data) +## print "sig", quote(sig) +## print "size in sectors", size +## print "size in bytes", size*sector +## print "size in units of 16 bytes", (size*sector) / 16 +## print "size to read in bytes", to_read +## print + + if sig != "URL ": + assert sig in ("HASH", "LEAK", \ + self.padding, "\x00\x00\x00\x00"), \ + "unrecognized MSIE index.dat record: %s" % \ + binary_to_str(sig) + if sig == "\x00\x00\x00\x00": + # assume we've got all the cookies, and stop + break + if sig == self.padding: + continue + # skip the rest of this record + assert to_read >= 0 + if size != 2: + assert to_read != 0 + index.seek(to_read, 1) + continue + + # read in rest of record if necessary + if size > 2: + more_data = index.read(to_read) + if len(more_data) != to_read: break + data = data + more_data + + cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username + + "(%s\@[\x21-\xFF]+\.txt)" % username) + m = re.search(cookie_re, data, re.I) + if m: + cookie_file = os.path.join(cookie_dir, m.group(2)) + if not self.delayload: + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", + cookie_file) + else: + domain = m.group(1) + i = domain.find("/") + if i != -1: + domain = domain[:i] + + self._delayload_domains[domain] = ( + cookie_file, ignore_discard, ignore_expires) + + +class MSIECookieJar(MSIEBase, FileCookieJar): + """FileCookieJar that reads from the Windows MSIE cookies database. + + MSIECookieJar can read the cookie files of Microsoft Internet Explorer + (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and + Windows 98. Other configurations may also work, but are untested. Saving + cookies in MSIE format is NOT supported. 
If you save cookies, they'll be + in the usual Set-Cookie3 format, which you can read back in using an + instance of the plain old CookieJar class. Don't save using the same + filename that you loaded cookies from, because you may succeed in + clobbering your MSIE cookies index file! + + You should be able to have LWP share Internet Explorer's cookies like + this (note you need to supply a username to load_from_registry if you're on + Windows 9x or Windows ME): + + cj = MSIECookieJar(delayload=1) + # find cookies index file in registry and load cookies from it + cj.load_from_registry() + opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) + response = opener.open("http://example.com/") + + Iterating over a delayloaded MSIECookieJar instance will not cause any + cookies to be read from disk. To force reading of all cookies from disk, + call read_all_cookies. Note that the following methods iterate over self: + clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__ + and as_string. 
+ + Additional methods: + + load_from_registry(ignore_discard=False, ignore_expires=False, + username=None) + load_cookie_data(filename, ignore_discard=False, ignore_expires=False) + read_all_cookies() + + """ + def __init__(self, filename=None, delayload=False, policy=None): + MSIEBase.__init__(self) + FileCookieJar.__init__(self, filename, delayload, policy) + + def set_cookie(self, cookie): + if self.delayload: + self._delayload_domain(cookie.domain) + CookieJar.set_cookie(self, cookie) + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + domains = self._cookies.copy() + domains.update(self._delayload_domains) + domains = domains.keys() + + cookies = [] + for domain in domains: + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookies_for_domain(self, domain, request): + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + if self.delayload: + self._delayload_domain(domain) + return CookieJar._cookies_for_domain(self, domain, request) + + def read_all_cookies(self): + """Eagerly read in all cookies.""" + if self.delayload: + for domain in self._delayload_domains.keys(): + self._delayload_domain(domain) + + def load(self, filename, ignore_discard=False, ignore_expires=False, + username=None): + """Load cookies from an MSIE 'index.dat' cookies index file. 
+ + filename: full path to cookie index file + username: only required on win9x + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + index = open(filename, "rb") + + try: + self._really_load(index, filename, ignore_discard, ignore_expires, + username) + finally: + index.close() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_opener.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_opener.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8412d817e4e9149a1d4d2b8dc7aedc748a878b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_opener.py @@ -0,0 +1,442 @@ +"""URL opener. + +Copyright 2004-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import os, urllib2, bisect, httplib, types, tempfile +try: + import threading as _threading +except ImportError: + import dummy_threading as _threading +try: + set +except NameError: + import sets + set = sets.Set + +from _request import Request +import _response +import _rfc3986 +import _sockettimeout +import _urllib2_fork +from _util import isstringlike + +open_file = open + + +class ContentTooShortError(urllib2.URLError): + def __init__(self, reason, result): + urllib2.URLError.__init__(self, reason) + self.result = result + + +def set_request_attr(req, name, value, default): + try: + getattr(req, name) + except AttributeError: + setattr(req, name, default) + if value is not default: + setattr(req, name, value) + + +class OpenerDirector(_urllib2_fork.OpenerDirector): + def __init__(self): + _urllib2_fork.OpenerDirector.__init__(self) + # really none of these are (sanely) public -- the lack of initial + # underscore on some is just due to following urllib2 + self.process_response = {} + self.process_request = {} + self._any_request = {} + 
self._any_response = {} + self._handler_index_valid = True + self._tempfiles = [] + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + if handler in self.handlers: + return + # XXX why does self.handlers need to be sorted? + bisect.insort(self.handlers, handler) + handler.add_parent(self) + self._handler_index_valid = False + + def _maybe_reindex_handlers(self): + if self._handler_index_valid: + return + + handle_error = {} + handle_open = {} + process_request = {} + process_response = {} + any_request = set() + any_response = set() + unwanted = [] + + for handler in self.handlers: + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + if meth == "any_request": + any_request.add(handler) + added = True + continue + elif meth == "any_response": + any_response.add(handler) + added = True + continue + + ii = meth.find("_") + scheme = meth[:ii] + condition = meth[ii+1:] + + if condition.startswith("error"): + jj = meth[ii+1:].find("_") + ii + 1 + kind = meth[jj+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = handle_error.setdefault(scheme, {}) + elif condition == "open": + kind = scheme + lookup = handle_open + elif condition == "request": + kind = scheme + lookup = process_request + elif condition == "response": + kind = scheme + lookup = process_response + else: + continue + + lookup.setdefault(kind, set()).add(handler) + added = True + + if not added: + unwanted.append(handler) + + for handler in unwanted: + self.handlers.remove(handler) + + # sort indexed methods + # XXX could be cleaned up + for lookup in [process_request, process_response]: + for scheme, handlers in lookup.iteritems(): + lookup[scheme] = handlers + for scheme, lookup in handle_error.iteritems(): + for code, handlers in lookup.iteritems(): + handlers = list(handlers) + 
handlers.sort() + lookup[code] = handlers + for scheme, handlers in handle_open.iteritems(): + handlers = list(handlers) + handlers.sort() + handle_open[scheme] = handlers + + # cache the indexes + self.handle_error = handle_error + self.handle_open = handle_open + self.process_request = process_request + self.process_response = process_response + self._any_request = any_request + self._any_response = any_response + + def _request(self, url_or_req, data, visit, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + if isstringlike(url_or_req): + req = Request(url_or_req, data, visit=visit, timeout=timeout) + else: + # already a mechanize.Request instance + req = url_or_req + if data is not None: + req.add_data(data) + # XXX yuck + set_request_attr(req, "visit", visit, None) + set_request_attr(req, "timeout", timeout, + _sockettimeout._GLOBAL_DEFAULT_TIMEOUT) + return req + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + req = self._request(fullurl, data, None, timeout) + req_scheme = req.get_type() + + self._maybe_reindex_handlers() + + # pre-process request + # XXX should we allow a Processor to change the URL scheme + # of the request? + request_processors = set(self.process_request.get(req_scheme, [])) + request_processors.update(self._any_request) + request_processors = list(request_processors) + request_processors.sort() + for processor in request_processors: + for meth_name in ["any_request", req_scheme+"_request"]: + meth = getattr(processor, meth_name, None) + if meth: + req = meth(req) + + # In Python >= 2.4, .open() supports processors already, so we must + # call ._open() instead. 
+ urlopen = _urllib2_fork.OpenerDirector._open + response = urlopen(self, req, data) + + # post-process response + response_processors = set(self.process_response.get(req_scheme, [])) + response_processors.update(self._any_response) + response_processors = list(response_processors) + response_processors.sort() + for processor in response_processors: + for meth_name in ["any_response", req_scheme+"_response"]: + meth = getattr(processor, meth_name, None) + if meth: + response = meth(req, response) + + return response + + def error(self, proto, *args): + if proto in ['http', 'https']: + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = apply(self._call_chain, args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return apply(self._call_chain, args) + + BLOCK_SIZE = 1024*8 + def retrieve(self, fullurl, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT, + open=open_file): + """Returns (filename, headers). + + For remote objects, the default filename will refer to a temporary + file. Temporary files are removed when the OpenerDirector.close() + method is called. + + For file: URLs, at present the returned filename is None. This may + change in future. + + If the actual number of bytes read is less than indicated by the + Content-Length header, raises ContentTooShortError (a URLError + subclass). The exception's .result attribute contains the (filename, + headers) that would have been returned. 
+ + """ + req = self._request(fullurl, data, False, timeout) + scheme = req.get_type() + fp = self.open(req) + try: + headers = fp.info() + if filename is None and scheme == 'file': + # XXX req.get_selector() seems broken here, return None, + # pending sanity :-/ + return None, headers + #return urllib.url2pathname(req.get_selector()), headers + if filename: + tfp = open(filename, 'wb') + else: + path = _rfc3986.urlsplit(req.get_full_url())[2] + suffix = os.path.splitext(path)[1] + fd, filename = tempfile.mkstemp(suffix) + self._tempfiles.append(filename) + tfp = os.fdopen(fd, 'wb') + try: + result = filename, headers + bs = self.BLOCK_SIZE + size = -1 + read = 0 + blocknum = 0 + if reporthook: + if "content-length" in headers: + size = int(headers["Content-Length"]) + reporthook(blocknum, bs, size) + while 1: + block = fp.read(bs) + if block == "": + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + finally: + tfp.close() + finally: + fp.close() + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: " + "got only %i out of %i bytes" % (read, size), + result + ) + + return result + + def close(self): + _urllib2_fork.OpenerDirector.close(self) + + # make it very obvious this object is no longer supposed to be used + self.open = self.error = self.retrieve = self.add_handler = None + + if self._tempfiles: + for filename in self._tempfiles: + try: + os.unlink(filename) + except OSError: + pass + del self._tempfiles[:] + + +def wrapped_open(urlopen, process_response_object, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + success = True + try: + response = urlopen(fullurl, data, timeout) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error + + if response is not None: + response = process_response_object(response) 
+ + if not success: + raise response + return response + +class ResponseProcessingOpener(OpenerDirector): + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + def bound_open(fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return OpenerDirector.open(self, fullurl, data, timeout) + return wrapped_open( + bound_open, self.process_response_object, fullurl, data, timeout) + + def process_response_object(self, response): + return response + + +class SeekableResponseOpener(ResponseProcessingOpener): + def process_response_object(self, response): + return _response.seek_wrapped_response(response) + + +def isclass(obj): + return isinstance(obj, (types.ClassType, type)) + + +class OpenerFactory: + """This class's interface is quite likely to change.""" + + default_classes = [ + # handlers + _urllib2_fork.ProxyHandler, + _urllib2_fork.UnknownHandler, + _urllib2_fork.HTTPHandler, + _urllib2_fork.HTTPDefaultErrorHandler, + _urllib2_fork.HTTPRedirectHandler, + _urllib2_fork.FTPHandler, + _urllib2_fork.FileHandler, + # processors + _urllib2_fork.HTTPCookieProcessor, + _urllib2_fork.HTTPErrorProcessor, + ] + if hasattr(httplib, 'HTTPS'): + default_classes.append(_urllib2_fork.HTTPSHandler) + handlers = [] + replacement_handlers = [] + + def __init__(self, klass=OpenerDirector): + self.klass = klass + + def build_opener(self, *handlers): + """Create an opener object from a list of handlers and processors. + + The opener will use several default handlers and processors, including + support for HTTP and FTP. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ + """ + opener = self.klass() + default_classes = list(self.default_classes) + skip = set() + for klass in default_classes: + for check in handlers: + if isclass(check): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + for h in handlers: + if isclass(h): + h = h() + opener.add_handler(h) + + return opener + + +build_opener = OpenerFactory().build_opener + +_opener = None +urlopen_lock = _threading.Lock() +def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.open(url, data, timeout) + +def urlretrieve(url, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.retrieve(url, filename, reporthook, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_pullparser.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_pullparser.py new file mode 100644 index 0000000000000000000000000000000000000000..1f212c1512bacd2d3ef95e51ba9d578ce0adfbf5 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_pullparser.py @@ -0,0 +1,391 @@ +"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. + +Examples + +This program extracts all links from a document. 
It will print one +line for each link, containing the URL and the textual description +between the <A>...</A> tags: + +import pullparser, sys +f = file(sys.argv[1]) +p = pullparser.PullParser(f) +for token in p.tags("a"): + if token.type == "endtag": continue + url = dict(token.attrs).get("href", "-") + text = p.get_compressed_text(endat=("endtag", "a")) + print "%s\t%s" % (url, text) + +This program extracts the <TITLE> from the document: + +import pullparser, sys +f = file(sys.argv[1]) +p = pullparser.PullParser(f) +if p.get_tag("title"): + title = p.get_compressed_text() + print "Title: %s" % title + + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> +Copyright 1998-2001 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses. + +""" + +import re, htmlentitydefs +import _sgmllib_copy as sgmllib +import HTMLParser +from xml.sax import saxutils + +from _html import unescape, unescape_charref + + +class NoMoreTokensError(Exception): pass + +class Token: + """Represents an HTML tag, declaration, processing instruction etc. + + Behaves as both a tuple-like object (ie. iterable) and has attributes + .type, .data and .attrs. 
+ + >>> t = Token("starttag", "a", [("href", "http://www.python.org/")]) + >>> t == ("starttag", "a", [("href", "http://www.python.org/")]) + True + >>> (t.type, t.data) == ("starttag", "a") + True + >>> t.attrs == [("href", "http://www.python.org/")] + True + + Public attributes + + type: one of "starttag", "endtag", "startendtag", "charref", "entityref", + "data", "comment", "decl", "pi", after the corresponding methods of + HTMLParser.HTMLParser + data: For a tag, the tag name; otherwise, the relevant data carried by the + tag, as a string + attrs: list of (name, value) pairs representing HTML attributes + (or None if token does not represent an opening tag) + + """ + def __init__(self, type, data, attrs=None): + self.type = type + self.data = data + self.attrs = attrs + def __iter__(self): + return iter((self.type, self.data, self.attrs)) + def __eq__(self, other): + type, data, attrs = other + if (self.type == type and + self.data == data and + self.attrs == attrs): + return True + else: + return False + def __ne__(self, other): return not self.__eq__(other) + def __repr__(self): + args = ", ".join(map(repr, [self.type, self.data, self.attrs])) + return self.__class__.__name__+"(%s)" % args + + def __str__(self): + """ + >>> print Token("starttag", "br") + <br> + >>> print Token("starttag", "a", + ... 
[("href", "http://www.python.org/"), ("alt", '"foo"')]) + <a href="http://www.python.org/" alt='"foo"'> + >>> print Token("startendtag", "br") + <br /> + >>> print Token("startendtag", "br", [("spam", "eggs")]) + <br spam="eggs" /> + >>> print Token("endtag", "p") + </p> + >>> print Token("charref", "38") + & + >>> print Token("entityref", "amp") + & + >>> print Token("data", "foo\\nbar") + foo + bar + >>> print Token("comment", "Life is a bowl\\nof cherries.") + <!--Life is a bowl + of cherries.--> + >>> print Token("decl", "decl") + <!decl> + >>> print Token("pi", "pi") + <?pi> + """ + if self.attrs is not None: + attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for + k, v in self.attrs]) + else: + attrs = "" + if self.type == "starttag": + return "<%s%s>" % (self.data, attrs) + elif self.type == "startendtag": + return "<%s%s />" % (self.data, attrs) + elif self.type == "endtag": + return "</%s>" % self.data + elif self.type == "charref": + return "&#%s;" % self.data + elif self.type == "entityref": + return "&%s;" % self.data + elif self.type == "data": + return self.data + elif self.type == "comment": + return "<!--%s-->" % self.data + elif self.type == "decl": + return "<!%s>" % self.data + elif self.type == "pi": + return "<?%s>" % self.data + assert False + + +def iter_until_exception(fn, exception, *args, **kwds): + while 1: + try: + yield fn(*args, **kwds) + except exception: + raise StopIteration + + +class _AbstractParser: + chunk = 1024 + compress_re = re.compile(r"\s+") + def __init__(self, fh, textify={"img": "alt", "applet": "alt"}, + encoding="ascii", entitydefs=None): + """ + fh: file-like object (only a .read() method is required) from which to + read HTML to be parsed + textify: mapping used by .get_text() and .get_compressed_text() methods + to represent opening tags as text + encoding: encoding used to encode numeric character references by + .get_text() and .get_compressed_text() ("ascii" by default) + + entitydefs: mapping like {"amp": 
"&", ...} containing HTML entity + definitions (a sensible default is used). This is used to unescape + entities in .get_text() (and .get_compressed_text()) and attribute + values. If the encoding can not represent the character, the entity + reference is left unescaped. Note that entity references (both + numeric - e.g. { or ઼ - and non-numeric - e.g. &) are + unescaped in attribute values and the return value of .get_text(), but + not in data outside of tags. Instead, entity references outside of + tags are represented as tokens. This is a bit odd, it's true :-/ + + If the element name of an opening tag matches a key in the textify + mapping then that tag is converted to text. The corresponding value is + used to specify which tag attribute to obtain the text from. textify + maps from element names to either: + + - an HTML attribute name, in which case the HTML attribute value is + used as its text value along with the element name in square + brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute + were missing, just "[IMG]") + - a callable object (e.g. a function) which takes a Token and returns + the string to be used as its text value + + If textify has no key for an element name, nothing is substituted for + the opening tag. + + Public attributes: + + encoding and textify: see above + + """ + self._fh = fh + self._tokenstack = [] # FIFO + self.textify = textify + self.encoding = encoding + if entitydefs is None: + entitydefs = htmlentitydefs.name2codepoint + self._entitydefs = entitydefs + + def __iter__(self): return self + + def tags(self, *names): + return iter_until_exception(self.get_tag, NoMoreTokensError, *names) + + def tokens(self, *tokentypes): + return iter_until_exception(self.get_token, NoMoreTokensError, + *tokentypes) + + def next(self): + try: + return self.get_token() + except NoMoreTokensError: + raise StopIteration() + + def get_token(self, *tokentypes): + """Pop the next Token object from the stack of parsed tokens. 
+ + If arguments are given, they are taken to be token types in which the + caller is interested: tokens representing other elements will be + skipped. Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + while self._tokenstack: + token = self._tokenstack.pop(0) + if tokentypes: + if token.type in tokentypes: + return token + else: + return token + data = self._fh.read(self.chunk) + if not data: + raise NoMoreTokensError() + self.feed(data) + + def unget_token(self, token): + """Push a Token back onto the stack.""" + self._tokenstack.insert(0, token) + + def get_tag(self, *names): + """Return the next Token that represents an opening or closing tag. + + If arguments are given, they are taken to be element names in which the + caller is interested: tags representing other elements will be skipped. + Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + tok = self.get_token() + if tok.type not in ["starttag", "endtag", "startendtag"]: + continue + if names: + if tok.data in names: + return tok + else: + return tok + + def get_text(self, endat=None): + """Get some text. + + endat: stop reading text at this tag (the tag is included in the + returned text); endtag is a tuple (type, name) where type is + "starttag", "endtag" or "startendtag", and name is the element name of + the tag (element names must be given in lower case) + + If endat is not given, .get_text() will stop at the next opening or + closing tag, or when there are no more tokens (no exception is raised). + Note that .get_text() includes the text representation (if any) of the + opening tag, but pushes the opening tag back onto the stack. As a + result, if you want to call .get_text() again, you need to call + .get_tag() first (unless you want an empty string returned when you + next call .get_text()). 
+ + Entity references are translated using the value of the entitydefs + constructor argument (a mapping from names to characters like that + provided by the standard module htmlentitydefs). Named entity + references that are not in this mapping are left unchanged. + + The textify attribute is used to translate opening tags into text: see + the class docstring. + + """ + text = [] + tok = None + while 1: + try: + tok = self.get_token() + except NoMoreTokensError: + # unget last token (not the one we just failed to get) + if tok: self.unget_token(tok) + break + if tok.type == "data": + text.append(tok.data) + elif tok.type == "entityref": + t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding) + text.append(t) + elif tok.type == "charref": + t = unescape_charref(tok.data, self.encoding) + text.append(t) + elif tok.type in ["starttag", "endtag", "startendtag"]: + tag_name = tok.data + if tok.type in ["starttag", "startendtag"]: + alt = self.textify.get(tag_name) + if alt is not None: + if callable(alt): + text.append(alt(tok)) + elif tok.attrs is not None: + for k, v in tok.attrs: + if k == alt: + text.append(v) + text.append("[%s]" % tag_name.upper()) + if endat is None or endat == (tok.type, tag_name): + self.unget_token(tok) + break + return "".join(text) + + def get_compressed_text(self, *args, **kwds): + """ + As .get_text(), but collapses each group of contiguous whitespace to a + single space character, and removes all initial and trailing + whitespace. 
+ + """ + text = self.get_text(*args, **kwds) + text = text.strip() + return self.compress_re.sub(" ", text) + + def handle_startendtag(self, tag, attrs): + self._tokenstack.append(Token("startendtag", tag, attrs)) + def handle_starttag(self, tag, attrs): + self._tokenstack.append(Token("starttag", tag, attrs)) + def handle_endtag(self, tag): + self._tokenstack.append(Token("endtag", tag)) + def handle_charref(self, name): + self._tokenstack.append(Token("charref", name)) + def handle_entityref(self, name): + self._tokenstack.append(Token("entityref", name)) + def handle_data(self, data): + self._tokenstack.append(Token("data", data)) + def handle_comment(self, data): + self._tokenstack.append(Token("comment", data)) + def handle_decl(self, decl): + self._tokenstack.append(Token("decl", decl)) + def unknown_decl(self, data): + # XXX should this call self.error instead? + #self.error("unknown declaration: " + `data`) + self._tokenstack.append(Token("decl", data)) + def handle_pi(self, data): + self._tokenstack.append(Token("pi", data)) + + def unescape_attr(self, name): + return unescape(name, self._entitydefs, self.encoding) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + escaped_attrs.append((key, self.unescape_attr(val))) + return escaped_attrs + +class PullParser(_AbstractParser, HTMLParser.HTMLParser): + def __init__(self, *args, **kwds): + HTMLParser.HTMLParser.__init__(self) + _AbstractParser.__init__(self, *args, **kwds) + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + +class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser): + def __init__(self, *args, **kwds): + sgmllib.SGMLParser.__init__(self) + _AbstractParser.__init__(self, *args, **kwds) + def unknown_starttag(self, tag, attrs): + attrs = self.unescape_attrs(attrs) + self._tokenstack.append(Token("starttag", tag, attrs)) + def unknown_endtag(self, tag): + self._tokenstack.append(Token("endtag", tag)) + + +def _test(): + import doctest, _pullparser + return doctest.testmod(_pullparser) + +if __name__ == "__main__": + _test() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_request.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_request.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8acb6cc45706d66a1af55e1a1dfa6b3e4550ec --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_request.py @@ -0,0 +1,40 @@ +"""Integration with Python standard library module urllib2: Request class. + +Copyright 2004-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging + +import _rfc3986 +import _sockettimeout +import _urllib2_fork + +warn = logging.getLogger("mechanize").warning + + +class Request(_urllib2_fork.Request): + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False, visit=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + # In mechanize 0.2, the interpretation of a unicode url argument will + # change: A unicode url argument will be interpreted as an IRI, and a + # bytestring as a URI. For now, we accept unicode or bytestring. 
We + # don't insist that the value is always a URI (specifically, must only + # contain characters which are legal), because that might break working + # code (who knows what bytes some servers want to see, especially with + # browser plugins for internationalised URIs). + if not _rfc3986.is_clean_uri(url): + warn("url argument is not a URI " + "(contains illegal characters) %r" % url) + _urllib2_fork.Request.__init__(self, url, data, headers) + self.selector = None + self.visit = visit + self.timeout = timeout + + def __str__(self): + return "<Request for %s>" % self.get_full_url() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_response.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_response.py new file mode 100644 index 0000000000000000000000000000000000000000..d5ca5f2e4e6e44ee375a1ef6a926f5310cf336d2 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_response.py @@ -0,0 +1,525 @@ +"""Response classes. + +The seek_wrapper code is not used if you're using UserAgent with +.set_seekable_responses(False), or if you're using the urllib2-level interface +HTTPEquivProcessor. Class closeable_response is instantiated by some handlers +(AbstractHTTPHandler), but the closeable_response interface is only depended +upon by Browser-level code. Function upgrade_response is only used if you're +using Browser. + + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
+ +""" + +import copy, mimetools, urllib2 +from cStringIO import StringIO + + +def len_of_seekable(file_): + # this function exists because evaluation of len(file_.getvalue()) on every + # .read() from seek_wrapper would be O(N**2) in number of .read()s + pos = file_.tell() + file_.seek(0, 2) # to end + try: + return file_.tell() + finally: + file_.seek(pos) + + +# XXX Andrew Dalke kindly sent me a similar class in response to my request on +# comp.lang.python, which I then proceeded to lose. I wrote this class +# instead, but I think he's released his code publicly since, could pinch the +# tests from it, at least... + +# For testing seek_wrapper invariant (note that +# test_urllib2.HandlerTest.test_seekable is expected to fail when this +# invariant checking is turned on). The invariant checking is done by module +# ipdc, which is available here: +# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834 +## from ipdbc import ContractBase +## class seek_wrapper(ContractBase): +class seek_wrapper: + """Adds a seek method to a file object. + + This is only designed for seeking on readonly file-like objects. + + Wrapped file-like object must have a read method. The readline method is + only supported if that method is present on the wrapped object. The + readlines method is always supported. xreadlines and iteration are + supported only for Python 2.2 and above. + + Public attributes: + + wrapped: the wrapped file object + is_closed: true iff .close() has been called + + WARNING: All other attributes of the wrapped object (ie. those that are not + one of wrapped, read, readline, readlines, xreadlines, __iter__ and next) + are passed through unaltered, which may or may not make sense for your + particular file object. + + """ + # General strategy is to check that cache is full enough, then delegate to + # the cache (self.__cache, which is a cStringIO.StringIO instance). 
A seek + # position (self.__pos) is maintained independently of the cache, in order + # that a single cache may be shared between multiple seek_wrapper objects. + # Copying using module copy shares the cache in this way. + + def __init__(self, wrapped): + self.wrapped = wrapped + self.__read_complete_state = [False] + self.__is_closed_state = [False] + self.__have_readline = hasattr(self.wrapped, "readline") + self.__cache = StringIO() + self.__pos = 0 # seek position + + def invariant(self): + # The end of the cache is always at the same place as the end of the + # wrapped file (though the .tell() method is not required to be present + # on wrapped file). + return self.wrapped.tell() == len(self.__cache.getvalue()) + + def close(self): + self.wrapped.close() + self.is_closed = True + + def __getattr__(self, name): + if name == "is_closed": + return self.__is_closed_state[0] + elif name == "read_complete": + return self.__read_complete_state[0] + + wrapped = self.__dict__.get("wrapped") + if wrapped: + return getattr(wrapped, name) + + return getattr(self.__class__, name) + + def __setattr__(self, name, value): + if name == "is_closed": + self.__is_closed_state[0] = bool(value) + elif name == "read_complete": + if not self.is_closed: + self.__read_complete_state[0] = bool(value) + else: + self.__dict__[name] = value + + def seek(self, offset, whence=0): + assert whence in [0,1,2] + + # how much data, if any, do we need to read? 
+ if whence == 2: # 2: relative to end of *wrapped* file + if offset < 0: raise ValueError("negative seek offset") + # since we don't know yet where the end of that file is, we must + # read everything + to_read = None + else: + if whence == 0: # 0: absolute + if offset < 0: raise ValueError("negative seek offset") + dest = offset + else: # 1: relative to current position + pos = self.__pos + if pos < offset: + raise ValueError("seek to before start of file") + dest = pos + offset + end = len_of_seekable(self.__cache) + to_read = dest - end + if to_read < 0: + to_read = 0 + + if to_read != 0: + self.__cache.seek(0, 2) + if to_read is None: + assert whence == 2 + self.__cache.write(self.wrapped.read()) + self.read_complete = True + self.__pos = self.__cache.tell() - offset + else: + data = self.wrapped.read(to_read) + if not data: + self.read_complete = True + else: + self.__cache.write(data) + # Don't raise an exception even if we've seek()ed past the end + # of .wrapped, since fseek() doesn't complain in that case. + # Also like fseek(), pretend we have seek()ed past the end, + # i.e. not: + #self.__pos = self.__cache.tell() + # but rather: + self.__pos = dest + else: + self.__pos = dest + + def tell(self): + return self.__pos + + def __copy__(self): + cpy = self.__class__(self.wrapped) + cpy.__cache = self.__cache + cpy.__read_complete_state = self.__read_complete_state + cpy.__is_closed_state = self.__is_closed_state + return cpy + + def get_data(self): + pos = self.__pos + try: + self.seek(0) + return self.read(-1) + finally: + self.__pos = pos + + def read(self, size=-1): + pos = self.__pos + end = len_of_seekable(self.__cache) + available = end - pos + + # enough data already cached? 
+ if size <= available and size != -1: + self.__cache.seek(pos) + self.__pos = pos+size + return self.__cache.read(size) + + # no, so read sufficient data from wrapped file and cache it + self.__cache.seek(0, 2) + if size == -1: + self.__cache.write(self.wrapped.read()) + self.read_complete = True + else: + to_read = size - available + assert to_read > 0 + data = self.wrapped.read(to_read) + if not data: + self.read_complete = True + else: + self.__cache.write(data) + self.__cache.seek(pos) + + data = self.__cache.read(size) + self.__pos = self.__cache.tell() + assert self.__pos == pos + len(data) + return data + + def readline(self, size=-1): + if not self.__have_readline: + raise NotImplementedError("no readline method on wrapped object") + + # line we're about to read might not be complete in the cache, so + # read another line first + pos = self.__pos + self.__cache.seek(0, 2) + data = self.wrapped.readline() + if not data: + self.read_complete = True + else: + self.__cache.write(data) + self.__cache.seek(pos) + + data = self.__cache.readline() + if size != -1: + r = data[:size] + self.__pos = pos+size + else: + r = data + self.__pos = pos+len(data) + return r + + def readlines(self, sizehint=-1): + pos = self.__pos + self.__cache.seek(0, 2) + self.__cache.write(self.wrapped.read()) + self.read_complete = True + self.__cache.seek(pos) + data = self.__cache.readlines(sizehint) + self.__pos = self.__cache.tell() + return data + + def __iter__(self): return self + def next(self): + line = self.readline() + if line == "": raise StopIteration + return line + + xreadlines = __iter__ + + def __repr__(self): + return ("<%s at %s whose wrapped object = %r>" % + (self.__class__.__name__, hex(abs(id(self))), self.wrapped)) + + +class response_seek_wrapper(seek_wrapper): + + """ + Supports copying response objects and setting response body data. 
+ + """ + + def __init__(self, wrapped): + seek_wrapper.__init__(self, wrapped) + self._headers = self.wrapped.info() + + def __copy__(self): + cpy = seek_wrapper.__copy__(self) + # copy headers from delegate + cpy._headers = copy.copy(self.info()) + return cpy + + # Note that .info() and .geturl() (the only two urllib2 response methods + # that are not implemented by seek_wrapper) must be here explicitly rather + # than by seek_wrapper's __getattr__ delegation) so that the nasty + # dynamically-created HTTPError classes in get_seek_wrapper_class() get the + # wrapped object's implementation, and not HTTPError's. + + def info(self): + return self._headers + + def geturl(self): + return self.wrapped.geturl() + + def set_data(self, data): + self.seek(0) + self.read() + self.close() + cache = self._seek_wrapper__cache = StringIO() + cache.write(data) + self.seek(0) + + +class eoffile: + # file-like object that always claims to be at end-of-file... + def read(self, size=-1): return "" + def readline(self, size=-1): return "" + def __iter__(self): return self + def next(self): return "" + def close(self): pass + +class eofresponse(eoffile): + def __init__(self, url, headers, code, msg): + self._url = url + self._headers = headers + self.code = code + self.msg = msg + def geturl(self): return self._url + def info(self): return self._headers + + +class closeable_response: + """Avoids unnecessarily clobbering urllib.addinfourl methods on .close(). + + Only supports responses returned by mechanize.HTTPHandler. + + After .close(), the following methods are supported: + + .read() + .readline() + .info() + .geturl() + .__iter__() + .next() + .close() + + and the following attributes are supported: + + .code + .msg + + Also supports pickling (but the stdlib currently does something to prevent + it: http://python.org/sf/1144636). 
+ + """ + # presence of this attr indicates is useable after .close() + closeable_response = None + + def __init__(self, fp, headers, url, code, msg): + self._set_fp(fp) + self._headers = headers + self._url = url + self.code = code + self.msg = msg + + def _set_fp(self, fp): + self.fp = fp + self.read = self.fp.read + self.readline = self.fp.readline + if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines + if hasattr(self.fp, "fileno"): + self.fileno = self.fp.fileno + else: + self.fileno = lambda: None + self.__iter__ = self.fp.__iter__ + self.next = self.fp.next + + def __repr__(self): + return '<%s at %s whose fp = %r>' % ( + self.__class__.__name__, hex(abs(id(self))), self.fp) + + def info(self): + return self._headers + + def geturl(self): + return self._url + + def close(self): + wrapped = self.fp + wrapped.close() + new_wrapped = eofresponse( + self._url, self._headers, self.code, self.msg) + self._set_fp(new_wrapped) + + def __getstate__(self): + # There are three obvious options here: + # 1. truncate + # 2. read to end + # 3. close socket, pickle state including read position, then open + # again on unpickle and use Range header + # XXXX um, 4. refuse to pickle unless .close()d. This is better, + # actually ("errors should never pass silently"). Pickling doesn't + # work anyway ATM, because of http://python.org/sf/1144636 so fix + # this later + + # 2 breaks pickle protocol, because one expects the original object + # to be left unscathed by pickling. 3 is too complicated and + # surprising (and too much work ;-) to happen in a sane __getstate__. + # So we do 1. 
+ + state = self.__dict__.copy() + new_wrapped = eofresponse( + self._url, self._headers, self.code, self.msg) + state["wrapped"] = new_wrapped + return state + +def test_response(data='test data', headers=[], + url="http://example.com/", code=200, msg="OK"): + return make_response(data, headers, url, code, msg) + +def test_html_response(data='test data', headers=[], + url="http://example.com/", code=200, msg="OK"): + headers += [("Content-type", "text/html")] + return make_response(data, headers, url, code, msg) + +def make_response(data, headers, url, code, msg): + """Convenient factory for objects implementing response interface. + + data: string containing response body data + headers: sequence of (name, value) pairs + url: URL of response + code: integer response code (e.g. 200) + msg: string response code message (e.g. "OK") + + """ + mime_headers = make_headers(headers) + r = closeable_response(StringIO(data), mime_headers, url, code, msg) + return response_seek_wrapper(r) + + +def make_headers(headers): + """ + headers: sequence of (name, value) pairs + """ + hdr_text = [] + for name_value in headers: + hdr_text.append("%s: %s" % name_value) + return mimetools.Message(StringIO("\n".join(hdr_text))) + + +# Rest of this module is especially horrible, but needed, at least until fork +# urllib2. Even then, may want to preseve urllib2 compatibility. 
+ +def get_seek_wrapper_class(response): + # in order to wrap response objects that are also exceptions, we must + # dynamically subclass the exception :-((( + if (isinstance(response, urllib2.HTTPError) and + not hasattr(response, "seek")): + if response.__class__.__module__ == "__builtin__": + exc_class_name = response.__class__.__name__ + else: + exc_class_name = "%s.%s" % ( + response.__class__.__module__, response.__class__.__name__) + + class httperror_seek_wrapper(response_seek_wrapper, response.__class__): + # this only derives from HTTPError in order to be a subclass -- + # the HTTPError behaviour comes from delegation + + _exc_class_name = exc_class_name + + def __init__(self, wrapped): + response_seek_wrapper.__init__(self, wrapped) + # be compatible with undocumented HTTPError attributes :-( + self.hdrs = wrapped.info() + self.filename = wrapped.geturl() + + def __repr__(self): + return ( + "<%s (%s instance) at %s " + "whose wrapped object = %r>" % ( + self.__class__.__name__, self._exc_class_name, + hex(abs(id(self))), self.wrapped) + ) + wrapper_class = httperror_seek_wrapper + else: + wrapper_class = response_seek_wrapper + return wrapper_class + +def seek_wrapped_response(response): + """Return a copy of response that supports seekable response interface. + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both ordinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + if not hasattr(response, "seek"): + wrapper_class = get_seek_wrapper_class(response) + response = wrapper_class(response) + assert hasattr(response, "get_data") + return response + +def upgrade_response(response): + """Return a copy of response that supports Browser response interface. 
+ + Browser response interface is that of "seekable responses" + (response_seek_wrapper), plus the requirement that responses must be + useable after .close() (closeable_response). + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both ordinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + wrapper_class = get_seek_wrapper_class(response) + if hasattr(response, "closeable_response"): + if not hasattr(response, "seek"): + response = wrapper_class(response) + assert hasattr(response, "get_data") + return copy.copy(response) + + # a urllib2 handler constructed the response, i.e. the response is an + # urllib.addinfourl or a urllib2.HTTPError, instead of a + # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler + try: + code = response.code + except AttributeError: + code = None + try: + msg = response.msg + except AttributeError: + msg = None + + # may have already-.read() data from .seek() cache + data = None + get_data = getattr(response, "get_data", None) + if get_data: + data = get_data() + + response = closeable_response( + response.fp, response.info(), response.geturl(), code, msg) + response = wrapper_class(response) + if data: + response.set_data(data) + return response diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_rfc3986.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_rfc3986.py new file mode 100644 index 0000000000000000000000000000000000000000..91fcd197f4ec173ef00c5e05555c7936d7fbc4ed --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_rfc3986.py @@ -0,0 +1,245 @@ +"""RFC 3986 URI parsing and relative reference resolution / absolutization. + +(aka splitting and joining) + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
+ +""" + +# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM. + +import re, urllib + +## def chr_range(a, b): +## return "".join(map(chr, range(ord(a), ord(b)+1))) + +## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +## "abcdefghijklmnopqrstuvwxyz" +## "0123456789" +## "-_.~") +## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" +## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' +# this re matches any character that's not in URI_CHARS +BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]") + + +def clean_url(url, encoding): + # percent-encode illegal URI characters + # Trying to come up with test cases for this gave me a headache, revisit + # when do switch to unicode. + # Somebody else's comments (lost the attribution): +## - IE will return you the url in the encoding you send it +## - Mozilla/Firefox will send you latin-1 if there's no non latin-1 +## characters in your link. It will send you utf-8 however if there are... + if type(url) == type(""): + url = url.decode(encoding, "replace") + url = url.strip() + # for second param to urllib.quote(), we want URI_CHARS, minus the + # 'always_safe' characters that urllib.quote() never percent-encodes + return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~") + +def is_clean_uri(uri): + """ + >>> is_clean_uri("ABC!") + True + >>> is_clean_uri(u"ABC!") + True + >>> is_clean_uri("ABC|") + False + >>> is_clean_uri(u"ABC|") + False + >>> is_clean_uri("http://example.com/0") + True + >>> is_clean_uri(u"http://example.com/0") + True + """ + # note module re treats bytestrings as through they were decoded as latin-1 + # so this function accepts both unicode and bytestrings + return not bool(BAD_URI_CHARS_RE.search(uri)) + + +SPLIT_MATCH = re.compile( + r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match +def urlsplit(absolute_uri): + """Return scheme, authority, path, query, fragment.""" + match = SPLIT_MATCH(absolute_uri) + if match: + g = match.groups() + 
return g[1], g[3], g[4], g[6], g[8] + +def urlunsplit(parts): + scheme, authority, path, query, fragment = parts + r = [] + append = r.append + if scheme is not None: + append(scheme) + append(":") + if authority is not None: + append("//") + append(authority) + append(path) + if query is not None: + append("?") + append(query) + if fragment is not None: + append("#") + append(fragment) + return "".join(r) + +def urljoin(base_uri, uri_reference): + """Join a base URI with a URI reference and return the resulting URI. + + See RFC 3986. + """ + return urlunsplit(urljoin_parts(urlsplit(base_uri), + urlsplit(uri_reference))) + +# oops, this doesn't do the same thing as the literal translation +# from the RFC below +## import posixpath +## def urljoin_parts(base_parts, reference_parts): +## scheme, authority, path, query, fragment = base_parts +## rscheme, rauthority, rpath, rquery, rfragment = reference_parts + +## # compute target URI path +## if rpath == "": +## tpath = path +## else: +## tpath = rpath +## if not tpath.startswith("/"): +## tpath = merge(authority, path, tpath) +## tpath = posixpath.normpath(tpath) + +## if rscheme is not None: +## return (rscheme, rauthority, tpath, rquery, rfragment) +## elif rauthority is not None: +## return (scheme, rauthority, tpath, rquery, rfragment) +## elif rpath == "": +## if rquery is not None: +## tquery = rquery +## else: +## tquery = query +## return (scheme, authority, tpath, tquery, rfragment) +## else: +## return (scheme, authority, tpath, rquery, rfragment) + +def urljoin_parts(base_parts, reference_parts): + scheme, authority, path, query, fragment = base_parts + rscheme, rauthority, rpath, rquery, rfragment = reference_parts + + if rscheme == scheme: + rscheme = None + + if rscheme is not None: + tscheme, tauthority, tpath, tquery = ( + rscheme, rauthority, remove_dot_segments(rpath), rquery) + else: + if rauthority is not None: + tauthority, tpath, tquery = ( + rauthority, remove_dot_segments(rpath), rquery) + 
else: + if rpath == "": + tpath = path + if rquery is not None: + tquery = rquery + else: + tquery = query + else: + if rpath.startswith("/"): + tpath = remove_dot_segments(rpath) + else: + tpath = merge(authority, path, rpath) + tpath = remove_dot_segments(tpath) + tquery = rquery + tauthority = authority + tscheme = scheme + tfragment = rfragment + return (tscheme, tauthority, tpath, tquery, tfragment) + +# um, something *vaguely* like this is what I want, but I have to generate +# lots of test cases first, if only to understand what it is that +# remove_dot_segments really does... +## def remove_dot_segments(path): +## if path == '': +## return '' +## comps = path.split('/') +## new_comps = [] +## for comp in comps: +## if comp in ['.', '']: +## if not new_comps or new_comps[-1]: +## new_comps.append('') +## continue +## if comp != '..': +## new_comps.append(comp) +## elif new_comps: +## new_comps.pop() +## return '/'.join(new_comps) + + +def remove_dot_segments(path): + r = [] + while path: + # A + if path.startswith("../"): + path = path[3:] + continue + if path.startswith("./"): + path = path[2:] + continue + # B + if path.startswith("/./"): + path = path[2:] + continue + if path == "/.": + path = "/" + continue + # C + if path.startswith("/../"): + path = path[3:] + if r: + r.pop() + continue + if path == "/..": + path = "/" + if r: + r.pop() + continue + # D + if path == ".": + path = path[1:] + continue + if path == "..": + path = path[2:] + continue + # E + start = 0 + if path.startswith("/"): + start = 1 + ii = path.find("/", start) + if ii < 0: + ii = None + r.append(path[:ii]) + if ii is None: + break + path = path[ii:] + return "".join(r) + +def merge(base_authority, base_path, ref_path): + # XXXX Oddly, the sample Perl implementation of this by Roy Fielding + # doesn't even take base_authority as a parameter, despite the wording in + # the RFC suggesting otherwise. Perhaps I'm missing some obvious identity. 
+ #if base_authority is not None and base_path == "": + if base_path == "": + return "/" + ref_path + ii = base_path.rfind("/") + if ii >= 0: + return base_path[:ii+1] + ref_path + return ref_path + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sgmllib_copy.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sgmllib_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..a545d25eb9dda19ef70f032f437015870856cd54 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sgmllib_copy.py @@ -0,0 +1,559 @@ +# Taken from Python 2.6.4 and regexp module constants modified +"""A parser for SGML, using the derived class as a static DTD.""" + +# XXX This only supports those SGML features used by HTML. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). RCDATA is +# not supported at all. 
+ + +# from warnings import warnpy3k +# warnpy3k("the sgmllib module has been removed in Python 3.0", +# stacklevel=2) +# del warnpy3k + +import markupbase +import re + +__all__ = ["SGMLParser", "SGMLParseError"] + +# Regular expressions used for parsing + +interesting = re.compile('[&<]') +incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|' + '/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +# hack to fix http://bugs.python.org/issue803422 +# charref = re.compile('&#([0-9]+)[^0-9]') +charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") + +starttagopen = re.compile('<[>a-zA-Z]') +shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') +shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') +piclose = re.compile('>') +endbracket = re.compile('[<>]') +# hack moved from _beautifulsoup.py (bundled BeautifulSoup version 2) +#This code makes Beautiful Soup able to parse XML with namespaces +# tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') +tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +attrfind = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + + +class SGMLParseError(RuntimeError): + """Exception raised for all parse errors.""" + pass + + +# SGML parser base class -- find tags and call handler functions. +# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). +# The dtd is defined by deriving a class which defines methods +# with special names to handle tags: start_foo and end_foo to handle +# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. +# (Tags are converted to lower case for this purpose.) The data +# between tags is passed to the parser by calling self.handle_data() +# with some data as argument (the data may be split up in arbitrary +# chunks). Entity references are passed by calling +# self.handle_entityref() with the entity reference as argument. 
+ +class SGMLParser(markupbase.ParserBase): + # Definition of entities -- derived classes may override + entity_or_charref = re.compile('&(?:' + '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' + ')(;?)') + + def __init__(self, verbose=0): + """Initialize and reset this instance.""" + self.verbose = verbose + self.reset() + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.__starttag_text = None + self.rawdata = '' + self.stack = [] + self.lasttag = '???' + self.nomoretags = 0 + self.literal = 0 + markupbase.ParserBase.reset(self) + + def setnomoretags(self): + """Enter literal mode (CDATA) till EOF. + + Intended for derived classes only. + """ + self.nomoretags = self.literal = 1 + + def setliteral(self, *args): + """Enter literal mode (CDATA). + + Intended for derived classes only. + """ + self.literal = 1 + + def feed(self, data): + """Feed some data to the parser. + + Call this as often as you want, with as little or as much text + as you want (may include '\n'). (This just saves the text, + all the processing is done by goahead().) + """ + + self.rawdata = self.rawdata + data + self.goahead(0) + + def close(self): + """Handle the remaining data.""" + self.goahead(1) + + def error(self, message): + raise SGMLParseError(message) + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. 
+ def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.nomoretags: + self.handle_data(rawdata[i:n]) + i = n + break + match = interesting.search(rawdata, i) + if match: j = match.start() + else: j = n + if i < j: + self.handle_data(rawdata[i:j]) + i = j + if i == n: break + if rawdata[i] == '<': + if starttagopen.match(rawdata, i): + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_starttag(i) + if k < 0: break + i = k + continue + if rawdata.startswith("</", i): + k = self.parse_endtag(i) + if k < 0: break + i = k + self.literal = 0 + continue + if self.literal: + if n > (i + 1): + self.handle_data("<") + i = i+1 + else: + # incomplete + break + continue + if rawdata.startswith("<!--", i): + # Strictly speaking, a comment is --.*-- + # within a declaration tag <!...>. + # This should be removed, + # and comments handled only in parse_declaration. + k = self.parse_comment(i) + if k < 0: break + i = k + continue + if rawdata.startswith("<?", i): + k = self.parse_pi(i) + if k < 0: break + i = i+k + continue + if rawdata.startswith("<!", i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). 
+ k = self.parse_declaration(i) + if k < 0: break + i = k + continue + elif rawdata[i] == '&': + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + match = charref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_charref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + match = entityref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_entityref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + else: + self.error('neither < nor & ??') + # We get here only if incomplete matches but + # nothing else + match = incomplete.match(rawdata, i) + if not match: + self.handle_data(rawdata[i]) + i = i+1 + continue + j = match.end(0) + if j == n: + break # Really incomplete + self.handle_data(rawdata[i:j]) + i = j + # end while + if end and i < n: + self.handle_data(rawdata[i:n]) + i = n + self.rawdata = rawdata[i:] + # XXX if end: check for empty stack + + # Extensions for the DOCTYPE scanner: + _decl_otherchars = '=' + + # Internal -- parse processing instr, return length or -1 if not terminated + def parse_pi(self, i): + rawdata = self.rawdata + if rawdata[i:i+2] != '<?': + self.error('unexpected call to parse_pi()') + match = piclose.search(rawdata, i+2) + if not match: + return -1 + j = match.start(0) + self.handle_pi(rawdata[i+2: j]) + j = match.end(0) + return j-i + + def get_starttag_text(self): + return self.__starttag_text + + # Internal -- handle starttag, return length or -1 if not terminated + def parse_starttag(self, i): + self.__starttag_text = None + start_pos = i + rawdata = self.rawdata + if shorttagopen.match(rawdata, i): + # SGML shorthand: <tag/data/ == <tag>data</tag> + # XXX Can data contain &... (entity or char refs)? + # XXX Can data contain < or > (tag characters)? + # XXX Can there be whitespace before the first /? 
+ match = shorttag.match(rawdata, i) + if not match: + return -1 + tag, data = match.group(1, 2) + self.__starttag_text = '<%s/' % tag + tag = tag.lower() + k = match.end(0) + self.finish_shorttag(tag, data) + self.__starttag_text = rawdata[start_pos:match.end(1) + 1] + return k + # XXX The following should skip matching quotes (' or ") + # As a shortcut way to exit, this isn't so bad, but shouldn't + # be used to locate the actual end of the start tag since the + # < or > characters may be embedded in an attribute value. + match = endbracket.search(rawdata, i+1) + if not match: + return -1 + j = match.start(0) + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + if rawdata[i:i+2] == '<>': + # SGML shorthand: <> == <last open tag seen> + k = j + tag = self.lasttag + else: + match = tagfind.match(rawdata, i+1) + if not match: + self.error('unexpected call to parse_starttag') + k = match.end(0) + tag = rawdata[i+1:k].lower() + self.lasttag = tag + while k < j: + match = attrfind.match(rawdata, k) + if not match: break + attrname, rest, attrvalue = match.group(1, 2, 3) + if not rest: + attrvalue = attrname + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + attrvalue = self.entity_or_charref.sub( + self._convert_ref, attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = match.end(0) + if rawdata[j] == '>': + j = j+1 + self.__starttag_text = rawdata[start_pos:j] + self.finish_starttag(tag, attrs) + return j + + # Internal -- convert entity or character reference + def _convert_ref(self, match): + if match.group(2): + return self.convert_charref(match.group(2)) or \ + '&#%s%s' % match.groups()[1:] + elif match.group(3): + return self.convert_entityref(match.group(1)) or \ + '&%s;' % match.group(1) + else: + return '&%s' % match.group(1) + + # Internal -- parse endtag + def parse_endtag(self, i): + rawdata = self.rawdata + match = 
endbracket.search(rawdata, i+1) + if not match: + return -1 + j = match.start(0) + tag = rawdata[i+2:j].strip().lower() + if rawdata[j] == '>': + j = j+1 + self.finish_endtag(tag) + return j + + # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) + def finish_shorttag(self, tag, data): + self.finish_starttag(tag, []) + self.handle_data(data) + self.finish_endtag(tag) + + # Internal -- finish processing of start tag + # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag + def finish_starttag(self, tag, attrs): + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + self.unknown_starttag(tag, attrs) + return -1 + else: + self.handle_starttag(tag, method, attrs) + return 0 + else: + self.stack.append(tag) + self.handle_starttag(tag, method, attrs) + return 1 + + # Internal -- finish processing of end tag + def finish_endtag(self, tag): + if not tag: + found = len(self.stack) - 1 + if found < 0: + self.unknown_endtag(tag) + return + else: + if tag not in self.stack: + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + self.unknown_endtag(tag) + else: + self.report_unbalanced(tag) + return + found = len(self.stack) + for i in range(found): + if self.stack[i] == tag: found = i + while len(self.stack) > found: + tag = self.stack[-1] + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + method = None + if method: + self.handle_endtag(tag, method) + else: + self.unknown_endtag(tag) + del self.stack[-1] + + # Overridable -- handle start tag + def handle_starttag(self, tag, method, attrs): + method(attrs) + + # Overridable -- handle end tag + def handle_endtag(self, tag, method): + method() + + # Example -- report an unbalanced </...> tag. 
+ def report_unbalanced(self, tag): + if self.verbose: + print '*** Unbalanced </' + tag + '>' + print '*** Stack:', self.stack + + def convert_charref(self, name): + """Convert character reference, may be overridden.""" + try: + n = int(name) + except ValueError: + return + if not 0 <= n <= 127: + return + return self.convert_codepoint(n) + + def convert_codepoint(self, codepoint): + return chr(codepoint) + + def handle_charref(self, name): + """Handle character reference, no need to override.""" + replacement = self.convert_charref(name) + if replacement is None: + self.unknown_charref(name) + else: + self.handle_data(replacement) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + def convert_entityref(self, name): + """Convert entity references. + + As an alternative to overriding this method; one can tailor the + results by setting up the self.entitydefs mapping appropriately. + """ + table = self.entitydefs + if name in table: + return table[name] + else: + return + + def handle_entityref(self, name): + """Handle entity references, no need to override.""" + replacement = self.convert_entityref(name) + if replacement is None: + self.unknown_entityref(name) + else: + self.handle_data(replacement) + + # Example -- handle data, should be overridden + def handle_data(self, data): + pass + + # Example -- handle comment, could be overridden + def handle_comment(self, data): + pass + + # Example -- handle declaration, could be overridden + def handle_decl(self, decl): + pass + + # Example -- handle processing instruction, could be overridden + def handle_pi(self, data): + pass + + # To be overridden -- handlers for unknown objects + def unknown_starttag(self, tag, attrs): pass + def unknown_endtag(self, tag): pass + def unknown_charref(self, ref): pass + def unknown_entityref(self, ref): pass + + +class TestSGMLParser(SGMLParser): + + def __init__(self, verbose=0): + 
self.testdata = "" + SGMLParser.__init__(self, verbose) + + def handle_data(self, data): + self.testdata = self.testdata + data + if len(repr(self.testdata)) >= 70: + self.flush() + + def flush(self): + data = self.testdata + if data: + self.testdata = "" + print 'data:', repr(data) + + def handle_comment(self, data): + self.flush() + r = repr(data) + if len(r) > 68: + r = r[:32] + '...' + r[-32:] + print 'comment:', r + + def unknown_starttag(self, tag, attrs): + self.flush() + if not attrs: + print 'start tag: <' + tag + '>' + else: + print 'start tag: <' + tag, + for name, value in attrs: + print name + '=' + '"' + value + '"', + print '>' + + def unknown_endtag(self, tag): + self.flush() + print 'end tag: </' + tag + '>' + + def unknown_entityref(self, ref): + self.flush() + print '*** unknown entity ref: &' + ref + ';' + + def unknown_charref(self, ref): + self.flush() + print '*** unknown char ref: &#' + ref + ';' + + def unknown_decl(self, data): + self.flush() + print '*** unknown decl: [' + data + ']' + + def close(self): + SGMLParser.close(self) + self.flush() + + +def test(args = None): + import sys + + if args is None: + args = sys.argv[1:] + + if args and args[0] == '-s': + args = args[1:] + klass = SGMLParser + else: + klass = TestSGMLParser + + if args: + file = args[0] + else: + file = 'test.html' + + if file == '-': + f = sys.stdin + else: + try: + f = open(file, 'r') + except IOError, msg: + print file, ":", msg + sys.exit(1) + + data = f.read() + if f is not sys.stdin: + f.close() + + x = klass() + for c in data: + x.feed(c) + x.close() + + +if __name__ == '__main__': + test() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sockettimeout.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sockettimeout.py new file mode 100644 index 0000000000000000000000000000000000000000..c22b7346a05f966d3f71eb27e5211393a302dbe6 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_sockettimeout.py @@ -0,0 +1,6 @@ +import socket + +try: + 
_GLOBAL_DEFAULT_TIMEOUT = socket._GLOBAL_DEFAULT_TIMEOUT +except AttributeError: + _GLOBAL_DEFAULT_TIMEOUT = object() diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_testcase.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_testcase.py new file mode 100644 index 0000000000000000000000000000000000000000..f372760ef9ee72558e270e9112bc3b228cc2384a --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_testcase.py @@ -0,0 +1,162 @@ +import os +import shutil +import subprocess +import tempfile +import unittest + + +class SetupStack(object): + + def __init__(self): + self._on_teardown = [] + + def add_teardown(self, teardown): + self._on_teardown.append(teardown) + + def tear_down(self): + for func in reversed(self._on_teardown): + func() + + +class TearDownConvenience(object): + + def __init__(self, setup_stack=None): + self._own_setup_stack = setup_stack is None + if setup_stack is None: + setup_stack = SetupStack() + self._setup_stack = setup_stack + + # only call this convenience method if no setup_stack was supplied to c'tor + def tear_down(self): + assert self._own_setup_stack + self._setup_stack.tear_down() + + +class TempDirMaker(TearDownConvenience): + + def make_temp_dir(self, dir_=None): + temp_dir = tempfile.mkdtemp(prefix="tmp-%s-" % self.__class__.__name__, + dir=dir_) + def tear_down(): + shutil.rmtree(temp_dir) + self._setup_stack.add_teardown(tear_down) + return temp_dir + + +class MonkeyPatcher(TearDownConvenience): + + Unset = object() + + def monkey_patch(self, obj, name, value): + orig_value = getattr(obj, name) + setattr(obj, name, value) + def reverse_patch(): + setattr(obj, name, orig_value) + self._setup_stack.add_teardown(reverse_patch) + + def _set_environ(self, env, name, value): + if value is self.Unset: + try: + del env[name] + except KeyError: + pass + else: + env[name] = value + + def monkey_patch_environ(self, name, value, env=os.environ): + orig_value = env.get(name, self.Unset) + self._set_environ(env, name, value) + def 
reverse_patch(): + self._set_environ(env, name, orig_value) + self._setup_stack.add_teardown(reverse_patch) + + +class FixtureFactory(object): + + def __init__(self): + self._setup_stack = SetupStack() + self._context_managers = {} + self._fixtures = {} + + def register_context_manager(self, name, context_manager): + self._context_managers[name] = context_manager + + def get_fixture(self, name, add_teardown): + context_manager = self._context_managers[name] + fixture = context_manager.__enter__() + add_teardown(lambda: context_manager.__exit__(None, None, None)) + return fixture + + def get_cached_fixture(self, name): + fixture = self._fixtures.get(name) + if fixture is None: + fixture = self.get_fixture(name, self._setup_stack.add_teardown) + self._fixtures[name] = fixture + return fixture + + def tear_down(self): + self._setup_stack.tear_down() + + +class TestCase(unittest.TestCase): + + def setUp(self): + self._setup_stack = SetupStack() + self._monkey_patcher = MonkeyPatcher(self._setup_stack) + + def tearDown(self): + self._setup_stack.tear_down() + + def register_context_manager(self, name, context_manager): + return self.fixture_factory.register_context_manager( + name, context_manager) + + def get_fixture(self, name): + return self.fixture_factory.get_fixture(name, self.add_teardown) + + def get_cached_fixture(self, name): + return self.fixture_factory.get_cached_fixture(name) + + def add_teardown(self, *args, **kwds): + self._setup_stack.add_teardown(*args, **kwds) + + def make_temp_dir(self, *args, **kwds): + return TempDirMaker(self._setup_stack).make_temp_dir(*args, **kwds) + + def monkey_patch(self, *args, **kwds): + return self._monkey_patcher.monkey_patch(*args, **kwds) + + def monkey_patch_environ(self, *args, **kwds): + return self._monkey_patcher.monkey_patch_environ(*args, **kwds) + + def assert_contains(self, container, containee): + self.assertTrue(containee in container, "%r not in %r" % + (containee, container)) + + def assert_less_than(self, 
got, expected): + self.assertTrue(got < expected, "%r >= %r" % + (got, expected)) + + +# http://lackingrhoticity.blogspot.com/2009/01/testing-using-golden-files-in-python.html + +class GoldenTestCase(TestCase): + + run_meld = False + + def assert_golden(self, dir_got, dir_expect): + assert os.path.exists(dir_expect), dir_expect + proc = subprocess.Popen(["diff", "--recursive", "-u", "-N", + "--exclude=.*", dir_expect, dir_got], + stdout=subprocess.PIPE) + stdout, stderr = proc.communicate() + if len(stdout) > 0: + if self.run_meld: + # Put expected output on the right because that is the + # side we usually edit. + subprocess.call(["meld", dir_got, dir_expect]) + raise AssertionError( + "Differences from golden files found.\n" + "Try running with --meld to update golden files.\n" + "%s" % stdout) + self.assertEquals(proc.wait(), 0) diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2.py new file mode 100644 index 0000000000000000000000000000000000000000..29b7038dd49d33c82fba2685283f9ba044da1e37 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2.py @@ -0,0 +1,50 @@ +# urllib2 work-alike interface +# ...from urllib2... 
+from urllib2 import \ + URLError, \ + HTTPError +# ...and from mechanize +from _auth import \ + HTTPProxyPasswordMgr, \ + HTTPSClientCertMgr +from _debug import \ + HTTPResponseDebugProcessor, \ + HTTPRedirectDebugProcessor +# crap ATM +## from _gzip import \ +## HTTPGzipProcessor +from _urllib2_fork import \ + AbstractBasicAuthHandler, \ + AbstractDigestAuthHandler, \ + BaseHandler, \ + CacheFTPHandler, \ + FileHandler, \ + FTPHandler, \ + HTTPBasicAuthHandler, \ + HTTPCookieProcessor, \ + HTTPDefaultErrorHandler, \ + HTTPDigestAuthHandler, \ + HTTPErrorProcessor, \ + HTTPHandler, \ + HTTPPasswordMgr, \ + HTTPPasswordMgrWithDefaultRealm, \ + HTTPRedirectHandler, \ + ProxyBasicAuthHandler, \ + ProxyDigestAuthHandler, \ + ProxyHandler, \ + UnknownHandler +from _http import \ + HTTPEquivProcessor, \ + HTTPRefererProcessor, \ + HTTPRefreshProcessor, \ + HTTPRobotRulesProcessor, \ + RobotExclusionError +import httplib +if hasattr(httplib, 'HTTPS'): + from _urllib2_fork import HTTPSHandler +del httplib +from _opener import OpenerDirector, \ + SeekableResponseOpener, \ + build_opener, install_opener, urlopen +from _request import \ + Request diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2_fork.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2_fork.py new file mode 100644 index 0000000000000000000000000000000000000000..d0cfe382f80ce8deb96f071d9a79a2f6e9482a07 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_urllib2_fork.py @@ -0,0 +1,1414 @@ +"""Fork of urllib2. + +When reading this, don't assume that all code in here is reachable. Code in +the rest of mechanize may be used instead. + +Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Python +Software Foundation; All Rights Reserved + +Copyright 2002-2009 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +# XXX issues: +# If an authentication error handler that tries to perform +# authentication for some reason but fails, how should the error be +# signalled? The client needs to know the HTTP error code. But if +# the handler knows that the problem was, e.g., that it didn't know +# that hash algo that requested in the challenge, it would be good to +# pass that information along to the client, too. +# ftp errors aren't handled cleanly +# check digest against correct (i.e. non-apache) implementation + +# Possible extensions: +# complex proxies XXX not sure what exactly was meant by this +# abstract factory for opener + +import copy +import base64 +import httplib +import mimetools +import logging +import os +import posixpath +import random +import re +import socket +import sys +import time +import urllib +import urlparse +import bisect + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +try: + import hashlib +except ImportError: + # python 2.4 + import md5 + import sha + def sha1_digest(bytes): + return sha.new(bytes).hexdigest() + def md5_digest(bytes): + return md5.new(bytes).hexdigest() +else: + def sha1_digest(bytes): + return hashlib.sha1(bytes).hexdigest() + def md5_digest(bytes): + return hashlib.md5(bytes).hexdigest() + + +try: + socket._fileobject("fake socket", close=True) +except TypeError: + # python <= 2.4 + create_readline_wrapper = socket._fileobject +else: + def create_readline_wrapper(fh): + return socket._fileobject(fh, close=True) + + +# python 2.4 splithost has a bug in empty path component case +_hostprog = None +def splithost(url): + """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" + global _hostprog + if _hostprog is None: + import re + _hostprog = re.compile('^//([^/?]*)(.*)$') + + match = _hostprog.match(url) + if match: return match.group(1, 2) + return None, url + + +from urllib import (unwrap, unquote, splittype, quote, + addinfourl, splitport, + splitattr, ftpwrapper, 
splituser, splitpasswd, splitvalue) + +# support for FileHandler, proxies via environment variables +from urllib import localhost, url2pathname, getproxies + +from urllib2 import HTTPError, URLError + +import _request +import _rfc3986 +import _sockettimeout + +from _clientcookie import CookieJar +from _response import closeable_response + + +# used in User-Agent header sent +__version__ = sys.version[:3] + +_opener = None +def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + _opener = build_opener() + return _opener.open(url, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener + +# copied from cookielib.py +_cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = urlparse.urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = _cut_port_re.sub("", host, 1) + return host.lower() + +class Request: + + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False): + # unwrap('<URL:type://host/path>') --> 'type://host/path' + self.__original = unwrap(url) + self.type = None + # self.__r_type is what's left after doing the splittype + self.host = None + self.port = None + self._tunnel_host = None + self.data = data + self.headers = {} + for key, value in headers.items(): + self.add_header(key, value) + self.unredirected_hdrs = {} + if origin_req_host is None: + origin_req_host = request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable + + def __getattr__(self, attr): + # XXX this is a fallback mechanism to guard against these + # methods getting called in a non-standard order. this may be + # too complicated and/or unnecessary. 
+ # XXX should the __r_XXX attributes be public? + if attr[:12] == '_Request__r_': + name = attr[12:] + if hasattr(Request, 'get_' + name): + getattr(self, 'get_' + name)() + return getattr(self, attr) + raise AttributeError, attr + + def get_method(self): + if self.has_data(): + return "POST" + else: + return "GET" + + # XXX these helper methods are lame + + def add_data(self, data): + self.data = data + + def has_data(self): + return self.data is not None + + def get_data(self): + return self.data + + def get_full_url(self): + return self.__original + + def get_type(self): + if self.type is None: + self.type, self.__r_type = splittype(self.__original) + if self.type is None: + raise ValueError, "unknown url type: %s" % self.__original + return self.type + + def get_host(self): + if self.host is None: + self.host, self.__r_host = splithost(self.__r_type) + if self.host: + self.host = unquote(self.host) + return self.host + + def get_selector(self): + scheme, authority, path, query, fragment = _rfc3986.urlsplit( + self.__r_host) + if path == "": + path = "/" # RFC 2616, section 3.2.2 + fragment = None # RFC 3986, section 3.5 + return _rfc3986.urlunsplit([scheme, authority, path, query, fragment]) + + def set_proxy(self, host, type): + orig_host = self.get_host() + if self.get_type() == 'https' and not self._tunnel_host: + self._tunnel_host = orig_host + else: + self.type = type + self.__r_host = self.__original + + self.host = host + + def has_proxy(self): + """Private method.""" + # has non-HTTPS proxy + return self.__r_host == self.__original + + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + + def add_header(self, key, val): + # useful for something like authentication + self.headers[key.capitalize()] = val + + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + 
return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return hdrs.items() + +class OpenerDirector: + def __init__(self): + client_version = "Python-urllib/%s" % __version__ + self.addheaders = [('User-agent', client_version)] + # manage the individual handlers + self.handlers = [] + self.handle_open = {} + self.handle_error = {} + self.process_response = {} + self.process_request = {} + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = self.handle_error.get(protocol, {}) + self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = self.handle_open + elif condition == "response": + kind = protocol + lookup = self.process_response + elif condition == "request": + kind = protocol + lookup = self.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + + if added: + # the handlers must work in an specific order, the order + # is specified in a Handler attribute + bisect.insort(self.handlers, handler) + handler.add_parent(self) + + def close(self): + # Only exists for backwards compatibility. 
+ pass + + def _call_chain(self, chain, kind, meth_name, *args): + # Handlers raise an exception if no one else should try to handle + # the request, or return None if they can't but another handler + # could. Otherwise, they return the response. + handlers = chain.get(kind, ()) + for handler in handlers: + func = getattr(handler, meth_name) + + result = func(*args) + if result is not None: + return result + + def _open(self, req, data=None): + result = self._call_chain(self.handle_open, 'default', + 'default_open', req) + if result: + return result + + protocol = req.get_type() + result = self._call_chain(self.handle_open, protocol, protocol + + '_open', req) + if result: + return result + + return self._call_chain(self.handle_open, 'unknown', + 'unknown_open', req) + + def error(self, proto, *args): + if proto in ('http', 'https'): + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = self._call_chain(*args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return self._call_chain(*args) + +# XXX probably also want an abstract factory that knows when it makes +# sense to skip a superclass in favor of a subclass and when it might +# make sense to include both + +def build_opener(*handlers): + """Create an opener object from a list of handlers. + + The opener will use several default handlers, including support + for HTTP, FTP and when applicable, HTTPS. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ """ + import types + def isclass(obj): + return isinstance(obj, (types.ClassType, type)) + + opener = OpenerDirector() + default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, + HTTPDefaultErrorHandler, HTTPRedirectHandler, + FTPHandler, FileHandler, HTTPErrorProcessor] + if hasattr(httplib, 'HTTPS'): + default_classes.append(HTTPSHandler) + skip = set() + for klass in default_classes: + for check in handlers: + if isclass(check): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + + for h in handlers: + if isclass(h): + h = h() + opener.add_handler(h) + return opener + +class BaseHandler: + handler_order = 500 + + def add_parent(self, parent): + self.parent = parent + + def close(self): + # Only exists for backwards compatibility + pass + + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). + return True + return self.handler_order < other.handler_order + + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses. + + The purpose of this handler is to to allow other response processors a + look-in by removing the call to parent.error() from + AbstractHTTPHandler. + + For non-2xx error codes, this just passes the job on to the + Handler.<proto>_error_<code> methods, via the OpenerDirector.error method. + Eventually, HTTPDefaultErrorHandler will raise an HTTPError if no other + handler handles the error. 
+ + """ + handler_order = 1000 # after all other processors + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. + if not (200 <= code < 300): + # hardcoded http is NOT a bug + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + # why these error methods took the code, msg, headers args in the first + # place rather than a response object, I don't know, but to avoid + # multiple wrapping, we're discarding them + + if isinstance(fp, HTTPError): + response = fp + else: + response = HTTPError( + req.get_full_url(), code, msg, hdrs, fp) + assert code == response.code + assert msg == response.msg + assert hdrs == response.hdrs + raise response + +class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop + max_redirections = 10 + + # Implementation notes: + + # To avoid the server sending us into an infinite loop, the request + # object needs to track what URLs we have already seen. Do this by + # adding a handler-specific attribute to the Request object. The value + # of the dict is used to count the number of times the same URL has + # been visited. This is needed because visiting the same URL twice + # does not necessarily imply a loop, thanks to state introduced by + # cookies. + + # Always unhandled redirection codes: + # 300 Multiple Choices: should not handle this here. 
+ # 304 Not Modified: no need to handle here: only of interest to caches + # that do conditional GETs + # 305 Use Proxy: probably not worth dealing with here + # 306 Unused: what was this for in the previous versions of protocol?? + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (code in (301, 302, 303, 307, "refresh") and m in ("GET", "HEAD") + or code in (301, 302, 303, "refresh") and m == "POST"): + # Strictly (according to RFC 2616), 301 or 302 in response + # to a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib2, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + # TODO: really refresh redirections should be visiting; tricky to fix + new = _request.Request( + newurl, + headers=req.headers, + origin_req_host=req.get_origin_req_host(), + unverifiable=True, + visit=False, + timeout=req.timeout) + new._origin_req = getattr(req, "_origin_req", req) + return new + else: + raise HTTPError(req.get_full_url(), code, msg, headers, fp) + + def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. 
+ if 'location' in headers: + newurl = headers.getheaders('location')[0] + elif 'uri' in headers: + newurl = headers.getheaders('uri')[0] + else: + return + newurl = _rfc3986.clean_url(newurl, "latin-1") + newurl = _rfc3986.urljoin(req.get_full_url(), newurl) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + new = self.redirect_request(req, fp, code, msg, headers, newurl) + if new is None: + return + + # loop detection + # .redirect_dict has a key url if url was previously visited. + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): + raise HTTPError(req.get_full_url(), code, + self.inf_msg + msg, headers, fp) + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[newurl] = visited.get(newurl, 0) + 1 + + # Don't close the fp until we are sure that we won't use it + # with HTTPError. + fp.read() + fp.close() + + return self.parent.open(new) + + http_error_301 = http_error_303 = http_error_307 = http_error_302 + http_error_refresh = http_error_302 + + inf_msg = "The HTTP server returned a redirect error that would " \ + "lead to an infinite loop.\n" \ + "The last 30x error message was:\n" + + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. 
+ + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 
+ # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + +class ProxyHandler(BaseHandler): + # Proxies must be in front + handler_order = 100 + + def __init__(self, proxies=None, proxy_bypass=None): + if proxies is None: + proxies = getproxies() + + assert hasattr(proxies, 'has_key'), "proxies must be a mapping" + self.proxies = proxies + for type, url in proxies.items(): + setattr(self, '%s_open' % type, + lambda r, proxy=url, type=type, meth=self.proxy_open: \ + meth(r, proxy, type)) + if proxy_bypass is None: + proxy_bypass = urllib.proxy_bypass + self._proxy_bypass = proxy_bypass + + def proxy_open(self, req, proxy, type): + orig_type = req.get_type() + proxy_type, user, password, hostport = _parse_proxy(proxy) + + if proxy_type is None: + proxy_type = orig_type + + if req.get_host() and self._proxy_bypass(req.get_host()): + return None + + if user and password: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.b64encode(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type or orig_type == 'https': + # let other handlers take care of it + return None + else: + # need to start over, because the other handlers don't + # grok the proxy's URL type + # e.g. 
if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req) + + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, basestring): + uri = [uri] + if not realm in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.iteritems(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urlparse.urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. 
+ """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class AbstractBasicAuthHandler: + + # XXX this allows for multiple auth-schemes, but will stupidly pick + # the last one with a realm specified. + + # allow for double- and single-quoted realm values + # (single quotes are a violation of the RFC, but appear in the wild) + rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' + 'realm=(["\'])(.*?)\\2', re.I) + + # XXX could pre-emptively send auth info already accepted (RFC 2617, + # end of section 2, and section 1.2 immediately after "credentials" + # production). 
+ + def __init__(self, password_mgr=None): + if password_mgr is None: + password_mgr = HTTPPasswordMgr() + self.passwd = password_mgr + self.add_password = self.passwd.add_password + + def http_error_auth_reqed(self, authreq, host, req, headers): + # host may be an authority (without userinfo) or a URL with an + # authority + # XXX could be multiple headers + authreq = headers.get(authreq, None) + if authreq: + mo = AbstractBasicAuthHandler.rx.search(authreq) + if mo: + scheme, quote, realm = mo.groups() + if scheme.lower() == 'basic': + return self.retry_http_basic_auth(host, req, realm) + + def retry_http_basic_auth(self, host, req, realm): + user, pw = self.passwd.find_user_password(realm, host) + if pw is not None: + raw = "%s:%s" % (user, pw) + auth = 'Basic %s' % base64.b64encode(raw).strip() + if req.headers.get(self.auth_header, None) == auth: + return None + newreq = copy.copy(req) + newreq.add_header(self.auth_header, auth) + newreq.visit = False + return self.parent.open(newreq) + else: + return None + + +class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Authorization' + + def http_error_401(self, req, fp, code, msg, headers): + url = req.get_full_url() + return self.http_error_auth_reqed('www-authenticate', + url, req, headers) + + +class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Proxy-authorization' + + def http_error_407(self, req, fp, code, msg, headers): + # http_error_auth_reqed requires that there is no userinfo component in + # authority. Assume there isn't one, since urllib2 does not (and + # should not, RFC 3986 s. 3.2.1) support requests for URLs containing + # userinfo. + authority = req.get_host() + return self.http_error_auth_reqed('proxy-authenticate', + authority, req, headers) + + +def randombytes(n): + """Return n random bytes.""" + # Use /dev/urandom if it is available. Fall back to random module + # if not. 
It might be worthwhile to extend this function to use + # other platform-specific mechanisms for getting random bytes. + if os.path.exists("/dev/urandom"): + f = open("/dev/urandom") + s = f.read(n) + f.close() + return s + else: + L = [chr(random.randrange(0, 256)) for i in range(n)] + return "".join(L) + +class AbstractDigestAuthHandler: + # Digest authentication is specified in RFC 2617. + + # XXX The client does not inspect the Authentication-Info header + # in a successful response. + + # XXX It should be possible to test this implementation against + # a mock server that just generates a static set of challenges. + + # XXX qop="auth-int" supports is shaky + + def __init__(self, passwd=None): + if passwd is None: + passwd = HTTPPasswordMgr() + self.passwd = passwd + self.add_password = self.passwd.add_password + self.retried = 0 + self.nonce_count = 0 + self.last_nonce = None + + def reset_retry_count(self): + self.retried = 0 + + def http_error_auth_reqed(self, auth_header, host, req, headers): + authreq = headers.get(auth_header, None) + if self.retried > 5: + # Don't fail endlessly - if we failed once, we'll probably + # fail a second time. Hm. Unless the Password Manager is + # prompting for the information. Crap. 
This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach <wink> + raise HTTPError(req.get_full_url(), 401, "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(parse_http_list(challenge)) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + newreq = copy.copy(req) + newreq.add_unredirected_header(self.auth_header, auth_val) + newreq.visit = False + return self.parent.open(newreq) + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. 
+ dig = sha1_digest("%s:%s:%s:%s" % (self.nonce_count, nonce, + time.ctime(), randombytes(8))) + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + if user is None: + return None + + # XXX not implemented yet + if req.has_data(): + entdig = self.get_entity_digest(req.get_data(), chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.get_selector()) + if qop == 'auth': + if nonce == self.last_nonce: + self.nonce_count += 1 + else: + self.nonce_count = 1 + self.last_nonce = nonce + + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + logger = logging.getLogger("mechanize.auth") + logger.info("digest auth auth-int qop is not supported, not " + "handling digest authentication") + return None + + # XXX should the partial digests be encoded too? 
+ + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.get_selector(), + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # algorithm should be case-insensitive according to RFC2617 + algorithm = algorithm.upper() + if algorithm == 'MD5': + H = md5_digest + elif algorithm == 'SHA': + H = sha1_digest + # XXX MD5-sess + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. + """ + + auth_header = 'Authorization' + handler_order = 490 # before Basic auth + + def http_error_401(self, req, fp, code, msg, headers): + host = urlparse.urlparse(req.get_full_url())[1] + retry = self.http_error_auth_reqed('www-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + + auth_header = 'Proxy-Authorization' + handler_order = 490 # before Basic auth + + def http_error_407(self, req, fp, code, msg, headers): + host = req.get_host() + retry = self.http_error_auth_reqed('proxy-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request_(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = 
request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + sel_host = host + if request.has_proxy(): + scheme, sel = splittype(request.get_selector()) + sel_host, sel_path = splithost(sel) + + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req): + """Return an addinfourl object for the request, using http_class. + + http_class must implement the HTTPConnection API from httplib. + The addinfourl return value is a file-like object. It also + has methods and attributes including: + - info(): return a mimetools.Message object for the headers + - geturl(): return the original request URL + - code: HTTP status code + """ + host_port = req.get_host() + if not host_port: + raise URLError('no host given') + + try: + h = http_class(host_port, timeout=req.timeout) + except TypeError: + # Python < 2.6, no per-connection timeout support + h = http_class(host_port) + h.set_debuglevel(self._debuglevel) + + headers = dict(req.headers) + headers.update(req.unredirected_hdrs) + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. 
+ headers["Connection"] = "close" + headers = dict( + (name.title(), val) for name, val in headers.items()) + + if req._tunnel_host: + if not hasattr(h, "set_tunnel"): + if not hasattr(h, "_set_tunnel"): + raise URLError("HTTPS through proxy not supported " + "(Python >= 2.6.4 required)") + else: + # python 2.6 + set_tunnel = h._set_tunnel + else: + set_tunnel = h.set_tunnel + set_tunnel(req._tunnel_host) + + try: + h.request(req.get_method(), req.get_selector(), req.data, headers) + r = h.getresponse() + except socket.error, err: # XXX what error? + raise URLError(err) + + # Pick apart the HTTPResponse object to get the addinfourl + # object initialized properly. + + # Wrap the HTTPResponse object in socket's file object adapter + # for Windows. That adapter calls recv(), so delegate recv() + # to read(). This weird wrapping allows the returned object to + # have readline() and readlines() methods. + + # XXX It might be better to extract the read buffering code + # out of socket._fileobject() and into a base class. 
+ + r.recv = r.read + fp = create_readline_wrapper(r) + + resp = closeable_response(fp, r.msg, req.get_full_url(), + r.status, r.reason) + return resp + + +class HTTPHandler(AbstractHTTPHandler): + + def http_open(self, req): + return self.do_open(httplib.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(httplib, 'HTTPS'): + + class HTTPSConnectionFactory: + def __init__(self, key_file, cert_file): + self._key_file = key_file + self._cert_file = cert_file + def __call__(self, hostport): + return httplib.HTTPSConnection( + hostport, + key_file=self._key_file, cert_file=self._cert_file) + + class HTTPSHandler(AbstractHTTPHandler): + + def __init__(self, client_cert_manager=None): + AbstractHTTPHandler.__init__(self) + self.client_cert_manager = client_cert_manager + + def https_open(self, req): + if self.client_cert_manager is not None: + key_file, cert_file = self.client_cert_manager.find_key_cert( + req.get_full_url()) + conn_factory = HTTPSConnectionFactory(key_file, cert_file) + else: + conn_factory = httplib.HTTPSConnection + return self.do_open(conn_factory, req) + + https_request = AbstractHTTPHandler.do_request_ + +class HTTPCookieProcessor(BaseHandler): + """Handle HTTP cookies. 
+ + Public attributes: + + cookiejar: CookieJar instance + + """ + def __init__(self, cookiejar=None): + if cookiejar is None: + cookiejar = CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + +class UnknownHandler(BaseHandler): + def unknown_open(self, req): + type = req.get_type() + raise URLError('unknown url type: %s' % type) + +def parse_keqv_list(l): + """Parse list of key=value strings where keys are not duplicated.""" + parsed = {} + for elt in l: + k, v = elt.split('=', 1) + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + parsed[k] = v + return parsed + +def parse_http_list(s): + """Parse lists as described by RFC 2068 Section 2. + + In particular, parse comma-separated lists where the elements of + the list may include quoted-strings. A quoted-string could + contain a comma. A non-quoted string could have quotes in the + middle. Neither commas nor quotes count if they are escaped. + Only double-quotes count, not single-quotes. 
+ """ + res = [] + part = '' + + escape = quote = False + for cur in s: + if escape: + part += cur + escape = False + continue + if quote: + if cur == '\\': + escape = True + continue + elif cur == '"': + quote = False + part += cur + continue + + if cur == ',': + res.append(part) + part = '' + continue + + if cur == '"': + quote = True + + part += cur + + # append last part + if part: + res.append(part) + + return [part.strip() for part in res] + +class FileHandler(BaseHandler): + # Use local file or FTP depending on form of URL + def file_open(self, req): + url = req.get_selector() + if url[:2] == '//' and url[2:3] != '/': + req.type = 'ftp' + return self.parent.open(req) + else: + return self.open_local_file(req) + + # names for the localhost + names = None + def get_names(self): + if FileHandler.names is None: + try: + FileHandler.names = (socket.gethostbyname('localhost'), + socket.gethostbyname(socket.gethostname())) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) + return FileHandler.names + + # not entirely sure what the rules are here + def open_local_file(self, req): + try: + import email.utils as emailutils + except ImportError: + # python 2.4 + import email.Utils as emailutils + import mimetypes + host = req.get_host() + file = req.get_selector() + localfile = url2pathname(file) + try: + stats = os.stat(localfile) + size = stats.st_size + modified = emailutils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(file)[0] + headers = mimetools.Message(StringIO( + 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + if host: + host, port = splitport(host) + if not host or \ + (not port and socket.gethostbyname(host) in self.get_names()): + return addinfourl(open(localfile, 'rb'), + headers, 'file:'+file) + except OSError, msg: + # urllib2 users shouldn't expect OSErrors coming from urlopen() + raise URLError(msg) + raise URLError('file not on 
local host') + +class FTPHandler(BaseHandler): + def ftp_open(self, req): + import ftplib + import mimetypes + host = req.get_host() + if not host: + raise URLError('ftp error: no host given') + host, port = splitport(host) + if port is None: + port = ftplib.FTP_PORT + else: + port = int(port) + + # username/password handling + user, host = splituser(host) + if user: + user, passwd = splitpasswd(user) + else: + passwd = None + host = unquote(host) + user = unquote(user or '') + passwd = unquote(passwd or '') + + try: + host = socket.gethostbyname(host) + except socket.error, msg: + raise URLError(msg) + path, attrs = splitattr(req.get_selector()) + dirs = path.split('/') + dirs = map(unquote, dirs) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + fp, retrlen = fw.retrfile(file, type) + headers = "" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: + headers += "Content-type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-length: %d\n" % retrlen + sf = StringIO(headers) + headers = mimetools.Message(sf) + return addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors, msg: + raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2] + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + try: + fw = ftpwrapper(user, passwd, host, port, dirs, timeout) + except TypeError: + # Python < 2.6, no per-connection timeout support + fw = ftpwrapper(user, passwd, host, port, dirs) +## fw.ftp.set_debuglevel(1) + return fw + +class CacheFTPHandler(FTPHandler): + # XXX would be nice to have pluggable cache strategies + # XXX this stuff is definitely not thread safe + def __init__(self): + self.cache = {} + 
self.timeout = {} + self.soonest = 0 + self.delay = 60 + self.max_conns = 16 + + def setTimeout(self, t): + self.delay = t + + def setMaxConns(self, m): + self.max_conns = m + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + key = user, host, port, '/'.join(dirs), timeout + if key in self.cache: + self.timeout[key] = time.time() + self.delay + else: + self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) + self.timeout[key] = time.time() + self.delay + self.check_cache() + return self.cache[key] + + def check_cache(self): + # first check for old ones + t = time.time() + if self.soonest <= t: + for k, v in self.timeout.items(): + if v < t: + self.cache[k].close() + del self.cache[k] + del self.timeout[k] + self.soonest = min(self.timeout.values()) + + # then check the size + if len(self.cache) == self.max_conns: + for k, v in self.timeout.items(): + if v == self.soonest: + del self.cache[k] + del self.timeout[k] + break + self.soonest = min(self.timeout.values()) diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_useragent.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_useragent.py new file mode 100644 index 0000000000000000000000000000000000000000..ac28bdd7bbfda39efa1e882f4086d3eabef6f6b4 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_useragent.py @@ -0,0 +1,367 @@ +"""Convenient HTTP UserAgent class. + +This is a subclass of urllib2.OpenerDirector. + + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import warnings + +import _auth +import _gzip +import _opener +import _response +import _sockettimeout +import _urllib2 + + +class UserAgentBase(_opener.OpenerDirector): + """Convenient user-agent class. + + Do not use .add_handler() to add a handler for something already dealt with + by this code. 
+ + The only reason at present for the distinction between UserAgent and + UserAgentBase is so that classes that depend on .seek()able responses + (e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass + UserAgent exposes a .set_seekable_responses() method that allows switching + off the adding of a .seek() method to responses. + + Public attributes: + + addheaders: list of (name, value) pairs specifying headers to send with + every request, unless they are overridden in the Request instance. + + >>> ua = UserAgentBase() + >>> ua.addheaders = [ + ... ("User-agent", "Mozilla/5.0 (compatible)"), + ... ("From", "responsible.person@example.com")] + + """ + + handler_classes = { + # scheme handlers + "http": _urllib2.HTTPHandler, + # CacheFTPHandler is buggy, at least in 2.3, so we don't use it + "ftp": _urllib2.FTPHandler, + "file": _urllib2.FileHandler, + + # other handlers + "_unknown": _urllib2.UnknownHandler, + # HTTP{S,}Handler depend on HTTPErrorProcessor too + "_http_error": _urllib2.HTTPErrorProcessor, + "_http_default_error": _urllib2.HTTPDefaultErrorHandler, + + # feature handlers + "_basicauth": _urllib2.HTTPBasicAuthHandler, + "_digestauth": _urllib2.HTTPDigestAuthHandler, + "_redirect": _urllib2.HTTPRedirectHandler, + "_cookies": _urllib2.HTTPCookieProcessor, + "_refresh": _urllib2.HTTPRefreshProcessor, + "_equiv": _urllib2.HTTPEquivProcessor, + "_proxy": _urllib2.ProxyHandler, + "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler, + "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler, + "_robots": _urllib2.HTTPRobotRulesProcessor, + "_gzip": _gzip.HTTPGzipProcessor, # experimental! 
+ + # debug handlers + "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor, + "_debug_response_body": _urllib2.HTTPResponseDebugProcessor, + } + + default_schemes = ["http", "ftp", "file"] + default_others = ["_unknown", "_http_error", "_http_default_error"] + default_features = ["_redirect", "_cookies", + "_refresh", "_equiv", + "_basicauth", "_digestauth", + "_proxy", "_proxy_basicauth", "_proxy_digestauth", + "_robots", + ] + if hasattr(_urllib2, 'HTTPSHandler'): + handler_classes["https"] = _urllib2.HTTPSHandler + default_schemes.append("https") + + def __init__(self): + _opener.OpenerDirector.__init__(self) + + ua_handlers = self._ua_handlers = {} + for scheme in (self.default_schemes+ + self.default_others+ + self.default_features): + klass = self.handler_classes[scheme] + ua_handlers[scheme] = klass() + for handler in ua_handlers.itervalues(): + self.add_handler(handler) + + # Yuck. + # Ensure correct default constructor args were passed to + # HTTPRefreshProcessor and HTTPEquivProcessor. + if "_refresh" in ua_handlers: + self.set_handle_refresh(True) + if "_equiv" in ua_handlers: + self.set_handle_equiv(True) + # Ensure default password managers are installed. 
+ pm = ppm = None + if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers: + pm = _urllib2.HTTPPasswordMgrWithDefaultRealm() + if ("_proxy_basicauth" in ua_handlers or + "_proxy_digestauth" in ua_handlers): + ppm = _auth.HTTPProxyPasswordMgr() + self.set_password_manager(pm) + self.set_proxy_password_manager(ppm) + # set default certificate manager + if "https" in ua_handlers: + cm = _urllib2.HTTPSClientCertMgr() + self.set_client_cert_manager(cm) + + def close(self): + _opener.OpenerDirector.close(self) + self._ua_handlers = None + + # XXX +## def set_timeout(self, timeout): +## self._timeout = timeout +## def set_http_connection_cache(self, conn_cache): +## self._http_conn_cache = conn_cache +## def set_ftp_connection_cache(self, conn_cache): +## # XXX ATM, FTP has cache as part of handler; should it be separate? +## self._ftp_conn_cache = conn_cache + + def set_handled_schemes(self, schemes): + """Set sequence of URL scheme (protocol) strings. + + For example: ua.set_handled_schemes(["http", "ftp"]) + + If this fails (with ValueError) because you've passed an unknown + scheme, the set of handled schemes will not be changed. + + """ + want = {} + for scheme in schemes: + if scheme.startswith("_"): + raise ValueError("not a scheme '%s'" % scheme) + if scheme not in self.handler_classes: + raise ValueError("unknown scheme '%s'") + want[scheme] = None + + # get rid of scheme handlers we don't want + for scheme, oldhandler in self._ua_handlers.items(): + if scheme.startswith("_"): continue # not a scheme handler + if scheme not in want: + self._replace_handler(scheme, None) + else: + del want[scheme] # already got it + # add the scheme handlers that are missing + for scheme in want.keys(): + self._set_handler(scheme, True) + + def set_cookiejar(self, cookiejar): + """Set a mechanize.CookieJar, or None.""" + self._set_handler("_cookies", obj=cookiejar) + + # XXX could use Greg Stein's httpx for some of this instead? + # or httplib2?? 
+ def set_proxies(self, proxies=None, proxy_bypass=None): + """Configure proxy settings. + + proxies: dictionary mapping URL scheme to proxy specification. None + means use the default system-specific settings. + proxy_bypass: function taking hostname, returning whether proxy should + be used. None means use the default system-specific settings. + + The default is to try to obtain proxy settings from the system (see the + documentation for urllib.urlopen for information about the + system-specific methods used -- note that's urllib, not urllib2). + + To avoid all use of proxies, pass an empty proxies dict. + + >>> ua = UserAgentBase() + >>> def proxy_bypass(hostname): + ... return hostname == "noproxy.com" + >>> ua.set_proxies( + ... {"http": "joe:password@myproxy.example.com:3128", + ... "ftp": "proxy.example.com"}, + ... proxy_bypass) + + """ + self._set_handler("_proxy", True, + constructor_kwds=dict(proxies=proxies, + proxy_bypass=proxy_bypass)) + + def add_password(self, url, user, password, realm=None): + self._password_manager.add_password(realm, url, user, password) + def add_proxy_password(self, user, password, hostport=None, realm=None): + self._proxy_password_manager.add_password( + realm, hostport, user, password) + + def add_client_certificate(self, url, key_file, cert_file): + """Add an SSL client certificate, for HTTPS client auth. + + key_file and cert_file must be filenames of the key and certificate + files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS + 12) file to PEM format: + + openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem + openssl pkcs12 -nocerts -in cert.p12 -out key.pem + + + Note that client certificate password input is very inflexible ATM. At + the moment this seems to be console only, which is presumably the + default behaviour of libopenssl. In future mechanize may support + third-party libraries that (I assume) allow more options here. 
+ + """ + self._client_cert_manager.add_key_cert(url, key_file, cert_file) + + # the following are rarely useful -- use add_password / add_proxy_password + # instead + def set_password_manager(self, password_manager): + """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None.""" + self._password_manager = password_manager + self._set_handler("_basicauth", obj=password_manager) + self._set_handler("_digestauth", obj=password_manager) + def set_proxy_password_manager(self, password_manager): + """Set a mechanize.HTTPProxyPasswordMgr, or None.""" + self._proxy_password_manager = password_manager + self._set_handler("_proxy_basicauth", obj=password_manager) + self._set_handler("_proxy_digestauth", obj=password_manager) + def set_client_cert_manager(self, cert_manager): + """Set a mechanize.HTTPClientCertMgr, or None.""" + self._client_cert_manager = cert_manager + handler = self._ua_handlers["https"] + handler.client_cert_manager = cert_manager + + # these methods all take a boolean parameter + def set_handle_robots(self, handle): + """Set whether to observe rules from robots.txt.""" + self._set_handler("_robots", handle) + def set_handle_redirect(self, handle): + """Set whether to handle HTTP 30x redirections.""" + self._set_handler("_redirect", handle) + def set_handle_refresh(self, handle, max_time=None, honor_time=True): + """Set whether to handle HTTP Refresh headers.""" + self._set_handler("_refresh", handle, constructor_kwds= + {"max_time": max_time, "honor_time": honor_time}) + def set_handle_equiv(self, handle, head_parser_class=None): + """Set whether to treat HTML http-equiv headers like HTTP headers. + + Response objects may be .seek()able if this is set (currently returned + responses are, raised HTTPError exception responses are not). 
+ + """ + if head_parser_class is not None: + constructor_kwds = {"head_parser_class": head_parser_class} + else: + constructor_kwds={} + self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds) + def set_handle_gzip(self, handle): + """Handle gzip transfer encoding. + + """ + if handle: + warnings.warn( + "gzip transfer encoding is experimental!", stacklevel=2) + self._set_handler("_gzip", handle) + def set_debug_redirects(self, handle): + """Log information about HTTP redirects (including refreshes). + + Logging is performed using module logging. The logger name is + "mechanize.http_redirects". To actually print some debug output, + eg: + + import sys, logging + logger = logging.getLogger("mechanize.http_redirects") + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.setLevel(logging.INFO) + + Other logger names relevant to this module: + + "mechanize.http_responses" + "mechanize.cookies" + + To turn on everything: + + import sys, logging + logger = logging.getLogger("mechanize") + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.setLevel(logging.INFO) + + """ + self._set_handler("_debug_redirect", handle) + def set_debug_responses(self, handle): + """Log HTTP response bodies. + + See docstring for .set_debug_redirects() for details of logging. + + Response objects may be .seek()able if this is set (currently returned + responses are, raised HTTPError exception responses are not). 
+ + """ + self._set_handler("_debug_response_body", handle) + def set_debug_http(self, handle): + """Print HTTP headers to sys.stdout.""" + level = int(bool(handle)) + for scheme in "http", "https": + h = self._ua_handlers.get(scheme) + if h is not None: + h.set_http_debuglevel(level) + + def _set_handler(self, name, handle=None, obj=None, + constructor_args=(), constructor_kwds={}): + if handle is None: + handle = obj is not None + if handle: + handler_class = self.handler_classes[name] + if obj is not None: + newhandler = handler_class(obj) + else: + newhandler = handler_class( + *constructor_args, **constructor_kwds) + else: + newhandler = None + self._replace_handler(name, newhandler) + + def _replace_handler(self, name, newhandler=None): + # first, if handler was previously added, remove it + if name is not None: + handler = self._ua_handlers.get(name) + if handler: + try: + self.handlers.remove(handler) + except ValueError: + pass + # then add the replacement, if any + if newhandler is not None: + self.add_handler(newhandler) + self._ua_handlers[name] = newhandler + + +class UserAgent(UserAgentBase): + + def __init__(self): + UserAgentBase.__init__(self) + self._seekable = False + + def set_seekable_responses(self, handle): + """Make response objects .seek()able.""" + self._seekable = bool(handle) + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + if self._seekable: + def bound_open(fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return UserAgentBase.open(self, fullurl, data, timeout) + response = _opener.wrapped_open( + bound_open, _response.seek_wrapped_response, fullurl, data, + timeout) + else: + response = UserAgentBase.open(self, fullurl, data) + return response diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_util.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0a5ebb1f31f75f3c2b5f572b555198b9fe0c7e69 --- 
/dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_util.py @@ -0,0 +1,305 @@ +"""Utility functions and date/time routines. + + Copyright 2002-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). +""" + +import re +import time +import warnings + + +class ExperimentalWarning(UserWarning): + pass + +def experimental(message): + warnings.warn(message, ExperimentalWarning, stacklevel=3) +def hide_experimental_warnings(): + warnings.filterwarnings("ignore", category=ExperimentalWarning) +def reset_experimental_warnings(): + warnings.filterwarnings("default", category=ExperimentalWarning) + +def deprecation(message): + warnings.warn(message, DeprecationWarning, stacklevel=3) +def hide_deprecations(): + warnings.filterwarnings("ignore", category=DeprecationWarning) +def reset_deprecations(): + warnings.filterwarnings("default", category=DeprecationWarning) + + +def write_file(filename, data): + f = open(filename, "wb") + try: + f.write(data) + finally: + f.close() + + +def get1(sequence): + assert len(sequence) == 1 + return sequence[0] + + +def isstringlike(x): + try: x+"" + except: return False + else: return True + +## def caller(): +## try: +## raise SyntaxError +## except: +## import sys +## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name + + +from calendar import timegm + +# Date/time conversion routines for formats used by the HTTP protocol. 
+ +EPOCH = 1970 +def my_timegm(tt): + year, month, mday, hour, min, sec = tt[:6] + if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and + (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): + return timegm(tt) + else: + return None + +days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] +months_lower = [] +for month in months: months_lower.append(month.lower()) + + +def time2isoz(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", + representing Universal Time (UTC, aka GMT). An example of this format is: + + 1994-11-24 08:49:37Z + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec = time.gmtime(t)[:6] + return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( + year, mon, mday, hour, min, sec) + +def time2netscape(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. 
+ + The format of the returned string is like this: + + Wed, DD-Mon-YYYY HH:MM:SS GMT + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] + return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( + days[wday], mday, months[mon-1], year, hour, min, sec) + + +UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} + +timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") +def offset_from_tz_string(tz): + offset = None + if UTC_ZONES.has_key(tz): + offset = 0 + else: + m = timezone_re.search(tz) + if m: + offset = 3600 * int(m.group(2)) + if m.group(3): + offset = offset + 60 * int(m.group(3)) + if m.group(1) == '-': + offset = -offset + return offset + +def _str2time(day, mon, yr, hr, min, sec, tz): + # translate month name to number + # month numbers start with 1 (January) + try: + mon = months_lower.index(mon.lower())+1 + except ValueError: + # maybe it's already a number + try: + imon = int(mon) + except ValueError: + return None + if 1 <= imon <= 12: + mon = imon + else: + return None + + # make sure clock elements are defined + if hr is None: hr = 0 + if min is None: min = 0 + if sec is None: sec = 0 + + yr = int(yr) + day = int(day) + hr = int(hr) + min = int(min) + sec = int(sec) + + if yr < 1000: + # find "obvious" year + cur_yr = time.localtime(time.time())[0] + m = cur_yr % 100 + tmp = yr + yr = yr + cur_yr - m + m = m - tmp + if abs(m) > 50: + if m > 0: yr = yr + 100 + else: yr = yr - 100 + + # convert UTC time tuple to seconds since epoch (not timezone-adjusted) + t = my_timegm((yr, mon, day, hr, min, sec, tz)) + + if t is not None: + # adjust time using timezone string, to get absolute time since epoch + if tz is None: + tz = "UTC" + tz = tz.upper() + offset = offset_from_tz_string(tz) + if offset is None: + return None + t = t - offset + + return t + + +strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " + r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") +wkday_re = re.compile( + 
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) +loose_http_re = re.compile( + r"""^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$""", re.X) +def http2time(text): + """Returns time in seconds since epoch of time represented by a string. + + Return value is an integer. + + None is returned if the format of str is unrecognized, the time is outside + the representable range, or the timezone string is not recognized. If the + string contains no timezone, UTC is assumed. + + The timezone in the string may be numerical (like "-0800" or "+0100") or a + string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the + timezone strings equivalent to UTC (zero offset) are known to the function. + + The function loosely parses the following formats: + + Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format + Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format + Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format + 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) + 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) + 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) + + The parser ignores leading and trailing whitespace. The time may be + absent. + + If the year is given with only 2 digits, the function will select the + century that makes the year closest to the current date. + + """ + # fast exit for strictly conforming string + m = strict_re.search(text) + if m: + g = m.groups() + mon = months_lower.index(g[1].lower()) + 1 + tt = (int(g[2]), mon, int(g[0]), + int(g[3]), int(g[4]), float(g[5])) + return my_timegm(tt) + + # No, we need some messy parsing... 
+ + # clean up + text = text.lstrip() + text = wkday_re.sub("", text, 1) # Useless weekday + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = loose_http_re.search(text) + if m is not None: + day, mon, yr, hr, min, sec, tz = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + +iso_re = re.compile( + """^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$""", re.X) +def iso2time(text): + """ + As for http2time, but parses the ISO 8601 formats: + + 1994-02-03 14:15:29 -0100 -- ISO 8601 format + 1994-02-03 14:15:29 -- zone is optional + 1994-02-03 -- only date + 1994-02-03T14:15:29 -- Use T as separator + 19940203T141529Z -- ISO 8601 compact format + 19940203 -- only date + + """ + # clean up + text = text.lstrip() + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = iso_re.search(text) + if m is not None: + # XXX there's an extra bit of the timezone I'm ignoring here: is + # this the right thing to do? 
+ yr, mon, day, hr, min, sec, tz, _ = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) diff --git a/LTA/LTAIngest/mechanize-0.2.5/mechanize/_version.py b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5b07b714725721454ec14497adb7134496d06f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/mechanize/_version.py @@ -0,0 +1,2 @@ +"0.2.5" +__version__ = (0, 2, 5, None, None) diff --git a/LTA/LTAIngest/mechanize-0.2.5/release.py b/LTA/LTAIngest/mechanize-0.2.5/release.py new file mode 100644 index 0000000000000000000000000000000000000000..058c52c92eff2ccc63abdac4e38965b7e05b2db2 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/release.py @@ -0,0 +1,1129 @@ +"""%prog RELEASE_AREA [action ...] + +Perform needed actions to release mechanize, doing the work in directory +RELEASE_AREA. + +If no actions are given, print the tree of actions and do nothing. + +This is only intended to work on Unix (unlike mechanize itself). Some of it +only works on Ubuntu 10.04 (lucid). + +Warning: + + * Many actions do rm -rf on RELEASE_AREA or subdirectories of RELEASE_AREA. + + * The install_deps action installs some debian packages system-wide. The +clean action doesn't uninstall them. + + * The install_deps action adds a PPA. + + * The install_deps action downloads and installs software to RELEASE_AREA. +The clean action uninstalls (by rm -rf). +""" + +# This script depends on the code from this git repository: +# git://github.com/jjlee/mechanize-build-tools.git + +# TODO + +# * 0install package? +# * test in a Windows VM + +import glob +import optparse +import os +import re +import shutil +import smtplib +import subprocess +import sys +import tempfile +import time +import unittest + +# Stop the test runner from reporting import failure if these modules aren't +# available or not running under Python >= 2.6. 
# Stop the test runner from reporting import failure if these modules aren't
# available or not running under Python >= 2.6.  AttributeError occurs if run
# with Python < 2.6, due to lack of collections.namedtuple
try:
    import email.mime.text

    import action_tree
    import cmd_env

    import buildtools.release as release
except (ImportError, AttributeError):
    # fake module: lets this file import (and its classes be defined) even
    # when the build-tools dependencies are absent
    class action_tree(object):
        @staticmethod
        def action_node(func):
            return func

# based on Mark Seaborn's plash build-tools (action_tree) and Cmed's in-chroot
# (cmd_env) -- which is also Mark's idea


class WrongVersionError(Exception):
    """Installed mechanize version differs from the expected version."""

    def __init__(self, version):
        Exception.__init__(self, version)
        # version string actually found (may be None if import failed)
        self.version = version

    def __str__(self):
        return str(self.version)


class MissingVersionError(Exception):
    """A file that should contain the release version string doesn't."""

    def __init__(self, path, release_version):
        Exception.__init__(self, path, release_version)
        self.path = path
        self.release_version = release_version

    def __str__(self):
        return ("Release version string not found in %s: should be %s" %
                (self.path, self.release_version))


class CSSValidationError(Exception):
    """The W3C CSS validator reported errors for a stylesheet."""

    def __init__(self, path, details):
        Exception.__init__(self, path, details)
        self.path = path
        self.details = details

    def __str__(self):
        return ("CSS validation of %s failed:\n%s" %
                (self.path, self.details))


def run_performance_tests(path):
    """Run the test_performance module under path/test; return the result.

    path is the repository checkout directory; its test/ subdirectory is
    prepended to sys.path so the test module can be imported.
    """
    # TODO: use a better/standard test runner
    sys.path.insert(0, os.path.join(path, "test"))
    test_runner = unittest.TextTestRunner(verbosity=1)
    test_loader = unittest.defaultTestLoader
    modules = []
    for module_name in ["test_performance"]:
        module = __import__(module_name)
        for part in module_name.split('.')[1:]:
            module = getattr(module, part)
        modules.append(module)
    suite = unittest.TestSuite()
    for module in modules:
        test = test_loader.loadTestsFromModule(module)
        suite.addTest(test)
    # BUG FIX: run the assembled suite.  The original ran `test` -- only the
    # tests from the *last* loaded module -- leaving `suite` unused.
    result = test_runner.run(suite)
    return result


def send_email(from_address, to_address, subject, body):
    """Send a plain-text email via a local SMTP server."""
    msg = email.mime.text.MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address
    # print "from_address %r" % from_address
    # print "to_address %r" % to_address
    # print "msg.as_string():\n%s" % msg.as_string()
    s = smtplib.SMTP()
    s.connect()
    s.sendmail(from_address, [to_address], msg.as_string())
    s.quit()
def is_git_repository(path):
    """Return True if path looks like the top level of a git checkout."""
    return os.path.exists(os.path.join(path, ".git"))


def ensure_unmodified(env, path):
    # raise if working tree differs from HEAD
    release.CwdEnv(env, path).cmd(["git", "diff", "--exit-code", "HEAD"])


def add_to_path_cmd(value):
    """Return an argv prefix that appends value to $PATH before running.

    The returned list is a `sh -c` wrapper; the command to run follows it.
    """
    set_path_script = """\
if [ -n "$PATH" ]
  then
    export PATH="$PATH":%(value)s
  else
    export PATH=%(value)s
fi
exec "$@"
""" % dict(value=value)
    return ["sh", "-c", set_path_script, "inline_script"]


def clean_environ_env(env):
    """Wrap env so commands run with a minimal (HOME/PATH only) environment."""
    return cmd_env.PrefixCmdEnv(
        ["sh", "-c", 'env -i HOME="$HOME" PATH="$PATH" "$@"',
         "clean_environ_env"], env)


def ensure_trailing_slash(path):
    """Return path with exactly one trailing slash (rsync semantics)."""
    return path.rstrip("/") + "/"


def clean_dir(env, path):
    """rm -rf path, then recreate it empty."""
    env.cmd(release.rm_rf_cmd(path))
    env.cmd(["mkdir", "-p", path])


def check_version_equals(env, version, python):
    """Raise WrongVersionError unless the given python imports mechanize
    with version tuple equal to `version` (with or without the last item).
    """
    try:
        output = release.get_cmd_stdout(
            env,
            [python, "-c",
             "import mechanize; print mechanize.__version__"],
            stderr=subprocess.PIPE)
    except cmd_env.CommandFailedError:
        raise WrongVersionError(None)
    else:
        version_tuple_string = output.strip()
        assert len(version.tuple) == 6, len(version.tuple)
        if not(version_tuple_string == str(version.tuple) or
               version_tuple_string == str(version.tuple[:-1])):
            raise WrongVersionError(version_tuple_string)


def check_not_installed(env, python):
    """Raise WrongVersionError if mechanize is importable by `python`.

    Works by checking for a bogus version: success means something *is*
    installed (wrong version), failure to import means nothing is.
    """
    bogus_version = release.parse_version("0.0.0")
    try:
        check_version_equals(env, bogus_version, python)
    # "except E as e" (valid from Python 2.6, required by Python 3) replaces
    # the Python-2-only "except E, e"; this script already requires >= 2.6
    # (str.format, per install_deps)
    except WrongVersionError as exc:
        if exc.version is not None:
            raise
    else:
        raise WrongVersionError(bogus_version)


class EasyInstallTester(object):
    """easy_install a project into a scratch dir, then run its tests there."""

    def __init__(self, env, install_dir, project_name,
                 test_cmd, expected_version,
                 easy_install_cmd=("easy_install",),
                 python="python"):
        self._env = env
        self._install_dir = install_dir
        self._project_name = project_name
        self._test_cmd = test_cmd
        self._expected_version = expected_version
        self._easy_install_cmd = list(easy_install_cmd)
        self._python = python
        # env whose PYTHONPATH points at the scratch install dir
        self._install_dir_on_pythonpath = cmd_env.set_environ_vars_env(
            [("PYTHONPATH", self._install_dir)], env)

    def easy_install(self, log):
        clean_dir(self._env, self._install_dir)
        check_not_installed(self._install_dir_on_pythonpath, self._python)
        output = release.get_cmd_stdout(
            self._install_dir_on_pythonpath,
            self._easy_install_cmd + ["-d", self._install_dir,
                                      self._project_name])
        # easy_install doesn't fail properly :-(
        if "SyntaxError" in output:
            raise Exception(output)
        check_version_equals(self._install_dir_on_pythonpath,
                             self._expected_version,
                             self._python)

    def test(self, log):
        self._install_dir_on_pythonpath.cmd(self._test_cmd)

    @action_tree.action_node
    def easy_install_test(self):
        return [
            self.easy_install,
            self.test,
            ]


def make_source_dist_easy_install_test_step(env, install_dir,
                                            source_dir,
                                            test_cmd, expected_version,
                                            python_version):
    """Test easy_install of the source checkout itself ("." project)."""
    python = "python%d.%d" % python_version
    tester = EasyInstallTester(
        env,
        install_dir,
        project_name=".",
        test_cmd=test_cmd,
        expected_version=expected_version,
        easy_install_cmd=(cmd_env.in_dir(source_dir) +
                          [python, "setup.py", "easy_install"]),
        python=python)
    return tester.easy_install_test


def make_pypi_easy_install_test_step(env, install_dir,
                                     test_cmd, expected_version,
                                     python_version):
    """Test easy_install of the "mechanize" project from PyPI."""
    easy_install = "easy_install-%d.%d" % python_version
    python = "python%d.%d" % python_version
    tester = EasyInstallTester(
        env,
        install_dir,
        project_name="mechanize",
        test_cmd=test_cmd,
        expected_version=expected_version,
        easy_install_cmd=[easy_install],
        python=python)
    return tester.easy_install_test


def make_tarball_easy_install_test_step(env, install_dir,
                                        tarball_path,
                                        test_cmd, expected_version,
                                        python_version):
    """Test easy_install of a built source tarball."""
    easy_install = "easy_install-%d.%d" % python_version
    python = "python%d.%d" % python_version
    tester = EasyInstallTester(
        env,
        install_dir,
        project_name=tarball_path,
        test_cmd=test_cmd,
        expected_version=expected_version,
        easy_install_cmd=[easy_install],
        python=python)
    return tester.easy_install_test
install_dir, + tarball_path, + test_cmd, expected_version, + python_version): + easy_install = "easy_install-%d.%d" % python_version + python = "python%d.%d" % python_version + tester = EasyInstallTester( + env, + install_dir, + project_name=tarball_path, + test_cmd=test_cmd, + expected_version=expected_version, + easy_install_cmd=[easy_install], + python=python) + return tester.easy_install_test + + +class Releaser(object): + + def __init__(self, env, git_repository_path, release_area, mirror_path, + build_tools_repo_path=None, run_in_repository=False, + tag_name=None, test_uri=None): + self._release_area = release_area + self._release_dir = release_dir = os.path.join(release_area, "release") + self._opt_dir = os.path.join(release_dir, "opt") + self._bin_dir = os.path.join(self._opt_dir, "bin") + AddToPathEnv = release.make_env_maker(add_to_path_cmd) + self._env = AddToPathEnv(release.GitPagerWrapper(env), self._bin_dir) + self._source_repo_path = git_repository_path + self._in_source_repo = release.CwdEnv(self._env, + self._source_repo_path) + self._tag_name = tag_name + self._set_next_release_version() + self._clone_path = os.path.join(release_dir, "clone") + self._in_clone = release.CwdEnv(self._env, self._clone_path) + if run_in_repository: + self._in_repo = self._in_source_repo + self._repo_path = self._source_repo_path + else: + self._in_repo = self._in_clone + self._repo_path = self._clone_path + self._docs_dir = os.path.join(self._repo_path, "docs") + self._in_docs_dir = release.CwdEnv(self._env, self._docs_dir) + self._in_release_dir = release.CwdEnv(self._env, self._release_dir) + self._build_tools_path = build_tools_repo_path + if self._build_tools_path is not None: + self._website_source_path = os.path.join(self._build_tools_path, + "website") + self._mirror_path = mirror_path + self._in_mirror = release.CwdEnv(self._env, self._mirror_path) + self._css_validator_path = "css_validator" + self._test_uri = test_uri + self._test_deps_dir = 
os.path.join(release_dir, "test_deps") + self._easy_install_test_dir = os.path.join(release_dir, + "easy_install_test") + self._in_easy_install_dir = release.CwdEnv(self._env, + self._easy_install_test_dir) + # prevent anything other than functional test dependencies being on + # sys.path due to cwd or PYTHONPATH + self._easy_install_env = clean_environ_env( + release.CwdEnv(env, self._test_deps_dir)) + self._zope_testbrowser_dir = os.path.join(release_dir, + "zope_testbrowser_test") + + def _mkdtemp(self): + temp_dir = tempfile.mkdtemp(prefix="tmp-%s-" % self.__class__.__name__) + def tear_down(): + shutil.rmtree(temp_dir) + return temp_dir, tear_down + + def _get_next_release_version(self): + # --pretend / git not installed + most_recent, next = "dummy version", "dummy version" + try: + tags = release.get_cmd_stdout(self._in_source_repo, + ["git", "tag", "-l"]).split() + except cmd_env.CommandFailedError: + pass + else: + versions = [release.parse_version(tag) for tag in tags] + if versions: + most_recent = max(versions) + next = most_recent.next_version() + return most_recent, next + + def _set_next_release_version(self): + self._previous_version, self._release_version = \ + self._get_next_release_version() + if self._tag_name is not None: + self._release_version = release.parse_version(self._tag_name) + self._source_distributions = self._get_source_distributions( + self._release_version) + + def _get_source_distributions(self, version): + def dist_basename(version, format): + return "mechanize-%s.%s" % (version, format) + return set([dist_basename(version, "zip"), + dist_basename(version, "tar.gz")]) + + def git_fetch(self, log): + # for tags + self._in_source_repo.cmd(["git", "fetch"]) + self._set_next_release_version() + + def print_next_tag(self, log): + print self._release_version + + def _verify_version(self, path): + if str(self._release_version) not in \ + release.read_file_from_env(self._in_repo, path): + raise MissingVersionError(path, 
self._release_version) + + def _verify_versions(self): + for path in ["ChangeLog", "mechanize/_version.py"]: + self._verify_version(path) + + def clone(self, log): + self._env.cmd(["git", "clone", + self._source_repo_path, self._clone_path]) + + def checks(self, log): + self._verify_versions() + + def _ensure_installed(self, package_name, ppa): + release.ensure_installed(self._env, + cmd_env.PrefixCmdEnv(["sudo"], self._env), + package_name, + ppa=ppa) + + def install_css_validator_in_release_area(self, log): + jar_dir = os.path.join(self._release_area, self._css_validator_path) + clean_dir(self._env, jar_dir) + in_jar_dir = release.CwdEnv(self._env, jar_dir) + in_jar_dir.cmd([ + "wget", + "http://www.w3.org/QA/Tools/css-validator/css-validator.jar"]) + in_jar_dir.cmd(["wget", + "http://jigsaw.w3.org/Distrib/jigsaw_2.2.6.tar.bz2"]) + in_jar_dir.cmd(["sh", "-c", "tar xf jigsaw_*.tar.bz2"]) + in_jar_dir.cmd(["ln", "-s", "Jigsaw/classes/jigsaw.jar"]) + + @action_tree.action_node + def install_deps(self): + dependency_actions = [] + standard_dependency_actions = [] + def add_dependency(package_name, ppa=None): + if ppa is None: + actions = standard_dependency_actions + else: + actions = dependency_actions + actions.append( + (package_name.replace(".", ""), + lambda log: self._ensure_installed(package_name, ppa))) + add_dependency("python2.6") + # required, but ubuntu doesn't have them any more :-( I installed these + # (and zope.interface and twisted SVN trunk) by hand + # add_dependency("python2.4"), + # add_dependency("python2.5") + # add_dependency("python2.7") + add_dependency("python-setuptools") + add_dependency("git-core") + # for running zope_testbrowser tests + add_dependency("python-virtualenv") + add_dependency("python2.6-dev") + # for deployment to SF and local collation of files for release + add_dependency("rsync") + # for running functional tests against local web server + add_dependency("python-twisted-web2") + # for generating .html docs from .txt 
markdown files + add_dependency("pandoc") + # for generating docs from .in templates + add_dependency("python-empy") + # for post-processing generated HTML + add_dependency("python-lxml") + # for the validate command + add_dependency("wdg-html-validator") + # for collecting code coverage data and generating coverage reports + # no 64 bit .deb ATM + # add_dependency("python-figleaf", ppa="jjl/figleaf") + + # for css validator + add_dependency("default-jre") + add_dependency("libcommons-collections3-java") + add_dependency("libcommons-lang-java") + add_dependency("libxerces2-java") + add_dependency("libtagsoup-java") + # OMG, it depends on piles of java web server stuff, even for local + # command-line validation. You're doing it wrong! + add_dependency("velocity") + dependency_actions.append(self.install_css_validator_in_release_area) + + dependency_actions.insert(0, action_tree.make_node( + standard_dependency_actions, "standard_dependencies")) + return dependency_actions + + def copy_test_dependencies(self, log): + # so test.py can be run without the mechanize alongside it being on + # sys.path + # TODO: move mechanize package into a top-level directory, so it's not + # automatically on sys.path + def copy_in(src): + self._env.cmd(["cp", "-r", src, self._test_deps_dir]) + clean_dir(self._env, self._test_deps_dir) + copy_in(os.path.join(self._repo_path, "test.py")) + copy_in(os.path.join(self._repo_path, "test")) + copy_in(os.path.join(self._repo_path, "test-tools")) + copy_in(os.path.join(self._repo_path, "examples")) + + def _make_test_cmd(self, python_version, + local_server=True, + uri=None, + coverage=False): + python = "python%d.%d" % python_version + if coverage: + # python-figleaf only supports Python 2.6 ATM + assert python_version == (2, 6), python_version + python = "figleaf" + test_cmd = [python, "test.py"] + if not local_server: + test_cmd.append("--no-local-server") + # running against wwwsearch.sourceforge.net is slow, want to + # see where it failed 
+ test_cmd.append("-v") + if coverage: + # TODO: Fix figleaf traceback with doctests + test_cmd.append("--skip-doctests") + if uri is not None: + test_cmd.extend(["--uri", uri]) + return test_cmd + + def performance_test(self, log): + result = run_performance_tests(self._repo_path) + if not result.wasSuccessful(): + raise Exception("performance tests failed") + + def clean_coverage(self, log): + self._in_repo.cmd(["rm", "-f", ".figleaf"]) + self._in_repo.cmd(release.rm_rf_cmd("html")) + + def _make_test_step(self, env, **kwds): + test_cmd = self._make_test_cmd(**kwds) + def test_step(log): + env.cmd(test_cmd) + return test_step + + def _make_easy_install_test_cmd(self, **kwds): + test_cmd = self._make_test_cmd(**kwds) + test_cmd.extend(["discover", "--start-directory", self._test_deps_dir]) + return test_cmd + + def _make_source_dist_easy_install_test_step(self, env, **kwds): + test_cmd = self._make_easy_install_test_cmd(**kwds) + return make_source_dist_easy_install_test_step( + self._easy_install_env, self._easy_install_test_dir, + self._repo_path, test_cmd, self._release_version, + kwds["python_version"]) + + def _make_pypi_easy_install_test_step(self, env, **kwds): + test_cmd = self._make_easy_install_test_cmd(**kwds) + return make_pypi_easy_install_test_step( + self._easy_install_env, self._easy_install_test_dir, + test_cmd, self._release_version, kwds["python_version"]) + + def _make_tarball_easy_install_test_step(self, env, **kwds): + test_cmd = self._make_easy_install_test_cmd(**kwds) + [tarball] = list(d for d in self._source_distributions if + d.endswith(".tar.gz")) + return make_tarball_easy_install_test_step( + self._easy_install_env, self._easy_install_test_dir, + os.path.abspath(os.path.join(self._repo_path, "dist", tarball)), + test_cmd, self._release_version, kwds["python_version"]) + + def _make_unpacked_tarball_test_step(self, env, **kwds): + # This catches mistakes in listing test files in MANIFEST.in (the tests + # don't get installed, so these 
don't get caught by testing installed + # code). + test_cmd = self._make_test_cmd(**kwds) + [tarball] = list(d for d in self._source_distributions if + d.endswith(".tar.gz")) + tarball_path = os.path.abspath( + os.path.join(self._repo_path, "dist", tarball)) + def test_step(log): + target_dir, tear_down = self._mkdtemp() + try: + env.cmd(["tar", "-C", target_dir, "-xf", tarball_path]) + [source_dir] = glob.glob( + os.path.join(target_dir, "mechanize-*")) + test_env = clean_environ_env(release.CwdEnv(env, source_dir)) + test_env.cmd(test_cmd) + finally: + tear_down() + return test_step + + @action_tree.action_node + def test(self): + r = [] + r.append(("python27_test", + self._make_test_step(self._in_repo, python_version=(2, 7)))) + r.append(("python27_easy_install_test", + self._make_source_dist_easy_install_test_step( + self._in_repo, python_version=(2, 7)))) + r.append(("python26_test", + self._make_test_step(self._in_repo, python_version=(2, 6)))) + # disabled for the moment -- think I probably built the launchpad .deb + # from wrong branch, without bug fixes + # r.append(("python26_coverage", + # self._make_test_step(self._in_repo, python_version=(2, 6), + # coverage=True))) + r.append(("python25_easy_install_test", + self._make_source_dist_easy_install_test_step( + self._in_repo, python_version=(2, 5)))) + r.append(("python24_easy_install_test", + self._make_source_dist_easy_install_test_step( + self._in_repo, python_version=(2, 4)))) + r.append(self.performance_test) + return r + + def make_coverage_html(self, log): + self._in_repo.cmd(["figleaf2html"]) + + def tag(self, log): + self._in_repo.cmd(["git", "checkout", "master"]) + self._in_repo.cmd(["git", "tag", + "-m", "Tagging release %s" % self._release_version, + str(self._release_version)]) + + def clean_docs(self, log): + self._in_docs_dir.cmd(release.rm_rf_cmd("html")) + + def make_docs(self, log): + self._in_docs_dir.cmd(["mkdir", "-p", "html"]) + site_map = release.site_map() + def pandoc(filename, 
source_filename): + last_modified = release.last_modified(source_filename, + self._in_docs_dir) + if filename == "download.txt": + last_modified = time.gmtime() + variables = [ + ("last_modified_iso", + time.strftime("%Y-%m-%d", last_modified)), + ("last_modified_month_year", + time.strftime("%B %Y", last_modified))] + page_name = os.path.splitext(os.path.basename(filename))[0] + variables.append(("nav", release.nav_html(site_map, page_name))) + variables.append(("subnav", release.subnav_html(site_map, + page_name))) + release.pandoc(self._in_docs_dir, filename, variables=variables) + release.empy(self._in_docs_dir, "forms.txt.in") + release.empy(self._in_docs_dir, "download.txt.in", + defines=["version=%r" % str(self._release_version)]) + for page in site_map.iter_pages(): + if page.name in ["Root", "Changelog"]: + continue + source_filename = filename = page.name + ".txt" + if page.name in ["forms", "download"]: + source_filename += ".in" + pandoc(filename, source_filename) + self._in_repo.cmd(["cp", "-r", "ChangeLog", "docs/html/ChangeLog.txt"]) + if self._build_tools_path is not None: + styles = ensure_trailing_slash( + os.path.join(self._website_source_path, "styles")) + self._env.cmd(["rsync", "-a", styles, + os.path.join(self._docs_dir, "styles")]) + + def setup_py_sdist(self, log): + self._in_repo.cmd(release.rm_rf_cmd("dist")) + # write empty setup.cfg so source distribution is built using a version + # number without ".dev" and today's date appended + self._in_repo.cmd(cmd_env.write_file_cmd("setup.cfg", "")) + self._in_repo.cmd(["python", "setup.py", "sdist", + "--formats=gztar,zip"]) + archives = set(os.listdir(os.path.join(self._repo_path, "dist"))) + assert archives == self._source_distributions, \ + (archives, self._source_distributions) + + @action_tree.action_node + def build_sdist(self): + return [ + self.clean_docs, + self.make_docs, + self.setup_py_sdist, + ] + + def _stage(self, path, dest_dir, dest_basename=None, + source_base_path=None): + # 
IIRC not using rsync because didn't see easy way to avoid updating + # timestamp of unchanged files, which was upsetting git + # note: files in the website repository that are no longer generated + # must be manually deleted from the repository + if source_base_path is None: + source_base_path = self._repo_path + full_path = os.path.join(source_base_path, path) + try: + self._env.cmd(["readlink", "-e", full_path], + stdout=open(os.devnull, "w")) + except cmd_env.CommandFailedError: + print "not staging (does not exist):", full_path + return + if dest_basename is None: + dest_basename = os.path.basename(path) + dest = os.path.join(self._mirror_path, dest_dir, dest_basename) + try: + self._env.cmd(["cmp", full_path, dest]) + except cmd_env.CommandFailedError: + print "staging: %s -> %s" % (full_path, dest) + self._env.cmd(["cp", full_path, dest]) + else: + print "not staging (unchanged): %s -> %s" % (full_path, dest) + + def ensure_unmodified(self, log): + if self._build_tools_path: + ensure_unmodified(self._env, self._website_source_path) + ensure_unmodified(self._env, self._mirror_path) + + def _stage_flat_dir(self, path, dest): + self._env.cmd(["mkdir", "-p", os.path.join(self._mirror_path, dest)]) + for filename in os.listdir(path): + self._stage(os.path.join(path, filename), dest) + + def _symlink_flat_dir(self, path, exclude): + for filename in os.listdir(path): + if filename in exclude: + continue + link_dir = os.path.dirname(path) + target = os.path.relpath(os.path.join(path, filename), link_dir) + link_path = os.path.join(link_dir, filename) + if not os.path.islink(link_path) or \ + os.path.realpath(link_path) != target: + self._env.cmd(["ln", "-f", "-s", "-t", link_dir, target]) + + def collate_from_mechanize(self, log): + html_dir = os.path.join(self._docs_dir, "html") + self._stage_flat_dir(html_dir, "htdocs/mechanize/docs") + self._symlink_flat_dir( + os.path.join(self._mirror_path, "htdocs/mechanize/docs"), + exclude=[".git", ".htaccess", ".svn", 
"CVS"]) + self._stage("test-tools/cookietest.cgi", "cgi-bin") + self._stage("examples/forms/echo.cgi", "cgi-bin") + self._stage("examples/forms/example.html", "htdocs/mechanize") + for archive in self._source_distributions: + placeholder = os.path.join("htdocs/mechanize/src", archive) + self._in_mirror.cmd(["touch", placeholder]) + + def collate_from_build_tools(self, log): + self._stage(os.path.join(self._website_source_path, "frontpage.html"), + "htdocs", "index.html") + self._stage_flat_dir( + os.path.join(self._website_source_path, "styles"), "htdocs/styles") + + @action_tree.action_node + def collate(self): + r = [self.collate_from_mechanize] + if self._build_tools_path is not None: + r.append(self.collate_from_build_tools) + return r + + def collate_pypi_upload_built_items(self, log): + for archive in self._source_distributions: + self._stage(os.path.join("dist", archive), "htdocs/mechanize/src") + + def commit_staging_website(self, log): + self._in_mirror.cmd(["git", "add", "--all"]) + self._in_mirror.cmd( + ["git", "commit", + "-m", "Automated update for release %s" % self._release_version]) + + def validate_html(self, log): + exclusions = set(f for f in """\ +./cookietest.html +htdocs/basic_auth/index.html +htdocs/digest_auth/index.html +htdocs/mechanize/example.html +htdocs/test_fixtures/index.html +htdocs/test_fixtures/mechanize_reload_test.html +htdocs/test_fixtures/referertest.html +""".splitlines() if not f.startswith("#")) + for dirpath, dirnames, filenames in os.walk(self._mirror_path): + try: + # archived website + dirnames.remove("old") + except ValueError: + pass + for filename in filenames: + if filename.endswith(".html"): + page_path = os.path.join( + os.path.relpath(dirpath, self._mirror_path), filename) + if page_path not in exclusions: + self._in_mirror.cmd(["validate", page_path]) + + def _classpath_cmd(self): + from_packages = ["/usr/share/java/commons-collections3.jar", + "/usr/share/java/commons-lang.jar", + 
"/usr/share/java/xercesImpl.jar", + "/usr/share/java/tagsoup.jar", + "/usr/share/java/velocity.jar", + ] + jar_dir = os.path.join(self._release_area, self._css_validator_path) + local = glob.glob(os.path.join(jar_dir, "*.jar")) + path = ":".join(local + from_packages) + return ["env", "CLASSPATH=%s" % path] + + def _sanitise_css(self, path): + temp_dir, tear_down = self._mkdtemp() + temp_path = os.path.join(temp_dir, os.path.basename(path)) + temp = open(temp_path, "w") + try: + for line in open(path): + if line.rstrip().endswith("/*novalidate*/"): + # temp.write("/*%s*/\n" % line.rstrip()) + temp.write("/*sanitised*/\n") + else: + temp.write(line) + finally: + temp.close() + return temp_path, tear_down + + def validate_css(self, log): + env = cmd_env.PrefixCmdEnv(self._classpath_cmd(), self._in_release_dir) + # env.cmd(["java", "org.w3c.css.css.CssValidator", "--help"]) + """ +Usage: java org.w3c.css.css.CssValidator [OPTIONS] | [URL]* +OPTIONS + -p, --printCSS + Prints the validated CSS (only with text output, the CSS is printed with other outputs) + -profile PROFILE, --profile=PROFILE + Checks the Stylesheet against PROFILE + Possible values for PROFILE are css1, css2, css21 (default), css3, svg, svgbasic, svgtiny, atsc-tv, mobile, tv + -medium MEDIUM, --medium=MEDIUM + Checks the Stylesheet using the medium MEDIUM + Possible values for MEDIUM are all (default), aural, braille, embossed, handheld, print, projection, screen, tty, tv, presentation + -output OUTPUT, --output=OUTPUT + Prints the result in the selected format + Possible values for OUTPUT are text (default), xhtml, html (same result as xhtml), soap12 + -lang LANG, --lang=LANG + Prints the result in the specified language + Possible values for LANG are de, en (default), es, fr, ja, ko, nl, zh-cn, pl, it + -warning WARN, --warning=WARN + Warnings verbosity level + Possible values for WARN are -1 (no warning), 0, 1, 2 (default, all the warnings + +URL + URL can either represent a distant web resource 
(http://) or a local file (file:/) +""" + validate_cmd = ["java", "org.w3c.css.css.CssValidator"] + for dirpath, dirnames, filenames in os.walk(self._mirror_path): + for filename in filenames: + if filename.endswith(".css"): + path = os.path.join(dirpath, filename) + temp_path, tear_down = self._sanitise_css(path) + try: + page_url = "file://" + temp_path + output = release.get_cmd_stdout( + env, validate_cmd + [page_url]) + finally: + tear_down() + # the validator doesn't fail properly: it exits + # successfully on validation failure + if "Sorry! We found the following errors" in output: + raise CSSValidationError(path, output) + + def fetch_zope_testbrowser(self, log): + clean_dir(self._env, self._zope_testbrowser_dir) + in_testbrowser = release.CwdEnv(self._env, self._zope_testbrowser_dir) + in_testbrowser.cmd(["easy_install", "--editable", + "--build-directory", ".", + "zope.testbrowser[test]"]) + in_testbrowser.cmd( + ["virtualenv", "--no-site-packages", "zope.testbrowser"]) + project_dir = os.path.join(self._zope_testbrowser_dir, + "zope.testbrowser") + in_project_dir = clean_environ_env( + release.CwdEnv(self._env, project_dir)) + check_not_installed(in_project_dir, "bin/python") + in_project_dir.cmd( + ["sed", "-i", "-e", "s/mechanize[^\"']*/mechanize/", "setup.py"]) + in_project_dir.cmd(["bin/easy_install", "zc.buildout"]) + in_project_dir.cmd(["bin/buildout", "init"]) + [mechanize_tarball] = list(d for d in self._source_distributions if + d.endswith(".tar.gz")) + tarball_path = os.path.join(self._repo_path, "dist", mechanize_tarball) + in_project_dir.cmd(["bin/easy_install", tarball_path]) + in_project_dir.cmd(["bin/buildout", "install"]) + + def test_zope_testbrowser(self, log): + project_dir = os.path.join(self._zope_testbrowser_dir, + "zope.testbrowser") + env = clean_environ_env(release.CwdEnv(self._env, project_dir)) + check_version_equals(env, self._release_version, "bin/python") + env.cmd(["bin/test"]) + + @action_tree.action_node + def 
zope_testbrowser(self): + return [self.fetch_zope_testbrowser, + self.test_zope_testbrowser, + ] + + def upload_to_pypi(self, log): + self._in_repo.cmd(["python", "setup.py", "sdist", + "--formats=gztar,zip", "upload"]) + + def sync_to_sf(self, log): + assert os.path.isdir( + os.path.join(self._mirror_path, "htdocs/mechanize")) + self._env.cmd(["rsync", "-rlptvuz", "--exclude", "*~", "--delete", + ensure_trailing_slash(self._mirror_path), + "jjlee,wwwsearch@web.sourceforge.net:"]) + + @action_tree.action_node + def upload(self): + r = [] + r.append(self.upload_to_pypi) + # setup.py upload requires sdist command to upload zip files, and the + # sdist comment insists on rebuilding source distributions, so it's not + # possible to use the upload command to upload the already-built zip + # file. Work around that by copying the rebuilt source distributions + # into website repository only now (rather than at build/test time), so + # don't end up with two different sets of source distributions with + # different md5 sums due to timestamps in the archives. + r.append(self.collate_pypi_upload_built_items) + r.append(self.commit_staging_website) + + if self._mirror_path is not None: + r.append(self.sync_to_sf) + return r + + def clean(self, log): + clean_dir(self._env, self._release_area) + + def clean_most(self, log): + # not dependencies installed in release area (css validator) + clean_dir(self._env, self._release_dir) + + def write_email(self, log): + log = release.get_cmd_stdout(self._in_repo, + ["git", "log", '--pretty=format: * %s', + "%s..HEAD" % self._previous_version]) + # filter out some uninteresting commits + log = "".join(line for line in log.splitlines(True) if not + re.match("^ \* Update (?:changelog|version)$", line, + re.I)) + self._in_release_dir.cmd(cmd_env.write_file_cmd( + "announce_email.txt", u"""\ +ANN: mechanize {version} released + +http://wwwsearch.sourceforge.net/mechanize/ + +This is a stable bugfix release. 
+ +Changes since {previous_version}: + +{log} + +About mechanize +============================================= + +Requires Python 2.4, 2.5, 2.6, or 2.7. + + +Stateful programmatic web browsing, after Andy Lester's Perl module +WWW::Mechanize. + +Example: + +import re +from mechanize import Browser + +b = Browser() +b.open("http://www.example.com/") +# follow second link with element text matching regular expression +response = b.follow_link(text_regex=re.compile(r"cheese\s*shop"), nr=1) + +b.select_form(name="order") +# Browser passes through unknown attributes (including methods) +# to the selected HTMLForm +b["cheeses"] = ["mozzarella", "caerphilly"] # (the method here is __setitem__) +response2 = b.submit() # submit current form + +response3 = b.back() # back to cheese shop +response4 = b.reload() + +for link in b.forms(): + print form +# .links() optionally accepts the keyword args of .follow_/.find_link() +for link in b.links(url_regex=re.compile("python.org")): + print link + b.follow_link(link) # can be EITHER Link instance OR keyword args + b.back() + + +John +""".format(log=log, + version=self._release_version, + previous_version=self._previous_version))) + + def edit_email(self, log): + self._in_release_dir.cmd(["sensible-editor", "announce_email.txt"]) + + def push_tag(self, log): + self._in_repo.cmd(["git", "push", "git@github.com:jjlee/mechanize.git", + "tag", str(self._release_version)]) + + def send_email(self, log): + text = release.read_file_from_env(self._in_release_dir, + "announce_email.txt") + subject, sep, body = text.partition("\n") + body = body.lstrip() + assert len(body) > 0, body + send_email(from_address="John J Lee <jjl@pobox.com>", + to_address="wwwsearch-general@lists.sourceforge.net", + subject=subject, + body=body) + + @action_tree.action_node + def build(self): + return [ + self.clean, + self.install_deps, + self.clean_most, + self.git_fetch, + self.print_next_tag, + self.clone, + self.checks, + # self.clean_coverage, + 
self.copy_test_dependencies, + self.test, + # self.make_coverage_html, + self.tag, + self.build_sdist, + ("unpacked_tarball_test", self._make_unpacked_tarball_test_step( + self._env, python_version=(2,6))), + ("easy_install_test", self._make_tarball_easy_install_test_step( + self._in_repo, python_version=(2, 6), + local_server=False, uri=self._test_uri)), + self.zope_testbrowser, + self.write_email, + self.edit_email, + ] + + def update_version(self, log): + version_path = "mechanize/_version.py" + template = """\ +"%(text)s" +__version__ = %(tuple)s +""" + old_text = release.read_file_from_env(self._in_source_repo, + version_path) + old_version = old_text.splitlines()[0].strip(' "') + assert old_version == str(self._release_version), \ + (old_version, str(self._release_version)) + def version_text(version): + return template % {"text": str(version), + "tuple": repr(tuple(version.tuple[:-1]))} + assert old_text == version_text(release.parse_version(old_version)), \ + (old_text, version_text(release.parse_version(old_version))) + self._in_source_repo.cmd(cmd_env.write_file_cmd( + version_path, + version_text(self._release_version.next_version()))) + self._in_source_repo.cmd(["git", "commit", "-m", "Update version", + version_path]) + + @action_tree.action_node + def update_staging_website(self): + if self._mirror_path is None: + return [] + + return [ + self.ensure_unmodified, + self.collate, + self.validate_html, + self.validate_css, + self.commit_staging_website, + ] + + @action_tree.action_node + def tell_the_world(self): + return [ + self.push_tag, + self.upload, + ("easy_install_test_internet", + self._make_pypi_easy_install_test_step( + self._in_repo, python_version=(2, 6), + local_server=False, + uri="http://wwwsearch.sourceforge.net/")), + self.send_email, + ] + + @action_tree.action_node + def all(self): + return [ + self.build, + self.update_staging_website, + self.update_version, + self.tell_the_world, + ] + + +def parse_options(args): + parser = 
optparse.OptionParser(usage=__doc__.strip()) + release.add_basic_env_options(parser) + action_tree.add_options(parser) + parser.add_option("--mechanize-repository", metavar="DIRECTORY", + dest="git_repository_path", + help="path to mechanize git repository (default is cwd)") + parser.add_option("--build-tools-repository", metavar="DIRECTORY", + help=("path of mechanize-build-tools git repository, " + "from which to get other website source files " + "(default is not to build those files)")) + parser.add_option("--website-repository", metavar="DIRECTORY", + dest="mirror_path", + help=("path of local website mirror git repository into " + "which built files will be copied (default is not " + "to copy the files)")) + parser.add_option("--in-source-repository", action="store_true", + dest="in_repository", + help=("run all commands in original repository " + "(specified by --git-repository), rather than in " + "the clone of it in the release area")) + parser.add_option("--tag-name", metavar="TAG_NAME") + parser.add_option("--uri", default="http://wwwsearch.sourceforge.net/", + help=("base URI to run tests against when not using a " + "built-in web server")) + options, remaining_args = parser.parse_args(args) + nr_args = len(remaining_args) + try: + options.release_area = remaining_args.pop(0) + except IndexError: + parser.error("Expected at least 1 argument, got %d" % nr_args) + if options.git_repository_path is None: + options.git_repository_path = os.getcwd() + if not is_git_repository(options.git_repository_path): + parser.error("incorrect git repository path") + if options.build_tools_repository is not None and \ + not is_git_repository(options.build_tools_repository): + parser.error("incorrect mechanize-build-tools repository path") + mirror_path = options.mirror_path + if mirror_path is not None: + if not is_git_repository(options.mirror_path): + parser.error("mirror path is not a git reporsitory") + mirror_path = os.path.join(mirror_path, "mirror") + if not 
os.path.isdir(mirror_path): + parser.error("%r does not exist" % mirror_path) + options.mirror_path = mirror_path + return options, remaining_args + + +def main(argv): + if not hasattr(action_tree, "action_main"): + sys.exit("failed to import required modules") + + options, action_tree_args = parse_options(argv[1:]) + env = release.get_env_from_options(options) + releaser = Releaser(env, options.git_repository_path, options.release_area, + options.mirror_path, options.build_tools_repository, + options.in_repository, options.tag_name, options.uri) + action_tree.action_main_(releaser.all, options, action_tree_args) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/LTA/LTAIngest/mechanize-0.2.5/setup.cfg b/LTA/LTAIngest/mechanize-0.2.5/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..861a9f554263efb088d8636c4f17a30696e495ad --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/setup.cfg @@ -0,0 +1,5 @@ +[egg_info] +tag_build = +tag_date = 0 +tag_svn_revision = 0 + diff --git a/LTA/LTAIngest/mechanize-0.2.5/setup.py b/LTA/LTAIngest/mechanize-0.2.5/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..646c69d07edb68c6e1706c625e22c4cc6f1d24d2 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/setup.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +"""Stateful programmatic web browsing. + +Stateful programmatic web browsing, after Andy Lester's Perl module +WWW::Mechanize. + +mechanize.Browser implements the urllib2.OpenerDirector interface. Browser +objects have state, including navigation history, HTML form state, cookies, +etc. The set of features and URL schemes handled by Browser objects is +configurable. The library also provides an API that is mostly compatible with +urllib2: your urllib2 program will likely still work if you replace "urllib2" +with "mechanize" everywhere. 
+ +Features include: ftp:, http: and file: URL schemes, browser history, hyperlink +and HTML form support, HTTP cookies, HTTP-EQUIV and Refresh, Referer [sic] +header, robots.txt, redirections, proxies, and Basic and Digest HTTP +authentication. + +Much of the code originally derived from Perl code by Gisle Aas (libwww-perl), +Johnny Lee (MSIE Cookie support) and last but not least Andy Lester +(WWW::Mechanize). urllib2 was written by Jeremy Hylton. + +""" + +import os + +VERSION = open(os.path.join("mechanize", "_version.py")).\ + readlines()[0].strip(' "\n') + +CLASSIFIERS = """\ +Development Status :: 5 - Production/Stable +Intended Audience :: Developers +Intended Audience :: System Administrators +License :: OSI Approved :: BSD License +License :: OSI Approved :: Zope Public License +Natural Language :: English +Operating System :: OS Independent +Programming Language :: Python +Programming Language :: Python :: 2 +Programming Language :: Python :: 2.4 +Programming Language :: Python :: 2.5 +Programming Language :: Python :: 2.6 +Programming Language :: Python :: 2.7 +Topic :: Internet +Topic :: Internet :: File Transfer Protocol (FTP) +Topic :: Internet :: WWW/HTTP +Topic :: Internet :: WWW/HTTP :: Browsers +Topic :: Internet :: WWW/HTTP :: Indexing/Search +Topic :: Internet :: WWW/HTTP :: Site Management +Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking +Topic :: Software Development :: Libraries +Topic :: Software Development :: Libraries :: Python Modules +Topic :: Software Development :: Testing +Topic :: Software Development :: Testing :: Traffic Generation +Topic :: System :: Archiving :: Mirroring +Topic :: System :: Networking :: Monitoring +Topic :: System :: Systems Administration +Topic :: Text Processing +Topic :: Text Processing :: Markup +Topic :: Text Processing :: Markup :: HTML +Topic :: Text Processing :: Markup :: XML +""" + +def main(): + try: + import setuptools + except ImportError: + import ez_setup + 
ez_setup.use_setuptools() + import setuptools + setuptools.setup( + name = "mechanize", + version = VERSION, + license = "BSD", # or ZPL 2.1 + platforms = ["any"], + classifiers = [c for c in CLASSIFIERS.split("\n") if c], + install_requires = [], + zip_safe = True, + test_suite = "test", + author = "John J. Lee", + author_email = "jjl@pobox.com", + description = __doc__.split("\n", 1)[0], + long_description = __doc__.split("\n", 2)[-1], + url = "http://wwwsearch.sourceforge.net/mechanize/", + download_url = ("http://pypi.python.org/packages/source/m/mechanize/" + "mechanize-%s.tar.gz" % VERSION), + packages = ["mechanize"], + ) + + +if __name__ == "__main__": + main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/cookietest.cgi b/LTA/LTAIngest/mechanize-0.2.5/test-tools/cookietest.cgi new file mode 100755 index 0000000000000000000000000000000000000000..468682f9cddab82bdffa11463ec78d8b1619bc0a --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/cookietest.cgi @@ -0,0 +1,61 @@ +#!/usr/bin/python +# -*-python-*- + +# This is used by functional_tests.py + +#import cgitb; cgitb.enable() + +import time + +print "Content-Type: text/html" +year_plus_one = time.localtime(time.time())[0] + 1 +expires = "expires=09-Nov-%d 23:12:40 GMT" % (year_plus_one,) +print "Set-Cookie: foo=bar; %s" % expires +print "Set-Cookie: sessioncookie=spam\n" +import sys, os, string, cgi, Cookie, urllib +from xml.sax import saxutils + +from types import ListType + +print "<html><head><title>Cookies and form submission parameters</title>" +cookie = Cookie.SimpleCookie() +cookieHdr = os.environ.get("HTTP_COOKIE", "") +cookie.load(cookieHdr) +form = cgi.FieldStorage() +refresh_value = None +if form.has_key("refresh"): + refresh = form["refresh"] + if not isinstance(refresh, ListType): + refresh_value = refresh.value +if refresh_value is not None: + print '<meta http-equiv="refresh" content=%s>' % ( + saxutils.quoteattr(urllib.unquote_plus(refresh_value))) +elif not 
cookie.has_key("foo"): + print '<meta http-equiv="refresh" content="5">' + +print "</head>" +print "<p>Received cookies:</p>" +print "<pre>" +print cgi.escape(os.environ.get("HTTP_COOKIE", "")) +print "</pre>" +if cookie.has_key("foo"): + print "<p>Your browser supports cookies!" +if cookie.has_key("sessioncookie"): + print "<p>Received session cookie" +print "<p>Referer:</p>" +print "<pre>" +print cgi.escape(os.environ.get("HTTP_REFERER", "")) +print "</pre>" +print "<p>Received parameters:</p>" +print "<pre>" +for k in form.keys(): + v = form[k] + if isinstance(v, ListType): + vs = [] + for item in v: + vs.append(item.value) + text = string.join(vs, ", ") + else: + text = v.value + print "%s: %s" % (cgi.escape(k), cgi.escape(text)) +print "</pre></html>" diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/doctest.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/doctest.py new file mode 100644 index 0000000000000000000000000000000000000000..674fa5c1c5c2153e01249a5d3d3598d6e672f573 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/doctest.py @@ -0,0 +1,2695 @@ +# Module doctest. +# Released to the public domain 16-Jan-2001, by Tim Peters (tim@python.org). +# Major enhancements and refactoring by: +# Jim Fulton +# Edward Loper + +# Provided as-is; use at your own risk; no warranty; no promises; enjoy! + +r"""Module doctest -- a framework for running examples in docstrings. + +In simplest use, end each module M to be tested with: + +def _test(): + import doctest + doctest.testmod() + +if __name__ == "__main__": + _test() + +Then running the module as a script will cause the examples in the +docstrings to get executed and verified: + +python M.py + +This won't display anything unless an example fails, in which case the +failing example(s) and the cause(s) of the failure(s) are printed to stdout +(why not stderr? because stderr is a lame hack <0.2 wink>), and the final +line of output is "Test failed.". 
+ +Run it with the -v switch instead: + +python M.py -v + +and a detailed report of all examples tried is printed to stdout, along +with assorted summaries at the end. + +You can force verbose mode by passing "verbose=True" to testmod, or prohibit +it by passing "verbose=False". In either of those cases, sys.argv is not +examined by testmod. + +There are a variety of other ways to run doctests, including integration +with the unittest framework, and support for running non-Python text +files containing doctests. There are also many ways to override parts +of doctest's default behaviors. See the Library Reference Manual for +details. +""" + +__docformat__ = 'reStructuredText en' + +__all__ = [ + # 0, Option Flags + 'register_optionflag', + 'DONT_ACCEPT_TRUE_FOR_1', + 'DONT_ACCEPT_BLANKLINE', + 'NORMALIZE_WHITESPACE', + 'ELLIPSIS', + 'SKIP', + 'IGNORE_EXCEPTION_DETAIL', + 'COMPARISON_FLAGS', + 'REPORT_UDIFF', + 'REPORT_CDIFF', + 'REPORT_NDIFF', + 'REPORT_ONLY_FIRST_FAILURE', + 'REPORTING_FLAGS', + # 1. Utility Functions + 'is_private', + # 2. Example & DocTest + 'Example', + 'DocTest', + # 3. Doctest Parser + 'DocTestParser', + # 4. Doctest Finder + 'DocTestFinder', + # 5. Doctest Runner + 'DocTestRunner', + 'OutputChecker', + 'DocTestFailure', + 'UnexpectedException', + 'DebugRunner', + # 6. Test Functions + 'testmod', + 'testfile', + 'run_docstring_examples', + # 7. Tester + 'Tester', + # 8. Unittest Support + 'DocTestSuite', + 'DocFileSuite', + 'set_unittest_reportflags', + # 9. Debugging Support + 'script_from_examples', + 'testsource', + 'debug_src', + 'debug', +] + +import __future__ + +import sys, traceback, inspect, linecache_copy, os, re, types +import unittest, difflib, pdb, tempfile +import warnings +from StringIO import StringIO + +# Don't whine about the deprecated is_private function in this +# module's tests. 
+warnings.filterwarnings("ignore", "is_private", DeprecationWarning, + __name__, 0) + +# There are 4 basic classes: +# - Example: a <source, want> pair, plus an intra-docstring line number. +# - DocTest: a collection of examples, parsed from a docstring, plus +# info about where the docstring came from (name, filename, lineno). +# - DocTestFinder: extracts DocTests from a given object's docstring and +# its contained objects' docstrings. +# - DocTestRunner: runs DocTest cases, and accumulates statistics. +# +# So the basic picture is: +# +# list of: +# +------+ +---------+ +-------+ +# |object| --DocTestFinder-> | DocTest | --DocTestRunner-> |results| +# +------+ +---------+ +-------+ +# | Example | +# | ... | +# | Example | +# +---------+ + +# Option constants. + +OPTIONFLAGS_BY_NAME = {} +def register_optionflag(name): + flag = 1 << len(OPTIONFLAGS_BY_NAME) + OPTIONFLAGS_BY_NAME[name] = flag + return flag + +DONT_ACCEPT_TRUE_FOR_1 = register_optionflag('DONT_ACCEPT_TRUE_FOR_1') +DONT_ACCEPT_BLANKLINE = register_optionflag('DONT_ACCEPT_BLANKLINE') +NORMALIZE_WHITESPACE = register_optionflag('NORMALIZE_WHITESPACE') +ELLIPSIS = register_optionflag('ELLIPSIS') +SKIP = register_optionflag('SKIP') +IGNORE_EXCEPTION_DETAIL = register_optionflag('IGNORE_EXCEPTION_DETAIL') + +COMPARISON_FLAGS = (DONT_ACCEPT_TRUE_FOR_1 | + DONT_ACCEPT_BLANKLINE | + NORMALIZE_WHITESPACE | + ELLIPSIS | + SKIP | + IGNORE_EXCEPTION_DETAIL) + +REPORT_UDIFF = register_optionflag('REPORT_UDIFF') +REPORT_CDIFF = register_optionflag('REPORT_CDIFF') +REPORT_NDIFF = register_optionflag('REPORT_NDIFF') +REPORT_ONLY_FIRST_FAILURE = register_optionflag('REPORT_ONLY_FIRST_FAILURE') + +REPORTING_FLAGS = (REPORT_UDIFF | + REPORT_CDIFF | + REPORT_NDIFF | + REPORT_ONLY_FIRST_FAILURE) + +# Special string markers for use in `want` strings: +BLANKLINE_MARKER = '<BLANKLINE>' +ELLIPSIS_MARKER = '...' 
+ +###################################################################### +## Table of Contents +###################################################################### +# 1. Utility Functions +# 2. Example & DocTest -- store test cases +# 3. DocTest Parser -- extracts examples from strings +# 4. DocTest Finder -- extracts test cases from objects +# 5. DocTest Runner -- runs test cases +# 6. Test Functions -- convenient wrappers for testing +# 7. Tester Class -- for backwards compatibility +# 8. Unittest Support +# 9. Debugging Support +# 10. Example Usage + +###################################################################### +## 1. Utility Functions +###################################################################### + +def is_private(prefix, base): + """prefix, base -> true iff name prefix + "." + base is "private". + + Prefix may be an empty string, and base does not contain a period. + Prefix is ignored (although functions you write conforming to this + protocol may make use of it). + Return true iff base begins with an (at least one) underscore, but + does not both begin and end with (at least) two underscores. + + >>> is_private("a.b", "my_func") + False + >>> is_private("____", "_my_func") + True + >>> is_private("someclass", "__init__") + False + >>> is_private("sometypo", "__init_") + True + >>> is_private("x.y.z", "_") + True + >>> is_private("_x.y.z", "__") + False + >>> is_private("", "") # senseless but consistent + False + """ + warnings.warn("is_private is deprecated; it wasn't useful; " + "examine DocTestFinder.find() lists instead", + DeprecationWarning, stacklevel=2) + return base[:1] == "_" and not base[:2] == "__" == base[-2:] + +def _extract_future_flags(globs): + """ + Return the compiler-flags associated with the future features that + have been imported into the given namespace (globs). 
+ """ + flags = 0 + for fname in __future__.all_feature_names: + feature = globs.get(fname, None) + if feature is getattr(__future__, fname): + flags |= feature.compiler_flag + return flags + +def _normalize_module(module, depth=2): + """ + Return the module specified by `module`. In particular: + - If `module` is a module, then return module. + - If `module` is a string, then import and return the + module with that name. + - If `module` is None, then return the calling module. + The calling module is assumed to be the module of + the stack frame at the given depth in the call stack. + """ + if inspect.ismodule(module): + return module + elif isinstance(module, (str, unicode)): + return __import__(module, globals(), locals(), ["*"]) + elif module is None: + return sys.modules[sys._getframe(depth).f_globals['__name__']] + else: + raise TypeError("Expected a module, string, or None") + +def _load_testfile(filename, package, module_relative): + if module_relative: + package = _normalize_module(package, 3) + filename = _module_relative_path(package, filename) + if hasattr(package, '__loader__'): + if hasattr(package.__loader__, 'get_data'): + return package.__loader__.get_data(filename), filename + return open(filename).read(), filename + +def _indent(s, indent=4): + """ + Add the given number of space characters to the beginning every + non-blank line in `s`, and return the result. + """ + # This regexp matches the start of non-blank lines: + return re.sub('(?m)^(?!$)', indent*' ', s) + +def _exception_traceback(exc_info): + """ + Return a string containing a traceback message for the given + exc_info tuple (as returned by sys.exc_info()). + """ + # Get a traceback message. + excout = StringIO() + exc_type, exc_val, exc_tb = exc_info + traceback.print_exception(exc_type, exc_val, exc_tb, file=excout) + return excout.getvalue() + +# Override some StringIO methods. 
+class _SpoofOut(StringIO): + def getvalue(self): + result = StringIO.getvalue(self) + # If anything at all was written, make sure there's a trailing + # newline. There's no way for the expected output to indicate + # that a trailing newline is missing. + if result and not result.endswith("\n"): + result += "\n" + # Prevent softspace from screwing up the next test case, in + # case they used print with a trailing comma in an example. + if hasattr(self, "softspace"): + del self.softspace + return result + + def truncate(self, size=None): + StringIO.truncate(self, size) + if hasattr(self, "softspace"): + del self.softspace + +# Worst-case linear-time ellipsis matching. +def _ellipsis_match(want, got): + """ + Essentially the only subtle case: + >>> _ellipsis_match('aa...aa', 'aaa') + False + """ + if ELLIPSIS_MARKER not in want: + return want == got + + # Find "the real" strings. + ws = want.split(ELLIPSIS_MARKER) + assert len(ws) >= 2 + + # Deal with exact matches possibly needed at one or both ends. + startpos, endpos = 0, len(got) + w = ws[0] + if w: # starts with exact match + if got.startswith(w): + startpos = len(w) + del ws[0] + else: + return False + w = ws[-1] + if w: # ends with exact match + if got.endswith(w): + endpos -= len(w) + del ws[-1] + else: + return False + + if startpos > endpos: + # Exact end matches required more characters than we have, as in + # _ellipsis_match('aa...aa', 'aaa') + return False + + # For the rest, we only need to find the leftmost non-overlapping + # match for each piece. If there's no overall match that way alone, + # there's no overall match period. + for w in ws: + # w may be '' at times, if there are consecutive ellipses, or + # due to an ellipsis at the start or end of `want`. That's OK. + # Search for an empty string succeeds, and doesn't change startpos. 
+ startpos = got.find(w, startpos, endpos) + if startpos < 0: + return False + startpos += len(w) + + return True + +def _comment_line(line): + "Return a commented form of the given line" + line = line.rstrip() + if line: + return '# '+line + else: + return '#' + +class _OutputRedirectingPdb(pdb.Pdb): + """ + A specialized version of the python debugger that redirects stdout + to a given stream when interacting with the user. Stdout is *not* + redirected when traced code is executed. + """ + def __init__(self, out): + self.__out = out + self.__debugger_used = False + pdb.Pdb.__init__(self) + + def set_trace(self): + self.__debugger_used = True + pdb.Pdb.set_trace(self) + + def set_continue(self): + # Calling set_continue unconditionally would break unit test coverage + # reporting, as Bdb.set_continue calls sys.settrace(None). + if self.__debugger_used: + pdb.Pdb.set_continue(self) + + def trace_dispatch(self, *args): + # Redirect stdout to the given stream. + save_stdout = sys.stdout + sys.stdout = self.__out + # Call Pdb's trace dispatch method. + try: + return pdb.Pdb.trace_dispatch(self, *args) + finally: + sys.stdout = save_stdout + +# [XX] Normalize with respect to os.path.pardir? +def _module_relative_path(module, path): + if not inspect.ismodule(module): + raise TypeError, 'Expected a module: %r' % module + if path.startswith('/'): + raise ValueError, 'Module-relative files may not have absolute paths' + + # Find the base directory for the path. + if hasattr(module, '__file__'): + # A normal module/package + basedir = os.path.split(module.__file__)[0] + elif module.__name__ == '__main__': + # An interactive session. + if len(sys.argv)>0 and sys.argv[0] != '': + basedir = os.path.split(sys.argv[0])[0] + else: + basedir = os.curdir + else: + # A module w/o __file__ (this includes builtins) + raise ValueError("Can't resolve paths relative to the module " + + module + " (it has no __file__)") + + # Combine the base directory and the path. 
+ return os.path.join(basedir, *(path.split('/'))) + +###################################################################### +## 2. Example & DocTest +###################################################################### +## - An "example" is a <source, want> pair, where "source" is a +## fragment of source code, and "want" is the expected output for +## "source." The Example class also includes information about +## where the example was extracted from. +## +## - A "doctest" is a collection of examples, typically extracted from +## a string (such as an object's docstring). The DocTest class also +## includes information about where the string was extracted from. + +class Example: + """ + A single doctest example, consisting of source code and expected + output. `Example` defines the following attributes: + + - source: A single Python statement, always ending with a newline. + The constructor adds a newline if needed. + + - want: The expected output from running the source code (either + from stdout, or a traceback in case of exception). `want` ends + with a newline unless it's empty, in which case it's an empty + string. The constructor adds a newline if needed. + + - exc_msg: The exception message generated by the example, if + the example is expected to generate an exception; or `None` if + it is not expected to generate an exception. This exception + message is compared against the return value of + `traceback.format_exception_only()`. `exc_msg` ends with a + newline unless it's `None`. The constructor adds a newline + if needed. + + - lineno: The line number within the DocTest string containing + this Example where the Example begins. This line number is + zero-based, with respect to the beginning of the DocTest. + + - indent: The example's indentation in the DocTest string. + I.e., the number of space characters that preceed the + example's first prompt. 
+ + - options: A dictionary mapping from option flags to True or + False, which is used to override default options for this + example. Any option flags not contained in this dictionary + are left at their default value (as specified by the + DocTestRunner's optionflags). By default, no options are set. + """ + def __init__(self, source, want, exc_msg=None, lineno=0, indent=0, + options=None): + # Normalize inputs. + if not source.endswith('\n'): + source += '\n' + if want and not want.endswith('\n'): + want += '\n' + if exc_msg is not None and not exc_msg.endswith('\n'): + exc_msg += '\n' + # Store properties. + self.source = source + self.want = want + self.lineno = lineno + self.indent = indent + if options is None: options = {} + self.options = options + self.exc_msg = exc_msg + +class DocTest: + """ + A collection of doctest examples that should be run in a single + namespace. Each `DocTest` defines the following attributes: + + - examples: the list of examples. + + - globs: The namespace (aka globals) that the examples should + be run in. + + - name: A name identifying the DocTest (typically, the name of + the object whose docstring this DocTest was extracted from). + + - filename: The name of the file that this DocTest was extracted + from, or `None` if the filename is unknown. + + - lineno: The line number within filename where this DocTest + begins, or `None` if the line number is unavailable. This + line number is zero-based, with respect to the beginning of + the file. + + - docstring: The string that the examples were extracted from, + or `None` if the string is unavailable. + """ + def __init__(self, examples, globs, name, filename, lineno, docstring): + """ + Create a new DocTest containing the given examples. The + DocTest's globals are initialized with a copy of `globs`. 
+ """ + assert not isinstance(examples, basestring), \ + "DocTest no longer accepts str; use DocTestParser instead" + self.examples = examples + self.docstring = docstring + self.globs = globs.copy() + self.name = name + self.filename = filename + self.lineno = lineno + + def __repr__(self): + if len(self.examples) == 0: + examples = 'no examples' + elif len(self.examples) == 1: + examples = '1 example' + else: + examples = '%d examples' % len(self.examples) + return ('<DocTest %s from %s:%s (%s)>' % + (self.name, self.filename, self.lineno, examples)) + + + # This lets us sort tests by name: + def __cmp__(self, other): + if not isinstance(other, DocTest): + return -1 + return cmp((self.name, self.filename, self.lineno, id(self)), + (other.name, other.filename, other.lineno, id(other))) + +###################################################################### +## 3. DocTestParser +###################################################################### + +class DocTestParser: + """ + A class used to parse strings containing doctest examples. + """ + # This regular expression is used to find doctest examples in a + # string. It defines three groups: `source` is the source code + # (including leading indentation and prompts); `indent` is the + # indentation of the first (PS1) line of the source code; and + # `want` is the expected output (including leading indentation). + _EXAMPLE_RE = re.compile(r''' + # Source consists of a PS1 line followed by zero or more PS2 lines. + (?P<source> + (?:^(?P<indent> [ ]*) >>> .*) # PS1 line + (?:\n [ ]* \.\.\. .*)*) # PS2 lines + \n? + # Want consists of any non-blank lines that do not start with PS1. + (?P<want> (?:(?![ ]*$) # Not a blank line + (?![ ]*>>>) # Not a line starting with PS1 + .*$\n? # But any other line + )*) + ''', re.MULTILINE | re.VERBOSE) + + # A regular expression for handling `want` strings that contain + # expected exceptions. 
It divides `want` into three pieces: + # - the traceback header line (`hdr`) + # - the traceback stack (`stack`) + # - the exception message (`msg`), as generated by + # traceback.format_exception_only() + # `msg` may have multiple lines. We assume/require that the + # exception message is the first non-indented line starting with a word + # character following the traceback header line. + _EXCEPTION_RE = re.compile(r""" + # Grab the traceback header. Different versions of Python have + # said different things on the first traceback line. + ^(?P<hdr> Traceback\ \( + (?: most\ recent\ call\ last + | innermost\ last + ) \) : + ) + \s* $ # toss trailing whitespace on the header. + (?P<stack> .*?) # don't blink: absorb stuff until... + ^ (?P<msg> \w+ .*) # a line *starts* with alphanum. + """, re.VERBOSE | re.MULTILINE | re.DOTALL) + + # A callable returning a true value iff its argument is a blank line + # or contains a single comment. + _IS_BLANK_OR_COMMENT = re.compile(r'^[ ]*(#.*)?$').match + + def parse(self, string, name='<string>'): + """ + Divide the given string into examples and intervening text, + and return them as a list of alternating Examples and strings. + Line numbers for the Examples are 0-based. The optional + argument `name` is a name identifying this string, and is only + used for error messages. + """ + string = string.expandtabs() + # If all lines begin with the same indentation, then strip it. + min_indent = self._min_indent(string) + if min_indent > 0: + string = '\n'.join([l[min_indent:] for l in string.split('\n')]) + + output = [] + charno, lineno = 0, 0 + # Find all doctest examples in the string: + for m in self._EXAMPLE_RE.finditer(string): + # Add the pre-example text to `output`. + output.append(string[charno:m.start()]) + # Update lineno (lines before this example) + lineno += string.count('\n', charno, m.start()) + # Extract info from the regexp match. 
+ (source, options, want, exc_msg) = \ + self._parse_example(m, name, lineno) + # Create an Example, and add it to the list. + if not self._IS_BLANK_OR_COMMENT(source): + output.append( Example(source, want, exc_msg, + lineno=lineno, + indent=min_indent+len(m.group('indent')), + options=options) ) + # Update lineno (lines inside this example) + lineno += string.count('\n', m.start(), m.end()) + # Update charno. + charno = m.end() + # Add any remaining post-example text to `output`. + output.append(string[charno:]) + return output + + def get_doctest(self, string, globs, name, filename, lineno): + """ + Extract all doctest examples from the given string, and + collect them into a `DocTest` object. + + `globs`, `name`, `filename`, and `lineno` are attributes for + the new `DocTest` object. See the documentation for `DocTest` + for more information. + """ + return DocTest(self.get_examples(string, name), globs, + name, filename, lineno, string) + + def get_examples(self, string, name='<string>'): + """ + Extract all doctest examples from the given string, and return + them as a list of `Example` objects. Line numbers are + 0-based, because it's most common in doctests that nothing + interesting appears on the same line as opening triple-quote, + and so the first interesting line is called \"line 1\" then. + + The optional argument `name` is a name identifying this + string, and is only used for error messages. + """ + return [x for x in self.parse(string, name) + if isinstance(x, Example)] + + def _parse_example(self, m, name, lineno): + """ + Given a regular expression match from `_EXAMPLE_RE` (`m`), + return a pair `(source, want)`, where `source` is the matched + example's source code (with prompts and indentation stripped); + and `want` is the example's expected output (with indentation + stripped). + + `name` is the string's name, and `lineno` is the line number + where the example starts; both are used for error messages. 
+ """ + # Get the example's indentation level. + indent = len(m.group('indent')) + + # Divide source into lines; check that they're properly + # indented; and then strip their indentation & prompts. + source_lines = m.group('source').split('\n') + self._check_prompt_blank(source_lines, indent, name, lineno) + self._check_prefix(source_lines[1:], ' '*indent + '.', name, lineno) + source = '\n'.join([sl[indent+4:] for sl in source_lines]) + + # Divide want into lines; check that it's properly indented; and + # then strip the indentation. Spaces before the last newline should + # be preserved, so plain rstrip() isn't good enough. + want = m.group('want') + want_lines = want.split('\n') + if len(want_lines) > 1 and re.match(r' *$', want_lines[-1]): + del want_lines[-1] # forget final newline & spaces after it + self._check_prefix(want_lines, ' '*indent, name, + lineno + len(source_lines)) + want = '\n'.join([wl[indent:] for wl in want_lines]) + + # If `want` contains a traceback message, then extract it. + m = self._EXCEPTION_RE.match(want) + if m: + exc_msg = m.group('msg') + else: + exc_msg = None + + # Extract options from the source. + options = self._find_options(source, name, lineno) + + return source, options, want, exc_msg + + # This regular expression looks for option directives in the + # source code of an example. Option directives are comments + # starting with "doctest:". Warning: this may give false + # positives for string-literals that contain the string + # "#doctest:". Eliminating these false positives would require + # actually parsing the string; but we limit them by ignoring any + # line containing "#doctest:" that is *followed* by a quote mark. + _OPTION_DIRECTIVE_RE = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', + re.MULTILINE) + + def _find_options(self, source, name, lineno): + """ + Return a dictionary containing option overrides extracted from + option directives in the given source string. 
+ + `name` is the string's name, and `lineno` is the line number + where the example starts; both are used for error messages. + """ + options = {} + # (note: with the current regexp, this will match at most once:) + for m in self._OPTION_DIRECTIVE_RE.finditer(source): + option_strings = m.group(1).replace(',', ' ').split() + for option in option_strings: + if (option[0] not in '+-' or + option[1:] not in OPTIONFLAGS_BY_NAME): + raise ValueError('line %r of the doctest for %s ' + 'has an invalid option: %r' % + (lineno+1, name, option)) + flag = OPTIONFLAGS_BY_NAME[option[1:]] + options[flag] = (option[0] == '+') + if options and self._IS_BLANK_OR_COMMENT(source): + raise ValueError('line %r of the doctest for %s has an option ' + 'directive on a line with no example: %r' % + (lineno, name, source)) + return options + + # This regular expression finds the indentation of every non-blank + # line in a string. + _INDENT_RE = re.compile('^([ ]*)(?=\S)', re.MULTILINE) + + def _min_indent(self, s): + "Return the minimum indentation of any non-blank line in `s`" + indents = [len(indent) for indent in self._INDENT_RE.findall(s)] + if len(indents) > 0: + return min(indents) + else: + return 0 + + def _check_prompt_blank(self, lines, indent, name, lineno): + """ + Given the lines of a source string (including prompts and + leading indentation), check to make sure that every prompt is + followed by a space character. If any line is not followed by + a space character, then raise ValueError. + """ + for i, line in enumerate(lines): + if len(line) >= indent+4 and line[indent+3] != ' ': + raise ValueError('line %r of the docstring for %s ' + 'lacks blank after %s: %r' % + (lineno+i+1, name, + line[indent:indent+3], line)) + + def _check_prefix(self, lines, prefix, name, lineno): + """ + Check that every line in the given list starts with the given + prefix; if any line does not, then raise a ValueError. 
+ """ + for i, line in enumerate(lines): + if line and not line.startswith(prefix): + raise ValueError('line %r of the docstring for %s has ' + 'inconsistent leading whitespace: %r' % + (lineno+i+1, name, line)) + + +###################################################################### +## 4. DocTest Finder +###################################################################### + +class DocTestFinder: + """ + A class used to extract the DocTests that are relevant to a given + object, from its docstring and the docstrings of its contained + objects. Doctests can currently be extracted from the following + object types: modules, functions, classes, methods, staticmethods, + classmethods, and properties. + """ + + def __init__(self, verbose=False, parser=DocTestParser(), + recurse=True, _namefilter=None, exclude_empty=True): + """ + Create a new doctest finder. + + The optional argument `parser` specifies a class or + function that should be used to create new DocTest objects (or + objects that implement the same interface as DocTest). The + signature for this factory function should match the signature + of the DocTest constructor. + + If the optional argument `recurse` is false, then `find` will + only examine the given object, and not any contained objects. + + If the optional argument `exclude_empty` is false, then `find` + will include tests for objects with empty docstrings. + """ + self._parser = parser + self._verbose = verbose + self._recurse = recurse + self._exclude_empty = exclude_empty + # _namefilter is undocumented, and exists only for temporary backward- + # compatibility support of testmod's deprecated isprivate mess. + self._namefilter = _namefilter + + def find(self, obj, name=None, module=None, globs=None, + extraglobs=None): + """ + Return a list of the DocTests that are defined by the given + object's docstring, or by any of its contained objects' + docstrings. + + The optional parameter `module` is the module that contains + the given object. 
If the module is not specified or is None, then + the test finder will attempt to automatically determine the + correct module. The object's module is used: + + - As a default namespace, if `globs` is not specified. + - To prevent the DocTestFinder from extracting DocTests + from objects that are imported from other modules. + - To find the name of the file containing the object. + - To help find the line number of the object within its + file. + + Contained objects whose module does not match `module` are ignored. + + If `module` is False, no attempt to find the module will be made. + This is obscure, of use mostly in tests: if `module` is False, or + is None but cannot be found automatically, then all objects are + considered to belong to the (non-existent) module, so all contained + objects will (recursively) be searched for doctests. + + The globals for each DocTest is formed by combining `globs` + and `extraglobs` (bindings in `extraglobs` override bindings + in `globs`). A new copy of the globals dictionary is created + for each DocTest. If `globs` is not specified, then it + defaults to the module's `__dict__`, if specified, or {} + otherwise. If `extraglobs` is not specified, then it defaults + to {}. + + """ + # If name was not specified, then extract it from the object. + if name is None: + name = getattr(obj, '__name__', None) + if name is None: + raise ValueError("DocTestFinder.find: name must be given " + "when obj.__name__ doesn't exist: %r" % + (type(obj),)) + + # Find the module that contains the given object (if obj is + # a module, then module=obj.). Note: this may fail, in which + # case module will be None. + if module is False: + module = None + elif module is None: + module = inspect.getmodule(obj) + + # Read the module's source code. This is used by + # DocTestFinder._find_lineno to find the line number for a + # given object's docstring. 
+ try: + file = inspect.getsourcefile(obj) or inspect.getfile(obj) + source_lines = linecache_copy.getlines(file) + if not source_lines: + source_lines = None + except TypeError: + source_lines = None + + # Initialize globals, and merge in extraglobs. + if globs is None: + if module is None: + globs = {} + else: + globs = module.__dict__.copy() + else: + globs = globs.copy() + if extraglobs is not None: + globs.update(extraglobs) + + # Recursively expore `obj`, extracting DocTests. + tests = [] + self._find(tests, obj, name, module, source_lines, globs, {}) + return tests + + def _filter(self, obj, prefix, base): + """ + Return true if the given object should not be examined. + """ + return (self._namefilter is not None and + self._namefilter(prefix, base)) + + def _from_module(self, module, object): + """ + Return true if the given object is defined in the given + module. + """ + if module is None: + return True + elif inspect.isfunction(object): + return module.__dict__ is object.func_globals + elif inspect.isclass(object): + return module.__name__ == object.__module__ + elif inspect.getmodule(object) is not None: + return module is inspect.getmodule(object) + elif hasattr(object, '__module__'): + return module.__name__ == object.__module__ + elif isinstance(object, property): + return True # [XX] no way not be sure. + else: + raise ValueError("object must be a class or function") + + def _find(self, tests, obj, name, module, source_lines, globs, seen): + """ + Find tests for the given object and any contained objects, and + add them to `tests`. + """ + if self._verbose: + print 'Finding tests in %s' % name + + # If we've already processed this object, then ignore it. + if id(obj) in seen: + return + seen[id(obj)] = 1 + + # Find a test for this object, and add it to the list of tests. + test = self._get_test(obj, name, module, globs, source_lines) + if test is not None: + tests.append(test) + + # Look for tests in a module's contained objects. 
+ if inspect.ismodule(obj) and self._recurse: + for valname, val in obj.__dict__.items(): + # Check if this contained object should be ignored. + if self._filter(val, name, valname): + continue + valname = '%s.%s' % (name, valname) + # Recurse to functions & classes. + if ((inspect.isfunction(val) or inspect.isclass(val)) and + self._from_module(module, val)): + self._find(tests, val, valname, module, source_lines, + globs, seen) + + # Look for tests in a module's __test__ dictionary. + if inspect.ismodule(obj) and self._recurse: + for valname, val in getattr(obj, '__test__', {}).items(): + if not isinstance(valname, basestring): + raise ValueError("DocTestFinder.find: __test__ keys " + "must be strings: %r" % + (type(valname),)) + if not (inspect.isfunction(val) or inspect.isclass(val) or + inspect.ismethod(val) or inspect.ismodule(val) or + isinstance(val, basestring)): + raise ValueError("DocTestFinder.find: __test__ values " + "must be strings, functions, methods, " + "classes, or modules: %r" % + (type(val),)) + valname = '%s.__test__.%s' % (name, valname) + self._find(tests, val, valname, module, source_lines, + globs, seen) + + # Look for tests in a class's contained objects. + if inspect.isclass(obj) and self._recurse: + for valname, val in obj.__dict__.items(): + # Check if this contained object should be ignored. + if self._filter(val, name, valname): + continue + # Special handling for staticmethod/classmethod. + if isinstance(val, staticmethod): + val = getattr(obj, valname) + if isinstance(val, classmethod): + val = getattr(obj, valname).im_func + + # Recurse to methods, properties, and nested classes. 
+ if ((inspect.isfunction(val) or inspect.isclass(val) or + isinstance(val, property)) and + self._from_module(module, val)): + valname = '%s.%s' % (name, valname) + self._find(tests, val, valname, module, source_lines, + globs, seen) + + def _get_test(self, obj, name, module, globs, source_lines): + """ + Return a DocTest for the given object, if it defines a docstring; + otherwise, return None. + """ + # Extract the object's docstring. If it doesn't have one, + # then return None (no test for this object). + if isinstance(obj, basestring): + docstring = obj + else: + try: + if obj.__doc__ is None: + docstring = '' + else: + docstring = obj.__doc__ + if not isinstance(docstring, basestring): + docstring = str(docstring) + except (TypeError, AttributeError): + docstring = '' + + # Find the docstring's location in the file. + lineno = self._find_lineno(obj, source_lines) + + # Don't bother if the docstring is empty. + if self._exclude_empty and not docstring: + return None + + # Return a DocTest for this object. + if module is None: + filename = None + else: + filename = getattr(module, '__file__', module.__name__) + if filename[-4:] in (".pyc", ".pyo"): + filename = filename[:-1] + return self._parser.get_doctest(docstring, globs, name, + filename, lineno) + + def _find_lineno(self, obj, source_lines): + """ + Return a line number of the given object's docstring. Note: + this method assumes that the object has a docstring. + """ + lineno = None + + # Find the line number for modules. + if inspect.ismodule(obj): + lineno = 0 + + # Find the line number for classes. + # Note: this could be fooled if a class is defined multiple + # times in a single file. + if inspect.isclass(obj): + if source_lines is None: + return None + pat = re.compile(r'^\s*class\s*%s\b' % + getattr(obj, '__name__', '-')) + for i, line in enumerate(source_lines): + if pat.match(line): + lineno = i + break + + # Find the line number for functions & methods. 
+ if inspect.ismethod(obj): obj = obj.im_func + if inspect.isfunction(obj): obj = obj.func_code + if inspect.istraceback(obj): obj = obj.tb_frame + if inspect.isframe(obj): obj = obj.f_code + if inspect.iscode(obj): + lineno = getattr(obj, 'co_firstlineno', None)-1 + + # Find the line number where the docstring starts. Assume + # that it's the first line that begins with a quote mark. + # Note: this could be fooled by a multiline function + # signature, where a continuation line begins with a quote + # mark. + if lineno is not None: + if source_lines is None: + return lineno+1 + pat = re.compile('(^|.*:)\s*\w*("|\')') + for lineno in range(lineno, len(source_lines)): + if pat.match(source_lines[lineno]): + return lineno + + # We couldn't find the line number. + return None + +###################################################################### +## 5. DocTest Runner +###################################################################### + +class DocTestRunner: + """ + A class used to run DocTest test cases, and accumulate statistics. + The `run` method is used to process a single DocTest case. It + returns a tuple `(f, t)`, where `t` is the number of test cases + tried, and `f` is the number of test cases that failed. + + >>> tests = DocTestFinder().find(_TestClass) + >>> runner = DocTestRunner(verbose=False) + >>> for test in tests: + ... print runner.run(test) + (0, 2) + (0, 1) + (0, 2) + (0, 2) + + The `summarize` method prints a summary of all the test cases that + have been run by the runner, and returns an aggregated `(f, t)` + tuple: + + >>> runner.summarize(verbose=1) + 4 items passed all tests: + 2 tests in _TestClass + 2 tests in _TestClass.__init__ + 2 tests in _TestClass.get + 1 tests in _TestClass.square + 7 tests in 4 items. + 7 passed and 0 failed. + Test passed. 
+ (0, 7) + + The aggregated number of tried examples and failed examples is + also available via the `tries` and `failures` attributes: + + >>> runner.tries + 7 + >>> runner.failures + 0 + + The comparison between expected outputs and actual outputs is done + by an `OutputChecker`. This comparison may be customized with a + number of option flags; see the documentation for `testmod` for + more information. If the option flags are insufficient, then the + comparison may also be customized by passing a subclass of + `OutputChecker` to the constructor. + + The test runner's display output can be controlled in two ways. + First, an output function (`out) can be passed to + `TestRunner.run`; this function will be called with strings that + should be displayed. It defaults to `sys.stdout.write`. If + capturing the output is not sufficient, then the display output + can be also customized by subclassing DocTestRunner, and + overriding the methods `report_start`, `report_success`, + `report_unexpected_exception`, and `report_failure`. + """ + # This divider string is used to separate failure messages, and to + # separate sections of the summary. + DIVIDER = "*" * 70 + + def __init__(self, checker=None, verbose=None, optionflags=0): + """ + Create a new test runner. + + Optional keyword arg `checker` is the `OutputChecker` that + should be used to compare the expected outputs and actual + outputs of doctest examples. + + Optional keyword arg 'verbose' prints lots of stuff if true, + only failures if false; by default, it's true iff '-v' is in + sys.argv. + + Optional argument `optionflags` can be used to control how the + test runner compares expected output to actual output, and how + it displays failures. See the documentation for `testmod` for + more information. 
+ """ + self._checker = checker or OutputChecker() + if verbose is None: + verbose = '-v' in sys.argv + self._verbose = verbose + self.optionflags = optionflags + self.original_optionflags = optionflags + + # Keep track of the examples we've run. + self.tries = 0 + self.failures = 0 + self._name2ft = {} + + # Create a fake output target for capturing doctest output. + self._fakeout = _SpoofOut() + + #///////////////////////////////////////////////////////////////// + # Reporting methods + #///////////////////////////////////////////////////////////////// + + def report_start(self, out, test, example): + """ + Report that the test runner is about to process the given + example. (Only displays a message if verbose=True) + """ + if self._verbose: + if example.want: + out('Trying:\n' + _indent(example.source) + + 'Expecting:\n' + _indent(example.want)) + else: + out('Trying:\n' + _indent(example.source) + + 'Expecting nothing\n') + + def report_success(self, out, test, example, got): + """ + Report that the given example ran successfully. (Only + displays a message if verbose=True) + """ + if self._verbose: + out("ok\n") + + def report_failure(self, out, test, example, got): + """ + Report that the given example failed. + """ + out(self._failure_header(test, example) + + self._checker.output_difference(example, got, self.optionflags)) + + def report_unexpected_exception(self, out, test, example, exc_info): + """ + Report that the given example raised an unexpected exception. + """ + out(self._failure_header(test, example) + + 'Exception raised:\n' + _indent(_exception_traceback(exc_info))) + + def _failure_header(self, test, example): + out = [self.DIVIDER] + if test.filename: + if test.lineno is not None and example.lineno is not None: + lineno = test.lineno + example.lineno + 1 + else: + lineno = '?' 
+ out.append('File "%s", line %s, in %s' % + (test.filename, lineno, test.name)) + else: + out.append('Line %s, in %s' % (example.lineno+1, test.name)) + out.append('Failed example:') + source = example.source + out.append(_indent(source)) + return '\n'.join(out) + + #///////////////////////////////////////////////////////////////// + # DocTest Running + #///////////////////////////////////////////////////////////////// + + def __run(self, test, compileflags, out): + """ + Run the examples in `test`. Write the outcome of each example + with one of the `DocTestRunner.report_*` methods, using the + writer function `out`. `compileflags` is the set of compiler + flags that should be used to execute examples. Return a tuple + `(f, t)`, where `t` is the number of examples tried, and `f` + is the number of examples that failed. The examples are run + in the namespace `test.globs`. + """ + # Keep track of the number of failures and tries. + failures = tries = 0 + + # Save the option flags (since option directives can be used + # to modify them). + original_optionflags = self.optionflags + + SUCCESS, FAILURE, BOOM = range(3) # `outcome` state + + check = self._checker.check_output + + # Process each example. + for examplenum, example in enumerate(test.examples): + + # If REPORT_ONLY_FIRST_FAILURE is set, then supress + # reporting after the first failure. + quiet = (self.optionflags & REPORT_ONLY_FIRST_FAILURE and + failures > 0) + + # Merge in the example's options. + self.optionflags = original_optionflags + if example.options: + for (optionflag, val) in example.options.items(): + if val: + self.optionflags |= optionflag + else: + self.optionflags &= ~optionflag + + # If 'SKIP' is set, then skip this example. + if self.optionflags & SKIP: + continue + + # Record that we started this example. 
+ tries += 1 + if not quiet: + self.report_start(out, test, example) + + # Use a special filename for compile(), so we can retrieve + # the source code during interactive debugging (see + # __patched_linecache_getlines). + filename = '<doctest %s[%d]>' % (test.name, examplenum) + + # Run the example in the given context (globs), and record + # any exception that gets raised. (But don't intercept + # keyboard interrupts.) + try: + # Don't blink! This is where the user's code gets run. + exec compile(example.source, filename, "single", + compileflags, 1) in test.globs + self.debugger.set_continue() # ==== Example Finished ==== + exception = None + except KeyboardInterrupt: + raise + except: + exception = sys.exc_info() + self.debugger.set_continue() # ==== Example Finished ==== + + got = self._fakeout.getvalue() # the actual output + self._fakeout.truncate(0) + outcome = FAILURE # guilty until proved innocent or insane + + # If the example executed without raising any exceptions, + # verify its output. + if exception is None: + if check(example.want, got, self.optionflags): + outcome = SUCCESS + + # The example raised an exception: check if it was expected. + else: + exc_info = sys.exc_info() + exc_msg = traceback.format_exception_only(*exc_info[:2])[-1] + if not quiet: + got += _exception_traceback(exc_info) + + # If `example.exc_msg` is None, then we weren't expecting + # an exception. + if example.exc_msg is None: + outcome = BOOM + + # We expected an exception: see whether it matches. + elif check(example.exc_msg, exc_msg, self.optionflags): + outcome = SUCCESS + + # Another chance if they didn't care about the detail. + elif self.optionflags & IGNORE_EXCEPTION_DETAIL: + m1 = re.match(r'[^:]*:', example.exc_msg) + m2 = re.match(r'[^:]*:', exc_msg) + if m1 and m2 and check(m1.group(0), m2.group(0), + self.optionflags): + outcome = SUCCESS + + # Report the outcome. 
+ if outcome is SUCCESS: + if not quiet: + self.report_success(out, test, example, got) + elif outcome is FAILURE: + if not quiet: + self.report_failure(out, test, example, got) + failures += 1 + elif outcome is BOOM: + if not quiet: + self.report_unexpected_exception(out, test, example, + exc_info) + failures += 1 + else: + assert False, ("unknown outcome", outcome) + + # Restore the option flags (in case they were modified) + self.optionflags = original_optionflags + + # Record and return the number of failures and tries. + self.__record_outcome(test, failures, tries) + return failures, tries + + def __record_outcome(self, test, f, t): + """ + Record the fact that the given DocTest (`test`) generated `f` + failures out of `t` tried examples. + """ + f2, t2 = self._name2ft.get(test.name, (0,0)) + self._name2ft[test.name] = (f+f2, t+t2) + self.failures += f + self.tries += t + + __LINECACHE_FILENAME_RE = re.compile(r'<doctest ' + r'(?P<name>[\w\.]+)' + r'\[(?P<examplenum>\d+)\]>$') + def __patched_linecache_getlines(self, filename, module_globals=None): + m = self.__LINECACHE_FILENAME_RE.match(filename) + if m and m.group('name') == self.test.name: + example = self.test.examples[int(m.group('examplenum'))] + return example.source.splitlines(True) + else: + return self.save_linecache_getlines(filename, module_globals) + + def run(self, test, compileflags=None, out=None, clear_globs=True): + """ + Run the examples in `test`, and display the results using the + writer function `out`. + + The examples are run in the namespace `test.globs`. If + `clear_globs` is true (the default), then this namespace will + be cleared after the test runs, to help with garbage + collection. If you would like to examine the namespace after + the test completes, then use `clear_globs=False`. + + `compileflags` gives the set of flags that should be used by + the Python compiler when running the examples. 
If not + specified, then it will default to the set of future-import + flags that apply to `globs`. + + The output of each example is checked using + `DocTestRunner.check_output`, and the results are formatted by + the `DocTestRunner.report_*` methods. + """ + self.test = test + + if compileflags is None: + compileflags = _extract_future_flags(test.globs) + + save_stdout = sys.stdout + if out is None: + out = save_stdout.write + sys.stdout = self._fakeout + + # Patch pdb.set_trace to restore sys.stdout during interactive + # debugging (so it's not still redirected to self._fakeout). + # Note that the interactive output will go to *our* + # save_stdout, even if that's not the real sys.stdout; this + # allows us to write test cases for the set_trace behavior. + save_set_trace = pdb.set_trace + self.debugger = _OutputRedirectingPdb(save_stdout) + self.debugger.reset() + pdb.set_trace = self.debugger.set_trace + + # Patch linecache_copy.getlines, so we can see the example's source + # when we're inside the debugger. + self.save_linecache_getlines = linecache_copy.getlines + linecache_copy.getlines = self.__patched_linecache_getlines + + try: + return self.__run(test, compileflags, out) + finally: + sys.stdout = save_stdout + pdb.set_trace = save_set_trace + linecache_copy.getlines = self.save_linecache_getlines + if clear_globs: + test.globs.clear() + + #///////////////////////////////////////////////////////////////// + # Summarization + #///////////////////////////////////////////////////////////////// + def summarize(self, verbose=None): + """ + Print a summary of all the test cases that have been run by + this DocTestRunner, and return a tuple `(f, t)`, where `f` is + the total number of failed examples, and `t` is the total + number of tried examples. + + The optional `verbose` argument controls how detailed the + summary is. If the verbosity is not specified, then the + DocTestRunner's verbosity is used. 
+ """ + if verbose is None: + verbose = self._verbose + notests = [] + passed = [] + failed = [] + totalt = totalf = 0 + for x in self._name2ft.items(): + name, (f, t) = x + assert f <= t + totalt += t + totalf += f + if t == 0: + notests.append(name) + elif f == 0: + passed.append( (name, t) ) + else: + failed.append(x) + if verbose: + if notests: + print len(notests), "items had no tests:" + notests.sort() + for thing in notests: + print " ", thing + if passed: + print len(passed), "items passed all tests:" + passed.sort() + for thing, count in passed: + print " %3d tests in %s" % (count, thing) + if failed: + print self.DIVIDER + print len(failed), "items had failures:" + failed.sort() + for thing, (f, t) in failed: + print " %3d of %3d in %s" % (f, t, thing) + if verbose: + print totalt, "tests in", len(self._name2ft), "items." + print totalt - totalf, "passed and", totalf, "failed." + if totalf: + print "***Test Failed***", totalf, "failures." + elif verbose: + print "Test passed." + return totalf, totalt + + #///////////////////////////////////////////////////////////////// + # Backward compatibility cruft to maintain doctest.master. + #///////////////////////////////////////////////////////////////// + def merge(self, other): + d = self._name2ft + for name, (f, t) in other._name2ft.items(): + if name in d: + print "*** DocTestRunner.merge: '" + name + "' in both" \ + " testers; summing outcomes." + f2, t2 = d[name] + f = f + f2 + t = t + t2 + d[name] = f, t + +class OutputChecker: + """ + A class used to check the whether the actual output from a doctest + example matches the expected output. `OutputChecker` defines two + methods: `check_output`, which compares a given pair of outputs, + and returns true if they match; and `output_difference`, which + returns a string describing the differences between two outputs. 
+ """ + def check_output(self, want, got, optionflags): + """ + Return True iff the actual output from an example (`got`) + matches the expected output (`want`). These strings are + always considered to match if they are identical; but + depending on what option flags the test runner is using, + several non-exact match types are also possible. See the + documentation for `TestRunner` for more information about + option flags. + """ + # Handle the common case first, for efficiency: + # if they're string-identical, always return true. + if got == want: + return True + + # The values True and False replaced 1 and 0 as the return + # value for boolean comparisons in Python 2.3. + if not (optionflags & DONT_ACCEPT_TRUE_FOR_1): + if (got,want) == ("True\n", "1\n"): + return True + if (got,want) == ("False\n", "0\n"): + return True + + # <BLANKLINE> can be used as a special sequence to signify a + # blank line, unless the DONT_ACCEPT_BLANKLINE flag is used. + if not (optionflags & DONT_ACCEPT_BLANKLINE): + # Replace <BLANKLINE> in want with a blank line. + want = re.sub('(?m)^%s\s*?$' % re.escape(BLANKLINE_MARKER), + '', want) + # If a line in got contains only spaces, then remove the + # spaces. + got = re.sub('(?m)^\s*?$', '', got) + if got == want: + return True + + # This flag causes doctest to ignore any differences in the + # contents of whitespace strings. Note that this can be used + # in conjunction with the ELLIPSIS flag. + if optionflags & NORMALIZE_WHITESPACE: + got = ' '.join(got.split()) + want = ' '.join(want.split()) + if got == want: + return True + + # The ELLIPSIS flag says to let the sequence "..." in `want` + # match any substring in `got`. + if optionflags & ELLIPSIS: + if _ellipsis_match(want, got): + return True + + # We didn't find any match; return false. + return False + + # Should we do a fancy diff? + def _do_a_fancy_diff(self, want, got, optionflags): + # Not unless they asked for a fancy diff. 
+ if not optionflags & (REPORT_UDIFF | + REPORT_CDIFF | + REPORT_NDIFF): + return False + + # If expected output uses ellipsis, a meaningful fancy diff is + # too hard ... or maybe not. In two real-life failures Tim saw, + # a diff was a major help anyway, so this is commented out. + # [todo] _ellipsis_match() knows which pieces do and don't match, + # and could be the basis for a kick-ass diff in this case. + ##if optionflags & ELLIPSIS and ELLIPSIS_MARKER in want: + ## return False + + # ndiff does intraline difference marking, so can be useful even + # for 1-line differences. + if optionflags & REPORT_NDIFF: + return True + + # The other diff types need at least a few lines to be helpful. + return want.count('\n') > 2 and got.count('\n') > 2 + + def output_difference(self, example, got, optionflags): + """ + Return a string describing the differences between the + expected output for a given example (`example`) and the actual + output (`got`). `optionflags` is the set of option flags used + to compare `want` and `got`. + """ + want = example.want + # If <BLANKLINE>s are being used, then replace blank lines + # with <BLANKLINE> in the actual output string. + if not (optionflags & DONT_ACCEPT_BLANKLINE): + got = re.sub('(?m)^[ ]*(?=\n)', BLANKLINE_MARKER, got) + + # Check if we should use diff. + if self._do_a_fancy_diff(want, got, optionflags): + # Split want & got into lines. + want_lines = want.splitlines(True) # True == keep line ends + got_lines = got.splitlines(True) + # Use difflib to find their differences. 
+ if optionflags & REPORT_UDIFF: + diff = difflib.unified_diff(want_lines, got_lines, n=2) + diff = list(diff)[2:] # strip the diff header + kind = 'unified diff with -expected +actual' + elif optionflags & REPORT_CDIFF: + diff = difflib.context_diff(want_lines, got_lines, n=2) + diff = list(diff)[2:] # strip the diff header + kind = 'context diff with expected followed by actual' + elif optionflags & REPORT_NDIFF: + engine = difflib.Differ(charjunk=difflib.IS_CHARACTER_JUNK) + diff = list(engine.compare(want_lines, got_lines)) + kind = 'ndiff with -expected +actual' + else: + assert 0, 'Bad diff option' + # Remove trailing whitespace on diff output. + diff = [line.rstrip() + '\n' for line in diff] + return 'Differences (%s):\n' % kind + _indent(''.join(diff)) + + # If we're not using diff, then simply list the expected + # output followed by the actual output. + if want and got: + return 'Expected:\n%sGot:\n%s' % (_indent(want), _indent(got)) + elif want: + return 'Expected:\n%sGot nothing\n' % _indent(want) + elif got: + return 'Expected nothing\nGot:\n%s' % _indent(got) + else: + return 'Expected nothing\nGot nothing\n' + +class DocTestFailure(Exception): + """A DocTest example has failed in debugging mode. 
+ + The exception instance has variables: + + - test: the DocTest object being run + + - excample: the Example object that failed + + - got: the actual output + """ + def __init__(self, test, example, got): + self.test = test + self.example = example + self.got = got + + def __str__(self): + return str(self.test) + +class UnexpectedException(Exception): + """A DocTest example has encountered an unexpected exception + + The exception instance has variables: + + - test: the DocTest object being run + + - excample: the Example object that failed + + - exc_info: the exception info + """ + def __init__(self, test, example, exc_info): + self.test = test + self.example = example + self.exc_info = exc_info + + def __str__(self): + return str(self.test) + +class DebugRunner(DocTestRunner): + r"""Run doc tests but raise an exception as soon as there is a failure. + + If an unexpected exception occurs, an UnexpectedException is raised. + It contains the test, the example, and the original exception: + + >>> runner = DebugRunner(verbose=False) + >>> test = DocTestParser().get_doctest('>>> raise KeyError\n42', + ... {}, 'foo', 'foo.py', 0) + >>> try: + ... runner.run(test) + ... except UnexpectedException, failure: + ... pass + + >>> failure.test is test + True + + >>> failure.example.want + '42\n' + + >>> exc_info = failure.exc_info + >>> raise exc_info[0], exc_info[1], exc_info[2] + Traceback (most recent call last): + ... + KeyError + + We wrap the original exception to give the calling application + access to the test and example information. + + If the output doesn't match, then a DocTestFailure is raised: + + >>> test = DocTestParser().get_doctest(''' + ... >>> x = 1 + ... >>> x + ... 2 + ... ''', {}, 'foo', 'foo.py', 0) + + >>> try: + ... runner.run(test) + ... except DocTestFailure, failure: + ... 
pass + + DocTestFailure objects provide access to the test: + + >>> failure.test is test + True + + As well as to the example: + + >>> failure.example.want + '2\n' + + and the actual output: + + >>> failure.got + '1\n' + + If a failure or error occurs, the globals are left intact: + + >>> del test.globs['__builtins__'] + >>> test.globs + {'x': 1} + + >>> test = DocTestParser().get_doctest(''' + ... >>> x = 2 + ... >>> raise KeyError + ... ''', {}, 'foo', 'foo.py', 0) + + >>> runner.run(test) + Traceback (most recent call last): + ... + UnexpectedException: <DocTest foo from foo.py:0 (2 examples)> + + >>> del test.globs['__builtins__'] + >>> test.globs + {'x': 2} + + But the globals are cleared if there is no error: + + >>> test = DocTestParser().get_doctest(''' + ... >>> x = 2 + ... ''', {}, 'foo', 'foo.py', 0) + + >>> runner.run(test) + (0, 1) + + >>> test.globs + {} + + """ + + def run(self, test, compileflags=None, out=None, clear_globs=True): + r = DocTestRunner.run(self, test, compileflags, out, False) + if clear_globs: + test.globs.clear() + return r + + def report_unexpected_exception(self, out, test, example, exc_info): + raise UnexpectedException(test, example, exc_info) + + def report_failure(self, out, test, example, got): + raise DocTestFailure(test, example, got) + +###################################################################### +## 6. Test Functions +###################################################################### +# These should be backwards compatible. + +# For backward compatibility, a global instance of a DocTestRunner +# class, updated by testmod. 
+master = None + +def testmod(m=None, name=None, globs=None, verbose=None, isprivate=None, + report=True, optionflags=0, extraglobs=None, + raise_on_error=False, exclude_empty=False): + """m=None, name=None, globs=None, verbose=None, isprivate=None, + report=True, optionflags=0, extraglobs=None, raise_on_error=False, + exclude_empty=False + + Test examples in docstrings in functions and classes reachable + from module m (or the current module if m is not supplied), starting + with m.__doc__. Unless isprivate is specified, private names + are not skipped. + + Also test examples reachable from dict m.__test__ if it exists and is + not None. m.__test__ maps names to functions, classes and strings; + function and class docstrings are tested even if the name is private; + strings are tested directly, as if they were docstrings. + + Return (#failures, #tests). + + See doctest.__doc__ for an overview. + + Optional keyword arg "name" gives the name of the module; by default + use m.__name__. + + Optional keyword arg "globs" gives a dict to be used as the globals + when executing examples; by default, use m.__dict__. A copy of this + dict is actually used for each docstring, so that each docstring's + examples start with a clean slate. + + Optional keyword arg "extraglobs" gives a dictionary that should be + merged into the globals that are used to execute examples. By + default, no extra globals are used. This is new in 2.4. + + Optional keyword arg "verbose" prints lots of stuff if true, prints + only failures if false; by default, it's true iff "-v" is in sys.argv. + + Optional keyword arg "report" prints a summary at the end when true, + else prints nothing at the end. In verbose mode, the summary is + detailed, else very brief (in fact, empty if all tests passed). + + Optional keyword arg "optionflags" or's together module constants, + and defaults to 0. This is new in 2.3. 
Possible values (see the + docs for details): + + DONT_ACCEPT_TRUE_FOR_1 + DONT_ACCEPT_BLANKLINE + NORMALIZE_WHITESPACE + ELLIPSIS + SKIP + IGNORE_EXCEPTION_DETAIL + REPORT_UDIFF + REPORT_CDIFF + REPORT_NDIFF + REPORT_ONLY_FIRST_FAILURE + + Optional keyword arg "raise_on_error" raises an exception on the + first unexpected exception or failure. This allows failures to be + post-mortem debugged. + + Deprecated in Python 2.4: + Optional keyword arg "isprivate" specifies a function used to + determine whether a name is private. The default function is + treat all functions as public. Optionally, "isprivate" can be + set to doctest.is_private to skip over functions marked as private + using the underscore naming convention; see its docs for details. + + Advanced tomfoolery: testmod runs methods of a local instance of + class doctest.Tester, then merges the results into (or creates) + global Tester instance doctest.master. Methods of doctest.master + can be called directly too, if you want to do something unusual. + Passing report=0 to testmod is especially useful then, to delay + displaying a summary. Invoke doctest.master.summarize(verbose) + when you're done fiddling. + """ + global master + + if isprivate is not None: + warnings.warn("the isprivate argument is deprecated; " + "examine DocTestFinder.find() lists instead", + DeprecationWarning) + + # If no module was given, then use __main__. + if m is None: + # DWA - m will still be None if this wasn't invoked from the command + # line, in which case the following TypeError is about as good an error + # as we should expect + m = sys.modules.get('__main__') + + # Check that we were actually given a module. + if not inspect.ismodule(m): + raise TypeError("testmod: module required; %r" % (m,)) + + # If no name was given, then use the module's name. + if name is None: + name = m.__name__ + + # Find, parse, and run all tests in the given module. 
+ finder = DocTestFinder(_namefilter=isprivate, exclude_empty=exclude_empty) + + if raise_on_error: + runner = DebugRunner(verbose=verbose, optionflags=optionflags) + else: + runner = DocTestRunner(verbose=verbose, optionflags=optionflags) + + for test in finder.find(m, name, globs=globs, extraglobs=extraglobs): + runner.run(test) + + if report: + runner.summarize() + + if master is None: + master = runner + else: + master.merge(runner) + + return runner.failures, runner.tries + +def testfile(filename, module_relative=True, name=None, package=None, + globs=None, verbose=None, report=True, optionflags=0, + extraglobs=None, raise_on_error=False, parser=DocTestParser()): + """ + Test examples in the given file. Return (#failures, #tests). + + Optional keyword arg "module_relative" specifies how filenames + should be interpreted: + + - If "module_relative" is True (the default), then "filename" + specifies a module-relative path. By default, this path is + relative to the calling module's directory; but if the + "package" argument is specified, then it is relative to that + package. To ensure os-independence, "filename" should use + "/" characters to separate path segments, and should not + be an absolute path (i.e., it may not begin with "/"). + + - If "module_relative" is False, then "filename" specifies an + os-specific path. The path may be absolute or relative (to + the current working directory). + + Optional keyword arg "name" gives the name of the test; by default + use the file's basename. + + Optional keyword argument "package" is a Python package or the + name of a Python package whose directory should be used as the + base directory for a module relative filename. If no package is + specified, then the calling module's directory is used as the base + directory for module relative filenames. It is an error to + specify "package" if "module_relative" is False. 
+ + Optional keyword arg "globs" gives a dict to be used as the globals + when executing examples; by default, use {}. A copy of this dict + is actually used for each docstring, so that each docstring's + examples start with a clean slate. + + Optional keyword arg "extraglobs" gives a dictionary that should be + merged into the globals that are used to execute examples. By + default, no extra globals are used. + + Optional keyword arg "verbose" prints lots of stuff if true, prints + only failures if false; by default, it's true iff "-v" is in sys.argv. + + Optional keyword arg "report" prints a summary at the end when true, + else prints nothing at the end. In verbose mode, the summary is + detailed, else very brief (in fact, empty if all tests passed). + + Optional keyword arg "optionflags" or's together module constants, + and defaults to 0. Possible values (see the docs for details): + + DONT_ACCEPT_TRUE_FOR_1 + DONT_ACCEPT_BLANKLINE + NORMALIZE_WHITESPACE + ELLIPSIS + SKIP + IGNORE_EXCEPTION_DETAIL + REPORT_UDIFF + REPORT_CDIFF + REPORT_NDIFF + REPORT_ONLY_FIRST_FAILURE + + Optional keyword arg "raise_on_error" raises an exception on the + first unexpected exception or failure. This allows failures to be + post-mortem debugged. + + Optional keyword arg "parser" specifies a DocTestParser (or + subclass) that should be used to extract tests from the files. + + Advanced tomfoolery: testmod runs methods of a local instance of + class doctest.Tester, then merges the results into (or creates) + global Tester instance doctest.master. Methods of doctest.master + can be called directly too, if you want to do something unusual. + Passing report=0 to testmod is especially useful then, to delay + displaying a summary. Invoke doctest.master.summarize(verbose) + when you're done fiddling. 
+ """ + global master + + if package and not module_relative: + raise ValueError("Package may only be specified for module-" + "relative paths.") + + # Relativize the path + text, filename = _load_testfile(filename, package, module_relative) + + # If no name was given, then use the file's name. + if name is None: + name = os.path.basename(filename) + + # Assemble the globals. + if globs is None: + globs = {} + else: + globs = globs.copy() + if extraglobs is not None: + globs.update(extraglobs) + + if raise_on_error: + runner = DebugRunner(verbose=verbose, optionflags=optionflags) + else: + runner = DocTestRunner(verbose=verbose, optionflags=optionflags) + + # Read the file, convert it to a test, and run it. + test = parser.get_doctest(text, globs, name, filename, 0) + runner.run(test) + + if report: + runner.summarize() + + if master is None: + master = runner + else: + master.merge(runner) + + return runner.failures, runner.tries + +def run_docstring_examples(f, globs, verbose=False, name="NoName", + compileflags=None, optionflags=0): + """ + Test examples in the given object's docstring (`f`), using `globs` + as globals. Optional argument `name` is used in failure messages. + If the optional argument `verbose` is true, then generate output + even if there are no failures. + + `compileflags` gives the set of flags that should be used by the + Python compiler when running the examples. If not specified, then + it will default to the set of future-import flags that apply to + `globs`. + + Optional keyword arg `optionflags` specifies options for the + testing and output. See the documentation for `testmod` for more + information. + """ + # Find, parse, and run all tests in the given module. 
+ finder = DocTestFinder(verbose=verbose, recurse=False) + runner = DocTestRunner(verbose=verbose, optionflags=optionflags) + for test in finder.find(f, name, globs=globs): + runner.run(test, compileflags=compileflags) + +###################################################################### +## 7. Tester +###################################################################### +# This is provided only for backwards compatibility. It's not +# actually used in any way. + +class Tester: + def __init__(self, mod=None, globs=None, verbose=None, + isprivate=None, optionflags=0): + + warnings.warn("class Tester is deprecated; " + "use class doctest.DocTestRunner instead", + DeprecationWarning, stacklevel=2) + if mod is None and globs is None: + raise TypeError("Tester.__init__: must specify mod or globs") + if mod is not None and not inspect.ismodule(mod): + raise TypeError("Tester.__init__: mod must be a module; %r" % + (mod,)) + if globs is None: + globs = mod.__dict__ + self.globs = globs + + self.verbose = verbose + self.isprivate = isprivate + self.optionflags = optionflags + self.testfinder = DocTestFinder(_namefilter=isprivate) + self.testrunner = DocTestRunner(verbose=verbose, + optionflags=optionflags) + + def runstring(self, s, name): + test = DocTestParser().get_doctest(s, self.globs, name, None, None) + if self.verbose: + print "Running string", name + (f,t) = self.testrunner.run(test) + if self.verbose: + print f, "of", t, "examples failed in string", name + return (f,t) + + def rundoc(self, object, name=None, module=None): + f = t = 0 + tests = self.testfinder.find(object, name, module=module, + globs=self.globs) + for test in tests: + (f2, t2) = self.testrunner.run(test) + (f,t) = (f+f2, t+t2) + return (f,t) + + def rundict(self, d, name, module=None): + import new + m = new.module(name) + m.__dict__.update(d) + if module is None: + module = False + return self.rundoc(m, name, module) + + def run__test__(self, d, name): + import new + m = new.module(name) + 
m.__test__ = d + return self.rundoc(m, name) + + def summarize(self, verbose=None): + return self.testrunner.summarize(verbose) + + def merge(self, other): + self.testrunner.merge(other.testrunner) + +###################################################################### +## 8. Unittest Support +###################################################################### + +_unittest_reportflags = 0 + +def set_unittest_reportflags(flags): + """Sets the unittest option flags. + + The old flag is returned so that a runner could restore the old + value if it wished to: + + >>> import doctest + >>> old = doctest._unittest_reportflags + >>> doctest.set_unittest_reportflags(REPORT_NDIFF | + ... REPORT_ONLY_FIRST_FAILURE) == old + True + + >>> doctest._unittest_reportflags == (REPORT_NDIFF | + ... REPORT_ONLY_FIRST_FAILURE) + True + + Only reporting flags can be set: + + >>> doctest.set_unittest_reportflags(ELLIPSIS) + Traceback (most recent call last): + ... + ValueError: ('Only reporting flags allowed', 8) + + >>> doctest.set_unittest_reportflags(old) == (REPORT_NDIFF | + ... 
REPORT_ONLY_FIRST_FAILURE) + True + """ + global _unittest_reportflags + + if (flags & REPORTING_FLAGS) != flags: + raise ValueError("Only reporting flags allowed", flags) + old = _unittest_reportflags + _unittest_reportflags = flags + return old + + +class DocTestCase(unittest.TestCase): + + def __init__(self, test, optionflags=0, setUp=None, tearDown=None, + checker=None): + + unittest.TestCase.__init__(self) + self._dt_optionflags = optionflags + self._dt_checker = checker + self._dt_test = test + self._dt_setUp = setUp + self._dt_tearDown = tearDown + + def setUp(self): + test = self._dt_test + + if self._dt_setUp is not None: + self._dt_setUp(test) + + def tearDown(self): + test = self._dt_test + + if self._dt_tearDown is not None: + self._dt_tearDown(test) + + test.globs.clear() + + def runTest(self): + test = self._dt_test + old = sys.stdout + new = StringIO() + optionflags = self._dt_optionflags + + if not (optionflags & REPORTING_FLAGS): + # The option flags don't include any reporting flags, + # so add the default reporting flags + optionflags |= _unittest_reportflags + + runner = DocTestRunner(optionflags=optionflags, + checker=self._dt_checker, verbose=False) + + try: + runner.DIVIDER = "-"*70 + failures, tries = runner.run( + test, out=new.write, clear_globs=False) + finally: + sys.stdout = old + + if failures: + raise self.failureException(self.format_failure(new.getvalue())) + + def format_failure(self, err): + test = self._dt_test + if test.lineno is None: + lineno = 'unknown line number' + else: + lineno = '%s' % test.lineno + lname = '.'.join(test.name.split('.')[-1:]) + return ('Failed doctest test for %s\n' + ' File "%s", line %s, in %s\n\n%s' + % (test.name, test.filename, lineno, lname, err) + ) + + def debug(self): + r"""Run the test case without results and without catching exceptions + + The unit test framework includes a debug method on test cases + and test suites to support post-mortem debugging. 
The test code + is run in such a way that errors are not caught. This way a + caller can catch the errors and initiate post-mortem debugging. + + The DocTestCase provides a debug method that raises + UnexpectedException errors if there is an unexepcted + exception: + + >>> test = DocTestParser().get_doctest('>>> raise KeyError\n42', + ... {}, 'foo', 'foo.py', 0) + >>> case = DocTestCase(test) + >>> try: + ... case.debug() + ... except UnexpectedException, failure: + ... pass + + The UnexpectedException contains the test, the example, and + the original exception: + + >>> failure.test is test + True + + >>> failure.example.want + '42\n' + + >>> exc_info = failure.exc_info + >>> raise exc_info[0], exc_info[1], exc_info[2] + Traceback (most recent call last): + ... + KeyError + + If the output doesn't match, then a DocTestFailure is raised: + + >>> test = DocTestParser().get_doctest(''' + ... >>> x = 1 + ... >>> x + ... 2 + ... ''', {}, 'foo', 'foo.py', 0) + >>> case = DocTestCase(test) + + >>> try: + ... case.debug() + ... except DocTestFailure, failure: + ... pass + + DocTestFailure objects provide access to the test: + + >>> failure.test is test + True + + As well as to the example: + + >>> failure.example.want + '2\n' + + and the actual output: + + >>> failure.got + '1\n' + + """ + + self.setUp() + runner = DebugRunner(optionflags=self._dt_optionflags, + checker=self._dt_checker, verbose=False) + runner.run(self._dt_test) + self.tearDown() + + def id(self): + return self._dt_test.name + + def __repr__(self): + name = self._dt_test.name.split('.') + return "%s (%s)" % (name[-1], '.'.join(name[:-1])) + + __str__ = __repr__ + + def shortDescription(self): + return "Doctest: " + self._dt_test.name + +def DocTestSuite(module=None, globs=None, extraglobs=None, test_finder=None, + **options): + """ + Convert doctest tests for a module to a unittest test suite. + + This converts each documentation string in a module that + contains doctest tests to a unittest test case. 
If any of the + tests in a doc string fail, then the test case fails. An exception + is raised showing the name of the file containing the test and a + (sometimes approximate) line number. + + The `module` argument provides the module to be tested. The argument + can be either a module or a module name. + + If no argument is given, the calling module is used. + + A number of options may be provided as keyword arguments: + + setUp + A set-up function. This is called before running the + tests in each file. The setUp function will be passed a DocTest + object. The setUp function can access the test globals as the + globs attribute of the test passed. + + tearDown + A tear-down function. This is called after running the + tests in each file. The tearDown function will be passed a DocTest + object. The tearDown function can access the test globals as the + globs attribute of the test passed. + + globs + A dictionary containing initial global variables for the tests. + + optionflags + A set of doctest option flags expressed as an integer. + """ + + if test_finder is None: + test_finder = DocTestFinder() + + module = _normalize_module(module) + tests = test_finder.find(module, globs=globs, extraglobs=extraglobs) + if globs is None: + globs = module.__dict__ + if not tests: + # Why do we want to do this? Because it reveals a bug that might + # otherwise be hidden. 
+ raise ValueError(module, "has no tests") + + tests.sort() + suite = unittest.TestSuite() + for test in tests: + if len(test.examples) == 0: + continue + if not test.filename: + filename = module.__file__ + if filename[-4:] in (".pyc", ".pyo"): + filename = filename[:-1] + test.filename = filename + suite.addTest(DocTestCase(test, **options)) + + return suite + +class DocFileCase(DocTestCase): + + def id(self): + return '_'.join(self._dt_test.name.split('.')) + + def __repr__(self): + return self._dt_test.filename + __str__ = __repr__ + + def format_failure(self, err): + return ('Failed doctest test for %s\n File "%s", line 0\n\n%s' + % (self._dt_test.name, self._dt_test.filename, err) + ) + +def DocFileTest(path, module_relative=True, package=None, + globs=None, parser=DocTestParser(), **options): + if globs is None: + globs = {} + else: + globs = globs.copy() + + if package and not module_relative: + raise ValueError("Package may only be specified for module-" + "relative paths.") + + # Relativize the path. + doc, path = _load_testfile(path, package, module_relative) + + if "__file__" not in globs: + globs["__file__"] = path + + # Find the file and read it. + name = os.path.basename(path) + + # Convert it to a test, and wrap it in a DocFileCase. + test = parser.get_doctest(doc, globs, name, path, 0) + return DocFileCase(test, **options) + +def DocFileSuite(*paths, **kw): + """A unittest suite for one or more doctest files. + + The path to each doctest file is given as a string; the + interpretation of that string depends on the keyword argument + "module_relative". + + A number of options may be provided as keyword arguments: + + module_relative + If "module_relative" is True, then the given file paths are + interpreted as os-independent module-relative paths. By + default, these paths are relative to the calling module's + directory; but if the "package" argument is specified, then + they are relative to that package. 
To ensure os-independence, + "filename" should use "/" characters to separate path + segments, and may not be an absolute path (i.e., it may not + begin with "/"). + + If "module_relative" is False, then the given file paths are + interpreted as os-specific paths. These paths may be absolute + or relative (to the current working directory). + + package + A Python package or the name of a Python package whose directory + should be used as the base directory for module relative paths. + If "package" is not specified, then the calling module's + directory is used as the base directory for module relative + filenames. It is an error to specify "package" if + "module_relative" is False. + + setUp + A set-up function. This is called before running the + tests in each file. The setUp function will be passed a DocTest + object. The setUp function can access the test globals as the + globs attribute of the test passed. + + tearDown + A tear-down function. This is called after running the + tests in each file. The tearDown function will be passed a DocTest + object. The tearDown function can access the test globals as the + globs attribute of the test passed. + + globs + A dictionary containing initial global variables for the tests. + + optionflags + A set of doctest option flags expressed as an integer. + + parser + A DocTestParser (or subclass) that should be used to extract + tests from the files. + """ + suite = unittest.TestSuite() + + # We do this here so that _normalize_module is called at the right + # level. If it were called in DocFileTest, then this function + # would be the caller and we might guess the package incorrectly. + if kw.get('module_relative', True): + kw['package'] = _normalize_module(kw.get('package')) + + for path in paths: + suite.addTest(DocFileTest(path, **kw)) + + return suite + +###################################################################### +## 9. 
Debugging Support +###################################################################### + +def script_from_examples(s): + r"""Extract script from text with examples. + + Converts text with examples to a Python script. Example input is + converted to regular code. Example output and all other words + are converted to comments: + + >>> text = ''' + ... Here are examples of simple math. + ... + ... Python has super accurate integer addition + ... + ... >>> 2 + 2 + ... 5 + ... + ... And very friendly error messages: + ... + ... >>> 1/0 + ... To Infinity + ... And + ... Beyond + ... + ... You can use logic if you want: + ... + ... >>> if 0: + ... ... blah + ... ... blah + ... ... + ... + ... Ho hum + ... ''' + + >>> print script_from_examples(text) + # Here are examples of simple math. + # + # Python has super accurate integer addition + # + 2 + 2 + # Expected: + ## 5 + # + # And very friendly error messages: + # + 1/0 + # Expected: + ## To Infinity + ## And + ## Beyond + # + # You can use logic if you want: + # + if 0: + blah + blah + # + # Ho hum + <BLANKLINE> + """ + output = [] + for piece in DocTestParser().parse(s): + if isinstance(piece, Example): + # Add the example's source code (strip trailing NL) + output.append(piece.source[:-1]) + # Add the expected output: + want = piece.want + if want: + output.append('# Expected:') + output += ['## '+l for l in want.split('\n')[:-1]] + else: + # Add non-example text. + output += [_comment_line(l) + for l in piece.split('\n')[:-1]] + + # Trim junk on both ends. + while output and output[-1] == '#': + output.pop() + while output and output[0] == '#': + output.pop(0) + # Combine the output, and return it. + # Add a courtesy newline to prevent exec from choking (see bug #1172785) + return '\n'.join(output) + '\n' + +def testsource(module, name): + """Extract the test sources from a doctest docstring as a script. 
+ + Provide the module (or dotted name of the module) containing the + test to be debugged and the name (within the module) of the object + with the doc string with tests to be debugged. + """ + module = _normalize_module(module) + tests = DocTestFinder().find(module) + test = [t for t in tests if t.name == name] + if not test: + raise ValueError(name, "not found in tests") + test = test[0] + testsrc = script_from_examples(test.docstring) + return testsrc + +def debug_src(src, pm=False, globs=None): + """Debug a single doctest docstring, in argument `src`'""" + testsrc = script_from_examples(src) + debug_script(testsrc, pm, globs) + +def debug_script(src, pm=False, globs=None): + "Debug a test script. `src` is the script, as a string." + import pdb + + # Note that tempfile.NameTemporaryFile() cannot be used. As the + # docs say, a file so created cannot be opened by name a second time + # on modern Windows boxes, and execfile() needs to open it. + srcfilename = tempfile.mktemp(".py", "doctestdebug") + f = open(srcfilename, 'w') + f.write(src) + f.close() + + try: + if globs: + globs = globs.copy() + else: + globs = {} + + if pm: + try: + execfile(srcfilename, globs, globs) + except: + print sys.exc_info()[1] + pdb.post_mortem(sys.exc_info()[2]) + else: + # Note that %r is vital here. '%s' instead can, e.g., cause + # backslashes to get treated as metacharacters on Windows. + pdb.run("execfile(%r)" % srcfilename, globs, globs) + + finally: + os.remove(srcfilename) + +def debug(module, name, pm=False): + """Debug a single doctest docstring. + + Provide the module (or dotted name of the module) containing the + test to be debugged and the name (within the module) of the object + with the docstring with tests to be debugged. + """ + module = _normalize_module(module) + testsrc = testsource(module, name) + debug_script(testsrc, pm, module.__dict__) + +###################################################################### +## 10. 
Example Usage +###################################################################### +class _TestClass: + """ + A pointless class, for sanity-checking of docstring testing. + + Methods: + square() + get() + + >>> _TestClass(13).get() + _TestClass(-12).get() + 1 + >>> hex(_TestClass(13).square().get()) + '0xa9' + """ + + def __init__(self, val): + """val -> _TestClass object with associated value val. + + >>> t = _TestClass(123) + >>> print t.get() + 123 + """ + + self.val = val + + def square(self): + """square() -> square TestClass's associated value + + >>> _TestClass(13).square().get() + 169 + """ + + self.val = self.val ** 2 + return self + + def get(self): + """get() -> return TestClass's associated value. + + >>> x = _TestClass(-42) + >>> print x.get() + -42 + """ + + return self.val + +__test__ = {"_TestClass": _TestClass, + "string": r""" + Example of a string object, searched as-is. + >>> x = 1; y = 2 + >>> x + y, x * y + (3, 2) + """, + + "bool-int equivalence": r""" + In 2.2, boolean expressions displayed + 0 or 1. By default, we still accept + them. This can be disabled by passing + DONT_ACCEPT_TRUE_FOR_1 to the new + optionflags argument. + >>> 4 == 4 + 1 + >>> 4 == 4 + True + >>> 4 > 4 + 0 + >>> 4 > 4 + False + """, + + "blank lines": r""" + Blank lines can be marked with <BLANKLINE>: + >>> print 'foo\n\nbar\n' + foo + <BLANKLINE> + bar + <BLANKLINE> + """, + + "ellipsis": r""" + If the ellipsis flag is used, then '...' can be used to + elide substrings in the desired output: + >>> print range(1000) #doctest: +ELLIPSIS + [0, 1, 2, ..., 999] + """, + + "whitespace normalization": r""" + If the whitespace normalization flag is used, then + differences in whitespace are ignored. 
+ >>> print range(30) #doctest: +NORMALIZE_WHITESPACE + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29] + """, + } + +def _test(): + r = unittest.TextTestRunner() + r.run(DocTestSuite()) + +if __name__ == "__main__": + _test() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/functools_copy.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/functools_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..6d0e1c399c770629b2ee27d1a8e3fb7f491dadd7 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/functools_copy.py @@ -0,0 +1,59 @@ +"""functools.py - Tools for working with functions and callable objects +""" +# Python module wrapper for _functools C module +# to allow utilities written in Python to be added +# to the functools module. +# Written by Nick Coghlan <ncoghlan at gmail.com> +# Copyright (C) 2006 Python Software Foundation. +# See C source code for _functools credits/copyright + +def partial(func, *args, **keywords): + def newfunc(*fargs, **fkeywords): + newkeywords = keywords.copy() + newkeywords.update(fkeywords) + return func(*(args + fargs), **newkeywords) + newfunc.func = func + newfunc.args = args + newfunc.keywords = keywords + return newfunc + +# update_wrapper() and wraps() are tools to help write +# wrapper functions that can handle naive introspection + +WRAPPER_ASSIGNMENTS = ('__module__', '__name__', '__doc__') +WRAPPER_UPDATES = ('__dict__',) +def update_wrapper(wrapper, + wrapped, + assigned = WRAPPER_ASSIGNMENTS, + updated = WRAPPER_UPDATES): + """Update a wrapper function to look like the wrapped function + + wrapper is the function to be updated + wrapped is the original function + assigned is a tuple naming the attributes assigned directly + from the wrapped function to the wrapper function (defaults to + functools.WRAPPER_ASSIGNMENTS) + updated is a tuple naming the attributes of the wrapper that + are updated with the corresponding attribute 
from the wrapped + function (defaults to functools.WRAPPER_UPDATES) + """ + for attr in assigned: + setattr(wrapper, attr, getattr(wrapped, attr)) + for attr in updated: + getattr(wrapper, attr).update(getattr(wrapped, attr, {})) + # Return the wrapper so this can be used as a decorator via partial() + return wrapper + +def wraps(wrapped, + assigned = WRAPPER_ASSIGNMENTS, + updated = WRAPPER_UPDATES): + """Decorator factory to apply update_wrapper() to a wrapper function + + Returns a decorator that invokes update_wrapper() with the decorated + function as the wrapper argument and the arguments to wraps() as the + remaining arguments. Default arguments are as for update_wrapper(). + This is a convenience function to simplify applying partial() to + update_wrapper(). + """ + return partial(update_wrapper, wrapped=wrapped, + assigned=assigned, updated=updated) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/linecache_copy.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/linecache_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..f49695ac1cec88c87b428e9d18a7324ffc79ce9a --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/linecache_copy.py @@ -0,0 +1,132 @@ +"""Cache lines from files. + +This is intended to read lines from modules imported -- hence if a filename +is not found, it will look down the module search path for a file by +that name. +""" + +import sys +import os + +__all__ = ["getline", "clearcache", "checkcache"] + +def getline(filename, lineno, module_globals=None): + lines = getlines(filename, module_globals) + if 1 <= lineno <= len(lines): + return lines[lineno-1] + else: + return '' + + +# The cache + +cache = {} # The cache + + +def clearcache(): + """Clear the cache entirely.""" + + global cache + cache = {} + + +def getlines(filename, module_globals=None): + """Get the lines for a file from the cache. 
+ Update the cache if it doesn't contain an entry for this file already.""" + + if filename in cache: + return cache[filename][2] + else: + return updatecache(filename, module_globals) + + +def checkcache(filename=None): + """Discard cache entries that are out of date. + (This is not checked upon each call!)""" + + if filename is None: + filenames = cache.keys() + else: + if filename in cache: + filenames = [filename] + else: + return + + for filename in filenames: + size, mtime, lines, fullname = cache[filename] + if mtime is None: + continue # no-op for files loaded via a __loader__ + try: + stat = os.stat(fullname) + except os.error: + del cache[filename] + continue + if size != stat.st_size or mtime != stat.st_mtime: + del cache[filename] + + +def updatecache(filename, module_globals=None): + """Update a cache entry and return its list of lines. + If something's wrong, print a message, discard the cache entry, + and return an empty list.""" + + if filename in cache: + del cache[filename] + if not filename or filename[0] + filename[-1] == '<>': + return [] + + fullname = filename + try: + stat = os.stat(fullname) + except os.error, msg: + basename = os.path.split(filename)[1] + + # Try for a __loader__, if available + if module_globals and '__loader__' in module_globals: + name = module_globals.get('__name__') + loader = module_globals['__loader__'] + get_source = getattr(loader, 'get_source', None) + + if name and get_source: + if basename.startswith(name.split('.')[-1]+'.'): + try: + data = get_source(name) + except (ImportError, IOError): + pass + else: + cache[filename] = ( + len(data), None, + [line+'\n' for line in data.splitlines()], fullname + ) + return cache[filename][2] + + # Try looking through the module search path. + + for dirname in sys.path: + # When using imputil, sys.path may contain things other than + # strings; ignore them when it happens. 
+ try: + fullname = os.path.join(dirname, basename) + except (TypeError, AttributeError): + # Not sufficiently string-like to do anything useful with. + pass + else: + try: + stat = os.stat(fullname) + break + except os.error: + pass + else: + # No luck +## print '*** Cannot stat', filename, ':', msg + return [] + try: + fp = open(fullname, 'rU') + lines = fp.readlines() + fp.close() + except IOError, msg: +## print '*** Cannot open', fullname, ':', msg + return [] + size, mtime = stat.st_size, stat.st_mtime + cache[filename] = size, mtime, lines, fullname + return lines diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/testprogram.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/testprogram.py new file mode 100644 index 0000000000000000000000000000000000000000..247d9da8ad6ffd412d7a036e61b210f6698b717d --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/testprogram.py @@ -0,0 +1,469 @@ +"""Test runner. + +Local test HTTP server support and a few other bits and pieces. +""" + +USAGE = """ +%prog [OPTIONS...] [ARGUMENTS...] +%prog [discover [OPTIONS...]] [ARGUMENTS...] 
+ +Examples: + +python test.py # all tests +python test.py test_api # run test/test_api.py +python test.py functional_tests # run test/functional_tests.py +python test.py mechanize/_headersutil # run the doctests from this module +python test.py functional_tests.CookieJarTests # just this class +# just this test method +python test.py functional_tests.CookieJarTests.test_mozilla_cookiejar + +python test.py discover --pattern test_browser.doctest # doctest file +# run test/functional_tests.py +python test.py discover --pattern functional_tests.py + +python test.py --tag internet # include tests that use the internet +""" + +# TODO: resurrect cgitb support + +import errno +import logging +import os +import optparse +import socket +import subprocess +import sys +import time +import unittest +import urllib + +import mechanize +import mechanize._rfc3986 +import mechanize._testcase as _testcase + + +class ServerStartupError(Exception): + + pass + + +class ServerProcess: + + def __init__(self, filename, name=None): + if filename is None: + raise ValueError('filename arg must be a string') + if name is None: + name = filename + self.name = os.path.basename(name) + self.port = None + self.report_hook = lambda msg: None + self._filename = filename + self._args = None + self._process = None + + def _get_args(self): + """Return list of command line arguments. + + Override me. 
+ """ + return [] + + def _start(self): + self._args = [sys.executable, self._filename]+self._get_args() + self.report_hook("starting (%s)" % (self._args,)) + self._process = subprocess.Popen(self._args) + self.report_hook("waiting for startup") + self._wait_for_startup() + self.report_hook("running") + + def _wait_for_startup(self): + def connect(): + self._process.poll() + if self._process.returncode is not None: + message = ("server exited on startup with status %d: %r" % + (self._process.returncode, self._args)) + raise ServerStartupError(message) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.settimeout(1.0) + try: + sock.connect(('127.0.0.1', self.port)) + finally: + sock.close() + backoff(connect, (socket.error,)) + + def stop(self): + """Kill process (forcefully if necessary).""" + pid = self._process.pid + if os.name == 'nt': + kill_windows(pid, self.report_hook) + else: + kill_posix(pid, self.report_hook) + + +def backoff(func, errors, + initial_timeout=1., hard_timeout=60., factor=1.2): + starttime = time.time() + timeout = initial_timeout + while time.time() < starttime + hard_timeout - 0.01: + try: + func() + except errors: + time.sleep(timeout) + timeout *= factor + hard_limit = hard_timeout - (time.time() - starttime) + timeout = min(timeout, hard_limit) + else: + break + else: + raise + + +def kill_windows(handle, report_hook): + try: + import win32api + except ImportError: + import ctypes + ctypes.windll.kernel32.TerminateProcess(int(handle), -1) + else: + win32api.TerminateProcess(int(handle), -1) + + +def kill_posix(pid, report_hook): + import signal + os.kill(pid, signal.SIGTERM) + + timeout = 10. 
+ starttime = time.time() + report_hook("waiting for exit") + def do_nothing(*args): + pass + old_handler = signal.signal(signal.SIGCHLD, do_nothing) + try: + while time.time() < starttime + timeout - 0.01: + pid, sts = os.waitpid(pid, os.WNOHANG) + if pid != 0: + # exited, or error + break + newtimeout = timeout - (time.time() - starttime) - 1. + time.sleep(newtimeout) # wait for signal + else: + report_hook("forcefully killing") + try: + os.kill(pid, signal.SIGKILL) + except OSError, exc: + if exc.errno != errno.ECHILD: + raise + finally: + signal.signal(signal.SIGCHLD, old_handler) + + +class TwistedServerProcess(ServerProcess): + + def __init__(self, uri, name, log=False): + this_dir = os.path.dirname(__file__) + path = os.path.join(this_dir, "twisted-localserver.py") + ServerProcess.__init__(self, path, name) + self.uri = uri + authority = mechanize._rfc3986.urlsplit(uri)[1] + host, port = urllib.splitport(authority) + if port is None: + port = "80" + self.port = int(port) + # def report(msg): + # print "%s: %s" % (name, msg) + report = lambda msg: None + self.report_hook = report + self._log = log + self._start() + + def _get_args(self): + args = [str(self.port)] + if self._log: + args.append("--log") + return args + + +class TwistedFtpServerProcess(ServerProcess): + + def __init__(self, name, port=2121, log=False): + this_dir = os.path.dirname(__file__) + path = os.path.join(this_dir, "twisted-ftpserver.py") + ServerProcess.__init__(self, path, name) + self._temp_maker = mechanize._testcase.TempDirMaker() + self.root_path = self._temp_maker.make_temp_dir() + self.port = port + report = lambda msg: None + self.report_hook = report + self._log = log + self._start() + + def _get_args(self): + args = ["--port", str(self.port), self.root_path] + # if self._log: + # args.append("--log") + return args + + def stop(self): + ServerProcess.stop(self) + self._temp_maker.tear_down() + + +class ServerCM(object): + + def __init__(self, make_server): + self._server = None 
+ self._make_server = make_server + + def __enter__(self): + assert self._server is None + server = self._make_server() + self._server = server + return self._server + + def __exit__(self, exc_type, exc_value, exc_tb): + self._server.stop() + self._server = None + + +class NullServer(object): + + def __init__(self, uri, name=None): + self.uri = uri + + +class TrivialCM(object): + + def __init__(self, obj): + self._obj = obj + + def __enter__(self): + return self._obj + + def __exit__(self, exc_type, exc_value, exc_tb): + pass + + +def add_attributes_to_test_cases(suite, attributes): + for test in suite: + if isinstance(test, unittest.TestCase): + for name, value in attributes.iteritems(): + setattr(test, name, value) + else: + try: + add_attributes_to_test_cases(test, attributes) + except AttributeError: + pass + + +class FixtureCacheSuite(unittest.TestSuite): + + def __init__(self, fixture_factory, *args, **kwds): + unittest.TestSuite.__init__(self, *args, **kwds) + self._fixture_factory = fixture_factory + + def run(self, result): + try: + super(FixtureCacheSuite, self).run(result) + finally: + self._fixture_factory.tear_down() + + +def toplevel_test(suite, test_attributes): + suite = FixtureCacheSuite(test_attributes["fixture_factory"], suite) + add_attributes_to_test_cases(suite, test_attributes) + return suite + + +def make_http_server_cm(uri, log): + import warnings + # http://code.google.com/p/rdflib/issues/detail?id=101 + warnings.filterwarnings( + action="ignore", + message=(".*Module test was already imported from " + ".*test/__init__.pyc?, but .* is being added to " + "sys.path"), + category=UserWarning, + module="zope") + try: + import twisted.web2 + import zope.interface + except ImportError: + warnings.warn("Skipping functional tests: Failed to import " + "twisted.web2 and/or zope.interface") + def skip(): + raise unittest.SkipTest + cm = ServerCM(skip) + else: + cm = ServerCM(lambda: TwistedServerProcess( + uri, "local twisted server", log)) + return 
cm + + +def make_ftp_server_cm(log): + import warnings + try: + import twisted.protocols.ftp + import zope.interface + except ImportError: + warnings.warn("Skipping functional tests: Failed to import " + "twisted.protocols.ftp and/or zope.interface") + def skip(): + raise unittest.SkipTest + cm = ServerCM(skip) + else: + cm = ServerCM(lambda: TwistedFtpServerProcess( + "local twisted server", 2121, log)) + return cm + + +class TestProgram(unittest.TestProgram): + + def __init__(self, default_discovery_args=None, + *args, **kwds): + self._default_discovery_args = default_discovery_args + unittest.TestProgram.__init__(self, *args, **kwds) + + def _parse_options(self, argv): + parser = optparse.OptionParser(usage=USAGE) + # plain old unittest + parser.add_option("-v", "--verbose", action="store_true", + help="Verbose output") + parser.add_option("-q", "--quiet", action="store_true", + help="No output") + # from bundled Python 2.7 stdlib test discovery + parser.add_option("-s", "--start-directory", dest="start", default=".", + help='Directory to start discovery ("." default)') + parser.add_option("-p", "--pattern", dest="pattern", + default="test*.py", + help='Pattern to match tests ("test*.py" default)') + parser.add_option("-t", "--top-level-directory", dest="top", + default=None, + help=("Top level directory of project (defaults to " + "start directory)")) + # mechanize additions + # TODO: test_urllib2_localnet ignores --uri and --no-local-server + note = ("Note that there are two local servers in use, and this " + "option only affects the twisted server, not the server used " + "by test_urllib2_localnet (which originates from standard " + "library).") + parser.add_option( + "--uri", metavar="URI", + help="Run functional tests against base URI. " + note) + parser.add_option( + "--no-local-server", action="store_false", + dest="run_local_server", default=True, + help=("Don't run local test server. 
By default, this runs the " + "functional tests against mechanize sourceforge site, use " + "--uri to override that. " + note)) + # TODO: probably not everything respects this (test_urllib2_localnet?) + parser.add_option("--no-proxies", action="store_true") + parser.add_option("--log", action="store_true", + help=('Turn on logging for logger "mechanize" at ' + 'level logging.DEBUG')) + parser.add_option("--log-server", action="store_true", + help=("Turn on logging for twisted.web2 local HTTP " + " server")) + parser.add_option("--skip-doctests", action="store_true", + help="Don't discover doctests.") + allowed_tags = set(["internet"]) + parser.add_option("--tag", action="append", dest="tags", metavar="TAG", + help=("Discover tests tagged with TAG. Tagged " + "tests are not discovered by default. Pass " + "option more than once to specify more than " + "one tag. Current tags: %r" % allowed_tags)) + parser.add_option("--meld", action="store_true", + help=("On golden test failure, run meld to view & " + "edit differences")) + + options, remaining_args = parser.parse_args(argv) + if len(remaining_args) > 3: + self.usageExit() + + options.skip_tags = allowed_tags.copy() + if options.tags is not None: + unknown_tags = set(options.tags) - allowed_tags + if unknown_tags: + self.usageExit("Unknown tag(s) %r" % unknown_tags) + options.skip_tags -= set(options.tags) + options.allowed_tags = allowed_tags + options.do_discovery = ((len(remaining_args) == 0 and + self._default_discovery_args is not None) or + (len(remaining_args) >= 1 and + remaining_args[0].lower() == "discover")) + if options.do_discovery: + if len(remaining_args) == 0: + discovery_args = self._default_discovery_args + else: + discovery_args = remaining_args[1:] + for name, value in zip(("start", "pattern", "top"), + discovery_args): + setattr(options, name, value) + else: + options.test_names = remaining_args + if options.uri is None: + if options.run_local_server: + options.uri = "http://127.0.0.1:8000" + 
else: + options.uri = "http://wwwsearch.sourceforge.net/" + return options + + def _do_discovery(self, options): + start_dir = options.start + pattern = options.pattern + top_level_dir = options.top + loader = unittest.TestLoader() + self.test = loader.discover(start_dir, pattern, top_level_dir, + skip_tags=options.skip_tags, + allowed_tags=options.allowed_tags, + skip_doctests=options.skip_doctests) + + def _vanilla_unittest_main(self, options): + if len(options.test_names) == 0 and self.defaultTest is None: + # createTests will load tests from self.module + self.testNames = None + elif len(options.test_names) > 0: + self.testNames = options.test_names + else: + self.testNames = (self.defaultTest,) + self.createTests() + + def parseArgs(self, argv): + options = self._parse_options(argv[1:]) + if options.verbose: + self.verbosity = 2 + if options.quiet: + self.verbosity = 0 + if options.do_discovery: + self._do_discovery(options) + else: + self._vanilla_unittest_main(options) + + if options.log: + level = logging.DEBUG + # level = logging.INFO + # level = logging.WARNING + # level = logging.NOTSET + logger = logging.getLogger("mechanize") + logger.setLevel(level) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(level) + logger.addHandler(handler) + + fixture_factory = _testcase.FixtureFactory() + if options.run_local_server: + cm = make_http_server_cm(options.uri, options.log_server) + else: + cm = TrivialCM(NullServer(options.uri)) + fixture_factory.register_context_manager("server", cm) + fixture_factory.register_context_manager( + "ftp_server", make_ftp_server_cm(options.log_server)) + test_attributes = dict(uri=options.uri, no_proxies=options.no_proxies, + fixture_factory=fixture_factory) + if options.meld: + import mechanize._testcase + mechanize._testcase.GoldenTestCase.run_meld = True + self.test = toplevel_test(self.test, test_attributes) + + +main = TestProgram diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-ftpserver.py 
b/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-ftpserver.py new file mode 100644 index 0000000000000000000000000000000000000000..8148bea555fe1f591cf11da10c6b6fd606b21ae5 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-ftpserver.py @@ -0,0 +1,86 @@ +import optparse +import sys + +import twisted.cred.checkers +import twisted.cred.credentials +import twisted.cred.portal +import twisted.internet +import twisted.protocols.ftp +from twisted.python import filepath, log + +from zope.interface import implements + + +def make_ftp_shell(avatar_id, root_path): + if avatar_id is twisted.cred.checkers.ANONYMOUS: + return twisted.protocols.ftp.FTPAnonymousShell(root_path) + else: + return twisted.protocols.ftp.FTPShell(root_path) + + +class FTPRealm(object): + + implements(twisted.cred.portal.IRealm) + + def __init__(self, root_path): + self._root_path = filepath.FilePath(root_path) + + def requestAvatar(self, avatarId, mind, *interfaces): + for iface in interfaces: + if iface is twisted.protocols.ftp.IFTPShell: + avatar = make_ftp_shell(avatarId, self._root_path) + return (twisted.protocols.ftp.IFTPShell, + avatar, + getattr(avatar, "logout", lambda: None)) + raise NotImplementedError() + + +class FtpServerFactory(object): + """ + port = FtpServerFactory("/tmp", 2121).makeListner() + self.addCleanup(port.stopListening) + """ + + def __init__(self, root_path, port): + factory = twisted.protocols.ftp.FTPFactory() + realm = FTPRealm(root_path) + portal = twisted.cred.portal.Portal(realm) + portal.registerChecker(twisted.cred.checkers.AllowAnonymousAccess(), + twisted.cred.credentials.IAnonymous) + checker = twisted.cred.checkers.\ + InMemoryUsernamePasswordDatabaseDontUse() + checker.addUser("john", "john") + portal.registerChecker(checker) + factory.tld = root_path + factory.userAnonymous = "anon" + factory.portal = portal + factory.protocol = twisted.protocols.ftp.FTP + self._factory = factory + self._port = port + + def makeListener(self): + # XXX use 0 
instead of self._port? + return twisted.internet.reactor.listenTCP( + self._port, self._factory, interface="127.0.0.1") + + +def parse_options(args): + parser = optparse.OptionParser() + parser.add_option("--log", action="store_true") + parser.add_option("--port", type="int", default=2121) + options, remaining_args = parser.parse_args(args) + options.root_path = remaining_args[0] + return options + + +def main(argv): + options = parse_options(argv[1:]) + if options.log: + log.startLogging(sys.stdout) + factory = FtpServerFactory(options.root_path, options.port) + factory.makeListener() + twisted.internet.reactor.run() + + +if __name__ == "__main__": + main(sys.argv) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-localserver.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-localserver.py new file mode 100644 index 0000000000000000000000000000000000000000..6189ef71d621faadba21148fac14e61a70cddc6e --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/twisted-localserver.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python +""" +%prog port + +e.g. %prog 8000 + +Runs a local server to point the mechanize functional tests at. Example: + +python test-tools/twisted-localserver.py 8042 +python functional_tests.py --uri=http://localhost:8042/ + +You need twisted.web2 to run it. 
On ubuntu feisty, you can install it like so: + +sudo apt-get install python-twisted-web2 +""" + +import optparse +import os +import re +import sys + +from twisted.cred import portal, checkers +from twisted.internet import reactor +from twisted.python import log +from twisted.python.hashlib import md5 +from twisted.web2 import server, http, resource, channel, \ + http_headers, responsecode, twcgi +from twisted.web2.auth import basic, digest, wrapper +from twisted.web2.auth.interfaces import IHTTPUser + +from zope.interface import implements + + +def html(title=None, extra_content=""): + html = """\ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> + <title>mechanize</title> + </head> + <body><a href="http://sourceforge.net/"> +%s +</body> +</html> +""" % extra_content + if title is not None: + html = re.sub("<title>(.*)</title>", "<title>%s</title>" % title, html) + return html + +MECHANIZE_HTML = html() +ROOT_HTML = html("mechanize") +RELOAD_TEST_HTML = """\ +<html> +<head><title>Title</title></head> +<body> + +<a href="/mechanize">near the start</a> + +<p>Now some data to prevent HEAD parsing from reading the link near +the end. + +<pre> +%s</pre> + +<a href="/mechanize">near the end</a> + +</body> + +</html>""" % (("0123456789ABCDEF"*4+"\n")*61) +REFERER_TEST_HTML = """\ +<html> +<head> +<title>mechanize Referer (sic) test page</title> +</head> +<body> +<p>This page exists to test the Referer functionality of <a href="/mechanize">mechanize</a>. +<p><a href="/cgi-bin/cookietest.cgi">Here</a> is a link to a page that displays the Referer header. +</body> +</html>""" + + +BASIC_AUTH_PAGE = """ +<html> +<head> +<title>Basic Auth Protected Area</title> +</head> +<body> +<p>Hello, basic auth world. 
+<p> +</body> +</html> +""" + + +DIGEST_AUTH_PAGE = """ +<html> +<head> +<title>Digest Auth Protected Area</title> +</head> +<body> +<p>Hello, digest auth world. +<p> +</body> +</html> +""" + + +class TestHTTPUser(object): + """ + Test avatar implementation for http auth with cred + """ + implements(IHTTPUser) + + username = None + + def __init__(self, username): + """ + @param username: The str username sent as part of the HTTP auth + response. + """ + self.username = username + + +class TestAuthRealm(object): + """ + Test realm that supports the IHTTPUser interface + """ + + implements(portal.IRealm) + + def requestAvatar(self, avatarId, mind, *interfaces): + if IHTTPUser in interfaces: + if avatarId == checkers.ANONYMOUS: + return IHTTPUser, TestHTTPUser('anonymous') + + return IHTTPUser, TestHTTPUser(avatarId) + + raise NotImplementedError("Only IHTTPUser interface is supported") + + +class Page(resource.Resource): + + addSlash = True + content_type = http_headers.MimeType("text", "html") + + def render(self, ctx): + return http.Response( + responsecode.OK, + {"content-type": self.content_type}, + self.text) + + +class Dir(resource.Resource): + + addSlash = True + + def locateChild(self, request, segments): + #import pdb; pdb.set_trace() + return resource.Resource.locateChild(self, request, segments) + + def render(self, ctx): + print "render" + return http.Response(responsecode.FORBIDDEN) + + +def make_dir(parent, name): + dir_ = Dir() + parent.putChild(name, dir_) + return dir_ + + +def _make_page(parent, name, text, content_type, wrapper, + leaf=False): + page = Page() + page.text = text + base_type, specific_type = content_type.split("/") + page.content_type = http_headers.MimeType(base_type, specific_type) + page.addSlash = not leaf + parent.putChild(name, wrapper(page)) + return page + +def make_page(parent, name, text, + content_type="text/html", wrapper=lambda page: page): + return _make_page(parent, name, text, content_type, wrapper, leaf=False) + +def 
make_leaf_page(parent, name, text, + content_type="text/html", wrapper=lambda page: page): + return _make_page(parent, name, text, content_type, wrapper, leaf=True) + +def make_redirect(parent, name, location_relative_ref): + redirect = resource.RedirectResource(path=location_relative_ref) + setattr(parent, "child_"+name, redirect) + return redirect + +def make_cgi_bin(parent, name, dir_name): + cgi_bin = twcgi.CGIDirectory(dir_name) + setattr(parent, "child_"+name, cgi_bin) + return cgi_bin + +def make_cgi_script(parent, name, path): + cgi_script = twcgi.CGIScript(path) + setattr(parent, "child_"+name, cgi_script) + return cgi_script + +def require_basic_auth(resource): + p = portal.Portal(TestAuthRealm()) + c = checkers.InMemoryUsernamePasswordDatabaseDontUse() + c.addUser("john", "john") + p.registerChecker(c) + cred_factory = basic.BasicCredentialFactory("Basic Auth protected area") + return wrapper.HTTPAuthResource(resource, + [cred_factory], + p, + interfaces=(IHTTPUser,)) + + +class DigestCredFactory(digest.DigestCredentialFactory): + + def generateOpaque(self, nonce, clientip): + # http://twistedmatrix.com/trac/ticket/3693 + key = "%s,%s,%s" % (nonce, clientip, str(int(self._getTime()))) + digest = md5(key + self.privateKey).hexdigest() + ekey = key.encode('base64') + return "%s-%s" % (digest, ekey.replace('\n', '')) + + +def require_digest_auth(resource): + p = portal.Portal(TestAuthRealm()) + c = checkers.InMemoryUsernamePasswordDatabaseDontUse() + c.addUser("digestuser", "digestuser") + p.registerChecker(c) + cred_factory = DigestCredFactory("MD5", "Digest Auth protected area") + return wrapper.HTTPAuthResource(resource, + [cred_factory], + p, + interfaces=(IHTTPUser,)) + + +def parse_options(args): + parser = optparse.OptionParser() + parser.add_option("--log", action="store_true") + options, remaining_args = parser.parse_args(args) + options.port = int(remaining_args[0]) + return options + + +def main(argv): + options = parse_options(argv[1:]) + if 
options.log: + log.startLogging(sys.stdout) + + # This is supposed to match the SF site so it's easy to run a functional + # test over the internet and against Apache. + # TODO: Remove bizarre structure and strings expected by functional tests. + root = Page() + root.text = ROOT_HTML + mechanize = make_page(root, "mechanize", MECHANIZE_HTML) + make_leaf_page(root, "robots.txt", + "User-Agent: *\nDisallow: /norobots", + "text/plain") + make_leaf_page(root, "robots", "Hello, robots.", "text/plain") + make_leaf_page(root, "norobots", "Hello, non-robots.", "text/plain") + test_fixtures = make_page(root, "test_fixtures", + # satisfy stupid assertions in functional tests + html("Python bits", + extra_content="GeneralFAQ.html")) + make_leaf_page(test_fixtures, "cctest2.txt", + "Hello ClientCookie functional test suite.", + "text/plain") + make_leaf_page(test_fixtures, "referertest.html", REFERER_TEST_HTML) + make_leaf_page(test_fixtures, "mechanize_reload_test.html", + RELOAD_TEST_HTML) + make_redirect(root, "redirected", "/doesnotexist") + cgi_bin = make_dir(root, "cgi-bin") + project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + make_cgi_script(cgi_bin, "cookietest.cgi", + os.path.join(project_dir, "test-tools", "cookietest.cgi")) + example_html = open(os.path.join("examples", "forms", "example.html")).read() + make_leaf_page(mechanize, "example.html", example_html) + make_cgi_script(cgi_bin, "echo.cgi", + os.path.join(project_dir, "examples", "forms", "echo.cgi")) + make_page(root, "basic_auth", BASIC_AUTH_PAGE, wrapper=require_basic_auth) + make_page(root, "digest_auth", DIGEST_AUTH_PAGE, + wrapper=require_digest_auth) + + site = server.Site(root) + reactor.listenTCP(options.port, channel.HTTPFactory(site)) + reactor.run() + + +if __name__ == "__main__": + main(sys.argv) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__init__.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..a0318f3942fe572529e3f02cbca089d6221be626 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__init__.py @@ -0,0 +1,63 @@ +""" +Python unit testing framework, based on Erich Gamma's JUnit and Kent Beck's +Smalltalk testing framework. + +This module contains the core framework classes that form the basis of +specific test cases and suites (TestCase, TestSuite etc.), and also a +text-based utility class for running the tests and reporting the results + (TextTestRunner). + +Simple usage: + + import unittest + + class IntegerArithmenticTestCase(unittest.TestCase): + def testAdd(self): ## test method names begin 'test*' + self.assertEqual((1 + 2), 3) + self.assertEqual(0 + 1, 1) + def testMultiply(self): + self.assertEqual((0 * 10), 0) + self.assertEqual((5 * 8), 40) + + if __name__ == '__main__': + unittest.main() + +Further information is available in the bundled documentation, and from + + http://docs.python.org/library/unittest.html + +Copyright (c) 1999-2003 Steve Purcell +Copyright (c) 2003-2010 Python Software Foundation +This module is free software, and you may redistribute it and/or modify +it under the same terms as Python itself, so long as this copyright message +and disclaimer are retained in their original form. + +IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF +THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. + +THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, +AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, +SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+""" + +__all__ = ['TestResult', 'TestCase', 'TestSuite', + 'TextTestRunner', 'TestLoader', 'FunctionTestCase', 'main', + 'defaultTestLoader', 'SkipTest', 'skip', 'skipIf', 'skipUnless', + 'expectedFailure'] + +# Expose obsolete functions for backwards compatibility +__all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases']) + + +from unittest.result import TestResult +from unittest.case import (TestCase, FunctionTestCase, SkipTest, skip, skipIf, + skipUnless, expectedFailure) +from unittest.suite import TestSuite +from unittest.loader import (TestLoader, defaultTestLoader, makeSuite, + getTestCaseNames, findTestCases) +from unittest.main import TestProgram, main +from unittest.runner import TextTestRunner diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__main__.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..14b7565e50f9c9f67f7e0a22a55c385176e00086 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/__main__.py @@ -0,0 +1,8 @@ +"""Main entry point""" + +import sys +if sys.argv[0].endswith("__main__.py"): + sys.argv[0] = "unittest" + +from unittest.main import main +main(module=None) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/case.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/case.py new file mode 100644 index 0000000000000000000000000000000000000000..eb7649024eccf897accf1a7b0efaa659b1ced9bc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/case.py @@ -0,0 +1,921 @@ +"""Test case implementation""" + +import sys +import functools_copy +import difflib +import pprint +import re +import warnings + +from unittest import result, util + + +# Python 2.4 compatibility +def with_(mgr, suite_func): + exit = mgr.__exit__ # Not calling it yet + value = mgr.__enter__() + exc = True + try: + try: + suite_func() + except: + exc = False + if not exit(*sys.exc_info()): + raise + finally: + if exc: + 
exit(None, None, None) + + +class SkipTest(Exception): + """ + Raise this exception in a test to skip it. + + Usually you can use TestResult.skip() or one of the skipping decorators + instead of raising this directly. + """ + pass + +class _ExpectedFailure(Exception): + """ + Raise this when a test is expected to fail. + + This is an implementation detail. + """ + + def __init__(self, exc_info): + Exception.__init__(self) + self.exc_info = exc_info + +class _UnexpectedSuccess(Exception): + """ + The test was supposed to fail, but it didn't! + """ + pass + +def _id(obj): + return obj + +def skip(reason): + """ + Unconditionally skip a test. + """ + def decorator(test_item): + if isinstance(test_item, type) and issubclass(test_item, TestCase): + test_item.__unittest_skip__ = True + test_item.__unittest_skip_why__ = reason + return test_item + @functools_copy.wraps(test_item) + def skip_wrapper(*args, **kwargs): + raise SkipTest(reason) + return skip_wrapper + return decorator + +def skipIf(condition, reason): + """ + Skip a test if the condition is true. + """ + if condition: + return skip(reason) + return _id + +def skipUnless(condition, reason): + """ + Skip a test unless the condition is true. 
+ """ + if not condition: + return skip(reason) + return _id + + +def expectedFailure(func): + @functools_copy.wraps(func) + def wrapper(*args, **kwargs): + try: + func(*args, **kwargs) + except Exception: + raise _ExpectedFailure(sys.exc_info()) + raise _UnexpectedSuccess + return wrapper + + +class _AssertRaisesContext(object): + """A context manager used to implement TestCase.assertRaises* methods.""" + + def __init__(self, expected, test_case, expected_regexp=None): + self.expected = expected + self.failureException = test_case.failureException + self.expected_regex = expected_regexp + + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_value, tb): + if exc_type is None: + try: + exc_name = self.expected.__name__ + except AttributeError: + exc_name = str(self.expected) + raise self.failureException("%s not raised" % (exc_name,)) + if not issubclass(exc_type, self.expected): + # let unexpected exceptions pass through + return False + self.exc_value = exc_value #store for later retrieval + if self.expected_regex is None: + return True + + expected_regexp = self.expected_regex + if isinstance(expected_regexp, basestring): + expected_regexp = re.compile(expected_regexp) + if not expected_regexp.search(str(exc_value)): + raise self.failureException('"%s" does not match "%s"' % + (expected_regexp.pattern, str(exc_value))) + return True + + +class TestCase(object): + """A class whose instances are single test cases. + + By default, the test code itself should be placed in a method named + 'runTest'. + + If the fixture may be used for many test cases, create as + many test methods as are needed. When instantiating such a TestCase + subclass, specify in the constructor arguments the name of the test method + that the instance is to execute. + + Test authors should subclass TestCase for their own tests. Construction + and deconstruction of the test's environment ('fixture') can be + implemented by overriding the 'setUp' and 'tearDown' methods respectively. 
+ + If it is necessary to override the __init__ method, the base class + __init__ method must always be called. It is important that subclasses + should not change the signature of their __init__ method, since instances + of the classes are instantiated automatically by parts of the framework + in order to be run. + """ + + # This attribute determines which exception will be raised when + # the instance's assertion methods fail; test methods raising this + # exception will be deemed to have 'failed' rather than 'errored' + + failureException = AssertionError + + # This attribute determines whether long messages (including repr of + # objects used in assert methods) will be printed on failure in *addition* + # to any explicit message passed. + + longMessage = False + + + def __init__(self, methodName='runTest'): + """Create an instance of the class that will use the named test + method when executed. Raises a ValueError if the instance does + not have a method with the specified name. + """ + self._testMethodName = methodName + self._resultForDoCleanups = None + try: + testMethod = getattr(self, methodName) + except AttributeError: + raise ValueError("no such test method in %s: %s" % \ + (self.__class__, methodName)) + self._testMethodDoc = testMethod.__doc__ + self._cleanups = [] + + # Map types to custom assertEqual functions that will compare + # instances of said type in more detail to generate a more useful + # error message. + self._type_equality_funcs = {} + self.addTypeEqualityFunc(dict, self.assertDictEqual) + self.addTypeEqualityFunc(list, self.assertListEqual) + self.addTypeEqualityFunc(tuple, self.assertTupleEqual) + self.addTypeEqualityFunc(set, self.assertSetEqual) + self.addTypeEqualityFunc(frozenset, self.assertSetEqual) + + def addTypeEqualityFunc(self, typeobj, function): + """Add a type specific assertEqual style function to compare a type. 
+ + This method is for use by TestCase subclasses that need to register + their own type equality functions to provide nicer error messages. + + Args: + typeobj: The data type to call this function on when both values + are of the same type in assertEqual(). + function: The callable taking two arguments and an optional + msg= argument that raises self.failureException with a + useful error message when the two arguments are not equal. + """ + self._type_equality_funcs[typeobj] = function + + def addCleanup(self, function, *args, **kwargs): + """Add a function, with arguments, to be called when the test is + completed. Functions added are called on a LIFO basis and are + called after tearDown on test failure or success. + + Cleanup items are called even if setUp fails (unlike tearDown).""" + self._cleanups.append((function, args, kwargs)) + + def setUp(self): + "Hook method for setting up the test fixture before exercising it." + pass + + def tearDown(self): + "Hook method for deconstructing the test fixture after testing it." + pass + + def countTestCases(self): + return 1 + + def defaultTestResult(self): + return result.TestResult() + + def shortDescription(self): + """Returns both the test method name and first line of its docstring. + + If no docstring is given, only returns the method name. + + This method overrides unittest.TestCase.shortDescription(), which + only returns the first line of the docstring, obscuring the name + of the test upon failure. 
+ """ + desc = str(self) + doc_first_line = None + + if self._testMethodDoc: + doc_first_line = self._testMethodDoc.split("\n")[0].strip() + if doc_first_line: + desc = '\n'.join((desc, doc_first_line)) + return desc + + def id(self): + return "%s.%s" % (util.strclass(self.__class__), self._testMethodName) + + def __eq__(self, other): + if type(self) is not type(other): + return NotImplemented + + return self._testMethodName == other._testMethodName + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash((type(self), self._testMethodName)) + + def __str__(self): + return "%s (%s)" % (self._testMethodName, util.strclass(self.__class__)) + + def __repr__(self): + return "<%s testMethod=%s>" % \ + (util.strclass(self.__class__), self._testMethodName) + + def run(self, result=None): + orig_result = result + if result is None: + result = self.defaultTestResult() + startTestRun = getattr(result, 'startTestRun', None) + if startTestRun is not None: + startTestRun() + + self._resultForDoCleanups = result + result.startTest(self) + if getattr(self.__class__, "__unittest_skip__", False): + # If the whole class was skipped. 
+ try: + result.addSkip(self, self.__class__.__unittest_skip_why__) + finally: + result.stopTest(self) + return + testMethod = getattr(self, self._testMethodName) + try: + success = False + try: + self.setUp() + except SkipTest, e: + result.addSkip(self, str(e)) + except Exception: + result.addError(self, sys.exc_info()) + else: + try: + testMethod() + except self.failureException: + result.addFailure(self, sys.exc_info()) + except _ExpectedFailure, e: + result.addExpectedFailure(self, e.exc_info) + except _UnexpectedSuccess: + result.addUnexpectedSuccess(self) + except SkipTest, e: + result.addSkip(self, str(e)) + except Exception: + result.addError(self, sys.exc_info()) + else: + success = True + + try: + self.tearDown() + except Exception: + result.addError(self, sys.exc_info()) + success = False + + cleanUpSuccess = self.doCleanups() + success = success and cleanUpSuccess + if success: + result.addSuccess(self) + finally: + result.stopTest(self) + if orig_result is None: + stopTestRun = getattr(result, 'stopTestRun', None) + if stopTestRun is not None: + stopTestRun() + + def doCleanups(self): + """Execute all cleanup functions. Normally called for you after + tearDown.""" + result = self._resultForDoCleanups + ok = True + while self._cleanups: + function, args, kwargs = self._cleanups.pop(-1) + try: + function(*args, **kwargs) + except Exception: + ok = False + result.addError(self, sys.exc_info()) + return ok + + def __call__(self, *args, **kwds): + return self.run(*args, **kwds) + + def debug(self): + """Run the test without collecting errors in a TestResult""" + self.setUp() + getattr(self, self._testMethodName)() + self.tearDown() + + def skipTest(self, reason): + """Skip this test.""" + raise SkipTest(reason) + + def fail(self, msg=None): + """Fail immediately, with the given message.""" + raise self.failureException(msg) + + def assertFalse(self, expr, msg=None): + "Fail the test if the expression is true." 
+ if expr: + msg = self._formatMessage(msg, "%r is not False" % expr) + raise self.failureException(msg) + + def assertTrue(self, expr, msg=None): + """Fail the test unless the expression is true.""" + if not expr: + msg = self._formatMessage(msg, "%r is not True" % expr) + raise self.failureException(msg) + + def _formatMessage(self, msg, standardMsg): + """Honour the longMessage attribute when generating failure messages. + If longMessage is False this means: + * Use only an explicit message if it is provided + * Otherwise use the standard message for the assert + + If longMessage is True: + * Use the standard message + * If an explicit message is provided, plus ' : ' and the explicit message + """ + if not self.longMessage: + return msg or standardMsg + if msg is None: + return standardMsg + return standardMsg + ' : ' + msg + + + def assertRaises(self, excClass, callableObj=None, *args, **kwargs): + """Fail unless an exception of class excClass is thrown + by callableObj when invoked with arguments args and keyword + arguments kwargs. If a different type of exception is + thrown, it will not be caught, and the test case will be + deemed to have suffered an error, exactly as for an + unexpected exception. + + If called with callableObj omitted or None, will return a + context object used like this:: + + with self.assertRaises(some_error_class): + do_something() + """ + context = _AssertRaisesContext(excClass, self) + if callableObj is None: + return context + with_(context, lambda: callableObj(*args, **kwargs)) + + def _getAssertEqualityFunc(self, first, second): + """Get a detailed comparison function for the types of the two args. + + Returns: A callable accepting (first, second, msg=None) that will + raise a failure exception if first != second with a useful human + readable error message for those types. + """ + # + # NOTE(gregory.p.smith): I considered isinstance(first, type(second)) + # and vice versa. 
I opted for the conservative approach in case + # subclasses are not intended to be compared in detail to their super + # class instances using a type equality func. This means testing + # subtypes won't automagically use the detailed comparison. Callers + # should use their type specific assertSpamEqual method to compare + # subclasses if the detailed comparison is desired and appropriate. + # See the discussion in http://bugs.python.org/issue2578. + # + if type(first) is type(second): + asserter = self._type_equality_funcs.get(type(first)) + if asserter is not None: + return asserter + + return self._baseAssertEqual + + def _baseAssertEqual(self, first, second, msg=None): + """The default assertEqual implementation, not type specific.""" + if not first == second: + standardMsg = '%r != %r' % (first, second) + msg = self._formatMessage(msg, standardMsg) + raise self.failureException(msg) + + def assertEqual(self, first, second, msg=None): + """Fail if the two objects are unequal as determined by the '==' + operator. + """ + assertion_func = self._getAssertEqualityFunc(first, second) + assertion_func(first, second, msg=msg) + + def assertNotEqual(self, first, second, msg=None): + """Fail if the two objects are equal as determined by the '==' + operator. + """ + if not first != second: + msg = self._formatMessage(msg, '%r == %r' % (first, second)) + raise self.failureException(msg) + + def assertAlmostEqual(self, first, second, places=7, msg=None): + """Fail if the two objects are unequal as determined by their + difference rounded to the given number of decimal places + (default 7) and comparing to zero. + + Note that decimal places (from zero) are usually not the same + as significant digits (measured from the most signficant digit). + + If the two objects compare equal then they will automatically + compare almost equal. 
+ """ + if first == second: + # shortcut for ite + return + if round(abs(second-first), places) != 0: + standardMsg = '%r != %r within %r places' % (first, second, places) + msg = self._formatMessage(msg, standardMsg) + raise self.failureException(msg) + + def assertNotAlmostEqual(self, first, second, places=7, msg=None): + """Fail if the two objects are equal as determined by their + difference rounded to the given number of decimal places + (default 7) and comparing to zero. + + Note that decimal places (from zero) are usually not the same + as significant digits (measured from the most signficant digit). + + Objects that are equal automatically fail. + """ + if (first == second) or round(abs(second-first), places) == 0: + standardMsg = '%r == %r within %r places' % (first, second, places) + msg = self._formatMessage(msg, standardMsg) + raise self.failureException(msg) + + # Synonyms for assertion methods + + # The plurals are undocumented. Keep them that way to discourage use. + # Do not add more. Do not remove. + # Going through a deprecation cycle on these would annoy many people. + assertEquals = assertEqual + assertNotEquals = assertNotEqual + assertAlmostEquals = assertAlmostEqual + assertNotAlmostEquals = assertNotAlmostEqual + assert_ = assertTrue + + # These fail* assertion method names are pending deprecation and will + # be a DeprecationWarning in 3.2; http://bugs.python.org/issue2578 + def _deprecate(original_func): + def deprecated_func(*args, **kwargs): + warnings.warn( + 'Please use %s instead.' 
% (original_func.__name__,), + PendingDeprecationWarning, 2) + return original_func(*args, **kwargs) + return deprecated_func + + failUnlessEqual = _deprecate(assertEqual) + failIfEqual = _deprecate(assertNotEqual) + failUnlessAlmostEqual = _deprecate(assertAlmostEqual) + failIfAlmostEqual = _deprecate(assertNotAlmostEqual) + failUnless = _deprecate(assertTrue) + failUnlessRaises = _deprecate(assertRaises) + failIf = _deprecate(assertFalse) + + def assertSequenceEqual(self, seq1, seq2, msg=None, seq_type=None): + """An equality assertion for ordered sequences (like lists and tuples). + + For the purposes of this function, a valid orderd sequence type is one + which can be indexed, has a length, and has an equality operator. + + Args: + seq1: The first sequence to compare. + seq2: The second sequence to compare. + seq_type: The expected datatype of the sequences, or None if no + datatype should be enforced. + msg: Optional message to use on failure instead of a list of + differences. + """ + if seq_type != None: + seq_type_name = seq_type.__name__ + if not isinstance(seq1, seq_type): + raise self.failureException('First sequence is not a %s: %r' + % (seq_type_name, seq1)) + if not isinstance(seq2, seq_type): + raise self.failureException('Second sequence is not a %s: %r' + % (seq_type_name, seq2)) + else: + seq_type_name = "sequence" + + differing = None + try: + len1 = len(seq1) + except (TypeError, NotImplementedError): + differing = 'First %s has no length. Non-sequence?' % ( + seq_type_name) + + if differing is None: + try: + len2 = len(seq2) + except (TypeError, NotImplementedError): + differing = 'Second %s has no length. Non-sequence?' % ( + seq_type_name) + + if differing is None: + if seq1 == seq2: + return + + seq1_repr = repr(seq1) + seq2_repr = repr(seq2) + if len(seq1_repr) > 30: + seq1_repr = seq1_repr[:30] + '...' + if len(seq2_repr) > 30: + seq2_repr = seq2_repr[:30] + '...' 
+ elements = (seq_type_name.capitalize(), seq1_repr, seq2_repr) + differing = '%ss differ: %s != %s\n' % elements + + for i in xrange(min(len1, len2)): + try: + item1 = seq1[i] + except (TypeError, IndexError, NotImplementedError): + differing += ('\nUnable to index element %d of first %s\n' % + (i, seq_type_name)) + break + + try: + item2 = seq2[i] + except (TypeError, IndexError, NotImplementedError): + differing += ('\nUnable to index element %d of second %s\n' % + (i, seq_type_name)) + break + + if item1 != item2: + differing += ('\nFirst differing element %d:\n%s\n%s\n' % + (i, item1, item2)) + break + else: + if (len1 == len2 and seq_type is None and + type(seq1) != type(seq2)): + # The sequences are the same, but have differing types. + return + + if len1 > len2: + differing += ('\nFirst %s contains %d additional ' + 'elements.\n' % (seq_type_name, len1 - len2)) + try: + differing += ('First extra element %d:\n%s\n' % + (len2, seq1[len2])) + except (TypeError, IndexError, NotImplementedError): + differing += ('Unable to index element %d ' + 'of first %s\n' % (len2, seq_type_name)) + elif len1 < len2: + differing += ('\nSecond %s contains %d additional ' + 'elements.\n' % (seq_type_name, len2 - len1)) + try: + differing += ('First extra element %d:\n%s\n' % + (len1, seq2[len1])) + except (TypeError, IndexError, NotImplementedError): + differing += ('Unable to index element %d ' + 'of second %s\n' % (len1, seq_type_name)) + standardMsg = differing + '\n' + '\n'.join( + difflib.ndiff(pprint.pformat(seq1).splitlines(), + pprint.pformat(seq2).splitlines())) + msg = self._formatMessage(msg, standardMsg) + self.fail(msg) + + def assertListEqual(self, list1, list2, msg=None): + """A list-specific equality assertion. + + Args: + list1: The first list to compare. + list2: The second list to compare. + msg: Optional message to use on failure instead of a list of + differences. 
+ + """ + self.assertSequenceEqual(list1, list2, msg, seq_type=list) + + def assertTupleEqual(self, tuple1, tuple2, msg=None): + """A tuple-specific equality assertion. + + Args: + tuple1: The first tuple to compare. + tuple2: The second tuple to compare. + msg: Optional message to use on failure instead of a list of + differences. + """ + self.assertSequenceEqual(tuple1, tuple2, msg, seq_type=tuple) + + def assertSetEqual(self, set1, set2, msg=None): + """A set-specific equality assertion. + + Args: + set1: The first set to compare. + set2: The second set to compare. + msg: Optional message to use on failure instead of a list of + differences. + + For more general containership equality, assertSameElements will work + with things other than sets. This uses ducktyping to support + different types of sets, and is optimized for sets specifically + (parameters must support a difference method). + """ + try: + difference1 = set1.difference(set2) + except TypeError, e: + self.fail('invalid type when attempting set difference: %s' % e) + except AttributeError, e: + self.fail('first argument does not support set difference: %s' % e) + + try: + difference2 = set2.difference(set1) + except TypeError, e: + self.fail('invalid type when attempting set difference: %s' % e) + except AttributeError, e: + self.fail('second argument does not support set difference: %s' % e) + + if not (difference1 or difference2): + return + + lines = [] + if difference1: + lines.append('Items in the first set but not the second:') + for item in difference1: + lines.append(repr(item)) + if difference2: + lines.append('Items in the second set but not the first:') + for item in difference2: + lines.append(repr(item)) + + standardMsg = '\n'.join(lines) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIn(self, member, container, msg=None): + """Just like self.assertTrue(a in b), but with a nicer default message.""" + if member not in container: + standardMsg = '%r not found in %r' % 
(member, container) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertNotIn(self, member, container, msg=None): + """Just like self.assertTrue(a not in b), but with a nicer default message.""" + if member in container: + standardMsg = '%r unexpectedly found in %r' % (member, container) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIs(self, expr1, expr2, msg=None): + """Just like self.assertTrue(a is b), but with a nicer default message.""" + if expr1 is not expr2: + standardMsg = '%r is not %r' % (expr1, expr2) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIsNot(self, expr1, expr2, msg=None): + """Just like self.assertTrue(a is not b), but with a nicer default message.""" + if expr1 is expr2: + standardMsg = 'unexpectedly identical: %r' % (expr1,) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertDictEqual(self, d1, d2, msg=None): + self.assert_(isinstance(d1, dict), 'First argument is not a dictionary') + self.assert_(isinstance(d2, dict), 'Second argument is not a dictionary') + + if d1 != d2: + standardMsg = ('\n' + '\n'.join(difflib.ndiff( + pprint.pformat(d1).splitlines(), + pprint.pformat(d2).splitlines()))) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertDictContainsSubset(self, expected, actual, msg=None): + """Checks whether actual is a superset of expected.""" + missing = [] + mismatched = [] + for key, value in expected.iteritems(): + if key not in actual: + missing.append(key) + elif value != actual[key]: + mismatched.append('%s, expected: %s, actual: %s' % + (key, value, actual[key])) + + if not (missing or mismatched): + return + + standardMsg = '' + if missing: + standardMsg = 'Missing: %r' % ','.join(missing) + if mismatched: + if standardMsg: + standardMsg += '; ' + standardMsg += 'Mismatched values: %s' % ','.join(mismatched) + + self.fail(self._formatMessage(msg, standardMsg)) + + def assertSameElements(self, expected_seq, actual_seq, msg=None): + """An unordered 
sequence specific comparison. + + Raises with an error message listing which elements of expected_seq + are missing from actual_seq and vice versa if any. + """ + try: + expected = set(expected_seq) + actual = set(actual_seq) + missing = list(expected.difference(actual)) + unexpected = list(actual.difference(expected)) + missing.sort() + unexpected.sort() + except TypeError: + # Fall back to slower list-compare if any of the objects are + # not hashable. + expected = list(expected_seq) + actual = list(actual_seq) + expected.sort() + actual.sort() + missing, unexpected = util.sorted_list_difference(expected, actual) + errors = [] + if missing: + errors.append('Expected, but missing:\n %r' % missing) + if unexpected: + errors.append('Unexpected, but present:\n %r' % unexpected) + if errors: + standardMsg = '\n'.join(errors) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertMultiLineEqual(self, first, second, msg=None): + """Assert that two multi-line strings are equal.""" + self.assert_(isinstance(first, basestring), ( + 'First argument is not a string')) + self.assert_(isinstance(second, basestring), ( + 'Second argument is not a string')) + + if first != second: + standardMsg = '\n' + ''.join(difflib.ndiff(first.splitlines(True), + second.splitlines(True))) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertLess(self, a, b, msg=None): + """Just like self.assertTrue(a < b), but with a nicer default message.""" + if not a < b: + standardMsg = '%r not less than %r' % (a, b) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertLessEqual(self, a, b, msg=None): + """Just like self.assertTrue(a <= b), but with a nicer default message.""" + if not a <= b: + standardMsg = '%r not less than or equal to %r' % (a, b) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertGreater(self, a, b, msg=None): + """Just like self.assertTrue(a > b), but with a nicer default message.""" + if not a > b: + standardMsg = '%r not greater 
than %r' % (a, b) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertGreaterEqual(self, a, b, msg=None): + """Just like self.assertTrue(a >= b), but with a nicer default message.""" + if not a >= b: + standardMsg = '%r not greater than or equal to %r' % (a, b) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIsNone(self, obj, msg=None): + """Same as self.assertTrue(obj is None), with a nicer default message.""" + if obj is not None: + standardMsg = '%r is not None' % obj + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIsNotNone(self, obj, msg=None): + """Included for symmetry with assertIsNone.""" + if obj is None: + standardMsg = 'unexpectedly None' + self.fail(self._formatMessage(msg, standardMsg)) + + def assertIsInstance(self, obj, cls, msg=None): + """Same as self.assertTrue(isinstance(obj, cls)), with a nicer + default message.""" + if not isinstance(obj, cls): + standardMsg = '%r is not an instance of %r' % (obj, cls) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertNotIsInstance(self, obj, cls, msg=None): + """Included for symmetry with assertIsInstance.""" + if isinstance(obj, cls): + standardMsg = '%r is an instance of %r' % (obj, cls) + self.fail(self._formatMessage(msg, standardMsg)) + + def assertRaisesRegexp(self, expected_exception, expected_regexp, + callable_obj=None, *args, **kwargs): + """Asserts that the message in a raised exception matches a regexp. + + Args: + expected_exception: Exception class expected to be raised. + expected_regexp: Regexp (re pattern object or string) expected + to be found in error message. + callable_obj: Function to be called. + args: Extra args. + kwargs: Extra kwargs. 
+ """ + context = _AssertRaisesContext(expected_exception, self, expected_regexp) + if callable_obj is None: + return context + with_(context, lambda: callable_obj(*args, **kwargs)) + + def assertRegexpMatches(self, text, expected_regex, msg=None): + if isinstance(expected_regex, basestring): + expected_regex = re.compile(expected_regex) + if not expected_regex.search(text): + msg = msg or "Regexp didn't match" + msg = '%s: %r not found in %r' % (msg, expected_regex.pattern, text) + raise self.failureException(msg) + + +class FunctionTestCase(TestCase): + """A test case that wraps a test function. + + This is useful for slipping pre-existing test functions into the + unittest framework. Optionally, set-up and tidy-up functions can be + supplied. As with TestCase, the tidy-up ('tearDown') function will + always be called if the set-up ('setUp') function ran successfully. + """ + + def __init__(self, testFunc, setUp=None, tearDown=None, description=None): + super(FunctionTestCase, self).__init__() + self._setUpFunc = setUp + self._tearDownFunc = tearDown + self._testFunc = testFunc + self._description = description + + def setUp(self): + if self._setUpFunc is not None: + self._setUpFunc() + + def tearDown(self): + if self._tearDownFunc is not None: + self._tearDownFunc() + + def runTest(self): + self._testFunc() + + def id(self): + return self._testFunc.__name__ + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + + return self._setUpFunc == other._setUpFunc and \ + self._tearDownFunc == other._tearDownFunc and \ + self._testFunc == other._testFunc and \ + self._description == other._description + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash((type(self), self._setUpFunc, self._tearDownFunc, + self._testFunc, self._description)) + + def __str__(self): + return "%s (%s)" % (util.strclass(self.__class__), + self._testFunc.__name__) + + def __repr__(self): + return "<%s 
testFunc=%s>" % (util.strclass(self.__class__), + self._testFunc) + + def shortDescription(self): + if self._description is not None: + return self._description + doc = self._testFunc.__doc__ + return doc and doc.split("\n")[0].strip() or None diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/loader.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..101f81dc16f90786cc8113f578a714e2c63d5445 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/loader.py @@ -0,0 +1,387 @@ +"""Loading unittests.""" + +import doctest +import os +import re +import sys +import traceback +import types + +from fnmatch import fnmatch + +from unittest import case, suite + +# Python 2.4 compatibility +if os.name == "posix": + from os.path import join, abspath, commonprefix, pardir, curdir, sep + def relpath(path, start=curdir): + """Return a relative version of a path""" + + if not path: + raise ValueError("no path specified") + + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + + # Work out how much of the filepath is shared by start and path. 
+ i = len(commonprefix([start_list, path_list])) + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) +elif os.name == "nt": + from os.path import join, abspath, pardir, curdir, sep, splitunc + def relpath(path, start=curdir): + """Return a relative version of a path""" + + if not path: + raise ValueError("no path specified") + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + if start_list[0].lower() != path_list[0].lower(): + unc_path, rest = splitunc(path) + unc_start, rest = splitunc(start) + if bool(unc_path) ^ bool(unc_start): + raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)" + % (path, start)) + else: + raise ValueError("path is on drive %s, start on drive %s" + % (path_list[0], start_list[0])) + # Work out how much of the filepath is shared by start and path. + for i in range(min(len(start_list), len(path_list))): + if start_list[i].lower() != path_list[i].lower(): + break + else: + i += 1 + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) +else: + # seems test discovery code from Python 2.7 trunk doesn't support the mac + # yet + raise NotImplementedError("fixme") + + +def _CmpToKey(mycmp): + 'Convert a cmp= function into a key= function' + class K(object): + def __init__(self, obj): + self.obj = obj + def __lt__(self, other): + return mycmp(self.obj, other.obj) == -1 + return K + + +# what about .pyc or .pyo (etc) +# we would need to avoid loading the same tests multiple times +# from '.py', '.pyc' *and* '.pyo' +VALID_MODULE_NAME = re.compile(r'[_a-z]\w*\.py$', re.IGNORECASE) + + +def _make_failed_import_test(name, suiteClass): + message = 'Failed to import test module: %s' % name + if hasattr(traceback, 'format_exc'): + # Python 2.3 compatibility + # format_exc returns two frames of discover.py as well + message += '\n%s' % traceback.format_exc() + + def 
testImportFailure(self): + raise ImportError(message) + attrs = {name: testImportFailure} + ModuleImportFailure = type('ModuleImportFailure', (case.TestCase,), attrs) + return suiteClass((ModuleImportFailure(name),)) + + +def maybe_load_doctest(path): + if path.endswith(".doctest"): + return doctest.DocFileTest(path, module_relative=False) + elif path.endswith("test_password_manager.special_doctest"): + # TODO: get rid of this + import mechanize + tests = [] + common_globs = {"mechanize": mechanize} + for globs in [ + {"mgr_class": mechanize.HTTPPasswordMgr}, + {"mgr_class": mechanize.HTTPProxyPasswordMgr}, + ]: + globs.update(common_globs) + tests.append(doctest.DocFileTest(path, module_relative=False, + globs=globs)) + return suite.TestSuite(tests) + return None + + +def flatten_test(test): + try: + tests = iter(test) + except TypeError: + yield test + else: + for test in tests: + for flattened in flatten_test(test): + yield flattened + + +def is_not_skipped(test, skip_tags, allowed_tags, skip_doctests): + skipped = False + for tag in getattr(test, "tags", "").split(): + if tag not in allowed_tags: + raise Exception("unknown tag: %r" % tag) + if tag in skip_tags: + skipped = True + if skip_doctests and isinstance(test, doctest.DocTestCase): + skipped = True + return not skipped + + +class TestLoader(object): + + # problems + # * Can't load doctests from name + # * I'm maintaining this :-( + + # TODO: fix doctest support in nose, and use nose instead + + """ + This class is responsible for loading tests according to various criteria + and returning them wrapped in a TestSuite + """ + testMethodPrefix = 'test' + sortTestMethodsUsing = cmp + suiteClass = suite.TestSuite + _top_level_dir = None + + def loadTestsFromTestCase(self, testCaseClass): + """Return a suite of all tests cases contained in testCaseClass""" + if issubclass(testCaseClass, suite.TestSuite): + raise TypeError("Test cases should not be derived from TestSuite." 
\ + " Maybe you meant to derive from TestCase?") + testCaseNames = self.getTestCaseNames(testCaseClass) + if not testCaseNames and hasattr(testCaseClass, 'runTest'): + testCaseNames = ['runTest'] + loaded_suite = self.suiteClass(map(testCaseClass, testCaseNames)) + return loaded_suite + + def loadTestsFromModule(self, module, use_load_tests=True): + """Return a suite of all tests cases contained in the given module""" + tests = [] + for name in dir(module): + obj = getattr(module, name) + if isinstance(obj, type) and issubclass(obj, case.TestCase): + tests.append(self.loadTestsFromTestCase(obj)) + + try: + if isinstance(module, types.ModuleType): + tests.append(doctest.DocTestSuite(module)) + except ValueError: + # no docstring doctests + pass + + load_tests = getattr(module, 'load_tests', None) + if use_load_tests and load_tests is not None: + return load_tests(self, tests, None) + return self.suiteClass(tests) + + def loadTestsFromName(self, name, module=None): + """Return a suite of all tests cases given a string specifier. + + The name may resolve either to a module, a test case class, a + test method within a test case class, or a callable object which + returns a TestCase or TestSuite instance. + + The method optionally resolves the names relative to a given module. 
+ """ + parts = name.split('.') + if module is None: + parts_copy = parts[:] + while parts_copy: + try: + module = __import__('.'.join(parts_copy)) + break + except ImportError: + del parts_copy[-1] + if not parts_copy: + doctest_test = maybe_load_doctest(name) + if doctest_test is not None: + obj = doctest_test + else: + raise + parts = parts[1:] + obj = module + for part in parts: + parent, obj = obj, getattr(obj, part) + + if isinstance(obj, types.ModuleType): + return self.loadTestsFromModule(obj) + elif isinstance(obj, type) and issubclass(obj, case.TestCase): + return self.loadTestsFromTestCase(obj) + elif (isinstance(obj, types.UnboundMethodType) and + isinstance(parent, type) and + issubclass(parent, case.TestCase)): + return self.suiteClass([parent(obj.__name__)]) + elif isinstance(obj, suite.TestSuite): + return obj + elif hasattr(obj, '__call__'): + test = obj() + if isinstance(test, suite.TestSuite): + return test + elif isinstance(test, case.TestCase): + return self.suiteClass([test]) + else: + raise TypeError("calling %s returned %s, not a test" % + (obj, test)) + else: + raise TypeError("don't know how to make test from: %s" % obj) + + def loadTestsFromNames(self, names, module=None): + """Return a suite of all tests cases found using the given sequence + of string specifiers. See 'loadTestsFromName()'. 
+ """ + suites = [self.loadTestsFromName(name, module) for name in names] + return self.suiteClass(suites) + + def getTestCaseNames(self, testCaseClass): + """Return a sorted sequence of method names found within testCaseClass + """ + def isTestMethod(attrname, testCaseClass=testCaseClass, + prefix=self.testMethodPrefix): + return attrname.startswith(prefix) and \ + hasattr(getattr(testCaseClass, attrname), '__call__') + testFnNames = filter(isTestMethod, dir(testCaseClass)) + if self.sortTestMethodsUsing: + testFnNames.sort(key=_CmpToKey(self.sortTestMethodsUsing)) + return testFnNames + + def discover(self, start_dir, pattern='test*.py', top_level_dir=None, + skip_tags=frozenset(), allowed_tags=frozenset(), + skip_doctests=False): + """Find and return all test modules from the specified start + directory, recursing into subdirectories to find them. Only test files + that match the pattern will be loaded. (Using shell style pattern + matching.) + + All test modules must be importable from the top level of the project. + If the start directory is not the top level directory then the top + level directory must be specified separately. + + If a test package name (directory with '__init__.py') matches the + pattern then the package will be checked for a 'load_tests' function. If + this exists then it will be called with loader, tests, pattern. + + If load_tests exists then discovery does *not* recurse into the package, + load_tests is responsible for loading all tests in the package. + + The pattern is deliberately not stored as a loader attribute so that + packages can continue discovery themselves. top_level_dir is stored so + load_tests does not need to pass this argument in to loader.discover(). 
+ """ + if top_level_dir is None and self._top_level_dir is not None: + # make top_level_dir optional if called from load_tests in a package + top_level_dir = self._top_level_dir + elif top_level_dir is None: + top_level_dir = start_dir + + top_level_dir = os.path.abspath(os.path.normpath(top_level_dir)) + start_dir = os.path.abspath(os.path.normpath(start_dir)) + + if not top_level_dir in sys.path: + # all test modules must be importable from the top level directory + sys.path.append(top_level_dir) + self._top_level_dir = top_level_dir + + if start_dir != top_level_dir and not os.path.isfile(os.path.join(start_dir, '__init__.py')): + # what about __init__.pyc or pyo (etc) + raise ImportError('Start directory is not importable: %r' % start_dir) + + tests = list(test for test in + flatten_test(self._find_tests(start_dir, pattern)) + if is_not_skipped(test, skip_tags, allowed_tags, + skip_doctests)) + return self.suiteClass(tests) + + def _get_name_from_path(self, path): + path = os.path.splitext(os.path.normpath(path))[0] + + _relpath = relpath(path, self._top_level_dir) + assert not os.path.isabs(_relpath), "Path must be within the project" + assert not _relpath.startswith('..'), "Path must be within the project" + + name = _relpath.replace(os.path.sep, '.') + return name + + def _get_module_from_name(self, name): + __import__(name) + return sys.modules[name] + + def _find_tests(self, start_dir, pattern): + """Used by discovery. 
Yields test suites it loads.""" + paths = os.listdir(start_dir) + + for path in paths: + full_path = os.path.join(start_dir, path) + if os.path.isfile(full_path): + doctest_test = maybe_load_doctest(full_path) + if doctest_test is not None: + yield doctest_test + continue + + if not VALID_MODULE_NAME.match(path): + # valid Python identifiers only + continue + + if fnmatch(path, pattern): + # if the test file matches, load it + name = self._get_name_from_path(full_path) + try: + module = self._get_module_from_name(name) + except: + yield _make_failed_import_test(name, self.suiteClass) + else: + yield self.loadTestsFromModule(module) + elif os.path.isdir(full_path): + if not os.path.isfile(os.path.join(full_path, '__init__.py')): + continue + + load_tests = None + tests = None + if fnmatch(path, pattern): + # only check load_tests if the package directory itself matches the filter + name = self._get_name_from_path(full_path) + package = self._get_module_from_name(name) + load_tests = getattr(package, 'load_tests', None) + tests = self.loadTestsFromModule(package, use_load_tests=False) + + if load_tests is None: + if tests is not None: + # tests loaded from package file + yield tests + # recurse into the package + for test in self._find_tests(full_path, pattern): + yield test + else: + yield load_tests(self, tests, pattern) + +defaultTestLoader = TestLoader() + + +def _makeLoader(prefix, sortUsing, suiteClass=None): + loader = TestLoader() + loader.sortTestMethodsUsing = sortUsing + loader.testMethodPrefix = prefix + if suiteClass: + loader.suiteClass = suiteClass + return loader + +def getTestCaseNames(testCaseClass, prefix, sortUsing=cmp): + return _makeLoader(prefix, sortUsing).getTestCaseNames(testCaseClass) + +def makeSuite(testCaseClass, prefix='test', sortUsing=cmp, + suiteClass=suite.TestSuite): + return _makeLoader(prefix, sortUsing, suiteClass).loadTestsFromTestCase(testCaseClass) + +def findTestCases(module, prefix='test', sortUsing=cmp, + 
suiteClass=suite.TestSuite): + return _makeLoader(prefix, sortUsing, suiteClass).loadTestsFromModule(module) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/main.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/main.py new file mode 100644 index 0000000000000000000000000000000000000000..afc7d1ffab09321654e098aa04d8561ab4a2e922 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/main.py @@ -0,0 +1,178 @@ +"""Unittest main program""" + +import sys +import os +import types + +from unittest import loader, runner + + +USAGE_AS_MAIN = """\ +Usage: %(progName)s [options] [tests] + +Options: + -h, --help Show this message + -v, --verbose Verbose output + -q, --quiet Minimal output + +Examples: + %(progName)s test_module - run tests from test_module + %(progName)s test_module.TestClass - run tests from + test_module.TestClass + %(progName)s test_module.TestClass.test_method - run specified test method + +[tests] can be a list of any number of test modules, classes and test +methods. + +Alternative Usage: %(progName)s discover [options] + +Options: + -v, --verbose Verbose output + -s directory Directory to start discovery ('.' default) + -p pattern Pattern to match test files ('test*.py' default) + -t directory Top level directory of project (default to + start directory) + +For test discovery all test modules must be importable from the top +level directory of the project. +""" + +USAGE_FROM_MODULE = """\ +Usage: %(progName)s [options] [test] [...] 
+ +Options: + -h, --help Show this message + -v, --verbose Verbose output + -q, --quiet Minimal output + +Examples: + %(progName)s - run default set of tests + %(progName)s MyTestSuite - run suite 'MyTestSuite' + %(progName)s MyTestCase.testSomething - run MyTestCase.testSomething + %(progName)s MyTestCase - run all 'test*' test methods + in MyTestCase +""" + +if __name__ == '__main__': + USAGE = USAGE_AS_MAIN +else: + USAGE = USAGE_FROM_MODULE + + +class TestProgram(object): + """A command-line program that runs a set of tests; this is primarily + for making test modules conveniently executable. + """ + USAGE = USAGE + def __init__(self, module='__main__', defaultTest=None, + argv=None, testRunner=None, + testLoader=loader.defaultTestLoader, exit=True, + verbosity=1): + if isinstance(module, basestring): + self.module = __import__(module) + for part in module.split('.')[1:]: + self.module = getattr(self.module, part) + else: + self.module = module + if argv is None: + argv = sys.argv + + self.exit = exit + self.verbosity = verbosity + self.defaultTest = defaultTest + self.testRunner = testRunner + self.testLoader = testLoader + self.progName = os.path.basename(argv[0]) + self.parseArgs(argv) + self.runTests() + + def usageExit(self, msg=None): + if msg: + print msg + print self.USAGE % self.__dict__ + sys.exit(2) + + def parseArgs(self, argv): + if len(argv) > 1 and argv[1].lower() == 'discover': + self._do_discovery(argv[2:]) + return + + import getopt + long_opts = ['help','verbose','quiet'] + try: + options, args = getopt.getopt(argv[1:], 'hHvq', long_opts) + for opt, value in options: + if opt in ('-h','-H','--help'): + self.usageExit() + if opt in ('-q','--quiet'): + self.verbosity = 0 + if opt in ('-v','--verbose'): + self.verbosity = 2 + if len(args) == 0 and self.defaultTest is None: + # createTests will load tests from self.module + self.testNames = None + elif len(args) > 0: + self.testNames = args + if __name__ == '__main__': + # to support python -m 
unittest ... + self.module = None + else: + self.testNames = (self.defaultTest,) + self.createTests() + except getopt.error, msg: + self.usageExit(msg) + + def createTests(self): + if self.testNames is None: + self.test = self.testLoader.loadTestsFromModule(self.module) + else: + self.test = self.testLoader.loadTestsFromNames(self.testNames, + self.module) + + def _do_discovery(self, argv, Loader=loader.TestLoader): + # handle command line args for test discovery + import optparse + parser = optparse.OptionParser() + parser.add_option('-v', '--verbose', dest='verbose', default=False, + help='Verbose output', action='store_true') + parser.add_option('-s', '--start-directory', dest='start', default='.', + help="Directory to start discovery ('.' default)") + parser.add_option('-p', '--pattern', dest='pattern', default='test*.py', + help="Pattern to match tests ('test*.py' default)") + parser.add_option('-t', '--top-level-directory', dest='top', default=None, + help='Top level directory of project (defaults to start directory)') + + options, args = parser.parse_args(argv) + if len(args) > 3: + self.usageExit() + + for name, value in zip(('start', 'pattern', 'top'), args): + setattr(options, name, value) + + if options.verbose: + self.verbosity = 2 + + start_dir = options.start + pattern = options.pattern + top_level_dir = options.top + + loader = Loader() + self.test = loader.discover(start_dir, pattern, top_level_dir) + + def runTests(self): + if self.testRunner is None: + self.testRunner = runner.TextTestRunner + if isinstance(self.testRunner, (type, types.ClassType)): + try: + testRunner = self.testRunner(verbosity=self.verbosity) + except TypeError: + # didn't accept the verbosity argument + testRunner = self.testRunner() + else: + # it is assumed to be a TestRunner instance + testRunner = self.testRunner + self.result = testRunner.run(self.test) + if self.exit: + sys.exit(not self.result.wasSuccessful()) + +main = TestProgram diff --git 
a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/result.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/result.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a7dfacfa90038e4891d2cc359053f650f8f379 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/result.py @@ -0,0 +1,113 @@ +"""Test result object""" + +import traceback + +from unittest import util + + +class TestResult(object): + """Holder for test result information. + + Test results are automatically managed by the TestCase and TestSuite + classes, and do not need to be explicitly manipulated by writers of tests. + + Each instance holds the total number of tests run, and collections of + failures and errors that occurred among those test runs. The collections + contain tuples of (testcase, exceptioninfo), where exceptioninfo is the + formatted traceback of the error that occurred. + """ + def __init__(self): + self.failures = [] + self.errors = [] + self.testsRun = 0 + self.skipped = [] + self.expectedFailures = [] + self.unexpectedSuccesses = [] + self.shouldStop = False + + def startTest(self, test): + "Called when the given test is about to be run" + self.testsRun = self.testsRun + 1 + + def startTestRun(self): + """Called once before any tests are executed. + + See startTest for a method called before each test. + """ + + def stopTest(self, test): + "Called when the given test has been run" + pass + + def stopTestRun(self): + """Called once after all tests are executed. + + See stopTest for a method called after each test. + """ + + def addError(self, test, err): + """Called when an error has occurred. 'err' is a tuple of values as + returned by sys.exc_info(). + """ + self.errors.append((test, self._exc_info_to_string(err, test))) + + def addFailure(self, test, err): + """Called when an error has occurred. 
'err' is a tuple of values as + returned by sys.exc_info().""" + self.failures.append((test, self._exc_info_to_string(err, test))) + + def addSuccess(self, test): + "Called when a test has completed successfully" + pass + + def addSkip(self, test, reason): + """Called when a test is skipped.""" + self.skipped.append((test, reason)) + + def addExpectedFailure(self, test, err): + """Called when an expected failure/error occured.""" + self.expectedFailures.append( + (test, self._exc_info_to_string(err, test))) + + def addUnexpectedSuccess(self, test): + """Called when a test was expected to fail, but succeed.""" + self.unexpectedSuccesses.append(test) + + def wasSuccessful(self): + "Tells whether or not this result was a success" + return len(self.failures) == len(self.errors) == 0 + + def stop(self): + "Indicates that the tests should be aborted" + self.shouldStop = True + + def _exc_info_to_string(self, err, test): + """Converts a sys.exc_info()-style tuple of values into a string.""" + exctype, value, tb = err + # Skip test runner traceback levels + while tb and self._is_relevant_tb_level(tb): + tb = tb.tb_next + if exctype is test.failureException: + # Skip assert*() traceback levels + length = self._count_relevant_tb_levels(tb) + return ''.join(traceback.format_exception(exctype, value, tb, length)) + return ''.join(traceback.format_exception(exctype, value, tb)) + + def _is_relevant_tb_level(self, tb): + globs = tb.tb_frame.f_globals + is_relevant = '__name__' in globs and \ + globs["__name__"].startswith("unittest") + del globs + return is_relevant + + def _count_relevant_tb_levels(self, tb): + length = 0 + while tb and not self._is_relevant_tb_level(tb): + length += 1 + tb = tb.tb_next + return length + + def __repr__(self): + return "<%s run=%i errors=%i failures=%i>" % \ + (util.strclass(self.__class__), self.testsRun, len(self.errors), + len(self.failures)) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/runner.py 
b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..2b950dbdc314e9c17ea489cc621f62060684ee65 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/runner.py @@ -0,0 +1,174 @@ +"""Running tests""" + +import sys +import time + +from unittest import result + + +class _WritelnDecorator(object): + """Used to decorate file-like objects with a handy 'writeln' method""" + def __init__(self,stream): + self.stream = stream + + def __getattr__(self, attr): + if attr in ('stream', '__getstate__'): + raise AttributeError(attr) + return getattr(self.stream,attr) + + def writeln(self, arg=None): + if arg: + self.write(arg) + self.write('\n') # text-mode streams translate to \r\n if needed + + +class _TextTestResult(result.TestResult): + """A test result class that can print formatted text results to a stream. + + Used by TextTestRunner. + """ + separator1 = '=' * 70 + separator2 = '-' * 70 + + def __init__(self, stream, descriptions, verbosity): + super(_TextTestResult, self).__init__() + self.stream = stream + self.showAll = verbosity > 1 + self.dots = verbosity == 1 + self.descriptions = descriptions + + def getDescription(self, test): + if self.descriptions: + return test.shortDescription() or str(test) + else: + return str(test) + + def startTest(self, test): + super(_TextTestResult, self).startTest(test) + if self.showAll: + self.stream.write(self.getDescription(test)) + self.stream.write(" ... 
") + self.stream.flush() + + def addSuccess(self, test): + super(_TextTestResult, self).addSuccess(test) + if self.showAll: + self.stream.writeln("ok") + elif self.dots: + self.stream.write('.') + self.stream.flush() + + def addError(self, test, err): + super(_TextTestResult, self).addError(test, err) + if self.showAll: + self.stream.writeln("ERROR") + elif self.dots: + self.stream.write('E') + self.stream.flush() + + def addFailure(self, test, err): + super(_TextTestResult, self).addFailure(test, err) + if self.showAll: + self.stream.writeln("FAIL") + elif self.dots: + self.stream.write('F') + self.stream.flush() + + def addSkip(self, test, reason): + super(_TextTestResult, self).addSkip(test, reason) + if self.showAll: + self.stream.writeln("skipped %r" % (reason,)) + elif self.dots: + self.stream.write("s") + self.stream.flush() + + def addExpectedFailure(self, test, err): + super(_TextTestResult, self).addExpectedFailure(test, err) + if self.showAll: + self.stream.writeln("expected failure") + elif self.dots: + self.stream.write("x") + self.stream.flush() + + def addUnexpectedSuccess(self, test): + super(_TextTestResult, self).addUnexpectedSuccess(test) + if self.showAll: + self.stream.writeln("unexpected success") + elif self.dots: + self.stream.write("u") + self.stream.flush() + + def printErrors(self): + if self.dots or self.showAll: + self.stream.writeln() + self.printErrorList('ERROR', self.errors) + self.printErrorList('FAIL', self.failures) + + def printErrorList(self, flavour, errors): + for test, err in errors: + self.stream.writeln(self.separator1) + self.stream.writeln("%s: %s" % (flavour,self.getDescription(test))) + self.stream.writeln(self.separator2) + self.stream.writeln("%s" % err) + + +class TextTestRunner(object): + """A test runner class that displays results in textual form. + + It prints out the names of tests as they are run, errors as they + occur, and a summary of the results at the end of the test run. 
+ """ + def __init__(self, stream=sys.stderr, descriptions=1, verbosity=1): + self.stream = _WritelnDecorator(stream) + self.descriptions = descriptions + self.verbosity = verbosity + + def _makeResult(self): + return _TextTestResult(self.stream, self.descriptions, self.verbosity) + + def run(self, test): + "Run the given test case or test suite." + result = self._makeResult() + startTime = time.time() + startTestRun = getattr(result, 'startTestRun', None) + if startTestRun is not None: + startTestRun() + try: + test(result) + finally: + stopTestRun = getattr(result, 'stopTestRun', None) + if stopTestRun is not None: + stopTestRun() + stopTime = time.time() + timeTaken = stopTime - startTime + result.printErrors() + self.stream.writeln(result.separator2) + run = result.testsRun + self.stream.writeln("Ran %d test%s in %.3fs" % + (run, run != 1 and "s" or "", timeTaken)) + self.stream.writeln() + results = map(len, (result.expectedFailures, + result.unexpectedSuccesses, + result.skipped)) + expectedFails, unexpectedSuccesses, skipped = results + infos = [] + if not result.wasSuccessful(): + self.stream.write("FAILED") + failed, errored = map(len, (result.failures, result.errors)) + if failed: + infos.append("failures=%d" % failed) + if errored: + infos.append("errors=%d" % errored) + else: + self.stream.write("OK") + if skipped: + infos.append("skipped=%d" % skipped) + if expectedFails: + infos.append("expected failures=%d" % expectedFails) + if unexpectedSuccesses: + infos.append("unexpected successes=%d" % unexpectedSuccesses) + if infos: + self.stream.writeln(" (%s)" % (", ".join(infos),)) + else: + self.stream.write("\n") + return result diff --git a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/suite.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/suite.py new file mode 100644 index 0000000000000000000000000000000000000000..e5f4817c9dd9cc19b40a3ef9bd49e16584a721c8 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/suite.py @@ -0,0 
+1,72 @@ +"""TestSuite""" + +from unittest import case +from unittest import util + + +class TestSuite(object): + """A test suite is a composite test consisting of a number of TestCases. + + For use, create an instance of TestSuite, then add test case instances. + When all tests have been added, the suite can be passed to a test + runner, such as TextTestRunner. It will run the individual test cases + in the order in which they were added, aggregating the results. When + subclassing, do not forget to call the base class constructor. + """ + def __init__(self, tests=()): + self._tests = [] + self.addTests(tests) + + def __repr__(self): + return "<%s tests=%s>" % (util.strclass(self.__class__), list(self)) + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return list(self) == list(other) + + def __ne__(self, other): + return not self == other + + # Can't guarantee hash invariant, so flag as unhashable + __hash__ = None + + def __iter__(self): + return iter(self._tests) + + def countTestCases(self): + cases = 0 + for test in self: + cases += test.countTestCases() + return cases + + def addTest(self, test): + # sanity checks + if not hasattr(test, '__call__'): + raise TypeError("the test to add must be callable") + if isinstance(test, type) and issubclass(test, + (case.TestCase, TestSuite)): + raise TypeError("TestCases and TestSuites must be instantiated " + "before passing them to addTest()") + self._tests.append(test) + + def addTests(self, tests): + if isinstance(tests, basestring): + raise TypeError("tests must be an iterable of tests, not a string") + for test in tests: + self.addTest(test) + + def run(self, result): + for test in self: + if result.shouldStop: + break + test(result) + return result + + def __call__(self, *args, **kwds): + return self.run(*args, **kwds) + + def debug(self): + """Run the tests without collecting errors in a TestResult""" + for test in self: + test.debug() diff --git 
a/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/util.py b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/util.py new file mode 100644 index 0000000000000000000000000000000000000000..2546e205e70c7e9832036f0fdecd4d9587ff0858 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test-tools/unittest/util.py @@ -0,0 +1,44 @@ +"""Various utility functions.""" + +def strclass(cls): + return "%s.%s" % (cls.__module__, cls.__name__) + +def sorted_list_difference(expected, actual): + """Finds elements in only one or the other of two, sorted input lists. + + Returns a two-element tuple of lists. The first list contains those + elements in the "expected" list but not in the "actual" list, and the + second contains those elements in the "actual" list but not in the + "expected" list. Duplicate elements in either input list are ignored. + """ + i = j = 0 + missing = [] + unexpected = [] + while True: + try: + e = expected[i] + a = actual[j] + if e < a: + missing.append(e) + i += 1 + while expected[i] == e: + i += 1 + elif e > a: + unexpected.append(a) + j += 1 + while actual[j] == a: + j += 1 + else: + i += 1 + try: + while expected[i] == e: + i += 1 + finally: + j += 1 + while actual[j] == a: + j += 1 + except IndexError: + missing.extend(expected[i:]) + unexpected.extend(actual[j:]) + break + return missing, unexpected diff --git a/LTA/LTAIngest/mechanize-0.2.5/test.py b/LTA/LTAIngest/mechanize-0.2.5/test.py new file mode 100755 index 0000000000000000000000000000000000000000..7f54309ffd7fed17462bda7fe5f772a58a1858ea --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +""" +Note that the functional tests and doctests require test-tools to be on +sys.path before the stdlib. One way to ensure that is to use this script to +run tests. 
+""" + +import os +import sys + + +def mutate_sys_path(): + this_dir = os.path.dirname(__file__) + sys.path.insert(0, os.path.join(this_dir, "test")) + sys.path.insert(0, os.path.join(this_dir, "test-tools")) + + +def main(argv): + # test-tools/ dir includes a bundled Python 2.5 doctest / linecache, and a + # bundled & modified Python trunk (2.7 vintage) unittest. This is only for + # testing purposes, and these don't get installed. + + # unittest revision 77209, modified (probably I should have used PyPI + # project discover, which is already backported to 2.4, but since I've + # already done that and made changes, I won't bother for now) + + # doctest.py revision 45701 and linecache.py revision 45940. Since + # linecache is used by Python itself, linecache.py is renamed + # linecache_copy.py, and this copy of doctest is modified (only) to use + # that renamed module. + + mutate_sys_path() + assert "doctest" not in sys.modules + import testprogram + + # *.py to catch doctests in docstrings + this_dir = os.path.dirname(__file__) + prog = testprogram.TestProgram( + argv=argv, default_discovery_args=(this_dir, "*.py", None), + module=None) + result = prog.runTests() + success = result.wasSuccessful() + sys.exit(int(not success)) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/__init__.py b/LTA/LTAIngest/mechanize-0.2.5/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_example/output b/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_example/output new file mode 100644 index 0000000000000000000000000000000000000000..fafe71f45c54bb79f25ac22edd6298850fe7c417 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_example/output @@ -0,0 +1,43 @@ +<POST 
http://127.0.0.1:8000/cgi-bin/echo.cgi multipart/form-data + <TextareaControl(blah=)> + <TextControl(comments=)> + <CheckboxControl(eggs=[spam])> + <SelectControl(cheeses=[mozz, caerphilly, gouda, gorgonzola, parmesan, leicester, cheddar, mascarpone, curd, limburger, emmenthal])> + <CheckboxControl(apples=[pears])> + <CheckboxControl(whocares=[edam, gouda])> + <RadioControl(spam=[spam, rhubarb])> + <RadioControl(smelly=[on])> + <SelectControl(favorite_cheese=[*cheddar, brie, leicester, jahlsberg])> + <FileControl(<None>=<No files added>)>> +False +False +cheeses ['parmesan', 'leicester', 'cheddar'] select +curd True None {'value': 'curd', 'contents': 'curd', 'label': 'curd'} +mozz +caerphilly +gouda +gorgonzola +parmesan +leicester +cheddar +mascarpone +curd +limburger +emmenthal +False +False +True +False +http://127.0.0.1:8000/cgi-bin/echo.cgi +Connection: close +Content-Type: text/html +<html><head><title>Form submission parameters</title></head> +<p>Received parameters:</p> +<pre> +favorite_cheese: brie +blah: rhubarb rhubarb +spam: spam +comments: Blah. 
+cheeses: caerphilly, gorgonzola +</pre></html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_simple/output b/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_simple/output new file mode 100644 index 0000000000000000000000000000000000000000..0ebe194f92e97faa586fe9654db214237a980628 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/functional_tests_golden/FormsExamplesTests.test_simple/output @@ -0,0 +1,19 @@ +<POST http://127.0.0.1:8000/cgi-bin/echo.cgi multipart/form-data + <TextareaControl(blah=)> + <TextControl(comments=)> + <CheckboxControl(eggs=[spam])> + <SelectControl(cheeses=[mozz, caerphilly, gouda, gorgonzola, parmesan, leicester, cheddar, mascarpone, curd, limburger, emmenthal])> + <CheckboxControl(apples=[pears])> + <CheckboxControl(whocares=[edam, gouda])> + <RadioControl(spam=[spam, rhubarb])> + <RadioControl(smelly=[on])> + <SelectControl(favorite_cheese=[*cheddar, brie, leicester, jahlsberg])> + <FileControl(<None>=<No files added>)>> +<html><head><title>Form submission parameters</title></head> +<p>Received parameters:</p> +<pre> +blah: +favorite_cheese: cheddar +comments: Thanks, Gisle +</pre></html> + diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_api.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..877966c4584d478fe0a559b90a577d2154503179 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_api.py @@ -0,0 +1,13 @@ +import unittest + + +class ImportTests(unittest.TestCase): + + def test_import_all(self): + # the following will raise an exception if __all__ contains undefined + # classes + from mechanize import * + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.doctest new file mode 100644 index 
0000000000000000000000000000000000000000..6e5765e445410558ea847c1d7af7d233e1b4dd41 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.doctest @@ -0,0 +1,295 @@ +>>> import mechanize +>>> from mechanize._response import test_response +>>> from test_browser import TestBrowser2, make_mock_handler + + +Opening a new response should close the old one. + +>>> class TestHttpHandler(mechanize.BaseHandler): +... def http_open(self, request): +... return test_response(url=request.get_full_url()) +>>> class TestHttpBrowser(TestBrowser2): +... handler_classes = TestBrowser2.handler_classes.copy() +... handler_classes["http"] = TestHttpHandler +... default_schemes = ["http"] +>>> def response_impl(response): +... return response.wrapped.fp.__class__.__name__ + +>>> br = TestHttpBrowser() +>>> r = br.open("http://example.com") +>>> print response_impl(r) +StringI +>>> r2 = br.open("http://example.com") +>>> print response_impl(r2) +StringI +>>> print response_impl(r) +eofresponse + +So should .set_response() + +>>> br.set_response(test_response()) +>>> print response_impl(r2) +eofresponse + + +.visit_response() works very similarly to .open() + +>>> br = TestHttpBrowser() +>>> r = br.open("http://example.com") +>>> r2 = test_response(url="http://example.com/2") +>>> print response_impl(r2) +StringI +>>> br.visit_response(r2) +>>> print response_impl(r) +eofresponse +>>> br.geturl() == br.request.get_full_url() == "http://example.com/2" +True +>>> junk = br.back() +>>> br.geturl() == br.request.get_full_url() == "http://example.com" +True + + +.back() may reload if the complete response was not read. If so, it +should return the new response, not the old one + +>>> class ReloadCheckBrowser(TestHttpBrowser): +... reloaded = False +... def reload(self): +... self.reloaded = True +... 
return TestHttpBrowser.reload(self) +>>> br = ReloadCheckBrowser() +>>> old = br.open("http://example.com") +>>> junk = br.open("http://example.com/2") +>>> new = br.back() +>>> br.reloaded +True +>>> new.wrapped is not old.wrapped +True + + +Warn early about some mistakes setting a response object + +>>> import StringIO +>>> br = TestBrowser2() +>>> br.set_response("blah") +Traceback (most recent call last): +... +ValueError: not a response object +>>> br.set_response(StringIO.StringIO()) +Traceback (most recent call last): +... +ValueError: not a response object + + +.open() without an appropriate scheme handler should fail with +URLError + +>>> br = TestBrowser2() +>>> br.open("http://example.com") +Traceback (most recent call last): +... +URLError: <urlopen error unknown url type: http> + +Reload after failed .open() should fail due to failure to open, not +with BrowserStateError + +>>> br.reload() +Traceback (most recent call last): +... +URLError: <urlopen error unknown url type: http> + + +.clear_history() should do what it says on the tin. Note that the +history does not include the current response! + +>>> br = TestBrowser2() +>>> br.add_handler(make_mock_handler(test_response)([("http_open", None)])) + +>>> br.response() is None +True +>>> len(br._history._history) +0 + +>>> r = br.open("http://example.com/1") +>>> br.response() is not None +True +>>> len(br._history._history) +0 + +>>> br.clear_history() +>>> br.response() is not None +True +>>> len(br._history._history) +0 + +>>> r = br.open("http://example.com/2") +>>> br.response() is not None +True +>>> len(br._history._history) +1 + +>>> br.clear_history() +>>> br.response() is not None +True +>>> len(br._history._history) +0 + + +.open()ing a Request with False .visit does not affect Browser state. +Redirections during such a non-visiting request should also be +non-visiting. 
+ +>>> from mechanize import BrowserStateError, Request, HTTPRedirectHandler +>>> from test_urllib2 import MockHTTPHandler + +>>> def make_browser_with_redirect(): +... br = TestBrowser2() +... hh = MockHTTPHandler(302, "Location: http://example.com/\r\n\r\n") +... br.add_handler(hh) +... br.add_handler(HTTPRedirectHandler()) +... return br +>>> def raises(exc_class, fn, *args, **kwds): +... try: +... fn(*args, **kwds) +... except exc_class, exc: +... return True +... return False +>>> def test_state(br): +... return (br.request is None and +... br.response() is None and +... raises(BrowserStateError, br.back) +... ) +>>> br = make_browser_with_redirect() +>>> test_state(br) +True +>>> req = Request("http://example.com") +>>> req.visit = False +>>> r = br.open(req) +>>> test_state(br) +True + +.open_novisit() mutates the request object + +>>> br = make_browser_with_redirect() +>>> test_state(br) +True +>>> req = Request("http://example.com") +>>> print req.visit +None +>>> r = br.open_novisit(req) +>>> test_state(br) +True +>>> req.visit +False + + +...in fact, any redirection (but not refresh), proxy request, basic or +digest auth request, or robots.txt request should be non-visiting, +even if .visit is True: + +>>> from test_urllib2 import MockPasswordManager +>>> def test_one_visit(handlers): +... br = TestBrowser2() +... for handler in handlers: br.add_handler(handler) +... req = Request("http://example.com") +... req.visit = True +... br.open(req) +... return br +>>> def test_state(br): +... # XXX the _history._history check is needed because of the weird +... # throwing-away of history entries by .back() where response is +... # None, which makes the .back() check insufficient to tell if a +... # history entry was .add()ed. I don't want to change this until +... # post-stable. +... return ( +... br.response() and +... br.request and +... len(br._history._history) == 0 and +... 
raises(BrowserStateError, br.back)) + +>>> hh = MockHTTPHandler(302, "Location: http://example.com/\r\n\r\n") +>>> br = test_one_visit([hh, HTTPRedirectHandler()]) +>>> test_state(br) +True + +>>> class MockPasswordManager: +... def add_password(self, realm, uri, user, password): pass +... def find_user_password(self, realm, authuri): return '', '' + +>>> ah = mechanize.HTTPBasicAuthHandler(MockPasswordManager()) +>>> hh = MockHTTPHandler( +... 401, 'WWW-Authenticate: Basic realm="realm"\r\n\r\n') +>>> test_state(test_one_visit([hh, ah])) +True + +>>> ph = mechanize.ProxyHandler(dict(http="proxy.example.com:3128")) +>>> ah = mechanize.ProxyBasicAuthHandler(MockPasswordManager()) +>>> hh = MockHTTPHandler( +... 407, 'Proxy-Authenticate: Basic realm="realm"\r\n\r\n') +>>> test_state(test_one_visit([ph, hh, ah])) +True + +XXX Can't really fix this one properly without significant changes -- +the refresh should go onto the history *after* the call, but currently +all redirects, including refreshes, are done by recursive .open() +calls, which gets the history wrong in this case. Will have to wait +until after stable release: + +#>>> hh = MockHTTPHandler( +#... "refresh", 'Location: http://example.com/\r\n\r\n') +#>>> br = test_one_visit([hh, HTTPRedirectHandler()]) +#>>> br.response() is not None +#True +#>>> br.request is not None +#True +#>>> r = br.back() + +XXX digest, robots + + +.global_form() is separate from the other forms (partly for backwards- +compatibility reasons). + +>>> from mechanize._response import test_response +>>> br = TestBrowser2() +>>> html = """\ +... <html><body> +... <input type="text" name="a" /> +... <form><input type="text" name="b" /></form> +... </body></html> +... 
""" +>>> response = test_response(html, headers=[("Content-type", "text/html")]) +>>> br.global_form() +Traceback (most recent call last): +BrowserStateError: not viewing any document +>>> br.set_response(response) +>>> br.global_form().find_control(nr=0).name +'a' +>>> len(list(br.forms())) +1 +>>> iter(br.forms()).next().find_control(nr=0).name +'b' + + + +.select_form() works with the global form + +>>> from mechanize._response import test_html_response +>>> br = TestBrowser2() +>>> br.visit_response(test_html_response("""\ +... <html><head><title></title></head><body> +... <input type="text" name="a" value="b"></input> +... <form> +... <input type="text" name="p" value="q"></input> +... </form> +... </body></html>""")) +>>> def has_a(form): +... try: +... form.find_control(name="a") +... except mechanize.ControlNotFoundError: +... return False +... else: +... return True +>>> br.select_form(predicate=has_a) +>>> br.form.find_control(name="a").value +'b' diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.py new file mode 100644 index 0000000000000000000000000000000000000000..9b874ca0d1d6d90257287f74aa01801683f0173f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_browser.py @@ -0,0 +1,774 @@ +#!/usr/bin/env python +"""Tests for mechanize.Browser.""" + +import StringIO +from unittest import TestCase +import re + +import mechanize +from mechanize._response import test_html_response +FACTORY_CLASSES = [mechanize.DefaultFactory, mechanize.RobustFactory] + + +# XXX these 'mock' classes are badly in need of simplification / removal +# (note this stuff is also used by test_useragent.py and test_browser.doctest) +class MockMethod: + def __init__(self, meth_name, action, handle): + self.meth_name = meth_name + self.handle = handle + self.action = action + def __call__(self, *args): + return apply(self.handle, (self.meth_name, self.action)+args) + +class MockHeaders(dict): + def getheaders(self, 
name): + name = name.lower() + return [v for k, v in self.iteritems() if name == k.lower()] + +class MockResponse: + closeable_response = None + def __init__(self, url="http://example.com/", data=None, info=None): + self.url = url + self.fp = StringIO.StringIO(data) + if info is None: info = {} + self._info = MockHeaders(info) + def info(self): return self._info + def geturl(self): return self.url + def read(self, size=-1): return self.fp.read(size) + def seek(self, whence): + assert whence == 0 + self.fp.seek(0) + def close(self): pass + def get_data(self): pass + +def make_mock_handler(response_class=MockResponse): + class MockHandler: + processor_order = 500 + handler_order = -1 + def __init__(self, methods): + self._define_methods(methods) + def _define_methods(self, methods): + for name, action in methods: + if name.endswith("_open"): + meth = MockMethod(name, action, self.handle) + else: + meth = MockMethod(name, action, self.process) + setattr(self.__class__, name, meth) + def handle(self, fn_name, response, *args, **kwds): + self.parent.calls.append((self, fn_name, args, kwds)) + if response: + if isinstance(response, mechanize.HTTPError): + raise response + r = response + r.seek(0) + else: + r = response_class() + req = args[0] + r.url = req.get_full_url() + return r + def process(self, fn_name, action, *args, **kwds): + self.parent.calls.append((self, fn_name, args, kwds)) + if fn_name.endswith("_request"): + return args[0] + else: + return args[1] + def close(self): pass + def add_parent(self, parent): + self.parent = parent + self.parent.calls = [] + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). 
+ return True + return self.handler_order < other.handler_order + return MockHandler + +class TestBrowser(mechanize.Browser): + default_features = [] + default_others = [] + default_schemes = [] + +class TestBrowser2(mechanize.Browser): + # XXX better name! + # As TestBrowser, this is neutered so doesn't know about protocol handling, + # but still knows what to do with unknown schemes, etc., because + # UserAgent's default_others list is left intact, including classes like + # UnknownHandler + default_features = [] + default_schemes = [] + + +class BrowserTests(TestCase): + + def test_referer(self): + b = TestBrowser() + url = "http://www.example.com/" + r = MockResponse(url, +"""<html> +<head><title>Title</title></head> +<body> +<form name="form1"> + <input type="hidden" name="foo" value="bar"></input> + <input type="submit"></input> + </form> +<a href="http://example.com/foo/bar.html" name="apples"></a> +<a href="https://example.com/spam/eggs.html" name="secure"></a> +<a href="blah://example.com/" name="pears"></a> +</body> +</html> +""", {"content-type": "text/html"}) + b.add_handler(make_mock_handler()([("http_open", r)])) + + # Referer not added by .open()... 
+ req = mechanize.Request(url) + b.open(req) + self.assert_(req.get_header("Referer") is None) + # ...even if we're visiting a document + b.open(req) + self.assert_(req.get_header("Referer") is None) + # Referer added by .click_link() and .click() + b.select_form("form1") + req2 = b.click() + self.assertEqual(req2.get_header("Referer"), url) + r2 = b.open(req2) + req3 = b.click_link(name="apples") + self.assertEqual(req3.get_header("Referer"), url+"?foo=bar") + # Referer not added when going from https to http URL + b.add_handler(make_mock_handler()([("https_open", r)])) + r3 = b.open(req3) + req4 = b.click_link(name="secure") + self.assertEqual(req4.get_header("Referer"), + "http://example.com/foo/bar.html") + r4 = b.open(req4) + req5 = b.click_link(name="apples") + self.assert_(not req5.has_header("Referer")) + # Referer not added for non-http, non-https requests + b.add_handler(make_mock_handler()([("blah_open", r)])) + req6 = b.click_link(name="pears") + self.assert_(not req6.has_header("Referer")) + # Referer not added when going from non-http, non-https URL + r4 = b.open(req6) + req7 = b.click_link(name="apples") + self.assert_(not req7.has_header("Referer")) + + # XXX Referer added for redirect + + def test_encoding(self): + import mechanize + from StringIO import StringIO + import urllib, mimetools + # always take first encoding, since that's the one from the real HTTP + # headers, rather than from HTTP-EQUIV + b = mechanize.Browser() + for s, ct in [("", mechanize._html.DEFAULT_ENCODING), + + ("Foo: Bar\r\n\r\n", mechanize._html.DEFAULT_ENCODING), + + ("Content-Type: text/html; charset=UTF-8\r\n\r\n", + "UTF-8"), + + ("Content-Type: text/html; charset=UTF-8\r\n" + "Content-Type: text/html; charset=KOI8-R\r\n\r\n", + "UTF-8"), + ]: + msg = mimetools.Message(StringIO(s)) + r = urllib.addinfourl(StringIO(""), msg, "http://www.example.com/") + b.set_response(r) + self.assertEqual(b.encoding(), ct) + + def test_history(self): + import mechanize + from mechanize 
import _response + + def same_response(ra, rb): + return ra.wrapped is rb.wrapped + + class Handler(mechanize.BaseHandler): + def http_open(self, request): + r = _response.test_response(url=request.get_full_url()) + # these tests aren't interested in auto-.reload() behaviour of + # .back(), so read the response to prevent that happening + r.get_data() + return r + + b = TestBrowser2() + b.add_handler(Handler()) + self.assertRaises(mechanize.BrowserStateError, b.back) + r1 = b.open("http://example.com/") + self.assertRaises(mechanize.BrowserStateError, b.back) + r2 = b.open("http://example.com/foo") + self.assert_(same_response(b.back(), r1)) + r3 = b.open("http://example.com/bar") + r4 = b.open("http://example.com/spam") + self.assert_(same_response(b.back(), r3)) + self.assert_(same_response(b.back(), r1)) + self.assertEquals(b.geturl(), "http://example.com/") + self.assertRaises(mechanize.BrowserStateError, b.back) + # reloading does a real HTTP fetch rather than using history cache + r5 = b.reload() + self.assert_(not same_response(r5, r1)) + # .geturl() gets fed through to b.response + self.assertEquals(b.geturl(), "http://example.com/") + # can go back n times + r6 = b.open("spam") + self.assertEquals(b.geturl(), "http://example.com/spam") + r7 = b.open("/spam") + self.assert_(same_response(b.response(), r7)) + self.assertEquals(b.geturl(), "http://example.com/spam") + self.assert_(same_response(b.back(2), r5)) + self.assertEquals(b.geturl(), "http://example.com/") + self.assertRaises(mechanize.BrowserStateError, b.back, 2) + r8 = b.open("/spam") + + # even if we get an HTTPError, history, .response() and .request should + # still get updated + class Handler2(mechanize.BaseHandler): + def https_open(self, request): + r = mechanize.HTTPError( + "https://example.com/bad", 503, "Oops", + MockHeaders(), StringIO.StringIO()) + return r + b.add_handler(Handler2()) + self.assertRaises(mechanize.HTTPError, b.open, + "https://example.com/badreq") + 
self.assertEqual(b.response().geturl(), "https://example.com/bad") + self.assertEqual(b.request.get_full_url(), + "https://example.com/badreq") + self.assert_(same_response(b.back(), r8)) + + # .close() should make use of Browser methods and attributes complain + # noisily, since they should not be called after .close() + b.form = "blah" + b.close() + for attr in ("form open error retrieve add_handler " + "request response set_response geturl reload back " + "clear_history set_cookie links forms viewing_html " + "encoding title select_form click submit click_link " + "follow_link find_link".split() + ): + self.assert_(getattr(b, attr) is None) + + def test_reload_read_incomplete(self): + import mechanize + from mechanize._response import test_response + class Browser(TestBrowser): + def __init__(self): + TestBrowser.__init__(self) + self.reloaded = False + def reload(self): + self.reloaded = True + TestBrowser.reload(self) + br = Browser() + data = "<html><head><title></title></head><body>%s</body></html>" + data = data % ("The quick brown fox jumps over the lazy dog."*100) + class Handler(mechanize.BaseHandler): + def http_open(self, requst): + return test_response(data, [("content-type", "text/html")]) + br.add_handler(Handler()) + + # .reload() on .back() if the whole response hasn't already been read + # (.read_incomplete is True) + r = br.open("http://example.com") + r.read(10) + br.open('http://www.example.com/blah') + self.failIf(br.reloaded) + br.back() + self.assert_(br.reloaded) + + # don't reload if already read + br.reloaded = False + br.response().read() + br.open('http://www.example.com/blah') + br.back() + self.failIf(br.reloaded) + + def test_viewing_html(self): + # XXX not testing multiple Content-Type headers + import mechanize + url = "http://example.com/" + + for allow_xhtml in False, True: + for ct, expect in [ + (None, False), + ("text/plain", False), + ("text/html", True), + + # don't try to handle XML until we can do it right! 
+ ("text/xhtml", allow_xhtml), + ("text/xml", allow_xhtml), + ("application/xml", allow_xhtml), + ("application/xhtml+xml", allow_xhtml), + + ("text/html; charset=blah", True), + (" text/html ; charset=ook ", True), + ]: + b = TestBrowser(mechanize.DefaultFactory( + i_want_broken_xhtml_support=allow_xhtml)) + hdrs = {} + if ct is not None: + hdrs["Content-Type"] = ct + b.add_handler(make_mock_handler()([("http_open", + MockResponse(url, "", hdrs))])) + b.open(url) + self.assertEqual(b.viewing_html(), expect) + + for allow_xhtml in False, True: + for ext, expect in [ + (".htm", True), + (".html", True), + + # don't try to handle XML until we can do it right! + (".xhtml", allow_xhtml), + + (".html?foo=bar&a=b;whelk#kool", True), + (".txt", False), + (".xml", False), + ("", False), + ]: + b = TestBrowser(mechanize.DefaultFactory( + i_want_broken_xhtml_support=allow_xhtml)) + url = "http://example.com/foo"+ext + b.add_handler(make_mock_handler()( + [("http_open", MockResponse(url, "", {}))])) + b.open(url) + self.assertEqual(b.viewing_html(), expect) + + def test_empty(self): + for factory_class in FACTORY_CLASSES: + self._test_empty(factory_class()) + + def _test_empty(self, factory): + import mechanize + url = "http://example.com/" + + b = TestBrowser(factory=factory) + + self.assert_(b.response() is None) + + # To open a relative reference (often called a "relative URL"), you + # have to have already opened a URL for it "to be relative to". + self.assertRaises(mechanize.BrowserStateError, b.open, "relative_ref") + + # we can still clear the history even if we've not visited any URL + b.clear_history() + + # most methods raise BrowserStateError... 
+ def test_state_error(method_names): + for attr in method_names: + method = getattr(b, attr) + #print attr + self.assertRaises(mechanize.BrowserStateError, method) + self.assertRaises(mechanize.BrowserStateError, b.select_form, + name="blah") + self.assertRaises(mechanize.BrowserStateError, b.find_link, + name="blah") + # ...if not visiting a URL... + test_state_error(("geturl reload back viewing_html encoding " + "click links forms title select_form".split())) + self.assertRaises(mechanize.BrowserStateError, b.set_cookie, "foo=bar") + self.assertRaises(mechanize.BrowserStateError, b.submit, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.click_link, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.follow_link, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.find_link, nr=0) + # ...and lots do so if visiting a non-HTML URL + b.add_handler(make_mock_handler()( + [("http_open", MockResponse(url, "", {}))])) + r = b.open(url) + self.assert_(not b.viewing_html()) + test_state_error("click links forms title select_form".split()) + self.assertRaises(mechanize.BrowserStateError, b.submit, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.click_link, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.follow_link, nr=0) + self.assertRaises(mechanize.BrowserStateError, b.find_link, nr=0) + + b = TestBrowser() + r = MockResponse(url, +"""<html> +<head><title>Title</title></head> +<body> +</body> +</html> +""", {"content-type": "text/html"}) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(url) + self.assertEqual(b.title(), "Title") + self.assertEqual(len(list(b.links())), 0) + self.assertEqual(len(list(b.forms())), 0) + self.assertRaises(ValueError, b.select_form) + self.assertRaises(mechanize.FormNotFoundError, b.select_form, + name="blah") + self.assertRaises(mechanize.FormNotFoundError, b.select_form, + predicate=lambda form: form is not b.global_form()) + self.assertRaises(mechanize.LinkNotFoundError, 
b.find_link, + name="blah") + self.assertRaises(mechanize.LinkNotFoundError, b.find_link, + predicate=lambda x: True) + + def test_forms(self): + for factory_class in FACTORY_CLASSES: + self._test_forms(factory_class()) + def _test_forms(self, factory): + import mechanize + url = "http://example.com" + + b = TestBrowser(factory=factory) + r = test_html_response( + url=url, + headers=[("content-type", "text/html")], + data="""\ +<html> +<head><title>Title</title></head> +<body> +<form name="form1"> + <input type="text"></input> + <input type="checkbox" name="cheeses" value="cheddar"></input> + <input type="checkbox" name="cheeses" value="edam"></input> + <input type="submit" name="one"></input> +</form> +<a href="http://example.com/foo/bar.html" name="apples"> +<form name="form2"> + <input type="submit" name="two"> +</form> +</body> +</html> +""" + ) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(url) + + forms = list(b.forms()) + self.assertEqual(len(forms), 2) + for got, expect in zip([f.name for f in forms], [ + "form1", "form2"]): + self.assertEqual(got, expect) + + self.assertRaises(mechanize.FormNotFoundError, b.select_form, "foo") + + # no form is set yet + self.assertRaises(AttributeError, getattr, b, "possible_items") + b.select_form("form1") + # now unknown methods are fed through to selected mechanize.HTMLForm + self.assertEqual( + [i.name for i in b.find_control("cheeses").items], + ["cheddar", "edam"]) + b["cheeses"] = ["cheddar", "edam"] + self.assertEqual(b.click_pairs(), [ + ("cheeses", "cheddar"), ("cheeses", "edam"), ("one", "")]) + + b.select_form(nr=1) + self.assertEqual(b.name, "form2") + self.assertEqual(b.click_pairs(), [("two", "")]) + + def test_link_encoding(self): + for factory_class in FACTORY_CLASSES: + self._test_link_encoding(factory_class()) + def _test_link_encoding(self, factory): + import mechanize + from mechanize._rfc3986 import clean_url + url = "http://example.com/" + for encoding in ["UTF-8", "latin-1"]: 
+ encoding_decl = "; charset=%s" % encoding + b = TestBrowser(factory=factory) + r = MockResponse(url, """\ +<a href="http://example.com/foo/bar——.html" + name="name0——">blah——</a> +""", #" +{"content-type": "text/html%s" % encoding_decl}) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(url) + + Link = mechanize.Link + try: + mdashx2 = u"\u2014".encode(encoding)*2 + except UnicodeError: + mdashx2 = '——' + qmdashx2 = clean_url(mdashx2, encoding) + # base_url, url, text, tag, attrs + exp = Link(url, "http://example.com/foo/bar%s.html" % qmdashx2, + "blah"+mdashx2, "a", + [("href", "http://example.com/foo/bar%s.html" % mdashx2), + ("name", "name0%s" % mdashx2)]) + # nr + link = b.find_link() +## print +## print exp +## print link + self.assertEqual(link, exp) + + def test_link_whitespace(self): + from mechanize import Link + for factory_class in FACTORY_CLASSES: + base_url = "http://example.com/" + url = " http://example.com/foo.html%20+ " + stripped_url = url.strip() + html = '<a href="%s"></a>' % url + b = TestBrowser(factory=factory_class()) + r = MockResponse(base_url, html, {"content-type": "text/html"}) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(base_url) + link = b.find_link(nr=0) + self.assertEqual( + link, + Link(base_url, stripped_url, "", "a", [("href", url)]) + ) + + def test_links(self): + for factory_class in FACTORY_CLASSES: + self._test_links(factory_class()) + def _test_links(self, factory): + import mechanize + from mechanize import Link + url = "http://example.com/" + + b = TestBrowser(factory=factory) + r = MockResponse(url, +"""<html> +<head><title>Title</title></head> +<body> +<a href="http://example.com/foo/bar.html" name="apples"></a> +<a name="pears"></a> +<a href="spam" name="pears"></a> +<area href="blah" name="foo"></area> +<form name="form2"> + <input type="submit" name="two"> +</form> +<frame name="name" href="href" src="src"></frame> +<iframe name="name2" href="href" src="src"></iframe> 
+<a name="name3" href="one">yada yada</a> +<a name="pears" href="two" weird="stuff">rhubarb</a> +<a></a> +<iframe src="foo"></iframe> +</body> +</html> +""", {"content-type": "text/html"}) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(url) + + exp_links = [ + # base_url, url, text, tag, attrs + Link(url, "http://example.com/foo/bar.html", "", "a", + [("href", "http://example.com/foo/bar.html"), + ("name", "apples")]), + Link(url, "spam", "", "a", [("href", "spam"), ("name", "pears")]), + Link(url, "blah", None, "area", + [("href", "blah"), ("name", "foo")]), + Link(url, "src", None, "frame", + [("name", "name"), ("href", "href"), ("src", "src")]), + Link(url, "src", None, "iframe", + [("name", "name2"), ("href", "href"), ("src", "src")]), + Link(url, "one", "yada yada", "a", + [("name", "name3"), ("href", "one")]), + Link(url, "two", "rhubarb", "a", + [("name", "pears"), ("href", "two"), ("weird", "stuff")]), + Link(url, "foo", None, "iframe", + [("src", "foo")]), + ] + links = list(b.links()) + self.assertEqual(len(links), len(exp_links)) + for got, expect in zip(links, exp_links): + self.assertEqual(got, expect) + # nr + l = b.find_link() + self.assertEqual(l.url, "http://example.com/foo/bar.html") + l = b.find_link(nr=1) + self.assertEqual(l.url, "spam") + # text + l = b.find_link(text="yada yada") + self.assertEqual(l.url, "one") + self.assertRaises(mechanize.LinkNotFoundError, + b.find_link, text="da ya") + l = b.find_link(text_regex=re.compile("da ya")) + self.assertEqual(l.url, "one") + l = b.find_link(text_regex="da ya") + self.assertEqual(l.url, "one") + # name + l = b.find_link(name="name3") + self.assertEqual(l.url, "one") + l = b.find_link(name_regex=re.compile("oo")) + self.assertEqual(l.url, "blah") + l = b.find_link(name_regex="oo") + self.assertEqual(l.url, "blah") + # url + l = b.find_link(url="spam") + self.assertEqual(l.url, "spam") + l = b.find_link(url_regex=re.compile("pam")) + self.assertEqual(l.url, "spam") + l = 
b.find_link(url_regex="pam") + self.assertEqual(l.url, "spam") + # tag + l = b.find_link(tag="area") + self.assertEqual(l.url, "blah") + # predicate + l = b.find_link(predicate= + lambda l: dict(l.attrs).get("weird") == "stuff") + self.assertEqual(l.url, "two") + # combinations + l = b.find_link(name="pears", nr=1) + self.assertEqual(l.text, "rhubarb") + l = b.find_link(url="src", nr=0, name="name2") + self.assertEqual(l.tag, "iframe") + self.assertEqual(l.url, "src") + self.assertRaises(mechanize.LinkNotFoundError, b.find_link, + url="src", nr=1, name="name2") + l = b.find_link(tag="a", predicate= + lambda l: dict(l.attrs).get("weird") == "stuff") + self.assertEqual(l.url, "two") + + # .links() + self.assertEqual(list(b.links(url="src")), [ + Link(url, url="src", text=None, tag="frame", + attrs=[("name", "name"), ("href", "href"), ("src", "src")]), + Link(url, url="src", text=None, tag="iframe", + attrs=[("name", "name2"), ("href", "href"), ("src", "src")]), + ]) + + def test_base_uri(self): + url = "http://example.com/" + + for html, urls in [ + ( +"""<base href="http://www.python.org/foo/"> +<a href="bar/baz.html"></a> +<a href="/bar/baz.html"></a> +<a href="http://example.com/bar %2f%2Fblah;/baz@~._-.html"></a> +""", + [ + "http://www.python.org/foo/bar/baz.html", + "http://www.python.org/bar/baz.html", + "http://example.com/bar%20%2f%2Fblah;/baz@~._-.html", + ]), + ( +"""<a href="bar/baz.html"></a> +<a href="/bar/baz.html"></a> +<a href="http://example.com/bar/baz.html"></a> +""", + [ + "http://example.com/bar/baz.html", + "http://example.com/bar/baz.html", + "http://example.com/bar/baz.html", + ] + ), + ]: + b = TestBrowser() + r = MockResponse(url, html, {"content-type": "text/html"}) + b.add_handler(make_mock_handler()([("http_open", r)])) + r = b.open(url) + self.assertEqual([link.absolute_url for link in b.links()], urls) + + def test_set_cookie(self): + class CookieTestBrowser(TestBrowser): + default_features = 
list(TestBrowser.default_features)+["_cookies"] + + # have to be visiting HTTP/HTTPS URL + url = "ftp://example.com/" + br = CookieTestBrowser() + r = mechanize.make_response( + "<html><head><title>Title</title></head><body></body></html>", + [("content-type", "text/html")], + url, + 200, "OK", + ) + br.add_handler(make_mock_handler()([("http_open", r)])) + handler = br._ua_handlers["_cookies"] + cj = handler.cookiejar + self.assertRaises(mechanize.BrowserStateError, + br.set_cookie, "foo=bar") + self.assertEqual(len(cj), 0) + + + url = "http://example.com/" + br = CookieTestBrowser() + r = mechanize.make_response( + "<html><head><title>Title</title></head><body></body></html>", + [("content-type", "text/html")], + url, + 200, "OK", + ) + br.add_handler(make_mock_handler()([("http_open", r)])) + handler = br._ua_handlers["_cookies"] + cj = handler.cookiejar + + # have to be visiting a URL + self.assertRaises(mechanize.BrowserStateError, + br.set_cookie, "foo=bar") + self.assertEqual(len(cj), 0) + + + # normal case + br.open(url) + br.set_cookie("foo=bar") + self.assertEqual(len(cj), 1) + self.assertEqual(cj._cookies["example.com"]["/"]["foo"].value, "bar") + + +class ResponseTests(TestCase): + + def test_set_response(self): + import copy + from mechanize import response_seek_wrapper + + br = TestBrowser() + url = "http://example.com/" + html = """<html><body><a href="spam">click me</a></body></html>""" + headers = {"content-type": "text/html"} + r = response_seek_wrapper(MockResponse(url, html, headers)) + br.add_handler(make_mock_handler()([("http_open", r)])) + + r = br.open(url) + self.assertEqual(r.read(), html) + r.seek(0) + self.assertEqual(copy.copy(r).read(), html) + self.assertEqual(list(br.links())[0].url, "spam") + + newhtml = """<html><body><a href="eggs">click me</a></body></html>""" + + r.set_data(newhtml) + self.assertEqual(r.read(), newhtml) + self.assertEqual(br.response().read(), html) + br.response().set_data(newhtml) + 
self.assertEqual(br.response().read(), html) + self.assertEqual(list(br.links())[0].url, "spam") + r.seek(0) + + br.set_response(r) + self.assertEqual(br.response().read(), newhtml) + self.assertEqual(list(br.links())[0].url, "eggs") + + def test_str(self): + import mimetools + from mechanize import _response + + br = TestBrowser() + self.assertEqual( + str(br), + "<TestBrowser (not visiting a URL)>" + ) + + fp = StringIO.StringIO('<html><form name="f"><input /></form></html>') + headers = mimetools.Message( + StringIO.StringIO("Content-type: text/html")) + response = _response.response_seek_wrapper( + _response.closeable_response( + fp, headers, "http://example.com/", 200, "OK")) + br.set_response(response) + self.assertEqual( + str(br), + "<TestBrowser visiting http://example.com/>" + ) + + br.select_form(nr=0) + self.assertEqual( + str(br), + """\ +<TestBrowser visiting http://example.com/ + selected form: + <f GET http://example.com/ application/x-www-form-urlencoded + <TextControl(<None>=)>> +>""") + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_cookie.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_cookie.py new file mode 100644 index 0000000000000000000000000000000000000000..4ac14492850155348f7b70a9f3bca9d875e44a6a --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_cookie.py @@ -0,0 +1,60 @@ +import mechanize._clientcookie +import mechanize._testcase + + +def cookie_args( + version=1, name="spam", value="eggs", + port="80", port_specified=True, + domain="example.com", domain_specified=False, domain_initial_dot=False, + path="/", path_specified=False, + secure=False, + expires=0, + discard=True, + comment=None, + comment_url=None, + rest={}, + rfc2109=False, + ): + return locals() + + +def make_cookie(*args, **kwds): + return mechanize._clientcookie.Cookie(**cookie_args(*args, **kwds)) + + +class Test(mechanize._testcase.TestCase): + + + def test_equality(self): + # not using 
assertNotEqual here since operator used varies across + # Python versions + self.assertEqual(make_cookie(), make_cookie()) + self.assertFalse(make_cookie(name="ham") == make_cookie()) + + def test_inequality(self): + # not using assertNotEqual here since operator used varies across + # Python versions + self.assertTrue(make_cookie(name="ham") != make_cookie()) + self.assertFalse(make_cookie() != make_cookie()) + + def test_all_state_included(self): + def non_equal_value(value): + if value is None: + new_value = "80" + elif isinstance(value, basestring): + new_value = value + "1" + elif isinstance(value, bool): + new_value = not value + elif isinstance(value, dict): + new_value = dict(value) + new_value["spam"] = "eggs" + elif isinstance(value, int): + new_value = value + 1 + else: + assert False, value + assert new_value != value, value + return new_value + cookie = make_cookie() + for arg, default_value in cookie_args().iteritems(): + new_value = non_equal_value(default_value) + self.assertNotEqual(make_cookie(**{arg: new_value}), cookie) diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_cookies.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_cookies.py new file mode 100644 index 0000000000000000000000000000000000000000..aae495e1b987d52cea3cb567e23d684dab771cde --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_cookies.py @@ -0,0 +1,1892 @@ +"""Tests for _clientcookie.""" + +import StringIO +import errno +import inspect +import mimetools +import os +import re +import sys +import tempfile +import time +import unittest + +import mechanize +from mechanize._util import hide_experimental_warnings, \ + reset_experimental_warnings +from mechanize import Request + + +class FakeResponse: + def __init__(self, headers=[], url=None): + """ + headers: list of RFC822-style 'Key: value' strings + """ + f = StringIO.StringIO("\n".join(headers)) + self._headers = mimetools.Message(f) + self._url = url + def info(self): return self._headers + +def interact_2965(cookiejar, 
url, *set_cookie_hdrs): + return _interact(cookiejar, url, set_cookie_hdrs, "Set-Cookie2") + +def interact_netscape(cookiejar, url, *set_cookie_hdrs): + return _interact(cookiejar, url, set_cookie_hdrs, "Set-Cookie") + +def _interact(cookiejar, url, set_cookie_hdrs, hdr_name): + """Perform a single request / response cycle, returning Cookie: header.""" + req = Request(url) + cookiejar.add_cookie_header(req) + cookie_hdr = req.get_header("Cookie", "") + headers = [] + for hdr in set_cookie_hdrs: + headers.append("%s: %s" % (hdr_name, hdr)) + res = FakeResponse(headers, url) + cookiejar.extract_cookies(res, req) + return cookie_hdr + + +class TempfileTestMixin: + + def setUp(self): + self._tempfiles = [] + + def tearDown(self): + for fn in self._tempfiles: + try: + os.remove(fn) + except IOError, exc: + if exc.errno != errno.ENOENT: + raise + + def mktemp(self): + fn = tempfile.mktemp() + self._tempfiles.append(fn) + return fn + + +def caller(): + return sys._getframe().f_back.f_back.f_code.co_name + +def attribute_names(obj): + return set([spec[0] for spec in inspect.getmembers(obj) + if not spec[0].startswith("__")]) + +class CookieJarInterfaceTests(unittest.TestCase): + + def test_add_cookie_header(self): + from mechanize import CookieJar + # verify only these methods are used + class MockRequest(object): + def __init__(self): + self.added_headers = [] + self.called = set() + def log_called(self): + self.called.add(caller()) + def get_full_url(self): + self.log_called() + return "https://example.com:443" + def get_host(self): + self.log_called() + return "example.com:443" + def get_type(self): + self.log_called() + return "https" + def has_header(self, header_name): + self.log_called() + return False + def get_header(self, header_name, default=None): + self.log_called() + pass # currently not called + def header_items(self): + self.log_called() + pass # currently not called + def add_unredirected_header(self, key, val): + self.log_called() + 
self.added_headers.append((key, val)) + def is_unverifiable(self): + self.log_called() + return False + jar = CookieJar() + interact_netscape(jar, "https://example.com:443", + "foo=bar; port=443; secure") + request = MockRequest() + jar.add_cookie_header(request) + expect_called = attribute_names(MockRequest) - set( + ["port", "get_header", "header_items", "log_called"]) + self.assertEquals(request.called, expect_called) + self.assertEquals(request.added_headers, [("Cookie", "foo=bar")]) + + def test_extract_cookies(self): + from mechanize import CookieJar + + # verify only these methods are used + + class StubMessage(object): + def getheaders(self, name): + return ["foo=bar; port=443"] + + class StubResponse(object): + def info(self): + return StubMessage() + + class StubRequest(object): + def __init__(self): + self.added_headers = [] + self.called = set() + def log_called(self): + self.called.add(caller()) + def get_full_url(self): + self.log_called() + return "https://example.com:443" + def get_host(self): + self.log_called() + return "example.com:443" + def is_unverifiable(self): + self.log_called() + return False + jar = CookieJar() + response = StubResponse() + request = StubRequest() + jar.extract_cookies(response, request) + expect_called = attribute_names(StubRequest) - set( + ["port", "log_called"]) + self.assertEquals(request.called, expect_called) + self.assertEquals([(cookie.name, cookie.value) for cookie in jar], + [("foo", "bar")]) + + def test_unverifiable(self): + from mechanize._clientcookie import request_is_unverifiable + # .unverifiable was added in mechanize, .is_unverifiable() later got + # added in cookielib. 
XXX deprecate .unverifiable + class StubRequest(object): + def __init__(self, attrs): + self._attrs = attrs + self.accessed = set() + def __getattr__(self, name): + self.accessed.add(name) + try: + return self._attrs[name] + except KeyError: + raise AttributeError(name) + + request = StubRequest(dict(is_unverifiable=lambda: False)) + self.assertEquals(request_is_unverifiable(request), False) + + request = StubRequest(dict(is_unverifiable=lambda: False, + unverifiable=True)) + self.assertEquals(request_is_unverifiable(request), False) + + request = StubRequest(dict(unverifiable=False)) + self.assertEquals(request_is_unverifiable(request), False) + + +class CookieTests(unittest.TestCase): + # XXX + # Get rid of string comparisons where not actually testing str / repr. + # .clear() etc. + # IP addresses like 50 (single number, no dot) and domain-matching + # functions (and is_HDN)? See draft RFC 2965 errata. + # Strictness switches + # is_third_party() + # unverifiability / third_party blocking + # Netscape cookies work the same as RFC 2965 with regard to port. + # Set-Cookie with negative max age. + # If turn RFC 2965 handling off, Set-Cookie2 cookies should not clobber + # Set-Cookie cookies. + # Cookie2 should be sent if *any* cookies are not V1 (ie. V0 OR V2 etc.). + # Cookies (V1 and V0) with no expiry date should be set to be discarded. + # RFC 2965 Quoting: + # Should accept unquoted cookie-attribute values? check errata draft. + # Which are required on the way in and out? + # Should always return quoted cookie-attribute values? + # Proper testing of when RFC 2965 clobbers Netscape (waiting for errata). + # Path-match on return (same for V0 and V1). + # RFC 2965 acceptance and returning rules + # Set-Cookie2 without version attribute is rejected. + + # Netscape peculiarities list from Ronald Tschalar. + # The first two still need tests, the rest are covered. 
+## - Quoting: only quotes around the expires value are recognized as such +## (and yes, some folks quote the expires value); quotes around any other +## value are treated as part of the value. +## - White space: white space around names and values is ignored +## - Default path: if no path parameter is given, the path defaults to the +## path in the request-uri up to, but not including, the last '/'. Note +## that this is entirely different from what the spec says. +## - Commas and other delimiters: Netscape just parses until the next ';'. +## This means it will allow commas etc inside values (and yes, both +## commas and equals are commonly appear in the cookie value). This also +## means that if you fold multiple Set-Cookie header fields into one, +## comma-separated list, it'll be a headache to parse (at least my head +## starts hurting everytime I think of that code). +## - Expires: You'll get all sorts of date formats in the expires, +## including emtpy expires attributes ("expires="). Be as flexible as you +## can, and certainly don't expect the weekday to be there; if you can't +## parse it, just ignore it and pretend it's a session cookie. +## - Domain-matching: Netscape uses the 2-dot rule for _all_ domains, not +## just the 7 special TLD's listed in their spec. And folks rely on +## that... 
+ + def test_policy(self): + import mechanize + policy = mechanize.DefaultCookiePolicy() + jar = mechanize.CookieJar() + jar.set_policy(policy) + self.assertEquals(jar.get_policy(), policy) + + def test_make_cookies_doesnt_change_jar_state(self): + from mechanize import CookieJar, Request, Cookie + from mechanize._util import time2netscape + from mechanize._response import test_response + cookie = Cookie(0, "spam", "eggs", + "80", False, + "example.com", False, False, + "/", False, + False, + None, + False, + "", + "", + {}) + jar = CookieJar() + jar._policy._now = jar._now = int(time.time()) + jar.set_cookie(cookie) + self.assertEquals(len(jar), 1) + set_cookie = "spam=eggs; expires=%s" % time2netscape(time.time()- 1000) + url = "http://example.com/" + response = test_response(url=url, headers=[("Set-Cookie", set_cookie)]) + jar.make_cookies(response, Request(url)) + self.assertEquals(len(jar), 1) + + def test_domain_return_ok(self): + # test optimization: .domain_return_ok() should filter out most + # domains in the CookieJar before we try to access them (because that + # may require disk access -- in particular, with MSIECookieJar) + # This is only a rough check for performance reasons, so it's not too + # critical as long as it's sufficiently liberal. 
+ import mechanize + pol = mechanize.DefaultCookiePolicy() + for url, domain, ok in [ + ("http://foo.bar.com/", "blah.com", False), + ("http://foo.bar.com/", "rhubarb.blah.com", False), + ("http://foo.bar.com/", "rhubarb.foo.bar.com", False), + ("http://foo.bar.com/", ".foo.bar.com", True), + ("http://foo.bar.com/", "foo.bar.com", True), + ("http://foo.bar.com/", ".bar.com", True), + ("http://foo.bar.com/", "com", True), + ("http://foo.com/", "rhubarb.foo.com", False), + ("http://foo.com/", ".foo.com", True), + ("http://foo.com/", "foo.com", True), + ("http://foo.com/", "com", True), + ("http://foo/", "rhubarb.foo", False), + ("http://foo/", ".foo", True), + ("http://foo/", "foo", True), + ("http://foo/", "foo.local", True), + ("http://foo/", ".local", True), + ]: + request = mechanize.Request(url) + r = pol.domain_return_ok(domain, request) + if ok: self.assert_(r) + else: self.assert_(not r) + + def test_missing_name(self): + from mechanize import MozillaCookieJar, lwp_cookie_str + + # missing = sign in Cookie: header is regarded by Mozilla as a missing + # NAME. WE regard it as a missing VALUE. + filename = tempfile.mktemp() + c = MozillaCookieJar(filename) + interact_netscape(c, "http://www.acme.com/", 'eggs') + interact_netscape(c, "http://www.acme.com/", '"spam"; path=/foo/') + cookie = c._cookies["www.acme.com"]["/"]['eggs'] + assert cookie.name == "eggs" + assert cookie.value is None + cookie = c._cookies["www.acme.com"]['/foo/']['"spam"'] + assert cookie.name == '"spam"' + assert cookie.value is None + assert lwp_cookie_str(cookie) == ( + r'"spam"; path="/foo/"; domain="www.acme.com"; ' + 'path_spec; discard; version=0') + old_str = repr(c) + c.save(ignore_expires=True, ignore_discard=True) + try: + c = MozillaCookieJar(filename) + c.revert(ignore_expires=True, ignore_discard=True) + finally: + os.unlink(c.filename) + # cookies unchanged apart from lost info re. 
whether path was specified + assert repr(c) == \ + re.sub("path_specified=%s" % True, "path_specified=%s" % False, + old_str) + assert interact_netscape(c, "http://www.acme.com/foo/") == \ + '"spam"; eggs' + + def test_rfc2109_handling(self): + # 2109 cookies have rfc2109 attr set correctly, and are handled + # as 2965 or Netscape cookies depending on policy settings + from mechanize import CookieJar, DefaultCookiePolicy + + for policy, version in [ + (DefaultCookiePolicy(), 0), + (DefaultCookiePolicy(rfc2965=True), 1), + (DefaultCookiePolicy(rfc2109_as_netscape=True), 0), + (DefaultCookiePolicy(rfc2965=True, rfc2109_as_netscape=True), 0), + ]: + c = CookieJar(policy) + interact_netscape(c, "http://www.example.com/", "ni=ni; Version=1") + cookie = c._cookies["www.example.com"]["/"]["ni"] + self.assert_(cookie.rfc2109) + self.assertEqual(cookie.version, version) + + def test_ns_parser(self): + from mechanize import CookieJar + from mechanize._clientcookie import DEFAULT_HTTP_PORT + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/", + 'spam=eggs; DoMain=.acme.com; port; blArgh="feep"') + interact_netscape(c, "http://www.acme.com/", 'ni=ni; port=80,8080') + interact_netscape(c, "http://www.acme.com:80/", 'nini=ni') + interact_netscape(c, "http://www.acme.com:80/", 'foo=bar; expires=') + interact_netscape(c, "http://www.acme.com:80/", 'spam=eggs; ' + 'expires="Foo Bar 25 33:22:11 3022"') + + cookie = c._cookies[".acme.com"]["/"]["spam"] + assert cookie.domain == ".acme.com" + assert cookie.domain_specified + assert cookie.port == DEFAULT_HTTP_PORT + assert not cookie.port_specified + # case is preserved + assert (cookie.has_nonstandard_attr("blArgh") and + not cookie.has_nonstandard_attr("blargh")) + + cookie = c._cookies["www.acme.com"]["/"]["ni"] + assert cookie.domain == "www.acme.com" + assert not cookie.domain_specified + assert cookie.port == "80,8080" + assert cookie.port_specified + + cookie = c._cookies["www.acme.com"]["/"]["nini"] + assert 
cookie.port is None + assert not cookie.port_specified + + # invalid expires should not cause cookie to be dropped + foo = c._cookies["www.acme.com"]["/"]["foo"] + spam = c._cookies["www.acme.com"]["/"]["foo"] + assert foo.expires is None + assert spam.expires is None + + def test_ns_parser_special_names(self): + # names such as 'expires' are not special in first name=value pair + # of Set-Cookie: header + from mechanize import CookieJar + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/", 'expires=eggs') + interact_netscape(c, "http://www.acme.com/", 'version=eggs; spam=eggs') + + cookies = c._cookies["www.acme.com"]["/"] + self.assert_(cookies.has_key('expires')) + self.assert_(cookies.has_key('version')) + + def test_expires(self): + from mechanize._util import time2netscape + from mechanize import CookieJar + + # if expires is in future, keep cookie... + c = CookieJar() + future = time2netscape(time.time()+3600) + interact_netscape(c, "http://www.acme.com/", 'spam="bar"; expires=%s' % + future) + assert len(c) == 1 + now = time2netscape(time.time()-1) + # ... 
and if in past or present, discard it + interact_netscape(c, "http://www.acme.com/", 'foo="eggs"; expires=%s' % + now) + h = interact_netscape(c, "http://www.acme.com/") + assert len(c) == 1 + assert h.find('spam="bar"') != -1 and h.find("foo") == -1 + + # max-age takes precedence over expires, and zero max-age is request to + # delete both new cookie and any old matching cookie + interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; expires=%s' % + future) + interact_netscape(c, "http://www.acme.com/", 'bar="bar"; expires=%s' % + future) + assert len(c) == 3 + interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; ' + 'expires=%s; max-age=0' % future) + interact_netscape(c, "http://www.acme.com/", 'bar="bar"; ' + 'max-age=0; expires=%s' % future) + h = interact_netscape(c, "http://www.acme.com/") + assert len(c) == 1 + + # test expiry at end of session for cookies with no expires attribute + interact_netscape(c, "http://www.rhubarb.net/", 'whum="fizz"') + assert len(c) == 2 + c.clear_session_cookies() + assert len(c) == 1 + assert h.find('spam="bar"') != -1 + + # XXX RFC 2965 expiry rules (some apply to V0 too) + + def test_default_path(self): + from mechanize import CookieJar, DefaultCookiePolicy + + # RFC 2965 + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/", 'spam="bar"; Version="1"') + assert c._cookies["www.acme.com"].has_key("/") + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah", 'eggs="bar"; Version="1"') + assert c._cookies["www.acme.com"].has_key("/") + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah/rhubarb", + 'eggs="bar"; Version="1"') + assert c._cookies["www.acme.com"].has_key("/blah/") + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah/rhubarb/", + 'eggs="bar"; Version="1"') + assert c._cookies["www.acme.com"].has_key("/blah/rhubarb/") + + # Netscape + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/", 'spam="bar"') + 
assert c._cookies["www.acme.com"].has_key("/") + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah", 'eggs="bar"') + assert c._cookies["www.acme.com"].has_key("/") + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah/rhubarb", 'eggs="bar"') + assert c._cookies["www.acme.com"].has_key("/blah") + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah/rhubarb/", 'eggs="bar"') + assert c._cookies["www.acme.com"].has_key("/blah/rhubarb") + + def test_default_path_with_query(self): + cj = mechanize.CookieJar() + uri = "http://example.com/?spam/eggs" + value = 'eggs="bar"' + interact_netscape(cj, uri, value) + # default path does not include query, so is "/", not "/?spam" + self.assertIn("/", cj._cookies["example.com"]) + # cookie is sent back to the same URI + self.assertEqual(interact_netscape(cj, uri), value) + + def test_escape_path(self): + from mechanize._clientcookie import escape_path + cases = [ + # quoted safe + ("/foo%2f/bar", "/foo%2F/bar"), + ("/foo%2F/bar", "/foo%2F/bar"), + # quoted % + ("/foo%%/bar", "/foo%%/bar"), + # quoted unsafe + ("/fo%19o/bar", "/fo%19o/bar"), + ("/fo%7do/bar", "/fo%7Do/bar"), + # unquoted safe + ("/foo/bar&", "/foo/bar&"), + ("/foo//bar", "/foo//bar"), + ("\176/foo/bar", "\176/foo/bar"), + # unquoted unsafe + ("/foo\031/bar", "/foo%19/bar"), + ("/\175foo/bar", "/%7Dfoo/bar"), + # unicode + (u"/foo/bar\uabcd", "/foo/bar%EA%AF%8D"), # UTF-8 encoded + ] + for arg, result in cases: + self.assert_(escape_path(arg) == result) + + def test_request_path(self): + from mechanize._clientcookie import request_path + # with parameters + req = Request("http://www.example.com/rheum/rhaponticum;" + "foo=bar;sing=song?apples=pears&spam=eggs#ni") + self.assertEquals(request_path(req), + "/rheum/rhaponticum;foo=bar;sing=song") + # without parameters + req = Request("http://www.example.com/rheum/rhaponticum?" 
+ "apples=pears&spam=eggs#ni") + self.assertEquals(request_path(req), "/rheum/rhaponticum") + # missing final slash + req = Request("http://www.example.com") + self.assert_(request_path(req) == "/") + + def test_request_port(self): + from mechanize._clientcookie import request_port, DEFAULT_HTTP_PORT + req = Request("http://www.acme.com:1234/", + headers={"Host": "www.acme.com:4321"}) + assert request_port(req) == "1234" + req = Request("http://www.acme.com/", + headers={"Host": "www.acme.com:4321"}) + assert request_port(req) == DEFAULT_HTTP_PORT + + def test_request_host_lc(self): + from mechanize._clientcookie import request_host_lc + # this request is illegal (RFC2616, 14.2.3) + req = Request("http://1.1.1.1/", + headers={"Host": "www.acme.com:80"}) + # libwww-perl wants this response, but that seems wrong (RFC 2616, + # section 5.2, point 1., and RFC 2965 section 1, paragraph 3) + #assert request_host_lc(req) == "www.acme.com" + assert request_host_lc(req) == "1.1.1.1" + req = Request("http://www.acme.com/", + headers={"Host": "irrelevant.com"}) + assert request_host_lc(req) == "www.acme.com" + # not actually sure this one is valid Request object, so maybe should + # remove test for no host in url in request_host_lc function? 
+ req = Request("/resource.html", + headers={"Host": "www.acme.com"}) + assert request_host_lc(req) == "www.acme.com" + # port shouldn't be in request-host + req = Request("http://www.acme.com:2345/resource.html", + headers={"Host": "www.acme.com:5432"}) + assert request_host_lc(req) == "www.acme.com" + # the _lc function lower-cases the result + req = Request("http://EXAMPLE.com") + assert request_host_lc(req) == "example.com" + + def test_effective_request_host(self): + from mechanize import effective_request_host + self.assertEquals( + effective_request_host(Request("http://www.EXAMPLE.com/spam")), + "www.EXAMPLE.com") + self.assertEquals( + effective_request_host(Request("http://bob/spam")), + "bob.local") + + def test_is_HDN(self): + from mechanize._clientcookie import is_HDN + assert is_HDN("foo.bar.com") + assert is_HDN("1foo2.3bar4.5com") + assert not is_HDN("192.168.1.1") + assert not is_HDN("") + assert not is_HDN(".") + assert not is_HDN(".foo.bar.com") + assert not is_HDN("..foo") + assert not is_HDN("foo.") + + def test_reach(self): + from mechanize._clientcookie import reach + assert reach("www.acme.com") == ".acme.com" + assert reach("acme.com") == "acme.com" + assert reach("acme.local") == ".local" + assert reach(".local") == ".local" + assert reach(".com") == ".com" + assert reach(".") == "." 
+ assert reach("") == "" + assert reach("192.168.0.1") == "192.168.0.1" + + def test_domain_match(self): + from mechanize._clientcookie import domain_match, user_domain_match + assert domain_match("192.168.1.1", "192.168.1.1") + assert not domain_match("192.168.1.1", ".168.1.1") + assert domain_match("x.y.com", "x.Y.com") + assert domain_match("x.y.com", ".Y.com") + assert not domain_match("x.y.com", "Y.com") + assert domain_match("a.b.c.com", ".c.com") + assert not domain_match(".c.com", "a.b.c.com") + assert domain_match("example.local", ".local") + assert not domain_match("blah.blah", "") + assert not domain_match("", ".rhubarb.rhubarb") + assert domain_match("", "") + + assert user_domain_match("acme.com", "acme.com") + assert not user_domain_match("acme.com", ".acme.com") + assert user_domain_match("rhubarb.acme.com", ".acme.com") + assert user_domain_match("www.rhubarb.acme.com", ".acme.com") + assert user_domain_match("x.y.com", "x.Y.com") + assert user_domain_match("x.y.com", ".Y.com") + assert not user_domain_match("x.y.com", "Y.com") + assert user_domain_match("y.com", "Y.com") + assert not user_domain_match(".y.com", "Y.com") + assert user_domain_match(".y.com", ".Y.com") + assert user_domain_match("x.y.com", ".com") + assert not user_domain_match("x.y.com", "com") + assert not user_domain_match("x.y.com", "m") + assert not user_domain_match("x.y.com", ".m") + assert not user_domain_match("x.y.com", "") + assert not user_domain_match("x.y.com", ".") + assert user_domain_match("192.168.1.1", "192.168.1.1") + # not both HDNs, so must string-compare equal to match + assert not user_domain_match("192.168.1.1", ".168.1.1") + assert not user_domain_match("192.168.1.1", ".") + # empty string is a special case + assert not user_domain_match("192.168.1.1", "") + + def test_wrong_domain(self): + """Cookies whose ERH does not domain-match the domain are rejected. + + ERH = effective request-host. 
+ + """ + # XXX far from complete + from mechanize import CookieJar + c = CookieJar() + interact_2965(c, "http://www.nasty.com/", 'foo=bar; domain=friendly.org; Version="1"') + assert len(c) == 0 + + def test_strict_domain(self): + # Cookies whose domain is a country-code tld like .co.uk should + # not be set if CookiePolicy.strict_domain is true. + from mechanize import CookieJar, DefaultCookiePolicy + + cp = DefaultCookiePolicy(strict_domain=True) + cj = CookieJar(policy=cp) + interact_netscape(cj, "http://example.co.uk/", 'no=problemo') + interact_netscape(cj, "http://example.co.uk/", + 'okey=dokey; Domain=.example.co.uk') + self.assertEquals(len(cj), 2) + for pseudo_tld in [".co.uk", ".org.za", ".tx.us", ".name.us"]: + interact_netscape(cj, "http://example.%s/" % pseudo_tld, + 'spam=eggs; Domain=.co.uk') + self.assertEquals(len(cj), 2) + # XXXX This should be compared with the Konqueror (kcookiejar.cpp) and + # Mozilla implementations. + + def test_two_component_domain_ns(self): + # Netscape: .www.bar.com, www.bar.com, .bar.com, bar.com, no domain should + # all get accepted, as should .acme.com, acme.com and no domain for + # 2-component domains like acme.com. + from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar() + + # two-component V0 domain is OK + interact_netscape(c, "http://foo.net/", 'ns=bar') + assert len(c) == 1 + assert c._cookies["foo.net"]["/"]["ns"].value == "bar" + assert interact_netscape(c, "http://foo.net/") == "ns=bar" + # *will* be returned to any other domain (unlike RFC 2965)... 
+ assert interact_netscape(c, "http://www.foo.net/") == "ns=bar" + # ...unless requested otherwise + pol = DefaultCookiePolicy( + strict_ns_domain=DefaultCookiePolicy.DomainStrictNonDomain) + c.set_policy(pol) + assert interact_netscape(c, "http://www.foo.net/") == "" + + # unlike RFC 2965, even explicit two-component domain is OK, + # because .foo.net matches foo.net + interact_netscape(c, "http://foo.net/foo/", + 'spam1=eggs; domain=foo.net') + # even if starts with a dot -- in NS rules, .foo.net matches foo.net! + interact_netscape(c, "http://foo.net/foo/bar/", + 'spam2=eggs; domain=.foo.net') + assert len(c) == 3 + assert c._cookies[".foo.net"]["/foo"]["spam1"].value == "eggs" + assert c._cookies[".foo.net"]["/foo/bar"]["spam2"].value == "eggs" + assert interact_netscape(c, "http://foo.net/foo/bar/") == \ + "spam2=eggs; spam1=eggs; ns=bar" + + # top-level domain is too general + interact_netscape(c, "http://foo.net/", 'nini="ni"; domain=.net') + assert len(c) == 3 + +## # Netscape protocol doesn't allow non-special top level domains (such +## # as co.uk) in the domain attribute unless there are at least three +## # dots in it. + # Oh yes it does! Real implementations don't check this, and real + # cookies (of course) rely on that behaviour. 
+ interact_netscape(c, "http://foo.co.uk", 'nasty=trick; domain=.co.uk') +## assert len(c) == 2 + assert len(c) == 4 + + def test_two_component_domain_rfc2965(self): + from mechanize import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + c = CookieJar(pol) + + # two-component V1 domain is OK + interact_2965(c, "http://foo.net/", 'foo=bar; Version="1"') + assert len(c) == 1 + assert c._cookies["foo.net"]["/"]["foo"].value == "bar" + assert interact_2965(c, "http://foo.net/") == "$Version=1; foo=bar" + # won't be returned to any other domain (because domain was implied) + assert interact_2965(c, "http://www.foo.net/") == "" + + # unless domain is given explicitly, because then it must be + # rewritten to start with a dot: foo.net --> .foo.net, which does + # not domain-match foo.net + interact_2965(c, "http://foo.net/foo", + 'spam=eggs; domain=foo.net; path=/foo; Version="1"') + assert len(c) == 1 + assert interact_2965(c, "http://foo.net/foo") == "$Version=1; foo=bar" + + # explicit foo.net from three-component domain www.foo.net *does* get + # set, because .foo.net domain-matches .foo.net + interact_2965(c, "http://www.foo.net/foo/", + 'spam=eggs; domain=foo.net; Version="1"') + assert c._cookies[".foo.net"]["/foo/"]["spam"].value == "eggs" + assert len(c) == 2 + assert interact_2965(c, "http://foo.net/foo/") == "$Version=1; foo=bar" + assert interact_2965(c, "http://www.foo.net/foo/") == \ + '$Version=1; spam=eggs; $Domain="foo.net"' + + # top-level domain is too general + interact_2965(c, "http://foo.net/", + 'ni="ni"; domain=".net"; Version="1"') + assert len(c) == 2 + + # RFC 2965 doesn't require blocking this + interact_2965(c, "http://foo.co.uk/", + 'nasty=trick; domain=.co.uk; Version="1"') + assert len(c) == 3 + + def test_domain_allow(self): + from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar(policy=DefaultCookiePolicy( + blocked_domains=["acme.com"], + allowed_domains=["www.acme.com"])) + + req = 
Request("http://acme.com/") + headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] + res = FakeResponse(headers, "http://acme.com/") + c.extract_cookies(res, req) + assert len(c) == 0 + + req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + assert len(c) == 1 + + req = Request("http://www.coyote.com/") + res = FakeResponse(headers, "http://www.coyote.com/") + c.extract_cookies(res, req) + assert len(c) == 1 + + # set a cookie with non-allowed domain... + req = Request("http://www.coyote.com/") + res = FakeResponse(headers, "http://www.coyote.com/") + cookies = c.make_cookies(res, req) + c.set_cookie(cookies[0]) + assert len(c) == 2 + # ... and check is doesn't get returned + c.add_cookie_header(req) + assert not req.has_header("Cookie") + + def test_domain_block(self): + from mechanize import CookieJar, DefaultCookiePolicy + + #import logging; logging.getLogger("mechanize").setLevel(logging.DEBUG) + + pol = DefaultCookiePolicy( + rfc2965=True, blocked_domains=[".acme.com"]) + c = CookieJar(policy=pol) + headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] + + req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + assert len(c) == 0 + + pol.set_blocked_domains(["acme.com"]) + c.extract_cookies(res, req) + assert len(c) == 1 + + c.clear() + req = Request("http://www.roadrunner.net/") + res = FakeResponse(headers, "http://www.roadrunner.net/") + c.extract_cookies(res, req) + assert len(c) == 1 + req = Request("http://www.roadrunner.net/") + c.add_cookie_header(req) + assert (req.has_header("Cookie") and + req.has_header("Cookie2")) + + c.clear() + pol.set_blocked_domains([".acme.com"]) + c.extract_cookies(res, req) + assert len(c) == 1 + + # set a cookie with blocked domain... 
+ req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + cookies = c.make_cookies(res, req) + c.set_cookie(cookies[0]) + assert len(c) == 2 + # ... and check it doesn't get returned + c.add_cookie_header(req) + assert not req.has_header("Cookie") + + def test_secure(self): + from mechanize import CookieJar, DefaultCookiePolicy + + for ns in True, False: + for whitespace in " ", "": + c = CookieJar() + if ns: + pol = DefaultCookiePolicy(rfc2965=False) + int = interact_netscape + vs = "" + else: + pol = DefaultCookiePolicy(rfc2965=True) + int = interact_2965 + vs = "; Version=1" + c.set_policy(pol) + url = "http://www.acme.com/" + int(c, url, "foo1=bar%s%s" % (vs, whitespace)) + int(c, url, "foo2=bar%s; secure%s" % (vs, whitespace)) + assert not c._cookies["www.acme.com"]["/"]["foo1"].secure, \ + "non-secure cookie registered secure" + assert c._cookies["www.acme.com"]["/"]["foo2"].secure, \ + "secure cookie registered non-secure" + + def test_quote_cookie_value(self): + from mechanize import CookieJar, DefaultCookiePolicy + c = CookieJar(policy=DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://www.acme.com/", r'foo=\b"a"r; Version=1') + h = interact_2965(c, "http://www.acme.com/") + assert h == r'$Version=1; foo=\\b\"a\"r' + + def test_missing_final_slash(self): + # Missing slash from request URL's abs_path should be assumed present. 
+ from mechanize import CookieJar, Request, DefaultCookiePolicy + url = "http://www.acme.com" + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, url, "foo=bar; Version=1") + req = Request(url) + assert len(c) == 1 + c.add_cookie_header(req) + assert req.has_header("Cookie") + + def test_domain_mirror(self): + from mechanize import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + assert h.find( "Domain") == -1, \ + "absent domain returned with domain present" + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Domain=.bar.com') + h = interact_2965(c, url) + assert h.find('$Domain=".bar.com"') != -1, \ + "domain not returned" + + c = CookieJar(pol) + url = "http://foo.bar.com/" + # note missing initial dot in Domain + interact_2965(c, url, 'spam=eggs; Version=1; Domain=bar.com') + h = interact_2965(c, url) + assert h.find('$Domain="bar.com"') != -1, \ + "domain not returned" + + def test_path_mirror(self): + from mechanize import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + assert h.find("Path") == -1, \ + "absent path returned with path present" + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Path=/') + h = interact_2965(c, url) + assert h.find('$Path="/"') != -1, "path not returned" + + def test_port_mirror(self): + from mechanize import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + assert h.find("Port") == -1, \ + "absent port returned with port present" + + c = CookieJar(pol) + 
url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1; Port") + h = interact_2965(c, url) + assert re.search("\$Port([^=]|$)", h), \ + "port with no value not returned with no value" + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Port="80"') + h = interact_2965(c, url) + assert h.find('$Port="80"') != -1, \ + "port with single value not returned with single value" + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Port="80,8080"') + h = interact_2965(c, url) + assert h.find('$Port="80,8080"') != -1, \ + "port with multiple values not returned with multiple values" + + def test_no_return_comment(self): + from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; ' + 'Comment="does anybody read these?"; ' + 'CommentURL="http://foo.bar.net/comment.html"') + h = interact_2965(c, url) + assert h.find("Comment") == -1, \ + "Comment or CommentURL cookie-attributes returned to server" + +# just pondering security here -- this isn't really a test (yet) +## def test_hack(self): +## from mechanize import CookieJar + +## c = CookieJar() +## interact_netscape(c, "http://victim.mall.com/", +## 'prefs="foo"') +## interact_netscape(c, "http://cracker.mall.com/", +## 'prefs="bar"; Domain=.mall.com') +## interact_netscape(c, "http://cracker.mall.com/", +## '$Version="1"; Domain=.mall.com') +## h = interact_netscape(c, "http://victim.mall.com/") +## print h + + def test_Cookie_iterator(self): + from mechanize import CookieJar, Cookie, DefaultCookiePolicy + + cs = CookieJar(DefaultCookiePolicy(rfc2965=True)) + # add some random cookies + interact_2965(cs, "http://blah.spam.org/", 'foo=eggs; Version=1; ' + 'Comment="does anybody read these?"; ' + 'CommentURL="http://foo.bar.net/comment.html"') + interact_netscape(cs, 
"http://www.acme.com/blah/", "spam=bar; secure") + interact_2965(cs, "http://www.acme.com/blah/", "foo=bar; secure; Version=1") + interact_2965(cs, "http://www.acme.com/blah/", "foo=bar; path=/; Version=1") + interact_2965(cs, "http://www.sol.no", + r'bang=wallop; version=1; domain=".sol.no"; ' + r'port="90,100, 80,8080"; ' + r'max-age=100; Comment = "Just kidding! (\"|\\\\) "') + + versions = [1, 1, 1, 0, 1] + names = ["bang", "foo", "foo", "spam", "foo"] + domains = [".sol.no", "blah.spam.org", "www.acme.com", + "www.acme.com", "www.acme.com"] + paths = ["/", "/", "/", "/blah", "/blah/"] + + # sequential iteration + for i in range(4): + i = 0 + for c in cs: + assert isinstance(c, Cookie) + assert c.version == versions[i] + assert c.name == names[i] + assert c.domain == domains[i] + assert c.path == paths[i] + i = i + 1 + + self.assertRaises(IndexError, lambda cs=cs : cs[5]) + + # can't skip + cs[0] + cs[1] + self.assertRaises(IndexError, lambda cs=cs : cs[3]) + + # can't go backwards + cs[0] + cs[1] + cs[2] + self.assertRaises(IndexError, lambda cs=cs : cs[1]) + + def test_parse_ns_headers(self): + from mechanize._headersutil import parse_ns_headers + + # missing domain value (invalid cookie) + assert parse_ns_headers(["foo=bar; path=/; domain"]) == [ + [("foo", "bar"), + ("path", "/"), ("domain", None), ("version", "0")]] + # invalid expires value + assert parse_ns_headers( + ["foo=bar; expires=Foo Bar 12 33:22:11 2000"]) == \ + [[("foo", "bar"), ("expires", None), ("version", "0")]] + # missing cookie name (valid cookie) + assert parse_ns_headers(["foo"]) == [[("foo", None), ("version", "0")]] + # shouldn't add version if header is empty + assert parse_ns_headers([""]) == [] + + def test_bad_cookie_header(self): + + def cookiejar_from_cookie_headers(headers): + from mechanize import CookieJar, Request + c = CookieJar() + req = Request("http://www.example.com/") + r = FakeResponse(headers, "http://www.example.com/") + c.extract_cookies(r, req) + return c + + # 
none of these bad headers should cause an exception to be raised + for headers in [ + ["Set-Cookie: "], # actually, nothing wrong with this + ["Set-Cookie2: "], # ditto + # missing domain value + ["Set-Cookie2: a=foo; path=/; Version=1; domain"], + # bad max-age + ["Set-Cookie: b=foo; max-age=oops"], + # bad version + ["Set-Cookie: b=foo; version=spam"], + ]: + c = cookiejar_from_cookie_headers(headers) + # these bad cookies shouldn't be set + assert len(c) == 0 + + # cookie with invalid expires is treated as session cookie + headers = ["Set-Cookie: c=foo; expires=Foo Bar 12 33:22:11 2000"] + c = cookiejar_from_cookie_headers(headers) + cookie = c._cookies["www.example.com"]["/"]["c"] + assert cookie.expires is None + + def test_cookies_for_request(self): + from mechanize import CookieJar, Request + + cj = CookieJar() + interact_netscape(cj, "http://example.com/", "short=path") + interact_netscape(cj, "http://example.com/longer/path", "longer=path") + for_short_path = cj.cookies_for_request(Request("http://example.com/")) + self.assertEquals([cookie.name for cookie in for_short_path], + ["short"]) + for_long_path = cj.cookies_for_request(Request( + "http://example.com/longer/path")) + self.assertEquals([cookie.name for cookie in for_long_path], + ["longer", "short"]) + + +class CookieJarPersistenceTests(TempfileTestMixin, unittest.TestCase): + + def _interact(self, cj): + year_plus_one = time.localtime(time.time())[0] + 1 + interact_2965(cj, "http://www.acme.com/", + "foo1=bar; max-age=100; Version=1") + interact_2965(cj, "http://www.acme.com/", + 'foo2=bar; port="80"; max-age=100; Discard; Version=1') + interact_2965(cj, "http://www.acme.com/", "foo3=bar; secure; Version=1") + + expires = "expires=09-Nov-%d 23:12:40 GMT" % (year_plus_one,) + interact_netscape(cj, "http://www.foo.com/", + "fooa=bar; %s" % expires) + interact_netscape(cj, "http://www.foo.com/", + "foob=bar; Domain=.foo.com; %s" % expires) + interact_netscape(cj, "http://www.foo.com/", + "fooc=bar; 
Domain=www.foo.com; %s" % expires) + + def test_firefox3_cookiejar_restore(self): + try: + from mechanize import Firefox3CookieJar + except ImportError: + pass + else: + from mechanize import DefaultCookiePolicy + filename = self.mktemp() + def create_cookiejar(): + hide_experimental_warnings() + try: + cj = Firefox3CookieJar( + filename, policy=DefaultCookiePolicy(rfc2965=True)) + finally: + reset_experimental_warnings() + cj.connect() + return cj + cj = create_cookiejar() + self._interact(cj) + self.assertEquals(len(cj), 6) + cj.close() + cj = create_cookiejar() + self.assert_("name='foo1', value='bar'" in repr(cj)) + self.assertEquals(len(cj), 4) + + def test_firefox3_cookiejar_iteration(self): + try: + from mechanize import Firefox3CookieJar + except ImportError: + pass + else: + from mechanize import DefaultCookiePolicy + filename = self.mktemp() + hide_experimental_warnings() + try: + cj = Firefox3CookieJar( + filename, policy=DefaultCookiePolicy(rfc2965=True)) + finally: + reset_experimental_warnings() + cj.connect() + self._interact(cj) + summary = "\n".join([str(cookie) for cookie in cj]) + self.assertEquals(summary, + """\ +<Cookie foo2=bar for www.acme.com:80/> +<Cookie foo3=bar for www.acme.com/> +<Cookie foo1=bar for www.acme.com/> +<Cookie fooa=bar for www.foo.com/> +<Cookie foob=bar for .foo.com/> +<Cookie fooc=bar for .www.foo.com/>""") + + def test_firefox3_cookiejar_clear(self): + try: + from mechanize import Firefox3CookieJar + except ImportError: + pass + else: + from mechanize import DefaultCookiePolicy + filename = self.mktemp() + hide_experimental_warnings() + try: + cj = Firefox3CookieJar( + filename, policy=DefaultCookiePolicy(rfc2965=True)) + finally: + reset_experimental_warnings() + cj.connect() + self._interact(cj) + cj.clear("www.acme.com", "/", "foo2") + def summary(): return "\n".join([str(cookie) for cookie in cj]) + self.assertEquals(summary(), + """\ +<Cookie foo3=bar for www.acme.com/> +<Cookie foo1=bar for www.acme.com/> 
+<Cookie fooa=bar for www.foo.com/> +<Cookie foob=bar for .foo.com/> +<Cookie fooc=bar for .www.foo.com/>""") + cj.clear("www.acme.com") + self.assertEquals(summary(), + """\ +<Cookie fooa=bar for www.foo.com/> +<Cookie foob=bar for .foo.com/> +<Cookie fooc=bar for .www.foo.com/>""") + # if name is given, so must path and domain + self.assertRaises(ValueError, cj.clear, domain=".foo.com", + name="foob") + # nonexistent domain + self.assertRaises(KeyError, cj.clear, domain=".spam.com") + + def test_firefox3_cookiejar_add_cookie_header(self): + try: + from mechanize import Firefox3CookieJar + except ImportError: + pass + else: + filename = self.mktemp() + hide_experimental_warnings() + try: + cj = Firefox3CookieJar(filename) + finally: + reset_experimental_warnings() + cj.connect() + # Session cookies (true .discard) and persistent cookies (false + # .discard) are stored differently. Check they both get sent. + year_plus_one = time.localtime(time.time())[0] + 1 + expires = "expires=09-Nov-%d 23:12:40 GMT" % (year_plus_one,) + interact_netscape(cj, "http://www.foo.com/", "fooa=bar") + interact_netscape(cj, "http://www.foo.com/", + "foob=bar; %s" % expires) + ca, cb = cj + self.assert_(ca.discard) + self.assertFalse(cb.discard) + request = Request("http://www.foo.com/") + cj.add_cookie_header(request) + self.assertEquals(request.get_header("Cookie"), + "fooa=bar; foob=bar") + + def test_mozilla_cookiejar(self): + # Save / load Mozilla/Netscape cookie file format. 
+ from mechanize import MozillaCookieJar, DefaultCookiePolicy + filename = tempfile.mktemp() + c = MozillaCookieJar(filename, + policy=DefaultCookiePolicy(rfc2965=True)) + self._interact(c) + + def save_and_restore(cj, ignore_discard, filename=filename): + from mechanize import MozillaCookieJar, DefaultCookiePolicy + try: + cj.save(ignore_discard=ignore_discard) + new_c = MozillaCookieJar(filename, + DefaultCookiePolicy(rfc2965=True)) + new_c.load(ignore_discard=ignore_discard) + finally: + try: os.unlink(filename) + except OSError: pass + return new_c + + new_c = save_and_restore(c, True) + assert len(new_c) == 6 # none discarded + assert repr(new_c).find("name='foo1', value='bar'") != -1 + + new_c = save_and_restore(c, False) + assert len(new_c) == 4 # 2 of them discarded on save + assert repr(new_c).find("name='foo1', value='bar'") != -1 + + def test_mozilla_cookiejar_embedded_tab(self): + from mechanize import MozillaCookieJar + filename = tempfile.mktemp() + fh = open(filename, "w") + try: + fh.write( + MozillaCookieJar.header + "\n" + + "a.com\tFALSE\t/\tFALSE\t\tname\tval\tstillthevalue\n" + "a.com\tFALSE\t/\tFALSE\t\tname2\tvalue\n") + fh.close() + cj = MozillaCookieJar(filename) + cj.revert(ignore_discard=True) + cookies = cj._cookies["a.com"]["/"] + self.assertEquals(cookies["name"].value, "val\tstillthevalue") + self.assertEquals(cookies["name2"].value, "value") + finally: + try: + os.remove(filename) + except IOError, exc: + if exc.errno != errno.ENOENT: + raise + + def test_mozilla_cookiejar_initial_dot_violation(self): + from mechanize import MozillaCookieJar, LoadError + filename = tempfile.mktemp() + fh = open(filename, "w") + try: + fh.write( + MozillaCookieJar.header + "\n" + + ".a.com\tFALSE\t/\tFALSE\t\tname\tvalue\n") + fh.close() + cj = MozillaCookieJar(filename) + self.assertRaises(LoadError, cj.revert, ignore_discard=True) + finally: + try: + os.remove(filename) + except IOError, exc: + if exc.errno != errno.ENOENT: + raise + + + +class 
LWPCookieTests(unittest.TestCase, TempfileTestMixin): + # Tests taken from libwww-perl, with a few modifications. + + def test_netscape_example_1(self): + from mechanize import CookieJar, Request, DefaultCookiePolicy + + #------------------------------------------------------------------- + # First we check that it works for the original example at + # http://www.netscape.com/newsref/std/cookie_spec.html + + # Client requests a document, and receives in the response: + # + # Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE + # + # Client requests a document, and receives in the response: + # + # Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/ + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # Client receives: + # + # Set-Cookie: SHIPPING=FEDEX; path=/fo + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # When client requests a URL in path "/foo" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX + # + # The last Cookie is buggy, because both specifications say that the + # most specific cookie must be sent first. SHIPPING=FEDEX is the + # most specific and should thus be first. 
+ + year_plus_one = time.localtime(time.time())[0] + 1 + + headers = [] + + c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) + + #req = Request("http://1.1.1.1/", + # headers={"Host": "www.acme.com:80"}) + req = Request("http://www.acme.com:80/", + headers={"Host": "www.acme.com:80"}) + + headers.append( + "Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/ ; " + "expires=Wednesday, 09-Nov-%d 23:12:40 GMT" % year_plus_one) + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + assert (req.get_header("Cookie") == "CUSTOMER=WILE_E_COYOTE" and + req.get_header("Cookie2") == '$Version="1"') + + headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/foo/bar") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + assert (h.find("PART_NUMBER=ROCKET_LAUNCHER_0001") != -1 and + h.find("CUSTOMER=WILE_E_COYOTE") != -1) + + + headers.append('Set-Cookie: SHIPPING=FEDEX; path=/foo') + res = FakeResponse(headers, "http://www.acme.com") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + assert (h.find("PART_NUMBER=ROCKET_LAUNCHER_0001") != -1 and + h.find("CUSTOMER=WILE_E_COYOTE") != -1 and + not h.find("SHIPPING=FEDEX") != -1) + + + req = Request("http://www.acme.com/foo/") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + assert (h.find("PART_NUMBER=ROCKET_LAUNCHER_0001") != -1 and + h.find("CUSTOMER=WILE_E_COYOTE") != -1 and + h.startswith("SHIPPING=FEDEX;")) + + def test_netscape_example_2(self): + from mechanize import CookieJar, Request + + # Second Example transaction sequence: + # + # Assume all mappings from above have been cleared. 
+ # + # Client receives: + # + # Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/ + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # Client receives: + # + # Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo + # + # When client requests a URL in path "/ammo" on this server, it sends: + # + # Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # NOTE: There are two name/value pairs named "PART_NUMBER" due to + # the inheritance of the "/" mapping in addition to the "/ammo" mapping. + + c = CookieJar() + headers = [] + + req = Request("http://www.acme.com/") + headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") + res = FakeResponse(headers, "http://www.acme.com/") + + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + assert (req.get_header("Cookie") == "PART_NUMBER=ROCKET_LAUNCHER_0001") + + headers.append( + "Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/ammo") + c.add_cookie_header(req) + + assert re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*" + "PART_NUMBER=ROCKET_LAUNCHER_0001", + req.get_header("Cookie")) + + def test_ietf_example_1(self): + from mechanize import CookieJar, DefaultCookiePolicy + #------------------------------------------------------------------- + # Then we test with the examples from draft-ietf-http-state-man-mec-03.txt + # + # 5. EXAMPLES + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + + # + # 5.1 Example 1 + # + # Most detail of request and response headers has been omitted. Assume + # the user agent has no stored cookies. + # + # 1. User Agent -> Server + # + # POST /acme/login HTTP/1.1 + # [form data] + # + # User identifies self via a form. + # + # 2. 
Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Customer="WILE_E_COYOTE"; Version="1"; Path="/acme" + # + # Cookie reflects user's identity. + + cookie = interact_2965( + c, 'http://www.acme.com/acme/login', + 'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"') + assert not cookie + + # + # 3. User Agent -> Server + # + # POST /acme/pickitem HTTP/1.1 + # Cookie: $Version="1"; Customer="WILE_E_COYOTE"; $Path="/acme" + # [form data] + # + # User selects an item for ``shopping basket.'' + # + # 4. Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Part_Number="Rocket_Launcher_0001"; Version="1"; + # Path="/acme" + # + # Shopping basket contains an item. + + cookie = interact_2965(c, 'http://www.acme.com/acme/pickitem', + 'Part_Number="Rocket_Launcher_0001"; ' + 'Version="1"; Path="/acme"'); + assert re.search( + r'^\$Version="?1"?; Customer="?WILE_E_COYOTE"?; \$Path="/acme"$', + cookie) + + # + # 5. User Agent -> Server + # + # POST /acme/shipping HTTP/1.1 + # Cookie: $Version="1"; + # Customer="WILE_E_COYOTE"; $Path="/acme"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # [form data] + # + # User selects shipping method from form. + # + # 6. Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Shipping="FedEx"; Version="1"; Path="/acme" + # + # New cookie reflects shipping method. + + cookie = interact_2965(c, "http://www.acme.com/acme/shipping", + 'Shipping="FedEx"; Version="1"; Path="/acme"') + + assert (re.search(r'^\$Version="?1"?;', cookie) and + re.search(r'Part_Number="?Rocket_Launcher_0001"?;' + '\s*\$Path="\/acme"', cookie) and + re.search(r'Customer="?WILE_E_COYOTE"?;\s*\$Path="\/acme"', + cookie)) + + # + # 7. User Agent -> Server + # + # POST /acme/process HTTP/1.1 + # Cookie: $Version="1"; + # Customer="WILE_E_COYOTE"; $Path="/acme"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme"; + # Shipping="FedEx"; $Path="/acme" + # [form data] + # + # User chooses to process order. + # + # 8. 
Server -> User Agent + # + # HTTP/1.1 200 OK + # + # Transaction is complete. + + cookie = interact_2965(c, "http://www.acme.com/acme/process") + assert (re.search(r'Shipping="?FedEx"?;\s*\$Path="\/acme"', cookie) and + cookie.find("WILE_E_COYOTE") != -1) + + # + # The user agent makes a series of requests on the origin server, after + # each of which it receives a new cookie. All the cookies have the same + # Path attribute and (default) domain. Because the request URLs all have + # /acme as a prefix, and that matches the Path attribute, each request + # contains all the cookies received so far. + + def test_ietf_example_2(self): + from mechanize import CookieJar, DefaultCookiePolicy + + # 5.2 Example 2 + # + # This example illustrates the effect of the Path attribute. All detail + # of request and response headers has been omitted. Assume the user agent + # has no stored cookies. + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + + # Imagine the user agent has received, in response to earlier requests, + # the response headers + # + # Set-Cookie2: Part_Number="Rocket_Launcher_0001"; Version="1"; + # Path="/acme" + # + # and + # + # Set-Cookie2: Part_Number="Riding_Rocket_0023"; Version="1"; + # Path="/acme/ammo" + + interact_2965( + c, "http://www.acme.com/acme/ammo/specific", + 'Part_Number="Rocket_Launcher_0001"; Version="1"; Path="/acme"', + 'Part_Number="Riding_Rocket_0023"; Version="1"; Path="/acme/ammo"') + + # A subsequent request by the user agent to the (same) server for URLs of + # the form /acme/ammo/... would include the following request header: + # + # Cookie: $Version="1"; + # Part_Number="Riding_Rocket_0023"; $Path="/acme/ammo"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # + # Note that the NAME=VALUE pair for the cookie with the more specific Path + # attribute, /acme/ammo, comes before the one with the less specific Path + # attribute, /acme. Further note that the same cookie name appears more + # than once. 
+ + cookie = interact_2965(c, "http://www.acme.com/acme/ammo/...") + assert re.search(r"Riding_Rocket_0023.*Rocket_Launcher_0001", cookie) + + # A subsequent request by the user agent to the (same) server for a URL of + # the form /acme/parts/ would include the following request header: + # + # Cookie: $Version="1"; Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # + # Here, the second cookie's Path attribute /acme/ammo is not a prefix of + # the request URL, /acme/parts/, so the cookie does not get forwarded to + # the server. + + cookie = interact_2965(c, "http://www.acme.com/acme/parts/") + assert (cookie.find("Rocket_Launcher_0001") != -1 and + not cookie.find("Riding_Rocket_0023") != -1) + + def test_rejection(self): + # Test rejection of Set-Cookie2 responses based on domain, path, port. + from mechanize import LWPCookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = LWPCookieJar(policy=pol) + + # illegal domain (no embedded dots) + cookie = interact_2965(c, "http://www.acme.com", + 'foo=bar; domain=".com"; version=1') + assert not c + + # legal domain + cookie = interact_2965(c, "http://www.acme.com", + 'ping=pong; domain="acme.com"; version=1') + assert len(c) == 1 + + # illegal domain (host prefix "www.a" contains a dot) + cookie = interact_2965(c, "http://www.a.acme.com", + 'whiz=bang; domain="acme.com"; version=1') + assert len(c) == 1 + + # legal domain + cookie = interact_2965(c, "http://www.a.acme.com", + 'wow=flutter; domain=".a.acme.com"; version=1') + assert len(c) == 2 + + # can't partially match an IP-address + cookie = interact_2965(c, "http://125.125.125.125", + 'zzzz=ping; domain="125.125.125"; version=1') + assert len(c) == 2 + + # illegal path (must be prefix of request path) + cookie = interact_2965(c, "http://www.sol.no", + 'blah=rhubarb; domain=".sol.no"; path="/foo"; ' + 'version=1') + assert len(c) == 2 + + # legal path + cookie = interact_2965(c, "http://www.sol.no/foo/bar", + 'bing=bong; 
domain=".sol.no"; path="/foo"; ' + 'version=1') + assert len(c) == 3 + + # illegal port (request-port not in list) + cookie = interact_2965(c, "http://www.sol.no", + 'whiz=ffft; domain=".sol.no"; port="90,100"; ' + 'version=1') + assert len(c) == 3 + + # legal port + cookie = interact_2965( + c, "http://www.sol.no", + r'bang=wallop; version=1; domain=".sol.no"; ' + r'port="90,100, 80,8080"; ' + r'max-age=100; Comment = "Just kidding! (\"|\\\\) "') + assert len(c) == 4 + + # port attribute without any value (current port) + cookie = interact_2965(c, "http://www.sol.no", + 'foo9=bar; version=1; domain=".sol.no"; port; ' + 'max-age=100;') + assert len(c) == 5 + + # encoded path + # LWP has this test, but unescaping allowed path characters seems + # like a bad idea, so I think this should fail: +## cookie = interact_2965(c, "http://www.sol.no/foo/", +## r'foo8=bar; version=1; path="/%66oo"') + # but this is OK, because '<' is not an allowed HTTP URL path + # character: + interact_2965(c, "http://www.sol.no/<oo/", + r'foo8=bar; version=1; path="/%3coo"') + assert len(c) == 6 + + # save and restore + filename = tempfile.mktemp() + + try: + c.save(filename, ignore_discard=True) + old = repr(c) + + c = LWPCookieJar(policy=pol) + c.load(filename, ignore_discard=True) + finally: + try: os.unlink(filename) + except OSError: pass + + assert old == repr(c) + + def test_url_encoding(self): + # Try some URL encodings of the PATHs. 
+ # (the behaviour here has changed from libwww-perl) + from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + + interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5", + "foo = bar; version = 1") + + cookie = interact_2965( + c, "http://www.acme.com/foo%2f%25/<<%0anew\345/\346\370\345", + 'bar=baz; path="/foo/"; version=1'); + version_re = re.compile(r'^\$version=\"?1\"?', re.I) + assert (cookie.find("foo=bar") != -1 and + version_re.search(cookie)) + + cookie = interact_2965( + c, "http://www.acme.com/foo/%25/<<%0anew\345/\346\370\345") + assert not cookie + + # unicode URL doesn't raise exception, as it used to! + cookie = interact_2965(c, u"http://www.acme.com/\xfc") + + def test_netscape_misc(self): + # Some additional Netscape cookies tests. + from mechanize import CookieJar, Request + + c = CookieJar() + headers = [] + req = Request("http://foo.bar.acme.com/foo") + + # Netscape allows a host part that contains dots + headers.append("Set-Cookie: Customer=WILE_E_COYOTE; domain=.acme.com") + res = FakeResponse(headers, "http://www.acme.com/foo") + c.extract_cookies(res, req) + + # and that the domain is the same as the host without adding a leading + # dot to the domain. Should not quote even if strange chars are used + # in the cookie value. + headers.append("Set-Cookie: PART_NUMBER=3,4; domain=foo.bar.acme.com") + res = FakeResponse(headers, "http://www.acme.com/foo") + c.extract_cookies(res, req) + + req = Request("http://foo.bar.acme.com/foo") + c.add_cookie_header(req) + assert ( + req.get_header("Cookie").find("PART_NUMBER=3,4") != -1 and + req.get_header("Cookie").find("Customer=WILE_E_COYOTE") != -1) + + def test_intranet_domains_2965(self): + # Test handling of local intranet hostnames without a dot. 
+ from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://example/", + "foo1=bar; PORT; Discard; Version=1;") + cookie = interact_2965(c, "http://example/", + 'foo2=bar; domain=".local"; Version=1') + assert cookie.find("foo1=bar") >= 0 + + interact_2965(c, "http://example/", 'foo3=bar; Version=1') + cookie = interact_2965(c, "http://example/") + assert cookie.find("foo2=bar") >= 0 and len(c) == 3 + + def test_intranet_domains_ns(self): + from mechanize import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965 = False)) + interact_netscape(c, "http://example/", "foo1=bar") + cookie = interact_netscape(c, "http://example/", + 'foo2=bar; domain=.local') + assert len(c) == 2 + assert cookie.find("foo1=bar") >= 0 + + cookie = interact_netscape(c, "http://example/") + assert cookie.find("foo2=bar") >= 0 and len(c) == 2 + + def test_empty_path(self): + from mechanize import CookieJar, Request, DefaultCookiePolicy + + # Test for empty path + # Broken web-server ORION/1.3.38 returns to the client response like + # + # Set-Cookie: JSESSIONID=ABCDERANDOM123; Path= + # + # ie. with Path set to nothing. 
+ # In this case, extract_cookies() must set cookie to / (root) + c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) + headers = [] + + req = Request("http://www.ants.com/") + headers.append("Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=") + res = FakeResponse(headers, "http://www.ants.com/") + c.extract_cookies(res, req) + + req = Request("http://www.ants.com/") + c.add_cookie_header(req) + + assert (req.get_header("Cookie") == "JSESSIONID=ABCDERANDOM123" and + req.get_header("Cookie2") == '$Version="1"') + + # missing path in the request URI + req = Request("http://www.ants.com:8080") + c.add_cookie_header(req) + + assert (req.get_header("Cookie") == "JSESSIONID=ABCDERANDOM123" and + req.get_header("Cookie2") == '$Version="1"') + +# The correctness of this test is undefined, in the absence of RFC 2965 errata. +## def test_netscape_rfc2965_interop(self): +## # Test mixing of Set-Cookie and Set-Cookie2 headers. +## from mechanize import CookieJar + +## # Example from http://www.trip.com/trs/trip/flighttracker/flight_tracker_home.xsl +## # which gives up these headers: +## # +## # HTTP/1.1 200 OK +## # Connection: close +## # Date: Fri, 20 Jul 2001 19:54:58 GMT +## # Server: Apache/1.3.19 (Unix) ApacheJServ/1.1.2 +## # Content-Type: text/html +## # Content-Type: text/html; charset=iso-8859-1 +## # Link: </trip/stylesheet.css>; rel="stylesheet"; type="text/css" +## # Servlet-Engine: Tomcat Web Server/3.2.1 (JSP 1.1; Servlet 2.2; Java 1.3.0; SunOS 5.8 sparc; java.vendor=Sun Microsystems Inc.) 
+## # Set-Cookie: trip.appServer=1111-0000-x-024;Domain=.trip.com;Path=/ +## # Set-Cookie: JSESSIONID=fkumjm7nt1.JS24;Path=/trs +## # Set-Cookie2: JSESSIONID=fkumjm7nt1.JS24;Version=1;Discard;Path="/trs" +## # Title: TRIP.com Travel - FlightTRACKER +## # X-Meta-Description: Trip.com privacy policy +## # X-Meta-Keywords: privacy policy + +## req = mechanize.Request( +## 'http://www.trip.com/trs/trip/flighttracker/flight_tracker_home.xsl') +## headers = [] +## headers.append("Set-Cookie: trip.appServer=1111-0000-x-024;Domain=.trip.com;Path=/") +## headers.append("Set-Cookie: JSESSIONID=fkumjm7nt1.JS24;Path=/trs") +## headers.append('Set-Cookie2: JSESSIONID=fkumjm7nt1.JS24;Version=1;Discard;Path="/trs"') +## res = FakeResponse( +## headers, +## 'http://www.trip.com/trs/trip/flighttracker/flight_tracker_home.xsl') +## #print res + +## c = CookieJar() +## c.extract_cookies(res, req) +## #print c +## print str(c) +## print """Set-Cookie3: trip.appServer="1111-0000-x-024"; path="/"; domain=".trip.com"; path_spec; discard; version=0 +## Set-Cookie3: JSESSIONID="fkumjm7nt1.JS24"; path="/trs"; domain="www.trip.com"; path_spec; discard; version=1 +## """ +## assert c.as_lwp_str() == """Set-Cookie3: trip.appServer="1111-0000-x-024"; path="/"; domain=".trip.com"; path_spec; discard; version=0 +## Set-Cookie3: JSESSIONID="fkumjm7nt1.JS24"; path="/trs"; domain="www.trip.com"; path_spec; discard; version=1 +## """ + + def test_session_cookies(self): + from mechanize import CookieJar, Request + + year_plus_one = time.localtime(time.time())[0] + 1 + + # Check session cookies are deleted properly by + # CookieJar.clear_session_cookies method + + req = Request('http://www.perlmeister.com/scripts') + headers = [] + headers.append("Set-Cookie: s1=session;Path=/scripts") + headers.append("Set-Cookie: p1=perm; Domain=.perlmeister.com;" + "Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % + year_plus_one) + headers.append("Set-Cookie: p2=perm;Path=/;expires=Fri, " + "02-Feb-%d 23:24:20 GMT" % 
year_plus_one) + headers.append("Set-Cookie: s2=session;Path=/scripts;" + "Domain=.perlmeister.com") + headers.append('Set-Cookie2: s3=session;Version=1;Discard;Path="/"') + res = FakeResponse(headers, 'http://www.perlmeister.com/scripts') + + c = CookieJar() + c.extract_cookies(res, req) + # How many session/permanent cookies do we have? + counter = {"session_after": 0, + "perm_after": 0, + "session_before": 0, + "perm_before": 0} + for cookie in c: + key = "%s_before" % cookie.value + counter[key] = counter[key] + 1 + c.clear_session_cookies() + # How many now? + for cookie in c: + key = "%s_after" % cookie.value + counter[key] = counter[key] + 1 + + assert not ( + # a permanent cookie got lost accidently + counter["perm_after"] != counter["perm_before"] or + # a session cookie hasn't been cleared + counter["session_after"] != 0 or + # we didn't have session cookies in the first place + counter["session_before"] == 0) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_date.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_date.py new file mode 100644 index 0000000000000000000000000000000000000000..feac7564eaeb28b053e74580fe66b0e5cab0472c --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_date.py @@ -0,0 +1,104 @@ +"""Tests for ClientCookie._HTTPDate.""" + +import re, time +from unittest import TestCase + +class DateTimeTests(TestCase): + + def test_time2isoz(self): + from mechanize._util import time2isoz + + base = 1019227000 + day = 24*3600 + assert time2isoz(base) == "2002-04-19 14:36:40Z" + assert time2isoz(base+day) == "2002-04-20 14:36:40Z" + assert time2isoz(base+2*day) == "2002-04-21 14:36:40Z" + assert time2isoz(base+3*day) == "2002-04-22 14:36:40Z" + + az = time2isoz() + bz = time2isoz(500000) + for text in (az, bz): + assert re.search(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\dZ$", text), \ + "bad time2isoz format: %s %s" % (az, bz) + + def test_parse_date(self): + from mechanize._util 
import http2time + + def parse_date(text, http2time=http2time): + return time.gmtime(http2time(text))[:6] + + assert parse_date("01 Jan 2001") == (2001, 1, 1, 0, 0, 0.0) + + # this test will break around year 2070 + assert parse_date("03-Feb-20") == (2020, 2, 3, 0, 0, 0.0) + + # this test will break around year 2048 + assert parse_date("03-Feb-98") == (1998, 2, 3, 0, 0, 0.0) + + def test_http2time_formats(self): + from mechanize._util import http2time, time2isoz + + # test http2time for supported dates. Test cases with 2 digit year + # will probably break in year 2044. + tests = [ + 'Thu, 03 Feb 1994 00:00:00 GMT', # proposed new HTTP format + 'Thursday, 03-Feb-94 00:00:00 GMT', # old rfc850 HTTP format + 'Thursday, 03-Feb-1994 00:00:00 GMT', # broken rfc850 HTTP format + + '03 Feb 1994 00:00:00 GMT', # HTTP format (no weekday) + '03-Feb-94 00:00:00 GMT', # old rfc850 (no weekday) + '03-Feb-1994 00:00:00 GMT', # broken rfc850 (no weekday) + '03-Feb-1994 00:00 GMT', # broken rfc850 (no weekday, no seconds) + '03-Feb-1994 00:00', # broken rfc850 (no weekday, no seconds, no tz) + + '03-Feb-94', # old rfc850 HTTP format (no weekday, no time) + '03-Feb-1994', # broken rfc850 HTTP format (no weekday, no time) + '03 Feb 1994', # proposed new HTTP format (no weekday, no time) + + # A few tests with extra space at various places + ' 03 Feb 1994 0:00 ', + ' 03-Feb-1994 ', + ] + + test_t = 760233600 # assume broken POSIX counting of seconds + result = time2isoz(test_t) + expected = "1994-02-03 00:00:00Z" + assert result == expected, \ + "%s => '%s' (%s)" % (test_t, result, expected) + + for s in tests: + t = http2time(s) + t2 = http2time(s.lower()) + t3 = http2time(s.upper()) + + assert t == t2 == t3 == test_t, \ + "'%s' => %s, %s, %s (%s)" % (s, t, t2, t3, test_t) + + def test_http2time_garbage(self): + from mechanize._util import http2time + + for test in [ + '', 'Garbage', + 'Mandag 16. 
September 1996', + + '01-00-1980', + '01-13-1980', + '00-01-1980', + '32-01-1980', + '01-01-1980 25:00:00', + '01-01-1980 00:61:00', + '01-01-1980 00:00:62']: + + bad = False + + if http2time(test) is not None: + print "http2time(%s) is not None" % (test,) + print "http2time(test)", http2time(test) + bad = True + + assert not bad + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_form.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa23226a261adcbc4959f39dbe66c51f7a92f4b --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form.py @@ -0,0 +1,3525 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# Copyright 2002-2005 John J. Lee <jjl@pobox.com> +# Copyright 2005 Gary Poster +# Copyright 2005 Zope Corporation +# Copyright 1998-2000 Gisle Aas. + +from cStringIO import StringIO +import os +import string +import unittest + +import mechanize +import mechanize._form as _form +from mechanize import ControlNotFoundError, ItemNotFoundError, \ + ItemCountError, AmbiguityError +import mechanize._testcase as _testcase +from mechanize._util import get1 + +# XXX +# HTMLForm.set/get_value_by_label() +# Base control tests on ParseFile, so can use same tests for different form +# implementations. 
+# HTMLForm.enctype +# XHTML + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +try: + import warnings +except ImportError: + warnings_imported = False + def hide_deprecations(): + pass + def reset_deprecations(): + pass + def raise_deprecations(): + pass +else: + warnings_imported = True + def hide_deprecations(): + warnings.filterwarnings('ignore', category=DeprecationWarning) + def reset_deprecations(): + warnings.filterwarnings('default', category=DeprecationWarning) + #warnings.resetwarnings() # XXX probably safer + def raise_deprecations(): + try: + registry = _form.__warningregistry__ + except AttributeError: + pass + else: + registry.clear() + warnings.filterwarnings('error', category=DeprecationWarning) + +class DummyForm: + def __init__(self): + self._forms = [] + self._labels = [] + self._id_to_labels = {} + self.backwards_compat = False + self.controls = [] + + def find_control(self, name, type): + raise mechanize.ControlNotFoundError + + +class UnescapeTests(unittest.TestCase): + + def test_unescape_charref(self): + unescape_charref = _form.unescape_charref + mdash_utf8 = u"\u2014".encode("utf-8") + for ref, codepoint, utf8, latin1 in [ + ("38", 38, u"&".encode("utf-8"), "&"), + ("x2014", 0x2014, mdash_utf8, "—"), + ("8212", 8212, mdash_utf8, "—"), + ]: + self.assertEqual(unescape_charref(ref, None), unichr(codepoint)) + self.assertEqual(unescape_charref(ref, 'latin-1'), latin1) + self.assertEqual(unescape_charref(ref, 'utf-8'), utf8) + + def test_get_entitydefs(self): + get_entitydefs = _form.get_entitydefs + ed = get_entitydefs() + for name, char in [ + ("&", u"&"), + ("<", u"<"), + (">", u">"), + ("—", u"\u2014"), + ("♠", u"\u2660"), + ]: + self.assertEqual(ed[name], char) + + def test_unescape1(self): + unescape = _form.unescape + get_entitydefs = _form.get_entitydefs + data = "& < — — —" + mdash_utf8 = u"\u2014".encode("utf-8") + ue = 
unescape(data, get_entitydefs(), "utf-8") + self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue) + + for text, expect in [ + ("&a&", "&a&"), + ("a&", "a&"), + ]: + got = unescape(text, get_entitydefs(), "latin-1") + self.assertEqual(got, expect) + + def test_unescape2(self): + unescape = _form.unescape + get_entitydefs = _form.get_entitydefs + self.assertEqual(unescape("Donald Duck & Co", + {"&": "&"}), "Donald Duck & Co") + self.assertEqual( + unescape("<Donald Duck & Co>", + {"&": "&", "<": "<", ">": ">"}), + "<Donald Duck & Co>") + self.assertEqual(unescape("Hei på deg", {"å" : "�"}), + "Hei p� deg") + self.assertEqual( + unescape("&foo;", + {"&": "&", "&foo;": "splat"}), "&foo;") + self.assertEqual(unescape("&", {}), "&") + + for encoding, expected in [ + ("utf-8", u"&\u06aa\u2014\u2014".encode("utf-8")), + ("latin-1", "&ڪ——")]: + self.assertEqual( + expected, + unescape("&ڪ——", get_entitydefs(), encoding)) + + def test_unescape_parsing(self): + file = StringIO( +"""<form action="&amp;———"> +<textarea name="name&amp;———">val&amp;———</textarea> +</form> +""") #" + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False, encoding="utf-8") + form = forms[0] + test_string = "&"+(u"\u2014".encode('utf8')*3) + self.assertEqual(form.action, "http://localhost/"+test_string) + control = form.find_control(type="textarea", nr=0) + self.assertEqual(control.value, "val"+test_string) + self.assertEqual(control.name, "name"+test_string) + + def test_unescape_parsing_select(self): + f = StringIO("""\ +<form> +<select name="a"> + <option>1&amp;———</option> + <option value="2&amp;———">2&amp;———</option> +</select> +</form> +""") #" + forms = mechanize.ParseFileEx(f, "http://localhost/", encoding="utf-8") + form = forms[1] + test_string = "&"+(u"\u2014".encode('utf8')*3) + control = form.find_control(nr=0) + for ii in range(len(control.items)): + item = control.items[ii] + self.assertEqual(item.name, str(ii+1)+test_string) + # XXX label + + def 
test_unescape_parsing_data(self): + file = StringIO( +"""\ +<form> + <label for="foo">Blah ” ” blah</label> + <input type="text" id="foo" name="foo"> +</form> +""") #" + # don't crash if we can't encode -- rather, leave entity ref intact + forms = mechanize.ParseFile( + file, "http://localhost/", backwards_compat=False, + encoding="latin-1") + label = forms[0].find_control(nr=0).get_labels()[0] + self.assertEqual(label.text, "Blah ” ” blah") + + +class LWPFormTests(unittest.TestCase): + """The original tests from libwww-perl 5.64.""" + def testEmptyParse(self): + forms = mechanize.ParseFile(StringIO(""), "http://localhost", + backwards_compat=False) + self.assert_(len(forms) == 0) + + def _forms(self): + file = StringIO("""<form action="abc"> + + <input name="firstname" value="Gisle"> + + </form> + + """) + return mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + + def testParse(self): + forms = self._forms() + self.assert_(len(forms) == 1) + self.assert_(forms[0]["firstname"] == "Gisle") + + def testFillForm(self): + forms = self._forms() + form = forms[0] + form["firstname"] = "Gisle Aas" + req = form.click() + def request_method(req): + if req.has_data(): + return "POST" + else: + return "GET" + self.assert_(request_method(req) == "GET") + self.assert_(req.get_full_url() == "http://localhost/abc?firstname=Gisle+Aas") + +def get_header(req, name): + try: + return req.get_header(name) + except AttributeError: + return req.headers[name] + +def header_items(req): + try: + return req.header_items() + except AttributeError: + return req.headers.items() + +class MockResponse: + def __init__(self, f, url): + self._file = f + self._url = url + def geturl(self): + return self._url + def __getattr__(self, name): + return getattr(self._file, name) + + +class ParseErrorTests(_testcase.TestCase): + + def test_parseerror_str(self): + e = mechanize.ParseError("spam") + self.assertEqual(str(e), "spam") + + +class ParseTests(unittest.TestCase): + + def 
test_failing_parse(self): + # XXX couldn't provoke an error from BeautifulSoup (!), so this has not + # been tested with RobustFormParser + import sgmllib + # Python 2.0 sgmllib raises RuntimeError rather than SGMLParseError, + # but seems never to even raise that except as an assertion, from + # reading the code... + if hasattr(sgmllib, "SGMLParseError"): + f = StringIO("<!!!!>") + base_uri = "http://localhost/" + self.assertRaises( + mechanize.ParseError, + mechanize.ParseFile, f, base_uri, backwards_compat=False, + ) + self.assert_(issubclass(mechanize.ParseError, + sgmllib.SGMLParseError)) + + def test_unknown_control(self): + f = StringIO( +"""<form action="abc"> +<input type="bogus"> +<input> +</form> +""") + base_uri = "http://localhost/" + forms = mechanize.ParseFile(f, base_uri, backwards_compat=False) + form = forms[0] + for ctl in form.controls: + self.assert_(isinstance(ctl, _form.TextControl)) + + def test_ParseFileEx(self): + # empty "outer form" (where the "outer form" is the form consisting of + # all controls outside of any form) + f = StringIO( +"""<form action="abc"> +<input type="text"></input> +</form> +""") + base_uri = "http://localhost/" + forms = mechanize.ParseFileEx(f, base_uri) + outer = forms[0] + self.assertEqual(len(forms), 2) + self.assertEqual(outer.controls, []) + self.assertEqual(outer.name, None) + self.assertEqual(outer.action, base_uri) + self.assertEqual(outer.method, "GET") + self.assertEqual(outer.enctype, "application/x-www-form-urlencoded") + self.assertEqual(outer.attrs, {}) + + # non-empty outer form + f = StringIO( +""" +<input type="text" name="a"></input> +<form action="abc"> + <input type="text" name="b"></input> +</form> +<input type="text" name="c"></input> +<form action="abc"> + <input type="text" name="d"></input> +</form> +<input type="text" name="e"></input> +""") + base_uri = "http://localhost/" + forms = mechanize.ParseFileEx(f, base_uri) + outer = forms[0] + self.assertEqual(len(forms), 3) + 
self.assertEqual([c.name for c in outer.controls], ["a", "c", "e"]) + self.assertEqual(outer.name, None) + self.assertEqual(outer.action, base_uri) + self.assertEqual(outer.method, "GET") + self.assertEqual(outer.enctype, "application/x-www-form-urlencoded") + self.assertEqual(outer.attrs, {}) + + def test_ParseResponse(self): + url = "http://example.com/" + r = MockResponse( + StringIO("""\ +<input type="text" name="outer"></input> +<form action="abc"><input type="text" name="inner"></input></form> +"""), + url, + ) + + hide_deprecations() + forms = mechanize.ParseResponse(r) + reset_deprecations() + self.assertEqual(len(forms), 1) + form = forms[0] + self.assertEqual(form.action, url+"abc") + self.assertEqual(form.controls[0].name, "inner") + + def test_ParseResponseEx(self): + url = "http://example.com/" + r = MockResponse( + StringIO("""\ +<input type="text" name="outer"></input> +<form action="abc"><input type="text" name="inner"></input></form> +"""), + url, + ) + + forms = mechanize.ParseResponseEx(r) + self.assertEqual(len(forms), 2) + outer = forms[0] + inner = forms[1] + self.assertEqual(inner.action, url+"abc") + self.assertEqual(outer.action, url) + self.assertEqual(outer.controls[0].name, "outer") + self.assertEqual(inner.controls[0].name, "inner") + + def test_ParseString(self): + class DerivedRequest(mechanize.Request): + pass + forms = mechanize.ParseString('<input name="a" />', + "http://example.com/", + request_class=DerivedRequest) + self.assertEqual(len(forms), 1) + self.assertEqual(forms[0].controls[0].name, "a") + # arguments were passed through + self.assertTrue(isinstance(forms[0].click(), DerivedRequest)) + + def test_parse_error(self): + f = StringIO( +"""<form action="abc"> +<option> +</form> +""") + base_uri = "http://localhost/" + try: + mechanize.ParseFile(f, base_uri, backwards_compat=False) + except mechanize.ParseError, e: + self.assert_(e.base_uri == base_uri) + else: + self.assert_(0) + + def test_base_uri(self): + # BASE element 
takes priority over document URI + file = StringIO( +"""<base HREF="http://example.com"> +<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + self.assert_(form.action == "http://example.com/abc") + + file = StringIO( +"""<form action="abc"> +<input type="submit"></input> +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + self.assert_(form.action == "http://localhost/abc") + + def testTextarea(self): + file = StringIO( +"""<form action="abc&amp;—d"> + +<input name="firstname" value="Gisle"> +<textarea>blah, blah, +Rhubarb. + +</textarea> + +<textarea></textarea> + +<textarea name=""ta"" id="foo&amp;bar">Hello testers &amp; users!</textarea> + +</form> + +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False, encoding="utf-8") + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.name is None) + self.assertEqual( + form.action, + "http://localhost/abc&"+u"\u2014".encode('utf8')+"d") + control = form.find_control(type="textarea", nr=0) + self.assert_(control.name is None) + self.assert_(control.value == "blah, blah,\r\nRhubarb.\r\n\r\n") + + empty_control = form.find_control(type="textarea", nr=1) + self.assert_(str(empty_control) == "<TextareaControl(<None>=)>") + self.assert_(empty_control.value == "") + + entity_ctl = form.find_control(type="textarea", nr=2) + self.assertEqual(entity_ctl.name, '"ta"') + self.assertEqual(entity_ctl.attrs["id"], "foo&bar") + self.assertEqual(entity_ctl.value, "Hello testers & users!") + + def testSelect(self): + file = StringIO( +"""<form action="abc"> + +<select name="foo"> + <option>Hello testers & &blah; users!</option> + <option></option><option></option> +</select> + +</form> + +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + self.assert_(len(forms) == 1) + 
form = forms[0] + + entity_ctl = form.find_control(type="select") + self.assert_(entity_ctl.name == "foo") + self.assertEqual(entity_ctl.value[0], "Hello testers & &blah; users!") + + hide_deprecations() + opt = entity_ctl.get_item_attrs("Hello testers & &blah; users!") + reset_deprecations() + self.assertEqual(opt["value"], "Hello testers & &blah; users!") + self.assertEqual(opt["label"], "Hello testers & &blah; users!") + self.assertEqual(opt["contents"], "Hello testers & &blah; users!") + + def testButton(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="text" value="cow" name="moo"> + +<button name="b">blah, blah, +Rhubarb.</button> + +<button type="reset" name="b2"></button> +<button type="button" name="b3"></button> + +</form> + +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + self.assert_(form.name == "myform") + control = form.find_control(name="b") + self.assert_(control.type == "submitbutton") + self.assert_(control.value == "") + self.assert_(form.find_control("b2").type == "resetbutton") + self.assert_(form.find_control("b3").type == "buttonbutton") + pairs = form.click_pairs() + self.assert_(pairs == [("moo", "cow"), ("b", "")]) + + def testIsindex(self): + file = StringIO( +"""<form action="abc"> + +<isindex prompt=">>>"> + +</form> + +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + control = form.find_control(type="isindex") + self.assert_(control.type == "isindex") + self.assert_(control.name is None) + self.assert_(control.value == "") + control.value = "some stuff" + self.assert_(form.click_pairs() == []) + self.assert_(form.click_request_data() == + ("http://localhost/abc?some+stuff", None, [])) + self.assert_(form.click().get_full_url() == + "http://localhost/abc?some+stuff") + + def testEmptySelect(self): + file = StringIO( +"""<form action="abc"> +<select name="foo"></select> + +<select 
name="bar" multiple></select> + +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + control0 = form.find_control(type="select", nr=0) + control1 = form.find_control(type="select", nr=1) + self.assert_(str(control0) == "<SelectControl(foo=[])>") + self.assert_(str(control1) == "<SelectControl(bar=[])>") + form.set_value([], "foo") + self.assertRaises(ItemNotFoundError, form.set_value, ["oops"], "foo") + self.assert_(form.click_pairs() == []) + +# XXX figure out what to do in these sorts of cases +## def badSelect(self): +## # what objects should these generate, if any? +## # what should happen on submission of these? +## # what about similar checkboxes and radios? +## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## """<form action="abc" name="myform"> + +## <select multiple> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ +## <select name="foo"> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select name="foo" multiple> +## <option>4</option> +## <option>5</option> +## <option>6</option> +## </select> +## """ + +## """<form action="abc" name="myform"> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## <select> +## <option>1</option> +## <option>2</option> +## <option>3</option> +## </select> + +## </form> +## """ + +## def testBadCheckbox(self): +## # see comments above +## # split checkbox -- is it one control, or two? 
+ +## """ +## <html> + +## <input type=checkbox name=foo value=bar> +## <input type=checkbox name=foo value=bar> + +## <select> +## <option>1</option> +## <option>2</option> +## </select> + +## <input type=checkbox name=foo value=baz> +## <input type=checkbox name=foo value=bar> + +## </html> +## """ + + def testUnnamedControl(self): + file = StringIO(""" +<form action="./weird.html"> + +<input type="checkbox" value="foo"></input> + +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + self.assert_(form.controls[0].name is None) + + def testNamelessListItems(self): + # XXX SELECT + # these controls have no item names + file = StringIO("""<form action="./weird.html"> + +<input type="checkbox" name="foo"></input> + +<input type="radio" name="bar"></input> + +<!-- +<select name="baz"> + <option></option> +</select> + +<select name="baz" multiple> + <option></option> +</select> +--> + +<input type="submit" name="submit"> +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + hide_deprecations() + self.assert_(form.possible_items("foo") == ["on"]) + self.assert_(form.possible_items("bar") == ["on"]) + reset_deprecations() + #self.assert_(form.possible_items("baz") == []) + self.assert_(form["foo"] == []) + self.assert_(form["bar"] == []) + #self.assert_(form["baz"] == []) + form["foo"] = ["on"] + form["bar"] = ["on"] + pairs = form.click_pairs() + self.assert_(pairs == [("foo", "on"), ("bar", "on"), ("submit", "")]) + + def testSingleSelectFixup(self): + # HTML 4.01 section 17.6.1: single selection SELECT controls shouldn't + # have > 1 item selected, but if they do, not more than one should end + # up selected. + # In fact, testing really obscure stuff here, which follows Firefox + # 1.0.7 -- IE doesn't even support disabled OPTIONs. 
+ file = StringIO("""<form action="./bad.html"> + +<select name="spam"> + <option selected>1</option> + <option selected>2</option> +</select> + +<select name="cow"> + <option selected>1</option> + <option disabled selected>2</option> +</select> + +<select name="moo"> + <option selected disabled>1</option> + <option>2</option> +</select> + +<select name="nnn"> + <option disabled>1</option> + <option>2</option> + <option>3</option> +</select> + +</form> +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + # deselect all but last item if more than one were selected... + spam = form.find_control("spam") + self.assertEqual([ii.name for ii in spam.items if ii.selected], ["2"]) + # ...even if it's disabled + cow = form.find_control("cow") + self.assertEqual([ii.name for ii in cow.items if ii.selected], ["2"]) + # exactly one selected item is OK even if it's disabled + moo = form.find_control("moo") + self.assertEqual([ii.name for ii in moo.items if ii.selected], ["1"]) + # if nothing was selected choose the first non-disabled item + moo = form.find_control("nnn") + self.assertEqual([ii.name for ii in moo.items if ii.selected], ["2"]) + + def testSelectDefault(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="a" multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="b"> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +</form> + +""") + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + control = form.find_control("a") + self.assert_(control.value == []) + single_control = form.find_control("b") + self.assert_(single_control.value == ["1"]) + + file.seek(0) + forms = mechanize.ParseFile(file, "http://localhost/", + select_default=1, backwards_compat=False) + form = forms[0] + # select_default only affects *multiple* selection select controls + control = 
form.find_control(type="select", nr=0) + self.assert_(control.value == ["1"]) + single_control = form.find_control(type="select", nr=1) + self.assert_(single_control.value == ["1"]) + + def test_close_base_tag(self): + # Benji York: a single newline immediately after a start tag is + # stripped by browsers, but not one immediately before an end tag. + # TEXTAREA content is converted to the DOS newline convention. + forms = mechanize.ParseFile( + StringIO("<form><textarea>\n\nblah\n</textarea></form>"), + "http://example.com/", + backwards_compat=False, + ) + ctl = forms[0].find_control(type="textarea") + self.assertEqual(ctl.value, "\r\nblah\r\n") + + def test_embedded_newlines(self): + # newlines that happen to be at the start of strings passed to the + # parser's .handle_data() method must not be trimmed unless they also + # follow immediately after a start tag + forms = mechanize.ParseFile( + StringIO("<form><textarea>\n\nspam&\neggs\n</textarea></form>"), + "http://example.com/", + backwards_compat=False, + ) + ctl = forms[0].find_control(type="textarea") + self.assertEqual(ctl.value, "\r\nspam&\r\neggs\r\n") + + def test_double_select(self): + # More than one SELECT control of the same name in a form never + # represent a single control (unlike RADIO and CHECKBOX elements), so + # don't merge them. 
+ forms = mechanize.ParseFile( + StringIO("""\ +<form> + <select name="a"> + <option>b</option> + <option>c</option> + </select> + <select name="a"> + <option>d</option> + <option>e</option> + </select> +</form> +"""), + "http://example.com/", + backwards_compat=False, + ) + form = forms[0] + self.assertEquals(len(form.controls), 2) + ctl = form.find_control(name="a", nr=0) + self.assertEqual([item.name for item in ctl.items], ["b", "c"]) + ctl = form.find_control(name="a", nr=1) + self.assertEqual([item.name for item in ctl.items], ["d", "e"]) + + def test_global_select(self): + # regression test: closing select and textarea tags should not be + # ignored, causing a ParseError due to incorrect tag nesting + + mechanize.ParseFileEx( + StringIO("""\ +<select name="a"> + <option>b</option> + <option>c</option> +</select> +<select name="a"> + <option>d</option> + <option>e</option> +</select> +"""), + "http://example.com/", + ) + + mechanize.ParseFile( + StringIO("""\ +<textarea></textarea> +<textarea></textarea> +"""), + "http://example.com/", + backwards_compat=False, + ) + + def test_empty_document(self): + forms = mechanize.ParseFileEx(StringIO(""), "http://example.com/") + self.assertEquals(len(forms), 1) # just the "global form" + + def test_missing_closing_body_tag(self): + # Even if there is no closing form or body tag, the last form on the + # page should be returned. 
+ forms = mechanize.ParseFileEx( + StringIO('<form name="spam">'), + "http://example.com/", + ) + self.assertEquals(len(forms), 2) + self.assertEquals(forms[1].name, "spam") + + +class DisabledTests(unittest.TestCase): + def testOptgroup(self): + for compat in [False, True]: + self._testOptgroup(compat) + + def _testOptgroup(self, compat): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + <option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +<select name="bar"> + <option>1</option> + <optgroup> + <option>2</option> + </optgroup> + <option>3</option> + <optgroup> + <option>4</option> + <option>5</option> + <option>6</option> + </optgroup> + <optgroup disabled> + <option selected>7</option> + <option>8</option> + </optgroup> + <option>9</option> + <optgroup disabled> + <option>10</option> + </optgroup> +</select> + +</form>""") + + def get_control(name, file=file, compat=compat): + file.seek(0) + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=False) + form = forms[0] + form.backwards_compat = compat + return form.find_control(name) + + # can't call item_disabled with no args + control = get_control("foo") + self.assertRaises(TypeError, control.get_item_disabled) + + hide_deprecations() + control.set_item_disabled(True, "2") + reset_deprecations() + self.assertEqual( + str(control), + "<SelectControl(foo=[1, (2), 3, 4, 5, 6, (*7), (8), 9, (10)])>") + + # list controls only allow assignment to .value if no attempt is + # made to set any disabled item... 
+ + # ...multi selection + control = get_control("foo") + if compat: + extra = ["7"] + else: + extra = [] + # disabled items are not part of the submitted value, so "7" not + # included (they are not "successful": + # http://www.w3.org/TR/REC-html40/interact/forms.html#successful-controls + # ). This behavior was confirmed in Firefox 1.0.4 at least. + self.assertEqual(control.value, []+extra) + control.value = ["1"] + self.assertEqual(control.value, ["1"]) + control = get_control("foo") + self.assertRaises(AttributeError, setattr, control, 'value', ['8']) + self.assertEqual(control.value, []+extra) + # even though 7 is set already, attempt to set it fails + self.assertRaises(AttributeError, setattr, control, 'value', ['7']) + control.value = ["1", "3"] + self.assertEqual(control.value, ["1", "3"]) + control = get_control("foo") + self.assertRaises(AttributeError, setattr, control, 'value', ['1', '7']) + self.assertEqual(control.value, []+extra) + # enable all items + control.set_all_items_disabled(False) + control.value = ['1', '7'] + self.assertEqual(control.value, ["1", "7"]) + + control = get_control("foo") + hide_deprecations() + for name in 7, 8, 10: + self.assert_(control.get_item_disabled(str(name))) + if not compat: + # a disabled option is never "successful" (see above) so never + # in value + self.assert_(str(name) not in control.value) + # a disabled option always is always upset if you try to set it + self.assertRaises(AttributeError, control.set, True, str(name)) + self.assert_(str(name) not in control.value) + self.assertRaises(AttributeError, control.set, False, str(name)) + self.assert_(str(name) not in control.value) + self.assertRaises(AttributeError, control.toggle, str(name)) + self.assert_(str(name) not in control.value) + else: + self.assertRaises(AttributeError, control.set, True, str(name)) + control.set(False, str(name)) + self.assert_(str(name) not in control.value) + control.set(False, str(name)) + self.assert_(str(name) not in 
control.value) + self.assertRaises(AttributeError, control.toggle, str(name)) + self.assert_(str(name) not in control.value) + self.assertRaises(AttributeError, control.set, True, str(name)) + self.assert_(str(name) not in control.value) + + control = get_control("foo") + for name in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(name))) + control.set(False, str(name)) + self.assert_(str(name) not in control.value) + control.toggle(str(name)) + self.assert_(str(name) in control.value) + control.set(True, str(name)) + self.assert_(str(name) in control.value) + control.toggle(str(name)) + self.assert_(str(name) not in control.value) + + control = get_control("foo") + self.assert_(control.get_item_disabled("7")) + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + control.set_item_disabled(False, "7") + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + reset_deprecations() + + # ...single-selection + control = get_control("bar") + # 7 is selected but disabled + if compat: + value = ["7"] + else: + value = [] + self.assertEqual(control.value, value) + self.assertEqual( + [ii.name for ii in control.items if ii.selected], ["7"]) + control.value = ["2"] + + control = get_control("bar") + def assign_8(control=control): control.value = ["8"] + self.assertRaises(AttributeError, assign_8) + self.assertEqual(control.value, value) + def assign_7(control=control): control.value = ["7"] + self.assertRaises(AttributeError, assign_7) + # enable all items + control.set_all_items_disabled(False) + assign_7() + self.assertEqual(control.value, ['7']) + + control = get_control("bar") + hide_deprecations() + for name in 7, 8, 10: + self.assert_(control.get_item_disabled(str(name))) + if not compat: + # a disabled option is never "successful" (see above) so never in + # value + 
self.assert_(str(name) not in control.value) + # a disabled option always is always upset if you try to set it + self.assertRaises(AttributeError, control.set, True, str(name)) + self.assert_(str(name) not in control.value) + self.assertRaises(AttributeError, control.set, False, str(name)) + self.assert_(str(name) not in control.value) + self.assertRaises(AttributeError, control.toggle, str(name)) + self.assert_(str(name) not in control.value) + else: + self.assertRaises(AttributeError, control.set, True, str(name)) + control.set(False, str(name)) + self.assert_(str(name) != control.value) + control.set(False, str(name)) + self.assert_(str(name) != control.value) + self.assertRaises(AttributeError, control.toggle, str(name)) + self.assert_(str(name) != control.value) + self.assertRaises(AttributeError, control.set, True, str(name)) + self.assert_(str(name) != control.value) + + control = get_control("bar") + for name in 1, 2, 3, 4, 5, 6, 9: + self.assert_(not control.get_item_disabled(str(name))) + control.set(False, str(name)) + self.assert_(str(name) not in control.value) + control.toggle(str(name)) + self.assert_(str(name) == control.value[0]) + control.set(True, str(name)) + self.assert_(str(name) == control.value[0]) + control.toggle(str(name)) + self.assert_(str(name) not in control.value) + + control = get_control("bar") + self.assert_(control.get_item_disabled("7")) + control.set_item_disabled(True, "7") + self.assert_(control.get_item_disabled("7")) + self.assertRaises(AttributeError, control.set, True, "7") + self.assertEqual(control.value, value) + control.set_item_disabled(False, "7") + self.assertEqual(control.value, ["7"]) + self.assert_(not control.get_item_disabled("7")) + control.set(True, "7") + control.set(False, "7") + control.toggle("7") + control.toggle("7") + + # set_all_items_disabled + for name in "foo", "bar": + control = get_control(name) + control.set_all_items_disabled(False) + control.set(True, "7") + control.set(True, "1") + 
control.set_all_items_disabled(True) + self.assertRaises(AttributeError, control.set, True, "7") + self.assertRaises(AttributeError, control.set, True, "1") + reset_deprecations() + +# XXX single select + def testDisabledSelect(self): + for compat in [False, True]: + self._testDisabledSelect(compat) + def _testDisabledSelect(self, compat): + file = StringIO( +"""<form action="abc" name="myform"> + +<select name="foo" multiple> + <option label="a">1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="bar" multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +<select name="baz" disabled multiple> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +<select name="spam" disabled multiple> + <option>1</option> + <option disabled>2</option> + <option>3</option> +</select> + +<!--This is disabled, but fixup still needs to select an option, + rather than raising AttributeError--> +<select name="blah" disabled> + <option>1</option> + <option>2</option> + <option>3</option> +</select> + +</form> +""") + hide_deprecations() + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=compat) + reset_deprecations() + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", True, False), + ("spam", True, True)]: + control = form.find_control(name) + self.assertEqual(bool(control.disabled), control_disabled) + hide_deprecations() + item = control.get_item_attrs("2") + reset_deprecations() + self.assertEqual(bool(item.has_key("disabled")), item_disabled) + + def bad_assign(value, control=control): control.value = value + hide_deprecations() + if control_disabled: + for name in "1", "2", "3": + self.assertRaises(AttributeError, control.set, True, name) + self.assertRaises(AttributeError, bad_assign, [name]) + elif item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + 
self.assertRaises(AttributeError, bad_assign, ["2"]) + for name in "1", "3": + control.set(True, name) + else: + control.value = ["1", "2", "3"] + reset_deprecations() + + control = form.find_control("foo") + # missing disabled arg + hide_deprecations() + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.assert_(not control.get_item_disabled("a", by_label=True)) + control.set_item_disabled(True, "a", by_label=True) + self.assert_(control.get_item_disabled("a", by_label=True)) + reset_deprecations() + + def testDisabledRadio(self): + for compat in False, True: + self._testDisabledRadio(compat) + def _testDisabledRadio(self, compat): + file = StringIO( +"""<form> +<input type="checkbox" name="foo" value="1" disabled></input> +<input type="checkbox" name="foo" value="2" disabled></input> +<input type="checkbox" name="foo" value="3" disabled></input> +</form>""") + hide_deprecations() + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=compat) + form = forms[0] + control = form.find_control('foo') + + # since all items are disabled, .fixup() should not select + # anything + self.assertEquals( + [item.name for item in control.items if item.selected], + [], + ) + reset_deprecations() + + def testDisabledCheckbox(self): + for compat in False, True: + self._testDisabledCheckbox(compat) + def _testDisabledCheckbox(self, compat): + file = StringIO( +"""<form action="abc" name="myform"> + +<label><input type="checkbox" name="foo" value="1"></input> a</label> +<input type="checkbox" name="foo" value="2"></input> +<input type="checkbox" name="foo" value="3"></input> + +<input type="checkbox" name="bar" value="1"></input> +<input type="checkbox" name="bar" value="2" disabled></input> +<input type="checkbox" name="bar" value="3"></input> + +<input type="checkbox" name="baz" value="1" disabled></input> +<input type="checkbox" name="baz" value="2" disabled></input> +<input type="checkbox" name="baz" value="3" disabled></input> 
+ +</form>""") + hide_deprecations() + forms = mechanize.ParseFile(file, "http://localhost/", + backwards_compat=compat) + reset_deprecations() + form = forms[0] + for name, control_disabled, item_disabled in [ + ("foo", False, False), + ("bar", False, True), + ("baz", False, True)]: + control = form.find_control(name) + self.assert_(bool(control.disabled) == control_disabled) + hide_deprecations() + item = control.get_item_attrs("2") + self.assert_(bool(item.has_key("disabled")) == item_disabled) + self.assert_(control.get_item_disabled("2") == item_disabled) + + def bad_assign(value, control=control): control.value = value + if item_disabled: + self.assertRaises(AttributeError, control.set, True, "2") + self.assertRaises(AttributeError, bad_assign, ["2"]) + if not control.get_item_disabled("1"): + control.set(True, "1") + else: + control.value = ["1", "2", "3"] + reset_deprecations() + + control = form.find_control("foo") + hide_deprecations() + control.set_item_disabled(False, "1") + # missing disabled arg + self.assertRaises(TypeError, control.set_item_disabled, "1") + # by_label + self.failIf(control.get_item_disabled('a', by_label=True)) + self.assert_(not control.get_item_disabled("1")) + control.set_item_disabled(True, 'a', by_label=True) + self.assert_(control.get_item_disabled("1")) + reset_deprecations() + + +class ControlTests(unittest.TestCase): + def testTextControl(self): + attrs = {"type": "this is ignored", + "name": "ath_Uname", + "value": "", + "maxlength": "20", + "id": "foo"} + c = _form.TextControl("texT", "ath_Uname", attrs) + c.fixup() + self.assert_(c.type == "text") + self.assert_(c.name == "ath_Uname") + self.assert_(c.id == "foo") + self.assert_(c.value == "") + self.assert_(str(c) == "<TextControl(ath_Uname=)>") + self.assert_(c.pairs() == [("ath_Uname", "")]) + def bad_assign(c=c): c.type = "sometype" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.type == "text") + def bad_assign(c=c): c.name = "somename" + 
self.assertRaises(AttributeError, bad_assign) + self.assert_(c.name == "ath_Uname") + c.value = "2" + self.assert_(c.value == "2") + + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value is None) + + self.assert_(c.pairs() == []) + c.value = "2" # reset value... + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + def bad_assign(c=c): c.value = ["foo"] + self.assertRaises(TypeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(not c.readonly) + c.readonly = True + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + c.disabled = True + self.assert_(str(c) == + "<TextControl(ath_Uname=2) (disabled, readonly)>") + c.readonly = False + self.assert_(str(c) == "<TextControl(ath_Uname=2) (disabled)>") + self.assertRaises(AttributeError, bad_assign) + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<TextControl(ath_Uname=2)>") + + self.assert_(c.attrs.has_key("maxlength")) + for key in "name", "type", "value": + self.assert_(c.attrs.has_key(key)) + + # initialisation of readonly and disabled attributes + attrs["readonly"] = True + c = _form.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["readonly"] + attrs["disabled"] = True + c = _form.TextControl("text", "ath_Uname", attrs) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + del attrs["disabled"] + c = _form.TextControl("hidden", "ath_Uname", attrs) + self.assert_(c.readonly) + def bad_assign(c=c): c.value = "foo" + self.assertRaises(AttributeError, bad_assign) + + def testFileControl(self): + c = _form.FileControl("file", "test_file", {}) + fp = StringIO() + c.add_file(fp) + fp2 = StringIO() + c.add_file(fp2, None, "fp2 file test") + self.assert_(str(c) == 
'<FileControl(test_file=<Unnamed file>, fp2 file test)>') + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(str(c) == '<FileControl(test_file=<No files added>)>') + + def testIsindexControl(self): + attrs = {"type": "this is ignored", + "prompt": ">>>"} + c = _form.IsindexControl("isIndex", None, attrs) + c.fixup() + self.assert_(c.type == "isindex") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(str(c) == "<IsindexControl()>") + self.assert_(c.pairs() == []) + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "isindex") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, set_name) + def set_value(value, c=c): c.value = value + self.assertRaises(TypeError, set_value, [None]) + self.assert_(c.name is None) + c.value = "2" + self.assert_(c.value == "2") + self.assert_(str(c) == "<IsindexControl(2)>") + c.disabled = True + self.assert_(str(c) == "<IsindexControl(2) (disabled)>") + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value == "2") + self.assert_(c.pairs() == []) + c.readonly = True + self.assert_(str(c) == "<IsindexControl(2) (disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.disabled = False + self.assert_(str(c) == "<IsindexControl(2) (readonly)>") + self.assertRaises(AttributeError, set_value, "foo") + c.readonly = False + self.assert_(str(c) == "<IsindexControl(2)>") + + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs.has_key("prompt")) + self.assert_(c.attrs["prompt"] == ">>>") + for key in "name", "value": + self.assert_(not c.attrs.has_key(key)) + + c.value = "foo 1 bar 2" + class FakeForm: action = "http://localhost/" + form = FakeForm() + self.assert_(c._click(form, (1,1), "request_data") == + ("http://localhost/?foo+1+bar+2", None, [])) + + c.value = "foo 1 bar 2" + c.readonly = True + 
self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value is None) + + def testIgnoreControl(self): + attrs = {"type": "this is ignored"} + c = _form.IgnoreControl("reset", None, attrs) + self.assert_(c.type == "reset") + self.assert_(c.value is None) + self.assert_(str(c) == "<IgnoreControl(<None>=<None>)>") + + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "foo") + self.assert_(c.value is None) + + # this is correct, but silly; basically nothing should happen + c.clear() + self.assert_(c.value is None) + + def testSubmitControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "img": "foo.gif"} + c = _form.SubmitControl("submit", "name_value", attrs) + self.assert_(c.type == "submit") + self.assert_(c.name == "name_value") + self.assert_(c.value == "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) (readonly)>") + + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value is None) + c.value = "value_value" + c.readonly = True + + def set_value(value, c=c): c.value = value + self.assertRaises(TypeError, set_value, ["foo"]) + c.disabled = True + self.assertRaises(AttributeError, set_value, "value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value) " + "(disabled, readonly)>") + c.disabled = False + c.readonly = False + set_value("value_value") + self.assert_(str(c) == "<SubmitControl(name_value=value_value)>") + c.readonly = True + + # click on button + form = _form.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(c.pairs() == []) + self.assert_(pairs == [("name_value", "value_value")]) + self.assert_(request.get_full_url() == + 
"http://foo.bar.com/?name_value=value_value") + self.assert_(data == + ("http://foo.bar.com/?name_value=value_value", None, [])) + c.disabled = True + pairs = c._click(form, (1,1), "pairs") + request = c._click(form, (1,1), "request") + data = c._click(form, (1,1), "request_data") + self.assert_(pairs == []) + # XXX not sure if should have '?' on end of this URL, or if it really matters... + self.assert_(request.get_full_url() == "http://foo.bar.com/") + self.assert_(data == ("http://foo.bar.com/", None, [])) + + def testImageControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "img": "foo.gif"} + c = _form.ImageControl("image", "name_value", attrs, index=0) + self.assert_(c.type == "image") + self.assert_(c.name == "name_value") + self.assert_(c.value == "") + self.assert_(str(c) == "<ImageControl(name_value=)>") + + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value is None) + c.value = "" + + # click, at coordinate (0, 55), on image + form = _form.HTMLForm("http://foo.bar.com/") + c.add_to_form(form) + self.assert_(c.pairs() == []) + request = c._click(form, (0, 55), "request") + self.assert_(c.pairs() == []) + self.assert_(request.get_full_url() == + "http://foo.bar.com/?name_value.x=0&name_value.y=55") + self.assert_(c._click(form, (0,55), return_type="request_data") == + ("http://foo.bar.com/?name_value.x=0&name_value.y=55", + None, [])) + c.value = "blah" + request = c._click(form, (0, 55), "request") + self.assertEqual(request.get_full_url(), "http://foo.bar.com/?" 
+ "name_value.x=0&name_value.y=55&name_value=blah") + + c.disabled = True + self.assertEqual(c.value, "blah") + self.assert_(str(c) == "<ImageControl(name_value=blah) (disabled)>") + def set_value(value, c=c): c.value = value + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.readonly = True + self.assert_(str(c) == "<ImageControl(name_value=blah) " + "(disabled, readonly)>") + self.assertRaises(AttributeError, set_value, "blah") + self.assert_(c._click(form, (1,1), return_type="pairs") == []) + c.disabled = c.readonly = False + self.assert_(c._click(form, (1,1), return_type="pairs") == + [("name_value.x", "1"), + ("name_value.y", "1"), + ('name_value', 'blah')]) + + def testCheckboxControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string"} + form = DummyForm() + c = _form.CheckboxControl("checkbox", "name_value", attrs) + c.add_to_form(form) + c.fixup() + self.assert_(c.type == "checkbox") + self.assert_(c.name == "name_value") + self.assert_(c.value == []) + hide_deprecations() + self.assert_(c.possible_items() == ["value_value"]) + reset_deprecations() + def set_type(c=c): c.type = "sometype" + self.assertRaises(AttributeError, set_type) + self.assert_(c.type == "checkbox") + def set_name(c=c): c.name = "somename" + self.assertRaises(AttributeError, set_name) + self.assert_(c.name == "name_value") + + # construct larger list from length-1 lists + c = _form.CheckboxControl("checkbox", "name_value", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = _form.CheckboxControl("checkbox", "name_value", attrs2) + c2.add_to_form(form) + c.merge_control(c2) + c.add_to_form(form) + c.fixup() + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[value_value, value_value2])>") + hide_deprecations() + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + attrs = c.get_item_attrs("value_value") 
+ for key in "alt", "name", "value", "type": + self.assert_(attrs.has_key(key)) + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + reset_deprecations() + + def set_value(value, c=c): c.value = value + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assertEqual(c.value, ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + hide_deprecations() + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + reset_deprecations() + self.assert_(c.value == ["value_value"]) + hide_deprecations() + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + reset_deprecations() + + self.assert_(c.value == ["value_value"]) + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value == []) + + # set + hide_deprecations() + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, "value_value2") + self.assert_(c.value == ["value_value"]) + c.set(False, "value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + reset_deprecations() + + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2])>") + c.disabled = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + 
"name_value=[*value_value, value_value2]) " + "(disabled)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.readonly = True + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(disabled, readonly)>") + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == []) + c.disabled = False + self.assert_(str(c) == "<CheckboxControl(" + "name_value=[*value_value, value_value2]) " + "(readonly)>") + self.assertRaises(AttributeError, set_value, ["value_value"]) + self.assert_(c.value == ["value_value"]) + self.assert_(c.pairs() == [("name_value", "value_value")]) + c.readonly = False + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "alt": "some string", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": "", + "alt": "alt_text"}} + form = DummyForm() + # with Netscape / IE default selection... + c = _form.SelectControl("select", "select_name", attrs) + c.add_to_form(form) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == []) + hide_deprecations() + self.assert_(c.possible_items() == ["value_value"]) + reset_deprecations() + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ... 
and with RFC 1866 default selection + c = _form.SelectControl("select", "select_name", attrs, select_default=True) + c.add_to_form(form) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = _form.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = _form.SelectControl("select", "select_name", attrs2) + c2.add_to_form(form) + c.merge_control(c2) + c.add_to_form(form) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + hide_deprecations() + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + + # get_item_attrs + attrs3 = c.get_item_attrs("value_value") + reset_deprecations() + self.assert_(attrs3.has_key("alt")) + self.assert_(not attrs3.has_key("multiple")) + # HTML attributes dictionary should have been copied by ListControl + # constructor. + attrs["new_attr"] = "new" + attrs2["new_attr2"] = "new2" + for key in ("new_attr", "new_attr2"): + self.assert_(not attrs3.has_key(key)) + hide_deprecations() + self.assertRaises(ItemNotFoundError, c.get_item_attrs, "oops") + reset_deprecations() + + c.value = ["value_value", "value_value2"] + self.assert_(c.value == ["value_value", "value_value2"]) + c.value = ["value_value"] + self.assertEqual(c.value, ["value_value"]) + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + hide_deprecations() + c.toggle("value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.toggle("value_value2") + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assert_(c.value == ["value_value"]) + reset_deprecations() + + c.readonly = True + 
self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value == []) + + # test ordering of items + c.value = ["value_value2", "value_value"] + self.assert_(c.value == ["value_value", "value_value2"]) + # set + hide_deprecations() + c.set(True, "value_value") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value", "value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_(c.value == ["value_value2"]) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + reset_deprecations() + c.value = [] + self.assert_(c.value == []) + + def testSelectControlMultiple_label(self): +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current year", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name", + "multiple": ""}} + c = _form.SelectControl("select", "select_name", attrs) + c2 = _form.SelectControl("select", "select_name", attrs2) + c3 = _form.SelectControl("select", "select_name", attrs3) + form = DummyForm() + c.merge_control(c2) + 
c.merge_control(c3) + c.add_to_form(form) + c.fixup() + + hide_deprecations() + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + + self.assert_(c.value == []) + c.toggle("2002", by_label=True) + self.assert_(c.value == ["0"]) + c.toggle("0") + self.assert_(c.value == []) + c.toggle("0") + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.toggle("2002", by_label=True) + self.assertRaises(ItemNotFoundError, c.toggle, "blah", by_label=True) + self.assert_(c.value == []) + c.toggle("2000") + reset_deprecations() + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["2000"]) + c.value = ["0"] + self.assertEqual(c.value, ["0"]) + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + c.set_value_by_label(["2000", "2002"]) + self.assert_(c.value == ["0", "2000"]) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value == []) + + c.set_value_by_label(["2000", "2002"]) + hide_deprecations() + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(False, "2002", by_label=True) + self.assert_(c.get_value_by_label() == c.value == ["2000"]) + c.set(True, "2002", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2000"]) + self.assert_(c.value == ["0", "2000"]) 
+ c.set(False, "2000", by_label=True) + self.assert_(c.get_value_by_label() == ["2002"]) + self.assert_(c.value == ["0"]) + c.set(True, "2001", by_label=True) + self.assert_(c.get_value_by_label() == ["2002", "2001"]) + self.assert_(c.value == ["0", "1"]) + self.assertRaises(ItemNotFoundError, c.set, True, "blah", + by_label=True) + self.assertRaises(ItemNotFoundError, c.set, + False, "blah", by_label=True) + reset_deprecations() + + def testSelectControlSingle_label(self): +## <SELECT name=year> +## <OPTION value=0 label="2002">current year</OPTION> +## <OPTION value=1>2001</OPTION> +## <OPTION>2000</OPTION> +## </SELECT> + attrs = {"type": "ignored", + "name": "year", + "value": "0", + "label": "2002", + "contents": "current year", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs2 = {"type": "ignored", + "name": "year", + "value": "1", + "label": "2001", # label defaults to contents + "contents": "2001", + "__select": {"type": "this is ignored", + "name": "select_name"}} + attrs3 = {"type": "ignored", + "name": "year", + "value": "2000", # value defaults to contents + "label": "2000", # label defaults to contents + "contents": "2000", + "__select": {"type": "this is ignored", + "name": "select_name"}} + c = _form.SelectControl("select", "select_name", attrs) + c2 = _form.SelectControl("select", "select_name", attrs2) + c3 = _form.SelectControl("select", "select_name", attrs3) + form = DummyForm() + c.merge_control(c2) + c.merge_control(c3) + c.add_to_form(form) + c.fixup() + + hide_deprecations() + self.assert_(c.possible_items() == ["0", "1", "2000"]) + self.assert_(c.possible_items(by_label=True) == + ["2002", "2001", "2000"]) + reset_deprecations() + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemNotFoundError, set_value, ["2002"]) + self.assertRaises(TypeError, set_value, "1") + self.assertRaises(TypeError, set_value, None) + self.assert_(c.value == ["0"]) + c.value = [] + self.assert_(c.value == []) + c.value 
= ["0"] + self.assert_(c.value == ["0"]) + + c.value = [] + self.assertRaises(TypeError, c.set_value_by_label, "2002") + self.assertRaises(ItemCountError, c.set_value_by_label, + ["2000", "2001"]) + self.assertRaises(ItemNotFoundError, c.set_value_by_label, ["foo"]) + c.set_value_by_label(["2002"]) + self.assert_(c.value == ["0"]) + self.assert_(c.get_value_by_label() == ["2002"]) + c.set_value_by_label(["2000"]) + self.assert_(c.value == ["2000"]) + self.assert_(c.get_value_by_label() == ["2000"]) + + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value == []) + + def testSelectControlSingle(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "label": "contents_value", + "contents": "contents_value", + "__select": {"type": "this is ignored", + "name": "select_name", + "alt": "alt_text"}} + # Netscape and IE behaviour... + c = _form.SelectControl("select", "select_name", attrs) + form = DummyForm() + c.add_to_form(form) + c.fixup() + self.assert_(c.type == "select") + self.assert_(c.name == "select_name") + self.assert_(c.value == ["value_value"]) + hide_deprecations() + self.assert_(c.possible_items() == ["value_value"]) + reset_deprecations() + self.assert_(c.attrs.has_key("name")) + self.assert_(c.attrs.has_key("type")) + self.assert_(c.attrs["alt"] == "alt_text") + # ...and RFC 1866 behaviour are identical (unlike multiple SELECT). 
+ c = _form.SelectControl("select", "select_name", attrs, + select_default=1) + c.add_to_form(form) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = _form.SelectControl("select", "select_name", attrs) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = _form.SelectControl("select", "select_name", attrs2) + c.merge_control(c2) + c.add_to_form(form) + c.fixup() + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + c.value = [] + self.assert_(c.value == []) + self.assert_(str(c) == "<SelectControl(" + "select_name=[value_value, value_value2])>") + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assert_(str(c) == "<SelectControl(" + "select_name=[*value_value, value_value2])>") + hide_deprecations() + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + reset_deprecations() + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + ["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertRaises(TypeError, set_value, None) + c.value = ["value_value2"] + self.assert_(c.value == ["value_value2"]) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assert_(c.value == ["value_value"]) + hide_deprecations() + c.toggle("value_value") + self.assertRaises(ItemNotFoundError, c.toggle, "oops") + self.assertRaises(TypeError, c.toggle, ["oops"]) + reset_deprecations() + self.assert_(c.value == []) + c.value = ["value_value"] + self.assert_(c.value == ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assert_(c.value == []) + + hide_deprecations() + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assert_(c.value == 
[]) + + # set + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value") + self.assert_(c.value == ["value_value"]) + c.set(True, "value_value2") + self.assert_(c.value == ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assert_(c.value == []) + c.set(False, "value_value2") + self.assert_(c.value == []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + reset_deprecations() + + def testRadioControl(self): + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "id": "blah"} + # Netscape and IE behaviour... + c = _form.RadioControl("radio", "name_value", attrs) + form = DummyForm() + c.add_to_form(form) + c.fixup() + self.assert_(c.type == "radio") + self.assert_(c.name == "name_value") + self.assert_(c.id == "blah") + self.assert_(c.value == []) + hide_deprecations() + self.assert_(c.possible_items() == ["value_value"]) + reset_deprecations() + # ...and RFC 1866 behaviour + c = _form.RadioControl("radio", "name_value", attrs, + select_default=True) + c.add_to_form(form) + c.fixup() + self.assert_(c.value == ["value_value"]) + + # construct larger list from length-1 lists + c = _form.RadioControl("radio", "name_value", attrs, + select_default=True) + attrs2 = attrs.copy() + attrs2["value"] = "value_value2" + c2 = _form.RadioControl("radio", "name_value", attrs2, + select_default=True) + c.merge_control(c2) + c.add_to_form(form) + c.fixup() + self.assert_(str(c) == "<RadioControl(" + "name_value=[*value_value, value_value2])>") + hide_deprecations() + self.assert_(c.possible_items() == ["value_value", "value_value2"]) + reset_deprecations() + + def set_value(value, c=c): c.value = value + self.assertRaises(ItemCountError, set_value, + 
["value_value", "value_value2"]) + self.assertRaises(TypeError, set_value, "value_value") + self.assertEqual(c.value, ["value_value"]) + c.value = ["value_value2"] + self.assertEqual(c.value, ["value_value2"]) + c.value = ["value_value"] + self.assertEqual(c.value, ["value_value"]) + self.assertRaises(ItemNotFoundError, set_value, ["oops"]) + self.assertEqual(c.value, ["value_value"]) + hide_deprecations() + c.toggle("value_value") + self.assertEqual(c.value, []) + c.toggle("value_value") + self.assertEqual(c.value, ["value_value"]) + self.assertRaises(TypeError, c.toggle, ["value_value"]) + self.assertEqual(c.value, ["value_value"]) + # nothing selected is allowed + c.value = [] + self.assertEqual(c.value, []) + + c.set(True, "value_value") + reset_deprecations() + self.assertEqual(c.value, ["value_value"]) + c.readonly = True + self.assertRaises(AttributeError, c.clear) + c.readonly = False + c.clear() + self.assertEqual(c.value, []) + + # set + hide_deprecations() + c.set(True, "value_value") + self.assertEqual(c.value, ["value_value"]) + c.set(True, "value_value") + self.assertEqual(c.value, ["value_value"]) + c.set(True, "value_value2") + self.assertEqual(c.value, ["value_value2"]) + c.set(False, "value_value") + self.assert_("value_value2") + c.set(False, "value_value2") + self.assertEqual(c.value, []) + c.set(False, "value_value2") + self.assertEqual(c.value, []) + self.assertRaises(ItemNotFoundError, c.set, True, "oops") + self.assertRaises(TypeError, c.set, True, ["value_value"]) + self.assertRaises(ItemNotFoundError, c.set, False, "oops") + self.assertRaises(TypeError, c.set, False, ["value_value"]) + reset_deprecations() + + # tests for multiple identical values + + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "id": "name_value_1"} + c1 = _form.RadioControl("radio", "name_value", attrs) + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "value_value", + "id": "name_value_2", + "checked": 
"checked"} + c2 = _form.RadioControl("radio", "name_value", attrs) + attrs = {"type": "this is ignored", + "name": "name_value", + "value": "another_value", + "id": "name_value_3", + "__label": {"__text": "Third Option"}} + c3 = _form.RadioControl("radio", "name_value", attrs) + form = DummyForm() + c1.merge_control(c2) + c1.merge_control(c3) + c1.add_to_form(form) + c1.fixup() + self.assertEqual(c1.value, ['value_value']) + hide_deprecations() + self.assertEqual( + c1.possible_items(), + ['value_value', 'value_value', 'another_value']) + reset_deprecations() + self.assertEqual(c1.value, ['value_value']) + self.failIf(c1.items[0].selected) + self.failUnless(c1.items[1].selected) + self.failIf(c1.items[2].selected) + c1.value = ['value_value'] # should be no change + self.failUnless(c1.items[1].selected) + self.assertEqual(c1.value, ['value_value']) + c1.value = ['another_value'] + self.failUnless(c1.items[2].selected) + self.assertEqual(c1.value, ['another_value']) + c1.value = ['value_value'] + self.failUnless(c1.items[0].selected) + self.assertEqual(c1.value, ['value_value']) + + # id labels + form._id_to_labels['name_value_1'] = [ + _form.Label({'for': 'name_value_1', '__text':'First Option'})] + form._id_to_labels['name_value_2'] = [ + _form.Label({'for': 'name_value_2', + '__text':'Second Option'})] + form._id_to_labels['name_value_3'] = [ + _form.Label({'for': 'name_value_3', + '__text':'Last Option'})] # notice __label above + self.assertEqual([l.text for l in c1.items[0].get_labels()], + ['First Option']) + self.assertEqual([l.text for l in c1.items[1].get_labels()], + ['Second Option']) + self.assertEqual([l.text for l in c1.items[2].get_labels()], + ['Third Option', 'Last Option']) + self.assertEqual(c1.get_value_by_label(), ['First Option']) + c1.set_value_by_label(['Second Option']) + self.assertEqual(c1.get_value_by_label(), ['Second Option']) + self.assertEqual(c1.value, ['value_value']) + c1.set_value_by_label(['Third Option']) + 
self.assertEqual(c1.get_value_by_label(), ['Third Option']) + self.assertEqual(c1.value, ['another_value']) + c1.items[1].selected = True + self.assertEqual(c1.get_value_by_label(), ['Second Option']) + self.assertEqual(c1.value, ['value_value']) + c1.set_value_by_label(['Last Option']) # by second label + self.assertEqual(c1.get_value_by_label(), ['Third Option']) + self.assertEqual(c1.value, ['another_value']) + c1.set_value_by_label(['irst']) # by substring + self.assertEqual(c1.get_value_by_label(), ['First Option']) + + +class FormTests(unittest.TestCase): + + base_uri = "http://auth.athensams.net/" + + def _get_test_file(self, filename): + import test_form + this_dir = os.path.dirname(test_form.__file__) + path = os.path.join(this_dir, "test_form_data", filename) + return open(path) + + def test_find_control(self): + f = StringIO("""\ +<form> + <label for="form.title"> Book Title </label></td> + <input type="text" id="form.title" name="form.title" + value="The Grapes of Wrath" /> + + <label for="form.quality">Book Quality</label></td> + <select id="form.quality" name="form.country"> + <option>Good</option> + <option>Bad</option> + </select> + + <label><input type="checkbox" id="form.genre.western" name="form.genre" + value="western" /> Western</label> + <label><input type="checkbox" id="form.genre.horror" name="form.genre" + value="horror" /> Horror</label> + + <label for="form.password">Password</label> + <input type="password" id="pswd1" name="password" value="123" /> + <input type="password" id="pswd2" name="password" value="123" /> +</form> +""") + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + for compat in True, False: + form.backwards_compat = compat + fc = form.find_control + + self.assertEqual(fc("form.title").id, "form.title") + self.assertEqual(fc("form.title", nr=0).id, "form.title") + if compat: + self.assertEqual(fc("password").id, "pswd1") + else: + self.assertRaises(AmbiguityError, fc, "password") + 
self.assertEqual(fc("password", id="pswd2").id, "pswd2") + self.assertEqual(fc("password", nr=0).id, "pswd1") + self.assertRaises(ControlNotFoundError, fc, "form.title", nr=1) + self.assertRaises(ControlNotFoundError, fc, nr=50) + self.assertRaises(ValueError, fc, nr=-1) + self.assertRaises(ControlNotFoundError, fc, label="Bananas") + + # label + self.assertEqual(fc(label="Title").id, "form.title") + self.assertEqual(fc(label="Book Title").id, "form.title") + self.assertRaises(ControlNotFoundError, fc, label=" Book Title ") + self.assertRaises(ControlNotFoundError, fc, label="Bananas") + self.assertRaises(ControlNotFoundError, fc, label="title") + + self.assertEqual(fc(label="Book", nr=0).id, "form.title") + self.assertEqual(fc(label="Book", nr=1).id, "form.quality") + if compat: + self.assertEqual(fc(label="Book").id, "form.title") + else: + self.assertRaises(AmbiguityError, fc, label="Book") + + def test_find_nameless_control(self): + data = """\ +<form> + <input type="checkbox"/> + <input type="checkbox" id="a" onclick="blah()"/> +</form> +""" + f = StringIO(data) + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + self.assertRaises( + AmbiguityError, + form.find_control, type="checkbox", name=mechanize.Missing) + ctl = form.find_control(type="checkbox", name=mechanize.Missing, nr=1) + self.assertEqual(ctl.id, "a") + + def test_deselect_disabled(self): + def get_new_form(f, compat): + f.seek(0) + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + form.backwards_compat = compat + return form + + f = StringIO("""\ +<form> + <input type="checkbox" name="p" value="a" disabled checked></input> + <input type="checkbox" name="p" value="b"></input> + <input type="checkbox" name="p" value="c"></input> +</form> +""") + for compat in [False]:#True, False: + def new_form(compat=compat, f=f, get_new_form=get_new_form): + form = get_new_form(f, compat) + ctl = form.find_control("p") + a = ctl.get("a") + return 
ctl, a + ctl, a = new_form() + ctl.value = ["b"] + + # :-(( + if compat: + # rationale: allowed to deselect, but not select, disabled + # items + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, a, "selected", True) + self.assertRaises(AttributeError, setattr, ctl, "value", ["a"]) + a.selected = False + ctl, a = new_form() + ctl.value = ["b"] + self.assertEqual(a.selected, False) + self.assertEqual(ctl.value, ["b"]) + ctl, a = new_form() + self.assertRaises(AttributeError, + setattr, ctl, "value", ["a", "b"]) + else: + + # rationale: Setting an individual item's selected state to its + # present value is a no-op, as is setting the whole control + # value where an item name doesn't appear in the new value, but + # that item is disabled anyway (but an item name that does + # appear in the new value is treated an explicit request that + # that item name get sent to the server). However, if the + # item's state does change, both selecting and deselecting are + # disallowed for disabled items. 
+ + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, a, "selected", True) + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, ctl, "value", ["a"]) + ctl, a = new_form() + self.assertRaises(AttributeError, + setattr, a, "selected", False) + ctl.value = ["b"] + self.assertEqual(a.selected, True) + self.assertEqual(ctl.value, ["b"]) + ctl, a = new_form() + self.assertRaises(AttributeError, + setattr, ctl, "value", ["a", "b"]) + + f = StringIO("""\ +<form> + <input type="radio" name="p" value="a" disabled checked></input> + <input type="radio" name="p" value="b"></input> + <input type="radio" name="p" value="c"></input> +</form> +""") + + for compat in [False]:#True, False: + def new_form(compat=compat, f=f, get_new_form=get_new_form): + form = get_new_form(f, compat) + ctl = form.find_control("p") + a = ctl.get("a") + return ctl, a + ctl, a = new_form() + ctl.value = ["b"] + + if compat: + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, a, "selected", True) + self.assertRaises(AttributeError, setattr, ctl, "value", ["a"]) + a.selected = False + ctl, a = new_form() + ctl.value = ["b"] + self.assertEqual(a.selected, False) + self.assertEqual(ctl.value, ["b"]) + ctl, a = new_form() + self.assertRaises(ItemCountError, + setattr, ctl, "value", ["a", "b"]) + else: + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, a, "selected", True) + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, ctl, "value", ["a"]) + ctl, a = new_form() + self.assertRaises(AttributeError, setattr, a, "selected", False) + ctl.value = ["b"] + self.assertEqual(a.selected, False) + self.assertEqual(ctl.value, ["b"]) + ctl, a = new_form() + self.assertRaises(ItemCountError, + setattr, ctl, "value", ["a", "b"]) + + def test_click(self): + file = StringIO( +"""<form action="abc" name="myform"> + +<input type="submit" name="foo"></input> +<input type="submit" name="bar"></input> +</form> +""") + form = mechanize.ParseFile(file, 
"http://blah/", + backwards_compat=False)[0] + self.assertRaises(ControlNotFoundError, form.click, nr=2) + self.assert_(form.click().get_full_url() == "http://blah/abc?foo=") + self.assert_(form.click(name="bar").get_full_url() == "http://blah/abc?bar=") + + for method in ["GET", "POST"]: + file = StringIO( +"""<form method="%s" action="abc?bang=whizz#doh" name="myform"> + +<input type="submit" name="foo"></input> +</form> +""" % method) + # " (this line is here for emacs) + form = mechanize.ParseFile(file, "http://blah/", + backwards_compat=False)[0] + if method == "GET": + url = "http://blah/abc?foo=" + else: + url = "http://blah/abc?bang=whizz" + self.assert_(form.click().get_full_url() == url) + + def testAuth(self): + fh = self._get_test_file("Auth.html") + forms = mechanize.ParseFile(fh, self.base_uri, + backwards_compat=False) + self.assert_(len(forms) == 1) + form = forms[0] + self.assert_(form.action == + "http://auth.athensams.net/" + "?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi" + "%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS") + + self.assertRaises(ControlNotFoundError, + lambda form=form: form.toggle("d'oh", "oops")) + self.assertRaises(ControlNotFoundError, lambda form=form: form["oops"]) + def bad_assign(form=form): form["oops"] = ["d'oh"] + self.assertRaises(ControlNotFoundError, bad_assign) + + self.assertRaises(ValueError, form.find_control) + + keys = ["ath_uname", "ath_passwd"] + values = ["", ""] + types = ["text", "password"] + for i in range(len(keys)): + key = keys[i] + c = form.find_control(key) + self.assert_(c.value == values[i]) + self.assert_(c.type == types[i]) + c = form.find_control(type="image") + self.assert_(c.name is None) + self.assert_(c.value == "") + self.assert_(c.type == "image") + + form["ath_uname"] = "jbloggs" + form["ath_passwd"] = "foobar" + + self.assert_(form.click_pairs() == + [("ath_uname", "jbloggs"), + ("ath_passwd", "foobar")]) + + def testSearchType(self): + fh = self._get_test_file("SearchType.html") + 
forms = mechanize.ParseFile(fh, self.base_uri, + backwards_compat=False) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", "Full Search", "Easy Search", + "New Session", "Log off", "Form", "JavaScript"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", + "", "", "", "", "", "Welcome", "No"] + types = ["hidden", "hidden", "image", "image", "image", "image", + "hidden", "hidden"] + for i in range(len(keys)): + key = keys[i] + self.assert_(form.find_control(key).value == values[i]) + self.assert_(form.find_control(key).type == types[i]) + + pairs = form.click_pairs("Full Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), ("Full Search.x", "1"), ("Full Search.y", "1"), + ("Form", "Welcome"), ("JavaScript", "No")]) + + def testFullSearch(self): + pass # XXX + + def testGeneralSearch(self): + fh = self._get_test_file("GeneralSearch.html") + forms = mechanize.ParseFile(fh, self.base_uri, + backwards_compat=False) + self.assert_(len(forms) == 1) + form = forms[0] + + keys = ["SID", "SESSION_DIR", + "Home", "Date & Database Limits", "Cited Ref Search", + "Log off", "Search", + "topic", "titleonly", "author", "journal", "address", + "Search", "Save query", "Clear", + "languagetype", "doctype", "Sort", + "Form", "Func"] + values = ["PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0", "", + "", "", "", "", "", + "", [], "", "", "", + "", "", "", + ["All languages"], ["All document types"], ["Latest date"], + "General", "Search"] + types = ["hidden", "hidden", + "image", "image", "image", "image", "image", + "text", "checkbox", "text", "text", "text", + "image", "image", "image", + "select", "select", "select", + "hidden", "hidden"] + fc = form.find_control + for i in range(len(keys)): + name = keys[i] + type = types[i] + self.assertEqual(fc(name, nr=0).value, form.get_value(name, nr=0)) + self.assertEqual(fc(name, nr=0).value, values[i]) + 
self.assertEqual(fc(name, nr=0).type, type) + self.assertEqual(fc(name, type, nr=0).name, name) + self.assert_(fc(type="hidden", nr=0).name == "SID") + self.assert_(fc(type="image", nr=0).name == "Home") + self.assert_(fc(nr=6).name == "Search") + self.assertRaises(ControlNotFoundError, fc, nr=50) + self.assertRaises(ValueError, fc, nr=-1) + self.assert_(fc("Search", "image", nr=0).name == "Search") + self.assertRaises(ControlNotFoundError, fc, "Search", "hidden") + s0 = fc("Search", "image", nr=0) + s0b = fc("Search", "image", nr=0) + s1 = fc("Search", "image", nr=1) + self.assert_(s0.name == s1.name == "Search") + self.assert_(s0 is s0b) + self.assert_(s0 is not s1) + self.assertRaises(ControlNotFoundError, fc, "Search", "image", nr=2) + self.assert_(fc(type="text", nr=2).name == "journal") + self.assert_(fc("Search", nr=0) is not fc("Search", nr=1)) + + form["topic"] = "foo" + self.assert_(form["topic"] == "foo") + form["author"] = "bar" + form["journal"] = "" + form["address"] = "baz" + form["languagetype"] = ["English", "Catalan"] + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form["titleonly"] = ["on"] + self.assert_(form["titleonly"] == ["on"]) + pairs = form.click_pairs("Search") + self.assert_(pairs == [ + ("SID", "PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"), + ("SESSION_DIR", ""), + ("Search.x", "1"), ("Search.y", "1"), + ("topic", "foo"), + ("titleonly", "on"), + ("author", "bar"), + ("journal", ""), ("address", "baz"), + ("languagetype", "English"), ("languagetype", "Catalan"), + ("doctype", "All document types"), ("Sort", "Latest date"), + ("Form", "General"), ("Func", "Search")]) + + hide_deprecations() + pvs = form.possible_items("languagetype") + self.assert_(pvs[0] == "All languages") + self.assert_(len(pvs) == 47) + + self.assertRaises( + ItemNotFoundError, + lambda form=form: form.toggle("d'oh", "languagetype")) + form.toggle("English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + 
self.assertRaises(TypeError, form.toggle, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.toggle, "Catalan", ["languagetype"]) + + # XXX type, nr, by_label args + + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "SID") + + # multiple select + form["languagetype"] = [] + self.assert_(form["languagetype"] == []) + form.set(True, "Catalan", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(True, "English", "languagetype") + self.assert_(form["languagetype"] == ["English", "Catalan"]) + form.set(False, "English", "languagetype") + self.assert_(form["languagetype"] == ["Catalan"]) + form.set(False, "Catalan", "languagetype") + self.assert_(form["languagetype"] == []) + self.assertRaises(ItemNotFoundError, form.set, True, "doh", "languagetype") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "languagetype") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.set, False, ["Catalan"], "languagetype") + self.assertRaises(TypeError, form.set, True, "Catalan", ["languagetype"]) + self.assertRaises(TypeError, form.set, False, "Catalan", ["languagetype"]) + + def setitem(name, value, form=form): form[name] = value + form["languagetype"] = ["Catalan"] + self.assert_(form["languagetype"] == ["Catalan"]) + self.assertRaises(ItemNotFoundError, + setitem, "languagetype", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["languagetype"], "Catalan") + + # single select + form["Sort"] = [] + self.assert_(form["Sort"] == []) + form.set(True, "Relevance", "Sort") + self.assert_(form["Sort"] == ["Relevance"]) + form.set(True, "Times Cited", "Sort") + self.assert_(form["Sort"] == ["Times Cited"]) + form.set(False, "Times Cited", "Sort") + self.assert_(form["Sort"] == []) + self.assertRaises(ItemNotFoundError, 
form.set, True, "doh", "Sort") + self.assertRaises(ItemNotFoundError, form.set, False, "doh", "Sort") + self.assertRaises(ControlNotFoundError, form.set, True, "blah", "oops") + self.assertRaises(TypeError, form.set, True, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, False, ["Relevance"], "Sort") + self.assertRaises(TypeError, form.set, True, "Relevance", ["Sort"]) + self.assertRaises(TypeError, form.set, False, "Relevance", ["Sort"]) + reset_deprecations() + + form["Sort"] = ["Relevance"] + self.assert_(form["Sort"] == ["Relevance"]) + self.assertRaises(ItemNotFoundError, + setitem, "Sort", ["doh"]) + self.assertRaises(ControlNotFoundError, setitem, "oops", ["blah"]) + self.assertRaises(TypeError, setitem, ["Sort"], ["Relevance"]) + + def testSetValueByLabelIgnoringAmbiguity(self): + # regression test: follow ClientForm 0.1 behaviour + # also test that backwards_compat argument to ParseFile works + f = StringIO("""\ +<form> + <select multiple name="form.grocery"> + <option value="bread" id="1">Loaf of Bread</option> + <option value="bread" id="2">Loaf of Bread</option> + <option value="challah">Loaf of Challah</option> + </select> + <input type="submit" value="Submit" /> +</form> +""") + for kwds, backwards_compat in [({}, True), + ({"backwards_compat": True}, True), + ({"backwards_compat": False}, False), + ]: + hide_deprecations() + form = mechanize.ParseFile(f, "http://localhost/", **kwds)[0] + reset_deprecations() + f.seek(0) + c = form.find_control("form.grocery") + #for item in c.items: + # print [label.text for label in item.get_labels()] + c.set_value_by_label( + ["Loaf of Bread", "Loaf of Bread", "Loaf of Challah"]) + if backwards_compat: + # select first item of ambiguous set + self.assertEqual( + c.get_value_by_label(), + ["Loaf of Bread", "Loaf of Challah"]) + self.assertEqual( + [item.id for item in c.items if item.selected], + ["1", None]) + # disabled items still part of 'value by label' + c.get(label="Loaf of Challah").disabled = 
True + self.assertEqual( + c.get_value_by_label(), + ["Loaf of Bread", "Loaf of Challah"]) + else: + self.assertEqual( + c.get_value_by_label(), + ["Loaf of Bread", "Loaf of Bread", "Loaf of Challah"]) + self.assertEqual( + [item.id for item in c.items if item.selected], + ["1", "2", None]) + # disabled items NOT part of 'value by label' + c.get(label="Challah").disabled = True + self.assertEqual( + c.get_value_by_label(), + ["Loaf of Bread", "Loaf of Bread"]) + + def testClearValue(self): + # regression test: follow ClientForm 0.1 behaviour + # assigning [] to value is implemented as a special case + f = StringIO("""\ +<form> + <select multiple name="s"> + <option disabled selected>a</option> + <option selected>b</option> + </select> +</form> +""") + for kwds, backwards_compat in [ + ({}, True), + ({"backwards_compat": True}, True), + ({"backwards_compat": False}, False), + ]: + hide_deprecations() + form = mechanize.ParseFile(f, "http://localhost/", **kwds)[0] + reset_deprecations() + f.seek(0) + cc = form.find_control("s") + if backwards_compat: + self.assertEqual(cc.value, ["a", "b"]) + cc.value = [] + self.assertEqual( + [ii.name for ii in cc.items if ii.selected], []) + else: + self.assertEqual(cc.value, ["b"]) + cc.value = [] + # first is disabled, so no need to deselect + self.assertEqual( + [ii.name for ii in cc.items if ii.selected], ["a"]) + + def testSearchByLabel(self): + f = StringIO("""\ +<form> +<table> + <tr> + <td><label for="form.title">Book Title</label></td> + <td><input type="text" id="form.title" name="form.title" + value="The Grapes of Wrath" /></tr> + </tr> + <tr> + <td>Quality</td> + <td> + <div> + <label><input type="radio" id="form.quality.good" name="form.quality" + value="good" /> Good</label> + </div><div> + <label><input type="radio" id="form.quality.indifferent" + name="form.quality" value="indifferent" /> + Indifferent</label> + </div><div> + <label><input type="radio" id="form.quality.bad" name="form.quality" + value="bad" /> 
Bad</label> + </div> + </td> + </tr> + <tr> + <td><label for="form.country" blah="foo">Country of Origin</label></td> + <td> + <select id="form.country" name="form.country"> + <option value="albania">Albania</option> + <optgroup label="European Union"> + <option label="GB" value="EU: Great Britain">Great Britain</option> + </optgroup> + <option value="USA">United States of America</option> + <option value="zimbabwe">Zimbabwe</option> + </select> + </td> + </tr> + <tr> + <td>Genre</label></td> + <td> + <div> + <label><input type="checkbox" id="form.genre.western" name="form.genre" + value="western" /> Western</label> + </div><div> + <label><input type="checkbox" id="form.genre.sciencefiction" + name="form.genre" value="scifi" /> + Science Fiction</label> + </div><div> + <label><input type="checkbox" id="form.genre.horror" name="form.genre" + value="horror" /> Horror</label> + </div> + </td> + </tr> + <tr> + <td><label for="form.password">Password</label></td> + <td><input type="text" id="form.password" name="form.password" + value="123" /></tr> + </tr> + <tr> + <td>In this grocery list of requested food items, mark the items you intend + to purchase: + </td> + <td> + <label><input type="checkbox" name="form.grocery" value="bread" id="1"/> + Loaf of Bread</label> | + <label><input type="checkbox" name="form.grocery" value="bread" id="2"/> + Loaf of Bread</label> | + <label><input type="checkbox" name="form.grocery" value="bread" id="3"/> + Loaf of Bread</label> | + <label><input type="checkbox" name="form.grocery" value="challah"/> + Loaf of Challah</label> | + <label><input type="checkbox" name="form.grocery" value="eggs"/> + Dozen Eggs</label> | + <label><input type="checkbox" name="form.grocery" value="milk"/> + Half-Gallon of Milk</label> | + <label><input type="checkbox" name="form.grocery" value="milk"/> + Half-Gallon of Milk</label> | + <label><input type="checkbox" name="form.grocery" value="diapers"/> + 36 30lb. 
Diapers</label> | + <label><input type="checkbox" name="form.grocery" value="diapers"/> + 36 30lb. Diapers</label> | + <label><input type="checkbox" name="form.grocery" value="diapers"/> + 36 30lb. Diapers</label> | + <label><input type="checkbox" name="form.grocery" value="diapers"/> + 36 30lb. Diapers</label> + </td> +</table> +<input type="submit" value="Submit" /> +</form> +""") + form = mechanize.ParseFile(f, "http://localhost/", + backwards_compat=False)[0] + + # basic tests + self.assertEqual(form.find_control(label="Title").value, + "The Grapes of Wrath") + self.assertEqual(form.find_control(label="Submit").value, + "Submit") + self.assertEqual( + form.find_control(label="Country").get( + label="Britain").name, "EU: Great Britain") + self.assertEqual( + form.find_control(label="Origin").get( + label="GB").name, "EU: Great Britain") + self.assertEqual(form.find_control(label="Password").value, + "123") + self.assertEqual(form.find_control(label="Title").value, + "The Grapes of Wrath") + + # Test item ambiguity, get, get_items, and set_value_by_label. + # A form can be in two states: either ignoring ambiguity or being + # careful about it. Currently, by default, a form's backwards_compat + # attribute is True, so ambiguity is ignored. For instance, notice + # that the form.grocery checkboxes include some loaves of bread and + # a loaf of challah. The code just guesses what you mean: + form.backwards_compat = True + c = form.find_control("form.grocery") + # label substring matching is turned off for compat mode + self.assertRaises(ItemNotFoundError, c.get, label="Loaf") + self.assertEqual(c.get(label="Loaf of Bread"), c.items[0]) + c.set_value_by_label(["Loaf of Bread"]) + self.assertEqual(c.get_value_by_label(), ["Loaf of Bread"]) + self.assertEqual(c.items[0].id, "1") + # However, if the form's backwards_compat attribute is False, Ambiguity + # Errors may be raised. This is generally a preferred approach, but is + # not backwards compatible. 
+ form.backwards_compat = False + self.assertRaises(mechanize.AmbiguityError, c.get, label="Loaf") + self.assertRaises( + mechanize.AmbiguityError, c.set_value_by_label, ["Loaf"]) + # If items have the same name (value), set_value_by_label will + # be happy (since it is just setting the value anyway). + c.set_value_by_label(["Loaf of Bread"]) + self.assertEqual(c.get_value_by_label(), ["Loaf of Bread"]) + c.set_value_by_label( + ["Loaf of Bread", "Loaf of Bread", "Loaf of Challah"]) + self.assertEqual( + c.get_value_by_label(), + ["Loaf of Bread", "Loaf of Bread", "Loaf of Challah"]) + # "get" will still raise an exception, though. + self.assertRaises( + mechanize.AmbiguityError, c.get, label="Loaf of Bread") + # If you want an item, you need to specify which one you want (or use + # get_items to explicitly get all of them). + self.assertEqual(c.get(label="Loaf of Bread", nr=0).selected, True) + self.assertEqual(c.get(label="Loaf of Bread", nr=1).selected, True) + self.assertEqual(c.get(label="Loaf of Bread", nr=2).selected, False) + self.assertEqual(c.get(label="Loaf of Challah").selected, True) + self.assertEqual( + [i.selected for i in c.get_items(label="Loaf of Bread")], + [True, True, False]) + self.assertEqual( + [i.selected for i in c.get_items(label="Loaf of Challah")], + [True]) + self.assertEqual( + [i.name for i in c.get_items(label="Loaf")], + ["bread", "bread", "bread", "challah"]) + self.assertEqual( + [i.get_labels()[0].text for i in c.get_items("bread")], + ["Loaf of Bread", "Loaf of Bread", "Loaf of Bread"]) + + # test deprecation + if warnings_imported: + try: + for c, f in ( + (form.find_control("form.genre"), "western"), + (form.find_control("form.country"), "zimbabwe"), + (form.find_control("form.quality"), "good")): + # warnings are nasty. 
:-( + raise_deprecations() # clear onceregistry + try: + c.possible_items() + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.toggle_single() + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.set_single(True) + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.toggle(f) + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.get_item_disabled(f) + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.set_item_disabled(True, f) + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + try: + c.get_item_attrs(True, f) + except DeprecationWarning: + pass + else: + self.fail("deprecation failed") + finally: + reset_deprecations() + + def testResults(self): + fh = self._get_test_file("Results.html") + forms = mechanize.ParseFile(fh, self.base_uri, + backwards_compat=False) + self.assert_(len(forms) == 1) + form = forms[0] + + hide_deprecations() + pvs = form.possible_items("marked_list_candidates") + reset_deprecations() + self.assert_(pvs == [ + "000174872000059/1", "000174858300003/2", "000174827900006/3"]) + def bad_setitem(form=form): + form["marked_list_candidates"] = ["blah"] + self.assertRaises(ItemNotFoundError, bad_setitem) + form["marked_list_candidates"] = [pvs[0]] + + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + keys = ["Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list", + "marked_list_candidates", + "Add marked records to list", + "Add records on page to list", + "Add all records retrieved to list" + ] + types = ["image", "image", "image", + "checkbox", + "image", "image", "image"] + values = ["", "", "", + [pvs[0]], + "", "", "", + ] + + for i in range(len(keys)): + key = keys[i] + control = form.find_control(key, nr=0) + self.assert_(control.value == values[i]) 
+ self.assert_(control.type == types[i]) + + pairs = form.click_pairs("Add all records retrieved to list") + self.assert_(pairs == [ + ("Add all records retrieved to list.x", "1"), + ("Add all records retrieved to list.y", "1"), + ("marked_list_candidates", pvs[0])]) + + def testMarkedResults(self): + fh = self._get_test_file("MarkedResults.html") + forms = mechanize.ParseFile(fh, self.base_uri, + backwards_compat=False) + self.assert_(len(forms) == 1) + form = forms[0] + + pairs = form.click_pairs() + # I've removed most of the INPUT elements from this page, and + # corrected an HTML error + self.assert_(pairs == [ + ("Add marked records to list.x", "1"), + ("Add marked records to list.y", "1"), + ("marked_list_candidates", "000174872000059/1"), + ("marked_list_candidates", "000174858300003/2"), + ("marked_list_candidates", "000174827900006/3") + ]) + + def testMarkedRecords(self): + pass # XXX + + +def make_form(html): + global_form, form = mechanize.ParseFileEx(StringIO(html), + "http://example.com/") + assert len(global_form.controls) == 0 + return form + + +def make_form_global(html): + return get1(mechanize.ParseFileEx(StringIO(html), "http://example.com/")) + + +class MoreFormTests(unittest.TestCase): + + def test_interspersed_controls(self): + # must preserve item ordering even across controls + f = StringIO("""\ +<form name="formname"> + <input type="checkbox" name="murphy" value="a"></input> + <input type="checkbox" name="woof" value="d"></input> + <input type="checkbox" name="murphy" value="b"></input> + <input type="checkbox" name="murphy" value="c"></input> + <input type="submit"></input> +</form> +""") + form = mechanize.ParseFile(f, "http://blah/", + backwards_compat=False)[0] + form["murphy"] = ["a", "b", "c"] + form["woof"] = ["d"] + self.assertEqual(form.click_pairs(), [ + ("murphy", "a"), + ("woof", "d"), + ("murphy", "b"), + ("murphy", "c"), + ]) + + form.method = "POST" + form.enctype = "multipart/form-data" + lines = [line for line in 
form.click_request_data()[1].split("\r\n") if + line != '' and not line.startswith("--")] + self.assertEqual( + lines, + ['Content-Disposition: form-data; name="murphy"', 'a', + 'Content-Disposition: form-data; name="woof"', 'd', + 'Content-Disposition: form-data; name="murphy"', 'b', + 'Content-Disposition: form-data; name="murphy"', 'c', + ] + ) + + def make_form(self): + f = StringIO("""\ +<form blah="nonsense" name="formname"> + <label><input type="checkbox" name="a" value="1" id="1a" blah="spam"></input> + One</label> + <label><input type="checkbox" name="a" value="2" blah="eggs"></input> + Two</label> + <input type="checkbox" name="a" value="3" id="3a"></input> + <label for="3a">Three</label> + + <label><input type="radio" name="b" value="1"></input> One</label> + <label><input type="radio" name="b" value="2" id="2"></input> Two</label> + <input type="radio" name="b" value="3" id="3"></input> + <label for="3">Three</label> + <label for="4"><input type="radio" name="b" value="4" id="4"></input> + Four</label> + + <select name="c" id="cselect" blah="foo"> + <option id="coption1" blah="bar">1</option> + <option selected blah="baz">2</option> + <option id="coption3">3</option> + </select> + + <select name="d" multiple> + <option value="v1">l1</option> + <option value="v2">l2</option> + <option blah="fee" rhubarb="fi" value="v3">l3</option> + </select> + + <input type="checkbox" name="e" value="1"></input> +</form> +""") + return mechanize.ParseFile(f, "http://blah/", + backwards_compat=False)[0] + + def test_value(self): + form = self.make_form() + + form.set_value(["v3"], type="select", kind="multilist") + self.assert_(form.get_value("d") == ["v3"]) + hide_deprecations() + form.set_value(["l2"], type="select", kind="multilist", by_label=True) + self.assert_(form.get_value("d", by_label=True) == ["l2"]) + + self.assert_(form.get_value( + "b", "radio", "singlelist", None, 0, False) == []) + form.set_value(["One"], "b", by_label=True) + self.assertEqual( + 
form.get_value("b", "radio", "singlelist", None, 0, False), + ["1"]) + form.set_value(["Three"], "b", by_label=True) + reset_deprecations() + self.assertEqual( + form.get_value("b", "radio", "singlelist", None, 0, False), + ["3"]) + + def test_id(self): + form = self.make_form() + + self.assert_(form.find_control("c").id == "cselect") + self.assert_(form.find_control("a").id == "1a") + self.assert_(form.find_control("b").id is None) + + self.assert_(form.find_control(id="cselect").id == "cselect") + self.assertRaises(ControlNotFoundError, form.find_control, + id="coption1") + self.assert_(form.find_control(id="1a").id == "1a") + self.assertRaises(ControlNotFoundError, form.find_control, id="1") + + def test_single(self): + form = self.make_form() + + hide_deprecations() + self.assertRaises(ItemCountError, form.set_single, True, "d") + form.set_single(True, 'e', by_label=True) + self.assertEqual(form.get_value("e"), ["1"]) + form.set_single(False, 'e', by_label=True) + self.assertEqual(form.get_value("e"), []) + form.toggle_single("e", "checkbox", "list", nr=0) + self.assert_("1" in form.get_value("e")) + form.set_single(False, "e", "checkbox", "list", nr=0) + self.assert_("1" not in form.get_value("e")) + form.set_single(True, "e", "checkbox", "list", nr=0) + self.assert_("1" in form.get_value("e")) + reset_deprecations() + + def test_possible_items(self): + form = self.make_form() + hide_deprecations() + self.assert_(form.possible_items("c") == ["1", "2", "3"]) + self.assert_(form.possible_items("d", by_label=True) == + ["l1", "l2", "l3"]) + + self.assert_(form.possible_items("a") == ["1", "2", "3"]) + self.assertEqual(form.possible_items('e', by_label=True), + [None]) + self.assertEqual(form.possible_items('a', by_label=True), + ['One', 'Two', 'Three']) + self.assertEqual(form.possible_items('b', by_label=True), + ['One', 'Two', 'Three', 'Four']) + reset_deprecations() + + def test_set_all_readonly(self): + form = self.make_form() + + form.set_all_readonly(True) 
+ for c in form.controls: + self.assert_(c.readonly) + form.set_all_readonly(False) + for c in form.controls: + self.assert_(not c.readonly) + + def test_clear_all(self): + form = self.make_form() + form.set_all_readonly(True) + self.assertRaises(AttributeError, form.clear_all) + form.set_all_readonly(False) + form.clear_all() + for c in form.controls: + self.assert_(not c.value) + + def test_clear(self): + form = self.make_form() + form.set_all_readonly(True) + self.assertRaises(AttributeError, form.clear, "b") + form.set_all_readonly(False) + form["b"] = ["1"] + self.assertEqual(form["b"], ["1"]) + form.clear("b") + self.assertEqual(form["b"], []) + + def test_attrs(self): + form = self.make_form() + + self.assert_(form.attrs["blah"] == "nonsense") + self.assert_(form.attrs["name"] == "formname") + + a = form.find_control("a") + self.assertRaises(AttributeError, getattr, a, 'attrs') + hide_deprecations() + self.assert_(a.get_item_attrs("1")["blah"] == "spam") + self.assert_(a.get_item_attrs("2")["blah"] == "eggs") + self.assert_(not a.get_item_attrs("3").has_key("blah")) + + c = form.find_control("c") + self.assert_(c.attrs["blah"] == "foo") + self.assert_(c.get_item_attrs("1")["blah"] == "bar") + self.assert_(c.get_item_attrs("2")["blah"] == "baz") + self.assert_(not c.get_item_attrs("3").has_key("blah")) + reset_deprecations() + + def test_select_control_nr_and_label(self): + for compat in [False, True]: + self._test_select_control_nr_and_label(compat) + def _test_select_control_nr_and_label(self, compat): + f = StringIO("""\ +<form> + <select multiple name="form.grocery"> + <option value="p" label="a" id="1">a</option> + <option value="q" label="b" id="2">a</option> + <option value="p" label="a" id="3">b</option> + </select> +</form> +""") + if compat: hide_deprecations() + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=compat)[0] + if compat: reset_deprecations() + ctl = form.find_control("form.grocery") + # ordinary case + 
self.assertEqual(ctl.get("p", nr=1).id, "3") + # nr too high + self.assertRaises(ItemNotFoundError, ctl.get, "p", nr=50) + # first having label "a" + self.assertEqual(ctl.get(label="a", nr=0).id, "1") + # second having label "a"... + item = ctl.get(label="a", nr=1) + # ...as opposed to second with label attribute "a"! -- each item + # has multiple labels accessible by .get_labels(), but only one + # label HTML-attribute + self.assertEqual(item.id, "2") + self.assertEqual(item.attrs.get("label"), "b") # ! + # third having label "a" (but only the second whose label is "a") + self.assertEqual(ctl.get(label="a", nr=1).id, "2") + # nr too high again + self.assertRaises(ItemNotFoundError, ctl.get, label="a", nr=3) + + self.assertEqual(ctl.get(id="2").id, "2") + self.assertRaises(ItemNotFoundError, ctl.get, id="4") + self.assertRaises(ItemNotFoundError, ctl.get, id="4") + + def test_label_whitespace(self): + for compat in [False, True]: + f = StringIO("""\ +<form> + <select multiple name="eg"> + <option value="p"> a b c </option> + <option value="q">b</option> + </select> +</form> +""") + if compat: + hide_deprecations() + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=compat)[0] + ctl = form.find_control("eg") + p = ctl.get("p") + q = ctl.get("q") + self.assertEqual(p.get_labels()[0].text, + (compat and "a b c" or "a b c")) + self.assertEqual(q.get_labels()[0].text, "b") + if compat: + reset_deprecations() + + def test_nameless_list_control(self): + # ListControls are built up from elements that match by name and type + # attributes. Nameless controls cause some tricky cases. We should + # get a new control for nameless controls. 
+ for data in [ + """\ +<form> + <input type="checkbox" name="foo"/> + <input type="checkbox" name="bar"/> + <input type="checkbox" id="a" onclick="bar()" checked /> +</form> +""", +"""\ +<form> + <input type="checkbox" name="foo"/> + <input type="checkbox" id="a" onclick="bar()" checked /> +</form> +""", +"""\ +<form> + <input type="checkbox"/> + <input type="checkbox"/> + <input type="checkbox" id="a" onclick="bar()" checked /> +</form> +""", + ]: + f = StringIO(data) + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + bar = form.find_control(type="checkbox", id="a") + # should have value "on", but not be successful + self.assertEqual([item.name for item in bar.items], ["on"]) + self.assertEqual(bar.value, []) + self.assertEqual(form.click_pairs(), []) + + def test_action_with_fragment(self): + for method in ["GET", "POST"]: + data = ('<form action="" method="%s">' + '<input type="submit" name="s"/></form>' % method + ) + f = StringIO(data) + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + self.assertEqual( + form.click().get_full_url(), + "http://example.com/"+(method=="GET" and "?s=" or ""), + ) + data = '<form action=""><isindex /></form>' + f = StringIO(data) + form = mechanize.ParseFile(f, "http://example.com/", + backwards_compat=False)[0] + form.find_control(type="isindex").value = "blah" + self.assertEqual(form.click(type="isindex").get_full_url(), + "http://example.com/?blah") + + def test_click_empty_form_by_label(self): + # http://github.com/jjlee/mechanize/issues#issue/16 + form = make_form_global("") + assert len(form.controls) == 0 + self.assertRaises(mechanize.ControlNotFoundError, + form.click, label="no control has this label") + + +class ContentTypeTests(unittest.TestCase): + + def test_content_type(self): + class OldStyleRequest: + def __init__(self, url, data=None, hdrs=None): + self.ah = self.auh = False + def add_header(self, key, val): + self.ah = True + class 
NewStyleRequest(OldStyleRequest): + def add_unredirected_header(self, key, val): + self.auh = True + class FakeForm(_form.HTMLForm): + def __init__(self, hdr): + self.hdr = hdr + def _request_data(self): + return "http://example.com", "", [(self.hdr, "spam")] + for request_class, hdr, auh in [ + (OldStyleRequest, "Foo", False), + (NewStyleRequest, "Foo", False), + (OldStyleRequest, "Content-type", False), + (NewStyleRequest, "Content-type", True), + ]: + form = FakeForm(hdr) + req = form._switch_click("request", request_class) + self.assertEqual(req.auh, auh) + self.assertEqual(req.ah, not auh) + + +class FunctionTests(unittest.TestCase): + + def test_normalize_line_endings(self): + def check(text, expected, self=self): + got = _form.normalize_line_endings(text) + self.assertEqual(got, expected) + + # unix + check("foo\nbar", "foo\r\nbar") + check("foo\nbar\n", "foo\r\nbar\r\n") + # mac + check("foo\rbar", "foo\r\nbar") + check("foo\rbar\r", "foo\r\nbar\r\n") + # dos + check("foo\r\nbar", "foo\r\nbar") + check("foo\r\nbar\r\n", "foo\r\nbar\r\n") + + # inconsistent -- we just blithely convert anything that looks like a + # line ending to the DOS convention, following Firefox's behaviour when + # normalizing textarea content + check("foo\r\nbar\nbaz\rblah\r\n", "foo\r\nbar\r\nbaz\r\nblah\r\n") + + # pathological ;-O + check("\r\n\n\r\r\r\n", "\r\n"*5) + + +class CaseInsensitiveDict: + + def __init__(self, items): + self._dict = {} + for key, val in items: + self._dict[string.lower(key)] = val + + def __getitem__(self, key): return self._dict[key] + + def __getattr__(self, name): return getattr(self._dict, name) + + +class UploadTests(_testcase.TestCase): + + def test_choose_boundary(self): + bndy = _form.choose_boundary() + ii = string.find(bndy, '.') + self.assert_(ii < 0) + + def make_form(self): + html = """\ +<form action="/cgi-bin/upload.cgi" method="POST" enctype="multipart/form-data"> +<input type="file" name="data"> +<input type="text" name="user" 
value="nobody"> +<br> +<input type="submit"> +</form> +""" + + return mechanize.ParseFile(StringIO(html), + "http://localhost/cgi-bin/upload.cgi", + backwards_compat=False)[0] + + def test_file_request(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data)) + #print "data_control._upload_data", data_control._upload_data + req = form.click() + self.assertTrue(get_header(req, "Content-type").startswith( + "multipart/form-data; boundary=")) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(header_items(req)), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assertEquals(fs["data"].filename, "") + + def test_file_request_with_filename(self): + import cgi + + # fill in a file upload form... + form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="afilename") + req = form.click() + self.assert_(get_header(req, "Content-type").startswith( + "multipart/form-data; boundary=")) + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(header_items(req)), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + self.assert_(fs["data"].value == data) + self.assert_(fs["data"].filename == "afilename") + + def test_multipart_file_request(self): + import cgi + + # fill in a file upload form... 
+ form = self.make_form() + form["user"] = "john" + data_control = form.find_control("data") + data = "blah\nbaz\n" + data_control.add_file(StringIO(data), filename="filenamea") + more_data = "rhubarb\nrhubarb\n" + data_control.add_file(StringIO(more_data)) + yet_more_data = "rheum\nrhaponicum\n" + data_control.add_file(StringIO(yet_more_data), filename="filenamec") + req = form.click() + self.assertTrue(get_header(req, "Content-type").startswith( + "multipart/form-data; boundary=")) + + #print "req.get_data()\n>>%s<<" % req.get_data() + + # ...and check the resulting request is understood by cgi module + fs = cgi.FieldStorage(StringIO(req.get_data()), + CaseInsensitiveDict(header_items(req)), + environ={"REQUEST_METHOD": "POST"}) + self.assert_(fs["user"].value == "john") + + fss = fs["data"][None] + filenames = "filenamea", "", "filenamec" + datas = data, more_data, yet_more_data + for i in range(len(fss)): + fs = fss[i] + filename = filenames[i] + data = datas[i] + self.assert_(fs.filename == filename) + self.assert_(fs.value == data) + + def test_upload_data(self): + form = self.make_form() + data = form.click().get_data() + self.assertTrue(data.startswith("--")) + + def test_empty_upload(self): + # no controls except for INPUT/SUBMIT + forms = mechanize.ParseFile(StringIO("""<html> +<form method="POST" action="./weird.html" enctype="multipart/form-data"> +<input type="submit" name="submit"></input> +</form></html>"""), ".", backwards_compat=False) + form = forms[0] + data = form.click().get_data() + lines = string.split(data, "\r\n") + self.assertTrue(lines[0].startswith("--")) + self.assertEqual(lines[1], + 'Content-Disposition: form-data; name="submit"') + self.assertEqual(lines[2], "") + self.assertEqual(lines[3], "") + self.assertTrue(lines[4].startswith("--")) + + def test_no_files(self): + # no files uploaded + self.monkey_patch(_form, "choose_boundary", lambda: "123") + forms = mechanize.ParseFileEx(StringIO("""<html> +<form method="POST" action="spam" 
enctype="multipart/form-data"> +<INPUT type="file" name="spam" /> +</form></html>"""), ".") + form = forms[1] + data = form.click().get_data() + self.assertEquals(data, """\ +--123\r +Content-Disposition: form-data; name="spam"; filename=""\r +Content-Type: application/octet-stream\r +\r +\r +--123--\r +""") + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Auth.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Auth.html new file mode 100644 index 0000000000000000000000000000000000000000..9c931ba9b26904b7d06a9e121ab9eae4c421f8cb --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Auth.html @@ -0,0 +1,79 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + +<HTML> +<HEAD> +<TITLE>Athens Authentication Point</TITLE> +<META http-equiv="Content-Type" content="text/html;charset=iso-8859-1"> +</HEAD> + +<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#000000" VLINK="#000000"> + +<TABLE BORDER="0" CELLSPACING="0" CELLPADDING="0" WIDTH=609> + <TR> + <TD ALIGN="RIGHT"> + <IMG SRC="http://wos.mimas.ac.uk/isicgi/Images/main.jpg" ALT="ISI Web of Science" BORDER="0" WIDTH=470 HEIGHT=150> + </TD> + </TR> + <TR> + <TD> + <IMG SRC="http://auth.athensams.net/images/auth_point.gif" ALT="Athens Authentication Point"> + </TD> + </TR> + <TR> + <TD> + <P> <P> + </TD> + </TR> + <TR> + <TD ALIGN="CENTER"> + <FORM METHOD=POST ACTION="/?ath_returl=%22http%3A%2F%2Ftame.mimas.ac.uk%2Fisicgi%2FWOS-login.cgi%22&ath_dspid=MIMAS.WOS"> + <TABLE ALIGN=CENTER BORDER=0 CELLPADDING=0 CELLSPACING=10 WIDTH="75%"> + <TR> + <TD ALIGN=RIGHT WIDTH="40%"> + <FONT COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Username:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=TEXT NAME="ath_uname" VALUE="" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=RIGHT> + <FONT 
COLOR="#333366" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"><B>Password:</B></FONT> + </TD> + <TD ALIGN=LEFT> + <FONT COLOR="#FFFFFF" SIZE=2 FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva"> + <INPUT TYPE=PASSWORD NAME="ath_passwd" MAXLENGTH=20> + </FONT> + </TD> + </TR> + <TR> + <TD ALIGN=CENTER COLSPAN=2> + <INPUT TYPE=IMAGE SRC="http://auth.athensams.net/images/login.gif" BORDER=0 ALT="Login" ALIGN=MIDDLE><BR> + </TD> + </TR> + </TABLE> + </FORM> + </TD> + </TR> +</TABLE> + +<TABLE WIDTH="609" BORDER="0"> + <TR> + <TD> + <FONT FACE="Verdana, Helvetica, Sans, Arial, Metabold, Geneva" SIZE=1> + Athens is a service of <a href=http://www.eduserv.ac.uk>EduServ</a> + </FONT> + <BR> + <FONT FACE="Verdana, Arial, Helvetica" SIZE=1>(c) <A HREF="http://www.athensams.net/copyright.html">Copyright</a>, EduServ. All rights reserved. February 2002</FONT> + </TD> + <TD> + <A HREF="http://www.mimas.ac.uk"><img align="right" +BORDER="0" SRC="http://wos.mimas.ac.uk/images/small_mimas2.gif" alt="MIMAS"></a> + </TD> + </TR> +</TABLE> + +</BODY> +</HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/FullSearch.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/FullSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..60dc0479c38029601c8d320ae93e11fd61179cd3 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/FullSearch.html @@ -0,0 +1,114 @@ +<HTML><HEAD><TITLE>Search -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PeriodSelect.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi NAME = "searchForm" ENCTYPE="multipart/form-data" METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG 
SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpsrch.html#Full_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> +<CENTER><STRONG><FONT SIZE=4>Full Search</FONT><BR></STRONG></CENTER><INPUT TYPE=CHECKBOX NAME="editions" VALUE="D"> +<A HREF=help/helptoc.html#sci>Science Citation Index Expanded (SCI-EXPANDED)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="S"> +<A HREF=help/helptoc.html#ssci>Social Sciences Citation Index (SSCI)--1981-present</A><BR> +<INPUT TYPE=CHECKBOX NAME="editions" VALUE="H"> +<A HREF=help/helptoc.html#ahci>Arts & Humanities Citation Index (A&HCI)--1981-present</A><BR> +<HR><INPUT TYPE=RADIO NAME="Period" VALUE="This Week" onClick="clear_years();"> +This week's update (Updated April 26, 2002)<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 2 Weeks" onClick="clear_years();"> +Latest 2 Weeks<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Latest 4 Weeks" onClick="clear_years();"> +Latest 4 Weeks<BR><INPUT TYPE=RADIO NAME="Period" CHECKED VALUE="All Years" onClick="clear_years();"> +All years<BR><INPUT TYPE=RADIO NAME="Period" VALUE="Year Selection"> +Limit search to years selected below<BR><TABLE> +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="2002" onClick="set_period(4);"> +2002 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2001" onClick="set_period(4);"> +2001 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="2000" onClick="set_period(4);"> +2000 +<INPUT TYPE=CHECKBOX 
NAME="years" VALUE="1999" onClick="set_period(4);"> +1999 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1998" onClick="set_period(4);"> +1998 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1997" onClick="set_period(4);"> +1997 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1996" onClick="set_period(4);"> +1996 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1995" onClick="set_period(4);"> +1995 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1994" onClick="set_period(4);"> +1994 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1993" onClick="set_period(4);"> +1993 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1992" onClick="set_period(4);"> +1992 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1991" onClick="set_period(4);"> +1991 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1990" onClick="set_period(4);"> +1990 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1989" onClick="set_period(4);"> +1989 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1988" onClick="set_period(4);"> +1988 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1987" onClick="set_period(4);"> +1987 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1986" onClick="set_period(4);"> +1986 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1985" onClick="set_period(4);"> +1985 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1984" onClick="set_period(4);"> +1984 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1983" onClick="set_period(4);"> +1983 + +<TR><TD><INPUT TYPE=CHECKBOX NAME="years" VALUE="1982" onClick="set_period(4);"> +1982 +<INPUT TYPE=CHECKBOX NAME="years" VALUE="1981" onClick="set_period(4);"> +1981 +</TABLE><HR><TABLE> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/gensrch.gif NAME="General Search" ALT="General Search"></TD> + + <TD> + Search for articles by subject term, author name, journal title, or author affiliation<BR></TD> + <TR> + <TD><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=Images/crsrch.gif NAME="Cited Ref Search" ALT="Cited Ref Search"></TD> + + <TD>Search for articles that cite an author or work</TD> </TR> </TABLE> + <HR> + 
<TABLE> + <TR > + <TD NOWRAP> <A HREF= http://tame.mimas.ac.uk:80/isicgi/CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=LoadQuery>Using Saved Queries:</A></TD><TD> Instructions for editing and running saved queries.</TD> + </TR> </TABLE> + + Enter full pathname of saved query (e.g., c:\myqueries\query1) or use Browse.<BR> + <TABLE> + <TR> + <TD NOWRAP> + <INPUT TYPE=file NAME=fileToUpload VALUE = "" ALT="Browse""> + </TD> + <TD> + <INPUT TYPE=SUBMIT NAME=Func VALUE="Open Query" ALT="Open Query"> + </TD> + </TR> + </TABLE> + <INPUT TYPE=HIDDEN NAME=Form VALUE=Full> + <HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/GeneralSearch.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/GeneralSearch.html new file mode 100644 index 0000000000000000000000000000000000000000..f5ba69fa7b46d47aad1f893b02ce8fb7a319a704 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/GeneralSearch.html @@ -0,0 +1,178 @@ +<HTML><HEAD><TITLE>General Search -- Web of Science v4.31</TITLE> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=http://tame.mimas.ac.uk:80/isicgi/CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + <TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" 
SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#General_Search"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + <CENTER><STRONG><FONT SIZE=4> + General Search</FONT><BR></STRONG></CENTER> + Enter individual search terms or phrases separated by search operators such as AND or OR then press SEARCH below.<BR> + <A href=#setlimits><FONT SIZE=+1> + Set language and document type limits and sort option.</A></FONT><BR> + <TABLE><TR> + <TD ALIGN=right HEIGHT="1" WIDTH="74"><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif NAME="Search" ALT="Search"></TD> + + <TD> + Search using terms entered below.</TD></TABLE><HR> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Basic_Index> + TOPIC:</A> Enter terms from the article title, keywords, or abstract + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#topic_search_examples> + Examples</A><BR> + <INPUT TYPE=TEXT NAME="topic" SIZE="50" VALUE=""> + <INPUT TYPE=CHECKBOX NAME="titleonly"> +Title only<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Author> + AUTHOR:</A> + Enter one or more author names as O'BRIAN C* OR OBRIAN C*<BR> + <INPUT TYPE=TEXT NAME="author" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Journal> + SOURCE TITLE:</A> + Enter journal title or copy and paste from the <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/A_fulljt.html> + 
source list</A><BR> + <INPUT TYPE=TEXT NAME="journal" SIZE="50" VALUE=""> +<P> + <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/helpgs.html#Address> + ADDRESS:</A> + Enter terms from an author's affiliation as YALE UNIV SAME HOSP (see <A HREF=http://tame.mimas.ac.uk:80/isicgi/help/adabrv.html>abbreviations list</A>)<BR> + <INPUT TYPE=TEXT NAME="address" SIZE="50" VALUE=""> +<BR> + <HR> + <TABLE> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE SRC=http://tame.mimas.ac.uk:80/isicgi/Images/search.gif ALT="Search" BORDER=0 VSPACE=0 HSPACE=1 NAME="Search"></TD> + + <TD> + Search using terms entered above.<BR></TD> <TR> + <TD ALIGN=RIGHT><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/saveq.gif ALT="Save query" NAME="Save query"></TD> + <TD> + Save the search terms for future use.<BR></TD> + <TR> + <TD ALIGN=right><INPUT TYPE=IMAGE BORDER=0 VSPACE=0 HSPACE=1 SRC=http://tame.mimas.ac.uk:80/isicgi/Images/clear.gif NAME="Clear" ALT="Clear"></TD> + <TD> + Clear all search terms entered above.</TD> + </TABLE> + <A NAME=setlimits> + <HR> + <STRONG> + SET LIMITS AND SORT OPTION</STRONG><P> + <TABLE FRAME=VOID> <TR ALIGN=LEFT VALIGN=TOP> + <TH COLSPAN=2> Restrict search to a specific language or document type: <TR ALIGN=LEFT VALIGN=TOP> +<TH COLSPAN=2> (Multiple items may be selected from lists) <TH>Sort results by: <TR ALIGN=LEFT VALIGN=TOP> +<TD> <SELECT NAME="languagetype" MULTIPLE SIZE="5"> +<OPTION VALUE="All languages" SELECTED>All languages +<OPTION VALUE="English">English +<OPTION VALUE="Afrikaans">Afrikaans +<OPTION VALUE="Arabic">Arabic +<OPTION VALUE="Bengali">Bengali +<OPTION VALUE="Bulgarian">Bulgarian +<OPTION VALUE="Byelorussian">Byelorussian +<OPTION VALUE="Catalan">Catalan +<OPTION VALUE="Chinese">Chinese +<OPTION VALUE="Croatian">Croatian +<OPTION VALUE="Czech">Czech +<OPTION VALUE="Danish">Danish +<OPTION VALUE="Dutch">Dutch +<OPTION VALUE="Estonian">Estonian +<OPTION VALUE="Finnish">Finnish +<OPTION VALUE="Flemish">Flemish 
+<OPTION VALUE="French">French +<OPTION VALUE="Gaelic">Gaelic +<OPTION VALUE="Galician">Galician +<OPTION VALUE="Georgian">Georgian +<OPTION VALUE="German">German +<OPTION VALUE="Greek">Greek +<OPTION VALUE="Hebrew">Hebrew +<OPTION VALUE="Hungarian">Hungarian +<OPTION VALUE="Icelandic">Icelandic +<OPTION VALUE="Italian">Italian +<OPTION VALUE="Japanese">Japanese +<OPTION VALUE="Korean">Korean +<OPTION VALUE="Latin">Latin +<OPTION VALUE="Macedonian">Macedonian +<OPTION VALUE="Multi-Language">Multi-Language +<OPTION VALUE="Norwegian">Norwegian +<OPTION VALUE="Persian">Persian +<OPTION VALUE="Polish">Polish +<OPTION VALUE="Portuguese">Portuguese +<OPTION VALUE="Provencal">Provencal +<OPTION VALUE="Rumanian">Rumanian +<OPTION VALUE="Russian">Russian +<OPTION VALUE="Serbian">Serbian +<OPTION VALUE="Serbo-Croatian">Serbo-Croatian +<OPTION VALUE="Slovak">Slovak +<OPTION VALUE="Slovene">Slovene +<OPTION VALUE="Spanish">Spanish +<OPTION VALUE="Swedish">Swedish +<OPTION VALUE="Turkish">Turkish +<OPTION VALUE="Ukrainian">Ukrainian +<OPTION VALUE="Welsh">Welsh +</SELECT> +<TD><SELECT NAME="doctype" MULTIPLE SIZE="5"> +<OPTION VALUE="All document types" SELECTED>All document types +<OPTION VALUE="Article">Article +<OPTION VALUE="Abstract of Published Item">Abstract of Published Item +<OPTION VALUE="Art Exhibit Review">Art Exhibit Review +<OPTION VALUE="Bibliography">Bibliography +<OPTION VALUE="Biographical-Item">Biographical-Item +<OPTION VALUE="Book Review">Book Review +<OPTION VALUE="Chronology">Chronology +<OPTION VALUE="Correction">Correction +<OPTION VALUE="Correction, Addition">Correction, Addition +<OPTION VALUE="Dance Performance Review">Dance Performance Review +<OPTION VALUE="Database Review">Database Review +<OPTION VALUE="Discussion">Discussion +<OPTION VALUE="Editorial Material">Editorial Material +<OPTION VALUE="Excerpt">Excerpt +<OPTION VALUE="Fiction, Creative Prose">Fiction, Creative Prose +<OPTION VALUE="Film Review">Film Review +<OPTION VALUE="Hardware 
Review">Hardware Review +<OPTION VALUE="Item About an Individual">Item About an Individual +<OPTION VALUE="Letter">Letter +<OPTION VALUE="Meeting Abstract">Meeting Abstract +<OPTION VALUE="Meeting-Abstract">Meeting-Abstract +<OPTION VALUE="Music Performance Review">Music Performance Review +<OPTION VALUE="Music Score">Music Score +<OPTION VALUE="Music Score Review">Music Score Review +<OPTION VALUE="News Item">News Item +<OPTION VALUE="Note">Note +<OPTION VALUE="Poetry">Poetry +<OPTION VALUE="Record Review">Record Review +<OPTION VALUE="Reprint">Reprint +<OPTION VALUE="Review">Review +<OPTION VALUE="Script">Script +<OPTION VALUE="Software Review">Software Review +<OPTION VALUE="TV Review, Radio Review">TV Review, Radio Review +<OPTION VALUE="TV Review, Radio Review, Video">TV Review, Radio Review, Video +<OPTION VALUE="Theater Review">Theater Review +</SELECT> +<TD><SELECT NAME="Sort" SIZE="5"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="Times Cited">Times Cited +<OPTION VALUE="Relevance">Relevance +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +</SELECT> +</TABLE>Back to <A HREF=#top> + top of Search</A> + page <P> + <HR><BR> + </OL> + <INPUT TYPE=HIDDEN NAME=Form VALUE=General> + <INPUT TYPE=HIDDEN NAME=Func VALUE=Search> + </FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedRecords.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedRecords.html new file mode 100644 index 0000000000000000000000000000000000000000..8fb05bd747faae312aab40dcb4e41dde6ab3d217 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedRecords.html @@ -0,0 +1,152 @@ +<HTML><HEAD><TITLE>Marked Records -- Web of Science 
v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> +<A NAME=top> +<INPUT TYPE=HIDDEN NAME="Form" VALUE="Marked_Records"> + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<TABLE CELLSPACING=0 CELLPADDING=0> +<TR ALIGN=CENTER VALIGN=CENTER> +<TD><INPUT TYPE=IMAGE BORDER=0 NAME="Home" ALT="Home" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbhome.gif"></TD><TD><a href="http://tame.mimas.ac.uk:80/isicgi/help/helpprn.html#Print_&_Export_Marked_Records"><IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/tbhelp.gif ALT="Help" BORDER=0></a></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Date & Database Limits" ALT="Date & Database Limits" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblimits.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="General Search" ALT="General Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbgsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Cited Ref Search" ALT="Cited Ref Search" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tbcrsch.gif"></TD><TD><INPUT TYPE=IMAGE BORDER=0 NAME="Log off" ALT="Log off" SRC="http://tame.mimas.ac.uk:80/isicgi/Images/tblogoff.gif"></TD></TR> +</TABLE> +<HR> + +<INPUT TYPE=HIDDEN NAME=id VALUE=6> + +<div align="center"> + <table width="650" border="0" cellspacing="0" cellpadding="0"> + <tr> + <td width="231" align="center"> + </td> + <td width="215"> + <p align="center"><br> + <STRONG><FONT SIZE=4>Marked Records</FONT></STRONG> + </td> + <td align="right"> </td> + </tr> + <tr> + <td width="231" align="center"> + <p 
align="right"><b>500</b></td> + <td width="215"> + <p align="center"> <b>Records on the marked list</b></p> + </td> + <td align="right"><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List onClick="this.href = confirmLink( 'Warning: Pressing OK will clear the marked list.', 'CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Clear+Mark+List', 'javascript:void(0)');"> +<IMG SRC="Images/clearlst.gif" ALT="Clear Marked List" BORDER="0"></A></td> + </tr> + </table> +</div> +<hr> + +<font size="+1"><b>STEP 1: Select sort and output fields for the entire marked list.</b></font> + +<div align="center"> +<table width="92%" border="1" height="124"> + <tr> + <td width="21%" valign="top" height="124"> + <div align="left"> + <p align="center"><b>Select sort option:</b> + </p> + </div> + <div align="left"> + <p> + <SELECT NAME="MarkedSort" SIZE="4"> +<OPTION VALUE="Latest date" SELECTED>Latest date +<OPTION VALUE="First author">First author +<OPTION VALUE="Source Title">Source Title +<OPTION VALUE="Times Cited">Times Cited +</SELECT> + + </p> + </div> + </td> + <td width="79%" height="124"> + + + <p align="center"><b>Select fields to include in addition to the author(s), + article title and source.</b> </p> + + + <table width="481"> + <tr> + + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=include_refs >cited references*</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=address >addresses</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=abstract >abstract</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=language >language</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=publisher >publisher information</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=ISSN >ISSN</td> + </tr> + <tr> + <td width="150"> + <INPUT TYPE=CHECKBOX NAME=doctype >document type</td> + <td width="181"> + <INPUT TYPE=CHECKBOX NAME=keywords >keywords</td> + <td width="130"> + <INPUT TYPE=CHECKBOX NAME=timescited >times cited</td> + 
</tr> + </table> + + <FONT SIZE=-1><i>*Selecting the cited references may cause the server + to time out with large numbers of records.</i></FONT> + + </td> + </tr> +</table> +</div> + +<br> + +<font size="+1"><b>STEP 2: Select action for output.</b></font><br> + +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<hr> + <BR> +<DL><DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174872000059 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/1>Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002<!000174872000059> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000174858300003 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/2>Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002<!000174858300003> +<BR><BR> +<!--snip--> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081310100003 CHECKED> Disney RHL<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/499>A troublesome sibling species complex of scuttle flies (Diptera : Phoridae) revisited</A><BR>J NAT HIST 33 (8): 1159-1216 AUG 1999<!000081310100003> +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_selected value=000081297200008 CHECKED> Rosanowski F, Eysholdt 
U<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=6/500>Medical expertise prior to voice change surgery in male-to-female transsexuals</A><BR>HNO 47 (6): 556-562 JUN 1999<!000081297200008> +<BR><BR> +</DL> +<hr> +<div align="center"> + <table width=650 height="28" cellspacing="0" cellpadding="0" border="0"> + <tr align="center"> + <td width="542"><INPUT TYPE=IMAGE SRC=Images/print.gif NAME="Format for Print" ALT="Format for Print" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/savefile.gif NAME="Save to File" ALT="Save to File" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/export.gif NAME="Export to reference software" ALT="Export to reference software" BORDER=0> <INPUT TYPE=IMAGE SRC=Images/email.gif NAME="E-Mail" ALT="E-Mail" BORDER=0> + </td> + </tr></table> +</div> +<BR>Back to <A HREF=#top>top of Marked Records</A> page<BR><BR><HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedResults.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedResults.html new file mode 100644 index 0000000000000000000000000000000000000000..cb5b2bc228579eb23e08a8b5fe20a29aa43f9eff --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/MarkedResults.html @@ -0,0 +1,97 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + 
</TD></TR> + <TR></TR> + </TABLE> + +<HR><TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', 
'1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. 
Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 CHECKED> Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 CHECKED> Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 CHECKED> Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Unmark Page" ALT="Unmark Page" SRC=Images/unmarkall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE 
ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A 
HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + <BR>1783 of 16635816 documents matched the query. (500 shown)<HR></FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Results.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Results.html new file mode 100644 index 0000000000000000000000000000000000000000..ee31c1fd8370a02969a3098b37d815f6c5be2352 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/Results.html @@ -0,0 +1,94 @@ +<HTML><HEAD><TITLE>General Search Results-Summary -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=PageSubmit.js> +</SCRIPT> + + +</HEAD> +<BODY BGCOLOR=#FFFFFF ><FORM ACTION=CIW.cgi METHOD=POST> + + <TABLE WIDTH="100%" BORDER="0" CELLSPACING="0" CELLPADDING="0"> + <TR><TD WIDTH="100%" BGCOLOR="#000000"> + <IMG SRC=http://tame.mimas.ac.uk:80/isicgi/Images/isihdr.gif BORDER=0 ALT="ISI Citation Indexes" WIDTH="620" HEIGHT="20" ALIGN="BOTTOM" NATURALSIZEFLAG="3"> + </TD></TR> + <TR></TR> + </TABLE> + +<HR> +<TABLE WIDTH=100%><TR ALIGN=CENTER><TD><STRONG><FONT SIZE=4>General Search Results--Summary</FONT></STRONG></TD></TR><TR><TD> </TD></TR></TABLE> + Topic=troublesome; DocType=All document types; Language=All languages; Databases=SCI-EXPANDED; Timespan=All Years; (sorted by latest date) + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> 
+ <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> </TD> +<TD>. . . + <TD><IMG SRC=Images/frwrd10i.gif ALT="Next 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR> + </TABLE></CENTER> + <HR><I><FONT SIZE=-1>Use the checkboxes to add individual articles to the Marked List. 
Be sure to click SUBMIT MARKS button before leaving page.</FONT></I><DL> +<DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174872000059/1 > Jeppsson U, Alex J, Pons MN, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/1 onClick="this.href="javascript:submit_page('Abstract', '1/1')";">Status and future trends of ICA in wastewater treatment - a European perspective</A><BR>WATER SCI TECHNOL 45 (4-5): 485-494 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174858300003/2 > Gregory PL, Biswas AC, Batt ME<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/2 onClick="this.href="javascript:submit_page('Abstract', '1/2')";">Musculoskeletal problems of the chest wall in athletes</A><BR>SPORTS MED 32 (4): 235-250 2002 +<BR><BR> + <DT><INPUT TYPE=CHECKBOX name=marked_list_candidates value=000174827900006/3 > Chang DW, Hussussian C, Lewin JS, et al.<DD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Abstract&doc=1/3 onClick="this.href="javascript:submit_page('Abstract', '1/3')";">Analysis of pharyngocutaneous fistula following free jejunal transfer for total laryngopharyngectomy</A><BR>PLAST RECONSTR SURG 109 (5): 1522-1527 APR 15 2002 +<BR><BR> +</DL><HR> + + <P><TABLE WIDTH="100%" BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=LEFT VALIGN=CENTER> + <TD WIDTH=230><TABLE WIDTH=230 BORDER=0><TR> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add marked records to list" ALT="Add marked records to list" SRC=Images/marksel.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add records on page to list" ALT="Add records on page to list" SRC=Images/markall.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD WIDTH=75> <INPUT TYPE=IMAGE NAME="Add all records retrieved to list" ALT="Add all records retrieved to list" SRC=Images/markall_old.gif BORDER=0 VSPACE=1 HSPACE=1 > </TD> + </TR></TABLE></TD> + <TD 
WIDTH="100%"><TABLE ALIGN=CENTER BORDER=0><TD NOWRAP><B> + Page + 1 (Articles 1 -- 10):</B></TD> + <TD WIDTH="58%"> </TD></TR></TABLE> + </TR> + </TABLE> + <CENTER> + <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0> + <TR ALIGN=CENTER VALIGN=MIDDLE> + <TD><IMG SRC=Images/first10i.gif ALT="First Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/back10i.gif ALT="Previous 10 Pages" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + <TD><IMG SRC=Images/prevpgi.gif ALT="Previous Page" BORDER=0 VSPACE=1 HSPACE=1 > </TD> + + <TD>[ <I>1</I> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/11 onClick="this.href="javascript:submit_page('PageNo', '1/11')";">2</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/21 onClick="this.href="javascript:submit_page('PageNo', '1/21')";">3</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/31 onClick="this.href="javascript:submit_page('PageNo', '1/31')";">4</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/41 onClick="this.href="javascript:submit_page('PageNo', '1/41')";">5</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/51 onClick="this.href="javascript:submit_page('PageNo', '1/51')";">6</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/61 onClick="this.href="javascript:submit_page('PageNo', '1/61')";">7</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/71 onClick="this.href="javascript:submit_page('PageNo', '1/71')";">8</A> | </TD> + <TD><A HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/81 onClick="this.href="javascript:submit_page('PageNo', '1/81')";">9</A> | </TD> + <TD><A 
HREF=CIW.cgi?PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0&Func=Summary&curr_doc=1/91 onClick="this.href="javascript:submit_page('PageNo', '1/91')";">10</A> ] </TD> + </TR> + </TABLE></CENTER> + +<BR>1783 of 16635816 documents matched the query. (500 shown)<HR> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/SearchType.html b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/SearchType.html new file mode 100644 index 0000000000000000000000000000000000000000..a895c3e0a5c95db2dd1d9e3534a414f801710391 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_data/SearchType.html @@ -0,0 +1,55 @@ +<HTML><HEAD><TITLE>Welcome -- Web of Science v4.31</TITLE> + + + +<SCRIPT LANGUAGE=JavaScript SRC=Common.js> +</SCRIPT> + + + +<SCRIPT LANGUAGE=JavaScript> +<!-- Hide script from old browsers. +function main(){ + JavaScriptTest(); +} +// End script hide. --> +</SCRIPT> + +</HEAD> +<BODY BGCOLOR=#FFFFFF onLoad="main()" ><FORM ACTION=CIW.cgi METHOD=POST> +<INPUT TYPE=HIDDEN NAME="SID" VALUE="PMrU0IJYy4MAAELSXic_E2011300_PMrU0IJYy4MAAELSXic-0"> +<INPUT TYPE=HIDDEN NAME="SESSION_DIR" VALUE=""> + + <A NAME=top></A> + <CENTER><IMG SRC=Images/main.jpg ALT="Institute for Scientific Information"></CENTER> + + <P> + <CENTER> + <TABLE BORDER=2 CELLPADDING=0> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/fullsch.gif NAME="Full Search" ALT="Full Search" BORDER=0> + <TD> + <TABLE> + <TR><TD><TD>Search by bibliographic information (topic, author, source, address) or by cited reference. 
+</TABLE> + <TR> + <TD ALIGN=CENTER><INPUT TYPE=IMAGE SRC=Images/quiksch.gif NAME="Easy Search" ALT="Easy Search" BORDER=0> + <TD><TABLE> + <TR><TD><TD>Search for a limited number of articles on a specific topic, person, or address.</TABLE> +<TR><TD ALIGN=CENTER> + <INPUT TYPE=IMAGE SRC=Images/newsession.gif NAME="New Session" ALT="New Session" BORDER=0> + <TD> + <TABLE><TR><TD><TD> +Clear all search forms and the marked list.</TABLE> <TR> + <TD ALIGN=CENTER> +<INPUT TYPE=IMAGE SRC=Images/logoff.gif NAME="Log off" ALT="Log off" BORDER=0> + <TD><TABLE> + <TR><TD> +Fully disconnect from the database and make your connection available to another user at your institution.</TD></TABLE><INPUT TYPE=HIDDEN NAME=Form Value=Welcome> +</TABLE></CENTER> +<HR> +<INPUT TYPE=HIDDEN NAME="JavaScript" VALUE="No"> +<P><CENTER><IMG SRC=Images/isilogo.gif ALT="ISI Thomson Scientific"></CENTER><P> +</FORM> +<CENTER><EM><A HREF=http://wos.isitrial.com/policy/Policy.htm><FONT SIZE=-1>Acceptable Use Policy</FONT></A></EM></CENTER><P> +<CENTER><I>Copyright © 2002 <A HREF=http://www.isinet.com>Institute for Scientific Information</A></I></CENTER></BODY></HTML> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_form_mutation.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_mutation.py new file mode 100644 index 0000000000000000000000000000000000000000..f71715f08d40ed8ad31bcc3c4ff219a7c0938071 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_form_mutation.py @@ -0,0 +1,23 @@ +import unittest +from unittest import TestCase + +import mechanize + + +def first_form(text, base_uri="http://example.com/"): + return mechanize.ParseString(text, base_uri)[0] + + +class MutationTests(TestCase): + + def test_add_textfield(self): + form = first_form('<input type="text" name="foo" value="bar" />') + more = first_form('<input type="text" name="spam" value="eggs" />') + combined = form.controls + more.controls + for control in more.controls: + control.add_to_form(form) + 
self.assertEquals(form.controls, combined) + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_forms.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_forms.doctest new file mode 100644 index 0000000000000000000000000000000000000000..1f98f4cb62e14ca81a3c636df9c099ab57142ffe --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_forms.doctest @@ -0,0 +1,59 @@ +Integration regression test for case where ClientForm handled RFC 3986 +url unparsing incorrectly (it was using "" in place of None for +fragment, due to continuing to support use of stdlib module urlparse +as well as mechanize._rfc3986). Fixed in ClientForm r33622 . + +>>> import mechanize +>>> from mechanize._response import test_response + +>>> def forms(): +... forms = [] +... for method in ["GET", "POST"]: +... data = ('<form action="" method="%s">' +... '<input type="submit" name="s"/></form>' % method +... ) +... br = mechanize.Browser() +... response = test_response(data, [("content-type", "text/html")]) +... br.set_response(response) +... br.select_form(nr=0) +... forms.append(br.form) +... return forms + +>>> getform, postform = forms() +>>> getform.click().get_full_url() +'http://example.com/?s=' +>>> postform.click().get_full_url() +'http://example.com/' + +>>> data = '<form action=""><isindex /></form>' +>>> br = mechanize.Browser() +>>> response = test_response(data, [("content-type", "text/html")]) +>>> br.set_response(response) +>>> br.select_form(nr=0) +>>> br.find_control(type="isindex").value = "blah" +>>> br.click(type="isindex").get_full_url() +'http://example.com/?blah' + + +If something (e.g. calling .forms() triggers parsing, and parsing +fails, the next attempt should not succeed! This used to happen +because the response held by LinksFactory etc was stale, since it had +already been .read(). Fixed by calling Factory.set_response() on +error. 
+ +>>> import mechanize +>>> br = mechanize.Browser() +>>> r = mechanize._response.test_html_response("""\ +... <form> +... <input type="text" name="foo" value="a"></input><!!!> +... <input type="text" name="bar" value="b"></input> +... </form> +... """) +>>> br.set_response(r) +>>> try: +... br.select_form(nr=0) +... except mechanize.ParseError: +... pass +>>> br.select_form(nr=0) # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: expected name token diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_functional.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..90dcaae5134567bcca551ef7d946ee9127f101be --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_functional.py @@ -0,0 +1,775 @@ +#!/usr/bin/env python + +# These tests access the network. python test.py runs a local test server and +# doesn't try to fetch anything over the internet, since the few tests here +# that do that are disabled by default since they have test tag "internet". 
+ +# thanks Moof (aka Giles Antonio Radford) for some of these + +import errno +import os +import socket +import subprocess +import sys +import unittest +import urllib +import urllib2 + +import mechanize +from mechanize import CookieJar, HTTPCookieProcessor, \ + HTTPHandler, HTTPRefreshProcessor, \ + HTTPEquivProcessor, HTTPRedirectHandler, \ + HTTPRedirectDebugProcessor, HTTPResponseDebugProcessor +from mechanize._rfc3986 import urljoin +from mechanize._util import hide_experimental_warnings, \ + reset_experimental_warnings, write_file +import mechanize._opener +import mechanize._rfc3986 +import mechanize._sockettimeout +import mechanize._testcase + + +#from cookielib import CookieJar +#from urllib2 import build_opener, install_opener, urlopen +#from urllib2 import HTTPCookieProcessor, HTTPHandler + +#from mechanize import CreateBSDDBCookieJar + +## import logging +## logger = logging.getLogger("mechanize") +## logger.addHandler(logging.StreamHandler(sys.stdout)) +## #logger.setLevel(logging.DEBUG) +## logger.setLevel(logging.INFO) + + +class TestCase(mechanize._testcase.TestCase): + + # testprogram sets self.no_proxies on each TestCase to request explicitly + # setting proxies so that http*_proxy environment variables are ignored + + def _configure_user_agent(self, ua): + if self.no_proxies: + ua.set_proxies({}) + + def make_browser(self): + browser = mechanize.Browser() + self._configure_user_agent(browser) + return browser + + def make_user_agent(self): + ua = mechanize.UserAgent() + self._configure_user_agent(ua) + return ua + + def build_opener(self, handlers=(), build_opener=None): + handlers += (mechanize.ProxyHandler(proxies={}),) + if build_opener is None: + build_opener = mechanize.build_opener + return build_opener(*handlers) + + def setUp(self): + mechanize._testcase.TestCase.setUp(self) + self.test_uri = urljoin(self.uri, "test_fixtures") + self.server = self.get_cached_fixture("server") + if self.no_proxies: + old_opener_m = mechanize._opener._opener 
+ old_opener_u = urllib2._opener + mechanize.install_opener(mechanize.build_opener( + mechanize.ProxyHandler(proxies={}))) + urllib2.install_opener(urllib2.build_opener( + urllib2.ProxyHandler(proxies={}))) + def revert_install(): + mechanize.install_opener(old_opener_m) + urllib2.install_opener(old_opener_u) + self.add_teardown(revert_install) + + +def sanepathname2url(path): + urlpath = urllib.pathname2url(path) + if os.name == "nt" and urlpath.startswith("///"): + urlpath = urlpath[2:] + # XXX don't ask me about the mac... + return urlpath + + +def read_file(filename): + fh = open(filename) + try: + return fh.read() + finally: + fh.close() + + +class FtpTestCase(TestCase): + + def test_ftp(self): + server = self.get_cached_fixture("ftp_server") + browser = self.make_browser() + path = self.make_temp_dir(dir_=server.root_path) + file_path = os.path.join(path, "stuff") + data = "data\nmore data" + write_file(file_path, data) + relative_path = os.path.join(os.path.basename(path), "stuff") + r = browser.open("ftp://anon@localhost:%s/%s" % + (server.port, relative_path)) + self.assertEqual(r.read(), data) + + +class SocketTimeoutTest(TestCase): + + # the timeout tests in this module aren't full functional tests: in order + # to speed things up, don't actually call .settimeout on the socket. 
XXX + # allow running the tests against a slow server with a real timeout + + def _monkey_patch_socket(self): + class Delegator(object): + def __init__(self, delegate): + self._delegate = delegate + def __getattr__(self, name): + return getattr(self._delegate, name) + + assertEquals = self.assertEquals + + class TimeoutLog(object): + AnyValue = object() + def __init__(self): + self._nr_sockets = 0 + self._timeouts = [] + self.start() + def start(self): + self._monitoring = True + def stop(self): + self._monitoring = False + def socket_created(self): + if self._monitoring: + self._nr_sockets += 1 + def settimeout_called(self, timeout): + if self._monitoring: + self._timeouts.append(timeout) + def verify(self, value=AnyValue): + if sys.version_info[:2] < (2, 6): + # per-connection timeout not supported in Python 2.5 + self.verify_default() + else: + assertEquals(len(self._timeouts), self._nr_sockets) + if value is not self.AnyValue: + for timeout in self._timeouts: + assertEquals(timeout, value) + def verify_default(self): + assertEquals(len(self._timeouts), 0) + + log = TimeoutLog() + def settimeout(timeout): + log.settimeout_called(timeout) + orig_socket = socket.socket + def make_socket(*args, **kwds): + sock = Delegator(orig_socket(*args, **kwds)) + log.socket_created() + sock.settimeout = settimeout + return sock + self.monkey_patch(socket, "socket", make_socket) + return log + + +class SimpleTests(SocketTimeoutTest): + # thanks Moof (aka Giles Antonio Radford) + + def setUp(self): + super(SimpleTests, self).setUp() + self.browser = self.make_browser() + + def test_simple(self): + self.browser.open(self.test_uri) + self.assertEqual(self.browser.title(), 'Python bits') + # relative URL + self.browser.open('/mechanize/') + self.assertEqual(self.browser.title(), 'mechanize') + + def test_basic_auth(self): + uri = urljoin(self.uri, "basic_auth") + self.assertRaises(mechanize.URLError, self.browser.open, uri) + self.browser.add_password(uri, "john", "john") + 
self.browser.open(uri) + self.assertEqual(self.browser.title(), 'Basic Auth Protected Area') + + def test_digest_auth(self): + uri = urljoin(self.uri, "digest_auth") + self.assertRaises(mechanize.URLError, self.browser.open, uri) + self.browser.add_password(uri, "digestuser", "digestuser") + self.browser.open(uri) + self.assertEqual(self.browser.title(), 'Digest Auth Protected Area') + + def test_open_with_default_timeout(self): + timeout_log = self._monkey_patch_socket() + self.browser.open(self.test_uri) + self.assertEqual(self.browser.title(), 'Python bits') + timeout_log.verify_default() + + def test_open_with_timeout(self): + timeout_log = self._monkey_patch_socket() + timeout = 10. + self.browser.open(self.test_uri, timeout=timeout) + self.assertEqual(self.browser.title(), 'Python bits') + timeout_log.verify(timeout) + + def test_urlopen_with_default_timeout(self): + timeout_log = self._monkey_patch_socket() + response = mechanize.urlopen(self.test_uri) + self.assert_contains(response.read(), "Python bits") + timeout_log.verify_default() + + def test_urlopen_with_timeout(self): + timeout_log = self._monkey_patch_socket() + timeout = 10. + response = mechanize.urlopen(self.test_uri, timeout=timeout) + self.assert_contains(response.read(), "Python bits") + timeout_log.verify(timeout) + + def test_redirect_with_timeout(self): + timeout_log = self._monkey_patch_socket() + timeout = 10. 
+ # 301 redirect due to missing final '/' + req = mechanize.Request(urljoin(self.test_uri, "test_fixtures"), + timeout=timeout) + r = self.browser.open(req) + self.assert_("GeneralFAQ.html" in r.read(2048)) + timeout_log.verify(timeout) + + def test_302_and_404(self): + # the combination of 302 and 404 (/redirected is configured to redirect + # to a non-existent URL /nonexistent) has caused problems in the past + # due to accidental double-wrapping of the error response + self.assertRaises( + mechanize.HTTPError, + self.browser.open, urljoin(self.uri, "/redirected"), + ) + + def test_reread(self): + # closing response shouldn't stop methods working (this happens also to + # be true for e.g. mechanize.OpenerDirector when mechanize's own + # handlers are in use, but is guaranteed to be true for + # mechanize.Browser) + r = self.browser.open(self.uri) + data = r.read() + r.close() + r.seek(0) + self.assertEqual(r.read(), data) + self.assertEqual(self.browser.response().read(), data) + + def test_error_recovery(self): + self.assertRaises(mechanize.URLError, self.browser.open, + 'file:///c|thisnoexistyiufheiurgbueirgbue') + self.browser.open(self.test_uri) + self.assertEqual(self.browser.title(), 'Python bits') + + def test_redirect(self): + # 301 redirect due to missing final '/' + codes = [] + class ObservingHandler(mechanize.BaseHandler): + def http_response(self, request, response): + codes.append(response.code) + return response + self.browser.add_handler(ObservingHandler()) + r = self.browser.open(urljoin(self.uri, "test_fixtures")) + self.assertEqual(r.code, 200) + self.assertTrue(301 in codes) + self.assert_("GeneralFAQ.html" in r.read(2048)) + + def test_refresh(self): + def refresh_request(seconds): + uri = urljoin(self.uri, "/cgi-bin/cookietest.cgi") + val = urllib.quote_plus('%d; url="%s"' % (seconds, self.uri)) + return uri + ("?refresh=%s" % val) + self.browser.set_handle_refresh(True, honor_time=False) + r = self.browser.open(refresh_request(5)) + 
self.assertEqual(r.geturl(), self.uri) + # Set a maximum refresh time of 30 seconds (these long refreshes tend + # to be there only because the website owner wants you to see the + # latest news, or whatever -- they're not essential to the operation of + # the site, and not really useful or appropriate when scraping). + refresh_uri = refresh_request(60) + self.browser.set_handle_refresh(True, max_time=30., honor_time=True) + r = self.browser.open(refresh_uri) + self.assertEqual(r.geturl(), refresh_uri) + # allow long refreshes (but don't actually wait 60 seconds) + self.browser.set_handle_refresh(True, max_time=None, honor_time=False) + r = self.browser.open(refresh_request(60)) + self.assertEqual(r.geturl(), self.uri) + + def test_file_url(self): + url = "file://%s" % sanepathname2url( + os.path.abspath(os.path.join("test", "test_functional.py"))) + r = self.browser.open(url) + self.assert_("this string appears in this file ;-)" in r.read()) + + def test_open_local_file(self): + # Since the file: URL scheme is not well standardised, Browser has a + # special method to open files by name, for convenience: + path = os.path.join("test", "test_functional.py") + response = self.browser.open_local_file(path) + self.assertIn("this string appears in this file ;-)", + response.get_data()) + + def test_open_novisit(self): + def test_state(br): + self.assert_(br.request is None) + self.assert_(br.response() is None) + self.assertRaises(mechanize.BrowserStateError, br.back) + test_state(self.browser) + uri = urljoin(self.uri, "test_fixtures") + # note this involves a redirect, which should itself be non-visiting + r = self.browser.open_novisit(uri) + test_state(self.browser) + self.assert_("GeneralFAQ.html" in r.read(2048)) + + # Request argument instead of URL + r = self.browser.open_novisit(mechanize.Request(uri)) + test_state(self.browser) + self.assert_("GeneralFAQ.html" in r.read(2048)) + + def test_non_seekable(self): + # check everything still works without 
response_seek_wrapper and + # the .seek() method on response objects + ua = self.make_user_agent() + ua.set_seekable_responses(False) + ua.set_handle_equiv(False) + response = ua.open(self.test_uri) + self.failIf(hasattr(response, "seek")) + data = response.read() + self.assert_("Python bits" in data) + + +class ResponseTests(TestCase): + + def test_seek(self): + br = self.make_browser() + r = br.open(self.uri) + html = r.read() + r.seek(0) + self.assertEqual(r.read(), html) + + def test_seekable_response_opener(self): + build_opener = mechanize.OpenerFactory( + mechanize.SeekableResponseOpener).build_opener + opener = self.build_opener(build_opener=build_opener) + r = opener.open(urljoin(self.uri, "test_fixtures/cctest2.txt")) + r.read() + r.seek(0) + self.assertEqual(r.read(), + r.get_data(), + "Hello ClientCookie functional test suite.\n") + + def test_seek_wrapper_class_name(self): + opener = self.make_user_agent() + opener.set_seekable_responses(True) + try: + opener.open(urljoin(self.uri, "nonexistent")) + except mechanize.HTTPError, exc: + self.assert_("HTTPError instance" in repr(exc)) + + def test_no_seek(self): + # should be possible to turn off UserAgent's .seek() functionality + def check_no_seek(opener): + r = opener.open(urljoin(self.uri, "test_fixtures/cctest2.txt")) + self.assert_(not hasattr(r, "seek")) + try: + opener.open(urljoin(self.uri, "nonexistent")) + except mechanize.HTTPError, exc: + self.assert_(not hasattr(exc, "seek")) + + # mechanize.UserAgent + opener = self.make_user_agent() + opener.set_handle_equiv(False) + opener.set_seekable_responses(False) + opener.set_debug_http(False) + check_no_seek(opener) + + # mechanize.OpenerDirector + opener = self.build_opener() + check_no_seek(opener) + + def test_consistent_seek(self): + # if we explicitly request that returned response objects have the + # .seek() method, then raised HTTPError exceptions should also have the + # .seek() method + def check(opener, excs_also): + r = 
opener.open(urljoin(self.uri, "test_fixtures/cctest2.txt")) + data = r.read() + r.seek(0) + self.assertEqual(data, r.read(), r.get_data()) + try: + opener.open(urljoin(self.uri, "nonexistent")) + except mechanize.HTTPError, exc: + data = exc.read() + if excs_also: + exc.seek(0) + self.assertEqual(data, exc.read(), exc.get_data()) + else: + self.assert_(False) + + opener = self.make_user_agent() + opener.set_debug_http(False) + + # Here, only the .set_handle_equiv() causes .seek() to be present, so + # exceptions don't necessarily support the .seek() method (and do not, + # at present). + opener.set_handle_equiv(True) + opener.set_seekable_responses(False) + check(opener, excs_also=False) + + # Here, (only) the explicit .set_seekable_responses() causes .seek() to + # be present (different mechanism from .set_handle_equiv()). Since + # there's an explicit request, ALL responses are seekable, even + # exception responses (HTTPError instances). + opener.set_handle_equiv(False) + opener.set_seekable_responses(True) + check(opener, excs_also=True) + + def test_set_response(self): + br = self.make_browser() + r = br.open(self.test_uri) + html = r.read() + self.assertEqual(br.title(), "Python bits") + + newhtml = """<html><body><a href="spam">click me</a></body></html>""" + + r.set_data(newhtml) + self.assertEqual(r.read(), newhtml) + self.assertEqual(br.response().read(), html) + br.response().set_data(newhtml) + self.assertEqual(br.response().read(), html) + self.assertEqual(list(br.links())[0].url, "http://sourceforge.net/") + + br.set_response(r) + self.assertEqual(br.response().read(), newhtml) + self.assertEqual(list(br.links())[0].url, "spam") + + def test_new_response(self): + br = self.make_browser() + data = ("<html><head><title>Test</title></head>" + "<body><p>Hello.</p></body></html>") + response = mechanize.make_response( + data, + [("Content-type", "text/html")], + "http://example.com/", + 200, + "OK") + br.set_response(response) + 
self.assertEqual(br.response().get_data(), data) + + def hidden_test_close_pickle_load(self): + print ("Test test_close_pickle_load is expected to fail unless Python " + "standard library patch http://python.org/sf/1144636 has been " + "applied") + import pickle + + b = self.make_browser() + r = b.open(urljoin(self.uri, "test_fixtures/cctest2.txt")) + r.read() + + r.close() + r.seek(0) + self.assertEqual(r.read(), + "Hello ClientCookie functional test suite.\n") + + HIGHEST_PROTOCOL = -1 + p = pickle.dumps(b, HIGHEST_PROTOCOL) + b = pickle.loads(p) + r = b.response() + r.seek(0) + self.assertEqual(r.read(), + "Hello ClientCookie functional test suite.\n") + + +class FunctionalTests(SocketTimeoutTest): + + def test_referer(self): + br = self.make_browser() + br.set_handle_refresh(True, honor_time=False) + referer = urljoin(self.uri, "test_fixtures/referertest.html") + info = urljoin(self.uri, "/cgi-bin/cookietest.cgi") + r = br.open(info) + self.assert_(referer not in r.get_data()) + + br.open(referer) + r = br.follow_link(text="Here") + self.assert_(referer in r.get_data()) + + def test_cookies(self): + # this test page depends on cookies, and an http-equiv refresh + #cj = CreateBSDDBCookieJar("/home/john/db.db") + cj = CookieJar() + handlers = [ + HTTPCookieProcessor(cj), + HTTPRefreshProcessor(max_time=None, honor_time=False), + HTTPEquivProcessor(), + + HTTPRedirectHandler(), # needed for Refresh handling in 2.4.0 +# HTTPHandler(True), +# HTTPRedirectDebugProcessor(), +# HTTPResponseDebugProcessor(), + ] + + opener = self.build_opener(handlers) + r = opener.open(urljoin(self.uri, "/cgi-bin/cookietest.cgi")) + data = r.read() + self.assert_(data.find("Your browser supports cookies!") >= 0) + self.assertEquals(len(cj), 2) + + # test response.seek() (added by HTTPEquivProcessor) + r.seek(0) + samedata = r.read() + r.close() + self.assertEquals(samedata, data) + + def test_robots(self): + plain_opener = self.build_opener( + [mechanize.HTTPRobotRulesProcessor]) + 
browser = self.make_browser() + for opener in plain_opener, browser: + opener.open(urljoin(self.uri, "robots")) + self.assertRaises( + mechanize.RobotExclusionError, + opener.open, urljoin(self.uri, "norobots")) + + def _check_retrieve(self, url, filename, headers): + from urllib import urlopen + self.assertEqual(headers.get('Content-Type'), 'text/html') + if self.no_proxies: + proxies = {} + else: + proxies = None + self.assertEqual(read_file(filename), + urlopen(url, proxies=proxies).read()) + + def test_retrieve_to_named_file(self): + url = urljoin(self.uri, "/mechanize/") + test_filename = os.path.join(self.make_temp_dir(), "python.html") + opener = self.build_opener() + verif = CallbackVerifier(self) + filename, headers = opener.retrieve(url, test_filename, verif.callback) + self.assertEqual(filename, test_filename) + self._check_retrieve(url, filename, headers) + self.assert_(os.path.isfile(filename)) + + def test_retrieve(self): + # not passing an explicit filename downloads to a temporary file + # using a Request object instead of a URL works + url = urljoin(self.uri, "/mechanize/") + opener = self.build_opener() + verif = CallbackVerifier(self) + request = mechanize.Request(url) + filename, headers = opener.retrieve(request, reporthook=verif.callback) + self.assertEquals(request.visit, False) + self._check_retrieve(url, filename, headers) + opener.close() + # closing the opener removed the temporary file + self.failIf(os.path.isfile(filename)) + + def test_urlretrieve(self): + timeout_log = self._monkey_patch_socket() + timeout = 10. 
+ url = urljoin(self.uri, "/mechanize/") + verif = CallbackVerifier(self) + filename, headers = mechanize.urlretrieve(url, + reporthook=verif.callback, + timeout=timeout) + timeout_log.stop() + self._check_retrieve(url, filename, headers) + timeout_log.verify(timeout) + + def test_reload_read_incomplete(self): + browser = self.make_browser() + r1 = browser.open(urljoin(self.uri, + "test_fixtures/mechanize_reload_test.html")) + # if we don't do anything and go straight to another page, most of the + # last page's response won't be .read()... + r2 = browser.open(urljoin(self.uri, "mechanize")) + self.assert_(len(r1.get_data()) < 4097) # we only .read() a little bit + # ...so if we then go back, .follow_link() for a link near the end (a + # few kb in, past the point that always gets read in HTML files because + # of HEAD parsing) will only work if it causes a .reload()... + r3 = browser.back() + browser.follow_link(text="near the end") + # ... good, no LinkNotFoundError, so we did reload. + # we have .read() the whole file + self.assertEqual(len(r3._seek_wrapper__cache.getvalue()), 4202) + +## def test_cacheftp(self): +## from mechanize import CacheFTPHandler, build_opener +## o = build_opener(CacheFTPHandler()) +## r = o.open("ftp://ftp.python.org/pub/www.python.org/robots.txt") +## data1 = r.read() +## r.close() +## r = o.open("ftp://ftp.python.org/pub/www.python.org/2.3.2/announce.txt") +## data2 = r.read() +## r.close() +## self.assert_(data1 != data2) + + +class CommandFailedError(Exception): + + def __init__(self, message, rc): + Exception.__init__(self, message) + self.rc = rc + + +def get_cmd_stdout(args, **kwargs): + process = subprocess.Popen(args, stdout=subprocess.PIPE, **kwargs) + stdout, stderr = process.communicate() + rc = process.returncode + if rc != 0: + raise CommandFailedError( + "Command failed with return code %i: %s:\n%s" % + (rc, args, stderr), rc) + else: + return stdout + + +class ExamplesTests(TestCase): + + tags = "internet" + + def 
check_download_script(self, name): + python = sys.executable + parent_dir = os.path.dirname(os.path.dirname( + os.path.abspath(__file__))) + temp_dir = self.make_temp_dir() + get_cmd_stdout( + [python, os.path.join(parent_dir, "examples", name)], + cwd=temp_dir) + [tarball] = os.listdir(temp_dir) + self.assertTrue(tarball.endswith(".tar.gz")) + + def test_hack21(self): + self.check_download_script("hack21.py") + + def test_pypi(self): + self.check_download_script("pypi.py") + + +def add_to_path(env, name, value): + old = env.get(name) + if old is not None and old != "": + value = old + ":" + value + env[name] = value + + +class FormsExamplesTests(mechanize._testcase.GoldenTestCase): + + def check_forms_example(self, name, golden_path, fixup): + self.get_cached_fixture("server") + python = sys.executable + this_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(this_dir) + forms_examples_dir = os.path.join(parent_dir, "examples", "forms") + output_dir = self.make_temp_dir() + env = os.environ.copy() + add_to_path(env, "PYTHONPATH", parent_dir) + output = get_cmd_stdout([python, name, self.uri], + env=env, + cwd=forms_examples_dir) + output = fixup(output) + write_file(os.path.join(output_dir, "output"), output) + self.assert_golden(output_dir, + os.path.join(this_dir, golden_path)) + + def test_simple(self): + def fixup(output): + return output.replace("POST %s" % self.uri.rstrip("/"), + "POST http://127.0.0.1:8000") + self.check_forms_example( + "simple.py", + os.path.join("functional_tests_golden", + "FormsExamplesTests.test_simple"), + fixup) + + def test_example(self): + def fixup(output): + lines = [l for l in output.splitlines(True) if + not l.startswith("Vary:") and + not l.startswith("Server:") and + not l.startswith("Transfer-Encoding:") and + not l.startswith("Content-Length:")] + output = "".join(lines) + return output.replace(self.uri.rstrip("/"), + "http://127.0.0.1:8000") + self.check_forms_example( + "example.py", + 
os.path.join("functional_tests_golden", + "FormsExamplesTests.test_example"), + fixup) + + +class CookieJarTests(TestCase): + + def _test_cookiejar(self, make_cookiejar, commit): + cookiejar = make_cookiejar() + br = self.make_browser() + #br.set_debug_http(True) + br.set_cookiejar(cookiejar) + br.set_handle_refresh(False) + url = urljoin(self.uri, "/cgi-bin/cookietest.cgi") + # no cookie was set on the first request + html = br.open(url).read() + self.assertEquals(html.find("Your browser supports cookies!"), -1) + self.assertEquals(len(cookiejar), 2) + # ... but now we have the cookie + html = br.open(url).read() + self.assertIn("Your browser supports cookies!", html) + self.assertIn("Received session cookie", html) + commit(cookiejar) + + # should still have the cookie when we load afresh + cookiejar = make_cookiejar() + br.set_cookiejar(cookiejar) + html = br.open(url).read() + self.assertIn("Your browser supports cookies!", html) + self.assertNotIn("Received session cookie", html) + + def test_mozilla_cookiejar(self): + filename = os.path.join(self.make_temp_dir(), "cookies.txt") + def make_cookiejar(): + cj = mechanize.MozillaCookieJar(filename=filename) + try: + cj.revert() + except IOError, exc: + if exc.errno != errno.ENOENT: + raise + return cj + def commit(cj): + cj.save() + self._test_cookiejar(make_cookiejar, commit) + + def test_firefox3_cookiejar(self): + try: + mechanize.Firefox3CookieJar + except AttributeError: + # firefox 3 cookiejar is only supported in Python 2.5 and later; + # also, sqlite3 must be available + raise unittest.SkipTest() + + filename = os.path.join(self.make_temp_dir(), "cookies.sqlite") + def make_cookiejar(): + hide_experimental_warnings() + try: + return mechanize.Firefox3CookieJar(filename=filename) + finally: + reset_experimental_warnings() + def commit(cj): + pass + self._test_cookiejar(make_cookiejar, commit) + + +class CallbackVerifier: + # for .test_urlretrieve() + def __init__(self, testcase): + self._count = 0 + 
self._testcase = testcase + def callback(self, block_nr, block_size, total_size): + self._testcase.assertEqual(block_nr, self._count) + self._count = self._count + 1 + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_headers.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_headers.py new file mode 100644 index 0000000000000000000000000000000000000000..91c8cc8565d1dd3593375715112b36f764cf45d7 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_headers.py @@ -0,0 +1,146 @@ +"""Tests for ClientCookie._HeadersUtil.""" + +import mechanize._headersutil +from mechanize._testcase import TestCase + + +class IsHtmlTests(TestCase): + + def test_is_html(self): + def check(headers, extension, is_html): + url = "http://example.com/foo" + extension + self.assertEqual( + mechanize._headersutil.is_html(headers, url, allow_xhtml), + is_html) + for allow_xhtml in False, True: + check(["text/html"], ".html", True), + check(["text/html", "text/plain"], ".html", True) + # Content-type takes priority over file extension from URL + check(["text/html"], ".txt", True) + check(["text/plain"], ".html", False) + # use extension if no Content-Type + check([], ".html", True) + check([], ".gif", False) + # don't regard XHTML as HTML (unless user explicitly asks for it), + # since we don't yet handle XML properly + check([], ".xhtml", allow_xhtml) + check(["text/xhtml"], ".xhtml", allow_xhtml) + # header with empty value + check([""], ".txt", False) + + +class HeaderTests(TestCase): + + def test_parse_ns_headers_expires(self): + from mechanize._headersutil import parse_ns_headers + + # quotes should be stripped + assert parse_ns_headers(['foo=bar; expires=01 Jan 2040 22:23:32 GMT']) == \ + [[('foo', 'bar'), ('expires', 2209069412L), ('version', '0')]] + assert parse_ns_headers(['foo=bar; expires="01 Jan 2040 22:23:32 GMT"']) == \ + [[('foo', 'bar'), ('expires', 2209069412L), ('version', '0')]] + + def test_parse_ns_headers_version(self): + 
from mechanize._headersutil import parse_ns_headers + + # quotes should be stripped + expected = [[('foo', 'bar'), ('version', '1')]] + for hdr in [ + 'foo=bar; version="1"', + 'foo=bar; Version="1"', + ]: + self.assertEquals(parse_ns_headers([hdr]), expected) + + def test_parse_ns_headers_special_names(self): + # names such as 'expires' are not special in first name=value pair + # of Set-Cookie: header + from mechanize._headersutil import parse_ns_headers + + # Cookie with name 'expires' + hdr = 'expires=01 Jan 2040 22:23:32 GMT' + expected = [[("expires", "01 Jan 2040 22:23:32 GMT"), ("version", "0")]] + self.assertEquals(parse_ns_headers([hdr]), expected) + + def test_join_header_words(self): + from mechanize._headersutil import join_header_words + + assert join_header_words([[ + ("foo", None), ("bar", "baz"), (None, "value") + ]]) == "foo; bar=baz; value" + + assert join_header_words([[]]) == "" + + def test_split_header_words(self): + from mechanize._headersutil import split_header_words + + tests = [ + ("foo", [[("foo", None)]]), + ("foo=bar", [[("foo", "bar")]]), + (" foo ", [[("foo", None)]]), + (" foo= ", [[("foo", "")]]), + (" foo=", [[("foo", "")]]), + (" foo= ; ", [[("foo", "")]]), + (" foo= ; bar= baz ", [[("foo", ""), ("bar", "baz")]]), + ("foo=bar bar=baz", [[("foo", "bar"), ("bar", "baz")]]), + # doesn't really matter if this next fails, but it works ATM + ("foo= bar=baz", [[("foo", "bar=baz")]]), + ("foo=bar;bar=baz", [[("foo", "bar"), ("bar", "baz")]]), + ('foo bar baz', [[("foo", None), ("bar", None), ("baz", None)]]), + ("a, b, c", [[("a", None)], [("b", None)], [("c", None)]]), + (r'foo; bar=baz, spam=, foo="\,\;\"", bar= ', + [[("foo", None), ("bar", "baz")], + [("spam", "")], [("foo", ',;"')], [("bar", "")]]), + ] + + for arg, expect in tests: + try: + result = split_header_words([arg]) + except: + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + result = "(error -- traceback follows)\n\n%s" % f.getvalue() 
+ assert result == expect, """ +When parsing: '%s' +Expected: '%s' +Got: '%s' +""" % (arg, expect, result) + + def test_roundtrip(self): + from mechanize._headersutil import split_header_words, join_header_words + + tests = [ + ("foo", "foo"), + ("foo=bar", "foo=bar"), + (" foo ", "foo"), + ("foo=", 'foo=""'), + ("foo=bar bar=baz", "foo=bar; bar=baz"), + ("foo=bar;bar=baz", "foo=bar; bar=baz"), + ('foo bar baz', "foo; bar; baz"), + (r'foo="\"" bar="\\"', r'foo="\""; bar="\\"'), + ('foo,,,bar', 'foo, bar'), + ('foo=bar,bar=baz', 'foo=bar, bar=baz'), + + ('text/html; charset=iso-8859-1', + 'text/html; charset="iso-8859-1"'), + + ('foo="bar"; port="80,81"; discard, bar=baz', + 'foo=bar; port="80,81"; discard, bar=baz'), + + (r'Basic realm="\"foo\\\\bar\""', + r'Basic; realm="\"foo\\\\bar\""') + ] + + for arg, expect in tests: + input = split_header_words([arg]) + res = join_header_words(input) + assert res == expect, """ +When parsing: '%s' +Expected: '%s' +Got: '%s' +Input was: '%s'""" % (arg, expect, res, input) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_history.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_history.doctest new file mode 100644 index 0000000000000000000000000000000000000000..cc59e5eeb1ae21f7e220454e5410acdb0c1264db --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_history.doctest @@ -0,0 +1,12 @@ +>>> from mechanize import History + +If nothing has been added, .close should work. + +>>> history = History() +>>> history.close() + +Under some circumstances response can be None, in that case +this method should not raise an exception. 
+ +>>> history.add(None, None) +>>> history.close() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_html.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_html.doctest new file mode 100644 index 0000000000000000000000000000000000000000..cd76943582157f24916a8ac404ec603d68bad6c9 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_html.doctest @@ -0,0 +1,259 @@ +>>> import mechanize +>>> from mechanize._response import test_html_response +>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \ +... MechanizeBs, \ +... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory + +mechanize.ParseError should be raised on parsing erroneous HTML. + +For backwards compatibility, mechanize.ParseError derives from +exception classes that mechanize used to raise, prior to version +0.1.6. + +>>> import sgmllib +>>> import HTMLParser +>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError) +True +>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError) +True + +>>> def create_response(error=True): +... extra = "" +... if error: +... extra = "<!!!>" +... html = """\ +... <html> +... <head> +... <title>Title</title> +... %s +... </head> +... <body> +... <p>Hello world +... </body> +... </html> +... """ % extra +... return test_html_response(html) + +>>> f = LinksFactory() +>>> f.set_response(create_response(), "http://example.com", "latin-1") +>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> f = FormsFactory() +>>> f.set_response(create_response(), "latin-1") +>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> f = TitleFactory() +>>> f.set_response(create_response(), "latin-1") +>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: + + +Accessing attributes on Factory may also raise ParseError + +>>> def factory_getattr(attr_name): +... 
fact = mechanize.DefaultFactory() +... fact.set_response(create_response()) +... getattr(fact, attr_name) +>>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: + + +BeautifulSoup ParseErrors: + +XXX If I could come up with examples that break links and forms +parsing, I'd uncomment these! + +>>> def create_soup(html): +... r = test_html_response(html) +... return MechanizeBs("latin-1", r.read()) + +#>>> f = RobustLinksFactory() +#>>> html = """\ +#... <a href="a"> +#... <frame src="b"> +#... <a href="c"> +#... <iframe src="d"> +#... </a> +#... </area> +#... </frame> +#... """ +#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1") +#>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL +#Traceback (most recent call last): +#ParseError: + +#>>> html = """\ +#... <table> +#... <tr><td> +#... <input name='broken'> +#... </td> +#... </form> +#... </tr> +#... </form> +#... """ +#>>> f = RobustFormsFactory() +#>>> f.set_response(create_response(), "latin-1") +#>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL +#Traceback (most recent call last): +#ParseError: + +#>>> f = RobustTitleFactory() +#>>> f.set_soup(create_soup(""), "latin-1") +#>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL +#Traceback (most recent call last): +#ParseError: + + + +Utility class for caching forms etc. 
+ +>>> from mechanize._html import CachingGeneratorFunction + +>>> i = [1] +>>> func = CachingGeneratorFunction(i) +>>> list(func()) +[1] +>>> list(func()) +[1] + +>>> i = [1, 2, 3] +>>> func = CachingGeneratorFunction(i) +>>> list(func()) +[1, 2, 3] + +>>> i = func() +>>> i.next() +1 +>>> i.next() +2 +>>> i.next() +3 + +>>> i = func() +>>> j = func() +>>> i.next() +1 +>>> j.next() +1 +>>> i.next() +2 +>>> j.next() +2 +>>> j.next() +3 +>>> i.next() +3 +>>> i.next() +Traceback (most recent call last): +... +StopIteration +>>> j.next() +Traceback (most recent call last): +... +StopIteration + + +Link text parsing + +>>> def get_first_link_text_bs(html): +... factory = RobustLinksFactory() +... soup = MechanizeBs("utf-8", html) +... factory.set_soup(soup, "http://example.com/", "utf-8") +... return list(factory.links())[0].text + +>>> def get_first_link_text_sgmllib(html): +... factory = LinksFactory() +... response = test_html_response(html) +... factory.set_response(response, "http://example.com/", "utf-8") +... return list(factory.links())[0].text + +Whitespace gets compressed down to single spaces. Tags are removed. + +>>> html = ("""\ +... <html><head><title>Title</title></head><body> +... <p><a href="http://example.com/">The quick\tbrown fox jumps +... over the <i><b>lazy</b></i> dog </a> +... </body></html> +... """) +>>> get_first_link_text_bs(html) +'The quick brown fox jumps over the lazy dog' +>>> get_first_link_text_sgmllib(html) +'The quick brown fox jumps over the lazy dog' + +Empty <a> links have empty link text + +>>> html = ("""\ +... <html><head><title>Title</title></head><body> +... <p><a href="http://example.com/"></a> +... </body></html> +... """) +>>> get_first_link_text_bs(html) +'' +>>> get_first_link_text_sgmllib(html) +'' + +But for backwards-compatibility, empty non-<a> links have None link text + +>>> html = ("""\ +... <html><head><title>Title</title></head><body> +... <p><frame src="http://example.com/"></frame> +... </body></html> +... 
""") +>>> print get_first_link_text_bs(html) +None +>>> print get_first_link_text_sgmllib(html) +None + + +Title parsing. We follow Firefox's behaviour with regard to child +elements (haven't tested IE). + +>>> def get_title_bs(html): +... factory = RobustTitleFactory() +... soup = MechanizeBs("utf-8", html) +... factory.set_soup(soup, "utf-8") +... return factory.title() + +>>> def get_title_sgmllib(html): +... factory = TitleFactory() +... response = test_html_response(html) +... factory.set_response(response, "utf-8") +... return factory.title() + +>>> html = ("""\ +... <html><head> +... <title>Title</title> +... </head><body><p>Blah.<p></body></html> +... """) +>>> get_title_bs(html) +'Title' +>>> get_title_sgmllib(html) +'Title' + +>>> html = ("""\ +... <html><head> +... <title> Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> +... tle && +... </title> +... </head><body><p>Blah.<p></body></html> +... """) +>>> get_title_bs(html) +'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&' +>>> get_title_sgmllib(html) +'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&' + + +No more tags after <title> used to cause an exception + +>>> html = ("""\ +... <html><head> +... 
<title>""") +>>> get_title_sgmllib(html) +'' diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_html.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_html.py new file mode 100644 index 0000000000000000000000000000000000000000..a4258b1f01948c94fa7526a945deb75ac13624f9 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_html.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +from unittest import TestCase + +import mechanize +import mechanize._form +from mechanize._response import test_html_response + + +class RegressionTests(TestCase): + + def test_close_base_tag(self): + # any document containing a </base> tag used to cause an exception + br = mechanize.Browser() + response = test_html_response("</base>") + br.set_response(response) + list(br.links()) + + def test_bad_base_tag(self): + # a document with a base tag with no href used to cause an exception + for factory in [mechanize.DefaultFactory(), mechanize.RobustFactory()]: + br = mechanize.Browser(factory=factory) + response = test_html_response( + "<BASE TARGET='_main'><a href='http://example.com/'>eg</a>") + br.set_response(response) + list(br.links()) + + def test_robust_form_parser_uses_beautifulsoup(self): + factory = mechanize.RobustFormsFactory() + self.assertIs(factory.form_parser_class, + mechanize._form.RobustFormParser) + + def test_form_parser_does_not_use_beautifulsoup(self): + factory = mechanize.FormsFactory() + self.assertIs(factory.form_parser_class, mechanize._form.FormParser) + + def _make_forms_from_bad_html(self, factory): + bad_html = "<! 
-- : -- >" + factory.set_response(test_html_response(bad_html), "utf-8") + return list(factory.forms()) + + def test_robust_form_parser_does_not_raise_on_bad_html(self): + self._make_forms_from_bad_html(mechanize.RobustFormsFactory()) + + def test_form_parser_fails_on_bad_html(self): + self.assertRaises( + mechanize.ParseError, + self._make_forms_from_bad_html, mechanize.FormsFactory()) + + +class CachingGeneratorFunctionTests(TestCase): + + def _get_simple_cgenf(self, log): + from mechanize._html import CachingGeneratorFunction + todo = [] + for ii in range(2): + def work(ii=ii): + log.append(ii) + return ii + todo.append(work) + def genf(): + for a in todo: + yield a() + return CachingGeneratorFunction(genf()) + + def test_cache(self): + log = [] + cgenf = self._get_simple_cgenf(log) + for repeat in range(2): + for ii, jj in zip(cgenf(), range(2)): + self.assertEqual(ii, jj) + self.assertEqual(log, range(2)) # work only done once + + def test_interleaved(self): + log = [] + cgenf = self._get_simple_cgenf(log) + cgen = cgenf() + self.assertEqual(cgen.next(), 0) + self.assertEqual(log, [0]) + cgen2 = cgenf() + self.assertEqual(cgen2.next(), 0) + self.assertEqual(log, [0]) + self.assertEqual(cgen.next(), 1) + self.assertEqual(log, [0, 1]) + self.assertEqual(cgen2.next(), 1) + self.assertEqual(log, [0, 1]) + self.assertRaises(StopIteration, cgen.next) + self.assertRaises(StopIteration, cgen2.next) + + +class UnescapeTests(TestCase): + + def test_unescape_charref(self): + from mechanize._html import unescape_charref + mdash_utf8 = u"\u2014".encode("utf-8") + for ref, codepoint, utf8, latin1 in [ + ("38", 38, u"&".encode("utf-8"), "&"), + ("x2014", 0x2014, mdash_utf8, "—"), + ("8212", 8212, mdash_utf8, "—"), + ]: + self.assertEqual(unescape_charref(ref, None), unichr(codepoint)) + self.assertEqual(unescape_charref(ref, 'latin-1'), latin1) + self.assertEqual(unescape_charref(ref, 'utf-8'), utf8) + + def test_unescape(self): + import htmlentitydefs + from mechanize._html 
import unescape + data = "& < — — —" + mdash_utf8 = u"\u2014".encode("utf-8") + ue = unescape(data, htmlentitydefs.name2codepoint, "utf-8") + self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue) + + for text, expect in [ + ("&a&", "&a&"), + ("a&", "a&"), + ]: + got = unescape(text, htmlentitydefs.name2codepoint, "latin-1") + self.assertEqual(got, expect) + + +class EncodingFinderTests(TestCase): + + def make_response(self, encodings): + return mechanize._response.test_response( + headers=[("Content-type", "text/html; charset=\"%s\"" % encoding) + for encoding in encodings]) + + def test_known_encoding(self): + encoding_finder = mechanize._html.EncodingFinder("default") + response = self.make_response(["utf-8"]) + self.assertEqual(encoding_finder.encoding(response), "utf-8") + + def test_unknown_encoding(self): + encoding_finder = mechanize._html.EncodingFinder("default") + response = self.make_response(["bogus"]) + self.assertEqual(encoding_finder.encoding(response), "default") + + def test_precedence(self): + encoding_finder = mechanize._html.EncodingFinder("default") + response = self.make_response(["latin-1", "utf-8"]) + self.assertEqual(encoding_finder.encoding(response), "latin-1") + + def test_fallback(self): + encoding_finder = mechanize._html.EncodingFinder("default") + response = self.make_response(["bogus", "utf-8"]) + self.assertEqual(encoding_finder.encoding(response), "utf-8") + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_import.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_import.py new file mode 100644 index 0000000000000000000000000000000000000000..b1d6220dcb0f16b9f7553ddbef53d22d298b1c63 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_import.py @@ -0,0 +1,15 @@ +import unittest + +import mechanize +from mechanize._testcase import TestCase + + +class ImportTests(TestCase): + + def test_import_all(self): + for name in mechanize.__all__: + exec "from mechanize 
import %s" % name + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.doctest new file mode 100644 index 0000000000000000000000000000000000000000..20a1278f38a165cad0822d857583e30ae8e98fbb --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.doctest @@ -0,0 +1,58 @@ +>>> import StringIO +>>> from mechanize import _opener, HTTPError + +Normal case. Response goes through hook function and is returned. + +>>> def urlopen(fullurl, data=None, timeout=None): +... print "fullurl %r" % fullurl +... print "data %r" % data +... return "response" +>>> def response_hook(response): +... print "response %r" % response +... return "processed response" +>>> _opener.wrapped_open(urlopen, response_hook, +... "http://example.com", "data" +... ) +fullurl 'http://example.com' +data 'data' +response 'response' +'processed response' + + +Raised HTTPError exceptions still go through the response hook but +the result is raised rather than returned. + +>>> def urlopen(fullurl, data=None, timeout=None): +... print "fullurl %r" % fullurl +... print "data %r" % data +... raise HTTPError( +... "http://example.com", 200, "OK", {}, StringIO.StringIO()) +>>> def response_hook(response): +... print "response class", response.__class__.__name__ +... return Exception("processed response") +>>> try: +... _opener.wrapped_open(urlopen, response_hook, +... "http://example.com", "data" +... ) +... except Exception, exc: +... print exc +fullurl 'http://example.com' +data 'data' +response class HTTPError +processed response + +Other exceptions get ignored, since they're not response objects. + +>>> def urlopen(fullurl, data=None, timeout=None): +... print "fullurl %r" % fullurl +... print "data %r" % data +... raise Exception("not caught") +>>> try: +... _opener.wrapped_open(urlopen, response_hook, +... "http://example.com", "data" +... ) +... except Exception, exc: +... 
print exc +fullurl 'http://example.com' +data 'data' +not caught diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.py new file mode 100644 index 0000000000000000000000000000000000000000..4970511858c669499d383f0fe7c1c03f853abde3 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_opener.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python + +import os +import math +import stat +import unittest + +import mechanize +import mechanize._response as _response +import mechanize._sockettimeout as _sockettimeout + + +def killfile(filename): + try: + os.remove(filename) + except OSError: + if os.name=='nt': + try: + os.chmod(filename, stat.S_IWRITE) + os.remove(filename) + except OSError: + pass + + +class CloseVerifier(object): + + def __init__(self): + self.count = 0 + + def opened(self): + self.count += 1 + + def closed(self): + self.count -= 1 + + def verify(self, assert_equals): + assert_equals(self.count, 0) + + +class ResponseCloseWrapper(object): + + def __init__(self, response, closed_callback, read): + self._response = response + self._closed_callback = closed_callback + if read is None: + self.read = response.read + else: + self.read = read + + def __getattr__(self, name): + return getattr(self._response, name) + + def close(self): + self._closed_callback() + + +class ResponseCloseVerifier(CloseVerifier): + + def __init__(self, read=None): + CloseVerifier.__init__(self) + self._read = read + + def open(self): + self.opened() + response = _response.test_response("spam") + return ResponseCloseWrapper(response, self.closed, self._read) + + +class URLOpener(mechanize.OpenerDirector): + + def __init__(self, urlopen): + self._urlopen = urlopen + + def open(self, *args, **kwds): + return self._urlopen() + + +class FakeFile(object): + + def __init__(self, closed_callback): + self._closed_callback = closed_callback + + def write(self, *args, **kwds): + pass + + def close(self): + self._closed_callback() + + +class 
FakeFilesystem(CloseVerifier): + + def open(self, path, mode="r"): + self.opened() + return FakeFile(self.closed) + + +class OpenerTests(unittest.TestCase): + + def _check_retrieve(self, urlopen): + opener = URLOpener(urlopen=urlopen) + fs = FakeFilesystem() + try: + filename, headers = opener.retrieve("http://example.com", + "dummy filename", + open=fs.open) + except mechanize.URLError: + pass + fs.verify(self.assertEquals) + + def test_retrieve_closes_on_success(self): + response_verifier = ResponseCloseVerifier() + self._check_retrieve(urlopen=response_verifier.open) + response_verifier.verify(self.assertEquals) + + def test_retrieve_closes_on_failure(self): + def fail_to_open(): + raise mechanize.URLError("dummy reason") + self._check_retrieve(fail_to_open) + + def test_retrieve_closes_on_read_failure(self): + def fail_to_read(*args, **kwds): + raise mechanize.URLError("dummy reason") + response_verifier = ResponseCloseVerifier(read=fail_to_read) + self._check_retrieve(urlopen=response_verifier.open) + response_verifier.verify(self.assertEquals) + + def test_retrieve(self): + # The .retrieve() method deals with a number of different cases. In + # each case, .read() should be called the expected number of times, the + # progress callback should be called as expected, and we should end up + # with a filename and some headers. 
+ + class Opener(mechanize.OpenerDirector): + def __init__(self, content_length=None): + mechanize.OpenerDirector.__init__(self) + self.calls = [] + self.block_size = mechanize.OpenerDirector.BLOCK_SIZE + self.nr_blocks = 2.5 + self.data = int((self.block_size/8)*self.nr_blocks)*"01234567" + self.total_size = len(self.data) + self._content_length = content_length + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + self.calls.append((fullurl, data, timeout)) + headers = [("Foo", "Bar")] + if self._content_length is not None: + if self._content_length is True: + content_length = str(len(self.data)) + else: + content_length = str(self._content_length) + headers.append(("content-length", content_length)) + return _response.test_response(self.data, headers) + + class CallbackVerifier: + def __init__(self, testcase, total_size, block_size): + self.count = 0 + self._testcase = testcase + self._total_size = total_size + self._block_size = block_size + def callback(self, block_nr, block_size, total_size): + self._testcase.assertEqual(block_nr, self.count) + self._testcase.assertEqual(block_size, self._block_size) + self._testcase.assertEqual(total_size, self._total_size) + self.count += 1 + + # ensure we start without the test file present + tfn = "mechanize_test_73940ukewrl.txt" + killfile(tfn) + + # case 1: filename supplied + op = Opener() + verif = CallbackVerifier(self, -1, op.block_size) + url = "http://example.com/" + filename, headers = op.retrieve( + url, tfn, reporthook=verif.callback) + try: + self.assertEqual(filename, tfn) + self.assertEqual(headers["foo"], 'Bar') + self.assertEqual(open(filename, "rb").read(), op.data) + self.assertEqual(len(op.calls), 1) + self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1) + op.close() + # .close()ing the opener does NOT remove non-temporary files + self.assert_(os.path.isfile(filename)) + finally: + killfile(filename) + + # case 2: no filename supplied, use a temporary file + op 
= Opener(content_length=True) + # We asked the Opener to add a content-length header to the response + # this time. Verify the total size passed to the callback is that case + # is according to the content-length (rather than -1). + verif = CallbackVerifier(self, op.total_size, op.block_size) + url = "http://example.com/" + filename, headers = op.retrieve(url, reporthook=verif.callback) + self.assertNotEqual(filename, tfn) # (some temp filename instead) + self.assertEqual(headers["foo"], 'Bar') + self.assertEqual(open(filename, "rb").read(), op.data) + self.assertEqual(len(op.calls), 1) + # .close()ing the opener removes temporary files + self.assert_(os.path.exists(filename)) + op.close() + self.failIf(os.path.exists(filename)) + self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1) + + # case 3: "file:" URL with no filename supplied + # we DON'T create a temporary file, since there's a file there already + op = Opener() + verif = CallbackVerifier(self, -1, op.block_size) + tifn = "input_for_"+tfn + try: + f = open(tifn, 'wb') + try: + f.write(op.data) + finally: + f.close() + url = "file://" + tifn + filename, headers = op.retrieve(url, reporthook=verif.callback) + self.assertEqual(filename, None) # this may change + self.assertEqual(headers["foo"], 'Bar') + self.assertEqual(open(tifn, "rb").read(), op.data) + # no .read()s took place, since we already have the disk file, + # and we weren't asked to write it to another filename + self.assertEqual(verif.count, 0) + op.close() + # .close()ing the opener does NOT remove the file! 
+ self.assert_(os.path.isfile(tifn)) + finally: + killfile(tifn) + + # case 4: "file:" URL and filename supplied + # we DO create a new file in this case + op = Opener() + verif = CallbackVerifier(self, -1, op.block_size) + tifn = "input_for_"+tfn + try: + f = open(tifn, 'wb') + try: + f.write(op.data) + finally: + f.close() + url = "file://" + tifn + try: + filename, headers = op.retrieve( + url, tfn, reporthook=verif.callback) + self.assertEqual(filename, tfn) + self.assertEqual(headers["foo"], 'Bar') + self.assertEqual(open(tifn, "rb").read(), op.data) + self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1) + op.close() + # .close()ing the opener does NOT remove non-temporary files + self.assert_(os.path.isfile(tfn)) + finally: + killfile(tfn) + finally: + killfile(tifn) + + # Content-Length mismatch with real file length gives URLError + big = 1024*32 + op = Opener(content_length=big) + verif = CallbackVerifier(self, big, op.block_size) + url = "http://example.com/" + try: + try: + op.retrieve(url, reporthook=verif.callback) + except mechanize.ContentTooShortError, exc: + filename, headers = exc.result + self.assertNotEqual(filename, tfn) + self.assertEqual(headers["foo"], 'Bar') + # We still read and wrote to disk everything available, despite + # the exception. 
+ self.assertEqual(open(filename, "rb").read(), op.data) + self.assertEqual(len(op.calls), 1) + self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1) + # cleanup should still take place + self.assert_(os.path.isfile(filename)) + op.close() + self.failIf(os.path.isfile(filename)) + else: + self.fail() + finally: + killfile(filename) + diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_password_manager.special_doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_password_manager.special_doctest new file mode 100644 index 0000000000000000000000000000000000000000..ddf5453e1617df6aa3e027562479ab5887b7e6b8 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_password_manager.special_doctest @@ -0,0 +1,148 @@ +Features common to HTTPPasswordMgr and HTTPProxyPasswordMgr +=========================================================== + +(mgr_class gets here through globs argument) + +>>> mgr = mgr_class() +>>> add = mgr.add_password + +>>> add("Some Realm", "http://example.com/", "joe", "password") +>>> add("Some Realm", "http://example.com/ni", "ni", "ni") +>>> add("c", "http://example.com/foo", "foo", "ni") +>>> add("c", "http://example.com/bar", "bar", "nini") +>>> add("b", "http://example.com/", "first", "blah") +>>> add("b", "http://example.com/", "second", "spam") +>>> add("a", "http://example.com", "1", "a") +>>> add("Some Realm", "http://c.example.com:3128", "3", "c") +>>> add("Some Realm", "d.example.com", "4", "d") +>>> add("Some Realm", "e.example.com:3128", "5", "e") + +>>> mgr.find_user_password("Some Realm", "example.com") +('joe', 'password') +>>> mgr.find_user_password("Some Realm", "http://example.com") +('joe', 'password') +>>> mgr.find_user_password("Some Realm", "http://example.com/") +('joe', 'password') +>>> mgr.find_user_password("Some Realm", "http://example.com/spam") +('joe', 'password') +>>> mgr.find_user_password("Some Realm", "http://example.com/spam/spam") +('joe', 'password') +>>> mgr.find_user_password("c", "http://example.com/foo") 
+('foo', 'ni') +>>> mgr.find_user_password("c", "http://example.com/bar") +('bar', 'nini') + +Actually, this is really undefined ATM +#Currently, we use the highest-level path where more than one match: +# +#>>> mgr.find_user_password("Some Realm", "http://example.com/ni") +#('joe', 'password') + +Use latest add_password() in case of conflict: + +>>> mgr.find_user_password("b", "http://example.com/") +('second', 'spam') + +No special relationship between a.example.com and example.com: + +>>> mgr.find_user_password("a", "http://example.com/") +('1', 'a') +>>> mgr.find_user_password("a", "http://a.example.com/") +(None, None) + +Ports: + +>>> mgr.find_user_password("Some Realm", "c.example.com") +(None, None) +>>> mgr.find_user_password("Some Realm", "c.example.com:3128") +('3', 'c') +>>> mgr.find_user_password("Some Realm", "http://c.example.com:3128") +('3', 'c') +>>> mgr.find_user_password("Some Realm", "d.example.com") +('4', 'd') +>>> mgr.find_user_password("Some Realm", "e.example.com:3128") +('5', 'e') + + +Default port tests +------------------ + +>>> mgr = mgr_class() +>>> add = mgr.add_password + +The point to note here is that we can't guess the default port if there's +no scheme. This applies to both add_password and find_user_password. 
+ +>>> add("f", "http://g.example.com:80", "10", "j") +>>> add("g", "http://h.example.com", "11", "k") +>>> add("h", "i.example.com:80", "12", "l") +>>> add("i", "j.example.com", "13", "m") +>>> mgr.find_user_password("f", "g.example.com:100") +(None, None) +>>> mgr.find_user_password("f", "g.example.com:80") +('10', 'j') +>>> mgr.find_user_password("f", "g.example.com") +(None, None) +>>> mgr.find_user_password("f", "http://g.example.com:100") +(None, None) +>>> mgr.find_user_password("f", "http://g.example.com:80") +('10', 'j') +>>> mgr.find_user_password("f", "http://g.example.com") +('10', 'j') +>>> mgr.find_user_password("g", "h.example.com") +('11', 'k') +>>> mgr.find_user_password("g", "h.example.com:80") +('11', 'k') +>>> mgr.find_user_password("g", "http://h.example.com:80") +('11', 'k') +>>> mgr.find_user_password("h", "i.example.com") +(None, None) +>>> mgr.find_user_password("h", "i.example.com:80") +('12', 'l') +>>> mgr.find_user_password("h", "http://i.example.com:80") +('12', 'l') +>>> mgr.find_user_password("i", "j.example.com") +('13', 'm') +>>> mgr.find_user_password("i", "j.example.com:80") +(None, None) +>>> mgr.find_user_password("i", "http://j.example.com") +('13', 'm') +>>> mgr.find_user_password("i", "http://j.example.com:80") +(None, None) + + +Features specific to HTTPProxyPasswordMgr +========================================= + +Default realm: + +>>> mgr = mechanize.HTTPProxyPasswordMgr() +>>> add = mgr.add_password + +>>> mgr.find_user_password("d", "f.example.com") +(None, None) +>>> add(None, "f.example.com", "6", "f") +>>> mgr.find_user_password("d", "f.example.com") +('6', 'f') + +Default host/port: + +>>> mgr.find_user_password("e", "g.example.com") +(None, None) +>>> add("e", None, "7", "g") +>>> mgr.find_user_password("e", "g.example.com") +('7', 'g') + +Default realm and host/port: + +>>> mgr.find_user_password("f", "h.example.com") +(None, None) +>>> add(None, None, "8", "h") +>>> mgr.find_user_password("f", "h.example.com") 
+('8', 'h') + +Default realm beats default host/port: + +>>> add("d", None, "9", "i") +>>> mgr.find_user_password("d", "f.example.com") +('6', 'f') diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_performance.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_performance.py new file mode 100644 index 0000000000000000000000000000000000000000..e9030c8040a50084cf496cb31c9fd3f1ff10a3cb --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_performance.py @@ -0,0 +1,104 @@ +import os +import time +import sys +import unittest + +import mechanize +from mechanize._testcase import TestCase, TempDirMaker +from mechanize._rfc3986 import urljoin + + +KB = 1024 +MB = 1024**2 +GB = 1024**3 + + +def time_it(operation): + t = time.time() + operation() + return time.time() - t + +def write_data(filename, nr_bytes): + block_size = 4096 + block = "01234567" * (block_size // 8) + fh = open(filename, "w") + try: + for i in range(nr_bytes // block_size): + fh.write(block) + finally: + fh.close() + +def time_retrieve_local_file(temp_maker, size, retrieve_fn): + temp_dir = temp_maker.make_temp_dir() + filename = os.path.join(temp_dir, "data") + write_data(filename, size) + def operation(): + retrieve_fn(urljoin("file://", filename), + os.path.join(temp_dir, "retrieved")) + return time_it(operation) + + +class PerformanceTests(TestCase): + + def test_retrieve_local_file(self): + def retrieve(url, filename): + br = mechanize.Browser() + br.retrieve(url, filename) + size = 100 * MB +# size = 1 * KB + desired_rate = 2*MB # per second + desired_time = size / float(desired_rate) + fudge_factor = 2. 
+ self.assert_less_than( + time_retrieve_local_file(self, size, retrieve), + desired_time * fudge_factor) + + +def show_plot(rows): + import matplotlib.pyplot + figure = matplotlib.pyplot.figure() + axes = figure.add_subplot(111) + axes.plot([row[0] for row in rows], [row[1] for row in rows]) + matplotlib.pyplot.show() + + +def power_2_range(start, stop): + n = start + while n <= stop: + yield n + n *= 2 + + +def performance_plot(): + def retrieve(url, filename): + br = mechanize.Browser() + br.retrieve(url, filename) + +# import urllib2 +# def retrieve(url, filename): +# urllib2.urlopen(url).read() + +# from mechanize import _useragent +# ua = _useragent.UserAgent() +# ua.set_seekable_responses(True) +# ua.set_handle_equiv(False) +# def retrieve(url, filename): +# ua.retrieve(url, filename) + + rows = [] + for size in power_2_range(256 * KB, 256 * MB): + temp_maker = TempDirMaker() + try: + elapsed = time_retrieve_local_file(temp_maker, size, retrieve) + finally: + temp_maker.tear_down() + rows.append((size//float(MB), elapsed)) + show_plot(rows) + + +if __name__ == "__main__": + args = sys.argv[1:] + if "--plot" in args: + performance_plot() + else: + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_pickle.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..900af0116137c9ecf352b8dfafd34e40833b4774 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_pickle.py @@ -0,0 +1,37 @@ +import cPickle +import cStringIO as StringIO +import pickle + +import mechanize +import mechanize._response +import mechanize._testcase + + +def pickle_and_unpickle(obj, implementation): + return implementation.loads(implementation.dumps(obj)) + + +def test_pickling(obj, check=lambda unpickled: None): + check(pickle_and_unpickle(obj, cPickle)) + check(pickle_and_unpickle(obj, pickle)) + + +class PickleTest(mechanize._testcase.TestCase): + + def test_pickle_cookie(self): + cookiejar = 
mechanize.CookieJar() + url = "http://example.com/" + request = mechanize.Request(url) + response = mechanize._response.test_response( + headers=[("Set-Cookie", "spam=eggs")], + url=url) + [cookie] = cookiejar.make_cookies(response, request) + check_equality = lambda unpickled: self.assertEqual(unpickled, cookie) + test_pickling(cookie, check_equality) + + def test_pickle_cookiejar(self): + test_pickling(mechanize.CookieJar()) + + +if __name__ == "__main__": + mechanize._testcase.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_pullparser.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_pullparser.py new file mode 100644 index 0000000000000000000000000000000000000000..27f7d4043d1c59f1a7bde573acc2ce55682de20f --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_pullparser.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python + +from unittest import TestCase + +def peek_token(p): + tok = p.get_token() + p.unget_token(tok) + return tok + + +class PullParserTests(TestCase): + from mechanize._pullparser import PullParser, TolerantPullParser + PARSERS = [(PullParser, False), (TolerantPullParser, True)] + + def data_and_file(self): + from StringIO import StringIO + data = """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<title an=attr>Title</title> +</head> +<body> +<p>This is a data <img alt="blah & a"> & that was an entityref and this a is +a charref. <blah foo="bing" blam="wallop">. 
+<!-- comment blah blah +still a comment , blah and a space at the end +--> +<!rheum> +<?rhaponicum> +<randomtag spam="eggs"/> +</body> +</html> +""" #" + f = StringIO(data) + return data, f + + def test_encoding(self): + #from mechanize import _pullparser + #for pc, tolerant in [(_pullparser.PullParser, False)]:#PullParserTests.PARSERS: + for pc, tolerant in PullParserTests.PARSERS: + self._test_encoding(pc, tolerant) + def _test_encoding(self, parser_class, tolerant): + from StringIO import StringIO + datas = ["<a>ф</a>", "<a>ф</a>"] + def get_text(data, encoding): + p = _get_parser(data, encoding) + p.get_tag("a") + return p.get_text() + def get_attr(data, encoding, et_name, attr_name): + p = _get_parser(data, encoding) + while True: + tag = p.get_tag(et_name) + attrs = tag.attrs + if attrs is not None: + break + return dict(attrs)[attr_name] + def _get_parser(data, encoding): + f = StringIO(data) + p = parser_class(f, encoding=encoding) + #print 'p._entitydefs>>%s<<' % p._entitydefs['—'] + return p + + for data in datas: + self.assertEqual(get_text(data, "KOI8-R"), "\xc6") + self.assertEqual(get_text(data, "UTF-8"), "\xd1\x84") + + self.assertEqual(get_text("<a>—</a>", "UTF-8"), + u"\u2014".encode('utf8')) + self.assertEqual( + get_attr('<a name="—">blah</a>', "UTF-8", "a", "name"), + u"\u2014".encode('utf8')) + self.assertEqual(get_text("<a>—</a>", "ascii"), "—") + +# response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url()) + def test_get_token(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_get_token(pc, tolerant) + def _test_get_token(self, parser_class, tolerant): + data, f = self.data_and_file() + p = parser_class(f) + from mechanize._pullparser import NoMoreTokensError + self.assertEqual( + p.get_token(), ("decl", +'''DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"''', None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + 
self.assertEqual(p.get_token(), ("starttag", "html", [])) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("starttag", "head", [])) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("starttag", "title", [("an", "attr")])) + self.assertEqual(p.get_token(), ("data", "Title", None)) + self.assertEqual(p.get_token(), ("endtag", "title", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("endtag", "head", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("starttag", "body", [])) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("starttag", "p", [])) + self.assertEqual(p.get_token(), ("data", "This is a data ", None)) + self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah & a")])) + self.assertEqual(p.get_token(), ("data", " ", None)) + self.assertEqual(p.get_token(), ("entityref", "amp", None)) + self.assertEqual(p.get_token(), ("data", + " that was an entityref and this ", + None)) + self.assertEqual(p.get_token(), ("charref", "097", None)) + self.assertEqual(p.get_token(), ("data", " is\na charref. 
", None)) + self.assertEqual(p.get_token(), ("starttag", "blah", + [("foo", "bing"), ("blam", "wallop")])) + self.assertEqual(p.get_token(), ("data", ".\n", None)) + self.assertEqual(p.get_token(), ( + "comment", " comment blah blah\n" + "still a comment , blah and a space at the end \n", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("decl", "rheum", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("pi", "rhaponicum", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ( + (tolerant and "starttag" or "startendtag"), "randomtag", + [("spam", "eggs")])) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("endtag", "body", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertEqual(p.get_token(), ("endtag", "html", None)) + self.assertEqual(p.get_token(), ("data", "\n", None)) + self.assertRaises(NoMoreTokensError, p.get_token) +# print "token", p.get_token() +# sys.exit() + + def test_unget_token(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_unget_token(pc, tolerant) + def _test_unget_token(self, parser_class, tolerant): + data, f = self.data_and_file() + p = parser_class(f) + p.get_token() + tok = p.get_token() + self.assertEqual(tok, ("data", "\n", None)) + p.unget_token(tok) + self.assertEqual(p.get_token(), ("data", "\n", None)) + tok = p.get_token() + self.assertEqual(tok, ("starttag", "html", [])) + p.unget_token(tok) + self.assertEqual(tok, ("starttag", "html", [])) + + def test_get_tag(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_get_tag(pc, tolerant) + def _test_get_tag(self, parser_class, tolerant): + from mechanize._pullparser import NoMoreTokensError + data, f = self.data_and_file() + p = parser_class(f) + self.assertEqual(p.get_tag(), ("starttag", "html", [])) + self.assertEqual(p.get_tag("blah", "body", 
"title"), + ("starttag", "title", [("an", "attr")])) + self.assertEqual(p.get_tag(), ("endtag", "title", None)) + self.assertEqual(p.get_tag("randomtag"), + ((tolerant and "starttag" or "startendtag"), "randomtag", + [("spam", "eggs")])) + self.assertEqual(p.get_tag(), ("endtag", "body", None)) + self.assertEqual(p.get_tag(), ("endtag", "html", None)) + self.assertRaises(NoMoreTokensError, p.get_tag) +# print "tag", p.get_tag() +# sys.exit() + + def test_get_text(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_get_text(pc, tolerant) + def _test_get_text(self, parser_class, tolerant): + from mechanize._pullparser import NoMoreTokensError + data, f = self.data_and_file() + p = parser_class(f) + self.assertEqual(p.get_text(), "\n") + self.assertEqual(peek_token(p).data, "html") + self.assertEqual(p.get_text(), "") + self.assertEqual(peek_token(p).data, "html"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "Title"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), + "This is a data blah & a[IMG]"); p.get_token() + self.assertEqual(p.get_text(), " & that was an entityref " + "and this a is\na charref. 
"); p.get_token() + self.assertEqual(p.get_text(), ".\n\n\n\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + self.assertEqual(p.get_text(), "\n"); p.get_token() + # no more tokens, so we just get empty string + self.assertEqual(p.get_text(), "") + self.assertEqual(p.get_text(), "") + self.assertRaises(NoMoreTokensError, p.get_token) + #print "text", `p.get_text()` + #sys.exit() + + def test_get_text_2(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_get_text_2(pc, tolerant) + def _test_get_text_2(self, parser_class, tolerant): + # more complicated stuff + + # endat + data, f = self.data_and_file() + p = parser_class(f) + self.assertEqual(p.get_text(endat=("endtag", "html")), + u"\n\n\nTitle\n\n\nThis is a data blah & a[IMG]" + " & that was an entityref and this a is\na charref. ." + "\n\n\n\n\n\n") + f.close() + + data, f = self.data_and_file() + p = parser_class(f) + self.assertEqual(p.get_text(endat=("endtag", "title")), + "\n\n\nTitle") + self.assertEqual(p.get_text(endat=("starttag", "img")), + "\n\n\nThis is a data blah & a[IMG]") + f.close() + + # textify arg + data, f = self.data_and_file() + p = parser_class(f, textify={"title": "an", "img": lambda x: "YYY"}) + self.assertEqual(p.get_text(endat=("endtag", "title")), + "\n\n\nattr[TITLE]Title") + self.assertEqual(p.get_text(endat=("starttag", "img")), + "\n\n\nThis is a data YYY") + f.close() + + # get_compressed_text + data, f = self.data_and_file() + p = parser_class(f) + self.assertEqual(p.get_compressed_text(endat=("endtag", "html")), + u"Title This is a data blah & a[IMG]" + " & that was an entityref and this a is a charref. 
.") + f.close() + + def test_tags(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_tags(pc, tolerant) + def _test_tags(self, parser_class, tolerant): + # no args + data, f = self.data_and_file() + p = parser_class(f) + + expected_tag_names = [ + "html", "head", "title", "title", "head", "body", "p", "img", + "blah", "randomtag", "body", "html" + ] + + for i, token in enumerate(p.tags()): + self.assertEquals(token.data, expected_tag_names[i]) + f.close() + + # tag name args + data, f = self.data_and_file() + p = parser_class(f) + + expected_tokens = [ + ("starttag", "head", []), + ("endtag", "head", None), + ("starttag", "p", []), + ] + + for i, token in enumerate(p.tags("head", "p")): + self.assertEquals(token, expected_tokens[i]) + f.close() + + def test_tokens(self): + for pc, tolerant in PullParserTests.PARSERS: + self._test_tokens(pc, tolerant) + def _test_tokens(self, parser_class, tolerant): + # no args + data, f = self.data_and_file() + p = parser_class(f) + + expected_token_types = [ + "decl", "data", "starttag", "data", "starttag", "data", "starttag", + "data", "endtag", "data", "endtag", "data", "starttag", "data", + "starttag", "data", "starttag", "data", "entityref", "data", + "charref", "data", "starttag", "data", "comment", "data", "decl", + "data", "pi", "data", (tolerant and "starttag" or "startendtag"), + "data", "endtag", "data", "endtag", "data" + ] + + for i, token in enumerate(p.tokens()): + self.assertEquals(token.type, expected_token_types[i]) + f.close() + + # token type args + data, f = self.data_and_file() + p = parser_class(f) + + expected_tokens = [ + ("entityref", "amp", None), + ("charref", "097", None), + ] + + for i, token in enumerate(p.tokens("charref", "entityref")): + self.assertEquals(token, expected_tokens[i]) + f.close() + + def test_token_eq(self): + from mechanize._pullparser import Token + for (a, b) in [ + (Token('endtag', 'html', None), + ('endtag', 'html', None)), + (Token('endtag', 'html', {'woof': 
'bark'}), + ('endtag', 'html', {'woof': 'bark'})), + ]: + self.assertEquals(a, a) + self.assertEquals(a, b) + self.assertEquals(b, a) + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_request.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_request.doctest new file mode 100644 index 0000000000000000000000000000000000000000..b732d4ffdfea9294d34a35ecbd99afeac56240c9 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_request.doctest @@ -0,0 +1,71 @@ +>>> from mechanize import Request +>>> Request("http://example.com/foo#frag").get_selector() +'/foo' + +>>> Request("http://example.com?query").get_selector() +'/?query' + +>>> Request("http://example.com").get_selector() +'/' + + +Request Headers Dictionary +-------------------------- + +The Request.headers dictionary is not a documented interface. It should +stay that way, because the complete set of headers are only accessible +through the .get_header(), .has_header(), .header_items() interface. +However, .headers pre-dates those methods, and so real code will be using +the dictionary. + +The introduction in 2.4 of those methods was a mistake for the same reason: +code that previously saw all (urllib2 user)-provided headers in .headers +now sees only a subset (and the function interface is ugly and incomplete). +A better change would have been to replace .headers dict with a dict +subclass (or UserDict.DictMixin instance?) that preserved the .headers +interface and also provided access to the "unredirected" headers. It's +probably too late to fix that, though. + + +Check .capitalize() case normalization: + +>>> url = "http://example.com" +>>> Request(url, headers={"Spam-eggs": "blah"}).headers["Spam-eggs"] +'blah' +>>> Request(url, headers={"spam-EggS": "blah"}).headers["Spam-eggs"] +'blah' + +Currently, Request(url, "Spam-eggs").headers["Spam-Eggs"] raises KeyError, +but that could be changed in future. 
+ + +Request Headers Methods +----------------------- + +Note the case normalization of header names here, to .capitalize()-case. +This should be preserved for backwards-compatibility. (In the HTTP case, +normalization to .title()-case is done by urllib2 before sending headers to +httplib). + +>>> url = "http://example.com" +>>> r = Request(url, headers={"Spam-eggs": "blah"}) +>>> r.has_header("Spam-eggs") +True +>>> r.header_items() +[('Spam-eggs', 'blah')] +>>> r.add_header("Foo-Bar", "baz") +>>> items = r.header_items() +>>> items.sort() +>>> items +[('Foo-bar', 'baz'), ('Spam-eggs', 'blah')] + +Note that e.g. r.has_header("spam-EggS") is currently False, and +r.get_header("spam-EggS") returns None, but that could be changed in +future. + +>>> r.has_header("Not-there") +False +>>> print r.get_header("Not-there") +None +>>> r.get_header("Not-there", "default") +'default' diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_response.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_response.doctest new file mode 100644 index 0000000000000000000000000000000000000000..f8ae57b0aaa89294ebab7b189467d02b515aef16 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_response.doctest @@ -0,0 +1,229 @@ +The read_complete flag lets us know if all of the wrapped file's data +has been read. We want to know this because Browser.back() must +.reload() the response if not. + +I've noted here the various cases where .read_complete may be set. + +>>> import mechanize + +>>> text = "To err is human, to moo, bovine.\n"*10 +>>> def get_wrapper(): +... import cStringIO +... from mechanize._response import seek_wrapper +... f = cStringIO.StringIO(text) +... wr = seek_wrapper(f) +... 
return wr + +.read() case #1 + +>>> wr = get_wrapper() +>>> wr.read_complete +False +>>> junk = wr.read() +>>> wr.read_complete +True +>>> wr.seek(0) +>>> wr.read_complete +True + +Excercise partial .read() and .readline(), and .seek() case #1 + +>>> wr = get_wrapper() +>>> junk = wr.read(10) +>>> wr.read_complete +False +>>> junk = wr.readline() +>>> wr.read_complete +False +>>> wr.seek(0, 2) +>>> wr.read_complete +True +>>> wr.seek(0) +>>> wr.read_complete +True + +.readlines() case #1 + +>>> wr = get_wrapper() +>>> junk = wr.readlines() +>>> wr.read_complete +True +>>> wr.seek(0) +>>> wr.read_complete +True + +.seek() case #2 + +>>> wr = get_wrapper() +>>> wr.seek(10) +>>> wr.read_complete +False +>>> wr.seek(1000000) + +.read() case #2 + +>>> wr = get_wrapper() +>>> junk = wr.read(1000000) +>>> wr.read_complete # we read to the end, but don't know it yet +False +>>> junk = wr.read(10) +>>> wr.read_complete +True + +.readline() case #1 + +>>> wr = get_wrapper() +>>> junk = wr.read(len(text)-10) +>>> wr.read_complete +False +>>> junk = wr.readline() +>>> wr.read_complete # we read to the end, but don't know it yet +False +>>> junk = wr.readline() +>>> wr.read_complete +True + +Test copying and sharing of .read_complete state + +>>> import copy +>>> wr = get_wrapper() +>>> wr2 = copy.copy(wr) +>>> wr.read_complete +False +>>> wr2.read_complete +False +>>> junk = wr2.read() +>>> wr.read_complete +True +>>> wr2.read_complete +True + + +Fix from -r36082: .read() after .close() used to break +.read_complete state + +>>> from mechanize._response import test_response +>>> r = test_response(text) +>>> junk = r.read(64) +>>> r.close() +>>> r.read_complete +False +>>> r.read() +'' +>>> r.read_complete +False + + + +Tests for the truly horrendous upgrade_response() + +>>> def is_response(r): +... names = "get_data read readline readlines close seek code msg".split() +... for name in names: +... if not hasattr(r, name): +... return False +... 
return r.get_data() == "test data" + +>>> from cStringIO import StringIO +>>> from mechanize._response import upgrade_response, make_headers, \ +... make_response, closeable_response, seek_wrapper +>>> data="test data"; url="http://example.com/"; code=200; msg="OK" + +Normal response (closeable_response wrapped with seek_wrapper): return a copy + +>>> r1 = make_response(data, [], url, code, msg) +>>> r2 = upgrade_response(r1) +>>> is_response(r2) +True +>>> r1 is not r2 +True +>>> r1.wrapped is r2.wrapped +True + +closeable_response with no seek_wrapper: wrap with seek_wrapper + +>>> r1 = closeable_response(StringIO(data), make_headers([]), url, code, msg) +>>> is_response(r1) +False +>>> r2 = upgrade_response(r1) +>>> is_response(r2) +True +>>> r1 is not r2 +True +>>> r1 is r2.wrapped +True + +addinfourl: extract .fp and wrap it with closeable_response and +seek_wrapper + +>>> from mechanize._urllib2_fork import addinfourl +>>> r1= addinfourl(StringIO(data), make_headers([]), url) +>>> is_response(r1) +False +>>> r2 = upgrade_response(r1) +>>> is_response(r2) +True +>>> r1 is not r2 +True +>>> r1 is not r2.wrapped +True +>>> r1.fp is r2.wrapped.fp +True + +addinfourl with code, msg + +>>> r1= addinfourl(StringIO(data), make_headers([]), url) +>>> r1.code = 206 +>>> r1.msg = "cool" +>>> r2 = upgrade_response(r1) +>>> is_response(r2) +True +>>> r2.code == r1.code +True +>>> r2.msg == r1.msg +True + +addinfourl with seek wrapper: cached data is not lost + +>>> r1= addinfourl(StringIO(data), make_headers([]), url) +>>> r1 = seek_wrapper(r1) +>>> r1.read(4) +'test' +>>> r2 = upgrade_response(r1) +>>> is_response(r2) +True + +addinfourl wrapped with HTTPError -- remains an HTTPError of the same +subclass (through horrible trickery) + +>>> hdrs = make_headers([]) +>>> r1 = addinfourl(StringIO(data), hdrs, url) +>>> class MyHTTPError(mechanize.HTTPError): pass +>>> r1 = MyHTTPError(url, code, msg, hdrs, r1) +>>> is_response(r1) +False +>>> r2 = upgrade_response(r1) +>>> 
is_response(r2) +True +>>> isinstance(r2, MyHTTPError) +True +>>> r2 # doctest: +ELLIPSIS +<httperror_seek_wrapper (MyHTTPError instance) at ... + +The trickery does not cause double-wrapping + +>>> r3 = upgrade_response(r2) +>>> is_response(r3) +True +>>> r3 is not r2 +True +>>> r3.wrapped is r2.wrapped +True + +Test dynamically-created class __repr__ for case where we have the +module name + +>>> r4 = addinfourl(StringIO(data), hdrs, url) +>>> r4 = mechanize.HTTPError(url, code, msg, hdrs, r4) +>>> upgrade_response(r4) # doctest: +ELLIPSIS +<httperror_seek_wrapper (urllib2.HTTPError instance) at ... diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_response.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_response.py new file mode 100644 index 0000000000000000000000000000000000000000..b67f283d54fe500040c52e003ed1bbdeb1e80edb --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_response.py @@ -0,0 +1,213 @@ +"""Tests for mechanize._response.seek_wrapper and friends.""" + +import copy +import cStringIO +from unittest import TestCase + +class TestUnSeekable: + def __init__(self, text): + self._file = cStringIO.StringIO(text) + self.log = [] + + def tell(self): return self._file.tell() + + def seek(self, offset, whence=0): assert False + + def read(self, size=-1): + self.log.append(("read", size)) + return self._file.read(size) + + def readline(self, size=-1): + self.log.append(("readline", size)) + return self._file.readline(size) + + def readlines(self, sizehint=-1): + self.log.append(("readlines", sizehint)) + return self._file.readlines(sizehint) + +class TestUnSeekableResponse(TestUnSeekable): + def __init__(self, text, headers): + TestUnSeekable.__init__(self, text) + self.code = 200 + self.msg = "OK" + self.headers = headers + self.url = "http://example.com/" + + def geturl(self): + return self.url + + def info(self): + return self.headers + + def close(self): + pass + + +class SeekableTests(TestCase): + + text = """\ +The quick brown fox +jumps over the 
lazy + +dog. + +""" + text_lines = map(lambda l: l+"\n", text.split("\n")[:-1]) + + def testSeekable(self): + from mechanize._response import seek_wrapper + text = self.text + for ii in range(1, 6): + fh = TestUnSeekable(text) + sfh = seek_wrapper(fh) + test = getattr(self, "_test%d" % ii) + test(sfh) + + # copies have independent seek positions + fh = TestUnSeekable(text) + sfh = seek_wrapper(fh) + self._testCopy(sfh) + + def _testCopy(self, sfh): + sfh2 = copy.copy(sfh) + sfh.read(10) + text = self.text + self.assertEqual(sfh2.read(10), text[:10]) + sfh2.seek(5) + self.assertEqual(sfh.read(10), text[10:20]) + self.assertEqual(sfh2.read(10), text[5:15]) + sfh.seek(0) + sfh2.seek(0) + return sfh2 + + def _test1(self, sfh): + text = self.text + text_lines = self.text_lines + assert sfh.read(10) == text[:10] # calls fh.read + assert sfh.log[-1] == ("read", 10) # .log delegated to fh + sfh.seek(0) # doesn't call fh.seek + assert sfh.read(10) == text[:10] # doesn't call fh.read + assert len(sfh.log) == 1 + sfh.seek(0) + assert sfh.read(5) == text[:5] # read only part of cached data + assert len(sfh.log) == 1 + sfh.seek(0) + assert sfh.read(25) == text[:25] # calls fh.read + assert sfh.log[1] == ("read", 15) + lines = [] + sfh.seek(-1, 1) + while 1: + l = sfh.readline() + if l == "": break + lines.append(l) + assert lines == ["s over the lazy\n"]+text_lines[2:] + assert sfh.log[2:] == [("readline", -1)]*5 + sfh.seek(0) + lines = [] + while 1: + l = sfh.readline() + if l == "": break + lines.append(l) + assert lines == text_lines + + def _test2(self, sfh): + text = self.text + sfh.read(5) + sfh.seek(0) + assert sfh.read() == text + assert sfh.read() == "" + sfh.seek(0) + assert sfh.read() == text + sfh.seek(0) + assert sfh.readline(5) == "The q" + assert sfh.read() == text[5:] + sfh.seek(0) + assert sfh.readline(5) == "The q" + assert sfh.readline() == "uick brown fox\n" + + def _test3(self, sfh): + text_lines = self.text_lines + sfh.read(25) + sfh.seek(-1, 1) + 
self.assertEqual(sfh.readlines(), ["s over the lazy\n"]+text_lines[2:]) + sfh.seek(0) + assert sfh.readlines() == text_lines + + def _test4(self, sfh): + text_lines = self.text_lines + count = 0 + limit = 10 + while count < limit: + if count == 5: + self.assertRaises(StopIteration, sfh.next) + break + else: + sfh.next() == text_lines[count] + count = count + 1 + else: + assert False, "StopIteration not raised" + + def _test5(self, sfh): + text = self.text + sfh.read(10) + sfh.seek(5) + self.assert_(sfh.invariant()) + sfh.seek(0, 2) + self.assert_(sfh.invariant()) + sfh.seek(0) + self.assertEqual(sfh.read(), text) + + def testResponseSeekWrapper(self): + from mechanize import response_seek_wrapper + hdrs = {"Content-type": "text/html"} + r = TestUnSeekableResponse(self.text, hdrs) + rsw = response_seek_wrapper(r) + rsw2 = self._testCopy(rsw) + self.assert_(rsw is not rsw2) + self.assertEqual(rsw.info(), rsw2.info()) + self.assert_(rsw.info() is not rsw2.info()) + + # should be able to close already-closed object + rsw2.close() + rsw2.close() + + def testSetResponseData(self): + from mechanize import response_seek_wrapper + r = TestUnSeekableResponse(self.text, {'blah': 'yawn'}) + rsw = response_seek_wrapper(r) + rsw.set_data("""\ +A Seeming somwhat more than View; + That doth instruct the Mind + In Things that ly behind, +""") + self.assertEqual(rsw.read(9), "A Seeming") + self.assertEqual(rsw.read(13), " somwhat more") + rsw.seek(0) + self.assertEqual(rsw.read(9), "A Seeming") + self.assertEqual(rsw.readline(), " somwhat more than View;\n") + rsw.seek(0) + self.assertEqual(rsw.readline(), "A Seeming somwhat more than View;\n") + rsw.seek(-1, 1) + self.assertEqual(rsw.read(7), "\n That") + + r = TestUnSeekableResponse(self.text, {'blah': 'yawn'}) + rsw = response_seek_wrapper(r) + rsw.set_data(self.text) + self._test2(rsw) + rsw.seek(0) + self._test4(rsw) + + def testGetResponseData(self): + from mechanize import response_seek_wrapper + r = 
TestUnSeekableResponse(self.text, {'blah': 'yawn'}) + rsw = response_seek_wrapper(r) + + self.assertEqual(rsw.get_data(), self.text) + self._test2(rsw) + rsw.seek(0) + self._test4(rsw) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_rfc3986.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_rfc3986.doctest new file mode 100644 index 0000000000000000000000000000000000000000..dd69b5e155170f7408246c9d0c33bae21b236e6e --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_rfc3986.doctest @@ -0,0 +1,168 @@ +>>> from mechanize._rfc3986 import urlsplit, urljoin, remove_dot_segments + +Some common cases + +>>> urlsplit("http://example.com/spam/eggs/spam.html?apples=pears&a=b#foo") +('http', 'example.com', '/spam/eggs/spam.html', 'apples=pears&a=b', 'foo') +>>> urlsplit("http://example.com/spam.html#foo") +('http', 'example.com', '/spam.html', None, 'foo') +>>> urlsplit("ftp://example.com/foo.gif") +('ftp', 'example.com', '/foo.gif', None, None) +>>> urlsplit('ftp://joe:password@example.com:port') +('ftp', 'joe:password@example.com:port', '', None, None) +>>> urlsplit("mailto:jjl@pobox.com") +('mailto', None, 'jjl@pobox.com', None, None) + +The five path productions + +path-abempty: + +>>> urlsplit("http://www.example.com") +('http', 'www.example.com', '', None, None) +>>> urlsplit("http://www.example.com/foo") +('http', 'www.example.com', '/foo', None, None) + +path-absolute: + +>>> urlsplit("a:/") +('a', None, '/', None, None) +>>> urlsplit("a:/b:/c/") +('a', None, '/b:/c/', None, None) + +path-noscheme: + +>>> urlsplit("a:b/:c/") +('a', None, 'b/:c/', None, None) + +path-rootless: + +>>> urlsplit("a:b:/c/") +('a', None, 'b:/c/', None, None) + +path-empty: + +>>> urlsplit("quack:") +('quack', None, '', None, None) + + +>>> remove_dot_segments("/a/b/c/./../../g") +'/a/g' +>>> remove_dot_segments("mid/content=5/../6") +'mid/6' +>>> remove_dot_segments("/b/c/.") +'/b/c/' +>>> 
remove_dot_segments("/b/c/./.") +'/b/c/' +>>> remove_dot_segments(".") +'' +>>> remove_dot_segments("/.") +'/' +>>> remove_dot_segments("./") +'' +>>> remove_dot_segments("/..") +'/' +>>> remove_dot_segments("/../") +'/' + + +Examples from RFC 3986 section 5.4 + +Normal Examples + +>>> base = "http://a/b/c/d;p?q" +>>> def join(uri): return urljoin(base, uri) +>>> join("g:h") +'g:h' +>>> join("g") +'http://a/b/c/g' +>>> join("./g") +'http://a/b/c/g' +>>> join("g/") +'http://a/b/c/g/' +>>> join("/g") +'http://a/g' +>>> join("//g") +'http://g' +>>> join("?y") +'http://a/b/c/d;p?y' +>>> join("g?y") +'http://a/b/c/g?y' +>>> join("#s") +'http://a/b/c/d;p?q#s' +>>> join("g#s") +'http://a/b/c/g#s' +>>> join("g?y#s") +'http://a/b/c/g?y#s' +>>> join(";x") +'http://a/b/c/;x' +>>> join("g;x") +'http://a/b/c/g;x' +>>> join("g;x?y#s") +'http://a/b/c/g;x?y#s' +>>> join("") +'http://a/b/c/d;p?q' +>>> join(".") +'http://a/b/c/' +>>> join("./") +'http://a/b/c/' +>>> join("..") +'http://a/b/' +>>> join("../") +'http://a/b/' +>>> join("../g") +'http://a/b/g' +>>> join("../..") +'http://a/' +>>> join("../../") +'http://a/' +>>> join("../../g") +'http://a/g' + +Abnormal Examples + +>>> join("../../../g") +'http://a/g' +>>> join("../../../../g") +'http://a/g' +>>> join("/./g") +'http://a/g' +>>> join("/../g") +'http://a/g' +>>> join("g.") +'http://a/b/c/g.' +>>> join(".g") +'http://a/b/c/.g' +>>> join("g..") +'http://a/b/c/g..' 
+>>> join("..g") +'http://a/b/c/..g' +>>> join("./../g") +'http://a/b/g' +>>> join("./g/.") +'http://a/b/c/g/' +>>> join("g/./h") +'http://a/b/c/g/h' +>>> join("g/../h") +'http://a/b/c/h' +>>> join("g;x=1/./y") +'http://a/b/c/g;x=1/y' +>>> join("g;x=1/../y") +'http://a/b/c/y' +>>> join("g?y/./x") +'http://a/b/c/g?y/./x' +>>> join("g?y/../x") +'http://a/b/c/g?y/../x' +>>> join("g#s/./x") +'http://a/b/c/g#s/./x' +>>> join("g#s/../x") +'http://a/b/c/g#s/../x' +>>> join("http:g") +'http://a/b/c/g' + + +Additional urljoin tests, not taken from RFC: + +>>> join("/..") +'http://a/' +>>> join("/../") +'http://a/' diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_robotfileparser.doctest b/LTA/LTAIngest/mechanize-0.2.5/test/test_robotfileparser.doctest new file mode 100644 index 0000000000000000000000000000000000000000..0939900b50eba74f5c289132ab3aa9417654bbce --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_robotfileparser.doctest @@ -0,0 +1,8 @@ +>>> from mechanize._http import MechanizeRobotFileParser + +Calling .set_opener() without args sets a default opener. + +>>> rfp = MechanizeRobotFileParser() +>>> rfp.set_opener() +>>> rfp._opener # doctest: +ELLIPSIS +<mechanize._opener.OpenerDirector instance at ...> diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_unittest.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_unittest.py new file mode 100644 index 0000000000000000000000000000000000000000..89597e3058bc617f29439070af9ed44b1acff66c --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_unittest.py @@ -0,0 +1,3785 @@ +"""Test script for unittest. 
+ +By Collin Winter <collinw at gmail.com> + +Still need testing: + TestCase.{assert,fail}* methods (some are tested implicitly) +""" + +from StringIO import StringIO +import __builtin__ +import os +import re +import sys +import unittest +from unittest import TestCase, TestProgram +import types +from copy import deepcopy +from cStringIO import StringIO +import pickle + +### Support code +################################################################ + +class LoggingResult(unittest.TestResult): + def __init__(self, log): + self._events = log + super(LoggingResult, self).__init__() + + def startTest(self, test): + self._events.append('startTest') + super(LoggingResult, self).startTest(test) + + def startTestRun(self): + self._events.append('startTestRun') + super(LoggingResult, self).startTestRun() + + def stopTest(self, test): + self._events.append('stopTest') + super(LoggingResult, self).stopTest(test) + + def stopTestRun(self): + self._events.append('stopTestRun') + super(LoggingResult, self).stopTestRun() + + def addFailure(self, *args): + self._events.append('addFailure') + super(LoggingResult, self).addFailure(*args) + + def addSuccess(self, *args): + self._events.append('addSuccess') + super(LoggingResult, self).addSuccess(*args) + + def addError(self, *args): + self._events.append('addError') + super(LoggingResult, self).addError(*args) + + def addSkip(self, *args): + self._events.append('addSkip') + super(LoggingResult, self).addSkip(*args) + + def addExpectedFailure(self, *args): + self._events.append('addExpectedFailure') + super(LoggingResult, self).addExpectedFailure(*args) + + def addUnexpectedSuccess(self, *args): + self._events.append('addUnexpectedSuccess') + super(LoggingResult, self).addUnexpectedSuccess(*args) + + +class TestEquality(object): + """Used as a mixin for TestCase""" + + # Check for a valid __eq__ implementation + def test_eq(self): + for obj_1, obj_2 in self.eq_pairs: + self.assertEqual(obj_1, obj_2) + self.assertEqual(obj_2, obj_1) 
+ + # Check for a valid __ne__ implementation + def test_ne(self): + for obj_1, obj_2 in self.ne_pairs: + self.assertNotEqual(obj_1, obj_2) + self.assertNotEqual(obj_2, obj_1) + +class TestHashing(object): + """Used as a mixin for TestCase""" + + # Check for a valid __hash__ implementation + def test_hash(self): + for obj_1, obj_2 in self.eq_pairs: + try: + if not hash(obj_1) == hash(obj_2): + self.fail("%r and %r do not hash equal" % (obj_1, obj_2)) + except KeyboardInterrupt: + raise + except Exception, e: + self.fail("Problem hashing %r and %r: %s" % (obj_1, obj_2, e)) + + for obj_1, obj_2 in self.ne_pairs: + try: + if hash(obj_1) == hash(obj_2): + self.fail("%s and %s hash equal, but shouldn't" % + (obj_1, obj_2)) + except KeyboardInterrupt: + raise + except Exception, e: + self.fail("Problem hashing %s and %s: %s" % (obj_1, obj_2, e)) + + +# List subclass we can add attributes to. +class MyClassSuite(list): + + def __init__(self, tests): + super(MyClassSuite, self).__init__(tests) + + +################################################################ +### /Support code + +class Test_TestLoader(TestCase): + + ### Tests for TestLoader.loadTestsFromTestCase + ################################################################ + + # "Return a suite of all tests cases contained in the TestCase-derived + # class testCaseClass" + def test_loadTestsFromTestCase(self): + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foo_bar(self): pass + + tests = unittest.TestSuite([Foo('test_1'), Foo('test_2')]) + + loader = unittest.TestLoader() + self.assertEqual(loader.loadTestsFromTestCase(Foo), tests) + + # "Return a suite of all tests cases contained in the TestCase-derived + # class testCaseClass" + # + # Make sure it does the right thing even if no tests were found + def test_loadTestsFromTestCase__no_matches(self): + class Foo(unittest.TestCase): + def foo_bar(self): pass + + empty_suite = unittest.TestSuite() + + loader = 
unittest.TestLoader() + self.assertEqual(loader.loadTestsFromTestCase(Foo), empty_suite) + + # "Return a suite of all tests cases contained in the TestCase-derived + # class testCaseClass" + # + # What happens if loadTestsFromTestCase() is given an object + # that isn't a subclass of TestCase? Specifically, what happens + # if testCaseClass is a subclass of TestSuite? + # + # This is checked for specifically in the code, so we better add a + # test for it. + def test_loadTestsFromTestCase__TestSuite_subclass(self): + class NotATestCase(unittest.TestSuite): + pass + + loader = unittest.TestLoader() + try: + loader.loadTestsFromTestCase(NotATestCase) + except TypeError: + pass + else: + self.fail('Should raise TypeError') + + # "Return a suite of all tests cases contained in the TestCase-derived + # class testCaseClass" + # + # Make sure loadTestsFromTestCase() picks up the default test method + # name (as specified by TestCase), even though the method name does + # not match the default TestLoader.testMethodPrefix string + def test_loadTestsFromTestCase__default_method_name(self): + class Foo(unittest.TestCase): + def runTest(self): + pass + + loader = unittest.TestLoader() + # This has to be false for the test to succeed + self.assertFalse('runTest'.startswith(loader.testMethodPrefix)) + + suite = loader.loadTestsFromTestCase(Foo) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [Foo('runTest')]) + + ################################################################ + ### /Tests for TestLoader.loadTestsFromTestCase + + ### Tests for TestLoader.loadTestsFromModule + ################################################################ + + # "This method searches `module` for classes derived from TestCase" + def test_loadTestsFromModule__TestCase_subclass(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = 
loader.loadTestsFromModule(m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + expected = [loader.suiteClass([MyTestCase('test')])] + self.assertEqual(list(suite), expected) + + # "This method searches `module` for classes derived from TestCase" + # + # What happens if no tests are found (no TestCase instances)? + def test_loadTestsFromModule__no_TestCase_instances(self): + m = types.ModuleType('m') + + loader = unittest.TestLoader() + suite = loader.loadTestsFromModule(m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), []) + + # "This method searches `module` for classes derived from TestCase" + # + # What happens if no tests are found (TestCases instances, but no tests)? + def test_loadTestsFromModule__no_TestCase_tests(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromModule(m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + self.assertEqual(list(suite), [loader.suiteClass()]) + + # "This method searches `module` for classes derived from TestCase"s + # + # What happens if loadTestsFromModule() is given something other + # than a module? + # + # XXX Currently, it succeeds anyway. This flexibility + # should either be documented or loadTestsFromModule() should + # raise a TypeError + # + # XXX Certain people are using this behaviour. We'll add a test for it + def test_loadTestsFromModule__not_a_module(self): + class MyTestCase(unittest.TestCase): + def test(self): + pass + + class NotAModule(object): + test_2 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromModule(NotAModule) + + reference = [unittest.TestSuite([MyTestCase('test')])] + self.assertEqual(list(suite), reference) + + + # Check that loadTestsFromModule honors (or not) a module + # with a load_tests function. 
+ def test_loadTestsFromModule__load_tests(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + load_tests_args = [] + def load_tests(loader, tests, pattern): + load_tests_args.extend((loader, tests, pattern)) + return tests + m.load_tests = load_tests + + loader = unittest.TestLoader() + suite = loader.loadTestsFromModule(m) + self.assertEquals(load_tests_args, [loader, suite, None]) + + load_tests_args = [] + suite = loader.loadTestsFromModule(m, use_load_tests=False) + self.assertEquals(load_tests_args, []) + + ################################################################ + ### /Tests for TestLoader.loadTestsFromModule() + + ### Tests for TestLoader.loadTestsFromName() + ################################################################ + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # Is ValueError raised in response to an empty name? + def test_loadTestsFromName__empty_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromName('') + except ValueError, e: + self.assertEqual(str(e), "Empty module name") + else: + self.fail("TestLoader.loadTestsFromName failed to raise ValueError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when the name contains invalid characters? + def test_loadTestsFromName__malformed_name(self): + loader = unittest.TestLoader() + + # XXX Should this raise ValueError or ImportError? 
+ try: + loader.loadTestsFromName('abc () //') + except ValueError: + pass + except ImportError: + pass + else: + self.fail("TestLoader.loadTestsFromName failed to raise ValueError") + + # "The specifier name is a ``dotted name'' that may resolve ... to a + # module" + # + # What happens when a module by that name can't be found? + def test_loadTestsFromName__unknown_module_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromName('sdasfasfasdf') + except ImportError, e: + self.assertEqual(str(e), "No module named sdasfasfasdf") + else: + self.fail("TestLoader.loadTestsFromName failed to raise ImportError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when the module is found, but the attribute can't? + def test_loadTestsFromName__unknown_attr_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromName('unittest.sdasfasfasdf') + except AttributeError, e: + self.assertEqual(str(e), "'module' object has no attribute 'sdasfasfasdf'") + else: + self.fail("TestLoader.loadTestsFromName failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when we provide the module, but the attribute can't be + # found? 
+ def test_loadTestsFromName__relative_unknown_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromName('sdasfasfasdf', unittest) + except AttributeError, e: + self.assertEqual(str(e), "'module' object has no attribute 'sdasfasfasdf'") + else: + self.fail("TestLoader.loadTestsFromName failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # ... + # "The method optionally resolves name relative to the given module" + # + # Does loadTestsFromName raise ValueError when passed an empty + # name relative to a provided module? + # + # XXX Should probably raise a ValueError instead of an AttributeError + def test_loadTestsFromName__relative_empty_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromName('', unittest) + except AttributeError, e: + pass + else: + self.fail("Failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # ... + # "The method optionally resolves name relative to the given module" + # + # What happens when an impossible name is given, relative to the provided + # `module`? + def test_loadTestsFromName__relative_malformed_name(self): + loader = unittest.TestLoader() + + # XXX Should this raise AttributeError or ValueError? 
+ try: + loader.loadTestsFromName('abc () //', unittest) + except ValueError: + pass + except AttributeError: + pass + else: + self.fail("TestLoader.loadTestsFromName failed to raise ValueError") + + # "The method optionally resolves name relative to the given module" + # + # Does loadTestsFromName raise TypeError when the `module` argument + # isn't a module object? + # + # XXX Accepts the not-a-module object, ignorning the object's type + # This should raise an exception or the method name should be changed + # + # XXX Some people are relying on this, so keep it for now + def test_loadTestsFromName__relative_not_a_module(self): + class MyTestCase(unittest.TestCase): + def test(self): + pass + + class NotAModule(object): + test_2 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('test_2', NotAModule) + + reference = [MyTestCase('test')] + self.assertEqual(list(suite), reference) + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # Does it raise an exception if the name resolves to an invalid + # object? + def test_loadTestsFromName__relative_bad_object(self): + m = types.ModuleType('m') + m.testcase_1 = object() + + loader = unittest.TestLoader() + try: + loader.loadTestsFromName('testcase_1', m) + except TypeError: + pass + else: + self.fail("Should have raised TypeError") + + # "The specifier name is a ``dotted name'' that may + # resolve either to ... 
a test case class" + def test_loadTestsFromName__relative_TestCase_subclass(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('testcase_1', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [MyTestCase('test')]) + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + def test_loadTestsFromName__relative_TestSuite(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testsuite = unittest.TestSuite([MyTestCase('test')]) + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('testsuite', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + self.assertEqual(list(suite), [MyTestCase('test')]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a test method within a test case class" + def test_loadTestsFromName__relative_testmethod(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('testcase_1.test', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + self.assertEqual(list(suite), [MyTestCase('test')]) + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # Does loadTestsFromName() raise the proper exception when trying to + # resolve "a test method within a test case class" that doesn't exist + # for the given name (relative to a provided module)? 
+ def test_loadTestsFromName__relative_invalid_testmethod(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + try: + loader.loadTestsFromName('testcase_1.testfoo', m) + except AttributeError, e: + self.assertEqual(str(e), "type object 'MyTestCase' has no attribute 'testfoo'") + else: + self.fail("Failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a ... TestSuite instance" + def test_loadTestsFromName__callable__TestSuite(self): + m = types.ModuleType('m') + testcase_1 = unittest.FunctionTestCase(lambda: None) + testcase_2 = unittest.FunctionTestCase(lambda: None) + def return_TestSuite(): + return unittest.TestSuite([testcase_1, testcase_2]) + m.return_TestSuite = return_TestSuite + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('return_TestSuite', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [testcase_1, testcase_2]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase ... instance" + def test_loadTestsFromName__callable__TestCase_instance(self): + m = types.ModuleType('m') + testcase_1 = unittest.FunctionTestCase(lambda: None) + def return_TestCase(): + return testcase_1 + m.return_TestCase = return_TestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromName('return_TestCase', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [testcase_1]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase ... 
instance" + #***************************************************************** + #Override the suiteClass attribute to ensure that the suiteClass + #attribute is used + def test_loadTestsFromName__callable__TestCase_instance_ProperSuiteClass(self): + class SubTestSuite(unittest.TestSuite): + pass + m = types.ModuleType('m') + testcase_1 = unittest.FunctionTestCase(lambda: None) + def return_TestCase(): + return testcase_1 + m.return_TestCase = return_TestCase + + loader = unittest.TestLoader() + loader.suiteClass = SubTestSuite + suite = loader.loadTestsFromName('return_TestCase', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [testcase_1]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a test method within a test case class" + #***************************************************************** + #Override the suiteClass attribute to ensure that the suiteClass + #attribute is used + def test_loadTestsFromName__relative_testmethod_ProperSuiteClass(self): + class SubTestSuite(unittest.TestSuite): + pass + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + loader.suiteClass=SubTestSuite + suite = loader.loadTestsFromName('testcase_1.test', m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + self.assertEqual(list(suite), [MyTestCase('test')]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase or TestSuite instance" + # + # What happens if the callable returns something else? 
+ def test_loadTestsFromName__callable__wrong_type(self): + m = types.ModuleType('m') + def return_wrong(): + return 6 + m.return_wrong = return_wrong + + loader = unittest.TestLoader() + try: + suite = loader.loadTestsFromName('return_wrong', m) + except TypeError: + pass + else: + self.fail("TestLoader.loadTestsFromName failed to raise TypeError") + + # "The specifier can refer to modules and packages which have not been + # imported; they will be imported as a side-effect" + def test_loadTestsFromName__module_not_loaded(self): + # We're going to try to load this module as a side-effect, so it + # better not be loaded before we try. + # + # Why pick audioop? Google shows it isn't used very often, so there's + # a good chance that it won't be imported when this test is run + module_name = 'audioop' + + import sys + if module_name in sys.modules: + del sys.modules[module_name] + + loader = unittest.TestLoader() + try: + suite = loader.loadTestsFromName(module_name) + + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), []) + + # audioop should now be loaded, thanks to loadTestsFromName() + self.assertTrue(module_name in sys.modules) + finally: + if module_name in sys.modules: + del sys.modules[module_name] + + ################################################################ + ### Tests for TestLoader.loadTestsFromName() + + ### Tests for TestLoader.loadTestsFromNames() + ################################################################ + + # "Similar to loadTestsFromName(), but takes a sequence of names rather + # than a single name." + # + # What happens if that sequence of names is empty? + def test_loadTestsFromNames__empty_name_list(self): + loader = unittest.TestLoader() + + suite = loader.loadTestsFromNames([]) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), []) + + # "Similar to loadTestsFromName(), but takes a sequence of names rather + # than a single name." + # ... 
+ # "The method optionally resolves name relative to the given module" + # + # What happens if that sequence of names is empty? + # + # XXX Should this raise a ValueError or just return an empty TestSuite? + def test_loadTestsFromNames__relative_empty_name_list(self): + loader = unittest.TestLoader() + + suite = loader.loadTestsFromNames([], unittest) + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), []) + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # Is ValueError raised in response to an empty name? + def test_loadTestsFromNames__empty_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames(['']) + except ValueError, e: + self.assertEqual(str(e), "Empty module name") + else: + self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when presented with an impossible module name? + def test_loadTestsFromNames__malformed_name(self): + loader = unittest.TestLoader() + + # XXX Should this raise ValueError or ImportError? + try: + loader.loadTestsFromNames(['abc () //']) + except ValueError: + pass + except ImportError: + pass + else: + self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when no module can be found for the given name? 
+ def test_loadTestsFromNames__unknown_module_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames(['sdasfasfasdf']) + except ImportError, e: + self.assertEqual(str(e), "No module named sdasfasfasdf") + else: + self.fail("TestLoader.loadTestsFromNames failed to raise ImportError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # What happens when the module can be found, but not the attribute? + def test_loadTestsFromNames__unknown_attr_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames(['unittest.sdasfasfasdf', 'unittest']) + except AttributeError, e: + self.assertEqual(str(e), "'module' object has no attribute 'sdasfasfasdf'") + else: + self.fail("TestLoader.loadTestsFromNames failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # ... + # "The method optionally resolves name relative to the given module" + # + # What happens when given an unknown attribute on a specified `module` + # argument? + def test_loadTestsFromNames__unknown_name_relative_1(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames(['sdasfasfasdf'], unittest) + except AttributeError, e: + self.assertEqual(str(e), "'module' object has no attribute 'sdasfasfasdf'") + else: + self.fail("TestLoader.loadTestsFromName failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." 
+ # ... + # "The method optionally resolves name relative to the given module" + # + # Do unknown attributes (relative to a provided module) still raise an + # exception even in the presence of valid attribute names? + def test_loadTestsFromNames__unknown_name_relative_2(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames(['TestCase', 'sdasfasfasdf'], unittest) + except AttributeError, e: + self.assertEqual(str(e), "'module' object has no attribute 'sdasfasfasdf'") + else: + self.fail("TestLoader.loadTestsFromName failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # ... + # "The method optionally resolves name relative to the given module" + # + # What happens when faced with the empty string? + # + # XXX This currently raises AttributeError, though ValueError is probably + # more appropriate + def test_loadTestsFromNames__relative_empty_name(self): + loader = unittest.TestLoader() + + try: + loader.loadTestsFromNames([''], unittest) + except AttributeError: + pass + else: + self.fail("Failed to raise ValueError") + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # ... + # "The method optionally resolves name relative to the given module" + # + # What happens when presented with an impossible attribute name? + def test_loadTestsFromNames__relative_malformed_name(self): + loader = unittest.TestLoader() + + # XXX Should this raise AttributeError or ValueError? 
+ try: + loader.loadTestsFromNames(['abc () //'], unittest) + except AttributeError: + pass + except ValueError: + pass + else: + self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") + + # "The method optionally resolves name relative to the given module" + # + # Does loadTestsFromNames() make sure the provided `module` is in fact + # a module? + # + # XXX This validation is currently not done. This flexibility should + # either be documented or a TypeError should be raised. + def test_loadTestsFromNames__relative_not_a_module(self): + class MyTestCase(unittest.TestCase): + def test(self): + pass + + class NotAModule(object): + test_2 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['test_2'], NotAModule) + + reference = [unittest.TestSuite([MyTestCase('test')])] + self.assertEqual(list(suite), reference) + + # "The specifier name is a ``dotted name'' that may resolve either to + # a module, a test case class, a TestSuite instance, a test method + # within a test case class, or a callable object which returns a + # TestCase or TestSuite instance." + # + # Does it raise an exception if the name resolves to an invalid + # object? + def test_loadTestsFromNames__relative_bad_object(self): + m = types.ModuleType('m') + m.testcase_1 = object() + + loader = unittest.TestLoader() + try: + loader.loadTestsFromNames(['testcase_1'], m) + except TypeError: + pass + else: + self.fail("Should have raised TypeError") + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... 
a test case class" + def test_loadTestsFromNames__relative_TestCase_subclass(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['testcase_1'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + expected = loader.suiteClass([MyTestCase('test')]) + self.assertEqual(list(suite), [expected]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a TestSuite instance" + def test_loadTestsFromNames__relative_TestSuite(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testsuite = unittest.TestSuite([MyTestCase('test')]) + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['testsuite'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + self.assertEqual(list(suite), [m.testsuite]) + + # "The specifier name is a ``dotted name'' that may resolve ... to ... a + # test method within a test case class" + def test_loadTestsFromNames__relative_testmethod(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['testcase_1.test'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + ref_suite = unittest.TestSuite([MyTestCase('test')]) + self.assertEqual(list(suite), [ref_suite]) + + # "The specifier name is a ``dotted name'' that may resolve ... to ... a + # test method within a test case class" + # + # Does the method gracefully handle names that initially look like they + # resolve to "a test method within a test case class" but don't? 
+ def test_loadTestsFromNames__relative_invalid_testmethod(self): + m = types.ModuleType('m') + class MyTestCase(unittest.TestCase): + def test(self): + pass + m.testcase_1 = MyTestCase + + loader = unittest.TestLoader() + try: + loader.loadTestsFromNames(['testcase_1.testfoo'], m) + except AttributeError, e: + self.assertEqual(str(e), "type object 'MyTestCase' has no attribute 'testfoo'") + else: + self.fail("Failed to raise AttributeError") + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a ... TestSuite instance" + def test_loadTestsFromNames__callable__TestSuite(self): + m = types.ModuleType('m') + testcase_1 = unittest.FunctionTestCase(lambda: None) + testcase_2 = unittest.FunctionTestCase(lambda: None) + def return_TestSuite(): + return unittest.TestSuite([testcase_1, testcase_2]) + m.return_TestSuite = return_TestSuite + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['return_TestSuite'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + expected = unittest.TestSuite([testcase_1, testcase_2]) + self.assertEqual(list(suite), [expected]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase ... instance" + def test_loadTestsFromNames__callable__TestCase_instance(self): + m = types.ModuleType('m') + testcase_1 = unittest.FunctionTestCase(lambda: None) + def return_TestCase(): + return testcase_1 + m.return_TestCase = return_TestCase + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['return_TestCase'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + ref_suite = unittest.TestSuite([testcase_1]) + self.assertEqual(list(suite), [ref_suite]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase or TestSuite instance" + # + # Are staticmethods handled correctly? 
+ def test_loadTestsFromNames__callable__call_staticmethod(self): + m = types.ModuleType('m') + class Test1(unittest.TestCase): + def test(self): + pass + + testcase_1 = Test1('test') + class Foo(unittest.TestCase): + @staticmethod + def foo(): + return testcase_1 + m.Foo = Foo + + loader = unittest.TestLoader() + suite = loader.loadTestsFromNames(['Foo.foo'], m) + self.assertTrue(isinstance(suite, loader.suiteClass)) + + ref_suite = unittest.TestSuite([testcase_1]) + self.assertEqual(list(suite), [ref_suite]) + + # "The specifier name is a ``dotted name'' that may resolve ... to + # ... a callable object which returns a TestCase or TestSuite instance" + # + # What happens when the callable returns something else? + def test_loadTestsFromNames__callable__wrong_type(self): + m = types.ModuleType('m') + def return_wrong(): + return 6 + m.return_wrong = return_wrong + + loader = unittest.TestLoader() + try: + suite = loader.loadTestsFromNames(['return_wrong'], m) + except TypeError: + pass + else: + self.fail("TestLoader.loadTestsFromNames failed to raise TypeError") + + # "The specifier can refer to modules and packages which have not been + # imported; they will be imported as a side-effect" + def test_loadTestsFromNames__module_not_loaded(self): + # We're going to try to load this module as a side-effect, so it + # better not be loaded before we try. + # + # Why pick audioop? 
Google shows it isn't used very often, so there's + # a good chance that it won't be imported when this test is run + module_name = 'audioop' + + import sys + if module_name in sys.modules: + del sys.modules[module_name] + + loader = unittest.TestLoader() + try: + suite = loader.loadTestsFromNames([module_name]) + + self.assertTrue(isinstance(suite, loader.suiteClass)) + self.assertEqual(list(suite), [unittest.TestSuite()]) + + # audioop should now be loaded, thanks to loadTestsFromName() + self.assertTrue(module_name in sys.modules) + finally: + if module_name in sys.modules: + del sys.modules[module_name] + + ################################################################ + ### /Tests for TestLoader.loadTestsFromNames() + + ### Tests for TestLoader.getTestCaseNames() + ################################################################ + + # "Return a sorted sequence of method names found within testCaseClass" + # + # Test.foobar is defined to make sure getTestCaseNames() respects + # loader.testMethodPrefix + def test_getTestCaseNames(self): + class Test(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foobar(self): pass + + loader = unittest.TestLoader() + + self.assertEqual(loader.getTestCaseNames(Test), ['test_1', 'test_2']) + + # "Return a sorted sequence of method names found within testCaseClass" + # + # Does getTestCaseNames() behave appropriately if no tests are found? + def test_getTestCaseNames__no_tests(self): + class Test(unittest.TestCase): + def foobar(self): pass + + loader = unittest.TestLoader() + + self.assertEqual(loader.getTestCaseNames(Test), []) + + # "Return a sorted sequence of method names found within testCaseClass" + # + # Are not-TestCases handled gracefully? 
+ # + # XXX This should raise a TypeError, not return a list + # + # XXX It's too late in the 2.5 release cycle to fix this, but it should + # probably be revisited for 2.6 + def test_getTestCaseNames__not_a_TestCase(self): + class BadCase(int): + def test_foo(self): + pass + + loader = unittest.TestLoader() + names = loader.getTestCaseNames(BadCase) + + self.assertEqual(names, ['test_foo']) + + # "Return a sorted sequence of method names found within testCaseClass" + # + # Make sure inherited names are handled. + # + # TestP.foobar is defined to make sure getTestCaseNames() respects + # loader.testMethodPrefix + def test_getTestCaseNames__inheritance(self): + class TestP(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foobar(self): pass + + class TestC(TestP): + def test_1(self): pass + def test_3(self): pass + + loader = unittest.TestLoader() + + names = ['test_1', 'test_2', 'test_3'] + self.assertEqual(loader.getTestCaseNames(TestC), names) + + ################################################################ + ### /Tests for TestLoader.getTestCaseNames() + + ### Tests for TestLoader.testMethodPrefix + ################################################################ + + # "String giving the prefix of method names which will be interpreted as + # test methods" + # + # Implicit in the documentation is that testMethodPrefix is respected by + # all loadTestsFrom* methods. 
+ def test_testMethodPrefix__loadTestsFromTestCase(self): + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foo_bar(self): pass + + tests_1 = unittest.TestSuite([Foo('foo_bar')]) + tests_2 = unittest.TestSuite([Foo('test_1'), Foo('test_2')]) + + loader = unittest.TestLoader() + loader.testMethodPrefix = 'foo' + self.assertEqual(loader.loadTestsFromTestCase(Foo), tests_1) + + loader.testMethodPrefix = 'test' + self.assertEqual(loader.loadTestsFromTestCase(Foo), tests_2) + + # "String giving the prefix of method names which will be interpreted as + # test methods" + # + # Implicit in the documentation is that testMethodPrefix is respected by + # all loadTestsFrom* methods. + def test_testMethodPrefix__loadTestsFromModule(self): + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foo_bar(self): pass + m.Foo = Foo + + tests_1 = [unittest.TestSuite([Foo('foo_bar')])] + tests_2 = [unittest.TestSuite([Foo('test_1'), Foo('test_2')])] + + loader = unittest.TestLoader() + loader.testMethodPrefix = 'foo' + self.assertEqual(list(loader.loadTestsFromModule(m)), tests_1) + + loader.testMethodPrefix = 'test' + self.assertEqual(list(loader.loadTestsFromModule(m)), tests_2) + + # "String giving the prefix of method names which will be interpreted as + # test methods" + # + # Implicit in the documentation is that testMethodPrefix is respected by + # all loadTestsFrom* methods. 
+ def test_testMethodPrefix__loadTestsFromName(self): + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foo_bar(self): pass + m.Foo = Foo + + tests_1 = unittest.TestSuite([Foo('foo_bar')]) + tests_2 = unittest.TestSuite([Foo('test_1'), Foo('test_2')]) + + loader = unittest.TestLoader() + loader.testMethodPrefix = 'foo' + self.assertEqual(loader.loadTestsFromName('Foo', m), tests_1) + + loader.testMethodPrefix = 'test' + self.assertEqual(loader.loadTestsFromName('Foo', m), tests_2) + + # "String giving the prefix of method names which will be interpreted as + # test methods" + # + # Implicit in the documentation is that testMethodPrefix is respected by + # all loadTestsFrom* methods. + def test_testMethodPrefix__loadTestsFromNames(self): + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + def foo_bar(self): pass + m.Foo = Foo + + tests_1 = unittest.TestSuite([unittest.TestSuite([Foo('foo_bar')])]) + tests_2 = unittest.TestSuite([Foo('test_1'), Foo('test_2')]) + tests_2 = unittest.TestSuite([tests_2]) + + loader = unittest.TestLoader() + loader.testMethodPrefix = 'foo' + self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests_1) + + loader.testMethodPrefix = 'test' + self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests_2) + + # "The default value is 'test'" + def test_testMethodPrefix__default_value(self): + loader = unittest.TestLoader() + self.assertTrue(loader.testMethodPrefix == 'test') + + ################################################################ + ### /Tests for TestLoader.testMethodPrefix + + ### Tests for TestLoader.sortTestMethodsUsing + ################################################################ + + # "Function to be used to compare method names when sorting them in + # getTestCaseNames() and all the loadTestsFromX() methods" + def test_sortTestMethodsUsing__loadTestsFromTestCase(self): + def 
reversed_cmp(x, y): + return -cmp(x, y) + + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = reversed_cmp + + tests = loader.suiteClass([Foo('test_2'), Foo('test_1')]) + self.assertEqual(loader.loadTestsFromTestCase(Foo), tests) + + # "Function to be used to compare method names when sorting them in + # getTestCaseNames() and all the loadTestsFromX() methods" + def test_sortTestMethodsUsing__loadTestsFromModule(self): + def reversed_cmp(x, y): + return -cmp(x, y) + + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + m.Foo = Foo + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = reversed_cmp + + tests = [loader.suiteClass([Foo('test_2'), Foo('test_1')])] + self.assertEqual(list(loader.loadTestsFromModule(m)), tests) + + # "Function to be used to compare method names when sorting them in + # getTestCaseNames() and all the loadTestsFromX() methods" + def test_sortTestMethodsUsing__loadTestsFromName(self): + def reversed_cmp(x, y): + return -cmp(x, y) + + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + m.Foo = Foo + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = reversed_cmp + + tests = loader.suiteClass([Foo('test_2'), Foo('test_1')]) + self.assertEqual(loader.loadTestsFromName('Foo', m), tests) + + # "Function to be used to compare method names when sorting them in + # getTestCaseNames() and all the loadTestsFromX() methods" + def test_sortTestMethodsUsing__loadTestsFromNames(self): + def reversed_cmp(x, y): + return -cmp(x, y) + + m = types.ModuleType('m') + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + m.Foo = Foo + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = reversed_cmp + + tests = [loader.suiteClass([Foo('test_2'), Foo('test_1')])] + 
self.assertEqual(list(loader.loadTestsFromNames(['Foo'], m)), tests) + + # "Function to be used to compare method names when sorting them in + # getTestCaseNames()" + # + # Does it actually affect getTestCaseNames()? + def test_sortTestMethodsUsing__getTestCaseNames(self): + def reversed_cmp(x, y): + return -cmp(x, y) + + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = reversed_cmp + + test_names = ['test_2', 'test_1'] + self.assertEqual(loader.getTestCaseNames(Foo), test_names) + + # "The default value is the built-in cmp() function" + def test_sortTestMethodsUsing__default_value(self): + loader = unittest.TestLoader() + self.assertTrue(loader.sortTestMethodsUsing is cmp) + + # "it can be set to None to disable the sort." + # + # XXX How is this different from reassigning cmp? Are the tests returned + # in a random order or something? This behaviour should die + def test_sortTestMethodsUsing__None(self): + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + + loader = unittest.TestLoader() + loader.sortTestMethodsUsing = None + + test_names = ['test_2', 'test_1'] + self.assertEqual(set(loader.getTestCaseNames(Foo)), set(test_names)) + + ################################################################ + ### /Tests for TestLoader.sortTestMethodsUsing + + ### Tests for TestLoader.suiteClass + ################################################################ + + # "Callable object that constructs a test suite from a list of tests." 
def test_suiteClass__loadTestsFromTestCase(self):
    class Foo(unittest.TestCase):
        def test_1(self): pass
        def test_2(self): pass
        def foo_bar(self): pass

    tests = [Foo('test_1'), Foo('test_2')]

    loader = unittest.TestLoader()
    loader.suiteClass = list
    self.assertEqual(loader.loadTestsFromTestCase(Foo), tests)

# It is implicit in the documentation for TestLoader.suiteClass that
# all TestLoader.loadTestsFrom* methods respect it. Let's make sure
def test_suiteClass__loadTestsFromModule(self):
    m = types.ModuleType('m')
    class Foo(unittest.TestCase):
        def test_1(self): pass
        def test_2(self): pass
        def foo_bar(self): pass
    m.Foo = Foo

    tests = [[Foo('test_1'), Foo('test_2')]]

    loader = unittest.TestLoader()
    loader.suiteClass = list
    self.assertEqual(loader.loadTestsFromModule(m), tests)

# It is implicit in the documentation for TestLoader.suiteClass that
# all TestLoader.loadTestsFrom* methods respect it. Let's make sure
def test_suiteClass__loadTestsFromName(self):
    m = types.ModuleType('m')
    class Foo(unittest.TestCase):
        def test_1(self): pass
        def test_2(self): pass
        def foo_bar(self): pass
    m.Foo = Foo

    tests = [Foo('test_1'), Foo('test_2')]

    loader = unittest.TestLoader()
    loader.suiteClass = list
    self.assertEqual(loader.loadTestsFromName('Foo', m), tests)

# It is implicit in the documentation for TestLoader.suiteClass that
# all TestLoader.loadTestsFrom* methods respect it. Let's make sure
def test_suiteClass__loadTestsFromNames(self):
    m = types.ModuleType('m')
    class Foo(unittest.TestCase):
        def test_1(self): pass
        def test_2(self): pass
        def foo_bar(self): pass
    m.Foo = Foo

    tests = [[Foo('test_1'), Foo('test_2')]]

    loader = unittest.TestLoader()
    loader.suiteClass = list
    self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests)

# "The default value is the TestSuite class"
def test_suiteClass__default_value(self):
    loader = unittest.TestLoader()
    self.assertTrue(loader.suiteClass is unittest.TestSuite)

################################################################
### /Tests for TestLoader.suiteClass

### Support code for Test_TestSuite
################################################################

class Foo(unittest.TestCase):
    def test_1(self): pass
    def test_2(self): pass
    def test_3(self): pass
    def runTest(self): pass

def _mk_TestSuite(*names):
    # Build a suite of Foo instances, one per requested method name.
    return unittest.TestSuite(Foo(n) for n in names)

################################################################
### /Support code for Test_TestSuite

class Test_TestSuite(TestCase, TestEquality):

    ### Set up attributes needed by inherited tests
    ################################################################

    # Used by TestEquality.test_eq
    eq_pairs = [(unittest.TestSuite(), unittest.TestSuite())
               ,(unittest.TestSuite(), unittest.TestSuite([]))
               ,(_mk_TestSuite('test_1'), _mk_TestSuite('test_1'))]

    # Used by TestEquality.test_ne
    ne_pairs = [(unittest.TestSuite(), _mk_TestSuite('test_1'))
               ,(unittest.TestSuite([]), _mk_TestSuite('test_1'))
               ,(_mk_TestSuite('test_1', 'test_2'), _mk_TestSuite('test_1', 'test_3'))
               ,(_mk_TestSuite('test_1'), _mk_TestSuite('test_2'))]

    ################################################################
    ### /Set up attributes needed by inherited tests

    ### Tests for TestSuite.__init__
    ################################################################

    # "class TestSuite([tests])"
    #
    # The tests iterable should be optional
    def test_init__tests_optional(self):
        suite = unittest.TestSuite()

        self.assertEqual(suite.countTestCases(), 0)

    # "If tests is given, it must be an iterable of individual test cases
    # or other test suites that will be used to build the suite initially"
    #
    # TestSuite should deal with empty tests iterables by allowing the
    # creation of an empty suite
    def test_init__empty_tests(self):
        suite = unittest.TestSuite([])

        self.assertEqual(suite.countTestCases(), 0)

    # "If tests is given, it must be an iterable of individual test cases
    # or other test suites that will be used to build the suite initially"
    #
    # TestSuite should allow any iterable to provide tests
    def test_init__tests_from_any_iterable(self):
        def tests():
            yield unittest.FunctionTestCase(lambda: None)
            yield unittest.FunctionTestCase(lambda: None)

        suite_1 = unittest.TestSuite(tests())
        self.assertEqual(suite_1.countTestCases(), 2)

        suite_2 = unittest.TestSuite(suite_1)
        self.assertEqual(suite_2.countTestCases(), 2)

        suite_3 = unittest.TestSuite(set(suite_1))
        self.assertEqual(suite_3.countTestCases(), 2)

    # "If tests is given, it must be an iterable of individual test cases
    # or other test suites that will be used to build the suite initially"
    #
    # Does TestSuite() also allow other TestSuite() instances to be present
    # in the tests iterable?
+ def test_init__TestSuite_instances_in_tests(self): + def tests(): + ftc = unittest.FunctionTestCase(lambda: None) + yield unittest.TestSuite([ftc]) + yield unittest.FunctionTestCase(lambda: None) + + suite = unittest.TestSuite(tests()) + self.assertEqual(suite.countTestCases(), 2) + + ################################################################ + ### /Tests for TestSuite.__init__ + + # Container types should support the iter protocol + def test_iter(self): + test1 = unittest.FunctionTestCase(lambda: None) + test2 = unittest.FunctionTestCase(lambda: None) + suite = unittest.TestSuite((test1, test2)) + + self.assertEqual(list(suite), [test1, test2]) + + # "Return the number of tests represented by the this test object. + # ...this method is also implemented by the TestSuite class, which can + # return larger [greater than 1] values" + # + # Presumably an empty TestSuite returns 0? + def test_countTestCases_zero_simple(self): + suite = unittest.TestSuite() + + self.assertEqual(suite.countTestCases(), 0) + + # "Return the number of tests represented by the this test object. + # ...this method is also implemented by the TestSuite class, which can + # return larger [greater than 1] values" + # + # Presumably an empty TestSuite (even if it contains other empty + # TestSuite instances) returns 0? + def test_countTestCases_zero_nested(self): + class Test1(unittest.TestCase): + def test(self): + pass + + suite = unittest.TestSuite([unittest.TestSuite()]) + + self.assertEqual(suite.countTestCases(), 0) + + # "Return the number of tests represented by the this test object. 
+ # ...this method is also implemented by the TestSuite class, which can + # return larger [greater than 1] values" + def test_countTestCases_simple(self): + test1 = unittest.FunctionTestCase(lambda: None) + test2 = unittest.FunctionTestCase(lambda: None) + suite = unittest.TestSuite((test1, test2)) + + self.assertEqual(suite.countTestCases(), 2) + + # "Return the number of tests represented by the this test object. + # ...this method is also implemented by the TestSuite class, which can + # return larger [greater than 1] values" + # + # Make sure this holds for nested TestSuite instances, too + def test_countTestCases_nested(self): + class Test1(unittest.TestCase): + def test1(self): pass + def test2(self): pass + + test2 = unittest.FunctionTestCase(lambda: None) + test3 = unittest.FunctionTestCase(lambda: None) + child = unittest.TestSuite((Test1('test2'), test2)) + parent = unittest.TestSuite((test3, child, Test1('test1'))) + + self.assertEqual(parent.countTestCases(), 4) + + # "Run the tests associated with this suite, collecting the result into + # the test result object passed as result." + # + # And if there are no tests? What then? + def test_run__empty_suite(self): + events = [] + result = LoggingResult(events) + + suite = unittest.TestSuite() + + suite.run(result) + + self.assertEqual(events, []) + + # "Note that unlike TestCase.run(), TestSuite.run() requires the + # "result object to be passed in." + def test_run__requires_result(self): + suite = unittest.TestSuite() + + try: + suite.run() + except TypeError: + pass + else: + self.fail("Failed to raise TypeError") + + # "Run the tests associated with this suite, collecting the result into + # the test result object passed as result." 
+ def test_run(self): + events = [] + result = LoggingResult(events) + + class LoggingCase(unittest.TestCase): + def run(self, result): + events.append('run %s' % self._testMethodName) + + def test1(self): pass + def test2(self): pass + + tests = [LoggingCase('test1'), LoggingCase('test2')] + + unittest.TestSuite(tests).run(result) + + self.assertEqual(events, ['run test1', 'run test2']) + + # "Add a TestCase ... to the suite" + def test_addTest__TestCase(self): + class Foo(unittest.TestCase): + def test(self): pass + + test = Foo('test') + suite = unittest.TestSuite() + + suite.addTest(test) + + self.assertEqual(suite.countTestCases(), 1) + self.assertEqual(list(suite), [test]) + + # "Add a ... TestSuite to the suite" + def test_addTest__TestSuite(self): + class Foo(unittest.TestCase): + def test(self): pass + + suite_2 = unittest.TestSuite([Foo('test')]) + + suite = unittest.TestSuite() + suite.addTest(suite_2) + + self.assertEqual(suite.countTestCases(), 1) + self.assertEqual(list(suite), [suite_2]) + + # "Add all the tests from an iterable of TestCase and TestSuite + # instances to this test suite." + # + # "This is equivalent to iterating over tests, calling addTest() for + # each element" + def test_addTests(self): + class Foo(unittest.TestCase): + def test_1(self): pass + def test_2(self): pass + + test_1 = Foo('test_1') + test_2 = Foo('test_2') + inner_suite = unittest.TestSuite([test_2]) + + def gen(): + yield test_1 + yield test_2 + yield inner_suite + + suite_1 = unittest.TestSuite() + suite_1.addTests(gen()) + + self.assertEqual(list(suite_1), list(gen())) + + # "This is equivalent to iterating over tests, calling addTest() for + # each element" + suite_2 = unittest.TestSuite() + for t in gen(): + suite_2.addTest(t) + + self.assertEqual(suite_1, suite_2) + + # "Add all the tests from an iterable of TestCase and TestSuite + # instances to this test suite." + # + # What happens if it doesn't get an iterable? 
def test_addTest__noniterable(self):
    # addTests() with a non-iterable must raise TypeError.  Rewritten
    # with assertRaises for consistency with the sibling tests below
    # (behavior is identical to the old try/except/else-fail form).
    suite = unittest.TestSuite()
    self.assertRaises(TypeError, suite.addTests, 5)

def test_addTest__noncallable(self):
    suite = unittest.TestSuite()
    self.assertRaises(TypeError, suite.addTest, 5)

def test_addTest__casesuiteclass(self):
    # Passing a TestCase or TestSuite *class* (rather than an instance)
    # must be rejected.
    suite = unittest.TestSuite()
    self.assertRaises(TypeError, suite.addTest, Test_TestSuite)
    self.assertRaises(TypeError, suite.addTest, unittest.TestSuite)

def test_addTests__string(self):
    # A string is iterable but is not a valid tests iterable.
    suite = unittest.TestSuite()
    self.assertRaises(TypeError, suite.addTests, "foo")


class Test_FunctionTestCase(TestCase):

    # "Return the number of tests represented by the this test object. For
    # TestCase instances, this will always be 1"
    def test_countTestCases(self):
        test = unittest.FunctionTestCase(lambda: None)

        self.assertEqual(test.countTestCases(), 1)

    # "When a setUp() method is defined, the test runner will run that method
    # prior to each test. Likewise, if a tearDown() method is defined, the
    # test runner will invoke that method after each test. In the example,
    # setUp() was used to create a fresh sequence for each test."
    #
    # Make sure the proper call order is maintained, even if setUp() raises
    # an exception.
    def test_run_call_order__error_in_setUp(self):
        events = []
        result = LoggingResult(events)

        def setUp():
            events.append('setUp')
            raise RuntimeError('raised by setUp')

        def test():
            events.append('test')

        def tearDown():
            events.append('tearDown')

        expected = ['startTest', 'setUp', 'addError', 'stopTest']
        unittest.FunctionTestCase(test, setUp, tearDown).run(result)
        self.assertEqual(events, expected)

    # "When a setUp() method is defined, the test runner will run that method
    # prior to each test. Likewise, if a tearDown() method is defined, the
    # test runner will invoke that method after each test.
# In the example,
# setUp() was used to create a fresh sequence for each test."
#
# Make sure the proper call order is maintained, even if the test raises
# an error (as opposed to a failure).
def test_run_call_order__error_in_test(self):
    events = []
    result = LoggingResult(events)

    def setUp():
        events.append('setUp')

    def test():
        events.append('test')
        raise RuntimeError('raised by test')

    def tearDown():
        events.append('tearDown')

    expected = ['startTest', 'setUp', 'test', 'addError', 'tearDown',
                'stopTest']
    unittest.FunctionTestCase(test, setUp, tearDown).run(result)
    self.assertEqual(events, expected)

# "When a setUp() method is defined, the test runner will run that method
# prior to each test. Likewise, if a tearDown() method is defined, the
# test runner will invoke that method after each test. In the example,
# setUp() was used to create a fresh sequence for each test."
#
# Make sure the proper call order is maintained, even if the test signals
# a failure (as opposed to an error).
def test_run_call_order__failure_in_test(self):
    events = []
    result = LoggingResult(events)

    def setUp():
        events.append('setUp')

    def test():
        events.append('test')
        self.fail('raised by test')

    def tearDown():
        events.append('tearDown')

    expected = ['startTest', 'setUp', 'test', 'addFailure', 'tearDown',
                'stopTest']
    unittest.FunctionTestCase(test, setUp, tearDown).run(result)
    self.assertEqual(events, expected)

# "When a setUp() method is defined, the test runner will run that method
# prior to each test. Likewise, if a tearDown() method is defined, the
# test runner will invoke that method after each test. In the example,
# setUp() was used to create a fresh sequence for each test."
#
# Make sure the proper call order is maintained, even if tearDown() raises
# an exception.
+ def test_run_call_order__error_in_tearDown(self): + events = [] + result = LoggingResult(events) + + def setUp(): + events.append('setUp') + + def test(): + events.append('test') + + def tearDown(): + events.append('tearDown') + raise RuntimeError('raised by tearDown') + + expected = ['startTest', 'setUp', 'test', 'tearDown', 'addError', + 'stopTest'] + unittest.FunctionTestCase(test, setUp, tearDown).run(result) + self.assertEqual(events, expected) + + # "Return a string identifying the specific test case." + # + # Because of the vague nature of the docs, I'm not going to lock this + # test down too much. Really all that can be asserted is that the id() + # will be a string (either 8-byte or unicode -- again, because the docs + # just say "string") + def test_id(self): + test = unittest.FunctionTestCase(lambda: None) + + self.assertTrue(isinstance(test.id(), basestring)) + + # "Returns a one-line description of the test, or None if no description + # has been provided. The default implementation of this method returns + # the first line of the test method's docstring, if available, or None." + def test_shortDescription__no_docstring(self): + test = unittest.FunctionTestCase(lambda: None) + + self.assertEqual(test.shortDescription(), None) + + # "Returns a one-line description of the test, or None if no description + # has been provided. The default implementation of this method returns + # the first line of the test method's docstring, if available, or None." + def test_shortDescription__singleline_docstring(self): + desc = "this tests foo" + test = unittest.FunctionTestCase(lambda: None, description=desc) + + self.assertEqual(test.shortDescription(), "this tests foo") + +class Test_TestResult(TestCase): + # Note: there are not separate tests for TestResult.wasSuccessful(), + # TestResult.errors, TestResult.failures, TestResult.testsRun or + # TestResult.shouldStop because these only have meaning in terms of + # other TestResult methods. 
+ # + # Accordingly, tests for the aforenamed attributes are incorporated + # in with the tests for the defining methods. + ################################################################ + + def test_init(self): + result = unittest.TestResult() + + self.assertTrue(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 0) + self.assertEqual(result.shouldStop, False) + + # "This method can be called to signal that the set of tests being + # run should be aborted by setting the TestResult's shouldStop + # attribute to True." + def test_stop(self): + result = unittest.TestResult() + + result.stop() + + self.assertEqual(result.shouldStop, True) + + # "Called when the test case test is about to be run. The default + # implementation simply increments the instance's testsRun counter." + def test_startTest(self): + class Foo(unittest.TestCase): + def test_1(self): + pass + + test = Foo('test_1') + + result = unittest.TestResult() + + result.startTest(test) + + self.assertTrue(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + result.stopTest(test) + + # "Called after the test case test has been executed, regardless of + # the outcome. The default implementation does nothing." 
+ def test_stopTest(self): + class Foo(unittest.TestCase): + def test_1(self): + pass + + test = Foo('test_1') + + result = unittest.TestResult() + + result.startTest(test) + + self.assertTrue(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + result.stopTest(test) + + # Same tests as above; make sure nothing has changed + self.assertTrue(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + # "Called before and after tests are run. The default implementation does nothing." + def test_startTestRun_stopTestRun(self): + result = unittest.TestResult() + result.startTestRun() + result.stopTestRun() + + # "addSuccess(test)" + # ... + # "Called when the test case test succeeds" + # ... + # "wasSuccessful() - Returns True if all tests run so far have passed, + # otherwise returns False" + # ... + # "testsRun - The total number of tests run so far." + # ... + # "errors - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test which raised an + # unexpected exception. Contains formatted + # tracebacks instead of sys.exc_info() results." + # ... + # "failures - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test where a failure was + # explicitly signalled using the TestCase.fail*() or TestCase.assert*() + # methods. Contains formatted tracebacks instead + # of sys.exc_info() results." 
+ def test_addSuccess(self): + class Foo(unittest.TestCase): + def test_1(self): + pass + + test = Foo('test_1') + + result = unittest.TestResult() + + result.startTest(test) + result.addSuccess(test) + result.stopTest(test) + + self.assertTrue(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + # "addFailure(test, err)" + # ... + # "Called when the test case test signals a failure. err is a tuple of + # the form returned by sys.exc_info(): (type, value, traceback)" + # ... + # "wasSuccessful() - Returns True if all tests run so far have passed, + # otherwise returns False" + # ... + # "testsRun - The total number of tests run so far." + # ... + # "errors - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test which raised an + # unexpected exception. Contains formatted + # tracebacks instead of sys.exc_info() results." + # ... + # "failures - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test where a failure was + # explicitly signalled using the TestCase.fail*() or TestCase.assert*() + # methods. Contains formatted tracebacks instead + # of sys.exc_info() results." 
+ def test_addFailure(self): + import sys + + class Foo(unittest.TestCase): + def test_1(self): + pass + + test = Foo('test_1') + try: + test.fail("foo") + except: + exc_info_tuple = sys.exc_info() + + result = unittest.TestResult() + + result.startTest(test) + result.addFailure(test, exc_info_tuple) + result.stopTest(test) + + self.assertFalse(result.wasSuccessful()) + self.assertEqual(len(result.errors), 0) + self.assertEqual(len(result.failures), 1) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + test_case, formatted_exc = result.failures[0] + self.assertTrue(test_case is test) + self.assertTrue(isinstance(formatted_exc, str)) + + # "addError(test, err)" + # ... + # "Called when the test case test raises an unexpected exception err + # is a tuple of the form returned by sys.exc_info(): + # (type, value, traceback)" + # ... + # "wasSuccessful() - Returns True if all tests run so far have passed, + # otherwise returns False" + # ... + # "testsRun - The total number of tests run so far." + # ... + # "errors - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test which raised an + # unexpected exception. Contains formatted + # tracebacks instead of sys.exc_info() results." + # ... + # "failures - A list containing 2-tuples of TestCase instances and + # formatted tracebacks. Each tuple represents a test where a failure was + # explicitly signalled using the TestCase.fail*() or TestCase.assert*() + # methods. Contains formatted tracebacks instead + # of sys.exc_info() results." 
+ def test_addError(self): + import sys + + class Foo(unittest.TestCase): + def test_1(self): + pass + + test = Foo('test_1') + try: + raise TypeError() + except: + exc_info_tuple = sys.exc_info() + + result = unittest.TestResult() + + result.startTest(test) + result.addError(test, exc_info_tuple) + result.stopTest(test) + + self.assertFalse(result.wasSuccessful()) + self.assertEqual(len(result.errors), 1) + self.assertEqual(len(result.failures), 0) + self.assertEqual(result.testsRun, 1) + self.assertEqual(result.shouldStop, False) + + test_case, formatted_exc = result.errors[0] + self.assertTrue(test_case is test) + self.assertTrue(isinstance(formatted_exc, str)) + +### Support code for Test_TestCase +################################################################ + +class Foo(unittest.TestCase): + def runTest(self): pass + def test1(self): pass + +class Bar(Foo): + def test2(self): pass + +class LoggingTestCase(unittest.TestCase): + """A test case which logs its calls.""" + + def __init__(self, events): + super(LoggingTestCase, self).__init__('test') + self.events = events + + def setUp(self): + if self.__class__ is LoggingTestCase: + # evade test discovery + raise unittest.SkipTest + self.events.append('setUp') + + def test(self): + self.events.append('test') + + def tearDown(self): + self.events.append('tearDown') + +class ResultWithNoStartTestRunStopTestRun(object): + """An object honouring TestResult before startTestRun/stopTestRun.""" + + def __init__(self): + self.failures = [] + self.errors = [] + self.testsRun = 0 + self.skipped = [] + self.expectedFailures = [] + self.unexpectedSuccesses = [] + self.shouldStop = False + + def startTest(self, test): + pass + + def stopTest(self, test): + pass + + def addError(self, test): + pass + + def addFailure(self, test): + pass + + def addSuccess(self, test): + pass + + def wasSuccessful(self): + return True + + +################################################################ +### /Support code for Test_TestCase + 
class Test_TestCase(TestCase, TestEquality, TestHashing):

    ### Set up attributes used by inherited tests
    ################################################################

    # Used by TestHashing.test_hash and TestEquality.test_eq
    eq_pairs = [(Foo('test1'), Foo('test1'))]

    # Used by TestEquality.test_ne
    ne_pairs = [(Foo('test1'), Foo('runTest'))
               ,(Foo('test1'), Bar('test1'))
               ,(Foo('test1'), Bar('test2'))]

    ################################################################
    ### /Set up attributes used by inherited tests


    # "class TestCase([methodName])"
    # ...
    # "Each instance of TestCase will run a single test method: the
    # method named methodName."
    # ...
    # "methodName defaults to "runTest"."
    #
    # Make sure it really is optional, and that it defaults to the proper
    # thing.
    def test_init__no_test_name(self):
        class Test(unittest.TestCase):
            def runTest(self): raise MyException()
            def test(self): pass

        self.assertEqual(Test().id()[-13:], '.Test.runTest')

    # "Each instance of TestCase will run a single test method: the
    # method named methodName."
    def test_init__test_name__valid(self):
        class Test(unittest.TestCase):
            def runTest(self): raise MyException()
            def test(self): pass

        self.assertEqual(Test('test').id()[-10:], '.Test.test')

    # "Each instance of TestCase will run a single test method: the
    # method named methodName."
    def test_init__test_name__invalid(self):
        class Test(unittest.TestCase):
            def runTest(self): raise MyException()
            def test(self): pass

        try:
            Test('testfoo')
        except ValueError:
            pass
        else:
            self.fail("Failed to raise ValueError")

    # "Return the number of tests represented by the this test object. For
    # TestCase instances, this will always be 1"
    def test_countTestCases(self):
        class Foo(unittest.TestCase):
            def test(self): pass

        self.assertEqual(Foo('test').countTestCases(), 1)

    # "Return the default type of test result object to be used to run this
    # test. For TestCase instances, this will always be
    # unittest.TestResult; subclasses of TestCase should
    # override this as necessary."
    def test_defaultTestResult(self):
        class Foo(unittest.TestCase):
            def runTest(self):
                pass

        result = Foo().defaultTestResult()
        self.assertEqual(type(result), unittest.TestResult)

    # Make sure the proper call order is maintained, even if setUp() raises
    # an exception.
    def test_run_call_order__error_in_setUp(self):
        events = []
        result = LoggingResult(events)

        class Foo(LoggingTestCase):
            def setUp(self):
                super(Foo, self).setUp()
                raise RuntimeError('raised by Foo.setUp')

        Foo(events).run(result)
        expected = ['startTest', 'setUp', 'addError', 'stopTest']
        self.assertEqual(events, expected)

    # "With a temporary result stopTestRun is called when setUp errors.
    def test_run_call_order__error_in_setUp_default_result(self):
        events = []

        class Foo(LoggingTestCase):
            def defaultTestResult(self):
                return LoggingResult(self.events)

            def setUp(self):
                super(Foo, self).setUp()
                raise RuntimeError('raised by Foo.setUp')

        Foo(events).run()
        expected = ['startTestRun', 'startTest', 'setUp', 'addError',
                    'stopTest', 'stopTestRun']
        self.assertEqual(events, expected)

    # Make sure the proper call order is maintained, even if the test raises
    # an error (as opposed to a failure).
    def test_run_call_order__error_in_test(self):
        events = []
        result = LoggingResult(events)

        class Foo(LoggingTestCase):
            def test(self):
                super(Foo, self).test()
                raise RuntimeError('raised by Foo.test')

        expected = ['startTest', 'setUp', 'test', 'addError', 'tearDown',
                    'stopTest']
        Foo(events).run(result)
        self.assertEqual(events, expected)

    # "With a default result, an error in the test still results in stopTestRun
    # being called."
    def test_run_call_order__error_in_test_default_result(self):
        events = []

        class Foo(LoggingTestCase):
            def defaultTestResult(self):
                return LoggingResult(self.events)

            def test(self):
                super(Foo, self).test()
                raise RuntimeError('raised by Foo.test')

        expected = ['startTestRun', 'startTest', 'setUp', 'test', 'addError',
                    'tearDown', 'stopTest', 'stopTestRun']
        Foo(events).run()
        self.assertEqual(events, expected)

    # Make sure the proper call order is maintained, even if the test signals
    # a failure (as opposed to an error).
def test_run_call_order__failure_in_test(self):
    events = []
    result = LoggingResult(events)

    class Foo(LoggingTestCase):
        def test(self):
            super(Foo, self).test()
            self.fail('raised by Foo.test')

    expected = ['startTest', 'setUp', 'test', 'addFailure', 'tearDown',
                'stopTest']
    Foo(events).run(result)
    self.assertEqual(events, expected)

# "When a test fails with a default result stopTestRun is still called."
def test_run_call_order__failure_in_test_default_result(self):

    class Foo(LoggingTestCase):
        def defaultTestResult(self):
            return LoggingResult(self.events)
        def test(self):
            super(Foo, self).test()
            self.fail('raised by Foo.test')

    expected = ['startTestRun', 'startTest', 'setUp', 'test', 'addFailure',
                'tearDown', 'stopTest', 'stopTestRun']
    events = []
    Foo(events).run()
    self.assertEqual(events, expected)

# Make sure the proper call order is maintained, even if tearDown() raises
# an exception.
def test_run_call_order__error_in_tearDown(self):
    events = []
    result = LoggingResult(events)

    class Foo(LoggingTestCase):
        def tearDown(self):
            super(Foo, self).tearDown()
            raise RuntimeError('raised by Foo.tearDown')

    Foo(events).run(result)
    expected = ['startTest', 'setUp', 'test', 'tearDown', 'addError',
                'stopTest']
    self.assertEqual(events, expected)

# "When tearDown errors with a default result stopTestRun is still called."
+ def test_run_call_order__error_in_tearDown_default_result(self): + + class Foo(LoggingTestCase): + def defaultTestResult(self): + return LoggingResult(self.events) + def tearDown(self): + super(Foo, self).tearDown() + raise RuntimeError('raised by Foo.tearDown') + + events = [] + Foo(events).run() + expected = ['startTestRun', 'startTest', 'setUp', 'test', 'tearDown', + 'addError', 'stopTest', 'stopTestRun'] + self.assertEqual(events, expected) + + # "TestCase.run() still works when the defaultTestResult is a TestResult + # that does not support startTestRun and stopTestRun. + def test_run_call_order_default_result(self): + + class Foo(unittest.TestCase): + def defaultTestResult(self): + return ResultWithNoStartTestRunStopTestRun() + def test(self): + pass + + Foo('test').run() + + # "This class attribute gives the exception raised by the test() method. + # If a test framework needs to use a specialized exception, possibly to + # carry additional information, it must subclass this exception in + # order to ``play fair'' with the framework. The initial value of this + # attribute is AssertionError" + def test_failureException__default(self): + class Foo(unittest.TestCase): + def test(self): + pass + + self.assertTrue(Foo('test').failureException is AssertionError) + + # "This class attribute gives the exception raised by the test() method. + # If a test framework needs to use a specialized exception, possibly to + # carry additional information, it must subclass this exception in + # order to ``play fair'' with the framework." 
+ # + # Make sure TestCase.run() respects the designated failureException + def test_failureException__subclassing__explicit_raise(self): + events = [] + result = LoggingResult(events) + + class Foo(unittest.TestCase): + def test(self): + raise RuntimeError() + + failureException = RuntimeError + + self.assertTrue(Foo('test').failureException is RuntimeError) + + + Foo('test').run(result) + expected = ['startTest', 'addFailure', 'stopTest'] + self.assertEqual(events, expected) + + # "This class attribute gives the exception raised by the test() method. + # If a test framework needs to use a specialized exception, possibly to + # carry additional information, it must subclass this exception in + # order to ``play fair'' with the framework." + # + # Make sure TestCase.run() respects the designated failureException + def test_failureException__subclassing__implicit_raise(self): + events = [] + result = LoggingResult(events) + + class Foo(unittest.TestCase): + def test(self): + self.fail("foo") + + failureException = RuntimeError + + self.assertTrue(Foo('test').failureException is RuntimeError) + + + Foo('test').run(result) + expected = ['startTest', 'addFailure', 'stopTest'] + self.assertEqual(events, expected) + + # "The default implementation does nothing." + def test_setUp(self): + class Foo(unittest.TestCase): + def runTest(self): + pass + + # ... and nothing should happen + Foo().setUp() + + # "The default implementation does nothing." + def test_tearDown(self): + class Foo(unittest.TestCase): + def runTest(self): + pass + + # ... and nothing should happen + Foo().tearDown() + + # "Return a string identifying the specific test case." + # + # Because of the vague nature of the docs, I'm not going to lock this + # test down too much. 
Really all that can be asserted is that the id() + # will be a string (either 8-byte or unicode -- again, because the docs + # just say "string") + def test_id(self): + class Foo(unittest.TestCase): + def runTest(self): + pass + + self.assertTrue(isinstance(Foo().id(), basestring)) + + # "If result is omitted or None, a temporary result object is created + # and used, but is not made available to the caller. As TestCase owns the + # temporary result startTestRun and stopTestRun are called. + + def test_run__uses_defaultTestResult(self): + events = [] + + class Foo(unittest.TestCase): + def test(self): + events.append('test') + + def defaultTestResult(self): + return LoggingResult(events) + + # Make run() find a result object on its own + Foo('test').run() + + expected = ['startTestRun', 'startTest', 'test', 'addSuccess', + 'stopTest', 'stopTestRun'] + self.assertEqual(events, expected) + + def testShortDescriptionWithoutDocstring(self): + self.assertEqual( + self.shortDescription(), + 'testShortDescriptionWithoutDocstring (' + __name__ + + '.Test_TestCase)') + + def testShortDescriptionWithOneLineDocstring(self): + """Tests shortDescription() for a method with a docstring.""" + self.assertEqual( + self.shortDescription(), + ('testShortDescriptionWithOneLineDocstring ' + '(' + __name__ + '.Test_TestCase)\n' + 'Tests shortDescription() for a method with a docstring.')) + + def testShortDescriptionWithMultiLineDocstring(self): + """Tests shortDescription() for a method with a longer docstring. + + This method ensures that only the first line of a docstring is + returned used in the short description, no matter how long the + whole thing is. 
+ """ + self.assertEqual( + self.shortDescription(), + ('testShortDescriptionWithMultiLineDocstring ' + '(' + __name__ + '.Test_TestCase)\n' + 'Tests shortDescription() for a method with a longer ' + 'docstring.')) + + def testAddTypeEqualityFunc(self): + class SadSnake(object): + """Dummy class for test_addTypeEqualityFunc.""" + s1, s2 = SadSnake(), SadSnake() + self.assertFalse(s1 == s2) + def AllSnakesCreatedEqual(a, b, msg=None): + return type(a) == type(b) == SadSnake + self.addTypeEqualityFunc(SadSnake, AllSnakesCreatedEqual) + self.assertEqual(s1, s2) + # No this doesn't clean up and remove the SadSnake equality func + # from this TestCase instance but since its a local nothing else + # will ever notice that. + + def testAssertIs(self): + thing = object() + self.assertIs(thing, thing) + self.assertRaises(self.failureException, self.assertIs, thing, object()) + + def testAssertIsNot(self): + thing = object() + self.assertIsNot(thing, object()) + self.assertRaises(self.failureException, self.assertIsNot, thing, thing) + + def testAssertIsInstance(self): + thing = [] + self.assertIsInstance(thing, list) + self.assertRaises(self.failureException, self.assertIsInstance, + thing, dict) + + def testAssertNotIsInstance(self): + thing = [] + self.assertNotIsInstance(thing, dict) + self.assertRaises(self.failureException, self.assertNotIsInstance, + thing, list) + + def testAssertIn(self): + animals = {'monkey': 'banana', 'cow': 'grass', 'seal': 'fish'} + + self.assertIn('a', 'abc') + self.assertIn(2, [1, 2, 3]) + self.assertIn('monkey', animals) + + self.assertNotIn('d', 'abc') + self.assertNotIn(0, [1, 2, 3]) + self.assertNotIn('otter', animals) + + self.assertRaises(self.failureException, self.assertIn, 'x', 'abc') + self.assertRaises(self.failureException, self.assertIn, 4, [1, 2, 3]) + self.assertRaises(self.failureException, self.assertIn, 'elephant', + animals) + + self.assertRaises(self.failureException, self.assertNotIn, 'c', 'abc') + 
self.assertRaises(self.failureException, self.assertNotIn, 1, [1, 2, 3]) + self.assertRaises(self.failureException, self.assertNotIn, 'cow', + animals) + + def testAssertDictContainsSubset(self): + self.assertDictContainsSubset({}, {}) + self.assertDictContainsSubset({}, {'a': 1}) + self.assertDictContainsSubset({'a': 1}, {'a': 1}) + self.assertDictContainsSubset({'a': 1}, {'a': 1, 'b': 2}) + self.assertDictContainsSubset({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) + + self.assertRaises(unittest.TestCase.failureException, + self.assertDictContainsSubset, {'a': 2}, {'a': 1}, + '.*Mismatched values:.*') + + self.assertRaises(unittest.TestCase.failureException, + self.assertDictContainsSubset, {'c': 1}, {'a': 1}, + '.*Missing:.*') + + self.assertRaises(unittest.TestCase.failureException, + self.assertDictContainsSubset, {'a': 1, 'c': 1}, + {'a': 1}, '.*Missing:.*') + + self.assertRaises(unittest.TestCase.failureException, + self.assertDictContainsSubset, {'a': 1, 'c': 1}, + {'a': 1}, '.*Missing:.*Mismatched values:.*') + + def testAssertEqual(self): + equal_pairs = [ + ((), ()), + ({}, {}), + ([], []), + (set(), set()), + (frozenset(), frozenset())] + for a, b in equal_pairs: + # This mess of try excepts is to test the assertEqual behavior + # itself. 
+ try: + self.assertEqual(a, b) + except self.failureException: + self.fail('assertEqual(%r, %r) failed' % (a, b)) + try: + self.assertEqual(a, b, msg='foo') + except self.failureException: + self.fail('assertEqual(%r, %r) with msg= failed' % (a, b)) + try: + self.assertEqual(a, b, 'foo') + except self.failureException: + self.fail('assertEqual(%r, %r) with third parameter failed' % + (a, b)) + + unequal_pairs = [ + ((), []), + ({}, set()), + (set([4,1]), frozenset([4,2])), + (frozenset([4,5]), set([2,3])), + (set([3,4]), set([5,4]))] + for a, b in unequal_pairs: + self.assertRaises(self.failureException, self.assertEqual, a, b) + self.assertRaises(self.failureException, self.assertEqual, a, b, + 'foo') + self.assertRaises(self.failureException, self.assertEqual, a, b, + msg='foo') + + def testEquality(self): + self.assertListEqual([], []) + self.assertTupleEqual((), ()) + self.assertSequenceEqual([], ()) + + a = [0, 'a', []] + b = [] + self.assertRaises(unittest.TestCase.failureException, + self.assertListEqual, a, b) + self.assertRaises(unittest.TestCase.failureException, + self.assertListEqual, tuple(a), tuple(b)) + self.assertRaises(unittest.TestCase.failureException, + self.assertSequenceEqual, a, tuple(b)) + + b.extend(a) + self.assertListEqual(a, b) + self.assertTupleEqual(tuple(a), tuple(b)) + self.assertSequenceEqual(a, tuple(b)) + self.assertSequenceEqual(tuple(a), b) + + self.assertRaises(self.failureException, self.assertListEqual, + a, tuple(b)) + self.assertRaises(self.failureException, self.assertTupleEqual, + tuple(a), b) + self.assertRaises(self.failureException, self.assertListEqual, None, b) + self.assertRaises(self.failureException, self.assertTupleEqual, None, + tuple(b)) + self.assertRaises(self.failureException, self.assertSequenceEqual, + None, tuple(b)) + self.assertRaises(self.failureException, self.assertListEqual, 1, 1) + self.assertRaises(self.failureException, self.assertTupleEqual, 1, 1) + self.assertRaises(self.failureException, 
self.assertSequenceEqual, + 1, 1) + + self.assertDictEqual({}, {}) + + c = { 'x': 1 } + d = {} + self.assertRaises(unittest.TestCase.failureException, + self.assertDictEqual, c, d) + + d.update(c) + self.assertDictEqual(c, d) + + d['x'] = 0 + self.assertRaises(unittest.TestCase.failureException, + self.assertDictEqual, c, d, 'These are unequal') + + self.assertRaises(self.failureException, self.assertDictEqual, None, d) + self.assertRaises(self.failureException, self.assertDictEqual, [], d) + self.assertRaises(self.failureException, self.assertDictEqual, 1, 1) + + self.assertSameElements([1, 2, 3], [3, 2, 1]) + self.assertSameElements([1, 2] + [3] * 100, [1] * 100 + [2, 3]) + self.assertSameElements(['foo', 'bar', 'baz'], ['bar', 'baz', 'foo']) + self.assertRaises(self.failureException, self.assertSameElements, + [10], [10, 11]) + self.assertRaises(self.failureException, self.assertSameElements, + [10, 11], [10]) + + # Test that sequences of unhashable objects can be tested for sameness: + self.assertSameElements([[1, 2], [3, 4]], [[3, 4], [1, 2]]) + + self.assertSameElements([{'a': 1}, {'b': 2}], [{'b': 2}, {'a': 1}]) + self.assertRaises(self.failureException, self.assertSameElements, + [[1]], [[2]]) + + def testAssertSetEqual(self): + set1 = set() + set2 = set() + self.assertSetEqual(set1, set2) + + self.assertRaises(self.failureException, self.assertSetEqual, None, set2) + self.assertRaises(self.failureException, self.assertSetEqual, [], set2) + self.assertRaises(self.failureException, self.assertSetEqual, set1, None) + self.assertRaises(self.failureException, self.assertSetEqual, set1, []) + + set1 = set(['a']) + set2 = set() + self.assertRaises(self.failureException, self.assertSetEqual, set1, set2) + + set1 = set(['a']) + set2 = set(['a']) + self.assertSetEqual(set1, set2) + + set1 = set(['a']) + set2 = set(['a', 'b']) + self.assertRaises(self.failureException, self.assertSetEqual, set1, set2) + + set1 = set(['a']) + set2 = frozenset(['a', 'b']) + 
self.assertRaises(self.failureException, self.assertSetEqual, set1, set2) + + set1 = set(['a', 'b']) + set2 = frozenset(['a', 'b']) + self.assertSetEqual(set1, set2) + + set1 = set() + set2 = "foo" + self.assertRaises(self.failureException, self.assertSetEqual, set1, set2) + self.assertRaises(self.failureException, self.assertSetEqual, set2, set1) + + # make sure any string formatting is tuple-safe + set1 = set([(0, 1), (2, 3)]) + set2 = set([(4, 5)]) + self.assertRaises(self.failureException, self.assertSetEqual, set1, set2) + + def testInequality(self): + # Try ints + self.assertGreater(2, 1) + self.assertGreaterEqual(2, 1) + self.assertGreaterEqual(1, 1) + self.assertLess(1, 2) + self.assertLessEqual(1, 2) + self.assertLessEqual(1, 1) + self.assertRaises(self.failureException, self.assertGreater, 1, 2) + self.assertRaises(self.failureException, self.assertGreater, 1, 1) + self.assertRaises(self.failureException, self.assertGreaterEqual, 1, 2) + self.assertRaises(self.failureException, self.assertLess, 2, 1) + self.assertRaises(self.failureException, self.assertLess, 1, 1) + self.assertRaises(self.failureException, self.assertLessEqual, 2, 1) + + # Try Floats + self.assertGreater(1.1, 1.0) + self.assertGreaterEqual(1.1, 1.0) + self.assertGreaterEqual(1.0, 1.0) + self.assertLess(1.0, 1.1) + self.assertLessEqual(1.0, 1.1) + self.assertLessEqual(1.0, 1.0) + self.assertRaises(self.failureException, self.assertGreater, 1.0, 1.1) + self.assertRaises(self.failureException, self.assertGreater, 1.0, 1.0) + self.assertRaises(self.failureException, self.assertGreaterEqual, 1.0, 1.1) + self.assertRaises(self.failureException, self.assertLess, 1.1, 1.0) + self.assertRaises(self.failureException, self.assertLess, 1.0, 1.0) + self.assertRaises(self.failureException, self.assertLessEqual, 1.1, 1.0) + + # Try Strings + self.assertGreater('bug', 'ant') + self.assertGreaterEqual('bug', 'ant') + self.assertGreaterEqual('ant', 'ant') + self.assertLess('ant', 'bug') + 
self.assertLessEqual('ant', 'bug') + self.assertLessEqual('ant', 'ant') + self.assertRaises(self.failureException, self.assertGreater, 'ant', 'bug') + self.assertRaises(self.failureException, self.assertGreater, 'ant', 'ant') + self.assertRaises(self.failureException, self.assertGreaterEqual, 'ant', 'bug') + self.assertRaises(self.failureException, self.assertLess, 'bug', 'ant') + self.assertRaises(self.failureException, self.assertLess, 'ant', 'ant') + self.assertRaises(self.failureException, self.assertLessEqual, 'bug', 'ant') + + # Try Unicode + self.assertGreater(u'bug', u'ant') + self.assertGreaterEqual(u'bug', u'ant') + self.assertGreaterEqual(u'ant', u'ant') + self.assertLess(u'ant', u'bug') + self.assertLessEqual(u'ant', u'bug') + self.assertLessEqual(u'ant', u'ant') + self.assertRaises(self.failureException, self.assertGreater, u'ant', u'bug') + self.assertRaises(self.failureException, self.assertGreater, u'ant', u'ant') + self.assertRaises(self.failureException, self.assertGreaterEqual, u'ant', + u'bug') + self.assertRaises(self.failureException, self.assertLess, u'bug', u'ant') + self.assertRaises(self.failureException, self.assertLess, u'ant', u'ant') + self.assertRaises(self.failureException, self.assertLessEqual, u'bug', u'ant') + + # Try Mixed String/Unicode + self.assertGreater('bug', u'ant') + self.assertGreater(u'bug', 'ant') + self.assertGreaterEqual('bug', u'ant') + self.assertGreaterEqual(u'bug', 'ant') + self.assertGreaterEqual('ant', u'ant') + self.assertGreaterEqual(u'ant', 'ant') + self.assertLess('ant', u'bug') + self.assertLess(u'ant', 'bug') + self.assertLessEqual('ant', u'bug') + self.assertLessEqual(u'ant', 'bug') + self.assertLessEqual('ant', u'ant') + self.assertLessEqual(u'ant', 'ant') + self.assertRaises(self.failureException, self.assertGreater, 'ant', u'bug') + self.assertRaises(self.failureException, self.assertGreater, u'ant', 'bug') + self.assertRaises(self.failureException, self.assertGreater, 'ant', u'ant') + 
self.assertRaises(self.failureException, self.assertGreater, u'ant', 'ant') + self.assertRaises(self.failureException, self.assertGreaterEqual, 'ant', + u'bug') + self.assertRaises(self.failureException, self.assertGreaterEqual, u'ant', + 'bug') + self.assertRaises(self.failureException, self.assertLess, 'bug', u'ant') + self.assertRaises(self.failureException, self.assertLess, u'bug', 'ant') + self.assertRaises(self.failureException, self.assertLess, 'ant', u'ant') + self.assertRaises(self.failureException, self.assertLess, u'ant', 'ant') + self.assertRaises(self.failureException, self.assertLessEqual, 'bug', u'ant') + self.assertRaises(self.failureException, self.assertLessEqual, u'bug', 'ant') + + def testAssertMultiLineEqual(self): + sample_text = """\ +http://www.python.org/doc/2.3/lib/module-unittest.html +test case + A test case is the smallest unit of testing. [...] +""" + revised_sample_text = """\ +http://www.python.org/doc/2.4.1/lib/module-unittest.html +test case + A test case is the smallest unit of testing. [...] You may provide your + own implementation that does not subclass from TestCase, of course. +""" + sample_text_error = """ +- http://www.python.org/doc/2.3/lib/module-unittest.html +? ^ ++ http://www.python.org/doc/2.4.1/lib/module-unittest.html +? ^^^ + test case +- A test case is the smallest unit of testing. [...] ++ A test case is the smallest unit of testing. [...] You may provide your +? +++++++++++++++++++++ ++ own implementation that does not subclass from TestCase, of course. +""" + + for type_changer in (lambda x: x, lambda x: x.decode('utf8')): + try: + self.assertMultiLineEqual(type_changer(sample_text), + type_changer(revised_sample_text)) + except self.failureException, e: + # no fair testing ourself with ourself, use assertEqual.. 
+ self.assertEqual(sample_text_error, str(e).encode('utf8')) + + def testAssertIsNone(self): + self.assertIsNone(None) + self.assertRaises(self.failureException, self.assertIsNone, False) + self.assertIsNotNone('DjZoPloGears on Rails') + self.assertRaises(self.failureException, self.assertIsNotNone, None) + + def testAssertRegexpMatches(self): + self.assertRegexpMatches('asdfabasdf', r'ab+') + self.assertRaises(self.failureException, self.assertRegexpMatches, + 'saaas', r'aaaa') + + def testAssertRaisesRegexp(self): + class ExceptionMock(Exception): + pass + + def Stub(): + raise ExceptionMock('We expect') + + self.assertRaisesRegexp(ExceptionMock, re.compile('expect$'), Stub) + self.assertRaisesRegexp(ExceptionMock, 'expect$', Stub) + self.assertRaisesRegexp(ExceptionMock, u'expect$', Stub) + + def testAssertNotRaisesRegexp(self): + self.assertRaisesRegexp( + self.failureException, '^Exception not raised$', + self.assertRaisesRegexp, Exception, re.compile('x'), + lambda: None) + self.assertRaisesRegexp( + self.failureException, '^Exception not raised$', + self.assertRaisesRegexp, Exception, 'x', + lambda: None) + self.assertRaisesRegexp( + self.failureException, '^Exception not raised$', + self.assertRaisesRegexp, Exception, u'x', + lambda: None) + + def testAssertRaisesRegexpMismatch(self): + def Stub(): + raise Exception('Unexpected') + + self.assertRaisesRegexp( + self.failureException, + r'"\^Expected\$" does not match "Unexpected"', + self.assertRaisesRegexp, Exception, '^Expected$', + Stub) + self.assertRaisesRegexp( + self.failureException, + r'"\^Expected\$" does not match "Unexpected"', + self.assertRaisesRegexp, Exception, u'^Expected$', + Stub) + self.assertRaisesRegexp( + self.failureException, + r'"\^Expected\$" does not match "Unexpected"', + self.assertRaisesRegexp, Exception, + re.compile('^Expected$'), Stub) + + # def testAssertRaisesExcValue(self): + # class ExceptionMock(Exception): + # pass + + # def Stub(foo): + # raise ExceptionMock(foo) + # 
v = "particular value" + + # ctx = self.assertRaises(ExceptionMock) + # with ctx: + # Stub(v) + # e = ctx.exc_value + # self.assertTrue(isinstance(e, ExceptionMock)) + # self.assertEqual(e.args[0], v) + + def testSynonymAssertMethodNames(self): + """Test undocumented method name synonyms. + + Please do not use these methods names in your own code. + + This test confirms their continued existence and functionality + in order to avoid breaking existing code. + """ + self.assertNotEquals(3, 5) + self.assertEquals(3, 3) + self.assertAlmostEquals(2.0, 2.0) + self.assertNotAlmostEquals(3.0, 5.0) + self.assert_(True) + + def testPendingDeprecationMethodNames(self): + """Test fail* methods pending deprecation, they will warn in 3.2. + + Do not use these methods. They will go away in 3.3. + """ + self.failIfEqual(3, 5) + self.failUnlessEqual(3, 3) + self.failUnlessAlmostEqual(2.0, 2.0) + self.failIfAlmostEqual(3.0, 5.0) + self.failUnless(True) + self.failUnlessRaises(TypeError, lambda _: 3.14 + u'spam') + self.failIf(False) + + # not sure why this is broken, don't care + # def testDeepcopy(self): + # # Issue: 5660 + # class TestableTest(TestCase): + # def testNothing(self): + # pass + + # test = TestableTest('testNothing') + + # # This shouldn't blow up + # deepcopy(test) + + +class Test_TestSkipping(TestCase): + + def test_skipping(self): + class Foo(unittest.TestCase): + def test_skip_me(self): + self.skipTest("skip") + events = [] + result = LoggingResult(events) + test = Foo("test_skip_me") + test.run(result) + self.assertEqual(events, ['startTest', 'addSkip', 'stopTest']) + self.assertEqual(result.skipped, [(test, "skip")]) + + # Try letting setUp skip the test now. 
+ class Foo(unittest.TestCase): + def setUp(self): + self.skipTest("testing") + def test_nothing(self): pass + events = [] + result = LoggingResult(events) + test = Foo("test_nothing") + test.run(result) + self.assertEqual(events, ['startTest', 'addSkip', 'stopTest']) + self.assertEqual(result.skipped, [(test, "testing")]) + self.assertEqual(result.testsRun, 1) + + def test_skipping_decorators(self): + op_table = ((unittest.skipUnless, False, True), + (unittest.skipIf, True, False)) + for deco, do_skip, dont_skip in op_table: + class Foo(unittest.TestCase): + @deco(do_skip, "testing") + def test_skip(self): pass + + @deco(dont_skip, "testing") + def test_dont_skip(self): pass + test_do_skip = Foo("test_skip") + test_dont_skip = Foo("test_dont_skip") + suite = unittest.TestSuite([test_do_skip, test_dont_skip]) + events = [] + result = LoggingResult(events) + suite.run(result) + self.assertEqual(len(result.skipped), 1) + expected = ['startTest', 'addSkip', 'stopTest', + 'startTest', 'addSuccess', 'stopTest'] + self.assertEqual(events, expected) + self.assertEqual(result.testsRun, 2) + self.assertEqual(result.skipped, [(test_do_skip, "testing")]) + self.assertTrue(result.wasSuccessful()) + + def test_skip_class(self): + class Foo(unittest.TestCase): + def test_1(self): + record.append(1) + Foo = unittest.skip("testing")(Foo) + record = [] + result = unittest.TestResult() + test = Foo("test_1") + suite = unittest.TestSuite([test]) + suite.run(result) + self.assertEqual(result.skipped, [(test, "testing")]) + self.assertEqual(record, []) + + def test_expected_failure(self): + class Foo(unittest.TestCase): + @unittest.expectedFailure + def test_die(self): + self.fail("help me!") + events = [] + result = LoggingResult(events) + test = Foo("test_die") + test.run(result) + self.assertEqual(events, + ['startTest', 'addExpectedFailure', 'stopTest']) + self.assertEqual(result.expectedFailures[0][0], test) + self.assertTrue(result.wasSuccessful()) + + def 
test_unexpected_success(self): + class Foo(unittest.TestCase): + @unittest.expectedFailure + def test_die(self): + pass + events = [] + result = LoggingResult(events) + test = Foo("test_die") + test.run(result) + self.assertEqual(events, + ['startTest', 'addUnexpectedSuccess', 'stopTest']) + self.assertFalse(result.failures) + self.assertEqual(result.unexpectedSuccesses, [test]) + self.assertTrue(result.wasSuccessful()) + + + +class Test_Assertions(TestCase): + def test_AlmostEqual(self): + self.assertAlmostEqual(1.00000001, 1.0) + self.assertNotAlmostEqual(1.0000001, 1.0) + self.assertRaises(self.failureException, + self.assertAlmostEqual, 1.0000001, 1.0) + self.assertRaises(self.failureException, + self.assertNotAlmostEqual, 1.00000001, 1.0) + + self.assertAlmostEqual(1.1, 1.0, places=0) + self.assertRaises(self.failureException, + self.assertAlmostEqual, 1.1, 1.0, places=1) + + self.assertAlmostEqual(0, .1+.1j, places=0) + self.assertNotAlmostEqual(0, .1+.1j, places=1) + self.assertRaises(self.failureException, + self.assertAlmostEqual, 0, .1+.1j, places=1) + self.assertRaises(self.failureException, + self.assertNotAlmostEqual, 0, .1+.1j, places=0) + + self.assertAlmostEqual(float('inf'), float('inf')) + self.assertRaises(self.failureException, self.assertNotAlmostEqual, + float('inf'), float('inf')) + + + def test_assertRaises(self): + def _raise(e): + raise e + self.assertRaises(KeyError, _raise, KeyError) + self.assertRaises(KeyError, _raise, KeyError("key")) + try: + self.assertRaises(KeyError, lambda: None) + except self.failureException, e: + self.assert_("KeyError not raised" in e, str(e)) + else: + self.fail("assertRaises() didn't fail") + try: + self.assertRaises(KeyError, _raise, ValueError) + except ValueError: + pass + else: + self.fail("assertRaises() didn't let exception pass through") + # with self.assertRaises(KeyError): + # raise KeyError + # with self.assertRaises(KeyError): + # raise KeyError("key") + # try: + # with 
self.assertRaises(KeyError): + # pass + # except self.failureException as e: + # self.assert_("KeyError not raised" in e, str(e)) + # else: + # self.fail("assertRaises() didn't fail") + # try: + # with self.assertRaises(KeyError): + # raise ValueError + # except ValueError: + # pass + # else: + # self.fail("assertRaises() didn't let exception pass through") + + +class TestLongMessage(TestCase): + """Test that the individual asserts honour longMessage. + This actually tests all the message behaviour for + asserts that use longMessage.""" + + def setUp(self): + class TestableTestFalse(TestCase): + longMessage = False + failureException = self.failureException + + def testTest(self): + pass + + class TestableTestTrue(TestCase): + longMessage = True + failureException = self.failureException + + def testTest(self): + pass + + self.testableTrue = TestableTestTrue('testTest') + self.testableFalse = TestableTestFalse('testTest') + + def testDefault(self): + self.assertFalse(TestCase.longMessage) + + def test_formatMsg(self): + self.assertEquals(self.testableFalse._formatMessage(None, "foo"), "foo") + self.assertEquals(self.testableFalse._formatMessage("foo", "bar"), "foo") + + self.assertEquals(self.testableTrue._formatMessage(None, "foo"), "foo") + self.assertEquals(self.testableTrue._formatMessage("foo", "bar"), "bar : foo") + + def assertMessages(self, methodName, args, errors): + def getMethod(i): + useTestableFalse = i < 2 + if useTestableFalse: + test = self.testableFalse + else: + test = self.testableTrue + return getattr(test, methodName) + + for i, expected_regexp in enumerate(errors): + testMethod = getMethod(i) + kwargs = {} + withMsg = i % 2 + if withMsg: + kwargs = {"msg": "oops"} + + self.assertRaisesRegexp(self.failureException, expected_regexp, + lambda: testMethod(*args, **kwargs)) + + def testAssertTrue(self): + self.assertMessages('assertTrue', (False,), + ["^False is not True$", "^oops$", "^False is not True$", + "^False is not True : oops$"]) + + def 
testAssertFalse(self): + self.assertMessages('assertFalse', (True,), + ["^True is not False$", "^oops$", "^True is not False$", + "^True is not False : oops$"]) + + def testNotEqual(self): + self.assertMessages('assertNotEqual', (1, 1), + ["^1 == 1$", "^oops$", "^1 == 1$", + "^1 == 1 : oops$"]) + + def testAlmostEqual(self): + self.assertMessages('assertAlmostEqual', (1, 2), + ["^1 != 2 within 7 places$", "^oops$", + "^1 != 2 within 7 places$", "^1 != 2 within 7 places : oops$"]) + + def testNotAlmostEqual(self): + self.assertMessages('assertNotAlmostEqual', (1, 1), + ["^1 == 1 within 7 places$", "^oops$", + "^1 == 1 within 7 places$", "^1 == 1 within 7 places : oops$"]) + + def test_baseAssertEqual(self): + self.assertMessages('_baseAssertEqual', (1, 2), + ["^1 != 2$", "^oops$", "^1 != 2$", "^1 != 2 : oops$"]) + + def testAssertSequenceEqual(self): + # Error messages are multiline so not testing on full message + # assertTupleEqual and assertListEqual delegate to this method + self.assertMessages('assertSequenceEqual', ([], [None]), + ["\+ \[None\]$", "^oops$", r"\+ \[None\]$", + r"\+ \[None\] : oops$"]) + + def testAssertSetEqual(self): + self.assertMessages('assertSetEqual', (set(), set([None])), + ["None$", "^oops$", "None$", + "None : oops$"]) + + def testAssertIn(self): + self.assertMessages('assertIn', (None, []), + ['^None not found in \[\]$', "^oops$", + '^None not found in \[\]$', + '^None not found in \[\] : oops$']) + + def testAssertNotIn(self): + self.assertMessages('assertNotIn', (None, [None]), + ['^None unexpectedly found in \[None\]$', "^oops$", + '^None unexpectedly found in \[None\]$', + '^None unexpectedly found in \[None\] : oops$']) + + def testAssertDictEqual(self): + self.assertMessages('assertDictEqual', ({}, {'key': 'value'}), + [r"\+ \{'key': 'value'\}$", "^oops$", + "\+ \{'key': 'value'\}$", + "\+ \{'key': 'value'\} : oops$"]) + + def testAssertDictContainsSubset(self): + self.assertMessages('assertDictContainsSubset', ({'key': 
'value'}, {}), + ["^Missing: 'key'$", "^oops$", + "^Missing: 'key'$", + "^Missing: 'key' : oops$"]) + + def testAssertSameElements(self): + self.assertMessages('assertSameElements', ([], [None]), + [r"\[None\]$", "^oops$", + r"\[None\]$", + r"\[None\] : oops$"]) + + def testAssertMultiLineEqual(self): + self.assertMessages('assertMultiLineEqual', ("", "foo"), + [r"\+ foo$", "^oops$", + r"\+ foo$", + r"\+ foo : oops$"]) + + def testAssertLess(self): + self.assertMessages('assertLess', (2, 1), + ["^2 not less than 1$", "^oops$", + "^2 not less than 1$", "^2 not less than 1 : oops$"]) + + def testAssertLessEqual(self): + self.assertMessages('assertLessEqual', (2, 1), + ["^2 not less than or equal to 1$", "^oops$", + "^2 not less than or equal to 1$", + "^2 not less than or equal to 1 : oops$"]) + + def testAssertGreater(self): + self.assertMessages('assertGreater', (1, 2), + ["^1 not greater than 2$", "^oops$", + "^1 not greater than 2$", + "^1 not greater than 2 : oops$"]) + + def testAssertGreaterEqual(self): + self.assertMessages('assertGreaterEqual', (1, 2), + ["^1 not greater than or equal to 2$", "^oops$", + "^1 not greater than or equal to 2$", + "^1 not greater than or equal to 2 : oops$"]) + + def testAssertIsNone(self): + self.assertMessages('assertIsNone', ('not None',), + ["^'not None' is not None$", "^oops$", + "^'not None' is not None$", + "^'not None' is not None : oops$"]) + + def testAssertIsNotNone(self): + self.assertMessages('assertIsNotNone', (None,), + ["^unexpectedly None$", "^oops$", + "^unexpectedly None$", + "^unexpectedly None : oops$"]) + + def testAssertIs(self): + self.assertMessages('assertIs', (None, 'foo'), + ["^None is not 'foo'$", "^oops$", + "^None is not 'foo'$", + "^None is not 'foo' : oops$"]) + + def testAssertIsNot(self): + self.assertMessages('assertIsNot', (None, None), + ["^unexpectedly identical: None$", "^oops$", + "^unexpectedly identical: None$", + "^unexpectedly identical: None : oops$"]) + + +class 
TestCleanUp(TestCase): + + def testCleanUp(self): + class TestableTest(TestCase): + def testNothing(self): + pass + + test = TestableTest('testNothing') + self.assertEqual(test._cleanups, []) + + cleanups = [] + + def cleanup1(*args, **kwargs): + cleanups.append((1, args, kwargs)) + + def cleanup2(*args, **kwargs): + cleanups.append((2, args, kwargs)) + + test.addCleanup(cleanup1, 1, 2, 3, four='hello', five='goodbye') + test.addCleanup(cleanup2) + + self.assertEqual(test._cleanups, + [(cleanup1, (1, 2, 3), dict(four='hello', five='goodbye')), + (cleanup2, (), {})]) + + result = test.doCleanups() + self.assertTrue(result) + + self.assertEqual(cleanups, [(2, (), {}), (1, (1, 2, 3), dict(four='hello', five='goodbye'))]) + + def testCleanUpWithErrors(self): + class TestableTest(TestCase): + def testNothing(self): + pass + + class MockResult(object): + errors = [] + def addError(self, test, exc_info): + self.errors.append((test, exc_info)) + + result = MockResult() + test = TestableTest('testNothing') + test._resultForDoCleanups = result + + exc1 = Exception('foo') + exc2 = Exception('bar') + def cleanup1(): + raise exc1 + + def cleanup2(): + raise exc2 + + test.addCleanup(cleanup1) + test.addCleanup(cleanup2) + + self.assertFalse(test.doCleanups()) + + (test1, (Type1, instance1, _)), (test2, (Type2, instance2, _)) = reversed(MockResult.errors) + self.assertEqual((test1, Type1, instance1), (test, Exception, exc1)) + self.assertEqual((test2, Type2, instance2), (test, Exception, exc2)) + + def testCleanupInRun(self): + blowUp = False + ordering = [] + + class TestableTest(TestCase): + def setUp(self): + ordering.append('setUp') + if blowUp: + raise Exception('foo') + + def testNothing(self): + ordering.append('test') + + def tearDown(self): + ordering.append('tearDown') + + test = TestableTest('testNothing') + + def cleanup1(): + ordering.append('cleanup1') + def cleanup2(): + ordering.append('cleanup2') + test.addCleanup(cleanup1) + test.addCleanup(cleanup2) + + def 
success(some_test): + self.assertEqual(some_test, test) + ordering.append('success') + + result = unittest.TestResult() + result.addSuccess = success + + test.run(result) + self.assertEqual(ordering, ['setUp', 'test', 'tearDown', + 'cleanup2', 'cleanup1', 'success']) + + blowUp = True + ordering = [] + test = TestableTest('testNothing') + test.addCleanup(cleanup1) + test.run(result) + self.assertEqual(ordering, ['setUp', 'cleanup1']) + + +class Test_TestProgram(TestCase): + + # Horrible white box test + def testNoExit(self): + result = object() + test = object() + + class FakeRunner(object): + def run(self, test): + self.test = test + return result + + runner = FakeRunner() + + oldParseArgs = TestProgram.parseArgs + def restoreParseArgs(): + TestProgram.parseArgs = oldParseArgs + TestProgram.parseArgs = lambda *args: None + self.addCleanup(restoreParseArgs) + + def removeTest(): + del TestProgram.test + TestProgram.test = test + self.addCleanup(removeTest) + + program = TestProgram(testRunner=runner, exit=False, verbosity=2) + + self.assertEqual(program.result, result) + self.assertEqual(runner.test, test) + self.assertEqual(program.verbosity, 2) + + class FooBar(unittest.TestCase): + def testPass(self): + assert True + def testFail(self): + assert False + + class FooBarLoader(unittest.TestLoader): + """Test loader that returns a suite containing FooBar.""" + def loadTestsFromModule(self, module): + return self.suiteClass( + [self.loadTestsFromTestCase(Test_TestProgram.FooBar)]) + + + def test_NonExit(self): + program = unittest.main(exit=False, + argv=["foobar"], + testRunner=unittest.TextTestRunner(stream=StringIO()), + testLoader=self.FooBarLoader()) + self.assertTrue(hasattr(program, 'result')) + + + def test_Exit(self): + self.assertRaises( + SystemExit, + unittest.main, + argv=["foobar"], + testRunner=unittest.TextTestRunner(stream=StringIO()), + exit=True, + testLoader=self.FooBarLoader()) + + + def test_ExitAsDefault(self): + self.assertRaises( + 
SystemExit, + unittest.main, + argv=["foobar"], + testRunner=unittest.TextTestRunner(stream=StringIO()), + testLoader=self.FooBarLoader()) + + +class Test_TextTestRunner(TestCase): + """Tests for TextTestRunner.""" + + def test_works_with_result_without_startTestRun_stopTestRun(self): + class OldTextResult(ResultWithNoStartTestRunStopTestRun): + separator2 = '' + def printErrors(self): + pass + + class Runner(unittest.TextTestRunner): + def __init__(self): + super(Runner, self).__init__(StringIO()) + + def _makeResult(self): + return OldTextResult() + + runner = Runner() + runner.run(unittest.TestSuite()) + + def test_startTestRun_stopTestRun_called(self): + class LoggingTextResult(LoggingResult): + separator2 = '' + def printErrors(self): + pass + + class LoggingRunner(unittest.TextTestRunner): + def __init__(self, events): + super(LoggingRunner, self).__init__(StringIO()) + self._events = events + + def _makeResult(self): + return LoggingTextResult(self._events) + + events = [] + runner = LoggingRunner(events) + runner.run(unittest.TestSuite()) + expected = ['startTestRun', 'stopTestRun'] + self.assertEqual(events, expected) + + def test_pickle_unpickle(self): + # Issue #7197: a TextTestRunner should be (un)pickleable. This is + # required by test_multiprocessing under Windows (in verbose mode). + import StringIO + # cStringIO objects are not pickleable, but StringIO objects are. + stream = StringIO.StringIO("foo") + runner = unittest.TextTestRunner(stream) + for protocol in range(pickle.HIGHEST_PROTOCOL + 1): + s = pickle.dumps(runner, protocol=protocol) + obj = pickle.loads(s) + # StringIO objects never compare equal, a cheap test instead. 
+ self.assertEqual(obj.stream.getvalue(), stream.getvalue()) + + +class TestDiscovery(TestCase): + + # Heavily mocked tests so I can avoid hitting the filesystem + def test_get_name_from_path(self): + loader = unittest.TestLoader() + + loader._top_level_dir = '/foo' + name = loader._get_name_from_path('/foo/bar/baz.py') + self.assertEqual(name, 'bar.baz') + + if not __debug__: + # asserts are off + return + + self.assertRaises(AssertionError, + loader._get_name_from_path, '/bar/baz.py') + + def test_find_tests(self): + loader = unittest.TestLoader() + + original_listdir = os.listdir + def restore_listdir(): + os.listdir = original_listdir + original_isfile = os.path.isfile + def restore_isfile(): + os.path.isfile = original_isfile + original_isdir = os.path.isdir + def restore_isdir(): + os.path.isdir = original_isdir + + path_lists = [['test1.py', 'test2.py', 'not_a_test.py', 'test_dir', + 'test.foo', 'test-not-a-module.py', 'another_dir'], + ['test3.py', 'test4.py', ]] + os.listdir = lambda path: path_lists.pop(0) + self.addCleanup(restore_listdir) + + def isdir(path): + return path.endswith('dir') + os.path.isdir = isdir + self.addCleanup(restore_isdir) + + def isfile(path): + # another_dir is not a package and so shouldn't be recursed into + return not path.endswith('dir') and not 'another_dir' in path + os.path.isfile = isfile + self.addCleanup(restore_isfile) + + loader._get_module_from_name = lambda path: path + ' module' + loader.loadTestsFromModule = lambda module: module + ' tests' + + loader._top_level_dir = '/foo' + suite = list(loader._find_tests('/foo', 'test*.py')) + + expected = [name + ' module tests' for name in + ('test1', 'test2')] + expected.extend([('test_dir.%s' % name) + ' module tests' for name in + ('test3', 'test4')]) + self.assertEqual(suite, expected) + + def test_find_tests_with_package(self): + loader = unittest.TestLoader() + + original_listdir = os.listdir + def restore_listdir(): + os.listdir = original_listdir + original_isfile = 
os.path.isfile + def restore_isfile(): + os.path.isfile = original_isfile + original_isdir = os.path.isdir + def restore_isdir(): + os.path.isdir = original_isdir + + directories = ['a_directory', 'test_directory', 'test_directory2'] + path_lists = [directories, [], [], []] + os.listdir = lambda path: path_lists.pop(0) + self.addCleanup(restore_listdir) + + os.path.isdir = lambda path: True + self.addCleanup(restore_isdir) + + os.path.isfile = lambda path: os.path.basename(path) not in directories + self.addCleanup(restore_isfile) + + class Module(object): + paths = [] + load_tests_args = [] + + def __init__(self, path): + self.path = path + self.paths.append(path) + if os.path.basename(path) == 'test_directory': + def load_tests(loader, tests, pattern): + self.load_tests_args.append((loader, tests, pattern)) + return 'load_tests' + self.load_tests = load_tests + + def __eq__(self, other): + return self.path == other.path + + loader._get_module_from_name = lambda name: Module(name) + def loadTestsFromModule(module, use_load_tests): + if use_load_tests: + raise self.failureException('use_load_tests should be False for packages') + return module.path + ' module tests' + loader.loadTestsFromModule = loadTestsFromModule + + loader._top_level_dir = '/foo' + # this time no '.py' on the pattern so that it can match + # a test package + suite = list(loader._find_tests('/foo', 'test*')) + + # We should have loaded tests from the test_directory package by calling load_tests + # and directly from the test_directory2 package + self.assertEqual(suite, + ['load_tests', 'test_directory2' + ' module tests']) + self.assertEqual(Module.paths, ['test_directory', 'test_directory2']) + + # load_tests should have been called once with loader, tests and pattern + self.assertEqual(Module.load_tests_args, + [(loader, 'test_directory' + ' module tests', 'test*')]) + + def test_discover(self): + loader = unittest.TestLoader() + + original_isfile = os.path.isfile + def restore_isfile(): + 
os.path.isfile = original_isfile + + os.path.isfile = lambda path: False + self.addCleanup(restore_isfile) + + orig_sys_path = sys.path[:] + def restore_path(): + sys.path[:] = orig_sys_path + self.addCleanup(restore_path) + + full_path = os.path.abspath(os.path.normpath('/foo')) + self.assertRaises(ImportError, + loader.discover, '/foo/bar', top_level_dir='/foo') + + self.assertEqual(loader._top_level_dir, full_path) + self.assertIn(full_path, sys.path) + + os.path.isfile = lambda path: True + _find_tests_args = [] + def test(): + pass + tests = [test] + def _find_tests(start_dir, pattern): + _find_tests_args.append((start_dir, pattern)) + return [tests] + loader._find_tests = _find_tests + + suite = loader.discover('/foo/bar/baz', 'pattern', '/foo/bar') + + top_level_dir = os.path.abspath(os.path.normpath('/foo/bar')) + start_dir = os.path.abspath(os.path.normpath('/foo/bar/baz')) + self.assertEqual(list(suite), tests) + self.assertEqual(loader._top_level_dir, top_level_dir) + self.assertEqual(_find_tests_args, [(start_dir, 'pattern')]) + self.assertIn(top_level_dir, sys.path) + + def test_discover_with_modules_that_fail_to_import(self): + loader = unittest.TestLoader() + + listdir = os.listdir + os.listdir = lambda _: ['test_this_does_not_exist.py'] + isfile = os.path.isfile + os.path.isfile = lambda _: True + orig_sys_path = sys.path[:] + def restore(): + os.path.isfile = isfile + os.listdir = listdir + sys.path[:] = orig_sys_path + self.addCleanup(restore) + + suite = loader.discover('.') + self.assertIn(os.getcwd(), sys.path) + self.assertEqual(suite.countTestCases(), 1) + test = list(suite)[0] # extract test from suite + + self.assertRaises(ImportError, test.test_this_does_not_exist) + + def test_command_line_handling_parseArgs(self): + # Haha - take that uninstantiable class + program = object.__new__(TestProgram) + + args = [] + def do_discovery(argv): + args.extend(argv) + program._do_discovery = do_discovery + program.parseArgs(['something', 'discover']) 
+ self.assertEqual(args, []) + + program.parseArgs(['something', 'discover', 'foo', 'bar']) + self.assertEqual(args, ['foo', 'bar']) + + def test_command_line_handling_do_discovery_too_many_arguments(self): + class Stop(Exception): + pass + def usageExit(): + raise Stop + + program = object.__new__(TestProgram) + program.usageExit = usageExit + + # too many args + self.assertRaises( + Stop, + lambda: program._do_discovery(['one', 'two', 'three', 'four'])) + + + def test_command_line_handling_do_discovery_calls_loader(self): + program = object.__new__(TestProgram) + + class Loader(object): + args = [] + def discover(self, start_dir, pattern, top_level_dir): + self.args.append((start_dir, pattern, top_level_dir)) + return 'tests' + + program._do_discovery(['-v'], Loader=Loader) + self.assertEqual(program.verbosity, 2) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('.', 'test*.py', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['--verbose'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('.', 'test*.py', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery([], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('.', 'test*.py', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['fish'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('fish', 'test*.py', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['fish', 'eggs'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('fish', 'eggs', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['fish', 'eggs', 'ham'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('fish', 'eggs', 
'ham')]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['-s', 'fish'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('fish', 'test*.py', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['-t', 'fish'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('.', 'test*.py', 'fish')]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['-p', 'fish'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('.', 'fish', None)]) + + Loader.args = [] + program = object.__new__(TestProgram) + program._do_discovery(['-p', 'eggs', '-s', 'fish', '-v'], Loader=Loader) + self.assertEqual(program.test, 'tests') + self.assertEqual(Loader.args, [('fish', 'eggs', None)]) + self.assertEqual(program.verbosity, 2) + + +###################################################################### +## Main +###################################################################### + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2.py new file mode 100644 index 0000000000000000000000000000000000000000..abe2624e99fc6de001656347a1e05ccf37412038 --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2.py @@ -0,0 +1,1680 @@ +"""Tests for urllib2-level functionality. + +This is urllib2's tests (most of which came from mechanize originally), plus +some extra tests added, and modifications from bug fixes and feature additions +to mechanize. 
+""" + +# TODO: +# Request +# CacheFTPHandler (hard to write) +# parse_keqv_list, parse_http_list + +import StringIO +import httplib +import os +import sys +import unittest + +import mechanize + +from mechanize._http import parse_head +from mechanize._response import test_response +from mechanize import HTTPRedirectHandler, \ + HTTPEquivProcessor, HTTPRefreshProcessor, \ + HTTPCookieProcessor, HTTPRefererProcessor, \ + HTTPErrorProcessor, HTTPHandler +from mechanize import OpenerDirector, build_opener, Request +from mechanize._urllib2_fork import AbstractHTTPHandler +from mechanize._util import write_file + +import mechanize._response +import mechanize._sockettimeout as _sockettimeout +import mechanize._testcase +import mechanize._urllib2_fork + + +## from logging import getLogger, DEBUG +## l = getLogger("mechanize") +## l.setLevel(DEBUG) + + +class AlwaysEqual: + def __cmp__(self, other): + return 0 + + +class TrivialTests(mechanize._testcase.TestCase): + + def test_trivial(self): + # A couple trivial tests + + self.assertRaises(ValueError, mechanize.urlopen, 'bogus url') + + fname = os.path.join(self.make_temp_dir(), "test.txt") + write_file(fname, "data") + if fname[1:2] == ":": + fname = fname[2:] + # And more hacking to get it to work on MacOS. This assumes + # urllib.pathname2url works, unfortunately... 
+ if os.name == 'mac': + fname = '/' + fname.replace(':', '/') + elif os.name == 'riscos': + import string + fname = os.expand(fname) + fname = fname.translate(string.maketrans("/.", "./")) + + file_url = "file://%s" % fname + f = mechanize.urlopen(file_url) + + buf = f.read() + f.close() + + def test_parse_http_list(self): + tests = [('a,b,c', ['a', 'b', 'c']), + ('path"o,l"og"i"cal, example', ['path"o,l"og"i"cal', 'example']), + ('a, b, "c", "d", "e,f", g, h', ['a', 'b', '"c"', '"d"', '"e,f"', 'g', 'h']), + ('a="b\\"c", d="e\\,f", g="h\\\\i"', ['a="b"c"', 'd="e,f"', 'g="h\\i"'])] + for string, list in tests: + self.assertEquals(mechanize._urllib2_fork.parse_http_list(string), + list) + + +def test_request_headers_dict(): + """ + The Request.headers dictionary is not a documented interface. It should + stay that way, because the complete set of headers are only accessible + through the .get_header(), .has_header(), .header_items() interface. + However, .headers pre-dates those methods, and so real code will be using + the dictionary. + + The introduction in 2.4 of those methods was a mistake for the same reason: + code that previously saw all (urllib2 user)-provided headers in .headers + now sees only a subset (and the function interface is ugly and incomplete). + A better change would have been to replace .headers dict with a dict + subclass (or UserDict.DictMixin instance?) that preserved the .headers + interface and also provided access to the "unredirected" headers. It's + probably too late to fix that, though. + + + Check .capitalize() case normalization: + + >>> url = "http://example.com" + >>> Request(url, headers={"Spam-eggs": "blah"}).headers["Spam-eggs"] + 'blah' + >>> Request(url, headers={"spam-EggS": "blah"}).headers["Spam-eggs"] + 'blah' + + Currently, Request(url, "Spam-eggs").headers["Spam-Eggs"] raises KeyError, + but that could be changed in future. 
+ + """ + +def test_request_headers_methods(): + """ + Note the case normalization of header names here, to .capitalize()-case. + This should be preserved for backwards-compatibility. (In the HTTP case, + normalization to .title()-case is done by urllib2 before sending headers to + httplib). + + >>> url = "http://example.com" + >>> r = Request(url, headers={"Spam-eggs": "blah"}) + >>> r.has_header("Spam-eggs") + True + >>> r.header_items() + [('Spam-eggs', 'blah')] + >>> r.add_header("Foo-Bar", "baz") + >>> items = r.header_items() + >>> items.sort() + >>> items + [('Foo-bar', 'baz'), ('Spam-eggs', 'blah')] + + Note that e.g. r.has_header("spam-EggS") is currently False, and + r.get_header("spam-EggS") returns None, but that could be changed in + future. + + >>> r.has_header("Not-there") + False + >>> print r.get_header("Not-there") + None + >>> r.get_header("Not-there", "default") + 'default' + + """ + + +def test_password_manager(self): + """ + >>> mgr = mechanize.HTTPPasswordMgr() + >>> add = mgr.add_password + >>> add("Some Realm", "http://example.com/", "joe", "password") + >>> add("Some Realm", "http://example.com/ni", "ni", "ni") + >>> add("c", "http://example.com/foo", "foo", "ni") + >>> add("c", "http://example.com/bar", "bar", "nini") + >>> add("b", "http://example.com/", "first", "blah") + >>> add("b", "http://example.com/", "second", "spam") + >>> add("a", "http://example.com", "1", "a") + >>> add("Some Realm", "http://c.example.com:3128", "3", "c") + >>> add("Some Realm", "d.example.com", "4", "d") + >>> add("Some Realm", "e.example.com:3128", "5", "e") + + >>> mgr.find_user_password("Some Realm", "example.com") + ('joe', 'password') + >>> mgr.find_user_password("Some Realm", "http://example.com") + ('joe', 'password') + >>> mgr.find_user_password("Some Realm", "http://example.com/") + ('joe', 'password') + >>> mgr.find_user_password("Some Realm", "http://example.com/spam") + ('joe', 'password') + >>> mgr.find_user_password("Some Realm", 
"http://example.com/spam/spam") + ('joe', 'password') + >>> mgr.find_user_password("c", "http://example.com/foo") + ('foo', 'ni') + >>> mgr.find_user_password("c", "http://example.com/bar") + ('bar', 'nini') + + Actually, this is really undefined ATM +## Currently, we use the highest-level path where more than one match: + +## >>> mgr.find_user_password("Some Realm", "http://example.com/ni") +## ('joe', 'password') + + Use latest add_password() in case of conflict: + + >>> mgr.find_user_password("b", "http://example.com/") + ('second', 'spam') + + No special relationship between a.example.com and example.com: + + >>> mgr.find_user_password("a", "http://example.com/") + ('1', 'a') + >>> mgr.find_user_password("a", "http://a.example.com/") + (None, None) + + Ports: + + >>> mgr.find_user_password("Some Realm", "c.example.com") + (None, None) + >>> mgr.find_user_password("Some Realm", "c.example.com:3128") + ('3', 'c') + >>> mgr.find_user_password("Some Realm", "http://c.example.com:3128") + ('3', 'c') + >>> mgr.find_user_password("Some Realm", "d.example.com") + ('4', 'd') + >>> mgr.find_user_password("Some Realm", "e.example.com:3128") + ('5', 'e') + + """ + pass + + +def test_password_manager_default_port(self): + """ + >>> mgr = mechanize.HTTPPasswordMgr() + >>> add = mgr.add_password + + The point to note here is that we can't guess the default port if there's + no scheme. This applies to both add_password and find_user_password. 
+ + >>> add("f", "http://g.example.com:80", "10", "j") + >>> add("g", "http://h.example.com", "11", "k") + >>> add("h", "i.example.com:80", "12", "l") + >>> add("i", "j.example.com", "13", "m") + >>> mgr.find_user_password("f", "g.example.com:100") + (None, None) + >>> mgr.find_user_password("f", "g.example.com:80") + ('10', 'j') + >>> mgr.find_user_password("f", "g.example.com") + (None, None) + >>> mgr.find_user_password("f", "http://g.example.com:100") + (None, None) + >>> mgr.find_user_password("f", "http://g.example.com:80") + ('10', 'j') + >>> mgr.find_user_password("f", "http://g.example.com") + ('10', 'j') + >>> mgr.find_user_password("g", "h.example.com") + ('11', 'k') + >>> mgr.find_user_password("g", "h.example.com:80") + ('11', 'k') + >>> mgr.find_user_password("g", "http://h.example.com:80") + ('11', 'k') + >>> mgr.find_user_password("h", "i.example.com") + (None, None) + >>> mgr.find_user_password("h", "i.example.com:80") + ('12', 'l') + >>> mgr.find_user_password("h", "http://i.example.com:80") + ('12', 'l') + >>> mgr.find_user_password("i", "j.example.com") + ('13', 'm') + >>> mgr.find_user_password("i", "j.example.com:80") + (None, None) + >>> mgr.find_user_password("i", "http://j.example.com") + ('13', 'm') + >>> mgr.find_user_password("i", "http://j.example.com:80") + (None, None) + + """ + +class MockOpener: + addheaders = [] + def open(self, req, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + self.req, self.data, self.timeout = req, data, timeout + def error(self, proto, *args): + self.proto, self.args = proto, args + +class MockFile: + def read(self, count=None): pass + def readline(self, count=None): pass + def close(self): pass + +def http_message(mapping): + """ + >>> http_message({"Content-Type": "text/html"}).items() + [('content-type', 'text/html')] + + """ + f = [] + for kv in mapping.items(): + f.append("%s: %s" % kv) + f.append("") + msg = httplib.HTTPMessage(StringIO.StringIO("\r\n".join(f))) + return msg + +class 
MockResponse(StringIO.StringIO): + def __init__(self, code, msg, headers, data, url=None): + StringIO.StringIO.__init__(self, data) + self.code, self.msg, self.headers, self.url = code, msg, headers, url + def info(self): + return self.headers + def geturl(self): + return self.url + +class MockCookieJar: + def add_cookie_header(self, request, unverifiable=False): + self.ach_req, self.ach_u = request, unverifiable + def extract_cookies(self, response, request, unverifiable=False): + self.ec_req, self.ec_r, self.ec_u = request, response, unverifiable + +class FakeMethod: + def __init__(self, meth_name, action, handle): + self.meth_name = meth_name + self.handle = handle + self.action = action + def __call__(self, *args): + return self.handle(self.meth_name, self.action, *args) + +class MockHandler: + # useful for testing handler machinery + # see add_ordered_mock_handlers() docstring + handler_order = 500 + def __init__(self, methods): + self._define_methods(methods) + def _define_methods(self, methods): + for spec in methods: + if len(spec) == 2: name, action = spec + else: name, action = spec, None + meth = FakeMethod(name, action, self.handle) + setattr(self.__class__, name, meth) + def handle(self, fn_name, action, *args, **kwds): + self.parent.calls.append((self, fn_name, args, kwds)) + if action is None: + return None + elif action == "return self": + return self + elif action == "return response": + res = MockResponse(200, "OK", {}, "") + return res + elif action == "return request": + return Request("http://blah/") + elif action.startswith("error"): + code = action[action.rfind(" ")+1:] + try: + code = int(code) + except ValueError: + pass + res = MockResponse(200, "OK", {}, "") + return self.parent.error("http", args[0], res, code, "", {}) + elif action == "raise": + raise mechanize.URLError("blah") + assert False + def close(self): pass + def add_parent(self, parent): + self.parent = parent + self.parent.calls = [] + def __lt__(self, other): + if not 
hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). + return True + return self.handler_order < other.handler_order + +def add_ordered_mock_handlers(opener, meth_spec): + """Create MockHandlers and add them to an OpenerDirector. + + meth_spec: list of lists of tuples and strings defining methods to define + on handlers. eg: + + [["http_error", "ftp_open"], ["http_open"]] + + defines methods .http_error() and .ftp_open() on one handler, and + .http_open() on another. These methods just record their arguments and + return None. Using a tuple instead of a string causes the method to + perform some action (see MockHandler.handle()), eg: + + [["http_error"], [("http_open", "return request")]] + + defines .http_error() on one handler (which simply returns None), and + .http_open() on another handler, which returns a Request object. + + """ + handlers = [] + count = 0 + for meths in meth_spec: + class MockHandlerSubclass(MockHandler): pass + h = MockHandlerSubclass(meths) + h.handler_order += count + h.add_parent(opener) + count = count + 1 + handlers.append(h) + opener.add_handler(h) + return handlers + +def build_test_opener(*handler_instances): + opener = OpenerDirector() + for h in handler_instances: + opener.add_handler(h) + return opener + +class MockHTTPHandler(mechanize.BaseHandler): + # useful for testing redirections and auth + # sends supplied headers and code as first response + # sends 200 OK as second response + def __init__(self, code, headers): + self.code = code + self.headers = headers + self.reset() + def reset(self): + self._count = 0 + self.requests = [] + def http_open(self, req): + import mimetools, copy + from StringIO import StringIO + self.requests.append(copy.deepcopy(req)) + if self._count == 0: + self._count = self._count + 1 + name = "Not important" + msg = 
mimetools.Message(StringIO(self.headers)) + return self.parent.error( + "http", req, test_response(), self.code, name, msg) + else: + self.req = req + return test_response("", [], req.get_full_url()) + +class MockPasswordManager: + def add_password(self, realm, uri, user, password): + self.realm = realm + self.url = uri + self.user = user + self.password = password + def find_user_password(self, realm, authuri): + self.target_realm = realm + self.target_url = authuri + return self.user, self.password + + +class OpenerDirectorTests(unittest.TestCase): + + def test_add_non_handler(self): + class NonHandler(object): + pass + self.assertRaises(TypeError, + OpenerDirector().add_handler, NonHandler()) + + def test_badly_named_methods(self): + # test work-around for three methods that accidentally follow the + # naming conventions for handler methods + # (*_open() / *_request() / *_response()) + + # These used to call the accidentally-named methods, causing a + # TypeError in real code; here, returning self from these mock + # methods would either cause no exception, or AttributeError. + + from mechanize import URLError + + o = OpenerDirector() + meth_spec = [ + [("do_open", "return self"), ("proxy_open", "return self")], + [("redirect_request", "return self")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + o.add_handler(mechanize.UnknownHandler()) + for scheme in "do", "proxy", "redirect": + self.assertRaises(URLError, o.open, scheme+"://example.com/") + + def test_handled(self): + # handler returning non-None means no more handlers will be called + o = OpenerDirector() + meth_spec = [ + ["http_open", "ftp_open", "http_error_302"], + ["ftp_open"], + [("http_open", "return self")], + [("http_open", "return self")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + r = o.open(req) + # Second .http_open() gets called, third doesn't, since second returned + # non-None. 
Handlers without .http_open() never get any methods called + # on them. + # In fact, second mock handler defining .http_open() returns self + # (instead of response), which becomes the OpenerDirector's return + # value. + self.assertEqual(r, handlers[2]) + calls = [(handlers[0], "http_open"), (handlers[2], "http_open")] + for expected, got in zip(calls, o.calls): + handler, name, args, kwds = got + self.assertEqual((handler, name), expected) + self.assertEqual(args, (req,)) + + + def test_reindex_handlers(self): + o = OpenerDirector() + class MockHandler: + def add_parent(self, parent): pass + def close(self):pass + def __lt__(self, other): + return self.handler_order < other.handler_order + # this first class is here as an obscure regression test for bug + # encountered during development: if something manages to get through + # to _maybe_reindex_handlers, make sure it's properly removed and + # doesn't affect adding of subsequent handlers + class NonHandler(MockHandler): + handler_order = 1 + class Handler(MockHandler): + handler_order = 2 + def http_open(self): pass + class Processor(MockHandler): + handler_order = 3 + def any_response(self): pass + def http_response(self): pass + o.add_handler(NonHandler()) + h = Handler() + o.add_handler(h) + p = Processor() + o.add_handler(p) + o._maybe_reindex_handlers() + self.assertEqual(o.handle_open, {"http": [h]}) + self.assertEqual(len(o.process_response.keys()), 1) + self.assertEqual(list(o.process_response["http"]), [p]) + self.assertEqual(list(o._any_response), [p]) + self.assertEqual(o.handlers, [h, p]) + + def test_handler_order(self): + o = OpenerDirector() + handlers = [] + for meths, handler_order in [ + ([("http_open", "return self")], 500), + (["http_open"], 0), + ]: + class MockHandlerSubclass(MockHandler): pass + h = MockHandlerSubclass(meths) + h.handler_order = handler_order + handlers.append(h) + o.add_handler(h) + + r = o.open("http://example.com/") + # handlers called in reverse order, thanks to their 
sort order + self.assertEqual(o.calls[0][0], handlers[1]) + self.assertEqual(o.calls[1][0], handlers[0]) + + def test_raise(self): + # raising URLError stops processing of request + o = OpenerDirector() + meth_spec = [ + [("http_open", "raise")], + [("http_open", "return self")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + self.assertRaises(mechanize.URLError, o.open, req) + self.assertEqual(o.calls, [(handlers[0], "http_open", (req,), {})]) + +## def test_error(self): +## # XXX this doesn't actually seem to be used in standard library, +## # but should really be tested anyway... + + def test_http_error(self): + # XXX http_error_default + # http errors are a special case + o = OpenerDirector() + meth_spec = [ + [("http_open", "error 302")], + [("http_error_400", "raise"), "http_open"], + [("http_error_302", "return response"), "http_error_303", + "http_error"], + [("http_error_302")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + r = o.open(req) + assert len(o.calls) == 2 + calls = [(handlers[0], "http_open", (req,)), + (handlers[2], "http_error_302", + (req, AlwaysEqual(), 302, "", {}))] + for expected, got in zip(calls, o.calls): + handler, method_name, args = expected + self.assertEqual((handler, method_name), got[:2]) + self.assertEqual(args, got[2]) + + def test_http_error_raised(self): + # should get an HTTPError if an HTTP handler raises a non-200 response + # XXX it worries me that this is the only test that excercises the else + # branch in HTTPDefaultErrorHandler + from mechanize import _response + o = mechanize.OpenerDirector() + o.add_handler(mechanize.HTTPErrorProcessor()) + o.add_handler(mechanize.HTTPDefaultErrorHandler()) + class HTTPHandler(AbstractHTTPHandler): + def http_open(self, req): + return _response.test_response(code=302) + o.add_handler(HTTPHandler()) + self.assertRaises(mechanize.HTTPError, o.open, "http://example.com/") + + def 
test_processors(self): + # *_request / *_response methods get called appropriately + o = OpenerDirector() + meth_spec = [ + [("http_request", "return request"), + ("http_response", "return response")], + [("http_request", "return request"), + ("http_response", "return response")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + r = o.open(req) + # processor methods are called on *all* handlers that define them, + # not just the first handler that handles the request + calls = [ + (handlers[0], "http_request"), (handlers[1], "http_request"), + (handlers[0], "http_response"), (handlers[1], "http_response")] + + self.assertEqual(len(o.calls), len(calls)) + for i, (handler, name, args, kwds) in enumerate(o.calls): + if i < 2: + # *_request + self.assertEqual((handler, name), calls[i]) + self.assertEqual(len(args), 1) + self.assertTrue(isinstance(args[0], Request)) + else: + # *_response + self.assertEqual((handler, name), calls[i]) + self.assertEqual(len(args), 2) + self.assertTrue(isinstance(args[0], Request)) + # response from opener.open is None, because there's no + # handler that defines http_open to handle it + self.assertTrue(args[1] is None or + isinstance(args[1], MockResponse)) + + def test_any(self): + # XXXXX two handlers case: ordering + o = OpenerDirector() + meth_spec = [[ + ("http_request", "return request"), + ("http_response", "return response"), + ("ftp_request", "return request"), + ("ftp_response", "return response"), + ("any_request", "return request"), + ("any_response", "return response"), + ]] + handlers = add_ordered_mock_handlers(o, meth_spec) + handler = handlers[0] + + for scheme in ["http", "ftp"]: + o.calls = [] + req = Request("%s://example.com/" % scheme) + r = o.open(req) + + calls = [(handler, "any_request"), + (handler, ("%s_request" % scheme)), + (handler, "any_response"), + (handler, ("%s_response" % scheme)), + ] + self.assertEqual(len(o.calls), len(calls)) + for i, ((handler, 
name, args, kwds), calls) in ( + enumerate(zip(o.calls, calls))): + if i < 2: + # *_request + self.assert_((handler, name) == calls) + self.assert_(len(args) == 1) + self.assert_(isinstance(args[0], Request)) + else: + # *_response + self.assert_((handler, name) == calls) + self.assert_(len(args) == 2) + self.assert_(isinstance(args[0], Request)) + # response from opener.open is None, because there's no + # handler that defines http_open to handle it + self.assert_(args[1] is None or + isinstance(args[1], MockResponse)) + + +def sanepathname2url(path): + import urllib + urlpath = urllib.pathname2url(path) + if os.name == "nt" and urlpath.startswith("///"): + urlpath = urlpath[2:] + # XXX don't ask me about the mac... + return urlpath + + +class MockRobotFileParserClass: + def __init__(self): + self.calls = [] + self._can_fetch = True + def clear(self): + self.calls = [] + def __call__(self): + self.calls.append("__call__") + return self + def set_url(self, url): + self.calls.append(("set_url", url)) + def set_timeout(self, timeout): + self.calls.append(("set_timeout", timeout)) + def set_opener(self, opener): + self.calls.append(("set_opener", opener)) + def read(self): + self.calls.append("read") + def can_fetch(self, ua, url): + self.calls.append(("can_fetch", ua, url)) + return self._can_fetch + +class MockPasswordManager: + def add_password(self, realm, uri, user, password): + self.realm = realm + self.url = uri + self.user = user + self.password = password + def find_user_password(self, realm, authuri): + self.target_realm = realm + self.target_url = authuri + return self.user, self.password + +class HandlerTests(mechanize._testcase.TestCase): + + def test_ftp(self): + class MockFTPWrapper: + def __init__(self, data): self.data = data + def retrfile(self, filename, filetype): + self.filename, self.filetype = filename, filetype + return StringIO.StringIO(self.data), len(self.data) + + class NullFTPHandler(mechanize.FTPHandler): + def __init__(self, data): 
self.data = data + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + self.user, self.passwd = user, passwd + self.host, self.port = host, port + self.dirs = dirs + self.timeout = timeout + self.ftpwrapper = MockFTPWrapper(self.data) + return self.ftpwrapper + + import ftplib, socket + data = "rheum rhaponicum" + h = NullFTPHandler(data) + o = h.parent = MockOpener() + + for url, host, port, type_, dirs, timeout, filename, mimetype in [ + ("ftp://localhost/foo/bar/baz.html", + "localhost", ftplib.FTP_PORT, "I", + ["foo", "bar"], _sockettimeout._GLOBAL_DEFAULT_TIMEOUT, + "baz.html", "text/html"), + ("ftp://localhost:80/foo/bar/", + "localhost", 80, "D", + ["foo", "bar"], _sockettimeout._GLOBAL_DEFAULT_TIMEOUT, + "", None), + ("ftp://localhost/baz.gif;type=a", + "localhost", ftplib.FTP_PORT, "A", + [], _sockettimeout._GLOBAL_DEFAULT_TIMEOUT, + "baz.gif", None), # TODO: really this should guess image/gif + ]: + req = Request(url, timeout=timeout) + r = h.ftp_open(req) + # ftp authentication not yet implemented by FTPHandler + self.assertTrue(h.user == h.passwd == "") + self.assertEqual(h.host, socket.gethostbyname(host)) + self.assertEqual(h.port, port) + self.assertEqual(h.dirs, dirs) + if sys.version_info >= (2, 6): + self.assertEquals(h.timeout, timeout) + self.assertEqual(h.ftpwrapper.filename, filename) + self.assertEqual(h.ftpwrapper.filetype, type_) + headers = r.info() + self.assertEqual(headers.get("Content-type"), mimetype) + self.assertEqual(int(headers["Content-length"]), len(data)) + + def test_file(self): + import rfc822, socket + h = mechanize.FileHandler() + o = h.parent = MockOpener() + + temp_file = os.path.join(self.make_temp_dir(), "test.txt") + urlpath = sanepathname2url(os.path.abspath(temp_file)) + towrite = "hello, world\n" + try: + fqdn = socket.gethostbyname(socket.gethostname()) + except socket.gaierror: + fqdn = "localhost" + for url in [ + "file://localhost%s" % urlpath, + "file://%s" % urlpath, + "file://%s%s" % 
(socket.gethostbyname('localhost'), urlpath), + "file://%s%s" % (fqdn, urlpath) + ]: + write_file(temp_file, towrite) + r = h.file_open(Request(url)) + try: + data = r.read() + headers = r.info() + newurl = r.geturl() + finally: + r.close() + stats = os.stat(temp_file) + modified = rfc822.formatdate(stats.st_mtime) + self.assertEqual(data, towrite) + self.assertEqual(headers["Content-type"], "text/plain") + self.assertEqual(headers["Content-length"], "13") + self.assertEqual(headers["Last-modified"], modified) + + for url in [ + "file://localhost:80%s" % urlpath, + "file:///file_does_not_exist.txt", + "file://%s:80%s/%s" % (socket.gethostbyname('localhost'), + os.getcwd(), temp_file), + "file://somerandomhost.ontheinternet.com%s/%s" % + (os.getcwd(), temp_file), + ]: + write_file(temp_file, towrite) + self.assertRaises(mechanize.URLError, h.file_open, Request(url)) + + h = mechanize.FileHandler() + o = h.parent = MockOpener() + # XXXX why does // mean ftp (and /// mean not ftp!), and where + # is file: scheme specified? 
I think this is really a bug, and + # what was intended was to distinguish between URLs like: + # file:/blah.txt (a file) + # file://localhost/blah.txt (a file) + # file:///blah.txt (a file) + # file://ftp.example.com/blah.txt (an ftp URL) + for url, ftp in [ + ("file://ftp.example.com//foo.txt", True), + ("file://ftp.example.com///foo.txt", False), +# XXXX bug: fails with OSError, should be URLError + ("file://ftp.example.com/foo.txt", False), + ]: + req = Request(url) + try: + h.file_open(req) + # XXXX remove OSError when bug fixed + except (mechanize.URLError, OSError): + self.assertFalse(ftp) + else: + self.assertTrue(o.req is req) + self.assertEqual(req.type, "ftp") + + def test_http(self): + class MockHTTPResponse: + def __init__(self, fp, msg, status, reason): + self.fp = fp + self.msg = msg + self.status = status + self.reason = reason + def read(self): + return '' + class MockHTTPClass: + def __init__(self): + self.req_headers = [] + self.data = None + self.raise_on_endheaders = False + def __call__(self, host, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + self.host = host + self.timeout = timeout + return self + def set_debuglevel(self, level): + self.level = level + def request(self, method, url, body=None, headers={}): + self.method = method + self.selector = url + self.req_headers += headers.items() + self.req_headers.sort() + if body: + self.data = body + if self.raise_on_endheaders: + import socket + raise socket.error() + def getresponse(self): + return MockHTTPResponse(MockFile(), {}, 200, "OK") + + h = AbstractHTTPHandler() + o = h.parent = MockOpener() + + url = "http://example.com/" + for method, data in [("GET", None), ("POST", "blah")]: + req = Request(url, data, {"Foo": "bar"}) + req.add_unredirected_header("Spam", "eggs") + http = MockHTTPClass() + r = h.do_open(http, req) + + # result attributes + r.read; r.readline # wrapped MockFile methods + r.info; r.geturl # addinfourl methods + r.code, r.msg == 200, "OK" # added from 
MockHTTPClass.getreply() + hdrs = r.info() + hdrs.get; hdrs.has_key # r.info() gives dict from .getreply() + self.assertEqual(r.geturl(), url) + + self.assertEqual(http.host, "example.com") + self.assertEqual(http.level, 0) + self.assertEqual(http.method, method) + self.assertEqual(http.selector, "/") + self.assertEqual(http.req_headers, + [("Connection", "close"), + ("Foo", "bar"), ("Spam", "eggs")]) + self.assertEqual(http.data, data) + + # check socket.error converted to URLError + http.raise_on_endheaders = True + self.assertRaises(mechanize.URLError, h.do_open, http, req) + + # check adding of standard headers + o.addheaders = [("Spam", "eggs")] + for data in "", None: # POST, GET + req = Request("http://example.com/", data) + r = MockResponse(200, "OK", {}, "") + newreq = h.do_request_(req) + if data is None: # GET + self.assertTrue("Content-length" not in req.unredirected_hdrs) + self.assertTrue("Content-type" not in req.unredirected_hdrs) + else: # POST + self.assertEqual(req.unredirected_hdrs["Content-length"], "0") + self.assertEqual(req.unredirected_hdrs["Content-type"], + "application/x-www-form-urlencoded") + # XXX the details of Host could be better tested + self.assertEqual(req.unredirected_hdrs["Host"], "example.com") + self.assertEqual(req.unredirected_hdrs["Spam"], "eggs") + + # don't clobber existing headers + req.add_unredirected_header("Content-length", "foo") + req.add_unredirected_header("Content-type", "bar") + req.add_unredirected_header("Host", "baz") + req.add_unredirected_header("Spam", "foo") + newreq = h.do_request_(req) + self.assertEqual(req.unredirected_hdrs["Content-length"], "foo") + self.assertEqual(req.unredirected_hdrs["Content-type"], "bar") + self.assertEqual(req.unredirected_hdrs["Host"], "baz") + self.assertEqual(req.unredirected_hdrs["Spam"], "foo") + + def test_http_double_slash(self): + # Checks that the presence of an unnecessary double slash in a url + # doesn't break anything Previously, a double slash directly after 
the + # host could cause incorrect parsing of the url + h = AbstractHTTPHandler() + o = h.parent = MockOpener() + + data = "" + ds_urls = [ + "http://example.com/foo/bar/baz.html", + "http://example.com//foo/bar/baz.html", + "http://example.com/foo//bar/baz.html", + "http://example.com/foo/bar//baz.html", + ] + + for ds_url in ds_urls: + ds_req = Request(ds_url, data) + + # Check whether host is determined correctly if there is no proxy + np_ds_req = h.do_request_(ds_req) + self.assertEqual(np_ds_req.unredirected_hdrs["Host"],"example.com") + + # Check whether host is determined correctly if there is a proxy + ds_req.set_proxy("someproxy:3128",None) + p_ds_req = h.do_request_(ds_req) + self.assertEqual(p_ds_req.unredirected_hdrs["Host"],"example.com") + + def test_errors(self): + h = HTTPErrorProcessor() + o = h.parent = MockOpener() + + req = Request("http://example.com") + # all 2xx are passed through + r = mechanize._response.test_response() + newr = h.http_response(req, r) + self.assertTrue(r is newr) + self.assertTrue(not hasattr(o, "proto")) # o.error not called + r = mechanize._response.test_response(code=202, msg="Accepted") + newr = h.http_response(req, r) + self.assertTrue(r is newr) + self.assertTrue(not hasattr(o, "proto")) # o.error not called + r = mechanize._response.test_response(code=206, msg="Partial content") + newr = h.http_response(req, r) + self.assertTrue(r is newr) + self.assertTrue(not hasattr(o, "proto")) # o.error not called + # anything else calls o.error (and MockOpener returns None, here) + r = mechanize._response.test_response(code=502, msg="Bad gateway") + self.assertTrue(h.http_response(req, r) is None) + self.assertEqual(o.proto, "http") # o.error called + self.assertEqual(o.args, (req, r, 502, "Bad gateway", AlwaysEqual())) + + def test_referer(self): + h = HTTPRefererProcessor() + o = h.parent = MockOpener() + + # normal case + url = "http://example.com/" + req = Request(url) + r = MockResponse(200, "OK", {}, "", url) + newr = 
h.http_response(req, r) + self.assert_(r is newr) + self.assert_(h.referer == url) + newreq = h.http_request(req) + self.assert_(req is newreq) + self.assert_(req.unredirected_hdrs["Referer"] == url) + # don't clobber existing Referer + ref = "http://set.by.user.com/" + req.add_unredirected_header("Referer", ref) + newreq = h.http_request(req) + self.assert_(req is newreq) + self.assert_(req.unredirected_hdrs["Referer"] == ref) + + def test_raise_http_errors(self): + # HTTPDefaultErrorHandler should raise HTTPError if no error handler + # handled the error response + from mechanize import _response + h = mechanize.HTTPDefaultErrorHandler() + + url = "http://example.com"; code = 500; msg = "Error" + request = mechanize.Request(url) + response = _response.test_response(url=url, code=code, msg=msg) + + # case 1. it's not an HTTPError + try: + h.http_error_default( + request, response, code, msg, response.info()) + except mechanize.HTTPError, exc: + self.assert_(exc is not response) + self.assert_(exc.fp is response) + else: + self.assert_(False) + + # case 2. 
response object is already an HTTPError, so just re-raise it + error = mechanize.HTTPError( + url, code, msg, "fake headers", response) + try: + h.http_error_default( + request, error, code, msg, error.info()) + except mechanize.HTTPError, exc: + self.assert_(exc is error) + else: + self.assert_(False) + + def test_robots(self): + # XXX useragent + from mechanize import HTTPRobotRulesProcessor + opener = OpenerDirector() + rfpc = MockRobotFileParserClass() + h = HTTPRobotRulesProcessor(rfpc) + opener.add_handler(h) + + url = "http://example.com:80/foo/bar.html" + req = Request(url) + # first time: initialise and set up robots.txt parser before checking + # whether OK to fetch URL + h.http_request(req) + self.assertEquals(rfpc.calls, [ + "__call__", + ("set_opener", opener), + ("set_url", "http://example.com:80/robots.txt"), + ("set_timeout", _sockettimeout._GLOBAL_DEFAULT_TIMEOUT), + "read", + ("can_fetch", "", url), + ]) + # second time: just use existing parser + rfpc.clear() + req = Request(url) + h.http_request(req) + self.assert_(rfpc.calls == [ + ("can_fetch", "", url), + ]) + # different URL on same server: same again + rfpc.clear() + url = "http://example.com:80/blah.html" + req = Request(url) + h.http_request(req) + self.assert_(rfpc.calls == [ + ("can_fetch", "", url), + ]) + # disallowed URL + rfpc.clear() + rfpc._can_fetch = False + url = "http://example.com:80/rhubarb.html" + req = Request(url) + try: + h.http_request(req) + except mechanize.HTTPError, e: + self.assert_(e.request == req) + self.assert_(e.code == 403) + # new host: reload robots.txt (even though the host and port are + # unchanged, we treat this as a new host because + # "example.com" != "example.com:80") + rfpc.clear() + rfpc._can_fetch = True + url = "http://example.com/rhubarb.html" + req = Request(url) + h.http_request(req) + self.assertEquals(rfpc.calls, [ + "__call__", + ("set_opener", opener), + ("set_url", "http://example.com/robots.txt"), + ("set_timeout", 
_sockettimeout._GLOBAL_DEFAULT_TIMEOUT), + "read", + ("can_fetch", "", url), + ]) + # https url -> should fetch robots.txt from https url too + rfpc.clear() + url = "https://example.org/rhubarb.html" + req = Request(url) + h.http_request(req) + self.assertEquals(rfpc.calls, [ + "__call__", + ("set_opener", opener), + ("set_url", "https://example.org/robots.txt"), + ("set_timeout", _sockettimeout._GLOBAL_DEFAULT_TIMEOUT), + "read", + ("can_fetch", "", url), + ]) + # non-HTTP URL -> ignore robots.txt + rfpc.clear() + url = "ftp://example.com/" + req = Request(url) + h.http_request(req) + self.assert_(rfpc.calls == []) + + def test_redirected_robots_txt(self): + # redirected robots.txt fetch shouldn't result in another attempted + # robots.txt fetch to check the redirection is allowed! + import mechanize + from mechanize import build_opener, HTTPHandler, \ + HTTPDefaultErrorHandler, HTTPRedirectHandler, \ + HTTPRobotRulesProcessor + + class MockHTTPHandler(mechanize.BaseHandler): + def __init__(self): + self.requests = [] + def http_open(self, req): + import mimetools, httplib, copy + from StringIO import StringIO + self.requests.append(copy.deepcopy(req)) + if req.get_full_url() == "http://example.com/robots.txt": + hdr = "Location: http://example.com/en/robots.txt\r\n\r\n" + msg = mimetools.Message(StringIO(hdr)) + return self.parent.error( + "http", req, test_response(), 302, "Blah", msg) + else: + return test_response("Allow: *", [], req.get_full_url()) + + hh = MockHTTPHandler() + hdeh = HTTPDefaultErrorHandler() + hrh = HTTPRedirectHandler() + rh = HTTPRobotRulesProcessor() + o = build_test_opener(hh, hdeh, hrh, rh) + o.open("http://example.com/") + self.assertEqual([req.get_full_url() for req in hh.requests], + ["http://example.com/robots.txt", + "http://example.com/en/robots.txt", + "http://example.com/", + ]) + + def test_cookies(self): + cj = MockCookieJar() + h = HTTPCookieProcessor(cj) + o = h.parent = MockOpener() + + req = Request("http://example.com/") 
+ r = MockResponse(200, "OK", {}, "") + newreq = h.http_request(req) + self.assertTrue(cj.ach_req is req is newreq) + self.assertEquals(req.get_origin_req_host(), "example.com") + self.assertFalse(cj.ach_u) + newr = h.http_response(req, r) + self.assertTrue(cj.ec_req is req) + self.assertTrue(cj.ec_r is r is newr) + self.assertFalse(cj.ec_u) + + def test_http_equiv(self): + h = HTTPEquivProcessor() + o = h.parent = MockOpener() + + data = ('<html><head>' + '<meta http-equiv="Refresh" content="spam&eggs">' + '</head></html>' + ) + headers = [("Foo", "Bar"), + ("Content-type", "text/html"), + ("Refresh", "blah"), + ] + url = "http://example.com/" + req = Request(url) + r = mechanize._response.make_response(data, headers, url, 200, "OK") + newr = h.http_response(req, r) + + new_headers = newr.info() + self.assertEqual(new_headers["Foo"], "Bar") + self.assertEqual(new_headers["Refresh"], "spam&eggs") + self.assertEqual(new_headers.getheaders("Refresh"), + ["blah", "spam&eggs"]) + + def test_refresh(self): + # XXX test processor constructor optional args + h = HTTPRefreshProcessor(max_time=None, honor_time=False) + + for val, valid in [ + ('0; url="http://example.com/foo/"', True), + ("2", True), + # in the past, this failed with UnboundLocalError + ('0; "http://example.com/foo/"', False), + ]: + o = h.parent = MockOpener() + req = Request("http://example.com/") + headers = http_message({"refresh": val}) + r = MockResponse(200, "OK", headers, "", "http://example.com/") + newr = h.http_response(req, r) + if valid: + self.assertEqual(o.proto, "http") + self.assertEqual(o.args, (req, r, "refresh", "OK", headers)) + + def test_refresh_honor_time(self): + class SleepTester: + def __init__(self, test, seconds): + self._test = test + if seconds is 0: + seconds = None # don't expect a sleep for 0 seconds + self._expected = seconds + self._got = None + def sleep(self, seconds): + self._got = seconds + def verify(self): + self._test.assertEqual(self._expected, self._got) + class 
Opener: + called = False + def error(self, *args, **kwds): + self.called = True + def test(rp, header, refresh_after): + expect_refresh = refresh_after is not None + opener = Opener() + rp.parent = opener + st = SleepTester(self, refresh_after) + rp._sleep = st.sleep + rp.http_response(Request("http://example.com"), + test_response(headers=[("Refresh", header)]), + ) + self.assertEqual(expect_refresh, opener.called) + st.verify() + + # by default, only zero-time refreshes are honoured + test(HTTPRefreshProcessor(), "0", 0) + test(HTTPRefreshProcessor(), "2", None) + + # if requested, more than zero seconds are allowed + test(HTTPRefreshProcessor(max_time=None), "2", 2) + test(HTTPRefreshProcessor(max_time=30), "2", 2) + + # no sleep if we don't "honor_time" + test(HTTPRefreshProcessor(max_time=30, honor_time=False), "2", 0) + + # request for too-long wait before refreshing --> no refresh occurs + test(HTTPRefreshProcessor(max_time=30), "60", None) + + def test_redirect(self): + from_url = "http://example.com/a.html" + to_url = "http://example.com/b.html" + h = HTTPRedirectHandler() + o = h.parent = MockOpener() + + # ordinary redirect behaviour + for code in 301, 302, 303, 307, "refresh": + for data in None, "blah\nblah\n": + method = getattr(h, "http_error_%s" % code) + req = Request(from_url, data) + req.add_header("Nonsense", "viking=withhold") + req.add_unredirected_header("Spam", "spam") + req.origin_req_host = "example.com" # XXX + try: + method(req, MockFile(), code, "Blah", + http_message({"location": to_url})) + except mechanize.HTTPError: + # 307 in response to POST requires user OK + self.assertEqual(code, 307) + self.assertTrue(data is not None) + self.assertEqual(o.req.get_full_url(), to_url) + try: + self.assertEqual(o.req.get_method(), "GET") + except AttributeError: + self.assertFalse(o.req.has_data()) + + # now it's a GET, there should not be headers regarding content + # (possibly dragged from before being a POST) + headers = [x.lower() for x in 
o.req.headers] + self.assertTrue("content-length" not in headers) + self.assertTrue("content-type" not in headers) + + self.assertEqual(o.req.headers["Nonsense"], "viking=withhold") + self.assertTrue("Spam" not in o.req.headers) + self.assertTrue("Spam" not in o.req.unredirected_hdrs) + + # loop detection + req = Request(from_url) + def redirect(h, req, url=to_url): + h.http_error_302(req, MockFile(), 302, "Blah", + http_message({"location": url})) + # Note that the *original* request shares the same record of + # redirections with the sub-requests caused by the redirections. + + # detect infinite loop redirect of a URL to itself + req = Request(from_url, origin_req_host="example.com") + count = 0 + try: + while 1: + redirect(h, req, "http://example.com/") + count = count + 1 + except mechanize.HTTPError: + # don't stop until max_repeats, because cookies may introduce state + self.assertEqual(count, HTTPRedirectHandler.max_repeats) + + # detect endless non-repeating chain of redirects + req = Request(from_url, origin_req_host="example.com") + count = 0 + try: + while 1: + redirect(h, req, "http://example.com/%d" % count) + count = count + 1 + except mechanize.HTTPError: + self.assertEqual(count, HTTPRedirectHandler.max_redirections) + + def test_redirect_bad_uri(self): + # bad URIs should be cleaned up before redirection + from mechanize._response import test_html_response + from_url = "http://example.com/a.html" + bad_to_url = "http://example.com/b. |html" + good_to_url = "http://example.com/b.%20%7Chtml" + + h = HTTPRedirectHandler() + o = h.parent = MockOpener() + + req = Request(from_url) + h.http_error_302(req, test_html_response(), 302, "Blah", + http_message({"location": bad_to_url}), + ) + self.assertEqual(o.req.get_full_url(), good_to_url) + + def test_refresh_bad_uri(self): + # bad URIs should be cleaned up before redirection + from mechanize._response import test_html_response + from_url = "http://example.com/a.html" + bad_to_url = "http://example.com/b. 
|html" + good_to_url = "http://example.com/b.%20%7Chtml" + + h = HTTPRefreshProcessor(max_time=None, honor_time=False) + o = h.parent = MockOpener() + + req = Request("http://example.com/") + r = test_html_response( + headers=[("refresh", '0; url="%s"' % bad_to_url)]) + newr = h.http_response(req, r) + headers = o.args[-1] + self.assertEqual(headers["Location"], good_to_url) + + def test_cookie_redirect(self): + # cookies shouldn't leak into redirected requests + import mechanize + from mechanize import CookieJar, build_opener, HTTPHandler, \ + HTTPCookieProcessor, HTTPError, HTTPDefaultErrorHandler, \ + HTTPRedirectHandler + + from test_cookies import interact_netscape + + cj = CookieJar() + interact_netscape(cj, "http://www.example.com/", "spam=eggs") + hh = MockHTTPHandler(302, "Location: http://www.cracker.com/\r\n\r\n") + hdeh = HTTPDefaultErrorHandler() + hrh = HTTPRedirectHandler() + cp = HTTPCookieProcessor(cj) + o = build_test_opener(hh, hdeh, hrh, cp) + o.open("http://www.example.com/") + self.assertFalse(hh.req.has_header("Cookie")) + + def test_proxy(self): + o = OpenerDirector() + ph = mechanize.ProxyHandler(dict(http="proxy.example.com:3128")) + o.add_handler(ph) + meth_spec = [ + [("http_open", "return response")] + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + o._maybe_reindex_handlers() + + req = Request("http://acme.example.com/") + self.assertEqual(req.get_host(), "acme.example.com") + r = o.open(req) + self.assertEqual(req.get_host(), "proxy.example.com:3128") + + self.assertEqual([(handlers[0], "http_open")], + [tup[0:2] for tup in o.calls]) + + def test_proxy_no_proxy(self): + self.monkey_patch_environ("no_proxy", "python.org") + o = OpenerDirector() + ph = mechanize.ProxyHandler(dict(http="proxy.example.com")) + o.add_handler(ph) + req = Request("http://www.perl.org/") + self.assertEqual(req.get_host(), "www.perl.org") + r = o.open(req) + self.assertEqual(req.get_host(), "proxy.example.com") + req = 
Request("http://www.python.org") + self.assertEqual(req.get_host(), "www.python.org") + r = o.open(req) + if sys.version_info >= (2, 6): + # no_proxy environment variable not supported in python 2.5 + self.assertEqual(req.get_host(), "www.python.org") + + def test_proxy_custom_proxy_bypass(self): + self.monkey_patch_environ("no_proxy", + mechanize._testcase.MonkeyPatcher.Unset) + def proxy_bypass(hostname): + return hostname == "noproxy.com" + o = OpenerDirector() + ph = mechanize.ProxyHandler(dict(http="proxy.example.com"), + proxy_bypass=proxy_bypass) + def is_proxied(url): + o.add_handler(ph) + req = Request(url) + o.open(req) + return req.has_proxy() + self.assertTrue(is_proxied("http://example.com")) + self.assertFalse(is_proxied("http://noproxy.com")) + + def test_proxy_https(self): + o = OpenerDirector() + ph = mechanize.ProxyHandler(dict(https='proxy.example.com:3128')) + o.add_handler(ph) + meth_spec = [ + [("https_open","return response")] + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + req = Request("https://www.example.com/") + self.assertEqual(req.get_host(), "www.example.com") + r = o.open(req) + self.assertEqual(req.get_host(), "proxy.example.com:3128") + self.assertEqual([(handlers[0], "https_open")], + [tup[0:2] for tup in o.calls]) + + def test_basic_auth(self, quote_char='"'): + opener = OpenerDirector() + password_manager = MockPasswordManager() + auth_handler = mechanize.HTTPBasicAuthHandler(password_manager) + realm = "ACME Widget Store" + http_handler = MockHTTPHandler( + 401, 'WWW-Authenticate: Basic realm=%s%s%s\r\n\r\n' % + (quote_char, realm, quote_char) ) + opener.add_handler(auth_handler) + opener.add_handler(http_handler) + self._test_basic_auth(opener, auth_handler, "Authorization", + realm, http_handler, password_manager, + "http://acme.example.com/protected", + "http://acme.example.com/protected", + ) + + def test_basic_auth_with_single_quoted_realm(self): + self.test_basic_auth(quote_char="'") + + def 
test_proxy_basic_auth(self): + opener = OpenerDirector() + ph = mechanize.ProxyHandler(dict(http="proxy.example.com:3128")) + opener.add_handler(ph) + password_manager = MockPasswordManager() + auth_handler = mechanize.ProxyBasicAuthHandler(password_manager) + realm = "ACME Networks" + http_handler = MockHTTPHandler( + 407, 'Proxy-Authenticate: Basic realm="%s"\r\n\r\n' % realm) + opener.add_handler(auth_handler) + opener.add_handler(http_handler) + self._test_basic_auth(opener, auth_handler, "Proxy-authorization", + realm, http_handler, password_manager, + "http://acme.example.com:3128/protected", + "proxy.example.com:3128", + ) + + def test_basic_and_digest_auth_handlers(self): + # HTTPDigestAuthHandler threw an exception if it couldn't handle a 40* + # response (http://python.org/sf/1479302), where it should instead + # return None to allow another handler (especially + # HTTPBasicAuthHandler) to handle the response. + + # Also (http://python.org/sf/1479302, RFC 2617 section 1.2), we must + # try digest first (since it's the strongest auth scheme), so we record + # order of calls here to check digest comes first: + class RecordingOpenerDirector(OpenerDirector): + def __init__(self): + OpenerDirector.__init__(self) + self.recorded = [] + def record(self, info): + self.recorded.append(info) + class TestDigestAuthHandler(mechanize.HTTPDigestAuthHandler): + def http_error_401(self, *args, **kwds): + self.parent.record("digest") + mechanize.HTTPDigestAuthHandler.http_error_401(self, + *args, **kwds) + class TestBasicAuthHandler(mechanize.HTTPBasicAuthHandler): + def http_error_401(self, *args, **kwds): + self.parent.record("basic") + mechanize.HTTPBasicAuthHandler.http_error_401(self, + *args, **kwds) + + opener = RecordingOpenerDirector() + password_manager = MockPasswordManager() + digest_handler = TestDigestAuthHandler(password_manager) + basic_handler = TestBasicAuthHandler(password_manager) + realm = "ACME Networks" + http_handler = MockHTTPHandler( + 401, 
'WWW-Authenticate: Basic realm="%s"\r\n\r\n' % realm) + opener.add_handler(digest_handler) + opener.add_handler(basic_handler) + opener.add_handler(http_handler) + opener._maybe_reindex_handlers() + + # check basic auth isn't blocked by digest handler failing + self._test_basic_auth(opener, basic_handler, "Authorization", + realm, http_handler, password_manager, + "http://acme.example.com/protected", + "http://acme.example.com/protected", + ) + # check digest was tried before basic (twice, because + # _test_basic_auth called .open() twice) + self.assertEqual(opener.recorded, ["digest", "basic"]*2) + + def _test_basic_auth(self, opener, auth_handler, auth_header, + realm, http_handler, password_manager, + request_url, protected_url): + import base64 + user, password = "wile", "coyote" + + # .add_password() fed through to password manager + auth_handler.add_password(realm, request_url, user, password) + self.assertEqual(realm, password_manager.realm) + self.assertEqual(request_url, password_manager.url) + self.assertEqual(user, password_manager.user) + self.assertEqual(password, password_manager.password) + + r = opener.open(request_url) + + # should have asked the password manager for the username/password + self.assertEqual(password_manager.target_realm, realm) + self.assertEqual(password_manager.target_url, protected_url) + + # expect one request without authorization, then one with + self.assertEqual(len(http_handler.requests), 2) + self.assertFalse(http_handler.requests[0].has_header(auth_header)) + userpass = '%s:%s' % (user, password) + auth_hdr_value = 'Basic '+base64.encodestring(userpass).strip() + self.assertEqual(http_handler.requests[1].get_header(auth_header), + auth_hdr_value) + + # if the password manager can't find a password, the handler won't + # handle the HTTP auth error + password_manager.user = password_manager.password = None + http_handler.reset() + r = opener.open(request_url) + self.assertEqual(len(http_handler.requests), 1) + 
self.assertFalse(http_handler.requests[0].has_header(auth_header)) + + +class HeadParserTests(unittest.TestCase): + + def test(self): + # XXX XHTML + from mechanize import HeadParser + htmls = [ + ("""<meta http-equiv="refresh" content="1; http://example.com/"> + """, + [("refresh", "1; http://example.com/")] + ), + (""" + <html><head> + <meta http-equiv="refresh" content="1; http://example.com/"> + <meta name="spam" content="eggs"> + <meta http-equiv="foo" content="bar"> + <p> <!-- p is not allowed in head, so parsing should stop here--> + <meta http-equiv="moo" content="cow"> + </html> + """, + [("refresh", "1; http://example.com/"), ("foo", "bar")]), + ("""<meta http-equiv="refresh"> + """, + []) + ] + for html, result in htmls: + self.assertEqual(parse_head(StringIO.StringIO(html), HeadParser()), result) + + + +class A: + def a(self): pass +class B(A): + def a(self): pass + def b(self): pass +class C(A): + def c(self): pass +class D(C, B): + def a(self): pass + def d(self): pass + +class FunctionTests(unittest.TestCase): + + def test_build_opener(self): + class MyHTTPHandler(HTTPHandler): pass + class FooHandler(mechanize.BaseHandler): + def foo_open(self): pass + class BarHandler(mechanize.BaseHandler): + def bar_open(self): pass + + o = build_opener(FooHandler, BarHandler) + self.opener_has_handler(o, FooHandler) + self.opener_has_handler(o, BarHandler) + + # can take a mix of classes and instances + o = build_opener(FooHandler, BarHandler()) + self.opener_has_handler(o, FooHandler) + self.opener_has_handler(o, BarHandler) + + # subclasses of default handlers override default handlers + o = build_opener(MyHTTPHandler) + self.opener_has_handler(o, MyHTTPHandler) + + # a particular case of overriding: default handlers can be passed + # in explicitly + o = build_opener() + self.opener_has_handler(o, HTTPHandler) + o = build_opener(HTTPHandler) + self.opener_has_handler(o, HTTPHandler) + o = build_opener(HTTPHandler()) + self.opener_has_handler(o, HTTPHandler) + 
+ # Issue2670: multiple handlers sharing the same base class + class MyOtherHTTPHandler(HTTPHandler): pass + o = build_opener(MyHTTPHandler, MyOtherHTTPHandler) + self.opener_has_handler(o, MyHTTPHandler) + self.opener_has_handler(o, MyOtherHTTPHandler) + + def opener_has_handler(self, opener, handler_class): + for h in opener.handlers: + if h.__class__ == handler_class: + break + else: + self.assertTrue(False) + +class RequestTests(unittest.TestCase): + + def setUp(self): + self.get = Request("http://www.python.org/~jeremy/") + self.post = Request("http://www.python.org/~jeremy/", + "data", + headers={"X-Test": "test"}) + + def test_method(self): + self.assertEqual("POST", self.post.get_method()) + self.assertEqual("GET", self.get.get_method()) + + def test_add_data(self): + self.assertTrue(not self.get.has_data()) + self.assertEqual("GET", self.get.get_method()) + self.get.add_data("spam") + self.assertTrue(self.get.has_data()) + self.assertEqual("POST", self.get.get_method()) + + def test_get_full_url(self): + self.assertEqual("http://www.python.org/~jeremy/", + self.get.get_full_url()) + + def test_selector(self): + self.assertEqual("/~jeremy/", self.get.get_selector()) + req = Request("http://www.python.org/") + self.assertEqual("/", req.get_selector()) + + def test_get_type(self): + self.assertEqual("http", self.get.get_type()) + + def test_get_host(self): + self.assertEqual("www.python.org", self.get.get_host()) + + def test_get_host_unquote(self): + req = Request("http://www.%70ython.org/") + self.assertEqual("www.python.org", req.get_host()) + + def test_proxy(self): + self.assertTrue(not self.get.has_proxy()) + self.get.set_proxy("www.perl.org", "http") + self.assertTrue(self.get.has_proxy()) + self.assertEqual("www.python.org", self.get.get_origin_req_host()) + self.assertEqual("www.perl.org", self.get.get_host()) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + unittest.main() diff --git 
a/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2_localnet.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2_localnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8689088051ab9dfb54ed2d0d3b6aecffc147e9fc --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_urllib2_localnet.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python + +"""Functional tests from the Python standard library test suite.""" + +import mimetools +import threading +import urlparse +import mechanize +import BaseHTTPServer +import unittest + +from mechanize._testcase import TestCase +from mechanize._urllib2_fork import md5_digest + +import testprogram + + +# Loopback http server infrastructure + +class LoopbackHttpServer(BaseHTTPServer.HTTPServer): + """HTTP server w/ a few modifications that make it useful for + loopback testing purposes. + """ + + def __init__(self, server_address, RequestHandlerClass): + BaseHTTPServer.HTTPServer.__init__(self, + server_address, + RequestHandlerClass) + + # Set the timeout of our listening socket really low so + # that we can stop the server easily. + self.socket.settimeout(1.0) + + def get_request(self): + """BaseHTTPServer method, overridden.""" + + request, client_address = self.socket.accept() + + # It's a loopback connection, so setting the timeout + # really low shouldn't affect anything, but should make + # deadlocks less likely to occur. 
+ request.settimeout(10.0) + + return (request, client_address) + +class LoopbackHttpServerThread(threading.Thread): + """Stoppable thread that runs a loopback http server.""" + + def __init__(self, handle_request=None): + threading.Thread.__init__(self) + self._stop = False + self.ready = threading.Event() + self._request_handler = None + if handle_request is None: + handle_request = self._handle_request + self.httpd = LoopbackHttpServer(('127.0.0.1', 0), handle_request) + #print "Serving HTTP on %s port %s" % (self.httpd.server_name, + # self.httpd.server_port) + self.port = self.httpd.server_port + + def set_request_handler(self, request_handler): + self._request_handler = request_handler + + def _handle_request(self, *args, **kwds): + self._request_handler.handle_request(*args, **kwds) + return self._request_handler + + def stop(self): + """Stops the webserver if it's currently running.""" + + # Set the stop flag. + self._stop = True + + self.join() + + def run(self): + self.ready.set() + while not self._stop: + self.httpd.handle_request() + +# Authentication infrastructure + +class DigestAuthHandler: + """Handler for performing digest authentication.""" + + def __init__(self): + self._request_num = 0 + self._nonces = [] + self._users = {} + self._realm_name = "Test Realm" + self._qop = "auth" + + def set_qop(self, qop): + self._qop = qop + + def set_users(self, users): + assert isinstance(users, dict) + self._users = users + + def set_realm(self, realm): + self._realm_name = realm + + def _generate_nonce(self): + self._request_num += 1 + nonce = md5_digest(str(self._request_num)) + self._nonces.append(nonce) + return nonce + + def _create_auth_dict(self, auth_str): + first_space_index = auth_str.find(" ") + auth_str = auth_str[first_space_index+1:] + + parts = auth_str.split(",") + + auth_dict = {} + for part in parts: + name, value = part.split("=") + name = name.strip() + if value[0] == '"' and value[-1] == '"': + value = value[1:-1] + else: + value = 
value.strip() + auth_dict[name] = value + return auth_dict + + def _validate_auth(self, auth_dict, password, method, uri): + final_dict = {} + final_dict.update(auth_dict) + final_dict["password"] = password + final_dict["method"] = method + final_dict["uri"] = uri + HA1_str = "%(username)s:%(realm)s:%(password)s" % final_dict + HA1 = md5_digest(HA1_str) + HA2_str = "%(method)s:%(uri)s" % final_dict + HA2 = md5_digest(HA2_str) + final_dict["HA1"] = HA1 + final_dict["HA2"] = HA2 + response_str = "%(HA1)s:%(nonce)s:%(nc)s:" \ + "%(cnonce)s:%(qop)s:%(HA2)s" % final_dict + response = md5_digest(response_str) + + return response == auth_dict["response"] + + def _return_auth_challenge(self, request_handler): + request_handler.send_response(407, "Proxy Authentication Required") + request_handler.send_header("Content-Type", "text/html") + request_handler.send_header( + 'Proxy-Authenticate', 'Digest realm="%s", ' + 'qop="%s",' + 'nonce="%s", ' % \ + (self._realm_name, self._qop, self._generate_nonce())) + # XXX: Not sure if we're supposed to add this next header or + # not. + #request_handler.send_header('Connection', 'close') + request_handler.end_headers() + request_handler.wfile.write("Proxy Authentication Required.") + return False + + def handle_request(self, request_handler): + """Performs digest authentication on the given HTTP request + handler. Returns True if authentication was successful, False + otherwise. + + If no users have been set, then digest auth is effectively + disabled and this method will always return True. 
+ """ + + if len(self._users) == 0: + return True + + if not request_handler.headers.has_key('Proxy-Authorization'): + return self._return_auth_challenge(request_handler) + else: + auth_dict = self._create_auth_dict( + request_handler.headers['Proxy-Authorization'] + ) + if self._users.has_key(auth_dict["username"]): + password = self._users[ auth_dict["username"] ] + else: + return self._return_auth_challenge(request_handler) + if not auth_dict.get("nonce") in self._nonces: + return self._return_auth_challenge(request_handler) + else: + self._nonces.remove(auth_dict["nonce"]) + + auth_validated = False + + # MSIE uses short_path in its validation, but mechanize uses the + # full path, so we're going to see if either of them works here. + + for path in [request_handler.path, request_handler.short_path]: + if self._validate_auth(auth_dict, + password, + request_handler.command, + path): + auth_validated = True + + if not auth_validated: + return self._return_auth_challenge(request_handler) + return True + +# Proxy test infrastructure + +class FakeProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler): + """This is a 'fake proxy' that makes it look like the entire + internet has gone down due to a sudden zombie invasion. It main + utility is in providing us with authentication support for + testing. + """ + + protocol_version = "HTTP/1.0" + + def __init__(self, digest_auth_handler, *args, **kwargs): + # This has to be set before calling our parent's __init__(), which will + # try to call do_GET(). + self.digest_auth_handler = digest_auth_handler + BaseHTTPServer.BaseHTTPRequestHandler.__init__(self, *args, **kwargs) + + def log_message(self, format, *args): + # Uncomment the next line for debugging. 
+ #sys.stderr.write(format % args) + pass + + def do_GET(self): + (scm, netloc, path, params, query, fragment) = urlparse.urlparse( + self.path, 'http') + self.short_path = path + if self.digest_auth_handler.handle_request(self): + self.send_response(200, "OK") + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write("You've reached %s!<BR>" % self.path) + self.wfile.write("Our apologies, but our server is down due to " + "a sudden zombie invasion.") + + +def make_started_server(make_request_handler=None): + server = LoopbackHttpServerThread(make_request_handler) + server.start() + server.ready.wait() + return server + + +# Test cases + + +class ProxyAuthTests(TestCase): + URL = "http://localhost" + + USER = "tester" + PASSWD = "test123" + REALM = "TestRealm" + + def _make_server(self, qop="auth"): + digest_auth_handler = DigestAuthHandler() + digest_auth_handler.set_users({self.USER: self.PASSWD}) + digest_auth_handler.set_realm(self.REALM) + digest_auth_handler.set_qop(qop) + def create_fake_proxy_handler(*args, **kwargs): + return FakeProxyHandler(digest_auth_handler, *args, **kwargs) + return make_started_server(create_fake_proxy_handler) + + def setUp(self): + TestCase.setUp(self) + fixture_name = "test_urllib2_localnet_ProxyAuthTests_server" + self.register_context_manager(fixture_name, + testprogram.ServerCM(self._make_server)) + server = self.get_cached_fixture(fixture_name) + + proxy_url = "http://127.0.0.1:%d" % server.port + handler = mechanize.ProxyHandler({"http" : proxy_url}) + self.proxy_digest_handler = mechanize.ProxyDigestAuthHandler() + self.opener = mechanize.build_opener(handler, self.proxy_digest_handler) + + def test_proxy_with_bad_password_raises_httperror(self): + self.proxy_digest_handler.add_password(self.REALM, self.URL, + self.USER, self.PASSWD+"bad") + self.assertRaises(mechanize.HTTPError, + self.opener.open, + self.URL) + + def test_proxy_with_no_password_raises_httperror(self): + 
self.assertRaises(mechanize.HTTPError, + self.opener.open, + self.URL) + + def test_proxy_qop_auth_works(self): + self.proxy_digest_handler.add_password(self.REALM, self.URL, + self.USER, self.PASSWD) + result = self.opener.open(self.URL) + while result.read(): + pass + result.close() + + def test_proxy_qop_auth_int_works_or_throws_urlerror(self): + server = self._make_server("auth-int") + self.add_teardown(lambda: server.stop()) + self.proxy_digest_handler.add_password(self.REALM, self.URL, + self.USER, self.PASSWD) + try: + result = self.opener.open(self.URL) + except mechanize.URLError: + # It's okay if we don't support auth-int, but we certainly + # shouldn't receive any kind of exception here other than + # a URLError. + result = None + if result: + while result.read(): + pass + result.close() + + +class RecordingHTTPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): + + server_version = "TestHTTP/" + protocol_version = "HTTP/1.0" + + def __init__(self, port, get_next_response, + record_request, record_received_headers, + *args, **kwds): + self._port = port + self._get_next_response = get_next_response + self._record_request = record_request + self._record_received_headers = record_received_headers + BaseHTTPServer.BaseHTTPRequestHandler.__init__(self, *args, **kwds) + + def do_GET(self): + body = self.send_head() + if body: + self.wfile.write(body) + + def do_POST(self): + content_length = self.headers['Content-Length'] + post_data = self.rfile.read(int(content_length)) + self.do_GET() + self._record_request(post_data) + + def send_head(self): + self._record_received_headers(self.headers) + self._record_request(self.path) + response_code, headers, body = self._get_next_response() + + self.send_response(response_code) + + for (header, value) in headers: + self.send_header(header, value % self._port) + if body: + self.send_header('Content-type', 'text/plain') + self.end_headers() + return body + self.end_headers() + + def log_message(self, *args): + pass + 
+ +class FakeHTTPRequestHandler(object): + + def __init__(self, port, responses): + self.port = port + self._responses = responses + self.requests = [] + self.received_headers = None + + def _get_next_response(self): + return self._responses.pop(0) + + def _record_request(self, request): + self.requests.append(request) + + def _record_received_headers(self, headers): + self.received_headers = headers + + def handle_request(self, *args, **kwds): + RecordingHTTPRequestHandler( + self.port, self._get_next_response, + self._record_request, self._record_received_headers, + *args, **kwds) + + +class TestUrlopen(TestCase): + """Tests mechanize.urlopen using the network. + + These tests are not exhaustive. Assuming that testing using files does a + good job overall of some of the basic interface features. There are no + tests exercising the optional 'data' and 'proxies' arguments. No tests + for transparent redirection have been written. + """ + + fixture_name = "test_urllib2_localnet_TestUrlopen_server" + + def setUp(self): + TestCase.setUp(self) + self.register_context_manager( + self.fixture_name, testprogram.ServerCM(make_started_server)) + + def get_server(self): + return self.get_cached_fixture(self.fixture_name) + + def _make_request_handler(self, responses): + server = self.get_server() + handler = FakeHTTPRequestHandler(server.port, responses) + server.set_request_handler(handler) + return handler + + def test_redirection(self): + expected_response = 'We got here...' + responses = [ + (302, [('Location', 'http://localhost:%s/somewhere_else')], ''), + (200, [], expected_response) + ] + + handler = self._make_request_handler(responses) + + f = mechanize.urlopen('http://localhost:%s/' % handler.port) + data = f.read() + f.close() + + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ['/', '/somewhere_else']) + + def test_404(self): + expected_response = 'Bad bad bad...' 
+ handler = self._make_request_handler([(404, [], expected_response)]) + + try: + mechanize.urlopen('http://localhost:%s/weeble' % handler.port) + except mechanize.URLError, f: + pass + else: + self.fail('404 should raise URLError') + + data = f.read() + f.close() + + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ['/weeble']) + + def test_200(self): + expected_response = 'pycon 2008...' + handler = self._make_request_handler([(200, [], expected_response)]) + + f = mechanize.urlopen('http://localhost:%s/bizarre' % handler.port) + data = f.read() + f.close() + + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ['/bizarre']) + + def test_200_with_parameters(self): + expected_response = 'pycon 2008...' + handler = self._make_request_handler([(200, [], expected_response)]) + + f = mechanize.urlopen('http://localhost:%s/bizarre' % handler.port, + 'get=with_feeling') + data = f.read() + f.close() + + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ['/bizarre', 'get=with_feeling']) + + def test_sending_headers(self): + handler = self._make_request_handler([(200, [], "we don't care")]) + + req = mechanize.Request("http://localhost:%s/" % handler.port, + headers={'Range': 'bytes=20-39'}) + mechanize.urlopen(req) + self.assertEqual(handler.received_headers['Range'], 'bytes=20-39') + + def test_basic(self): + handler = self._make_request_handler([(200, [], "we don't care")]) + + open_url = mechanize.urlopen("http://localhost:%s" % handler.port) + for attr in ("read", "close", "info", "geturl"): + self.assertTrue(hasattr(open_url, attr), "object returned from " + "urlopen lacks the %s attribute" % attr) + try: + self.assertTrue(open_url.read(), "calling 'read' failed") + finally: + open_url.close() + + def test_info(self): + handler = self._make_request_handler([(200, [], "we don't care")]) + + open_url = mechanize.urlopen("http://localhost:%s" % handler.port) + info_obj = 
open_url.info() + self.assertTrue(isinstance(info_obj, mimetools.Message), + "object returned by 'info' is not an instance of " + "mimetools.Message") + self.assertEqual(info_obj.getsubtype(), "plain") + + def test_geturl(self): + # Make sure same URL as opened is returned by geturl. + handler = self._make_request_handler([(200, [], "we don't care")]) + + open_url = mechanize.urlopen("http://localhost:%s" % handler.port) + url = open_url.geturl() + self.assertEqual(url, "http://localhost:%s" % handler.port) + + def test_bad_address(self): + # Make sure proper exception is raised when connecting to a bogus + # address. + self.assertRaises(IOError, + # Given that both VeriSign and various ISPs have in + # the past or are presently hijacking various invalid + # domain name requests in an attempt to boost traffic + # to their own sites, finding a domain name to use + # for this test is difficult. RFC2606 leads one to + # believe that '.invalid' should work, but experience + # seemed to indicate otherwise. Single character + # TLDs are likely to remain invalid, so this seems to + # be the best choice. The trailing '.' prevents a + # related problem: The normal DNS resolver appends + # the domain names from the search path if there is + # no '.' the end and, and if one of those domains + # implements a '*' rule a result is returned. + # However, none of this will prevent the test from + # failing if the ISP hijacks all invalid domain + # requests. The real solution would be to be able to + # parameterize the framework with a mock resolver. 
+ mechanize.urlopen, "http://sadflkjsasf.i.nvali.d./") + + +if __name__ == "__main__": + unittest.main() diff --git a/LTA/LTAIngest/mechanize-0.2.5/test/test_useragent.py b/LTA/LTAIngest/mechanize-0.2.5/test/test_useragent.py new file mode 100644 index 0000000000000000000000000000000000000000..cfd218387055d11a51a6bb0f8ed68a45b45586ef --- /dev/null +++ b/LTA/LTAIngest/mechanize-0.2.5/test/test_useragent.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +from unittest import TestCase + +import mechanize + +from test_browser import make_mock_handler + + +class UserAgentTests(TestCase): + + def _get_handler_from_ua(self, ua, name): + handler = ua._ua_handlers.get(name) + self.assertTrue(handler in ua.handlers) + return handler + + def test_set_proxies(self): + ua = mechanize.UserAgentBase() + def proxy_bypass(hostname): + return False + proxies = {"http": "http://spam"} + ua.set_proxies(proxies, proxy_bypass) + proxy_handler = self._get_handler_from_ua(ua, "_proxy") + self.assertTrue(proxy_handler._proxy_bypass is proxy_bypass) + self.assertTrue(proxy_handler.proxies, proxies) + + def test_set_handled_schemes(self): + class MockHandlerClass(make_mock_handler()): + def __call__(self): return self + class BlahHandlerClass(MockHandlerClass): pass + class BlahProcessorClass(MockHandlerClass): pass + BlahHandler = BlahHandlerClass([("blah_open", None)]) + BlahProcessor = BlahProcessorClass([("blah_request", None)]) + class TestUserAgent(mechanize.UserAgent): + default_schemes = ["http"] + default_others = [] + default_features = [] + handler_classes = mechanize.UserAgent.handler_classes.copy() + handler_classes.update( + {"blah": BlahHandler, "_blah": BlahProcessor}) + ua = TestUserAgent() + + self.assertEqual(list(h.__class__.__name__ for h in ua.handlers), + ["HTTPHandler"]) + ua.set_handled_schemes(["http", "file"]) + self.assertEqual(sorted(h.__class__.__name__ for h in ua.handlers), + ["FileHandler", "HTTPHandler"]) + self.assertRaises(ValueError, + ua.set_handled_schemes, 
["blah", "non-existent"]) + self.assertRaises(ValueError, + ua.set_handled_schemes, ["blah", "_blah"]) + ua.set_handled_schemes(["blah"]) + + req = mechanize.Request("blah://example.com/") + r = ua.open(req) + exp_calls = [("blah_open", (req,), {})] + assert len(ua.calls) == len(exp_calls) + for got, expect in zip(ua.calls, exp_calls): + self.assertEqual(expect, got[1:]) + + ua.calls = [] + req = mechanize.Request("blah://example.com/") + ua._set_handler("_blah", True) + r = ua.open(req) + exp_calls = [ + ("blah_request", (req,), {}), + ("blah_open", (req,), {})] + assert len(ua.calls) == len(exp_calls) + for got, expect in zip(ua.calls, exp_calls): + self.assertEqual(expect, got[1:]) + ua._set_handler("_blah", True) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/LTA/LTAIngest/mechanize/__init__.py b/LTA/LTAIngest/mechanize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c4429be394a39a739b8ceae874a79013a297c3d4 --- /dev/null +++ b/LTA/LTAIngest/mechanize/__init__.py @@ -0,0 +1,211 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 'DefaultFactory', + 'FTPHandler', + 'Factory', + 'FileCookieJar', + 'FileHandler', + 'FormNotFoundError', + 'FormsFactory', + 'HTTPBasicAuthHandler', + 'HTTPCookieProcessor', + 'HTTPDefaultErrorHandler', + 'HTTPDigestAuthHandler', + 'HTTPEquivProcessor', + 'HTTPError', + 'HTTPErrorProcessor', + 'HTTPHandler', + 'HTTPPasswordMgr', + 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPProxyPasswordMgr', + 'HTTPRedirectDebugProcessor', + 'HTTPRedirectHandler', + 'HTTPRefererProcessor', + 'HTTPRefreshProcessor', + 'HTTPResponseDebugProcessor', + 'HTTPRobotRulesProcessor', + 'HTTPSClientCertMgr', + 'HeadParser', + 'History', + 'LWPCookieJar', + 'Link', + 'LinkNotFoundError', + 'LinksFactory', 
+ 'LoadError', + 'MSIECookieJar', + 'MozillaCookieJar', + 'OpenerDirector', + 'OpenerFactory', + 'ParseError', + 'ProxyBasicAuthHandler', + 'ProxyDigestAuthHandler', + 'ProxyHandler', + 'Request', + 'RobotExclusionError', + 'RobustFactory', + 'RobustFormsFactory', + 'RobustLinksFactory', + 'RobustTitleFactory', + 'SeekableResponseOpener', + 'TitleFactory', + 'URLError', + 'USE_BARE_EXCEPT', + 'UnknownHandler', + 'UserAgent', + 'UserAgentBase', + 'XHTMLCompatibleHeadParser', + '__version__', + 'build_opener', + 'install_opener', + 'lwp_cookie_str', + 'make_response', + 'request_host', + 'response_seek_wrapper', # XXX deprecate in public interface? + 'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper() + 'str2time', + 'urlopen', + 'urlretrieve', + 'urljoin', + + # ClientForm API + 'AmbiguityError', + 'ControlNotFoundError', + 'FormParser', + 'ItemCountError', + 'ItemNotFoundError', + 'LocateError', + 'Missing', + 'ParseFile', + 'ParseFileEx', + 'ParseResponse', + 'ParseResponseEx', + 'ParseString', + 'XHTMLCompatibleFormParser', + # deprecated + 'CheckboxControl', + 'Control', + 'FileControl', + 'HTMLForm', + 'HiddenControl', + 'IgnoreControl', + 'ImageControl', + 'IsindexControl', + 'Item', + 'Label', + 'ListControl', + 'PasswordControl', + 'RadioControl', + 'ScalarControl', + 'SelectControl', + 'SubmitButtonControl', + 'SubmitControl', + 'TextControl', + 'TextareaControl', + ] + +import logging +import sys + +from _version import __version__ + +# high-level stateful browser-style interface +from _mechanize import \ + Browser, History, \ + BrowserStateError, LinkNotFoundError, FormNotFoundError + +# configurable URL-opener interface +from _useragent import UserAgentBase, UserAgent +from _html import \ + Link, \ + Factory, DefaultFactory, RobustFactory, \ + FormsFactory, LinksFactory, TitleFactory, \ + RobustFormsFactory, RobustLinksFactory, RobustTitleFactory + +# urllib2 work-alike interface. 
This is a superset of the urllib2 interface. +from _urllib2 import * +import _urllib2 +if hasattr(_urllib2, "HTTPSHandler"): + __all__.append("HTTPSHandler") +del _urllib2 + +# misc +from _http import HeadParser +from _http import XHTMLCompatibleHeadParser +from _opener import ContentTooShortError, OpenerFactory, urlretrieve +from _response import \ + response_seek_wrapper, seek_wrapped_response, make_response +from _rfc3986 import urljoin +from _util import http2time as str2time + +# cookies +from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \ + CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \ + effective_request_host +from _lwpcookiejar import LWPCookieJar, lwp_cookie_str +# 2.4 raises SyntaxError due to generator / try/finally use +if sys.version_info[:2] > (2,4): + try: + import sqlite3 + except ImportError: + pass + else: + from _firefox3cookiejar import Firefox3CookieJar +from _mozillacookiejar import MozillaCookieJar +from _msiecookiejar import MSIECookieJar + +# forms +from _form import ( + AmbiguityError, + ControlNotFoundError, + FormParser, + ItemCountError, + ItemNotFoundError, + LocateError, + Missing, + ParseError, + ParseFile, + ParseFileEx, + ParseResponse, + ParseResponseEx, + ParseString, + XHTMLCompatibleFormParser, + # deprecated + CheckboxControl, + Control, + FileControl, + HTMLForm, + HiddenControl, + IgnoreControl, + ImageControl, + IsindexControl, + Item, + Label, + ListControl, + PasswordControl, + RadioControl, + ScalarControl, + SelectControl, + SubmitButtonControl, + SubmitControl, + TextControl, + TextareaControl, + ) + +# If you hate the idea of turning bugs into warnings, do: +# import mechanize; mechanize.USE_BARE_EXCEPT = False +USE_BARE_EXCEPT = True + +logger = logging.getLogger("mechanize") +if logger.level is logging.NOTSET: + logger.setLevel(logging.CRITICAL) +del logger diff --git a/LTA/LTAIngest/mechanize/_auth.py b/LTA/LTAIngest/mechanize/_auth.py new file mode 100644 index 
0000000000000000000000000000000000000000..900e201ea6029d502a17cdab9e043c95b49056e2 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_auth.py @@ -0,0 +1,68 @@ +"""HTTP Authentication and Proxy support. + + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +from _urllib2_fork import HTTPPasswordMgr + + +# TODO: stop deriving from HTTPPasswordMgr +class HTTPProxyPasswordMgr(HTTPPasswordMgr): + # has default realm and host/port + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if uri is None or isinstance(uri, basestring): + uris = [uri] + else: + uris = uri + passwd_by_domain = self.passwd.setdefault(realm, {}) + for uri in uris: + for default_port in True, False: + reduced_uri = self.reduce_uri(uri, default_port) + passwd_by_domain[reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + attempts = [(realm, authuri), (None, authuri)] + # bleh, want default realm to take precedence over default + # URI/authority, hence this outer loop + for default_uri in False, True: + for realm, authuri in attempts: + authinfo_by_domain = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri, authinfo in authinfo_by_domain.iteritems(): + if uri is None and not default_uri: + continue + if self.is_suburi(uri, reduced_authuri): + return authinfo + user, password = None, None + + if user is not None: + break + return user, password + + def reduce_uri(self, uri, default_port=True): + if uri is None: + return None + return HTTPPasswordMgr.reduce_uri(self, uri, default_port) + + def is_suburi(self, base, test): + if base is None: + # default to the proxy's host/port + hostport, path = test + base = (hostport, "/") + return HTTPPasswordMgr.is_suburi(self, base, test) 
+ + +class HTTPSClientCertMgr(HTTPPasswordMgr): + # implementation inheritance: this is not a proper subclass + def add_key_cert(self, uri, key_file, cert_file): + self.add_password(None, uri, key_file, cert_file) + def find_key_cert(self, authuri): + return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/LTA/LTAIngest/mechanize/_beautifulsoup.py b/LTA/LTAIngest/mechanize/_beautifulsoup.py new file mode 100644 index 0000000000000000000000000000000000000000..a157ef279cc49f46d7eb41463e8a6dfddba3cc4d --- /dev/null +++ b/LTA/LTAIngest/mechanize/_beautifulsoup.py @@ -0,0 +1,1077 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +v2.1.1 +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance +into a tree representation. It provides methods and Pythonic idioms +that make it easy to search and modify the tree. + +A well-formed XML/HTML document will yield a well-formed data +structure. An ill-formed XML/HTML document will yield a +correspondingly ill-formed data structure. If your document is only +locally well-formed, you can use this library to find and process the +well-formed part of it. The BeautifulSoup class has heuristics for +obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup has no external dependencies. It works with Python 2.2 +and up. + +Beautiful Soup defines classes for four different parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. + + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML + that trips up BeautifulSoup. + + * BeautifulSOAP, for making it easier to parse XML documents that use + lots of subelements containing a single string, where you'd prefer + they put that string into an attribute (such as SOAP messages). 
+ +You can subclass BeautifulStoneSoup or BeautifulSoup to create a +parsing strategy specific to an XML schema or a particular bizarre +HTML document. Typically your subclass would just override +SELF_CLOSING_TAGS and/or NESTABLE_TAGS. +""" #" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "2.1.1" +__date__ = "$Date$" +__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" +__license__ = "PSF" + +from _sgmllib_copy import SGMLParser, SGMLParseError +import types +import re +import _sgmllib_copy as sgmllib + +class NullType(object): + + """Similar to NoneType with a corresponding singleton instance + 'Null' that, unlike None, accepts any message and returns itself. + + Examples: + >>> Null("send", "a", "message")("and one more", + ... "and what you get still") is Null + True + """ + + def __new__(cls): return Null + def __call__(self, *args, **kwargs): return Null +## def __getstate__(self, *args): return Null + def __getattr__(self, attr): return Null + def __getitem__(self, item): return Null + def __setattr__(self, attr, value): pass + def __setitem__(self, item, value): pass + def __len__(self): return 0 + # FIXME: is this a python bug? otherwise ``for x in Null: pass`` + # never terminates... 
+ def __iter__(self): return iter([]) + def __contains__(self, item): return False + def __repr__(self): return "Null" +Null = object.__new__(NullType) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=Null, previous=Null): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = Null + self.previousSibling = Null + self.nextSibling = Null + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def findNext(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._first(self.fetchNext, name, attrs, text) + firstNext = findNext + + def fetchNext(self, name=None, attrs={}, text=None, limit=None): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._first(self.fetchNextSiblings, name, attrs, text) + firstNextSibling = findNextSibling + + def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) + + def findPrevious(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._first(self.fetchPrevious, name, attrs, text) + + def fetchPrevious(self, name=None, attrs={}, 
text=None, limit=None): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.previousGenerator) + firstPrevious = findPrevious + + def findPreviousSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._first(self.fetchPreviousSiblings, name, attrs, text) + firstPreviousSibling = findPreviousSibling + + def fetchPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, + self.previousSiblingGenerator) + + def findParent(self, name=None, attrs={}): + """Returns the closest parent of this Tag that matches the given + criteria.""" + r = Null + l = self.fetchParents(name, attrs, 1) + if l: + r = l[0] + return r + firstParent = findParent + + def fetchParents(self, name=None, attrs={}, limit=None): + """Returns the parents of this Tag that match the given + criteria.""" + return self._fetch(name, attrs, None, limit, self.parentGenerator) + + #These methods do the real heavy lifting. + + def _first(self, method, name, attrs, text): + r = Null + l = method(name, attrs, text, 1) + if l: + r = l[0] + return r + + def _fetch(self, name, attrs, text, limit, generator): + "Iterates over a generator looking for things that match." 
+ if not hasattr(attrs, 'items'): + attrs = {'class' : attrs} + + results = [] + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + found = None + if isinstance(i, Tag): + if not text: + if not name or self._matches(i, name): + match = True + for attr, matchAgainst in attrs.items(): + check = i.get(attr) + if not self._matches(check, matchAgainst): + match = False + break + if match: + found = i + elif text: + if self._matches(i, text): + found = i + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #Generators that can be used to navigate starting from both + #NavigableTexts and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + def _matches(self, chunk, howToMatch): + #print 'looking for %s in %s' % (howToMatch, chunk) + # + # If given a list of items, return true if the list contains a + # text element that matches. + if isList(chunk) and not isinstance(chunk, Tag): + for tag in chunk: + if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): + return True + return False + if callable(howToMatch): + return howToMatch(chunk) + if isinstance(chunk, Tag): + #Custom match methods take the tag as an argument, but all other + #ways of matching match the tag name as a string + chunk = chunk.name + #Now we know that chunk is a string + if not isinstance(chunk, basestring): + chunk = str(chunk) + if hasattr(howToMatch, 'match'): + # It's a regexp object. 
+ return howToMatch.search(chunk) + if isList(howToMatch): + return chunk in howToMatch + if hasattr(howToMatch, 'items'): + return howToMatch.has_key(chunk) + #It's just a string + return str(howToMatch) == chunk + +class NavigableText(PageElement): + + def __getattr__(self, attr): + "For backwards compatibility, text.string gives you text" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + +class NavigableString(str, NavigableText): + pass + +class NavigableUnicodeString(unicode, NavigableText): + pass + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, name, attrs=None, parent=Null, previous=Null): + "Basic constructor." + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." 
+ return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + fetch() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.fetch, args, kwargs) + + def __getattr__(self, tag): + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.first(tag[:-3]) + elif tag.find('__') != 0: + return self.first(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. 
Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self): + """Renders this tag as a string.""" + return str(self) + + def __unicode__(self): + return self.__str__(1) + + def __str__(self, needUnicode=None, showStructureIndent=None): + """Returns a string or Unicode representation of this tag and + its contents. + + NOTE: since Python's HTML parser consumes whitespace, this + method is not certain to reproduce the whitespace present in + the original string.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + attrs.append('%s="%s"' % (key, val)) + close = '' + closeTag = '' + if self.isSelfClosing(): + close = ' /' + else: + closeTag = '</%s>' % self.name + indentIncrement = None + if showStructureIndent != None: + indentIncrement = showStructureIndent + if not self.hidden: + indentIncrement += 1 + contents = self.renderContents(indentIncrement, needUnicode=needUnicode) + if showStructureIndent: + space = '\n%s' % (' ' * showStructureIndent) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if showStructureIndent: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + s.append(contents) + if closeTag and showStructureIndent != None: + s.append(space) + s.append(closeTag) + s = ''.join(s) + isUnicode = type(s) == types.UnicodeType + if needUnicode and not isUnicode: + s = unicode(s) + elif isUnicode and needUnicode==False: + s = str(s) + return s + + def prettify(self, needUnicode=None): + return 
self.__str__(needUnicode, showStructureIndent=True) + + def renderContents(self, showStructureIndent=None, needUnicode=None): + """Renders the contents of this tag as a (possibly Unicode) + string.""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType: + text = unicode(c) + elif isinstance(c, Tag): + s.append(c.__str__(needUnicode, showStructureIndent)) + elif needUnicode: + text = unicode(c) + else: + text = str(c) + if text: + if showStructureIndent != None: + if text[-1] == '\n': + text = text[:-1] + s.append(text) + return ''.join(s) + + #Soup methods + + def firstText(self, text, recursive=True): + """Convenience method to retrieve the first piece of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.first(recursive=recursive, text=text) + + def fetchText(self, text, recursive=True, limit=None): + """Convenience method to retrieve all pieces of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.fetch(recursive=recursive, text=text, limit=limit) + + def first(self, name=None, attrs={}, recursive=True, text=None): + """Return only the first child of this + Tag matching the given criteria.""" + r = Null + l = self.fetch(name, attrs, recursive, text, 1) + if l: + r = l[0] + return r + findChild = first + + def fetch(self, name=None, attrs={}, recursive=True, text=None, + limit=None): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. 
+ + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._fetch(name, attrs, text, limit, generator) + fetchChildren = fetch + + #Utility methods + + def isSelfClosing(self): + """Returns true iff this is a self-closing tag as defined in the HTML + standard. + + TODO: This is specific to BeautifulSoup and its subclasses, but it's + used by __str__""" + return self.name in BeautifulSoup.SELF_CLOSING_TAGS + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. 
+ Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out + of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and fetch code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "<foo><bar></foo>" actually means + "<foo><bar></bar></foo>". + + [Another possible explanation is "<foo><bar /></foo>", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + #As a public service we will by default silently replace MS smart quotes + #and similar characters with their HTML or ASCII equivalents. + MS_CHARS = { '\x80' : '€', + '\x81' : ' ', + '\x82' : '‚', + '\x83' : 'ƒ', + '\x84' : '„', + '\x85' : '…', + '\x86' : '†', + '\x87' : '‡', + '\x88' : '⁁', + '\x89' : '%', + '\x8A' : 'Š', + '\x8B' : '<', + '\x8C' : 'Œ', + '\x8D' : '?', + '\x8E' : 'Z', + '\x8F' : '?', + '\x90' : '?', + '\x91' : '‘', + '\x92' : '’', + '\x93' : '“', + '\x94' : '”', + '\x95' : '•', + '\x96' : '–', + '\x97' : '—', + '\x98' : '˜', + '\x99' : '™', + '\x9a' : 'š', + '\x9b' : '>', + '\x9c' : 'œ', + '\x9d' : '?', + '\x9e' : 'z', + '\x9f' : 'Ÿ',} + + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda(x):x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda(x):'<!' 
+ x.group(1) + '>'), + (re.compile("([\x80-\x9f])"), + lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1))) + ] + + ROOT_TAG_NAME = '[document]' + + def __init__(self, text=None, avoidParserProblems=True, + initialTextIsEverything=True): + """Initialize this as the 'root tag' and feed in any text to + the parser. + + NOTE about avoidParserProblems: sgmllib will process most bad + HTML, and BeautifulSoup has tricks for dealing with some HTML + that kills sgmllib, but Beautiful Soup can nonetheless choke + or lose data if your data uses self-closing tags or + declarations incorrectly. By default, Beautiful Soup sanitizes + its input to avoid the vast majority of these problems. The + problems are relatively rare, even in bad HTML, so feel free + to pass in False to avoidParserProblems if they don't apply to + you, and you'll get better performance. The only reason I have + this turned on by default is so I don't get so many tech + support questions. + + The two most common instances of invalid HTML that will choke + sgmllib are fixed by the default parser massage techniques: + + <br/> (No space between name of closing tag and tag close) + <! --Comment--> (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + Tag.__init__(self, self.ROOT_TAG_NAME) + if avoidParserProblems \ + and not isList(avoidParserProblems): + avoidParserProblems = self.PARSER_MASSAGE + self.avoidParserProblems = avoidParserProblems + SGMLParser.__init__(self) + self.quoteStack = [] + self.hidden = 1 + self.reset() + if hasattr(text, 'read'): + #It's a file-type object. 
+ text = text.read() + if text: + self.feed(text) + if initialTextIsEverything: + self.done() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def feed(self, text): + if self.avoidParserProblems: + for fix, m in self.avoidParserProblems: + text = fix.sub(m, text) + SGMLParser.feed(self, text) + + def done(self): + """Called when you're done parsing, so that the unclosed tags can be + correctly processed.""" + self.endData() #NEW + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableText): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self): + currentData = ''.join(self.currentData) + if currentData: + if not currentData.strip(): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + c = NavigableString + if type(currentData) == types.UnicodeType: + c = NavigableUnicodeString + o = c(currentData) + 
o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + self.currentData = [] + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: + <p>Foo<b>Bar<p> should pop to 'p', not 'b'. + <p>Foo<table>Bar<p> should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'. + <p>Foo<b>Bar<p> should pop to 'p', not 'b'. + + <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. + <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' + <td><tr><td> *<td>* should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. 
+ popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s" % name + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + if not name in self.SELF_CLOSING_TAGS and not selfClosing: + self._smartPop(name) + tag = Tag(name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or name in self.SELF_CLOSING_TAGS: + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + + def unknown_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print "</%s> is not real!" % name + self.handle_data('</%s>' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def handle_pi(self, text): + "Propagate processing instructions right through." + self.handle_data("<?%s>" % text) + + def handle_comment(self, text): + "Propagate comments right through." + self.handle_data("<!--%s-->" % text) + + def handle_charref(self, ref): + "Propagate char refs right through." 
+ self.handle_data('&#%s;' % ref) + + def handle_entityref(self, ref): + "Propagate entity refs right through." + self.handle_data('&%s;' % ref) + + def handle_decl(self, data): + "Propagate DOCTYPEs and the like right through." + self.handle_data('<!%s>' % data) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as regular data.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + self.handle_data(self.rawdata[i+9:k]) + j = k+3 + else: + try: + j = SGMLParser.parse_declaration(self, i) + except SGMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a <p> tag should implicitly close the previous <p> tag. + + <p>Para1<p>Para2 + should be transformed into: + <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a <blockquote> tag should _not_ implicitly close the previous + <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. 
+ + <table><tr>Blah<tr>Blah + should be transformed into: + <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup before writing your own + subclass.""" + + SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. 
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close (eg.) a 'b' + tag than to actually use nested 'b' tags, and the BeautifulSoup + class handles the common case. This class handles the + not-co-common case: where you can't believe someone wrote what + they did, but it's valid HTML and BeautifulSoup screwed up by + assuming it wouldn't be. 
+ + If this doesn't do what you need, try subclassing this class or + BeautifulSoup, and providing your own list of NESTABLE_TAGS.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class BeautifulSOAP(BeautifulStoneSoup): + """This class will push a tag with only a single string child into + the tag's parent as an attribute. The attribute's name is the tag + name, and the value is the string child. An example should give + the flavor of the change: + + <foo><bar>baz</bar></foo> + => + <foo bar="baz"><bar>baz</bar></foo> + + You can then access fooTag['bar'] instead of fooTag.barTag.string. + + This is, of course, useful for scraping structures that tend to + use subelements instead of attributes, such as SOAP messages. Note + that it modifies its input, so don't print the modified version + out. + + I'm not sure how many people really want to use this class; let me + know if you do. Mainly I like the name.""" + + def popTag(self): + if len(self.tagStack) > 1: + tag = self.tagStack[-1] + parent = self.tagStack[-2] + parent._getAttrMap() + if (isinstance(tag, Tag) and len(tag.contents) == 1 and + isinstance(tag.contents[0], NavigableText) and + not parent.attrMap.has_key(tag.name)): + parent[tag.name] = tag.contents[0] + BeautifulStoneSoup.popTag(self) + +#Enterprise class names! It has come to our attention that some people +#think the names of the Beautiful Soup parser classes are too silly +#and "unprofessional" for use in enterprise screen-scraping. We feel +#your pain! 
For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisitude, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulStoneSoup(sys.stdin.read()) + print soup.prettify() diff --git a/LTA/LTAIngest/mechanize/_clientcookie.py b/LTA/LTAIngest/mechanize/_clientcookie.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed4c878271a11bf564b0a64b377fd8a7fbe6a2a --- /dev/null +++ b/LTA/LTAIngest/mechanize/_clientcookie.py @@ -0,0 +1,1725 @@ +"""HTTP cookie handling for web clients. + +This module originally developed from my port of Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +Comments to John J Lee <jjl@pobox.com>. + + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import sys, re, copy, time, urllib, types, logging +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = "80" + +from _headersutil import split_header_words, parse_ns_headers +from _util import isstringlike +import _rfc3986 + +debug = logging.getLogger("mechanize.cookies").debug + + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + import mechanize, warnings + if not mechanize.USE_BARE_EXCEPT: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + return not (IPV4_RE.search(text) or + text == "" or + text[0] == "." or text[-1] == ".") + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) 
Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) + + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = A.lower() + B = B.lower() + if A == B: + return True + if not is_HDN(A): + return False + i = A.rfind(B) + has_form_nb = not (i == -1 or i == 0) + return ( + has_form_nb and + B.startswith(".") and + is_HDN(B[1:]) + ) + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + return not IPV4_RE.search(text) + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. + + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. 
+ + """ + url = request.get_full_url() + host = _rfc3986.urlsplit(url)[1] + if host is None: + host = request.get_header("Host", "") + # remove port, if present + return cut_port_re.sub("", host, 1) + +def request_host_lc(request): + return request_host(request).lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name).""" + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def eff_request_host_lc(request): + req_host, erhn = eff_request_host(request) + return req_host.lower(), erhn.lower() + +def effective_request_host(request): + """Return the effective request-host, as defined by RFC 2965.""" + return eff_request_host(request)[1] + +def request_path(request): + """Return path component of request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + path = escape_path(_rfc3986.urlsplit(url)[2]) + if not path.startswith("/"): + path = "/" + path + return path + +def request_port(request): + host = request.get_host() + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +def request_is_unverifiable(request): + try: + return request.is_unverifiable() + except AttributeError: + if hasattr(request, "unverifiable"): + return request.unverifiable + else: + raise + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 
+HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + if isinstance(path, types.UnicodeType): + path = path.encode("utf-8") + path = urllib.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. + + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = h.find(".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = b.find(".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. 
+ + """ + req_host = request_host_lc(request) + # the origin request's request-host was stuffed into request by + # _urllib2_support.AbstractHTTPHandler + return not domain_match(req_host, reach(request.origin_req_host)) + + +try: + all +except NameError: + # python 2.4 + def all(iterable): + for x in iterable: + if not x: + return False + return True + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. + + version: integer; + name: string; + value: string (may be None); + port: string; None indicates no attribute was supplied (e.g. "Port", rather + than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list + string (e.g. "80,8080") + port_specified: boolean; true if a value was supplied with the Port + cookie-attribute + domain: string; + domain_specified: boolean; true if Domain was explicitly set + domain_initial_dot: boolean; true if Domain as set in HTTP header by server + started with a dot (yes, this really is necessary!) 
+ path: string; + path_specified: boolean; true if Path was explicitly set + secure: boolean; true if should only be returned over secure connection + expires: integer; seconds since epoch (RFC 2965 cookies should calculate + this value from the Max-Age attribute) + discard: boolean, true if this is a session cookie; (if no expires value, + this should be true) + comment: string; + comment_url: string; + rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not + Set-Cookie2:) header, but had a version cookie-attribute of 1 + rest: mapping of other cookie-attributes + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. + + """ + + + _attrs = ("version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + "rfc2109", "_rest") + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest, + rfc2109=False, + ): + + if version is not None: version = int(version) + if expires is not None: expires = int(expires) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = domain.lower() + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). Needed for the returned $Domain + # value. 
+ self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + self.rfc2109 = rfc2109 + + self._rest = copy.copy(rest) + + def has_nonstandard_attr(self, name): + return self._rest.has_key(name) + def get_nonstandard_attr(self, name, default=None): + return self._rest.get(name, default) + def set_nonstandard_attr(self, name, value): + self._rest[name] = value + def nonstandard_attr_keys(self): + return self._rest.keys() + + def is_expired(self, now=None): + if now is None: now = time.time() + return (self.expires is not None) and (self.expires <= now) + + def __eq__(self, other): + return all(getattr(self, a) == getattr(other, a) for a in self._attrs) + + def __ne__(self, other): + return not (self == other) + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.value is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.name + return "<Cookie %s for %s>" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ["version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + ]: + attr = getattr(self, name) + args.append("%s=%s" % (name, repr(attr))) + args.append("rest=%s" % repr(self._rest)) + args.append("rfc2109=%s" % repr(self.rfc2109)) + return "Cookie(%s)" % ", ".join(args) + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + May also modify cookies. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. 
+ + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. + + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + rfc2109_as_netscape: + WARNING: This argument will change or go away if is not accepted into + the Python standard library in this form! + If true, treat RFC 2109 cookies as though they were Netscape cookies. The + default is for this attribute to be None, which means treat 2109 cookies + as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is, + by default), and as Netscape cookies otherwise. + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). 
+ The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. + + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is + switched off by default. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import mechanize + class MyCookiePolicy(mechanize.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not mechanize.DefaultCookiePolicy.set_ok( + self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. 
There are also some strictness switches that allow + you to tighten up the rather loose Netscape protocol rules a little bit (at + the cost of blocking some benign cookies). + + A domain blacklist and whitelist is provided (both off by default). Only + domains not in the blacklist and present in the whitelist (if the whitelist + is active) participate in cookie setting and returning. Use the + blocked_domains constructor argument, and blocked_domains and + set_blocked_domains methods (and the corresponding argument and methods for + allowed_domains). If you set a whitelist, you can turn it off again by + setting it to None. + + Domains in block or allow lists that do not start with a dot must + string-compare equal. For example, "acme.com" matches a blacklist entry of + "acme.com", but "www.acme.com" does not. Domains that do start with a dot + are matched by more specific domains too. For example, both "www.acme.com" + and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does + not). IP addresses are an exception, and must match exactly. For example, + if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is + blocked, but 193.168.1.2 is not. + + Additional Public Attributes: + + General strictness switches + + strict_domain: don't allow sites to set two-component domains with + country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. + This is far from perfect and isn't guaranteed to work! 
+ + RFC 2965 protocol strictness switches + + strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable + transactions (usually, an unverifiable transaction is one resulting from + a redirect or an image hosted on another site); if this is false, cookies + are NEVER blocked on the basis of verifiability + + Netscape protocol strictness switches + + strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions + even to Netscape cookies + strict_ns_domain: flags indicating how strict to be with domain-matching + rules for Netscape cookies: + DomainStrictNoDots: when setting cookies, host prefix must not contain a + dot (e.g. www.foo.bar.com can't set a cookie for .bar.com, because + www.foo contains a dot) + DomainStrictNonDomain: cookies that did not explicitly specify a Domain + cookie-attribute can only be returned to a domain that string-compares + equal to the domain that set the cookie (e.g. rockets.acme.com won't + be returned cookies from acme.com that had no Domain cookie-attribute) + DomainRFC2965Match: when setting cookies, require a full RFC 2965 + domain-match + DomainLiberal and DomainStrict are the most useful combinations of the + above flags, for convenience + strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that + have names starting with '$' + strict_ns_set_path: don't allow setting cookies whose path doesn't + path-match request URI + + """ + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + # WARNING: this argument will change or go away if is not + # accepted into the Python standard library in this form! + # default, ie. 
treat 2109 as netscape iff not rfc2965 + rfc2109_as_netscape=None, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """ + Constructor arguments should be used as keyword arguments only. + + blocked_domains: sequence of domain names that we never accept cookies + from, nor return cookies to + allowed_domains: if not None, this is a sequence of the only domains + for which we accept and return cookies + + For other arguments, see CookiePolicy.__doc__ and + DefaultCookiePolicy.__doc__.. + + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of 
allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). + + """ + debug(" - checking cookie %s", cookie) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_countrycode_domain(self, cookie, request): + """Return False if explicit cookie domain is not acceptable. + + Called by set_ok_domain, for convenience of overriding by + subclasses. + + """ + if cookie.domain_specified and self.strict_domain: + domain = cookie.domain + # since domain was specified, we know that: + assert domain.startswith(".") + if domain.count(".") == 2: + # domain like .foo.bar + i = domain.rfind(".") + tld = domain[i+1:] + sld = domain[1:i] + if (sld.lower() in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int", + "aero", "biz", "cat", "coop", "info", "jobs", "mobi", + "museum", "name", "pro", "travel", + ] and + len(tld) == 2): + # domain like .co.uk + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if not self.set_ok_countrycode_domain(cookie, request): + debug(" country-code second level domain %s", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + 
debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to return). + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. 
+ debug(" - checking cookie %s", cookie) + + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not 
string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of domain. This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + + # Munge req_host and erhn to always start with a dot, so as to err on + # the side of letting cookies through. + dotted_req_host, dotted_erhn = eff_request_host_lc(request) + if not dotted_req_host.startswith("."): + dotted_req_host = "."+dotted_req_host + if not dotted_erhn.startswith("."): + dotted_erhn = "."+dotted_erhn + if not (dotted_req_host.endswith(domain) or + dotted_erhn.endswith(domain)): + #debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + 
try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + get_policy() + set_policy(policy) + cookies_for_request(request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def get_policy(self): + return self._policy + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def cookies_for_request(self, request): + """Return a list of cookies to be returned to server. + + The returned list of cookie instances is sorted in the order they + should appear in the Cookie: header for return to the server. + + See add_cookie_header.__doc__ for the interface required of the + request argument. + + New in version 0.1.10 + + """ + self._policy._now = self._now = int(time.time()) + cookies = self._cookies_for_request(request) + # add cookies in order of most specific (i.e. 
longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + # this method still exists (alongside cookies_for_request) because it + # is part of an implied protected interface for subclasses of cookiejar + # XXX document that implied interface, or provide another way of + # implementing cookiejars than subclassing + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + The $Version attribute is also added when appropriate (currently only + once per request). + + >>> jar = CookieJar() + >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False, + ... "example.com", False, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([ns_cookie]) + ['foo="bar"'] + >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False, + ... ".example.com", True, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([rfc2965_cookie]) + ['$Version=1', 'foo=bar', '$Domain="example.com"'] + + """ + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (mechanize.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_host, is_unverifiable, get_type, + has_header, get_header, header_items and add_unredirected_header, as + documented by urllib2. 
+ """ + debug("add_cookie_header") + cookies = self.cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header("Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if self._policy.rfc2965 and not self._policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1 and not request.has_header("Cookie2"): + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). 
+ max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + if v is None: + debug(" missing value for max-age attribute") + bad_cookie = True + break + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host_lc(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but 
has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def _make_cookies(self, response, request): + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + 
ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interface required of the + response and request arguments. + + """ + self._policy._now = self._now = int(time.time()) + return [cookie for cookie in self._make_cookies(response, request) + if cookie.expires is None or not cookie.expires <= self._now] + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+ + cookie: mechanize.Cookie instance + """ + c = self._cookies + if not c.has_key(cookie.domain): c[cookie.domain] = {} + c2 = c[cookie.domain] + if not c2.has_key(cookie.path): c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request. + + Look for allowable Set-Cookie: and Set-Cookie2: headers in the response + object passed as argument. Any of these headers that are found are + used to update the state of the object (subject to the policy.set_ok + method's approval). + + The response object (usually be the result of a call to + mechanize.urlopen, or similar) should support an info method, which + returns a mimetools.Message object (in fact, the 'mimetools.Message + object' may be any object that provides a getheaders method). + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_type, get_host, and is_unverifiable, as + documented by mechanize, and the port attribute (the port number). The + request is used to set default values for cookie-attributes as well as + for checking that the cookie is OK to be set. + + """ + debug("extract_cookies: %s", response.info()) + self._policy._now = self._now = int(time.time()) + + for cookie in self._make_cookies(response, request): + if cookie.expires is not None and cookie.expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(cookie.domain, cookie.path, cookie.name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + cookie.domain, cookie.path, cookie.name) + elif self._policy.set_ok(cookie, request): + debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. 
+ + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Discards all cookies held by object which had either no Max-Age or + Expires cookie-attribute or an explicit Discard cookie-attribute, or + which otherwise have ended up with a true discard attribute. For + interactive browsers, the end of a session usually corresponds to + closing the browser window. + + Note that the save method won't save session cookies anyway, unless you + ask otherwise by passing a true ignore_discard argument. + + """ + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). 
+ + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. 
+ + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. 
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/LTA/LTAIngest/mechanize/_debug.py b/LTA/LTAIngest/mechanize/_debug.py new file mode 100644 index 0000000000000000000000000000000000000000..8243969990ddf98865bbcf8bcd910819cc18dfb4 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from _response import response_seek_wrapper +from _urllib2_fork import BaseHandler + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/LTA/LTAIngest/mechanize/_firefox3cookiejar.py b/LTA/LTAIngest/mechanize/_firefox3cookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..a64d70f35d43af4492db93174f55fb74a104fa92 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_firefox3cookiejar.py @@ -0,0 +1,248 @@ +"""Firefox 3 "cookies.sqlite" cookie persistence. 
+ +Copyright 2008 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging +import time + +from _clientcookie import CookieJar, Cookie, MappingIterator +from _util import isstringlike, experimental +debug = logging.getLogger("mechanize.cookies").debug + + +class Firefox3CookieJar(CookieJar): + + """Firefox 3 cookie jar. + + The cookies are stored in Firefox 3's "cookies.sqlite" format. + + Constructor arguments: + + filename: filename of cookies.sqlite (typically found at the top level + of a firefox profile directory) + autoconnect: as a convenience, connect to the SQLite cookies database at + Firefox3CookieJar construction time (default True) + policy: an object satisfying the mechanize.CookiePolicy interface + + Note that this is NOT a FileCookieJar, and there are no .load(), + .save() or .restore() methods. The database is in sync with the + cookiejar object's state after each public method call. + + Following Firefox's own behaviour, session cookies are never saved to + the database. + + The file is created, and an sqlite database written to it, if it does + not already exist. The moz_cookies database table is created if it does + not already exist. 
+ """ + + # XXX + # handle DatabaseError exceptions + # add a FileCookieJar (explicit .save() / .revert() / .load() methods) + + def __init__(self, filename, autoconnect=True, policy=None): + experimental("Firefox3CookieJar is experimental code") + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self._conn = None + if autoconnect: + self.connect() + + def connect(self): + import sqlite3 # not available in Python 2.4 stdlib + self._conn = sqlite3.connect(self.filename) + self._conn.isolation_level = "DEFERRED" + self._create_table_if_necessary() + + def close(self): + self._conn.close() + + def _transaction(self, func): + try: + cur = self._conn.cursor() + try: + result = func(cur) + finally: + cur.close() + except: + self._conn.rollback() + raise + else: + self._conn.commit() + return result + + def _execute(self, query, params=()): + return self._transaction(lambda cur: cur.execute(query, params)) + + def _query(self, query, params=()): + # XXX should we bother with a transaction? 
+ cur = self._conn.cursor() + try: + cur.execute(query, params) + return cur.fetchall() + finally: + cur.close() + + def _create_table_if_necessary(self): + self._execute("""\ +CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT, + value TEXT, host TEXT, path TEXT,expiry INTEGER, + lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""") + + def _cookie_from_row(self, row): + (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) = row + + version = 0 + domain = domain.encode("ascii", "ignore") + path = path.encode("ascii", "ignore") + name = name.encode("ascii", "ignore") + value = value.encode("ascii", "ignore") + secure = bool(secure) + + # last_accessed isn't a cookie attribute, so isn't added to rest + rest = {} + if http_only: + rest["HttpOnly"] = None + + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + domain_specified = initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + return Cookie(version, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + rest) + + def clear(self, domain=None, path=None, name=None): + CookieJar.clear(self, domain, path, name) + where_parts = [] + sql_params = [] + if domain is not None: + where_parts.append("host = ?") + sql_params.append(domain) + if path is not None: + where_parts.append("path = ?") + sql_params.append(path) + if name is not None: + where_parts.append("name = ?") + sql_params.append(name) + where = " AND ".join(where_parts) + if where: + where = " WHERE " + where + def clear(cur): + cur.execute("DELETE FROM moz_cookies%s" % where, + tuple(sql_params)) + self._transaction(clear) + + def _row_from_cookie(self, cookie, cur): + expires = cookie.expires + if cookie.discard: + expires = "" + + domain = unicode(cookie.domain) + path = unicode(cookie.path) + name = unicode(cookie.name) + value = unicode(cookie.value) 
+ secure = bool(int(cookie.secure)) + + if value is None: + value = name + name = "" + + last_accessed = int(time.time()) + http_only = cookie.has_nonstandard_attr("HttpOnly") + + query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""") + pk = query.fetchone()[0] + if pk is None: + pk = 1 + + return (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) + + def set_cookie(self, cookie): + if cookie.discard: + CookieJar.set_cookie(self, cookie) + return + + def set_cookie(cur): + # XXX + # is this RFC 2965-correct? + # could this do an UPDATE instead? + row = self._row_from_cookie(cookie, cur) + name, unused, domain, path = row[1:5] + cur.execute("""\ +DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""", + (domain, path, name)) + cur.execute("""\ +INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +""", row) + self._transaction(set_cookie) + + def __iter__(self): + # session (non-persistent) cookies + for cookie in MappingIterator(self._cookies): + yield cookie + # persistent cookies + for row in self._query("""\ +SELECT * FROM moz_cookies ORDER BY name, path, host"""): + yield self._cookie_from_row(row) + + def _cookies_for_request(self, request): + session_cookies = CookieJar._cookies_for_request(self, request) + def get_cookies(cur): + query = cur.execute("SELECT host from moz_cookies") + domains = [row[0] for row in query.fetchall()] + cookies = [] + for domain in domains: + cookies += self._persistent_cookies_for_domain(domain, + request, cur) + return cookies + persistent_coookies = self._transaction(get_cookies) + return session_cookies + persistent_coookies + + def _persistent_cookies_for_domain(self, domain, request, cur): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + query = cur.execute("""\ +SELECT * from moz_cookies WHERE host = ? 
ORDER BY path""", + (domain,)) + cookies = [self._cookie_from_row(row) for row in query.fetchall()] + last_path = None + r = [] + for cookie in cookies: + if (cookie.path != last_path and + not self._policy.path_return_ok(cookie.path, request)): + last_path = cookie.path + continue + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + r.append(cookie) + return r diff --git a/LTA/LTAIngest/mechanize/_form.py b/LTA/LTAIngest/mechanize/_form.py new file mode 100644 index 0000000000000000000000000000000000000000..d45bdfc395e266f14912d0934326dbbdf7a5a832 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_form.py @@ -0,0 +1,3280 @@ +"""HTML form handling for web clients. + +HTML form handling for web clients: useful for parsing HTML forms, filling them +in and returning the completed forms to the server. This code developed from a +port of Gisle Aas' Perl module HTML::Form, from the libwww-perl library, but +the interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2007 John J. Lee <jjl@pobox.com> +Copyright 2005 Gary Poster +Copyright 2005 Zope Corporation +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +# TODO: +# Clean up post the merge into mechanize +# * Remove code that was duplicated in ClientForm and mechanize +# * Remove weird import stuff +# * Remove pre-Python 2.4 compatibility cruft +# * Clean up tests +# * Later release: Remove the ClientForm 0.1 backwards-compatibility switch +# Remove parser testing hack +# Clean action URI +# Switch to unicode throughout +# See Wichert Akkerman's 2004-01-22 message to c.l.py. +# Apply recommendations from google code project CURLIES +# Apply recommendations from HTML 5 spec +# Add charset parameter to Content-type headers? How to find value?? +# Functional tests to add: +# Single and multiple file upload +# File upload with missing name (check standards) +# mailto: submission & enctype text/plain?? + +# Replace by_label etc. with moniker / selector concept. Allows, e.g., a +# choice between selection by value / id / label / element contents. Or +# choice between matching labels exactly or by substring. etc. + + +__all__ = ['AmbiguityError', 'CheckboxControl', 'Control', + 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm', + 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl', + 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label', + 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile', + 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl', + 'RadioControl', 'ScalarControl', 'SelectControl', + 'SubmitButtonControl', 'SubmitControl', 'TextControl', + 'TextareaControl', 'XHTMLCompatibleFormParser'] + +import HTMLParser +from cStringIO import StringIO +import inspect +import logging +import random +import re +import sys +import urllib +import urlparse +import warnings + +import _beautifulsoup +import _request + +# from Python itself, for backwards compatibility of raised exceptions +import sgmllib +# bundled copy of sgmllib +import _sgmllib_copy + + +VERSION = "0.2.11" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +DEFAULT_ENCODING = 
"latin-1" + +_logger = logging.getLogger("mechanize.forms") +OPTIMIZATION_HACK = True + +def debug(msg, *args, **kwds): + if OPTIMIZATION_HACK: + return + + caller_name = inspect.stack()[1][3] + extended_msg = '%%s %s' % msg + extended_args = (caller_name,)+args + _logger.debug(extended_msg, *extended_args, **kwds) + +def _show_debug_messages(): + global OPTIMIZATION_HACK + OPTIMIZATION_HACK = False + _logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.DEBUG) + _logger.addHandler(handler) + + +def deprecation(message, stack_offset=0): + warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset) + + +class Missing: pass + +_compress_re = re.compile(r"\s+") +def compress_text(text): return _compress_re.sub(" ", text.strip()) + +def normalize_line_endings(text): + return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text) + + +def unescape(data, entities, encoding=DEFAULT_ENCODING): + if data is None or "&" not in data: + return data + + def replace_entities(match, entities=entities, encoding=encoding): + ent = match.group() + if ent[1] == "#": + return unescape_charref(ent[2:-1], encoding) + + repl = entities.get(ent) + if repl is not None: + if type(repl) != type(""): + try: + repl = repl.encode(encoding) + except UnicodeError: + repl = ent + else: + repl = ent + + return repl + + return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) + +def unescape_charref(data, encoding): + name, base = data, 10 + if name.startswith("x"): + name, base= name[1:], 16 + uc = unichr(int(name, base)) + if encoding is None: + return uc + else: + try: + repl = uc.encode(encoding) + except UnicodeError: + repl = "&#%s;" % data + return repl + +def get_entitydefs(): + import htmlentitydefs + from codecs import latin_1_decode + entitydefs = {} + try: + htmlentitydefs.name2codepoint + except AttributeError: + entitydefs = {} + for name, char in htmlentitydefs.entitydefs.items(): + uc = latin_1_decode(char)[0] + if 
uc.startswith("&#") and uc.endswith(";"): + uc = unescape_charref(uc[2:-1], None) + entitydefs["&%s;" % name] = uc + else: + for name, codepoint in htmlentitydefs.name2codepoint.items(): + entitydefs["&%s;" % name] = unichr(codepoint) + return entitydefs + + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +def choose_boundary(): + """Return a string usable as a multipart boundary.""" + # follow IE and firefox + nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2]) + return "-"*27 + nonce + +# This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is a bit nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! 
+ + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + lines = value.split("\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = "".join(lines) + # 2.2 urllib2 doesn't normalize header case + self._http_hdrs.append((key.capitalize(), value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + lines[i].strip() + value = "\r\n".join(lines) + "\r\n" + line = key.title() + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-Type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class LocateError(ValueError): pass +class AmbiguityError(LocateError): pass +class ControlNotFoundError(LocateError): pass +class ItemNotFoundError(LocateError): pass + +class ItemCountError(ValueError): pass + +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +# TODO: move to _html +class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError): + + def __init__(self, *args, **kwds): + Exception.__init__(self, *args, **kwds) + + def __str__(self): + return Exception.__str__(self) + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # thanks to Moshe Zadka for an example of sgmllib/htmllib usage + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + if entitydefs is None: + entitydefs = get_entitydefs() + self._entitydefs = entitydefs + self._encoding = encoding + + self.base = None + self.forms = 
[] + self.labels = [] + self._current_label = None + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + # forms[0] will contain all controls that are outside of any form + # self._global_form is an alias for self.forms[0] + self._global_form = None + self.start_form([]) + self.end_form() + self._current_form = self._global_form = self.forms[0] + + def do_base(self, attrs): + debug("%s", attrs) + for key, value in attrs: + if key == "href": + self.base = self.unescape_attr_if_required(value) + + def end_body(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is not self._global_form: + self.end_form() + + def start_form(self, attrs): + debug("%s", attrs) + if self._current_form is not self._global_form: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = self.unescape_attr_if_required(value) + elif key == "action": + action = self.unescape_attr_if_required(value) + elif key == "method": + method = self.unescape_attr_if_required(value.upper()) + elif key == "enctype": + enctype = self.unescape_attr_if_required(value.lower()) + d[key] = self.unescape_attr_if_required(value) + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is self._global_form: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = self._global_form + + def start_select(self, attrs): + debug("%s", attrs) + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + 
self._select = d + self._add_label(d) + + self._append_select_control({"__select": d}) + + def end_select(self): + debug("") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._optgroup = d + + def end_optgroup(self): + debug("") + if self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + debug("") + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = self._option.get("contents", "").strip() + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + debug("%s", attrs) + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + debug("%s", attrs) + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select 
is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + self._add_label(d) + + self._textarea = d + + def end_textarea(self): + debug("") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def start_label(self, attrs): + debug("%s", attrs) + if self._current_label: + self.end_label() + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + taken = bool(d.get("for")) # empty id is invalid + d["__text"] = "" + d["__taken"] = taken + if taken: + self.labels.append(d) + self._current_label = d + + def end_label(self): + debug("") + label = self._current_label + if label is None: + # something is ugly in the HTML, but we're ignoring it + return + self._current_label = None + # if it is staying around, it is True in all cases + del label["__taken"] + + def _add_label(self, d): + #debug("%s", d) + if self._current_label is not None: + if not self._current_label["__taken"]: + self._current_label["__taken"] = True + d["__label"] = self._current_label + + def handle_data(self, data): + debug("%s", data) + + if self._option is not None: + # self._option is a dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + data = normalize_line_endings(data) + # not if within option or textarea + elif self._current_label is not None: + map = self._current_label + key = "__text" + else: + return + + if data and not map.has_key(key): + # according to + # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line 
break + # immediately after start tags or immediately before end tags must + # be ignored, but real browsers only ignore a line break after a + # start tag, so we'll do that. + if data[0:2] == "\r\n": + data = data[2:] + elif data[0:1] in ["\n", "\r"]: + data = data[1:] + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # e.g. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + self._add_label(d) + controls.append((type, name, d)) + + def do_input(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + self._add_label(d) + controls.append((type, name, d)) + + def do_isindex(self, attrs): + debug("%s", attrs) + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + self._add_label(d) + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = 
self.unescape_attr(val) + else: + # e.g. "__select" -- yuck! + escaped_attrs[key] = self.unescape_attrs(val) + return escaped_attrs + + def unknown_entityref(self, ref): self.handle_data("&%s;" % ref) + def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) + + +class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + HTMLParser.HTMLParser.feed(self, data) + except HTMLParser.HTMLParseError, exc: + raise ParseError(exc) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, "start_" + tag) + except AttributeError: + try: + method = getattr(self, "do_" + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, "end_" + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + def close(self): + HTMLParser.HTMLParser.close(self) + self.end_body() + + +class _AbstractSgmllibParser(_AbstractFormParser): + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + # we override this attr to decode hex charrefs + entity_or_charref = re.compile( + '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)') + def convert_entityref(self, name): + return unescape("&%s;" % name, self._entitydefs, self._encoding) + def convert_charref(self, name): + return unescape_charref("%s" % name, self._encoding) + def unescape_attr_if_required(self, name): + return name # sgmllib already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + +class FormParser(_AbstractSgmllibParser, _sgmllib_copy.SGMLParser): + """Good for tolerance of incorrect HTML, bad for XHTML.""" + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _sgmllib_copy.SGMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + _sgmllib_copy.SGMLParser.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + _sgmllib_copy.SGMLParser.close(self) + self.end_body() + + +class _AbstractBSFormParser(_AbstractSgmllibParser): + + bs_base_class = None + + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _AbstractFormParser.__init__(self, entitydefs, encoding) + self.bs_base_class.__init__(self) + + def handle_data(self, data): + _AbstractFormParser.handle_data(self, data) + self.bs_base_class.handle_data(self, data) + + def feed(self, data): + try: + self.bs_base_class.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + self.bs_base_class.close(self) + 
self.end_body() + + +class RobustFormParser(_AbstractBSFormParser, _beautifulsoup.BeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML.""" + + bs_base_class = _beautifulsoup.BeautifulSoup + + +class NestingRobustFormParser(_AbstractBSFormParser, + _beautifulsoup.ICantBelieveItsBeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML. + + Different from RobustFormParser in that it more often guesses nesting + above missing end tags (see BeautifulSoup docs). + """ + + bs_base_class = _beautifulsoup.ICantBelieveItsBeautifulSoup + + +#FormParser = XHTMLCompatibleFormParser # testing hack +#FormParser = RobustFormParser # testing hack + + +def ParseResponseEx(response, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseResponse, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(response, response.geturl(), + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseFileEx(file, base_uri, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseFile, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. 
The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(file, base_uri, + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseString(text, base_uri, *args, **kwds): + fh = StringIO(text) + return ParseFileEx(fh, base_uri, *args, **kwds) + +def ParseResponse(response, *args, **kwds): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of mechanize.urlopen can be conveniently passed to this + function as the response parameter. + + mechanize.ParseError is raised on parse errors. + + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + request_class: class to return from .click() method (default is + mechanize.Request) + entitydefs: mapping like {"&": "&", ...} containing HTML entity + definitions (a sensible default is used) + encoding: character encoding used for encoding numeric character references + when matching link text. mechanize does not attempt to find the encoding + in a META HTTP-EQUIV attribute in the document itself (mechanize, for + example, does do that and will pass the correct value to mechanize using + this parameter). + + backwards_compat: boolean that determines whether the returned HTMLForm + objects are backwards-compatible with old code. If backwards_compat is + true: + + - ClientForm 0.1 code will continue to work as before. + + - Label searches that do not specify a nr (number or count) will always + get the first match, even if other controls match. 
If + backwards_compat is False, label searches that have ambiguous results + will raise an AmbiguityError. + + - Item label matching is done by strict string comparison rather than + substring matching. + + - De-selecting individual list items is allowed even if the Item is + disabled. + + The backwards_compat argument will be removed in a future release. + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. mechanize.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, mechanize.FormParser (uses + bundled copy of sgmllib.SGMLParser) (the default) works better for ordinary + grubby HTML. Note that HTMLParser is only available in Python 2.2 and + later. You can pass your own class in here as a hack to work around bad + HTML, but at your own risk: there is no well-defined interface. + + """ + return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:] + +def ParseFile(file, base_uri, *args, **kwds): + """Parse HTML and return a list of HTMLForm instances. + + mechanize.ParseError is raised on parse errors. + + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. 
+ + """ + return _ParseFileEx(file, base_uri, *args, **kwds)[1:] + +def _ParseFileEx(file, base_uri, + select_default=False, + ignore_errors=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + backwards_compat=True, + encoding=DEFAULT_ENCODING, + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + if backwards_compat: + deprecation("operating in backwards-compatibility mode", 1) + fp = form_parser_class(entitydefs, encoding) + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + fp.close() + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + labels = [] # Label(label) for label in fp.labels] + id_to_labels = {} + for l in fp.labels: + label = Label(l) + labels.append(label) + for_id = l["for"] + coll = id_to_labels.get(for_id) + if coll is None: + id_to_labels[for_id] = [label] + else: + coll.append(label) + forms = [] + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = _urljoin(base_uri, action) + # would be nice to make HTMLForm class (form builder) pluggable + form = HTMLForm( + action, method, enctype, name, attrs, request_class, + forms, labels, id_to_labels, backwards_compat) + form._urlparse = _urlparse + form._urlunparse = _urlunparse + for ii in range(len(controls)): + type, name, attrs = controls[ii] + # index=ii*10 allows ImageControl to return multiple ordered pairs + form.new_control( + type, name, attrs, select_default=select_default, index=ii*10) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Label: + def __init__(self, attrs): + self.id = attrs.get("for") + self._text = attrs.get("__text").strip() + self._ctext = compress_text(self._text) + self.attrs = attrs + self._backwards_compat = False # maintained by 
HTMLForm + + def __getattr__(self, name): + if name == "text": + if self._backwards_compat: + return self._text + else: + return self._ctext + return getattr(Label, name) + + def __setattr__(self, name, value): + if name == "text": + # don't see any need for this, so make it read-only + raise AttributeError("text attribute is read-only") + self.__dict__[name] = value + + def __str__(self): + return "<Label(id=%r, text=%r)>" % (self.id, self.text) + + +def _get_label(attrs): + text = attrs.get("__label") + if text is not None: + return Label(text) + else: + return None + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm + are accessed using the HTMLForm.find_control method or the + HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions. If you use those functions, you can ignore the + rest of this paragraph. A Control is only properly initialised after the + fixup method has been called. In fact, this is only strictly necessary for + ListControl instances. This is necessary because ListControls are built up + from ListControls each containing only a single item, and their initial + value(s) can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by 'greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + 'successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. 
+ + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. + + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs, index=None): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + self._form = form + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def clear(self): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + return [(k, v) for (i, k, v) in self._totally_ordered_pairs()] + + def _totally_ordered_pairs(self): + """Return list of (key, value, index) tuples. 
+ + Like pairs, but allows preserving correct ordering even where several + controls are involved. + + """ + raise NotImplementedError() + + def _write_mime_data(self, mw, name, value): + """Write data for a subitem of this control to a MimeWriter.""" + # called by HTMLForm + mw2 = mw.nextpart() + mw2.addheader("Content-Disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + def get_labels(self): + """Return all labels (Label instances) for this control. + + If the control was surrounded by a <label> tag, that will be the first + label; all other labels, connected by 'for' and 'id', are in the order + that appear in the HTML. + + """ + res = [] + if self._label: + res.append(self._label) + if self.id: + res.extend(self._form._id_to_labels.get(self.id, ())) + return res + + +#--------------------------------------------------- +class ScalarControl(Control): + """Control whose value is not restricted to one of a prescribed set. + + Some ScalarControls don't accept any value attribute. Otherwise, takes a + single value, which must be string-like. 
+ + Additional read-only public attribute: + + attrs: dictionary mapping the names of original HTML attributes of the + control to their values + + """ + def __init__(self, type, name, attrs, index=None): + self._index = index + self._label = _get_label(attrs) + self.__dict__["type"] = type.lower() + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = attrs.has_key("disabled") + self.readonly = attrs.has_key("readonly") + self.id = attrs.get("id") + + self.attrs = attrs.copy() + + self._clicked = False + + self._urlparse = urlparse.urlparse + self._urlunparse = urlparse.urlunparse + + def __getattr__(self, name): + if name == "value": + return self.__dict__["_value"] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if not isstringlike(value): + raise TypeError("must assign a string") + elif self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + elif self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + self.__dict__["_value"] = value + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _totally_ordered_pairs(self): + name = self.name + value = self.value + if name is None or value is None or self.disabled: + return [] + return [(self._index, name, value)] + + def clear(self): + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self.__dict__["_value"] = None + + def __str__(self): + name = self.name + value = self.value + if name is None: name = "<None>" + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + 
+#--------------------------------------------------- +class TextControl(ScalarControl): + """Textual input control. + + Covers: + + INPUT/TEXT + INPUT/PASSWORD + INPUT/HIDDEN + TEXTAREA + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + if self.type == "hidden": self.readonly = True + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind == "text" + +#--------------------------------------------------- +class FileControl(ScalarControl): + """File upload with INPUT TYPE=FILE. + + The value attribute of a FileControl is always None. Use add_file instead. + + Additional public method: add_file + + """ + + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + self._value = None + self._upload_data = [] + + def is_of_kind(self, kind): return kind == "file" + + def clear(self): + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._upload_data = [] + + def __setattr__(self, name, value): + if name in ("value", "name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def add_file(self, file_object, content_type=None, filename=None): + if not hasattr(file_object, "read"): + raise TypeError("file-like object must have read method") + if content_type is not None and not isstringlike(content_type): + raise TypeError("content type must be None or string-like") + if filename is not None and not isstringlike(filename): + raise TypeError("filename must be None or string-like") + if content_type is None: + content_type = "application/octet-stream" + self._upload_data.append((file_object, content_type, filename)) + + def _totally_ordered_pairs(self): + # XXX should it be successful even if unnamed? 
+ if self.name is None or self.disabled: + return [] + return [(self._index, self.name, "")] + + # If enctype is application/x-www-form-urlencoded and there's a FILE + # control present, what should be sent? Strictly, it should be 'name=data' + # (see HTML 4.01 spec., section 17.13.2), but code sends "name=" ATM. What + # about multiple file upload? + def _write_mime_data(self, mw, _name, _value): + # called by HTMLForm + # assert _name == self.name and _value == '' + if len(self._upload_data) < 2: + if len(self._upload_data) == 0: + file_object = StringIO() + content_type = "application/octet-stream" + filename = "" + else: + file_object, content_type, filename = self._upload_data[0] + if filename is None: + filename = "" + mw2 = mw.nextpart() + fn_part = '; filename="%s"' % filename + disp = 'form-data; name="%s"%s' % (self.name, fn_part) + mw2.addheader("Content-Disposition", disp, prefix=1) + fh = mw2.startbody(content_type, prefix=0) + fh.write(file_object.read()) + else: + # multiple files + mw2 = mw.nextpart() + disp = 'form-data; name="%s"' % self.name + mw2.addheader("Content-Disposition", disp, prefix=1) + fh = mw2.startmultipartbody("mixed", prefix=0) + for file_object, content_type, filename in self._upload_data: + mw3 = mw2.nextpart() + if filename is None: + filename = "" + fn_part = '; filename="%s"' % filename + disp = "file%s" % fn_part + mw3.addheader("Content-Disposition", disp, prefix=1) + fh2 = mw3.startbody(content_type, prefix=0) + fh2.write(file_object.read()) + mw2.lastpart() + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + if not self._upload_data: + value = "<No files added>" + else: + value = [] + for file, ctype, filename in self._upload_data: + if filename is None: + value.append("<Unnamed file>") + else: + value.append(filename) + value = ", ".join(value) + + info = [] + if self.disabled: info.append("disabled") + if self.readonly: info.append("readonly") + info = ", ".join(info) + if info: info = " 
(%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class IsindexControl(ScalarControl): + """ISINDEX control. + + ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really + part of regular HTML forms at all, and predates it. You're only allowed + one ISINDEX per HTML document. ISINDEX and regular form submission are + mutually exclusive -- either submit a form, or the ISINDEX. + + Having said this, since ISINDEX controls may appear in forms (which is + probably bad HTML), ParseFile / ParseResponse will include them in the + HTMLForm instances it returns. You can set the ISINDEX's value, as with + any other control (but note that ISINDEX controls have no name, so you'll + need to use the type argument of set_value!). When you submit the form, + the ISINDEX will not be successful (ie., no data will get returned to the + server as a result of its presence), unless you click on the ISINDEX + control, in which case the ISINDEX gets submitted instead of the form: + + form.set_value("my isindex value", type="isindex") + mechanize.urlopen(form.click(type="isindex")) + + ISINDEX elements outside of FORMs are ignored. If you want to submit one + by hand, do it like so: + + url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value")) + result = mechanize.urlopen(url) + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind in ["text", "clickable"] + + def _totally_ordered_pairs(self): + return [] + + def _click(self, form, coord, return_type, request_class=_request.Request): + # Relative URL for ISINDEX submission: instead of "foo=bar+baz", + # want "bar+baz". + # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is + # deprecated in 4.01, but it should still say how to submit it). 
+ # Submission of ISINDEX is explained in the HTML 3.2 spec, though. + parts = self._urlparse(form.action) + rest, (query, frag) = parts[:-2], parts[-2:] + parts = rest + (urllib.quote_plus(self.value), None) + url = self._urlunparse(parts) + req_data = url, None, [] + + if return_type == "pairs": + return [] + elif return_type == "request_data": + return req_data + else: + return request_class(url) + + def __str__(self): + value = self.value + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s)%s>" % (self.__class__.__name__, value, info) + + +#--------------------------------------------------- +class IgnoreControl(ScalarControl): + """Control that we're not interested in. + + Covers: + + INPUT/RESET + BUTTON/RESET + INPUT/BUTTON + BUTTON/BUTTON + + These controls are always unsuccessful, in the terminology of HTML 4 (ie. + they never require any information to be returned to the server). + + BUTTON/BUTTON is used to generate events for script embedded in HTML. + + The value attribute of IgnoreControl is always None. 
+ + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + self._value = None + + def is_of_kind(self, kind): return False + + def __setattr__(self, name, value): + if name == "value": + raise AttributeError( + "control '%s' is ignored, hence read-only" % self.name) + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + +#--------------------------------------------------- +# ListControls + +# helpers and subsidiary classes + +class Item: + def __init__(self, control, attrs, index=None): + label = _get_label(attrs) + self.__dict__.update({ + "name": attrs["value"], + "_labels": label and [label] or [], + "attrs": attrs, + "_control": control, + "disabled": attrs.has_key("disabled"), + "_selected": False, + "id": attrs.get("id"), + "_index": index, + }) + control.items.append(self) + + def get_labels(self): + """Return all labels (Label instances) for this item. + + For items that represent radio buttons or checkboxes, if the item was + surrounded by a <label> tag, that will be the first label; all other + labels, connected by 'for' and 'id', are in the order that appear in + the HTML. + + For items that represent select options, if the option had a label + attribute, that will be the first label. If the option has contents + (text within the option tags) and it is not the same as the label + attribute (if any), that will be a label. There is nothing in the + spec to my knowledge that makes an option with an id unable to be the + target of a label's for attribute, so those are included, if any, for + the sake of consistency and completeness. 
+ + """ + res = [] + res.extend(self._labels) + if self.id: + res.extend(self._control._form._id_to_labels.get(self.id, ())) + return res + + def __getattr__(self, name): + if name=="selected": + return self._selected + raise AttributeError(name) + + def __setattr__(self, name, value): + if name == "selected": + self._control._set_selected_state(self, value) + elif name == "disabled": + self.__dict__["disabled"] = bool(value) + else: + raise AttributeError(name) + + def __str__(self): + res = self.name + if self.selected: + res = "*" + res + if self.disabled: + res = "(%s)" % res + return res + + def __repr__(self): + # XXX appending the attrs without distinguishing them from name and id + # is silly + attrs = [("name", self.name), ("id", self.id)]+self.attrs.items() + return "<%s %s>" % ( + self.__class__.__name__, + " ".join(["%s=%r" % (k, v) for k, v in attrs]) + ) + +def disambiguate(items, nr, **kwds): + msgs = [] + for key, value in kwds.items(): + msgs.append("%s=%r" % (key, value)) + msg = " ".join(msgs) + if not items: + raise ItemNotFoundError(msg) + if nr is None: + if len(items) > 1: + raise AmbiguityError(msg) + nr = 0 + if len(items) <= nr: + raise ItemNotFoundError(msg) + return items[nr] + +class ListControl(Control): + """Control representing a sequence of items. + + The value attribute of a ListControl represents the successful list items + in the control. The successful list items are those that are selected and + not disabled. + + ListControl implements both list controls that take a length-1 value + (single-selection) and those that take length >1 values + (multiple-selection). + + ListControls accept sequence values only. Some controls only accept + sequences of length 0 or 1 (RADIO, and single-selection SELECT). + In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. 
+ + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the label argument to .get(), and the .set_value_by_label(), + .get_value_by_label() methods. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + """ + + # ListControls are built up by the parser from their component items by + # creating one ListControl per item, consolidating them into a single + # master ListControl held by the HTMLForm: + + # -User calls form.new_control(...) + # -Form creates Control, and calls control.add_to_form(self). + # -Control looks for a Control with the same name and type in the form, + # and if it finds one, merges itself with that control by calling + # control.merge_control(self). The first Control added to the form, of + # a particular name and type, is the only one that survives in the + # form. + # -Form calls control.fixup for all its controls. ListControls in the + # form know they can now safely pick their default values. 
+ + # To create a ListControl without an HTMLForm, use: + + # control.merge_control(new_control) + + # (actually, it's much easier just to use ParseFile) + + _label = None + + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False, index=None): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = type.lower() + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + self._closed = False + + # As Controls are merged in with .merge_control(), self.attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + self.items = [] + self._form = None + + self._select_default = select_default + self._clicked = False + + def clear(self): + self.value = [] + + def is_of_kind(self, kind): + if kind == "list": + return True + elif kind == "multilist": + return bool(self.multiple) + elif kind == "singlelist": + return not self.multiple + else: + return False + + def get_items(self, name=None, label=None, id=None, + exclude_disabled=False): + """Return matching items by name or label. 
+ + For argument docs, see the docstring for .get() + + """ + if name is not None and not isstringlike(name): + raise TypeError("item name must be string-like") + if label is not None and not isstringlike(label): + raise TypeError("item label must be string-like") + if id is not None and not isstringlike(id): + raise TypeError("item id must be string-like") + items = [] # order is important + compat = self._form.backwards_compat + for o in self.items: + if exclude_disabled and o.disabled: + continue + if name is not None and o.name != name: + continue + if label is not None: + for l in o.get_labels(): + if ((compat and l.text == label) or + (not compat and l.text.find(label) > -1)): + break + else: + continue + if id is not None and o.id != id: + continue + items.append(o) + return items + + def get(self, name=None, label=None, id=None, nr=None, + exclude_disabled=False): + """Return item by name or label, disambiguating if necessary with nr. + + All arguments must be passed by name, with the exception of 'name', + which may be used as a positional argument. + + If name is specified, then the item must have the indicated name. + + If label is specified, then the item must have a label whose + whitespace-compressed, stripped, text substring-matches the indicated + label string (e.g. label="please choose" will match + " Do please choose an item "). + + If id is specified, then the item must have the indicated id. + + nr is an optional 0-based index of the items matching the query. + + If nr is the default None value and more than item is found, raises + AmbiguityError (unless the HTMLForm instance's backwards_compat + attribute is true). + + If no item is found, or if items are found but nr is specified and not + found, raises ItemNotFoundError. + + Optionally excludes disabled items. 
+ + """ + if nr is None and self._form.backwards_compat: + nr = 0 # :-/ + items = self.get_items(name, label, id, exclude_disabled) + return disambiguate(items, nr, name=name, label=label, id=id) + + def _get(self, name, by_label=False, nr=None, exclude_disabled=False): + # strictly for use by deprecated methods + if by_label: + name, label = None, name + else: + name, label = name, None + return self.get(name, label, nr, exclude_disabled) + + def toggle(self, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, toggle the matching item's selection. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. + + """ + deprecation( + "item = control.get(...); item.selected = not item.selected") + o = self._get(name, by_label, nr) + self._set_selected_state(o, not o.selected) + + def set(self, selected, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, set the matching item's selection to the bool value of selected. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. 
+ + """ + deprecation( + "control.get(...).selected = <boolean>") + self._set_selected_state(self._get(name, by_label, nr), selected) + + def _set_selected_state(self, item, action): + # action: + # bool False: off + # bool True: on + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + action == bool(action) + compat = self._form.backwards_compat + if not compat and item.disabled: + raise AttributeError("item is disabled") + else: + if compat and item.disabled and action: + raise AttributeError("item is disabled") + if self.multiple: + item.__dict__["_selected"] = action + else: + if not action: + item.__dict__["_selected"] = False + else: + for o in self.items: + o.__dict__["_selected"] = False + item.__dict__["_selected"] = True + + def toggle_single(self, by_label=None): + """Deprecated: toggle the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. + + """ + deprecation( + "control.items[0].selected = not control.items[0].selected") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + item = self.items[0] + self._set_selected_state(item, not item.selected) + + def set_single(self, selected, by_label=None): + """Deprecated: set the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. 
+ + """ + deprecation( + "control.items[0].selected = <boolean>") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + self._set_selected_state(self.items[0], selected) + + def get_item_disabled(self, name, by_label=False, nr=None): + """Get disabled state of named list item in a ListControl.""" + deprecation( + "control.get(...).disabled") + return self._get(name, by_label, nr).disabled + + def set_item_disabled(self, disabled, name, by_label=False, nr=None): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + deprecation( + "control.get(...).disabled = <boolean>") + self._get(name, by_label, nr).disabled = disabled + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for o in self.items: + o.disabled = disabled + + def get_item_attrs(self, name, by_label=False, nr=None): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. 
+ + """ + deprecation( + "control.get(...).attrs") + return self._get(name, by_label, nr).attrs + + def close_control(self): + self._closed = True + + def add_to_form(self, form): + assert self._form is None or form == self._form, ( + "can't add control to more than one form") + self._form = form + if self.name is None: + # always count nameless elements as separate controls + Control.add_to_form(self, form) + else: + for ii in range(len(form.controls)-1, -1, -1): + control = form.controls[ii] + if control.name == self.name and control.type == self.type: + if control._closed: + Control.add_to_form(self, form) + else: + control.merge_control(self) + break + else: + Control.add_to_form(self, form) + + def merge_control(self, control): + assert bool(control.multiple) == bool(self.multiple) + # usually, isinstance(control, self.__class__) + self.items.extend(control.items) + + def fixup(self): + """ + ListControls are built up from component list items (which are also + ListControls) during parsing. This method should be called after all + items have been added. See ListControl.__doc__ for the reason this is + required. + + """ + # Need to set default selection where no item was indicated as being + # selected by the HTML: + + # CHECKBOX: + # Nothing should be selected. + # SELECT/single, SELECT/multiple and RADIO: + # RFC 1866 (HTML 2.0): says first item should be selected. + # W3C HTML 4.01 Specification: says that client behaviour is + # undefined in this case. For RADIO, exactly one must be selected, + # though which one is undefined. + # Both Netscape and Microsoft Internet Explorer (IE) choose first + # item for SELECT/single. However, both IE5 and Mozilla (both 1.0 + # and Firebird 0.6) leave all items unselected for RADIO and + # SELECT/multiple. + + # Since both Netscape and IE all choose the first item for + # SELECT/single, we do the same. 
OTOH, both Netscape and IE + # leave SELECT/multiple with nothing selected, in violation of RFC 1866 + # (but not in violation of the W3C HTML 4 standard); the same is true + # of RADIO (which *is* in violation of the HTML 4 standard). We follow + # RFC 1866 if the _select_default attribute is set, and Netscape and IE + # otherwise. RFC 1866 and HTML 4 are always violated insofar as you + # can deselect all items in a RadioControl. + + for o in self.items: + # set items' controls to self, now that we've merged + o.__dict__["_control"] = self + + def __getattr__(self, name): + if name == "value": + compat = self._form.backwards_compat + if self.name is None: + return [] + return [o.name for o in self.items if o.selected and + (not o.disabled or compat)] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._set_value(value) + elif name in ("name", "type", "multiple"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _set_value(self, value): + if value is None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + if not value: + compat = self._form.backwards_compat + for o in self.items: + if not o.disabled or compat: + o.selected = False + elif self.multiple: + self._multiple_set_value(value) + elif len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + else: + self._single_set_value(value) + + def _get_items(self, name, target=1): + all_items = self.get_items(name) + items = [o for o in all_items if not o.disabled] + if len(items) < target: + if len(all_items) < target: + raise ItemNotFoundError( + "insufficient items with name %r" % name) + else: + 
raise AttributeError( + "insufficient non-disabled items with name %s" % name) + on = [] + off = [] + for o in items: + if o.selected: + on.append(o) + else: + off.append(o) + return on, off + + def _single_set_value(self, value): + assert len(value) == 1 + on, off = self._get_items(value[0]) + assert len(on) <= 1 + if not on: + off[0].selected = True + + def _multiple_set_value(self, value): + compat = self._form.backwards_compat + turn_on = [] # transactional-ish + turn_off = [item for item in self.items if + item.selected and (not item.disabled or compat)] + names = {} + for nn in value: + if nn in names.keys(): + names[nn] += 1 + else: + names[nn] = 1 + for name, count in names.items(): + on, off = self._get_items(name, count) + for i in range(count): + if on: + item = on[0] + del on[0] + del turn_off[turn_off.index(item)] + else: + item = off[0] + del off[0] + turn_on.append(item) + for item in turn_off: + item.selected = False + for item in turn_on: + item.selected = True + + def set_value_by_label(self, value): + """Set the value of control by item labels. + + value is expected to be an iterable of strings that are substrings of + the item labels that should be selected. Before substring matching is + performed, the original label text is whitespace-compressed + (consecutive whitespace characters are converted to a single space + character) and leading and trailing whitespace is stripped. Ambiguous + labels are accepted without complaint if the form's backwards_compat is + True; otherwise, it will not complain as long as all ambiguous labels + share the same item name (e.g. OPTION value). 
+ + """ + if isstringlike(value): + raise TypeError(value) + if not self.multiple and len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + items = [] + for nn in value: + found = self.get_items(label=nn) + if len(found) > 1: + if not self._form.backwards_compat: + # ambiguous labels are fine as long as item names (e.g. + # OPTION values) are same + opt_name = found[0].name + if [o for o in found[1:] if o.name != opt_name]: + raise AmbiguityError(nn) + else: + # OK, we'll guess :-( Assume first available item. + found = found[:1] + for o in found: + # For the multiple-item case, we could try to be smarter, + # saving them up and trying to resolve, but that's too much. + if self._form.backwards_compat or o not in items: + items.append(o) + break + else: # all of them are used + raise ItemNotFoundError(nn) + # now we have all the items that should be on + # let's just turn everything off and then back on. + self.value = [] + for o in items: + o.selected = True + + def get_value_by_label(self): + """Return the value of the control as given by normalized labels.""" + res = [] + compat = self._form.backwards_compat + for o in self.items: + if (not o.disabled or compat) and o.selected: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + + def possible_items(self, by_label=False): + """Deprecated: return the names or labels of all possible items. + + Includes disabled items, which may be misleading for some use cases. 
+ + """ + deprecation( + "[item.name for item in self.items]") + if by_label: + res = [] + for o in self.items: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + return [o.name for o in self.items] + + def _totally_ordered_pairs(self): + if self.disabled or self.name is None: + return [] + else: + return [(o._index, self.name, o.name) for o in self.items + if o.selected and not o.disabled] + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + display = [str(o) for o in self.items] + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, ", ".join(display), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False, index=None): + attrs.setdefault("value", "on") + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True, index=index) + self.__dict__["multiple"] = False + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("checked") + + def fixup(self): + ListControl.fixup(self) + found = [o for o in self.items if o.selected and not o.disabled] + if not found: + if self._select_default: + for o in self.items: + if not o.disabled: + o.selected = True + break + else: + # Ensure only one item selected. Choose the last one, + # following IE and Firefox. 
+ for o in found[:-1]: + o.selected = False + + def get_labels(self): + return [] + +class CheckboxControl(ListControl): + """ + Covers: + + INPUT/CHECKBOX + + """ + def __init__(self, type, name, attrs, select_default=False, index=None): + attrs.setdefault("value", "on") + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True, index=index) + self.__dict__["multiple"] = True + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("checked") + + def get_labels(self): + return [] + + +class SelectControl(ListControl): + """ + Covers: + + SELECT (and OPTION) + + + OPTION 'values', in HTML parlance, are Item 'names' in mechanize parlance. + + SELECT control values and labels are subject to some messy defaulting + rules. For example, if the HTML representation of the control is: + + <SELECT name=year> + <OPTION value=0 label="2002">current year</OPTION> + <OPTION value=1>2001</OPTION> + <OPTION>2000</OPTION> + </SELECT> + + The items, in order, have labels "2002", "2001" and "2000", whereas their + names (the OPTION values) are "0", "1" and "2000" respectively. Note that + the value of the last OPTION in this example defaults to its contents, as + specified by RFC 1866, as do the labels of the second and third OPTIONs. + + The OPTION labels are sometimes more meaningful than the OPTION values, + which can make for more maintainable code. + + Additional read-only public attribute: attrs + + The attrs attribute is a dictionary of the original HTML attributes of the + SELECT element. Other ListControls do not have this attribute, because in + other cases the control as a whole does not correspond to any single HTML + element. control.get(...).attrs may be used as usual to get at the HTML + attributes of the HTML elements corresponding to individual list items (for + SELECT controls, these are OPTION elements). 
+ + Another special case is that the Item.attrs dictionaries have a special key + "contents" which does not correspond to any real HTML attribute, but rather + contains the contents of the OPTION element: + + <OPTION>this bit</OPTION> + + """ + # HTML attributes here are treated slightly differently from other list + # controls: + # -The SELECT HTML attributes dictionary is stuffed into the OPTION + # HTML attributes dictionary under the "__select" key. + # -The content of each OPTION element is stored under the special + # "contents" key of the dictionary. + # After all this, the dictionary is passed to the SelectControl constructor + # as the attrs argument, as usual. However: + # -The first SelectControl constructed when building up a SELECT control + # has a constructor attrs argument containing only the __select key -- so + # this SelectControl represents an empty SELECT control. + # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and + # the __select dictionary containing the SELECT HTML-attributes. + + def __init__(self, type, name, attrs, select_default=False, index=None): + # fish out the SELECT HTML attributes from the OPTION HTML attributes + # dictionary + self.attrs = attrs["__select"].copy() + self.__dict__["_label"] = _get_label(self.attrs) + self.__dict__["id"] = self.attrs.get("id") + self.__dict__["multiple"] = self.attrs.has_key("multiple") + # the majority of the contents, label, and value dance already happened + contents = attrs.get("contents") + attrs = attrs.copy() + del attrs["__select"] + + ListControl.__init__(self, type, name, self.attrs, select_default, + called_as_base_class=True, index=index) + self.disabled = self.attrs.has_key("disabled") + self.readonly = self.attrs.has_key("readonly") + if attrs.has_key("value"): + # otherwise it is a marker 'select started' token + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("selected") + # add 'label' label and contents label, if different. 
If both are + # provided, the 'label' label is used for display in HTML + # 4.0-compliant browsers (and any lower spec? not sure) while the + # contents are used for display in older or less-compliant + # browsers. We make label objects for both, if the values are + # different. + label = attrs.get("label") + if label: + o._labels.append(Label({"__text": label})) + if contents and contents != label: + o._labels.append(Label({"__text": contents})) + elif contents: + o._labels.append(Label({"__text": contents})) + + def fixup(self): + ListControl.fixup(self) + # Firefox doesn't exclude disabled items from those considered here + # (i.e. from 'found', for both branches of the if below). Note that + # IE6 doesn't support the disabled attribute on OPTIONs at all. + found = [o for o in self.items if o.selected] + if not found: + if not self.multiple or self._select_default: + for o in self.items: + if not o.disabled: + was_disabled = self.disabled + self.disabled = False + try: + o.selected = True + finally: + o.disabled = was_disabled + break + elif not self.multiple: + # Ensure only one item selected. Choose the last one, + # following IE and Firefox. + for o in found[:-1]: + o.selected = False + + +#--------------------------------------------------- +class SubmitControl(ScalarControl): + """ + Covers: + + INPUT/SUBMIT + BUTTON/SUBMIT + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it + # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem + # to define this. 
+ if self.value is None: self.value = "" + self.readonly = True + + def get_labels(self): + res = [] + if self.value: + res.append(Label({"__text": self.value})) + res.extend(ScalarControl.get_labels(self)) + return res + + def is_of_kind(self, kind): return kind == "clickable" + + def _click(self, form, coord, return_type, request_class=_request.Request): + self._clicked = coord + r = form._switch_click(return_type, request_class) + self._clicked = False + return r + + def _totally_ordered_pairs(self): + if not self._clicked: + return [] + return ScalarControl._totally_ordered_pairs(self) + + +#--------------------------------------------------- +class ImageControl(SubmitControl): + """ + Covers: + + INPUT/IMAGE + + Coordinates are specified using one of the HTMLForm.click* methods. + + """ + def __init__(self, type, name, attrs, index=None): + SubmitControl.__init__(self, type, name, attrs, index) + self.readonly = False + + def _totally_ordered_pairs(self): + clicked = self._clicked + if self.disabled or not clicked: + return [] + name = self.name + if name is None: return [] + pairs = [ + (self._index, "%s.x" % name, str(clicked[0])), + (self._index+1, "%s.y" % name, str(clicked[1])), + ] + value = self._value + if value: + pairs.append((self._index+2, name, value)) + return pairs + + get_labels = ScalarControl.get_labels + +# aliases, just to make str(control) and str(form) clearer +class PasswordControl(TextControl): pass +class HiddenControl(TextControl): pass +class TextareaControl(TextControl): pass +class SubmitButtonControl(SubmitControl): pass + + +def is_listcontrol(control): return control.is_of_kind("list") + + +class HTMLForm: + """Represents a single HTML <form> ... </form> element. + + A form consists of a sequence of controls that usually have names, and + which can take on various values. The values of the various types of + controls represent variously: text, zero-or-one-of-many or many-of-many + choices, and files to be uploaded. 
Some controls can be clicked on to + submit the form, and clickable controls' values sometimes include the + coordinates of the click. + + Forms can be filled in with data to be returned to the server, and then + submitted, using the click method to generate a request object suitable for + passing to mechanize.urlopen (or the click_request_data or click_pairs + methods for integration with third-party code). + + import mechanize + forms = mechanize.ParseFile(html, base_uri) + form = forms[0] + + form["query"] = "Python" + form.find_control("nr_results").get("lots").selected = True + + response = mechanize.urlopen(form.click()) + + Usually, HTMLForm instances are not created directly. Instead, the + ParseFile or ParseResponse factory functions are used. If you do construct + HTMLForm objects yourself, however, note that an HTMLForm instance is only + properly initialised after the fixup method has been called (ParseFile and + ParseResponse do this for you). See ListControl.__doc__ for the reason + this is required. + + Indexing a form (form["control_name"]) returns the named Control's value + attribute. Assignment to a form index (form["control_name"] = something) + is equivalent to assignment to the named Control's value attribute. If you + need to be more specific than just supplying the control's name, use the + set_value and get_value methods. + + ListControl values are lists of item names (specifically, the names of the + items that are selected and not disabled, and hence are "successful" -- ie. + cause data to be returned to the server). The list item's name is the + value of the corresponding HTML element's"value" attribute. + + Example: + + <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT> + <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT> + + defines a CHECKBOX control with name "cheeses" which has two items, named + "leicester" and "cheddar". 
+ + Another example: + + <SELECT name="more_cheeses"> + <OPTION>1</OPTION> + <OPTION value="2" label="CHEDDAR">cheddar</OPTION> + </SELECT> + + defines a SELECT control with name "more_cheeses" which has two items, + named "1" and "2" (because the OPTION element's value HTML attribute + defaults to the element contents -- see SelectControl.__doc__ for more on + these defaulting rules). + + To select, deselect or otherwise manipulate individual list items, use the + HTMLForm.find_control() and ListControl.get() methods. To set the whole + value, do as for any other control: use indexing or the set_/get_value + methods. + + Example: + + # select *only* the item named "cheddar" + form["cheeses"] = ["cheddar"] + # select "cheddar", leave other items unaffected + form.find_control("cheeses").get("cheddar").selected = True + + Some controls (RADIO and SELECT without the multiple attribute) can only + have zero or one items selected at a time. Some controls (CHECKBOX and + SELECT with the multiple attribute) can have multiple items selected at a + time. To set the whole value of a ListControl, assign a sequence to a form + index: + + form["cheeses"] = ["cheddar", "leicester"] + + If the ListControl is not multiple-selection, the assigned list must be of + length one. + + To check if a control has an item, if an item is selected, or if an item is + successful (selected and not disabled), respectively: + + "cheddar" in [item.name for item in form.find_control("cheeses").items] + "cheddar" in [item.name for item in form.find_control("cheeses").items and + item.selected] + "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses")) + + Note that some list items may be disabled (see below). + + Note the following mistake: + + form[control_name] = control_value + assert form[control_name] == control_value # not necessarily true + + The reason for this is that form[control_name] always gives the list items + in the order they were listed in the HTML. 
+ + List items (hence list values, too) can be referred to in terms of list + item labels rather than list item names using the appropriate label + arguments. Note that each item may have several labels. + + The question of default values of OPTION contents, labels and values is + somewhat complicated: see SelectControl.__doc__ and + ListControl.get_item_attrs.__doc__ if you think you need to know. + + Controls can be disabled or readonly. In either case, the control's value + cannot be changed until you clear those flags (see example below). + Disabled is the state typically represented by browsers by 'greying out' a + control. Disabled controls are not 'successful' -- they don't cause data + to get returned to the server. Readonly controls usually appear in + browsers as read-only text boxes. Readonly controls are successful. List + items can also be disabled. Attempts to select or deselect disabled items + fail with AttributeError. + + If a lot of controls are readonly, it can be useful to do this: + + form.set_all_readonly(False) + + To clear a control's value attribute, so that it is not successful (until a + value is subsequently set): + + form.clear("cheeses") + + More examples: + + control = form.find_control("cheeses") + control.disabled = False + control.readonly = False + control.get("gruyere").disabled = True + control.items[0].selected = True + + See the various Control classes for further documentation. Many methods + take name, type, kind, id, label and nr arguments to specify the control to + be operated on: see HTMLForm.find_control.__doc__. + + ControlNotFoundError (subclass of ValueError) is raised if the specified + control can't be found. This includes occasions where a non-ListControl + is found, but the method (set, for example) requires a ListControl. + ItemNotFoundError (subclass of ValueError) is raised if a list item can't + be found. 
ItemCountError (subclass of ValueError) is raised if an attempt + is made to select more than one item and the control doesn't allow that, or + set/get_single are called and the control contains more than one item. + AttributeError is raised if a control or item is readonly or disabled and + an attempt is made to alter its value. + + Security note: Remember that any passwords you store in HTMLForm instances + will be saved to disk in the clear if you pickle them (directly or + indirectly). The simplest solution to this is to avoid pickling HTMLForm + objects. You could also pickle before filling in any password, or just set + the password to "" before pickling. + + + Public attributes: + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form (None if no name was specified) + attrs: dictionary mapping original HTML form attributes to their values + + controls: list of Control instances; do not alter this list + (instead, call form.new_control to make a Control and add it to the + form, or control.add_to_form if you already have a Control instance) + + + + Methods for form filling: + ------------------------- + + Most of the these methods have very similar arguments. See + HTMLForm.find_control.__doc__ for details of the name, type, kind, label + and nr arguments. 
+ + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None, label=None) + + get_value(name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None) + set_value(value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None) + + clear_all() + clear(name=None, type=None, kind=None, id=None, nr=None, label=None) + + set_all_readonly(readonly) + + + Method applying only to FileControls: + + add_file(file_object, + content_type="application/octet-stream", filename=None, + name=None, id=None, nr=None, label=None) + + + Methods applying only to clickable controls: + + click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None) + click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1), + label=None) + click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None) + + """ + + type2class = { + "text": TextControl, + "password": PasswordControl, + "hidden": HiddenControl, + "textarea": TextareaControl, + + "isindex": IsindexControl, + + "file": FileControl, + + "button": IgnoreControl, + "buttonbutton": IgnoreControl, + "reset": IgnoreControl, + "resetbutton": IgnoreControl, + + "submit": SubmitControl, + "submitbutton": SubmitButtonControl, + "image": ImageControl, + + "radio": RadioControl, + "checkbox": CheckboxControl, + "select": SelectControl, + } + +#--------------------------------------------------- +# Initialisation. Use ParseResponse / ParseFile instead. + + def __init__(self, action, method="GET", + enctype="application/x-www-form-urlencoded", + name=None, attrs=None, + request_class=_request.Request, + forms=None, labels=None, id_to_labels=None, + backwards_compat=True): + """ + In the usual case, use ParseResponse (or ParseFile) to create new + HTMLForm objects. 
+ + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form + attrs: dictionary mapping original HTML form attributes to their values + + """ + self.action = action + self.method = method + self.enctype = enctype + self.name = name + if attrs is not None: + self.attrs = attrs.copy() + else: + self.attrs = {} + self.controls = [] + self._request_class = request_class + + # these attributes are used by zope.testbrowser + self._forms = forms # this is a semi-public API! + self._labels = labels # this is a semi-public API! + self._id_to_labels = id_to_labels # this is a semi-public API! + + self.backwards_compat = backwards_compat # note __setattr__ + + self._urlunparse = urlparse.urlunparse + self._urlparse = urlparse.urlparse + + def __getattr__(self, name): + if name == "backwards_compat": + return self._backwards_compat + return getattr(HTMLForm, name) + + def __setattr__(self, name, value): + # yuck + if name == "backwards_compat": + name = "_backwards_compat" + value = bool(value) + for cc in self.controls: + try: + items = cc.items + except AttributeError: + continue + else: + for ii in items: + for ll in ii.get_labels(): + ll._backwards_compat = value + self.__dict__[name] = value + + def new_control(self, type, name, attrs, + ignore_unknown=False, select_default=False, index=None): + """Adds a new control to the form. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + Note that controls representing lists of items are built up from + controls holding only a single list item. See ListControl.__doc__ for + further information. 
+ + type: type of control (see Control.__doc__ for a list) + attrs: HTML attributes of control + ignore_unknown: if true, use a dummy Control instance for controls of + unknown type; otherwise, use a TextControl + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present (this defaulting happens when the HTMLForm.fixup method is + called) + index: index of corresponding element in HTML (see + MoreFormTests.test_interspersed_controls for motivation) + + """ + type = type.lower() + klass = self.type2class.get(type) + if klass is None: + if ignore_unknown: + klass = IgnoreControl + else: + klass = TextControl + + a = attrs.copy() + if issubclass(klass, ListControl): + control = klass(type, name, a, select_default, index) + else: + control = klass(type, name, a, index) + + if type == "select" and len(attrs) == 1: + for ii in range(len(self.controls)-1, -1, -1): + ctl = self.controls[ii] + if ctl.type == "select": + ctl.close_control() + break + + control.add_to_form(self) + control._urlparse = self._urlparse + control._urlunparse = self._urlunparse + + def fixup(self): + """Normalise form after all controls have been added. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + This method should only be called once, after all controls have been + added to the form. + + """ + for control in self.controls: + control.fixup() + self.backwards_compat = self._backwards_compat + +#--------------------------------------------------- + def __str__(self): + header = "%s%s %s %s" % ( + (self.name and self.name+" " or ""), + self.method, self.action, self.enctype) + rep = [header] + for control in self.controls: + rep.append(" %s" % str(control)) + return "<%s>" % "\n".join(rep) + +#--------------------------------------------------- +# Form-filling methods. 
+ + def __getitem__(self, name): + return self.find_control(name).value + def __contains__(self, name): + return bool(self.find_control(name)) + def __setitem__(self, name, value): + control = self.find_control(name) + try: + control.value = value + except AttributeError, e: + raise ValueError(str(e)) + + def get_value(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None): + """Return value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] + + """ + if by_label: + deprecation("form.get_value_by_label(...)") + c = self.find_control(name, type, kind, id, label=label, nr=nr) + if by_label: + try: + meth = c.get_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + return meth() + else: + return c.value + def set_value(self, value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None): + """Set value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] = value + + """ + if by_label: + deprecation("form.get_value_by_label(...)") + c = self.find_control(name, type, kind, id, label=label, nr=nr) + if by_label: + try: + meth = c.set_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + meth(value) + else: + c.value = value + def get_value_by_label( + self, name=None, type=None, kind=None, id=None, label=None, nr=None): + """ + + All arguments should be passed by name. + + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + return c.get_value_by_label() + + def set_value_by_label( + self, value, + name=None, type=None, kind=None, id=None, label=None, nr=None): + """ + + All arguments should be passed by name. 
+ + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + c.set_value_by_label(value) + + def set_all_readonly(self, readonly): + for control in self.controls: + control.readonly = bool(readonly) + + def clear_all(self): + """Clear the value attributes of all controls in the form. + + See HTMLForm.clear.__doc__. + + """ + for control in self.controls: + control.clear() + + def clear(self, + name=None, type=None, kind=None, id=None, nr=None, label=None): + """Clear the value attribute of a control. + + As a result, the affected control will not be successful until a value + is subsequently set. AttributeError is raised on readonly controls. + + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + c.clear() + + +#--------------------------------------------------- +# Form-filling methods applying only to ListControls. + + def possible_items(self, # deprecated + name=None, type=None, kind=None, id=None, + nr=None, by_label=False, label=None): + """Return a list of all values that the specified control can take.""" + c = self._find_list_control(name, type, kind, id, label, nr) + return c.possible_items(by_label) + + def set(self, selected, item_name, # deprecated + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, label=None): + """Select / deselect named list item. + + selected: boolean selected state + + """ + self._find_list_control(name, type, kind, id, label, nr).set( + selected, item_name, by_label) + def toggle(self, item_name, # deprecated + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, label=None): + """Toggle selected state of named list item.""" + self._find_list_control(name, type, kind, id, label, nr).toggle( + item_name, by_label) + + def set_single(self, selected, # deprecated + name=None, type=None, kind=None, id=None, + nr=None, by_label=None, label=None): + """Select / deselect list item in a control having only one item. 
+ + If the control has multiple list items, ItemCountError is raised. + + This is just a convenience method, so you don't need to know the item's + name -- the item name in these single-item controls is usually + something meaningless like "1" or "on". + + For example, if a checkbox has a single item named "on", the following + two calls are equivalent: + + control.toggle("on") + control.toggle_single() + + """ # by_label ignored and deprecated + self._find_list_control( + name, type, kind, id, label, nr).set_single(selected) + def toggle_single(self, name=None, type=None, kind=None, id=None, + nr=None, by_label=None, label=None): # deprecated + """Toggle selected state of list item in control having only one item. + + The rest is as for HTMLForm.set_single.__doc__. + + """ # by_label ignored and deprecated + self._find_list_control(name, type, kind, id, label, nr).toggle_single() + +#--------------------------------------------------- +# Form-filling method applying only to FileControls. + + def add_file(self, file_object, content_type=None, filename=None, + name=None, id=None, nr=None, label=None): + """Add a file to be uploaded. + + file_object: file-like object (with read method) from which to read + data to upload + content_type: MIME content type of data to upload + filename: filename to pass to server + + If filename is None, no filename is sent to the server. + + If content_type is None, the content type is guessed based on the + filename and the data from read from the file object. + + XXX + At the moment, guessed content type is always application/octet-stream. + Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and + plain text. 
+ + Note the following useful HTML attributes of file upload controls (see + HTML 4.01 spec, section 17): + + accept: comma-separated list of content types that the server will + handle correctly; you can use this to filter out non-conforming files + size: XXX IIRC, this is indicative of whether form wants multiple or + single files + maxlength: XXX hint of max content length in bytes? + + """ + self.find_control(name, "file", id=id, label=label, nr=nr).add_file( + file_object, content_type, filename) + +#--------------------------------------------------- +# Form submission methods, applying only to clickable controls. + + def click(self, name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=_request.Request, + label=None): + """Return request that would result from clicking on a control. + + The request object is a mechanize.Request instance, which you can pass + to mechanize.urlopen. + + Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and + IMAGEs) can be clicked. + + Will click on the first clickable control, subject to the name, type + and nr arguments (as for find_control). If no name, type, id or number + is specified and there are no clickable controls, a request will be + returned for the form in its current, un-clicked, state. + + IndexError is raised if any of name, type, id or nr is specified but no + matching control is found. ValueError is raised if the HTMLForm has an + enctype attribute that is not recognised. + + You can optionally specify a coordinate to click at, which only makes a + difference if you clicked on an image. + + """ + return self._click(name, type, id, label, nr, coord, "request", + self._request_class) + + def click_request_data(self, + name=None, type=None, id=None, + nr=0, coord=(1,1), + request_class=_request.Request, + label=None): + """As for click method, but return a tuple (url, data, headers). + + You can use this data to send a request to the server. 
This is useful + if you're using httplib or urllib rather than mechanize. Otherwise, + use the click method. + + # Untested. Have to subclass to add headers, I think -- so use + # mechanize instead! + import urllib + url, data, hdrs = form.click_request_data() + r = urllib.urlopen(url, data) + + # Untested. I don't know of any reason to use httplib -- you can get + # just as much control with mechanize. + import httplib, urlparse + url, data, hdrs = form.click_request_data() + tup = urlparse(url) + host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:]) + conn = httplib.HTTPConnection(host) + if data: + httplib.request("POST", path, data, hdrs) + else: + httplib.request("GET", path, headers=hdrs) + r = conn.getresponse() + + """ + return self._click(name, type, id, label, nr, coord, "request_data", + self._request_class) + + def click_pairs(self, name=None, type=None, id=None, + nr=0, coord=(1,1), + label=None): + """As for click_request_data, but returns a list of (key, value) pairs. + + You can use this list as an argument to urllib.urlencode. This is + usually only useful if you're using httplib or urllib rather than + mechanize. It may also be useful if you want to manually tweak the + keys and/or values, but this should not be necessary. Otherwise, use + the click method. + + Note that this method is only useful for forms of MIME type + x-www-form-urlencoded. In particular, it does not return the + information required for file upload. If you need file upload and are + not using mechanize, use click_request_data. + """ + return self._click(name, type, id, label, nr, coord, "pairs", + self._request_class) + +#--------------------------------------------------- + + def find_control(self, + name=None, type=None, kind=None, id=None, + predicate=None, nr=None, + label=None): + """Locate and return some specific control within the form. + + At least one of the name, type, kind, predicate and nr arguments must + be supplied. 
If no matching control is found, ControlNotFoundError is + raised. + + If name is specified, then the control must have the indicated name. + + If type is specified then the control must have the specified type (in + addition to the types possible for <input> HTML tags: "text", + "password", "hidden", "submit", "image", "button", "radio", "checkbox", + "file" we also have "reset", "buttonbutton", "submitbutton", + "resetbutton", "textarea", "select" and "isindex"). + + If kind is specified, then the control must fall into the specified + group, each of which satisfies a particular interface. The types are + "text", "list", "multilist", "singlelist", "clickable" and "file". + + If id is specified, then the control must have the indicated id. + + If predicate is specified, then the control must match that function. + The predicate function is passed the control as its single argument, + and should return a boolean value indicating whether the control + matched. + + nr, if supplied, is the sequence number of the control (where 0 is the + first). Note that control 0 is the first control matching all the + other arguments (if supplied); it is not necessarily the first control + in the form. If no nr is supplied, AmbiguityError is raised if + multiple controls match the other arguments (unless the + .backwards-compat attribute is true). + + If label is specified, then the control must have this label. Note + that radio controls and checkboxes never have labels: their items do. + + """ + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (label is None) and (predicate is None) and + (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + return self._find_control(name, type, kind, id, label, predicate, nr) + +#--------------------------------------------------- +# Private methods. 
+ + def _find_list_control(self, + name=None, type=None, kind=None, id=None, + label=None, nr=None): + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (label is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + + return self._find_control(name, type, kind, id, label, + is_listcontrol, nr) + + def _find_control(self, name, type, kind, id, label, predicate, nr): + if ((name is not None) and (name is not Missing) and + not isstringlike(name)): + raise TypeError("control name must be string-like") + if (type is not None) and not isstringlike(type): + raise TypeError("control type must be string-like") + if (kind is not None) and not isstringlike(kind): + raise TypeError("control kind must be string-like") + if (id is not None) and not isstringlike(id): + raise TypeError("control id must be string-like") + if (label is not None) and not isstringlike(label): + raise TypeError("control label must be string-like") + if (predicate is not None) and not callable(predicate): + raise TypeError("control predicate must be callable") + if (nr is not None) and nr < 0: + raise ValueError("control number must be a positive integer") + + orig_nr = nr + found = None + ambiguous = False + if nr is None and self.backwards_compat: + nr = 0 + + for control in self.controls: + if ((name is not None and name != control.name) and + (name is not Missing or control.name is not None)): + continue + if type is not None and type != control.type: + continue + if kind is not None and not control.is_of_kind(kind): + continue + if id is not None and id != control.id: + continue + if predicate and not predicate(control): + continue + if label: + for l in control.get_labels(): + if l.text.find(label) > -1: + break + else: + continue + if nr is not None: + if nr == 0: + return control # early exit: unambiguous due to nr + nr -= 1 + continue + if found: + ambiguous = True + break + found = control + + if found and 
not ambiguous: + return found + + description = [] + if name is not None: description.append("name %s" % repr(name)) + if type is not None: description.append("type '%s'" % type) + if kind is not None: description.append("kind '%s'" % kind) + if id is not None: description.append("id '%s'" % id) + if label is not None: description.append("label '%s'" % label) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr: description.append("nr %d" % orig_nr) + description = ", ".join(description) + + if ambiguous: + raise AmbiguityError("more than one control matching "+description) + elif not found: + raise ControlNotFoundError("no control matching "+description) + assert False + + def _click(self, name, type, id, label, nr, coord, return_type, + request_class=_request.Request): + try: + control = self._find_control( + name, type, "clickable", id, label, None, nr) + except ControlNotFoundError: + if ((name is not None) or (type is not None) or (id is not None) or + (label is not None) or (nr != 0)): + raise + # no clickable controls, but no control was explicitly requested, + # so return state without clicking any control + return self._switch_click(return_type, request_class) + else: + return control._click(self, coord, return_type, request_class) + + def _pairs(self): + """Return sequence of (key, value) pairs suitable for urlencoding.""" + return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()] + + + def _pairs_and_controls(self): + """Return sequence of (index, key, value, control_index) + of totally ordered pairs suitable for urlencoding. 
+ + control_index is the index of the control in self.controls + """ + pairs = [] + for control_index in range(len(self.controls)): + control = self.controls[control_index] + for ii, key, val in control._totally_ordered_pairs(): + pairs.append((ii, key, val, control_index)) + + # stable sort by ONLY first item in tuple + pairs.sort() + + return pairs + + def _request_data(self): + """Return a tuple (url, data, headers).""" + method = self.method.upper() + #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action) + parts = self._urlparse(self.action) + rest, (query, frag) = parts[:-2], parts[-2:] + + if method == "GET": + if self.enctype != "application/x-www-form-urlencoded": + raise ValueError( + "unknown GET form encoding type '%s'" % self.enctype) + parts = rest + (urllib.urlencode(self._pairs()), None) + uri = self._urlunparse(parts) + return uri, None, [] + elif method == "POST": + parts = rest + (query, None) + uri = self._urlunparse(parts) + if self.enctype == "application/x-www-form-urlencoded": + return (uri, urllib.urlencode(self._pairs()), + [("Content-Type", self.enctype)]) + elif self.enctype == "multipart/form-data": + data = StringIO() + http_hdrs = [] + mw = MimeWriter(data, http_hdrs) + mw.startmultipartbody("form-data", add_to_http_hdrs=True, + prefix=0) + for ii, k, v, control_index in self._pairs_and_controls(): + self.controls[control_index]._write_mime_data(mw, k, v) + mw.lastpart() + return uri, data.getvalue(), http_hdrs + else: + raise ValueError( + "unknown POST form encoding type '%s'" % self.enctype) + else: + raise ValueError("Unknown method '%s'" % method) + + def _switch_click(self, return_type, request_class=_request.Request): + # This is called by HTMLForm and clickable Controls to hide switching + # on return_type. 
+ if return_type == "pairs": + return self._pairs() + elif return_type == "request_data": + return self._request_data() + else: + req_data = self._request_data() + req = request_class(req_data[0], req_data[1]) + for key, val in req_data[2]: + add_hdr = req.add_header + if key.lower() == "content-type": + try: + add_hdr = req.add_unredirected_header + except AttributeError: + # pre-2.4 and not using ClientCookie + pass + add_hdr(key, val) + return req diff --git a/LTA/LTAIngest/mechanize/_gzip.py b/LTA/LTAIngest/mechanize/_gzip.py new file mode 100644 index 0000000000000000000000000000000000000000..7e9d6a0ce7de0fe11f5333e6fedefee2c9c1e95f --- /dev/null +++ b/LTA/LTAIngest/mechanize/_gzip.py @@ -0,0 +1,105 @@ +from cStringIO import StringIO + +import _response +import _urllib2_fork + + +# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library +class GzipConsumer: + + def __init__(self, consumer): + self.__consumer = consumer + self.__decoder = None + self.__data = "" + + def __getattr__(self, key): + return getattr(self.__consumer, key) + + def feed(self, data): + if self.__decoder is None: + # check if we have a full gzip header + data = self.__data + data + try: + i = 10 + flag = ord(data[3]) + if flag & 4: # extra + x = ord(data[i]) + 256*ord(data[i+1]) + i = i + 2 + x + if flag & 8: # filename + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 16: # comment + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 2: # crc + i = i + 2 + if len(data) < i: + raise IndexError("not enough data") + if data[:3] != "\x1f\x8b\x08": + raise IOError("invalid gzip data") + data = data[i:] + except IndexError: + self.__data = data + return # need more data + import zlib + self.__data = "" + self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS) + data = self.__decoder.decompress(data) + if data: + self.__consumer.feed(data) + + def close(self): + if self.__decoder: + data = self.__decoder.flush() + if data: + self.__consumer.feed(data) + 
self.__consumer.close() + + +# -------------------------------------------------------------------- + +# the rest of this module is John Lee's stupid code, not +# Fredrik's nice code :-) + +class stupid_gzip_consumer: + def __init__(self): self.data = [] + def feed(self, data): self.data.append(data) + +class stupid_gzip_wrapper(_response.closeable_response): + def __init__(self, response): + self._response = response + + c = stupid_gzip_consumer() + gzc = GzipConsumer(c) + gzc.feed(response.read()) + self.__data = StringIO("".join(c.data)) + + def read(self, size=-1): + return self.__data.read(size) + def readline(self, size=-1): + return self.__data.readline(size) + def readlines(self, sizehint=-1): + return self.__data.readlines(sizehint) + + def __getattr__(self, name): + # delegate unknown methods/attributes + return getattr(self._response, name) + +class HTTPGzipProcessor(_urllib2_fork.BaseHandler): + handler_order = 200 # response processing before HTTPEquivProcessor + + def http_request(self, request): + request.add_header("Accept-Encoding", "gzip") + return request + + def http_response(self, request, response): + # post-process response + enc_hdrs = response.info().getheaders("Content-encoding") + for enc_hdr in enc_hdrs: + if ("gzip" in enc_hdr) or ("compress" in enc_hdr): + return stupid_gzip_wrapper(response) + return response + + https_response = http_response diff --git a/LTA/LTAIngest/mechanize/_headersutil.py b/LTA/LTAIngest/mechanize/_headersutil.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c78e93ba0319eef0bc6e33a34223776e1e0f3f --- /dev/null +++ b/LTA/LTAIngest/mechanize/_headersutil.py @@ -0,0 +1,241 @@ +"""Utility functions for HTTP header value parsing and construction. + +Copyright 1997-1998, Gisle Aas +Copyright 2002-2006, John J. Lee + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import os, re +from types import StringType +from types import UnicodeType +STRING_TYPES = StringType, UnicodeType + +from _util import http2time +import _rfc3986 + + +def is_html_file_extension(url, allow_xhtml): + ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1] + html_exts = [".htm", ".html"] + if allow_xhtml: + html_exts += [".xhtml"] + return ext in html_exts + + +def is_html(ct_headers, url, allow_xhtml=False): + """ + ct_headers: Sequence of Content-Type headers + url: Response URL + + """ + if not ct_headers: + return is_html_file_extension(url, allow_xhtml) + headers = split_header_words(ct_headers) + if len(headers) < 1: + return is_html_file_extension(url, allow_xhtml) + first_header = headers[0] + first_parameter = first_header[0] + ct = first_parameter[0] + html_types = ["text/html"] + if allow_xhtml: + html_types += [ + "text/xhtml", "text/xml", + "application/xml", "application/xhtml+xml", + ] + return ct in html_types + + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +token_re = re.compile(r"^\s*([^=\s;,]+)") +quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +value_re = re.compile(r"^\s*=\s*([^\s;,]*)") +escape_re = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". + + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). 
+ + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1*<any CHAR except CTLs or separators> + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = <any TEXT except <">> + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. Syntactically incorrect + headers will not necessarily be parsed as you would want. + + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert type(header_values) not in STRING_TYPES + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = token_re.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = quoted_value_re.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = escape_re.sub(r"\1", value) + else: + m = value_re.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = value.rstrip() + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif text.lstrip().startswith(","): + # concatenated headers, as per RFC 2616 section 4.2 + text = text.lstrip()[1:] + if pairs: result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + 
(orig_text, text, pairs)) + text = non_junk + if pairs: result.append(pairs) + return result + +join_escape_re = re.compile(r"([\"\\])") +def join_header_words(lists): + """Do the inverse of the conversion done by split_header_words. + + Takes a list of lists of (key, value) pairs and produces a single header + value. Attribute values are quoted if needed. + + >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) + 'text/plain; charset="iso-8859/1"' + >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) + 'text/plain, charset="iso-8859/1"' + + """ + headers = [] + for pairs in lists: + attr = [] + for k, v in pairs: + if v is not None: + if not re.search(r"^\w+$", v): + v = join_escape_re.sub(r"\\\1", v) # escape " and \ + v = '"%s"' % v + if k is None: # Netscape cookies may have no name + k = v + else: + k = "%s=%s" % (k, v) + attr.append(k) + if attr: headers.append("; ".join(attr)) + return ", ".join(headers) + +def strip_quotes(text): + if text.startswith('"'): + text = text[1:] + if text.endswith('"'): + text = text[:-1] + return text + +def parse_ns_headers(ns_headers): + """Ad-hoc parser for Netscape protocol cookie-attributes. + + The old Netscape cookie format for Set-Cookie can for instance contain + an unquoted "," in the expires field, so we have to use this ad-hoc + parser instead of split_header_words. + + XXX This may not make the best possible effort to parse all the crap + that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient + parser is probably better, so could do worse than following that if + this ever gives any trouble. + + Currently, this is also used for parsing RFC 2109 cookies. 
+ + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "version", "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + params = re.split(r";\s*", ns_header) + for ii in range(len(params)): + param = params[ii] + param = param.rstrip() + if param == "": continue + if "=" not in param: + k, v = param, None + else: + k, v = re.split(r"\s*=\s*", param, 1) + k = k.lstrip() + if ii != 0: + lc = k.lower() + if lc in known_attrs: + k = lc + if k == "version": + # This is an RFC 2109 cookie. + v = strip_quotes(v) + version_set = True + if k == "expires": + # convert expires date to seconds since epoch + v = http2time(strip_quotes(v)) # None if invalid + pairs.append((k, v)) + + if pairs: + if not version_set: + pairs.append(("version", "0")) + result.append(pairs) + + return result + + +def _test(): + import doctest, _headersutil + return doctest.testmod(_headersutil) + +if __name__ == "__main__": + _test() diff --git a/LTA/LTAIngest/mechanize/_html.py b/LTA/LTAIngest/mechanize/_html.py new file mode 100644 index 0000000000000000000000000000000000000000..1a4e2c0281571b8c7bbc7da8dcb377835adbb091 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_html.py @@ -0,0 +1,629 @@ +"""HTML handling. + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
+ +""" + +import codecs +import copy +import htmlentitydefs +import re + +import _sgmllib_copy as sgmllib + +import _beautifulsoup +import _form +from _headersutil import split_header_words, is_html as _is_html +import _request +import _rfc3986 + +DEFAULT_ENCODING = "latin-1" + +COMPRESS_RE = re.compile(r"\s+") + + +class CachingGeneratorFunction(object): + """Caching wrapper around a no-arguments iterable.""" + + def __init__(self, iterable): + self._cache = [] + # wrap iterable to make it non-restartable (otherwise, repeated + # __call__ would give incorrect results) + self._iterator = iter(iterable) + + def __call__(self): + cache = self._cache + for item in cache: + yield item + for item in self._iterator: + cache.append(item) + yield item + + +class EncodingFinder: + def __init__(self, default_encoding): + self._default_encoding = default_encoding + def encoding(self, response): + # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV + # headers may be in the response. HTTP-EQUIV headers come last, + # so try in order from first to last. 
+ for ct in response.info().getheaders("content-type"): + for k, v in split_header_words([ct])[0]: + if k == "charset": + encoding = v + try: + codecs.lookup(v) + except LookupError: + continue + else: + return encoding + return self._default_encoding + + +class ResponseTypeFinder: + def __init__(self, allow_xhtml): + self._allow_xhtml = allow_xhtml + def is_html(self, response, encoding): + ct_hdrs = response.info().getheaders("content-type") + url = response.geturl() + # XXX encoding + return _is_html(ct_hdrs, url, self._allow_xhtml) + + +class Args(object): + + # idea for this argument-processing trick is from Peter Otten + + def __init__(self, args_map): + self.__dict__["dictionary"] = dict(args_map) + + def __getattr__(self, key): + try: + return self.dictionary[key] + except KeyError: + return getattr(self.__class__, key) + + def __setattr__(self, key, value): + if key == "dictionary": + raise AttributeError() + self.dictionary[key] = value + + +def form_parser_args( + select_default=False, + form_parser_class=None, + request_class=None, + backwards_compat=False, + ): + return Args(locals()) + + +class Link: + def __init__(self, base_url, url, text, tag, attrs): + assert None not in [url, tag, attrs] + self.base_url = base_url + self.absolute_url = _rfc3986.urljoin(base_url, url) + self.url, self.text, self.tag, self.attrs = url, text, tag, attrs + def __cmp__(self, other): + try: + for name in "url", "text", "tag", "attrs": + if getattr(self, name) != getattr(other, name): + return -1 + except AttributeError: + return -1 + return 0 + def __repr__(self): + return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % ( + self.base_url, self.url, self.text, self.tag, self.attrs) + + +class LinksFactory: + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + import _pullparser + if link_parser_class is None: + link_parser_class = _pullparser.TolerantPullParser + self.link_parser_class = link_parser_class + self.link_class = 
link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._response = None + self._encoding = None + + def set_response(self, response, base_url, encoding): + self._response = response + self._encoding = encoding + self._base_url = base_url + + def links(self): + """Return an iterator that provides links of the document.""" + response = self._response + encoding = self._encoding + base_url = self._base_url + p = self.link_parser_class(response, encoding=encoding) + + try: + for token in p.tags(*(self.urltags.keys()+["base"])): + if token.type == "endtag": + continue + if token.data == "base": + base_href = dict(token.attrs).get("href") + if base_href is not None: + base_url = base_href + continue + attrs = dict(token.attrs) + tag = token.data + text = None + # XXX use attr_encoding for ref'd doc if that doc does not + # provide one by other means + #attr_encoding = attrs.get("charset") + url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? + if not url: + # Probably an <A NAME="blah"> link or <AREA NOHREF...>. + # For our purposes a link is something with a URL, so + # ignore this. + continue + + url = _rfc3986.clean_url(url, encoding) + if tag == "a": + if token.type != "startendtag": + # hmm, this'd break if end tag is missing + text = p.get_compressed_text(("endtag", tag)) + # but this doesn't work for e.g. + # <a href="blah"><b>Andy</b></a> + #text = p.get_compressed_text() + + yield Link(base_url, url, text, tag, token.attrs) + except sgmllib.SGMLParseError, exc: + raise _form.ParseError(exc) + +class FormsFactory: + + """Makes a sequence of objects satisfying HTMLForm interface. + + After calling .forms(), the .global_form attribute is a form object + containing all controls not a descendant of any FORM element. + + For constructor argument docs, see ParseResponse argument docs. 
+ """ + + def __init__(self, + select_default=False, + form_parser_class=None, + request_class=None, + backwards_compat=False, + ): + self.select_default = select_default + if form_parser_class is None: + form_parser_class = _form.FormParser + self.form_parser_class = form_parser_class + if request_class is None: + request_class = _request.Request + self.request_class = request_class + self.backwards_compat = backwards_compat + self._response = None + self.encoding = None + self.global_form = None + + def set_response(self, response, encoding): + self._response = response + self.encoding = encoding + self.global_form = None + + def forms(self): + encoding = self.encoding + forms = _form.ParseResponseEx( + self._response, + select_default=self.select_default, + form_parser_class=self.form_parser_class, + request_class=self.request_class, + encoding=encoding, + _urljoin=_rfc3986.urljoin, + _urlparse=_rfc3986.urlsplit, + _urlunparse=_rfc3986.urlunsplit, + ) + self.global_form = forms[0] + return forms[1:] + +class TitleFactory: + def __init__(self): + self._response = self._encoding = None + + def set_response(self, response, encoding): + self._response = response + self._encoding = encoding + + def _get_title_text(self, parser): + import _pullparser + text = [] + tok = None + while 1: + try: + tok = parser.get_token() + except _pullparser.NoMoreTokensError: + break + if tok.type == "data": + text.append(str(tok)) + elif tok.type == "entityref": + t = unescape("&%s;" % tok.data, + parser._entitydefs, parser.encoding) + text.append(t) + elif tok.type == "charref": + t = unescape_charref(tok.data, parser.encoding) + text.append(t) + elif tok.type in ["starttag", "endtag", "startendtag"]: + tag_name = tok.data + if tok.type == "endtag" and tag_name == "title": + break + text.append(str(tok)) + return COMPRESS_RE.sub(" ", "".join(text).strip()) + + def title(self): + import _pullparser + p = _pullparser.TolerantPullParser( + self._response, encoding=self._encoding) + try: 
+ try: + p.get_tag("title") + except _pullparser.NoMoreTokensError: + return None + else: + return self._get_title_text(p) + except sgmllib.SGMLParseError, exc: + raise _form.ParseError(exc) + + +def unescape(data, entities, encoding): + if data is None or "&" not in data: + return data + + def replace_entities(match): + ent = match.group() + if ent[1] == "#": + return unescape_charref(ent[2:-1], encoding) + + repl = entities.get(ent[1:-1]) + if repl is not None: + repl = unichr(repl) + if type(repl) != type(""): + try: + repl = repl.encode(encoding) + except UnicodeError: + repl = ent + else: + repl = ent + return repl + + return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) + +def unescape_charref(data, encoding): + name, base = data, 10 + if name.startswith("x"): + name, base= name[1:], 16 + uc = unichr(int(name, base)) + if encoding is None: + return uc + else: + try: + repl = uc.encode(encoding) + except UnicodeError: + repl = "&#%s;" % data + return repl + + +class MechanizeBs(_beautifulsoup.BeautifulSoup): + _entitydefs = htmlentitydefs.name2codepoint + # don't want the magic Microsoft-char workaround + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda(x):x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda(x):'<!' 
+ x.group(1) + '>') + ] + + def __init__(self, encoding, text=None, avoidParserProblems=True, + initialTextIsEverything=True): + self._encoding = encoding + _beautifulsoup.BeautifulSoup.__init__( + self, text, avoidParserProblems, initialTextIsEverything) + + def handle_charref(self, ref): + t = unescape("&#%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def handle_entityref(self, ref): + t = unescape("&%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + val = unescape(val, self._entitydefs, self._encoding) + escaped_attrs.append((key, val)) + return escaped_attrs + +class RobustLinksFactory: + + compress_re = COMPRESS_RE + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + if link_parser_class is None: + link_parser_class = MechanizeBs + self.link_parser_class = link_parser_class + self.link_class = link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._bs = None + self._encoding = None + self._base_url = None + + def set_soup(self, soup, base_url, encoding): + self._bs = soup + self._base_url = base_url + self._encoding = encoding + + def links(self): + bs = self._bs + base_url = self._base_url + encoding = self._encoding + for ch in bs.recursiveChildGenerator(): + if (isinstance(ch, _beautifulsoup.Tag) and + ch.name in self.urltags.keys()+["base"]): + link = ch + attrs = bs.unescape_attrs(link.attrs) + attrs_dict = dict(attrs) + if link.name == "base": + base_href = attrs_dict.get("href") + if base_href is not None: + base_url = base_href + continue + url_attr = self.urltags[link.name] + url = attrs_dict.get(url_attr) + if not url: + continue + url = _rfc3986.clean_url(url, encoding) + text = link.fetchText(lambda t: True) + if not text: + # follow _pullparser's weird behaviour rigidly + if link.name == "a": + 
text = "" + else: + text = None + else: + text = self.compress_re.sub(" ", " ".join(text).strip()) + yield Link(base_url, url, text, link.name, attrs) + + +class RobustFormsFactory(FormsFactory): + def __init__(self, *args, **kwds): + args = form_parser_args(*args, **kwds) + if args.form_parser_class is None: + args.form_parser_class = _form.RobustFormParser + FormsFactory.__init__(self, **args.dictionary) + + def set_response(self, response, encoding): + self._response = response + self.encoding = encoding + + +class RobustTitleFactory: + def __init__(self): + self._bs = self._encoding = None + + def set_soup(self, soup, encoding): + self._bs = soup + self._encoding = encoding + + def title(self): + title = self._bs.first("title") + if title == _beautifulsoup.Null: + return None + else: + inner_html = "".join([str(node) for node in title.contents]) + return COMPRESS_RE.sub(" ", inner_html.strip()) + + +class Factory: + """Factory for forms, links, etc. + + This interface may expand in future. + + Public methods: + + set_request_class(request_class) + set_response(response) + forms() + links() + + Public attributes: + + Note that accessing these attributes may raise ParseError. + + encoding: string specifying the encoding of response if it contains a text + document (this value is left unspecified for documents that do not have + an encoding, e.g. an image file) + is_html: true if response contains an HTML document (XHTML may be + regarded as HTML too) + title: page title, or None if no title or not HTML + global_form: form object containing all controls that are not descendants + of any FORM element, or None if the forms_factory does not support + supplying a global form + + """ + + LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"] + + def __init__(self, forms_factory, links_factory, title_factory, + encoding_finder=EncodingFinder(DEFAULT_ENCODING), + response_type_finder=ResponseTypeFinder(allow_xhtml=False), + ): + """ + + Pass keyword arguments only. 
+ + default_encoding: character encoding to use if encoding cannot be + determined (or guessed) from the response. You should turn on + HTTP-EQUIV handling if you want the best chance of getting this right + without resorting to this default. The default value of this + parameter (currently latin-1) may change in future. + + """ + self._forms_factory = forms_factory + self._links_factory = links_factory + self._title_factory = title_factory + self._encoding_finder = encoding_finder + self._response_type_finder = response_type_finder + + self.set_response(None) + + def set_request_class(self, request_class): + """Set request class (mechanize.Request by default). + + HTMLForm instances returned by .forms() will return instances of this + class when .click()ed. + + """ + self._forms_factory.request_class = request_class + + def set_response(self, response): + """Set response. + + The response must either be None or implement the same interface as + objects returned by mechanize.urlopen(). + + """ + self._response = response + self._forms_genf = self._links_genf = None + self._get_title = None + for name in self.LAZY_ATTRS: + try: + delattr(self, name) + except AttributeError: + pass + + def __getattr__(self, name): + if name not in self.LAZY_ATTRS: + return getattr(self.__class__, name) + + if name == "encoding": + self.encoding = self._encoding_finder.encoding( + copy.copy(self._response)) + return self.encoding + elif name == "is_html": + self.is_html = self._response_type_finder.is_html( + copy.copy(self._response), self.encoding) + return self.is_html + elif name == "title": + if self.is_html: + self.title = self._title_factory.title() + else: + self.title = None + return self.title + elif name == "global_form": + self.forms() + return self.global_form + + def forms(self): + """Return iterable over HTMLForm-like objects. + + Raises mechanize.ParseError on failure. 
+ """ + # this implementation sets .global_form as a side-effect, for benefit + # of __getattr__ impl + if self._forms_genf is None: + try: + self._forms_genf = CachingGeneratorFunction( + self._forms_factory.forms()) + except: # XXXX define exception! + self.set_response(self._response) + raise + self.global_form = getattr( + self._forms_factory, "global_form", None) + return self._forms_genf() + + def links(self): + """Return iterable over mechanize.Link-like objects. + + Raises mechanize.ParseError on failure. + """ + if self._links_genf is None: + try: + self._links_genf = CachingGeneratorFunction( + self._links_factory.links()) + except: # XXXX define exception! + self.set_response(self._response) + raise + return self._links_genf() + +class DefaultFactory(Factory): + """Based on sgmllib.""" + def __init__(self, i_want_broken_xhtml_support=False): + Factory.__init__( + self, + forms_factory=FormsFactory(), + links_factory=LinksFactory(), + title_factory=TitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + + def set_response(self, response): + Factory.set_response(self, response) + if response is not None: + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_response( + copy.copy(response), response.geturl(), self.encoding) + self._title_factory.set_response( + copy.copy(response), self.encoding) + +class RobustFactory(Factory): + """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is + DefaultFactory. 
+ + """ + def __init__(self, i_want_broken_xhtml_support=False, + soup_class=None): + Factory.__init__( + self, + forms_factory=RobustFormsFactory(), + links_factory=RobustLinksFactory(), + title_factory=RobustTitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + if soup_class is None: + soup_class = MechanizeBs + self._soup_class = soup_class + + def set_response(self, response): + Factory.set_response(self, response) + if response is not None: + data = response.read() + soup = self._soup_class(self.encoding, data) + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_soup( + soup, response.geturl(), self.encoding) + self._title_factory.set_soup(soup, self.encoding) diff --git a/LTA/LTAIngest/mechanize/_http.py b/LTA/LTAIngest/mechanize/_http.py new file mode 100644 index 0000000000000000000000000000000000000000..657973519dedccbcdfe86715d25fea4f7359ebbc --- /dev/null +++ b/LTA/LTAIngest/mechanize/_http.py @@ -0,0 +1,447 @@ +"""HTTP related handlers. + +Note that some other HTTP handlers live in more specific modules: _auth.py, +_gzip.py, etc. + + +Copyright 2002-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import HTMLParser +from cStringIO import StringIO +import htmlentitydefs +import logging +import robotparser +import socket +import time + +import _sgmllib_copy as sgmllib +from _urllib2_fork import HTTPError, BaseHandler + +from _headersutil import is_html +from _html import unescape, unescape_charref +from _request import Request +from _response import response_seek_wrapper +import _rfc3986 +import _sockettimeout + +debug = logging.getLogger("mechanize").debug +debug_robots = logging.getLogger("mechanize.robots").debug + +# monkeypatch urllib2.HTTPError to show URL +## import urllib2 +## def urllib2_str(self): +## return 'HTTP Error %s: %s (%s)' % ( +## self.code, self.msg, self.geturl()) +## urllib2.HTTPError.__str__ = urllib2_str + + +CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes +DEFAULT_ENCODING = 'latin-1' + +# XXX would self.reset() work, instead of raising this exception? +class EndOfHeadError(Exception): pass +class AbstractHeadParser: + # only these elements are allowed in or before HEAD of document + head_elems = ("html", "head", + "title", "base", + "script", "style", "meta", "link", "object") + _entitydefs = htmlentitydefs.name2codepoint + _encoding = DEFAULT_ENCODING + + def __init__(self): + self.http_equiv = [] + + def start_meta(self, attrs): + http_equiv = content = None + for key, value in attrs: + if key == "http-equiv": + http_equiv = self.unescape_attr_if_required(value) + elif key == "content": + content = self.unescape_attr_if_required(value) + if http_equiv is not None and content is not None: + self.http_equiv.append((http_equiv, content)) + + def end_head(self): + raise EndOfHeadError() + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return 
unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + escaped_attrs[key] = self.unescape_attr(val) + return escaped_attrs + + def unknown_entityref(self, ref): + self.handle_data("&%s;" % ref) + + def unknown_charref(self, ref): + self.handle_data("&#%s;" % ref) + + +class XHTMLCompatibleHeadParser(AbstractHeadParser, + HTMLParser.HTMLParser): + def __init__(self): + HTMLParser.HTMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + +class HeadParser(AbstractHeadParser, sgmllib.SGMLParser): + + def _not_called(self): + assert False + + def __init__(self): + sgmllib.SGMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, method, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + if tag == "meta": + method(attrs) + + def unknown_starttag(self, tag, attrs): + self.handle_starttag(tag, self._not_called, attrs) + + def handle_endtag(self, tag, method): + if tag in self.head_elems: + method() + else: + raise EndOfHeadError() + + def unescape_attr_if_required(self, name): + return self.unescape_attr(name) + +def parse_head(fileobj, parser): + """Return a list of key, value pairs.""" + while 1: + data = fileobj.read(CHUNK) + try: + parser.feed(data) + except EndOfHeadError: + break + if len(data) != CHUNK: + # this should only happen if there is no HTML body, or if + # CHUNK is big + break + return parser.http_equiv + +class HTTPEquivProcessor(BaseHandler): + """Append META HTTP-EQUIV headers to regular HTTP headers.""" + + handler_order = 300 # before handlers that look at HTTP headers + + def __init__(self, head_parser_class=HeadParser, + i_want_broken_xhtml_support=False, + ): + self.head_parser_class = head_parser_class + self._allow_xhtml = i_want_broken_xhtml_support + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + http_message = response.info() + url = response.geturl() + ct_hdrs = http_message.getheaders("content-type") + if is_html(ct_hdrs, url, self._allow_xhtml): + try: + try: + html_headers = parse_head(response, + self.head_parser_class()) + finally: + response.seek(0) + except (HTMLParser.HTMLParseError, + sgmllib.SGMLParseError): + pass + else: + for hdr, val in html_headers: + # add a header + 
http_message.dict[hdr.lower()] = val + text = hdr + ": " + val + for line in text.split("\n"): + http_message.headers.append(line + "\n") + return response + + https_response = http_response + + +class MechanizeRobotFileParser(robotparser.RobotFileParser): + + def __init__(self, url='', opener=None): + robotparser.RobotFileParser.__init__(self, url) + self._opener = opener + self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT + + def set_opener(self, opener=None): + import _opener + if opener is None: + opener = _opener.OpenerDirector() + self._opener = opener + + def set_timeout(self, timeout): + self._timeout = timeout + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + if self._opener is None: + self.set_opener() + req = Request(self.url, unverifiable=True, visit=False, + timeout=self._timeout) + try: + f = self._opener.open(req) + except HTTPError, f: + pass + except (IOError, socket.error, OSError), exc: + debug_robots("ignoring error opening %r: %s" % + (self.url, exc)) + return + lines = [] + line = f.readline() + while line: + lines.append(line.strip()) + line = f.readline() + status = f.code + if status == 401 or status == 403: + self.disallow_all = True + debug_robots("disallow all") + elif status >= 400: + self.allow_all = True + debug_robots("allow all") + elif status == 200 and lines: + debug_robots("parse lines") + self.parse(lines) + +class RobotExclusionError(HTTPError): + def __init__(self, request, *args): + apply(HTTPError.__init__, (self,)+args) + self.request = request + +class HTTPRobotRulesProcessor(BaseHandler): + # before redirections, after everything else + handler_order = 800 + + try: + from httplib import HTTPMessage + except: + from mimetools import Message + http_response_class = Message + else: + http_response_class = HTTPMessage + + def __init__(self, rfp_class=MechanizeRobotFileParser): + self.rfp_class = rfp_class + self.rfp = None + self._host = None + + def http_request(self, request): + scheme 
= request.get_type() + if scheme not in ["http", "https"]: + # robots exclusion only applies to HTTP + return request + + if request.get_selector() == "/robots.txt": + # /robots.txt is always OK to fetch + return request + + host = request.get_host() + + # robots.txt requests don't need to be allowed by robots.txt :-) + origin_req = getattr(request, "_origin_req", None) + if (origin_req is not None and + origin_req.get_selector() == "/robots.txt" and + origin_req.get_host() == host + ): + return request + + if host != self._host: + self.rfp = self.rfp_class() + try: + self.rfp.set_opener(self.parent) + except AttributeError: + debug("%r instance does not support set_opener" % + self.rfp.__class__) + self.rfp.set_url(scheme+"://"+host+"/robots.txt") + self.rfp.set_timeout(request.timeout) + self.rfp.read() + self._host = host + + ua = request.get_header("User-agent", "") + if self.rfp.can_fetch(ua, request.get_full_url()): + return request + else: + # XXX This should really have raised URLError. Too late now... + msg = "request disallowed by robots.txt" + raise RobotExclusionError( + request, + request.get_full_url(), + 403, msg, + self.http_response_class(StringIO()), StringIO(msg)) + + https_request = http_request + +class HTTPRefererProcessor(BaseHandler): + """Add Referer header to requests. + + This only makes sense if you use each RefererProcessor for a single + chain of requests only (so, for example, if you use a single + HTTPRefererProcessor to fetch a series of URLs extracted from a single + page, this will break). + + There's a proper implementation of this in mechanize.Browser. 
+ + """ + def __init__(self): + self.referer = None + + def http_request(self, request): + if ((self.referer is not None) and + not request.has_header("Referer")): + request.add_unredirected_header("Referer", self.referer) + return request + + def http_response(self, request, response): + self.referer = response.geturl() + return response + + https_request = http_request + https_response = http_response + + +def clean_refresh_url(url): + # e.g. Firefox 1.5 does (something like) this + if ((url.startswith('"') and url.endswith('"')) or + (url.startswith("'") and url.endswith("'"))): + url = url[1:-1] + return _rfc3986.clean_url(url, "latin-1") # XXX encoding + +def parse_refresh_header(refresh): + """ + >>> parse_refresh_header("1; url=http://example.com/") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1; url='http://example.com/'") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1") + (1.0, None) + >>> parse_refresh_header("blah") # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ValueError: invalid literal for float(): blah + + """ + + ii = refresh.find(";") + if ii != -1: + pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:] + jj = newurl_spec.find("=") + key = None + if jj != -1: + key, newurl = newurl_spec[:jj], newurl_spec[jj+1:] + newurl = clean_refresh_url(newurl) + if key is None or key.strip().lower() != "url": + raise ValueError() + else: + pause, newurl = float(refresh), None + return pause, newurl + +class HTTPRefreshProcessor(BaseHandler): + """Perform HTTP Refresh redirections. + + Note that if a non-200 HTTP code has occurred (for example, a 30x + redirect), this processor will do nothing. + + By default, only zero-time Refresh headers are redirected. Use the + max_time attribute / constructor argument to allow Refresh with longer + pauses. 
Use the honor_time attribute / constructor argument to control + whether the requested pause is honoured (with a time.sleep()) or + skipped in favour of immediate redirection. + + Public attributes: + + max_time: see above + honor_time: see above + + """ + handler_order = 1000 + + def __init__(self, max_time=0, honor_time=True): + self.max_time = max_time + self.honor_time = honor_time + self._sleep = time.sleep + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code == 200 and hdrs.has_key("refresh"): + refresh = hdrs.getheaders("refresh")[0] + try: + pause, newurl = parse_refresh_header(refresh) + except ValueError: + debug("bad Refresh header: %r" % refresh) + return response + + if newurl is None: + newurl = response.geturl() + if (self.max_time is None) or (pause <= self.max_time): + if pause > 1E-3 and self.honor_time: + self._sleep(pause) + hdrs["location"] = newurl + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, + "refresh", msg, hdrs) + else: + debug("Refresh header ignored: %r" % refresh) + + return response + + https_response = http_response diff --git a/LTA/LTAIngest/mechanize/_lwpcookiejar.py b/LTA/LTAIngest/mechanize/_lwpcookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..f8d49cf2d4af25f239ec7c96863de1adcc36831b --- /dev/null +++ b/LTA/LTAIngest/mechanize/_lwpcookiejar.py @@ -0,0 +1,185 @@ +"""Load / save to libwww-perl (LWP) format files. + +Actually, the format is slightly extended from that used by LWP's +(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information +not recorded by LWP. + +It uses the version string "2.0", though really there isn't an LWP Cookies +2.0 format. This indicates that there is extra information in here +(domain_dot and port_spec) while still being compatible with libwww-perl, +I hope. 
+ +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import time, re, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +from _headersutil import join_header_words, split_header_words +from _util import iso2time, time2isoz + +debug = logging.getLogger("mechanize").debug + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in an the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. + + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + if cookie.rfc2109: h.append(("rfc2109", None)) + + keys = cookie.nonstandard_attr_keys() + keys.sort() + for k in keys: + h.append((k, str(cookie.get_nonstandard_attr(k)))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +class LWPCookieJar(FileCookieJar): + """ + The LWPCookieJar saves a sequence of"Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl libary, not known + to be compatible with any browser, but which is easy to read and + doesn't lose information about RFC 2965 cookies. 
+ + Additional methods + + as_lwp_str(ignore_discard=True, ignore_expired=True) + + """ + + magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + + def as_lwp_str(self, ignore_discard=True, ignore_expires=True): + """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. + + ignore_discard and ignore_expires: see docstring for FileCookieJar.save + + """ + now = time.time() + r = [] + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return "\n".join(r+[""]) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving LWP cookies file") + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. 
+ f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise LoadError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard", "rfc2109") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest, + h("rfc2109"), + ) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise LoadError("invalid Set-Cookie3 format file %s" % filename) + diff --git a/LTA/LTAIngest/mechanize/_markupbase.py 
b/LTA/LTAIngest/mechanize/_markupbase.py new file mode 100644 index 0000000000000000000000000000000000000000..ae9c2a875f22b2dc5a8cd879668a82c1854dfc48 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_markupbase.py @@ -0,0 +1,393 @@ +# Taken from Python 2.6.4 for use by _sgmllib.py +"""Shared support for scanning document type declarations in HTML and XHTML. + +This module is used as a foundation for the HTMLParser and sgmllib +modules (indirectly, for htmllib as well). It has no documented +public API and should not be used directly. + +""" + +import re + +_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match +_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match +_commentclose = re.compile(r'--\s*>') +_markedsectionclose = re.compile(r']\s*]\s*>') + +# An analysis of the MS-Word extensions is available at +# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf + +_msmarkedsectionclose = re.compile(r']\s*>') + +del re + + +class ParserBase: + """Parser base class which provides some common support methods used + by the SGML/HTML and XHTML parsers.""" + + def __init__(self): + if self.__class__ is ParserBase: + raise RuntimeError( + "markupbase.ParserBase must be subclassed") + + def error(self, message): + raise NotImplementedError( + "subclasses of ParserBase must override error()") + + def reset(self): + self.lineno = 1 + self.offset = 0 + + def getpos(self): + """Return current line number and offset.""" + return self.lineno, self.offset + + # Internal -- update line number and offset. This should be + # called for each piece of data exactly once, in order -- in other + # words the concatenation of all the input strings to this + # function should be exactly the entire input. 
+ def updatepos(self, i, j): + if i >= j: + return j + rawdata = self.rawdata + nlines = rawdata.count("\n", i, j) + if nlines: + self.lineno = self.lineno + nlines + pos = rawdata.rindex("\n", i, j) # Should not fail + self.offset = j-(pos+1) + else: + self.offset = self.offset + j-i + return j + + _decl_otherchars = '' + + # Internal -- parse declaration (for use by subclasses). + def parse_declaration(self, i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + # ISO 8879:1986, however, has more complex + # declaration syntax for elements in <!...>, including: + # --comment-- + # [marked section] + # name in the following list: ENTITY, DOCTYPE, ELEMENT, + # ATTLIST, NOTATION, SHORTREF, USEMAP, + # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM + rawdata = self.rawdata + j = i + 2 + assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" + if rawdata[j:j+1] == ">": + # the empty comment <!> + return j + 1 + if rawdata[j:j+1] in ("-", ""): + # Start of comment followed by buffer boundary, + # or just a buffer boundary. + return -1 + # A simple, practical version could look like: ((name|stringlit) S*) + '>' + n = len(rawdata) + if rawdata[j:j+2] == '--': #comment + # Locate --.*-- as the body of the comment + return self.parse_comment(i) + elif rawdata[j] == '[': #marked section + # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section + # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA + # Note that this is extended by Microsoft Office "Save as Web" function + # to include [if...] and [endif]. 
+ return self.parse_marked_section(i) + else: #all other declaration elements + decltype, j = self._scan_name(j, i) + if j < 0: + return j + if decltype == "doctype": + self._decl_otherchars = '' + while j < n: + c = rawdata[j] + if c == ">": + # end of declaration syntax + data = rawdata[i+2:j] + if decltype == "doctype": + self.handle_decl(data) + else: + self.unknown_decl(data) + return j + 1 + if c in "\"'": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 # incomplete + j = m.end() + elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + name, j = self._scan_name(j, i) + elif c in self._decl_otherchars: + j = j + 1 + elif c == "[": + # this could be handled in a separate doctype parser + if decltype == "doctype": + j = self._parse_doctype_subset(j + 1, i) + elif decltype in ("attlist", "linktype", "link", "element"): + # must tolerate []'d groups in a content model in an element declaration + # also in data attribute specifications of attlist declaration + # also link type declaration subsets in linktype declarations + # also link attribute specification lists in link declarations + self.error("unsupported '[' char in %s declaration" % decltype) + else: + self.error("unexpected '[' char in declaration") + else: + self.error( + "unexpected %r char in declaration" % rawdata[j]) + if j < 0: + return j + return -1 # incomplete + + # Internal -- parse a marked section + # Override this to handle MS-word extension syntax <![if word]>content<![endif]> + def parse_marked_section(self, i, report=1): + rawdata= self.rawdata + assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" + sectName, j = self._scan_name( i+3, i ) + if j < 0: + return j + if sectName in ("temp", "cdata", "ignore", "include", "rcdata"): + # look for standard ]]> ending + match= _markedsectionclose.search(rawdata, i+3) + elif sectName in ("if", "else", "endif"): + # look for MS Office ]> ending + match= _msmarkedsectionclose.search(rawdata, i+3) + 
else: + self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) + if not match: + return -1 + if report: + j = match.start(0) + self.unknown_decl(rawdata[i+3: j]) + return match.end(0) + + # Internal -- parse comment, return length or -1 if not terminated + def parse_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+4] != '<!--': + self.error('unexpected call to parse_comment()') + match = _commentclose.search(rawdata, i+4) + if not match: + return -1 + if report: + j = match.start(0) + self.handle_comment(rawdata[i+4: j]) + return match.end(0) + + # Internal -- scan past the internal subset in a <!DOCTYPE declaration, + # returning the index just past any whitespace following the trailing ']'. + def _parse_doctype_subset(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + j = i + while j < n: + c = rawdata[j] + if c == "<": + s = rawdata[j:j+2] + if s == "<": + # end of buffer; incomplete + return -1 + if s != "<!": + self.updatepos(declstartpos, j + 1) + self.error("unexpected char in internal subset (in %r)" % s) + if (j + 2) == n: + # end of buffer; incomplete + return -1 + if (j + 4) > n: + # end of buffer; incomplete + return -1 + if rawdata[j:j+4] == "<!--": + j = self.parse_comment(j, report=0) + if j < 0: + return j + continue + name, j = self._scan_name(j + 2, declstartpos) + if j == -1: + return -1 + if name not in ("attlist", "element", "entity", "notation"): + self.updatepos(declstartpos, j + 2) + self.error( + "unknown declaration %r in internal subset" % name) + # handle the individual names + meth = getattr(self, "_parse_doctype_" + name) + j = meth(j, declstartpos) + if j < 0: + return j + elif c == "%": + # parameter entity reference + if (j + 1) == n: + # end of buffer; incomplete + return -1 + s, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + if rawdata[j] == ";": + j = j + 1 + elif c == "]": + j = j + 1 + while j < n and rawdata[j].isspace(): + j = j + 1 + if j < n: + 
if rawdata[j] == ">": + return j + self.updatepos(declstartpos, j) + self.error("unexpected char after internal subset") + else: + return -1 + elif c.isspace(): + j = j + 1 + else: + self.updatepos(declstartpos, j) + self.error("unexpected char %r in internal subset" % c) + # end of buffer reached + return -1 + + # Internal -- scan past <!ELEMENT declarations + def _parse_doctype_element(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j == -1: + return -1 + # style content model; just skip until '>' + rawdata = self.rawdata + if '>' in rawdata[j:]: + return rawdata.find(">", j) + 1 + return -1 + + # Internal -- scan past <!ATTLIST declarations + def _parse_doctype_attlist(self, i, declstartpos): + rawdata = self.rawdata + name, j = self._scan_name(i, declstartpos) + c = rawdata[j:j+1] + if c == "": + return -1 + if c == ">": + return j + 1 + while 1: + # scan a series of attribute descriptions; simplified: + # name type [value] [#constraint] + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if c == "": + return -1 + if c == "(": + # an enumerated type; look for ')' + if ")" in rawdata[j:]: + j = rawdata.find(")", j) + 1 + else: + return -1 + while rawdata[j:j+1].isspace(): + j = j + 1 + if not rawdata[j:]: + # end of buffer, incomplete + return -1 + else: + name, j = self._scan_name(j, declstartpos) + c = rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 + c = rawdata[j:j+1] + if not c: + return -1 + if c == "#": + if rawdata[j:] == "#": + # end of buffer + return -1 + name, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if not c: + return -1 + if c == '>': + # all done + return j + 1 + + # Internal -- scan past <!NOTATION declarations + def _parse_doctype_notation(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j < 0: + return j + rawdata = 
self.rawdata + while 1: + c = rawdata[j:j+1] + if not c: + # end of buffer; incomplete + return -1 + if c == '>': + return j + 1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 + j = m.end() + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan past <!ENTITY declarations + def _parse_doctype_entity(self, i, declstartpos): + rawdata = self.rawdata + if rawdata[i:i+1] == "%": + j = i + 1 + while 1: + c = rawdata[j:j+1] + if not c: + return -1 + if c.isspace(): + j = j + 1 + else: + break + else: + j = i + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + while 1: + c = self.rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 # incomplete + elif c == ">": + return j + 1 + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan a name token and the new position and the token, or + # return -1 if we've reached the end of the buffer. + def _scan_name(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + if i == n: + return None, -1 + m = _declname_match(rawdata, i) + if m: + s = m.group() + name = s.strip() + if (i + len(s)) == n: + return None, -1 # end of buffer + return name.lower(), m.end() + else: + self.updatepos(declstartpos, i) + self.error("expected name token at %r" + % rawdata[declstartpos:declstartpos+20]) + + # To be overridden -- handlers for unknown objects + def unknown_decl(self, data): + pass diff --git a/LTA/LTAIngest/mechanize/_mechanize.py b/LTA/LTAIngest/mechanize/_mechanize.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce71a6cea102cb919dec37c4ade52a0ac1bce5f --- /dev/null +++ b/LTA/LTAIngest/mechanize/_mechanize.py @@ -0,0 +1,669 @@ +"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. + +Copyright 2003-2006 John J. 
Lee <jjl@pobox.com> +Copyright 2003 Andy Lester (original Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import copy, re, os, urllib, urllib2 + +from _html import DefaultFactory +import _response +import _request +import _rfc3986 +import _sockettimeout +import _urllib2_fork +from _useragent import UserAgentBase + +class BrowserStateError(Exception): pass +class LinkNotFoundError(Exception): pass +class FormNotFoundError(Exception): pass + + +def sanepathname2url(path): + urlpath = urllib.pathname2url(path) + if os.name == "nt" and urlpath.startswith("///"): + urlpath = urlpath[2:] + # XXX don't ask me about the mac... + return urlpath + + +class History: + """ + + Though this will become public, the implied interface is not yet stable. + + """ + def __init__(self): + self._history = [] # LIFO + def add(self, request, response): + self._history.append((request, response)) + def back(self, n, _response): + response = _response # XXX move Browser._response into this class? + while n > 0 or response is None: + try: + request, response = self._history.pop() + except IndexError: + raise BrowserStateError("already at start of history") + n -= 1 + return request, response + def clear(self): + del self._history[:] + def close(self): + for request, response in self._history: + if response is not None: + response.close() + del self._history[:] + + +class HTTPRefererProcessor(_urllib2_fork.BaseHandler): + def http_request(self, request): + # See RFC 2616 14.36. The only times we know the source of the + # request URI has a URI associated with it are redirect, and + # Browser.click() / Browser.submit() / Browser.follow_link(). + # Otherwise, it's the user's job to add any Referer header before + # .open()ing. 
+ if hasattr(request, "redirect_dict"): + request = self.parent._add_referer_header( + request, origin_request=False) + return request + + https_request = http_request + + +class Browser(UserAgentBase): + """Browser-like class with support for history, forms and links. + + BrowserStateError is raised whenever the browser is in the wrong state to + complete the requested operation - e.g., when .back() is called when the + browser history is empty, or when .follow_link() is called when the current + response does not contain HTML data. + + Public attributes: + + request: current request (mechanize.Request) + form: currently selected form (see .select_form()) + + """ + + handler_classes = copy.copy(UserAgentBase.handler_classes) + handler_classes["_referer"] = HTTPRefererProcessor + default_features = copy.copy(UserAgentBase.default_features) + default_features.append("_referer") + + def __init__(self, + factory=None, + history=None, + request_class=None, + ): + """ + + Only named arguments should be passed to this constructor. + + factory: object implementing the mechanize.Factory interface. + history: object implementing the mechanize.History interface. Note + this interface is still experimental and may change in future. + request_class: Request class to use. Defaults to mechanize.Request + + The Factory and History objects passed in are 'owned' by the Browser, + so they should not be shared across Browsers. In particular, + factory.set_response() should not be called except by the owning + Browser itself. + + Note that the supplied factory's request_class is overridden by this + constructor, to ensure only one Request class is used. 
+ + """ + self._handle_referer = True + + if history is None: + history = History() + self._history = history + + if request_class is None: + request_class = _request.Request + + if factory is None: + factory = DefaultFactory() + factory.set_request_class(request_class) + self._factory = factory + self.request_class = request_class + + self.request = None + self._set_response(None, False) + + # do this last to avoid __getattr__ problems + UserAgentBase.__init__(self) + + def close(self): + UserAgentBase.close(self) + if self._response is not None: + self._response.close() + if self._history is not None: + self._history.close() + self._history = None + + # make use after .close easy to spot + self.form = None + self.request = self._response = None + self.request = self.response = self.set_response = None + self.geturl = self.reload = self.back = None + self.clear_history = self.set_cookie = self.links = self.forms = None + self.viewing_html = self.encoding = self.title = None + self.select_form = self.click = self.submit = self.click_link = None + self.follow_link = self.find_link = None + + def set_handle_referer(self, handle): + """Set whether to add Referer header to each request.""" + self._set_handler("_referer", handle) + self._handle_referer = bool(handle) + + def _add_referer_header(self, request, origin_request=True): + if self.request is None: + return request + scheme = request.get_type() + original_scheme = self.request.get_type() + if scheme not in ["http", "https"]: + return request + if not origin_request and not self.request.has_header("Referer"): + return request + + if (self._handle_referer and + original_scheme in ["http", "https"] and + not (original_scheme == "https" and scheme != "https")): + # strip URL fragment (RFC 2616 14.36) + parts = _rfc3986.urlsplit(self.request.get_full_url()) + parts = parts[:-1]+(None,) + referer = _rfc3986.urlunsplit(parts) + request.add_unredirected_header("Referer", referer) + return request + + def 
open_novisit(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + """Open a URL without visiting it. + + Browser state (including request, response, history, forms and links) + is left unchanged by calling this function. + + The interface is the same as for .open(). + + This is useful for things like fetching images. + + See also .retrieve(). + + """ + return self._mech_open(url, data, visit=False, timeout=timeout) + + def open(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return self._mech_open(url, data, timeout=timeout) + + def _mech_open(self, url, data=None, update_history=True, visit=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + try: + url.get_full_url + except AttributeError: + # string URL -- convert to absolute URL if required + scheme, authority = _rfc3986.urlsplit(url)[:2] + if scheme is None: + # relative URL + if self._response is None: + raise BrowserStateError( + "can't fetch relative reference: " + "not viewing any document") + url = _rfc3986.urljoin(self._response.geturl(), url) + + request = self._request(url, data, visit, timeout) + visit = request.visit + if visit is None: + visit = True + + if visit: + self._visit_request(request, update_history) + + success = True + try: + response = UserAgentBase.open(self, request, data) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error +## except (IOError, socket.error, OSError), error: +## # Yes, urllib2 really does raise all these :-(( +## # See test_urllib2.py for examples of socket.gaierror and OSError, +## # plus note that FTPHandler raises IOError. +## # XXX I don't seem to have an example of exactly socket.error being +## # raised, only socket.gaierror... +## # I don't want to start fixing these here, though, since this is a +## # subclass of OpenerDirector, and it would break old code. Even in +## # Python core, a fix would need some backwards-compat. 
hack to be +## # acceptable. +## raise + + if visit: + self._set_response(response, False) + response = copy.copy(self._response) + elif response is not None: + response = _response.upgrade_response(response) + + if not success: + raise response + return response + + def __str__(self): + text = [] + text.append("<%s " % self.__class__.__name__) + if self._response: + text.append("visiting %s" % self._response.geturl()) + else: + text.append("(not visiting a URL)") + if self.form: + text.append("\n selected form:\n %s\n" % str(self.form)) + text.append(">") + return "".join(text) + + def response(self): + """Return a copy of the current response. + + The returned object has the same interface as the object returned by + .open() (or mechanize.urlopen()). + + """ + return copy.copy(self._response) + + def open_local_file(self, filename): + path = sanepathname2url(os.path.abspath(filename)) + url = 'file://'+path + return self.open(url) + + def set_response(self, response): + """Replace current response with (a copy of) response. + + response may be None. + + This is intended mostly for HTML-preprocessing. + """ + self._set_response(response, True) + + def _set_response(self, response, close_current): + # sanity check, necessary but far from sufficient + if not (response is None or + (hasattr(response, "info") and hasattr(response, "geturl") and + hasattr(response, "read") + ) + ): + raise ValueError("not a response object") + + self.form = None + if response is not None: + response = _response.upgrade_response(response) + if close_current and self._response is not None: + self._response.close() + self._response = response + self._factory.set_response(response) + + def visit_response(self, response, request=None): + """Visit the response, as if it had been .open()ed. + + Unlike .set_response(), this updates history rather than replacing the + current response. 
+ """ + if request is None: + request = _request.Request(response.geturl()) + self._visit_request(request, True) + self._set_response(response, False) + + def _visit_request(self, request, update_history): + if self._response is not None: + self._response.close() + if self.request is not None and update_history: + self._history.add(self.request, self._response) + self._response = None + # we want self.request to be assigned even if UserAgentBase.open + # fails + self.request = request + + def geturl(self): + """Get URL of current document.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._response.geturl() + + def reload(self): + """Reload current document, and return response object.""" + if self.request is None: + raise BrowserStateError("no URL has yet been .open()ed") + if self._response is not None: + self._response.close() + return self._mech_open(self.request, update_history=False) + + def back(self, n=1): + """Go back n steps in history, and return response object. + + n: go back this number of steps (default 1 step) + + """ + if self._response is not None: + self._response.close() + self.request, response = self._history.back(n, self._response) + self.set_response(response) + if not response.read_complete: + return self.reload() + return copy.copy(response) + + def clear_history(self): + self._history.clear() + + def set_cookie(self, cookie_string): + """Request to set a cookie. + + Note that it is NOT necessary to call this method under ordinary + circumstances: cookie handling is normally entirely automatic. The + intended use case is rather to simulate the setting of a cookie by + client script in a web page (e.g. JavaScript). In that case, use of + this method is necessary because mechanize currently does not support + JavaScript, VBScript, etc. + + The cookie is added in the same way as if it had arrived with the + current response, as a result of the current request. 
This means that, + for example, if it is not appropriate to set the cookie based on the + current request, no cookie will be set. + + The cookie will be returned automatically with subsequent responses + made by the Browser instance whenever that's appropriate. + + cookie_string should be a valid value of the Set-Cookie header. + + For example: + + browser.set_cookie( + "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") + + Currently, this method does not allow for adding RFC 2986 cookies. + This limitation will be lifted if anybody requests it. + + """ + if self._response is None: + raise BrowserStateError("not viewing any document") + if self.request.get_type() not in ["http", "https"]: + raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " + "transactions") + cookiejar = self._ua_handlers["_cookies"].cookiejar + response = self.response() # copy + headers = response.info() + headers["Set-cookie"] = cookie_string + cookiejar.extract_cookies(response, self.request) + + def links(self, **kwds): + """Return iterable over links (mechanize.Link objects).""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + links = self._factory.links() + if kwds: + return self._filter_links(links, **kwds) + else: + return links + + def forms(self): + """Return iterable over forms. + + The returned form objects implement the mechanize.HTMLForm interface. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.forms() + + def global_form(self): + """Return the global form object, or None if the factory implementation + did not supply one. + + The "global" form object contains all controls that are not descendants + of any FORM element. + + The returned form object implements the mechanize.HTMLForm interface. + + This is a separate method since the global form is not regarded as part + of the sequence of forms in the document -- mostly for + backwards-compatibility. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.global_form + + def viewing_html(self): + """Return whether the current response contains HTML data.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.is_html + + def encoding(self): + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.encoding + + def title(self): + r"""Return title, or None if there is no title element in the document. + + Treatment of any tag children of attempts to follow Firefox and IE + (currently, tags are preserved). + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.title + + def select_form(self, name=None, predicate=None, nr=None): + """Select an HTML form for input. + + This is a bit like giving a form the "input focus" in a browser. + + If a form is selected, the Browser object supports the HTMLForm + interface, so you can call methods like .set_value(), .set(), and + .click(). + + Another way to select a form is to assign to the .form attribute. The + form assigned should be one of the objects returned by the .forms() + method. + + At least one of the name, predicate and nr arguments must be supplied. + If no matching form is found, mechanize.FormNotFoundError is raised. + + If name is specified, then the form must have the indicated name. + + If predicate is specified, then the form must match that function. The + predicate function is passed the HTMLForm as its single argument, and + should return a boolean value indicating whether the form matched. + + nr, if supplied, is the sequence number of the form (where 0 is the + first). Note that control 0 is the first form matching all the other + arguments (if supplied); it is not necessarily the first control in the + form. 
The "global form" (consisting of all form controls not contained + in any FORM element) is considered not to be part of this sequence and + to have no name, so will not be matched unless both name and nr are + None. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if (name is None) and (predicate is None) and (nr is None): + raise ValueError( + "at least one argument must be supplied to specify form") + + global_form = self._factory.global_form + if nr is None and name is None and \ + predicate is not None and predicate(global_form): + self.form = global_form + return + + orig_nr = nr + for form in self.forms(): + if name is not None and name != form.name: + continue + if predicate is not None and not predicate(form): + continue + if nr: + nr -= 1 + continue + self.form = form + break # success + else: + # failure + description = [] + if name is not None: description.append("name '%s'" % name) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr is not None: description.append("nr %d" % orig_nr) + description = ", ".join(description) + raise FormNotFoundError("no form matching "+description) + + def click(self, *args, **kwds): + """See mechanize.HTMLForm.click for documentation.""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + request = self.form.click(*args, **kwds) + return self._add_referer_header(request) + + def submit(self, *args, **kwds): + """Submit current form. + + Arguments are as for mechanize.HTMLForm.click(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click(*args, **kwds)) + + def click_link(self, link=None, **kwds): + """Find a link and return a Request object for it. + + Arguments are as for .find_link(), except that a link may be supplied + as the first argument. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if not link: + link = self.find_link(**kwds) + else: + if kwds: + raise ValueError( + "either pass a Link, or keyword arguments, not both") + request = self.request_class(link.absolute_url) + return self._add_referer_header(request) + + def follow_link(self, link=None, **kwds): + """Find a link and .open() it. + + Arguments are as for .click_link(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click_link(link, **kwds)) + + def find_link(self, **kwds): + """Find a link in current page. + + Links are returned as mechanize.Link objects. + + # Return third link that .search()-matches the regexp "python" + # (by ".search()-matches", I mean that the regular expression method + # .search() is used, rather than .match()). + find_link(text_regex=re.compile("python"), nr=2) + + # Return first http link in the current page that points to somewhere + # on python.org whose link text (after tags have been removed) is + # exactly "monty python". + find_link(text="monty python", + url_regex=re.compile("http.*python.org")) + + # Return first link with exactly three HTML attributes. + find_link(predicate=lambda link: len(link.attrs) == 3) + + Links include anchors (<a>), image maps (<area>), and frames (<frame>, + <iframe>). + + All arguments must be passed by keyword, not position. Zero or more + arguments may be supplied. In order to find a link, all arguments + supplied must match. + + If a matching link is not found, mechanize.LinkNotFoundError is raised. + + text: link text between link tags: e.g. <a href="blah">this bit</a> (as + returned by pullparser.get_compressed_text(), ie. 
without tags but + with opening tags "textified" as per the pullparser docs) must compare + equal to this argument, if supplied + text_regex: link text between tag (as defined above) must match the + regular expression object or regular expression string passed as this + argument, if supplied + name, name_regex: as for text and text_regex, but matched against the + name HTML attribute of the link tag + url, url_regex: as for text and text_regex, but matched against the + URL of the link tag (note this matches against Link.url, which is a + relative or absolute URL according to how it was written in the HTML) + tag: element name of opening tag, e.g. "a" + predicate: a function taking a Link object as its single argument, + returning a boolean result, indicating whether the links + nr: matches the nth link that matches all other criteria (default 0) + + """ + try: + return self._filter_links(self._factory.links(), **kwds).next() + except StopIteration: + raise LinkNotFoundError() + + def __getattr__(self, name): + # pass through _form.HTMLForm methods and attributes + form = self.__dict__.get("form") + if form is None: + raise AttributeError( + "%s instance has no attribute %s (perhaps you forgot to " + ".select_form()?)" % (self.__class__, name)) + return getattr(form, name) + + def _filter_links(self, links, + text=None, text_regex=None, + name=None, name_regex=None, + url=None, url_regex=None, + tag=None, + predicate=None, + nr=0 + ): + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + + orig_nr = nr + + for link in links: + if url is not None and url != link.url: + continue + if url_regex is not None and not re.search(url_regex, link.url): + continue + if (text is not None and + (link.text is None or text != link.text)): + continue + if (text_regex is not None and + (link.text is None or not re.search(text_regex, link.text))): + continue + if name is not None and name != dict(link.attrs).get("name"): + continue + if name_regex is not 
None: + link_name = dict(link.attrs).get("name") + if link_name is None or not re.search(name_regex, link_name): + continue + if tag is not None and tag != link.tag: + continue + if predicate is not None and not predicate(link): + continue + if nr: + nr -= 1 + continue + yield link + nr = orig_nr diff --git a/LTA/LTAIngest/mechanize/_mozillacookiejar.py b/LTA/LTAIngest/mechanize/_mozillacookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..51e81bb62d414d2336e7a801ead9402de965955b --- /dev/null +++ b/LTA/LTAIngest/mechanize/_mozillacookiejar.py @@ -0,0 +1,161 @@ +"""Mozilla / Netscape cookie loading / saving. + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import re, time, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +debug = logging.getLogger("ClientCookie").debug + + +class MozillaCookieJar(FileCookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Mozilla/Netscape + `cookies.txt' format. lynx uses this file format, too. + + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). + + Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to + Netscape cookies on saving. 
+ + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). + + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = "#( Netscape)? HTTP Cookie File" + header = """\ + # Netscape HTTP Cookie File + # http://www.netscape.com/newsref/std/cookie_spec.html + # This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not re.search(self.magic_re, magic): + f.close() + raise LoadError( + "%s does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if line.endswith("\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? 
+ if (line.strip().startswith("#") or + line.strip().startswith("$") or + line.strip() == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t", 6) + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + if domain_specified != initial_dot: + raise LoadError("domain and domain specified flag don't " + "match in %s: %s" % (filename, line)) + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except: + reraise_unmasked_exceptions((IOError, LoadError)) + raise LoadError("invalid Netscape format file %s: %s" % + (filename, line)) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving Netscape cookies.txt file") + f.write(self.header) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if cookie.domain.startswith("."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas cookielib regards it as a + # cookie with 
no value. + name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + "\t".join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value])+ + "\n") + finally: + f.close() diff --git a/LTA/LTAIngest/mechanize/_msiecookiejar.py b/LTA/LTAIngest/mechanize/_msiecookiejar.py new file mode 100644 index 0000000000000000000000000000000000000000..8af11c0e4ceae55d2c7394a92effc5f551c392f7 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_msiecookiejar.py @@ -0,0 +1,388 @@ +"""Microsoft Internet Explorer cookie loading on Windows. + +Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code) +Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +# XXX names and comments are not great here + +import os, re, time, struct, logging +if os.name == "nt": + import _winreg + +from _clientcookie import FileCookieJar, CookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError + +debug = logging.getLogger("mechanize").debug + + +def regload(path, leaf): + key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0, + _winreg.KEY_ALL_ACCESS) + try: + value = _winreg.QueryValueEx(key, leaf)[0] + except WindowsError: + value = None + return value + +WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME + +def epoch_time_offset_from_win32_filetime(filetime): + """Convert from win32 filetime to seconds-since-epoch value. + + MSIE stores create and expire times as Win32 FILETIME, which is 64 + bits of 100 nanosecond intervals since Jan 01 1601. + + mechanize expects time in 32-bit value expressed in seconds since the + epoch (Jan 01 1970). 
+ + """ + if filetime < WIN32_EPOCH: + raise ValueError("filetime (%d) is before epoch (%d)" % + (filetime, WIN32_EPOCH)) + + return divmod((filetime - WIN32_EPOCH), 10000000L)[0] + +def binary_to_char(c): return "%02X" % ord(c) +def binary_to_str(d): return "".join(map(binary_to_char, list(d))) + +class MSIEBase: + magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*") + padding = "\x0d\xf0\xad\x0b" + + msie_domain_re = re.compile(r"^([^/]+)(/.*)$") + cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?" + "(.+\@[\x21-\xFF]+\.txt)") + + # path under HKEY_CURRENT_USER from which to get location of index.dat + reg_path = r"software\microsoft\windows" \ + r"\currentversion\explorer\shell folders" + reg_key = "Cookies" + + def __init__(self): + self._delayload_domains = {} + + def _delayload_domain(self, domain): + # if necessary, lazily load cookies for this domain + delayload_info = self._delayload_domains.get(domain) + if delayload_info is not None: + cookie_file, ignore_discard, ignore_expires = delayload_info + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", cookie_file) + else: + del self._delayload_domains[domain] + + def _load_cookies_from_file(self, filename): + debug("Loading MSIE cookies file: %s", filename) + cookies = [] + + cookies_fh = open(filename) + + try: + while 1: + key = cookies_fh.readline() + if key == "": break + + rl = cookies_fh.readline + def getlong(rl=rl): return long(rl().rstrip()) + def getstr(rl=rl): return rl().rstrip() + + key = key.rstrip() + value = getstr() + domain_path = getstr() + flags = getlong() # 0x2000 bit is for secure I think + lo_expire = getlong() + hi_expire = getlong() + lo_create = getlong() + hi_create = getlong() + sep = getstr() + + if "" in (key, value, domain_path, flags, hi_expire, lo_expire, + hi_create, lo_create, sep) or (sep != "*"): + break + + m = self.msie_domain_re.search(domain_path) + if 
m: + domain = m.group(1) + path = m.group(2) + + cookies.append({"KEY": key, "VALUE": value, + "DOMAIN": domain, "PATH": path, + "FLAGS": flags, "HIXP": hi_expire, + "LOXP": lo_expire, "HICREATE": hi_create, + "LOCREATE": lo_create}) + finally: + cookies_fh.close() + + return cookies + + def load_cookie_data(self, filename, + ignore_discard=False, ignore_expires=False): + """Load cookies from file containing actual cookie data. + + Old cookies are kept unless overwritten by newly loaded ones. + + You should not call this method if the delayload attribute is set. + + I think each of these files contain all cookies for one user, domain, + and path. + + filename: file containing cookies -- usually found in a file like + C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt + + """ + now = int(time.time()) + + cookie_data = self._load_cookies_from_file(filename) + + for cookie in cookie_data: + flags = cookie["FLAGS"] + secure = ((flags & 0x2000) != 0) + filetime = (cookie["HIXP"] << 32) + cookie["LOXP"] + expires = epoch_time_offset_from_win32_filetime(filetime) + if expires < now: + discard = True + else: + discard = False + domain = cookie["DOMAIN"] + initial_dot = domain.startswith(".") + if initial_dot: + domain_specified = True + else: + # MSIE 5 does not record whether the domain cookie-attribute + # was specified. + # Assuming it wasn't is conservative, because with strict + # domain matching this will match less frequently; with regular + # Netscape tail-matching, this will match at exactly the same + # times that domain_specified = True would. It also means we + # don't have to prepend a dot to achieve consistency with our + # own & Mozilla's domain-munging scheme. + domain_specified = False + + # assume path_specified is false + # XXX is there other stuff in here? -- e.g. comment, commentURL? 
+ c = Cookie(0, + cookie["KEY"], cookie["VALUE"], + None, False, + domain, domain_specified, initial_dot, + cookie["PATH"], False, + secure, + expires, + discard, + None, + None, + {"flags": flags}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + CookieJar.set_cookie(self, c) + + def load_from_registry(self, ignore_discard=False, ignore_expires=False, + username=None): + """ + username: only required on win9x + + """ + cookies_dir = regload(self.reg_path, self.reg_key) + filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT")) + self.load(filename, ignore_discard, ignore_expires, username) + + def _really_load(self, index, filename, ignore_discard, ignore_expires, + username): + now = int(time.time()) + + if username is None: + username = os.environ['USERNAME'].lower() + + cookie_dir = os.path.dirname(filename) + + data = index.read(256) + if len(data) != 256: + raise LoadError("%s file is too short" % filename) + + # Cookies' index.dat file starts with 32 bytes of signature + # followed by an offset to the first record, stored as a little- + # endian DWORD. + sig, size, data = data[:32], data[32:36], data[36:] + size = struct.unpack("<L", size)[0] + + # check that sig is valid + if not self.magic_re.match(sig) or size != 0x4000: + raise LoadError("%s ['%s' %s] does not seem to contain cookies" % + (str(filename), sig, size)) + + # skip to start of first record + index.seek(size, 0) + + sector = 128 # size of sector in bytes + + while 1: + data = "" + + # Cookies are usually in two contiguous sectors, so read in two + # sectors and adjust if not a Cookie. + to_read = 2 * sector + d = index.read(to_read) + if len(d) != to_read: + break + data = data + d + + # Each record starts with a 4-byte signature and a count + # (little-endian DWORD) of sectors for the record. 
+ sig, size, data = data[:4], data[4:8], data[8:] + size = struct.unpack("<L", size)[0] + + to_read = (size - 2) * sector + +## from urllib import quote +## print "data", quote(data) +## print "sig", quote(sig) +## print "size in sectors", size +## print "size in bytes", size*sector +## print "size in units of 16 bytes", (size*sector) / 16 +## print "size to read in bytes", to_read +## print + + if sig != "URL ": + assert sig in ("HASH", "LEAK", \ + self.padding, "\x00\x00\x00\x00"), \ + "unrecognized MSIE index.dat record: %s" % \ + binary_to_str(sig) + if sig == "\x00\x00\x00\x00": + # assume we've got all the cookies, and stop + break + if sig == self.padding: + continue + # skip the rest of this record + assert to_read >= 0 + if size != 2: + assert to_read != 0 + index.seek(to_read, 1) + continue + + # read in rest of record if necessary + if size > 2: + more_data = index.read(to_read) + if len(more_data) != to_read: break + data = data + more_data + + cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username + + "(%s\@[\x21-\xFF]+\.txt)" % username) + m = re.search(cookie_re, data, re.I) + if m: + cookie_file = os.path.join(cookie_dir, m.group(2)) + if not self.delayload: + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", + cookie_file) + else: + domain = m.group(1) + i = domain.find("/") + if i != -1: + domain = domain[:i] + + self._delayload_domains[domain] = ( + cookie_file, ignore_discard, ignore_expires) + + +class MSIECookieJar(MSIEBase, FileCookieJar): + """FileCookieJar that reads from the Windows MSIE cookies database. + + MSIECookieJar can read the cookie files of Microsoft Internet Explorer + (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and + Windows 98. Other configurations may also work, but are untested. Saving + cookies in MSIE format is NOT supported. 
If you save cookies, they'll be + in the usual Set-Cookie3 format, which you can read back in using an + instance of the plain old CookieJar class. Don't save using the same + filename that you loaded cookies from, because you may succeed in + clobbering your MSIE cookies index file! + + You should be able to have LWP share Internet Explorer's cookies like + this (note you need to supply a username to load_from_registry if you're on + Windows 9x or Windows ME): + + cj = MSIECookieJar(delayload=1) + # find cookies index file in registry and load cookies from it + cj.load_from_registry() + opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) + response = opener.open("http://example.com/") + + Iterating over a delayloaded MSIECookieJar instance will not cause any + cookies to be read from disk. To force reading of all cookies from disk, + call read_all_cookies. Note that the following methods iterate over self: + clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__ + and as_string. 
+ + Additional methods: + + load_from_registry(ignore_discard=False, ignore_expires=False, + username=None) + load_cookie_data(filename, ignore_discard=False, ignore_expires=False) + read_all_cookies() + + """ + def __init__(self, filename=None, delayload=False, policy=None): + MSIEBase.__init__(self) + FileCookieJar.__init__(self, filename, delayload, policy) + + def set_cookie(self, cookie): + if self.delayload: + self._delayload_domain(cookie.domain) + CookieJar.set_cookie(self, cookie) + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + domains = self._cookies.copy() + domains.update(self._delayload_domains) + domains = domains.keys() + + cookies = [] + for domain in domains: + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookies_for_domain(self, domain, request): + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + if self.delayload: + self._delayload_domain(domain) + return CookieJar._cookies_for_domain(self, domain, request) + + def read_all_cookies(self): + """Eagerly read in all cookies.""" + if self.delayload: + for domain in self._delayload_domains.keys(): + self._delayload_domain(domain) + + def load(self, filename, ignore_discard=False, ignore_expires=False, + username=None): + """Load cookies from an MSIE 'index.dat' cookies index file. 
+ + filename: full path to cookie index file + username: only required on win9x + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + index = open(filename, "rb") + + try: + self._really_load(index, filename, ignore_discard, ignore_expires, + username) + finally: + index.close() diff --git a/LTA/LTAIngest/mechanize/_opener.py b/LTA/LTAIngest/mechanize/_opener.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8412d817e4e9149a1d4d2b8dc7aedc748a878b --- /dev/null +++ b/LTA/LTAIngest/mechanize/_opener.py @@ -0,0 +1,442 @@ +"""URL opener. + +Copyright 2004-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import os, urllib2, bisect, httplib, types, tempfile +try: + import threading as _threading +except ImportError: + import dummy_threading as _threading +try: + set +except NameError: + import sets + set = sets.Set + +from _request import Request +import _response +import _rfc3986 +import _sockettimeout +import _urllib2_fork +from _util import isstringlike + +open_file = open + + +class ContentTooShortError(urllib2.URLError): + def __init__(self, reason, result): + urllib2.URLError.__init__(self, reason) + self.result = result + + +def set_request_attr(req, name, value, default): + try: + getattr(req, name) + except AttributeError: + setattr(req, name, default) + if value is not default: + setattr(req, name, value) + + +class OpenerDirector(_urllib2_fork.OpenerDirector): + def __init__(self): + _urllib2_fork.OpenerDirector.__init__(self) + # really none of these are (sanely) public -- the lack of initial + # underscore on some is just due to following urllib2 + self.process_response = {} + self.process_request = {} + self._any_request = {} + self._any_response = {} + self._handler_index_valid 
= True + self._tempfiles = [] + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + if handler in self.handlers: + return + # XXX why does self.handlers need to be sorted? + bisect.insort(self.handlers, handler) + handler.add_parent(self) + self._handler_index_valid = False + + def _maybe_reindex_handlers(self): + if self._handler_index_valid: + return + + handle_error = {} + handle_open = {} + process_request = {} + process_response = {} + any_request = set() + any_response = set() + unwanted = [] + + for handler in self.handlers: + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + if meth == "any_request": + any_request.add(handler) + added = True + continue + elif meth == "any_response": + any_response.add(handler) + added = True + continue + + ii = meth.find("_") + scheme = meth[:ii] + condition = meth[ii+1:] + + if condition.startswith("error"): + jj = meth[ii+1:].find("_") + ii + 1 + kind = meth[jj+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = handle_error.setdefault(scheme, {}) + elif condition == "open": + kind = scheme + lookup = handle_open + elif condition == "request": + kind = scheme + lookup = process_request + elif condition == "response": + kind = scheme + lookup = process_response + else: + continue + + lookup.setdefault(kind, set()).add(handler) + added = True + + if not added: + unwanted.append(handler) + + for handler in unwanted: + self.handlers.remove(handler) + + # sort indexed methods + # XXX could be cleaned up + for lookup in [process_request, process_response]: + for scheme, handlers in lookup.iteritems(): + lookup[scheme] = handlers + for scheme, lookup in handle_error.iteritems(): + for code, handlers in lookup.iteritems(): + handlers = list(handlers) + handlers.sort() + lookup[code] = handlers + for scheme, handlers 
in handle_open.iteritems(): + handlers = list(handlers) + handlers.sort() + handle_open[scheme] = handlers + + # cache the indexes + self.handle_error = handle_error + self.handle_open = handle_open + self.process_request = process_request + self.process_response = process_response + self._any_request = any_request + self._any_response = any_response + + def _request(self, url_or_req, data, visit, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + if isstringlike(url_or_req): + req = Request(url_or_req, data, visit=visit, timeout=timeout) + else: + # already a mechanize.Request instance + req = url_or_req + if data is not None: + req.add_data(data) + # XXX yuck + set_request_attr(req, "visit", visit, None) + set_request_attr(req, "timeout", timeout, + _sockettimeout._GLOBAL_DEFAULT_TIMEOUT) + return req + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + req = self._request(fullurl, data, None, timeout) + req_scheme = req.get_type() + + self._maybe_reindex_handlers() + + # pre-process request + # XXX should we allow a Processor to change the URL scheme + # of the request? + request_processors = set(self.process_request.get(req_scheme, [])) + request_processors.update(self._any_request) + request_processors = list(request_processors) + request_processors.sort() + for processor in request_processors: + for meth_name in ["any_request", req_scheme+"_request"]: + meth = getattr(processor, meth_name, None) + if meth: + req = meth(req) + + # In Python >= 2.4, .open() supports processors already, so we must + # call ._open() instead. 
+ urlopen = _urllib2_fork.OpenerDirector._open + response = urlopen(self, req, data) + + # post-process response + response_processors = set(self.process_response.get(req_scheme, [])) + response_processors.update(self._any_response) + response_processors = list(response_processors) + response_processors.sort() + for processor in response_processors: + for meth_name in ["any_response", req_scheme+"_response"]: + meth = getattr(processor, meth_name, None) + if meth: + response = meth(req, response) + + return response + + def error(self, proto, *args): + if proto in ['http', 'https']: + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = apply(self._call_chain, args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return apply(self._call_chain, args) + + BLOCK_SIZE = 1024*8 + def retrieve(self, fullurl, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT, + open=open_file): + """Returns (filename, headers). + + For remote objects, the default filename will refer to a temporary + file. Temporary files are removed when the OpenerDirector.close() + method is called. + + For file: URLs, at present the returned filename is None. This may + change in future. + + If the actual number of bytes read is less than indicated by the + Content-Length header, raises ContentTooShortError (a URLError + subclass). The exception's .result attribute contains the (filename, + headers) that would have been returned. 
+ + """ + req = self._request(fullurl, data, False, timeout) + scheme = req.get_type() + fp = self.open(req) + try: + headers = fp.info() + if filename is None and scheme == 'file': + # XXX req.get_selector() seems broken here, return None, + # pending sanity :-/ + return None, headers + #return urllib.url2pathname(req.get_selector()), headers + if filename: + tfp = open(filename, 'wb') + else: + path = _rfc3986.urlsplit(req.get_full_url())[2] + suffix = os.path.splitext(path)[1] + fd, filename = tempfile.mkstemp(suffix) + self._tempfiles.append(filename) + tfp = os.fdopen(fd, 'wb') + try: + result = filename, headers + bs = self.BLOCK_SIZE + size = -1 + read = 0 + blocknum = 0 + if reporthook: + if "content-length" in headers: + size = int(headers["Content-Length"]) + reporthook(blocknum, bs, size) + while 1: + block = fp.read(bs) + if block == "": + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + finally: + tfp.close() + finally: + fp.close() + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: " + "got only %i out of %i bytes" % (read, size), + result + ) + + return result + + def close(self): + _urllib2_fork.OpenerDirector.close(self) + + # make it very obvious this object is no longer supposed to be used + self.open = self.error = self.retrieve = self.add_handler = None + + if self._tempfiles: + for filename in self._tempfiles: + try: + os.unlink(filename) + except OSError: + pass + del self._tempfiles[:] + + +def wrapped_open(urlopen, process_response_object, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + success = True + try: + response = urlopen(fullurl, data, timeout) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error + + if response is not None: + response = process_response_object(response) 
class ResponseProcessingOpener(OpenerDirector):
    """OpenerDirector whose responses pass through a post-processing hook.

    Subclasses override process_response_object(); HTTPError responses are
    processed too, then re-raised (see wrapped_open).
    """

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        def bound_open(fullurl, data=None,
                       timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
            return OpenerDirector.open(self, fullurl, data, timeout)
        return wrapped_open(
            bound_open, self.process_response_object, fullurl, data, timeout)

    def process_response_object(self, response):
        # identity by default; hook point for subclasses
        return response


class SeekableResponseOpener(ResponseProcessingOpener):
    """Opener whose responses (including HTTPErrors) support .seek()."""

    def process_response_object(self, response):
        return _response.seek_wrapped_response(response)


def isclass(obj):
    """True for both old-style and new-style classes."""
    return isinstance(obj, (types.ClassType, type))
+ + """ + opener = self.klass() + default_classes = list(self.default_classes) + skip = set() + for klass in default_classes: + for check in handlers: + if isclass(check): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + for h in handlers: + if isclass(h): + h = h() + opener.add_handler(h) + + return opener + + +build_opener = OpenerFactory().build_opener + +_opener = None +urlopen_lock = _threading.Lock() +def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.open(url, data, timeout) + +def urlretrieve(url, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.retrieve(url, filename, reporthook, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener diff --git a/LTA/LTAIngest/mechanize/_pullparser.py b/LTA/LTAIngest/mechanize/_pullparser.py new file mode 100644 index 0000000000000000000000000000000000000000..1f212c1512bacd2d3ef95e51ba9d578ce0adfbf5 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_pullparser.py @@ -0,0 +1,391 @@ +"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. + +Examples + +This program extracts all links from a document. 
class NoMoreTokensError(Exception):
    """Raised by get_token()/get_tag() when the input is exhausted."""
    pass


class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    # string templates for the token types that render as "template % data"
    _PLAIN_FORMATS = {
        "endtag": "</%s>",
        "charref": "&#%s;",
        "entityref": "&%s;",
        "data": "%s",
        "comment": "<!--%s-->",
        "decl": "<!%s>",
        "pi": "<?%s>",
        }

    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        other_type, other_data, other_attrs = other
        return (self.type == other_type and
                self.data == other_data and
                self.attrs == other_attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is None:
            attrs = ""
        else:
            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v))
                             for k, v in self.attrs])
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        if self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        fmt = self._PLAIN_FORMATS.get(self.type)
        assert fmt is not None
        return fmt % self.data


def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until `exception` is raised.

    NOTE(review): the explicit `raise StopIteration` inside a generator is
    correct for the Python 2 this file targets, but becomes RuntimeError
    under PEP 479 (Python 3.7+).
    """
    while True:
        try:
            yield fn(*args, **kwds)
        except exception:
            raise StopIteration
"&", ...} containing HTML entity + definitions (a sensible default is used). This is used to unescape + entities in .get_text() (and .get_compressed_text()) and attribute + values. If the encoding can not represent the character, the entity + reference is left unescaped. Note that entity references (both + numeric - e.g. { or ઼ - and non-numeric - e.g. &) are + unescaped in attribute values and the return value of .get_text(), but + not in data outside of tags. Instead, entity references outside of + tags are represented as tokens. This is a bit odd, it's true :-/ + + If the element name of an opening tag matches a key in the textify + mapping then that tag is converted to text. The corresponding value is + used to specify which tag attribute to obtain the text from. textify + maps from element names to either: + + - an HTML attribute name, in which case the HTML attribute value is + used as its text value along with the element name in square + brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute + were missing, just "[IMG]") + - a callable object (e.g. a function) which takes a Token and returns + the string to be used as its text value + + If textify has no key for an element name, nothing is substituted for + the opening tag. + + Public attributes: + + encoding and textify: see above + + """ + self._fh = fh + self._tokenstack = [] # FIFO + self.textify = textify + self.encoding = encoding + if entitydefs is None: + entitydefs = htmlentitydefs.name2codepoint + self._entitydefs = entitydefs + + def __iter__(self): return self + + def tags(self, *names): + return iter_until_exception(self.get_tag, NoMoreTokensError, *names) + + def tokens(self, *tokentypes): + return iter_until_exception(self.get_token, NoMoreTokensError, + *tokentypes) + + def next(self): + try: + return self.get_token() + except NoMoreTokensError: + raise StopIteration() + + def get_token(self, *tokentypes): + """Pop the next Token object from the stack of parsed tokens. 
+ + If arguments are given, they are taken to be token types in which the + caller is interested: tokens representing other elements will be + skipped. Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + while self._tokenstack: + token = self._tokenstack.pop(0) + if tokentypes: + if token.type in tokentypes: + return token + else: + return token + data = self._fh.read(self.chunk) + if not data: + raise NoMoreTokensError() + self.feed(data) + + def unget_token(self, token): + """Push a Token back onto the stack.""" + self._tokenstack.insert(0, token) + + def get_tag(self, *names): + """Return the next Token that represents an opening or closing tag. + + If arguments are given, they are taken to be element names in which the + caller is interested: tags representing other elements will be skipped. + Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + tok = self.get_token() + if tok.type not in ["starttag", "endtag", "startendtag"]: + continue + if names: + if tok.data in names: + return tok + else: + return tok + + def get_text(self, endat=None): + """Get some text. + + endat: stop reading text at this tag (the tag is included in the + returned text); endtag is a tuple (type, name) where type is + "starttag", "endtag" or "startendtag", and name is the element name of + the tag (element names must be given in lower case) + + If endat is not given, .get_text() will stop at the next opening or + closing tag, or when there are no more tokens (no exception is raised). + Note that .get_text() includes the text representation (if any) of the + opening tag, but pushes the opening tag back onto the stack. As a + result, if you want to call .get_text() again, you need to call + .get_tag() first (unless you want an empty string returned when you + next call .get_text()). 
+ + Entity references are translated using the value of the entitydefs + constructor argument (a mapping from names to characters like that + provided by the standard module htmlentitydefs). Named entity + references that are not in this mapping are left unchanged. + + The textify attribute is used to translate opening tags into text: see + the class docstring. + + """ + text = [] + tok = None + while 1: + try: + tok = self.get_token() + except NoMoreTokensError: + # unget last token (not the one we just failed to get) + if tok: self.unget_token(tok) + break + if tok.type == "data": + text.append(tok.data) + elif tok.type == "entityref": + t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding) + text.append(t) + elif tok.type == "charref": + t = unescape_charref(tok.data, self.encoding) + text.append(t) + elif tok.type in ["starttag", "endtag", "startendtag"]: + tag_name = tok.data + if tok.type in ["starttag", "startendtag"]: + alt = self.textify.get(tag_name) + if alt is not None: + if callable(alt): + text.append(alt(tok)) + elif tok.attrs is not None: + for k, v in tok.attrs: + if k == alt: + text.append(v) + text.append("[%s]" % tag_name.upper()) + if endat is None or endat == (tok.type, tag_name): + self.unget_token(tok) + break + return "".join(text) + + def get_compressed_text(self, *args, **kwds): + """ + As .get_text(), but collapses each group of contiguous whitespace to a + single space character, and removes all initial and trailing + whitespace. 
+ + """ + text = self.get_text(*args, **kwds) + text = text.strip() + return self.compress_re.sub(" ", text) + + def handle_startendtag(self, tag, attrs): + self._tokenstack.append(Token("startendtag", tag, attrs)) + def handle_starttag(self, tag, attrs): + self._tokenstack.append(Token("starttag", tag, attrs)) + def handle_endtag(self, tag): + self._tokenstack.append(Token("endtag", tag)) + def handle_charref(self, name): + self._tokenstack.append(Token("charref", name)) + def handle_entityref(self, name): + self._tokenstack.append(Token("entityref", name)) + def handle_data(self, data): + self._tokenstack.append(Token("data", data)) + def handle_comment(self, data): + self._tokenstack.append(Token("comment", data)) + def handle_decl(self, decl): + self._tokenstack.append(Token("decl", decl)) + def unknown_decl(self, data): + # XXX should this call self.error instead? + #self.error("unknown declaration: " + `data`) + self._tokenstack.append(Token("decl", data)) + def handle_pi(self, data): + self._tokenstack.append(Token("pi", data)) + + def unescape_attr(self, name): + return unescape(name, self._entitydefs, self.encoding) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + escaped_attrs.append((key, self.unescape_attr(val))) + return escaped_attrs + +class PullParser(_AbstractParser, HTMLParser.HTMLParser): + def __init__(self, *args, **kwds): + HTMLParser.HTMLParser.__init__(self) + _AbstractParser.__init__(self, *args, **kwds) + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict stdlib HTMLParser."""

    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)


class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on SGMLParser, which is laxer about bad HTML."""

    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))


def _test():
    import doctest, _pullparser
    return doctest.testmod(_pullparser)

if __name__ == "__main__":
    _test()


class Request(_urllib2_fork.Request):
    """mechanize Request: the urllib2 fork's Request plus a visit flag,
    per-request timeout and a URI-cleanliness warning."""

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False, visit=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        # In mechanize 0.2, the interpretation of a unicode url argument will
        # change: A unicode url argument will be interpreted as an IRI, and a
        # bytestring as a URI.  For now, we accept unicode or bytestring.  We
        # don't insist that the value is always a URI (specifically, must
        # only contain characters which are legal), because that might break
        # working code (who knows what bytes some servers want to see,
        # especially with browser plugins for internationalised URIs).
        # NOTE(review): origin_req_host and unverifiable are accepted but not
        # forwarded to the base class -- confirm this is intentional.
        if not _rfc3986.is_clean_uri(url):
            warn("url argument is not a URI "
                 "(contains illegal characters) %r" % url)
        _urllib2_fork.Request.__init__(self, url, data, headers)
        self.selector = None
        self.visit = visit
        self.timeout = timeout

    def __str__(self):
        return "<Request for %s>" % self.get_full_url()
def len_of_seekable(file_):
    """Return the total length of a seekable file-like object.

    Exists because evaluating len(file_.getvalue()) on every .read() from
    seek_wrapper would be O(N**2) in the number of .read()s.  The current
    read position is preserved.
    """
    old_pos = file_.tell()
    file_.seek(0, 2)  # to end
    try:
        return file_.tell()
    finally:
        file_.seek(old_pos)
# For testing seek_wrapper invariant (note that
# test_urllib2.HandlerTest.test_seekable is expected to fail when this
# invariant checking is turned on).  The invariant checking is done by module
# ipdc, which is available here:
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
## from ipdbc import ContractBase
## class seek_wrapper(ContractBase):
class seek_wrapper:
    """Adds a seek method to a file object.

    This is only designed for seeking on readonly file-like objects.

    Wrapped file-like object must have a read method.  The readline method is
    only supported if that method is present on the wrapped object.  The
    readlines method is always supported.  xreadlines and iteration are
    supported only for Python 2.2 and above.

    Public attributes:

    wrapped: the wrapped file object
    is_closed: true iff .close() has been called

    WARNING: All other attributes of the wrapped object (ie. those that are
    not one of wrapped, read, readline, readlines, xreadlines, __iter__ and
    next) are passed through unaltered, which may or may not make sense for
    your particular file object.

    """
    # General strategy is to check that cache is full enough, then delegate to
    # the cache (self.__cache, which is a cStringIO.StringIO instance).  A
    # seek position (self.__pos) is maintained independently of the cache, in
    # order that a single cache may be shared between multiple seek_wrapper
    # objects.  Copying using module copy shares the cache in this way.

    def __init__(self, wrapped):
        self.wrapped = wrapped
        # one-element lists so that copies made via __copy__ share this state
        self.__read_complete_state = [False]
        self.__is_closed_state = [False]
        self.__have_readline = hasattr(self.wrapped, "readline")
        self.__cache = StringIO()
        self.__pos = 0  # seek position

    def invariant(self):
        # The end of the cache is always at the same place as the end of the
        # wrapped file (though the .tell() method is not required to be
        # present on wrapped file).
        return self.wrapped.tell() == len(self.__cache.getvalue())

    def close(self):
        self.wrapped.close()
        self.is_closed = True

    def __getattr__(self, name):
        # is_closed / read_complete live in the shared one-element lists
        if name == "is_closed":
            return self.__is_closed_state[0]
        elif name == "read_complete":
            return self.__read_complete_state[0]

        # delegate everything else to the wrapped object
        wrapped = self.__dict__.get("wrapped")
        if wrapped:
            return getattr(wrapped, name)

        return getattr(self.__class__, name)

    def __setattr__(self, name, value):
        if name == "is_closed":
            self.__is_closed_state[0] = bool(value)
        elif name == "read_complete":
            # once closed, read_complete updates are ignored
            if not self.is_closed:
                self.__read_complete_state[0] = bool(value)
        else:
            self.__dict__[name] = value

    def seek(self, offset, whence=0):
        assert whence in [0,1,2]

        # how much data, if any, do we need to read?
        if whence == 2:  # 2: relative to end of *wrapped* file
            if offset < 0: raise ValueError("negative seek offset")
            # since we don't know yet where the end of that file is, we must
            # read everything
            to_read = None
        else:
            if whence == 0:  # 0: absolute
                if offset < 0: raise ValueError("negative seek offset")
                dest = offset
            else:  # 1: relative to current position
                pos = self.__pos
                # NOTE(review): this guard looks inverted -- it rejects
                # forward relative seeks where offset > pos, and permits
                # dest < 0 for large negative offsets.  The intent was
                # presumably `if pos + offset < 0`.  Confirm against
                # upstream mechanize before changing.
                if pos < offset:
                    raise ValueError("seek to before start of file")
                dest = pos + offset
            end = len_of_seekable(self.__cache)
            to_read = dest - end
            if to_read < 0:
                to_read = 0

        if to_read != 0:
            self.__cache.seek(0, 2)
            if to_read is None:
                assert whence == 2
                self.__cache.write(self.wrapped.read())
                self.read_complete = True
                self.__pos = self.__cache.tell() - offset
            else:
                data = self.wrapped.read(to_read)
                if not data:
                    self.read_complete = True
                else:
                    self.__cache.write(data)
                # Don't raise an exception even if we've seek()ed past the end
                # of .wrapped, since fseek() doesn't complain in that case.
                # Also like fseek(), pretend we have seek()ed past the end,
                # i.e. not:
                #self.__pos = self.__cache.tell()
                # but rather:
                self.__pos = dest
        else:
            self.__pos = dest

    def tell(self):
        return self.__pos

    def __copy__(self):
        # copies share the cache and the closed/read-complete state lists,
        # but each copy keeps its own seek position
        cpy = self.__class__(self.wrapped)
        cpy.__cache = self.__cache
        cpy.__read_complete_state = self.__read_complete_state
        cpy.__is_closed_state = self.__is_closed_state
        return cpy

    def get_data(self):
        # read everything, restoring the current seek position afterwards
        pos = self.__pos
        try:
            self.seek(0)
            return self.read(-1)
        finally:
            self.__pos = pos

    def read(self, size=-1):
        pos = self.__pos
        end = len_of_seekable(self.__cache)
        available = end - pos

        # enough data already cached?
        if size <= available and size != -1:
            self.__cache.seek(pos)
            self.__pos = pos+size
            return self.__cache.read(size)

        # no, so read sufficient data from wrapped file and cache it
        self.__cache.seek(0, 2)
        if size == -1:
            self.__cache.write(self.wrapped.read())
            self.read_complete = True
        else:
            to_read = size - available
            assert to_read > 0
            data = self.wrapped.read(to_read)
            if not data:
                self.read_complete = True
            else:
                self.__cache.write(data)
        self.__cache.seek(pos)

        data = self.__cache.read(size)
        self.__pos = self.__cache.tell()
        assert self.__pos == pos + len(data)
        return data

    def readline(self, size=-1):
        if not self.__have_readline:
            raise NotImplementedError("no readline method on wrapped object")

        # line we're about to read might not be complete in the cache, so
        # read another line first
        pos = self.__pos
        self.__cache.seek(0, 2)
        data = self.wrapped.readline()
        if not data:
            self.read_complete = True
        else:
            self.__cache.write(data)
        self.__cache.seek(pos)

        data = self.__cache.readline()
        if size != -1:
            r = data[:size]
            self.__pos = pos+size
        else:
            r = data
            self.__pos = pos+len(data)
        return r

    def readlines(self, sizehint=-1):
        # readlines always consumes the whole wrapped file into the cache
        pos = self.__pos
        self.__cache.seek(0, 2)
        self.__cache.write(self.wrapped.read())
        self.read_complete = True
        self.__cache.seek(pos)
        data = self.__cache.readlines(sizehint)
        self.__pos = self.__cache.tell()
        return data

    def __iter__(self): return self
    def next(self):
        line = self.readline()
        if line == "": raise StopIteration
        return line

    xreadlines = __iter__

    def __repr__(self):
        return ("<%s at %s whose wrapped object = %r>" %
                (self.__class__.__name__, hex(abs(id(self))), self.wrapped))


class response_seek_wrapper(seek_wrapper):

    """
    Supports copying response objects and setting response body data.

    """

    def __init__(self, wrapped):
        seek_wrapper.__init__(self, wrapped)
        self._headers = self.wrapped.info()

    def __copy__(self):
        cpy = seek_wrapper.__copy__(self)
        # copy headers from delegate
        cpy._headers = copy.copy(self.info())
        return cpy

    # Note that .info() and .geturl() (the only two urllib2 response methods
    # that are not implemented by seek_wrapper) must be here explicitly rather
    # than by seek_wrapper's __getattr__ delegation) so that the nasty
    # dynamically-created HTTPError classes in get_seek_wrapper_class() get
    # the wrapped object's implementation, and not HTTPError's.

    def info(self):
        return self._headers

    def geturl(self):
        return self.wrapped.geturl()

    def set_data(self, data):
        # consume and close the wrapped file, then replace the shared cache
        # with one containing only the new data
        self.seek(0)
        self.read()
        self.close()
        cache = self._seek_wrapper__cache = StringIO()
        cache.write(data)
        self.seek(0)


class eoffile:
    # file-like object that always claims to be at end-of-file...
    def read(self, size=-1): return ""
    def readline(self, size=-1): return ""
    def __iter__(self): return self
    def next(self): return ""
    def close(self): pass

class eofresponse(eoffile):
    # end-of-file stand-in that also preserves the response metadata
    def __init__(self, url, headers, code, msg):
        self._url = url
        self._headers = headers
        self.code = code
        self.msg = msg
    def geturl(self): return self._url
    def info(self): return self._headers
class closeable_response:
    """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().

    Only supports responses returned by mechanize.HTTPHandler.

    After .close(), the following methods are supported:

    .read()
    .readline()
    .info()
    .geturl()
    .__iter__()
    .next()
    .close()

    and the following attributes are supported:

    .code
    .msg

    Also supports pickling (but the stdlib currently does something to prevent
    it: http://python.org/sf/1144636).

    """
    # presence of this attr indicates is useable after .close()
    closeable_response = None

    def __init__(self, fp, headers, url, code, msg):
        self._set_fp(fp)
        self._headers = headers
        self._url = url
        self.code = code
        self.msg = msg

    def _set_fp(self, fp):
        # rebind the delegating methods to the (possibly replaced) fp
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        self.__iter__ = self.fp.__iter__
        self.next = self.fp.next

    def __repr__(self):
        return '<%s at %s whose fp = %r>' % (
            self.__class__.__name__, hex(abs(id(self))), self.fp)

    def info(self):
        return self._headers

    def geturl(self):
        return self._url

    def close(self):
        # swap the real fp for an always-at-EOF stand-in that keeps the
        # response metadata (url, headers, code, msg) available
        wrapped = self.fp
        wrapped.close()
        new_wrapped = eofresponse(
            self._url, self._headers, self.code, self.msg)
        self._set_fp(new_wrapped)

    def __getstate__(self):
        # Options considered: 1. truncate  2. read to end  3. close socket,
        # pickle read position, reopen with Range header on unpickle.
        # 2 breaks pickle protocol (original object should be unscathed),
        # 3 is too complicated and surprising, so we do 1: pickle an
        # at-EOF stand-in.  (Pickling doesn't work anyway ATM because of
        # http://python.org/sf/1144636.)
        state = self.__dict__.copy()
        # NOTE(review): this stores the stand-in under key "wrapped", but
        # this class keeps its file object in .fp -- looks suspect; confirm
        # against upstream mechanize.
        new_wrapped = eofresponse(
            self._url, self._headers, self.code, self.msg)
        state["wrapped"] = new_wrapped
        return state


def test_response(data='test data', headers=[],
                  url="http://example.com/", code=200, msg="OK"):
    """Make a response fixture with the given body and headers."""
    return make_response(data, headers, url, code, msg)

def test_html_response(data='test data', headers=[],
                       url="http://example.com/", code=200, msg="OK"):
    """Make a text/html response fixture.

    Fixed: build a new list instead of `headers += [...]`.  The augmented
    assignment extended the list in place, so the shared default `[]` (and
    any caller-supplied list) grew an extra Content-type pair on every call.
    """
    headers = headers + [("Content-type", "text/html")]
    return make_response(data, headers, url, code, msg)

def make_response(data, headers, url, code, msg):
    """Convenient factory for objects implementing response interface.

    data: string containing response body data
    headers: sequence of (name, value) pairs
    url: URL of response
    code: integer response code (e.g. 200)
    msg: string response code message (e.g. "OK")

    """
    mime_headers = make_headers(headers)
    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
    return response_seek_wrapper(r)


def make_headers(headers):
    """Build a mimetools.Message from a sequence of (name, value) pairs."""
    hdr_text = []
    for name_value in headers:
        hdr_text.append("%s: %s" % name_value)
    return mimetools.Message(StringIO("\n".join(hdr_text)))
+ +def get_seek_wrapper_class(response): + # in order to wrap response objects that are also exceptions, we must + # dynamically subclass the exception :-((( + if (isinstance(response, urllib2.HTTPError) and + not hasattr(response, "seek")): + if response.__class__.__module__ == "__builtin__": + exc_class_name = response.__class__.__name__ + else: + exc_class_name = "%s.%s" % ( + response.__class__.__module__, response.__class__.__name__) + + class httperror_seek_wrapper(response_seek_wrapper, response.__class__): + # this only derives from HTTPError in order to be a subclass -- + # the HTTPError behaviour comes from delegation + + _exc_class_name = exc_class_name + + def __init__(self, wrapped): + response_seek_wrapper.__init__(self, wrapped) + # be compatible with undocumented HTTPError attributes :-( + self.hdrs = wrapped.info() + self.filename = wrapped.geturl() + + def __repr__(self): + return ( + "<%s (%s instance) at %s " + "whose wrapped object = %r>" % ( + self.__class__.__name__, self._exc_class_name, + hex(abs(id(self))), self.wrapped) + ) + wrapper_class = httperror_seek_wrapper + else: + wrapper_class = response_seek_wrapper + return wrapper_class + +def seek_wrapped_response(response): + """Return a copy of response that supports seekable response interface. + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both ordinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + if not hasattr(response, "seek"): + wrapper_class = get_seek_wrapper_class(response) + response = wrapper_class(response) + assert hasattr(response, "get_data") + return response + +def upgrade_response(response): + """Return a copy of response that supports Browser response interface. 
+ + Browser response interface is that of "seekable responses" + (response_seek_wrapper), plus the requirement that responses must be + useable after .close() (closeable_response). + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both ordinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + wrapper_class = get_seek_wrapper_class(response) + if hasattr(response, "closeable_response"): + if not hasattr(response, "seek"): + response = wrapper_class(response) + assert hasattr(response, "get_data") + return copy.copy(response) + + # a urllib2 handler constructed the response, i.e. the response is an + # urllib.addinfourl or a urllib2.HTTPError, instead of a + # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler + try: + code = response.code + except AttributeError: + code = None + try: + msg = response.msg + except AttributeError: + msg = None + + # may have already-.read() data from .seek() cache + data = None + get_data = getattr(response, "get_data", None) + if get_data: + data = get_data() + + response = closeable_response( + response.fp, response.info(), response.geturl(), code, msg) + response = wrapper_class(response) + if data: + response.set_data(data) + return response diff --git a/LTA/LTAIngest/mechanize/_rfc3986.py b/LTA/LTAIngest/mechanize/_rfc3986.py new file mode 100644 index 0000000000000000000000000000000000000000..91fcd197f4ec173ef00c5e05555c7936d7fbc4ed --- /dev/null +++ b/LTA/LTAIngest/mechanize/_rfc3986.py @@ -0,0 +1,245 @@ +"""RFC 3986 URI parsing and relative reference resolution / absolutization. + +(aka splitting and joining) + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +# XXX Wow, this is ugly. 
Overly-direct translation of the RFC ATM. + +import re, urllib + +## def chr_range(a, b): +## return "".join(map(chr, range(ord(a), ord(b)+1))) + +## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +## "abcdefghijklmnopqrstuvwxyz" +## "0123456789" +## "-_.~") +## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" +## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' +# this re matches any character that's not in URI_CHARS +BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]") + + +def clean_url(url, encoding): + # percent-encode illegal URI characters + # Trying to come up with test cases for this gave me a headache, revisit + # when do switch to unicode. + # Somebody else's comments (lost the attribution): +## - IE will return you the url in the encoding you send it +## - Mozilla/Firefox will send you latin-1 if there's no non latin-1 +## characters in your link. It will send you utf-8 however if there are... + if type(url) == type(""): + url = url.decode(encoding, "replace") + url = url.strip() + # for second param to urllib.quote(), we want URI_CHARS, minus the + # 'always_safe' characters that urllib.quote() never percent-encodes + return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~") + +def is_clean_uri(uri): + """ + >>> is_clean_uri("ABC!") + True + >>> is_clean_uri(u"ABC!") + True + >>> is_clean_uri("ABC|") + False + >>> is_clean_uri(u"ABC|") + False + >>> is_clean_uri("http://example.com/0") + True + >>> is_clean_uri(u"http://example.com/0") + True + """ + # note module re treats bytestrings as through they were decoded as latin-1 + # so this function accepts both unicode and bytestrings + return not bool(BAD_URI_CHARS_RE.search(uri)) + + +SPLIT_MATCH = re.compile( + r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match +def urlsplit(absolute_uri): + """Return scheme, authority, path, query, fragment.""" + match = SPLIT_MATCH(absolute_uri) + if match: + g = match.groups() + return g[1], g[3], g[4], g[6], g[8] + 
+def urlunsplit(parts): + scheme, authority, path, query, fragment = parts + r = [] + append = r.append + if scheme is not None: + append(scheme) + append(":") + if authority is not None: + append("//") + append(authority) + append(path) + if query is not None: + append("?") + append(query) + if fragment is not None: + append("#") + append(fragment) + return "".join(r) + +def urljoin(base_uri, uri_reference): + """Join a base URI with a URI reference and return the resulting URI. + + See RFC 3986. + """ + return urlunsplit(urljoin_parts(urlsplit(base_uri), + urlsplit(uri_reference))) + +# oops, this doesn't do the same thing as the literal translation +# from the RFC below +## import posixpath +## def urljoin_parts(base_parts, reference_parts): +## scheme, authority, path, query, fragment = base_parts +## rscheme, rauthority, rpath, rquery, rfragment = reference_parts + +## # compute target URI path +## if rpath == "": +## tpath = path +## else: +## tpath = rpath +## if not tpath.startswith("/"): +## tpath = merge(authority, path, tpath) +## tpath = posixpath.normpath(tpath) + +## if rscheme is not None: +## return (rscheme, rauthority, tpath, rquery, rfragment) +## elif rauthority is not None: +## return (scheme, rauthority, tpath, rquery, rfragment) +## elif rpath == "": +## if rquery is not None: +## tquery = rquery +## else: +## tquery = query +## return (scheme, authority, tpath, tquery, rfragment) +## else: +## return (scheme, authority, tpath, rquery, rfragment) + +def urljoin_parts(base_parts, reference_parts): + scheme, authority, path, query, fragment = base_parts + rscheme, rauthority, rpath, rquery, rfragment = reference_parts + + if rscheme == scheme: + rscheme = None + + if rscheme is not None: + tscheme, tauthority, tpath, tquery = ( + rscheme, rauthority, remove_dot_segments(rpath), rquery) + else: + if rauthority is not None: + tauthority, tpath, tquery = ( + rauthority, remove_dot_segments(rpath), rquery) + else: + if rpath == "": + tpath = path + 
if rquery is not None: + tquery = rquery + else: + tquery = query + else: + if rpath.startswith("/"): + tpath = remove_dot_segments(rpath) + else: + tpath = merge(authority, path, rpath) + tpath = remove_dot_segments(tpath) + tquery = rquery + tauthority = authority + tscheme = scheme + tfragment = rfragment + return (tscheme, tauthority, tpath, tquery, tfragment) + +# um, something *vaguely* like this is what I want, but I have to generate +# lots of test cases first, if only to understand what it is that +# remove_dot_segments really does... +## def remove_dot_segments(path): +## if path == '': +## return '' +## comps = path.split('/') +## new_comps = [] +## for comp in comps: +## if comp in ['.', '']: +## if not new_comps or new_comps[-1]: +## new_comps.append('') +## continue +## if comp != '..': +## new_comps.append(comp) +## elif new_comps: +## new_comps.pop() +## return '/'.join(new_comps) + + +def remove_dot_segments(path): + r = [] + while path: + # A + if path.startswith("../"): + path = path[3:] + continue + if path.startswith("./"): + path = path[2:] + continue + # B + if path.startswith("/./"): + path = path[2:] + continue + if path == "/.": + path = "/" + continue + # C + if path.startswith("/../"): + path = path[3:] + if r: + r.pop() + continue + if path == "/..": + path = "/" + if r: + r.pop() + continue + # D + if path == ".": + path = path[1:] + continue + if path == "..": + path = path[2:] + continue + # E + start = 0 + if path.startswith("/"): + start = 1 + ii = path.find("/", start) + if ii < 0: + ii = None + r.append(path[:ii]) + if ii is None: + break + path = path[ii:] + return "".join(r) + +def merge(base_authority, base_path, ref_path): + # XXXX Oddly, the sample Perl implementation of this by Roy Fielding + # doesn't even take base_authority as a parameter, despite the wording in + # the RFC suggesting otherwise. Perhaps I'm missing some obvious identity. 
+ #if base_authority is not None and base_path == "": + if base_path == "": + return "/" + ref_path + ii = base_path.rfind("/") + if ii >= 0: + return base_path[:ii+1] + ref_path + return ref_path + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/LTA/LTAIngest/mechanize/_sgmllib_copy.py b/LTA/LTAIngest/mechanize/_sgmllib_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..a545d25eb9dda19ef70f032f437015870856cd54 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_sgmllib_copy.py @@ -0,0 +1,559 @@ +# Taken from Python 2.6.4 and regexp module constants modified +"""A parser for SGML, using the derived class as a static DTD.""" + +# XXX This only supports those SGML features used by HTML. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). RCDATA is +# not supported at all. 
+ + +# from warnings import warnpy3k +# warnpy3k("the sgmllib module has been removed in Python 3.0", +# stacklevel=2) +# del warnpy3k + +import markupbase +import re + +__all__ = ["SGMLParser", "SGMLParseError"] + +# Regular expressions used for parsing + +interesting = re.compile('[&<]') +incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|' + '/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +# hack to fix http://bugs.python.org/issue803422 +# charref = re.compile('&#([0-9]+)[^0-9]') +charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") + +starttagopen = re.compile('<[>a-zA-Z]') +shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') +shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') +piclose = re.compile('>') +endbracket = re.compile('[<>]') +# hack moved from _beautifulsoup.py (bundled BeautifulSoup version 2) +#This code makes Beautiful Soup able to parse XML with namespaces +# tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') +tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +attrfind = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + + +class SGMLParseError(RuntimeError): + """Exception raised for all parse errors.""" + pass + + +# SGML parser base class -- find tags and call handler functions. +# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). +# The dtd is defined by deriving a class which defines methods +# with special names to handle tags: start_foo and end_foo to handle +# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. +# (Tags are converted to lower case for this purpose.) The data +# between tags is passed to the parser by calling self.handle_data() +# with some data as argument (the data may be split up in arbitrary +# chunks). Entity references are passed by calling +# self.handle_entityref() with the entity reference as argument. 
+ +class SGMLParser(markupbase.ParserBase): + # Definition of entities -- derived classes may override + entity_or_charref = re.compile('&(?:' + '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' + ')(;?)') + + def __init__(self, verbose=0): + """Initialize and reset this instance.""" + self.verbose = verbose + self.reset() + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.__starttag_text = None + self.rawdata = '' + self.stack = [] + self.lasttag = '???' + self.nomoretags = 0 + self.literal = 0 + markupbase.ParserBase.reset(self) + + def setnomoretags(self): + """Enter literal mode (CDATA) till EOF. + + Intended for derived classes only. + """ + self.nomoretags = self.literal = 1 + + def setliteral(self, *args): + """Enter literal mode (CDATA). + + Intended for derived classes only. + """ + self.literal = 1 + + def feed(self, data): + """Feed some data to the parser. + + Call this as often as you want, with as little or as much text + as you want (may include '\n'). (This just saves the text, + all the processing is done by goahead().) + """ + + self.rawdata = self.rawdata + data + self.goahead(0) + + def close(self): + """Handle the remaining data.""" + self.goahead(1) + + def error(self, message): + raise SGMLParseError(message) + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. 
+ def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.nomoretags: + self.handle_data(rawdata[i:n]) + i = n + break + match = interesting.search(rawdata, i) + if match: j = match.start() + else: j = n + if i < j: + self.handle_data(rawdata[i:j]) + i = j + if i == n: break + if rawdata[i] == '<': + if starttagopen.match(rawdata, i): + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_starttag(i) + if k < 0: break + i = k + continue + if rawdata.startswith("</", i): + k = self.parse_endtag(i) + if k < 0: break + i = k + self.literal = 0 + continue + if self.literal: + if n > (i + 1): + self.handle_data("<") + i = i+1 + else: + # incomplete + break + continue + if rawdata.startswith("<!--", i): + # Strictly speaking, a comment is --.*-- + # within a declaration tag <!...>. + # This should be removed, + # and comments handled only in parse_declaration. + k = self.parse_comment(i) + if k < 0: break + i = k + continue + if rawdata.startswith("<?", i): + k = self.parse_pi(i) + if k < 0: break + i = i+k + continue + if rawdata.startswith("<!", i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). 
+ k = self.parse_declaration(i) + if k < 0: break + i = k + continue + elif rawdata[i] == '&': + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + match = charref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_charref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + match = entityref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_entityref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + else: + self.error('neither < nor & ??') + # We get here only if incomplete matches but + # nothing else + match = incomplete.match(rawdata, i) + if not match: + self.handle_data(rawdata[i]) + i = i+1 + continue + j = match.end(0) + if j == n: + break # Really incomplete + self.handle_data(rawdata[i:j]) + i = j + # end while + if end and i < n: + self.handle_data(rawdata[i:n]) + i = n + self.rawdata = rawdata[i:] + # XXX if end: check for empty stack + + # Extensions for the DOCTYPE scanner: + _decl_otherchars = '=' + + # Internal -- parse processing instr, return length or -1 if not terminated + def parse_pi(self, i): + rawdata = self.rawdata + if rawdata[i:i+2] != '<?': + self.error('unexpected call to parse_pi()') + match = piclose.search(rawdata, i+2) + if not match: + return -1 + j = match.start(0) + self.handle_pi(rawdata[i+2: j]) + j = match.end(0) + return j-i + + def get_starttag_text(self): + return self.__starttag_text + + # Internal -- handle starttag, return length or -1 if not terminated + def parse_starttag(self, i): + self.__starttag_text = None + start_pos = i + rawdata = self.rawdata + if shorttagopen.match(rawdata, i): + # SGML shorthand: <tag/data/ == <tag>data</tag> + # XXX Can data contain &... (entity or char refs)? + # XXX Can data contain < or > (tag characters)? + # XXX Can there be whitespace before the first /? 
+ match = shorttag.match(rawdata, i) + if not match: + return -1 + tag, data = match.group(1, 2) + self.__starttag_text = '<%s/' % tag + tag = tag.lower() + k = match.end(0) + self.finish_shorttag(tag, data) + self.__starttag_text = rawdata[start_pos:match.end(1) + 1] + return k + # XXX The following should skip matching quotes (' or ") + # As a shortcut way to exit, this isn't so bad, but shouldn't + # be used to locate the actual end of the start tag since the + # < or > characters may be embedded in an attribute value. + match = endbracket.search(rawdata, i+1) + if not match: + return -1 + j = match.start(0) + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + if rawdata[i:i+2] == '<>': + # SGML shorthand: <> == <last open tag seen> + k = j + tag = self.lasttag + else: + match = tagfind.match(rawdata, i+1) + if not match: + self.error('unexpected call to parse_starttag') + k = match.end(0) + tag = rawdata[i+1:k].lower() + self.lasttag = tag + while k < j: + match = attrfind.match(rawdata, k) + if not match: break + attrname, rest, attrvalue = match.group(1, 2, 3) + if not rest: + attrvalue = attrname + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + attrvalue = self.entity_or_charref.sub( + self._convert_ref, attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = match.end(0) + if rawdata[j] == '>': + j = j+1 + self.__starttag_text = rawdata[start_pos:j] + self.finish_starttag(tag, attrs) + return j + + # Internal -- convert entity or character reference + def _convert_ref(self, match): + if match.group(2): + return self.convert_charref(match.group(2)) or \ + '&#%s%s' % match.groups()[1:] + elif match.group(3): + return self.convert_entityref(match.group(1)) or \ + '&%s;' % match.group(1) + else: + return '&%s' % match.group(1) + + # Internal -- parse endtag + def parse_endtag(self, i): + rawdata = self.rawdata + match = 
endbracket.search(rawdata, i+1) + if not match: + return -1 + j = match.start(0) + tag = rawdata[i+2:j].strip().lower() + if rawdata[j] == '>': + j = j+1 + self.finish_endtag(tag) + return j + + # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) + def finish_shorttag(self, tag, data): + self.finish_starttag(tag, []) + self.handle_data(data) + self.finish_endtag(tag) + + # Internal -- finish processing of start tag + # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag + def finish_starttag(self, tag, attrs): + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + self.unknown_starttag(tag, attrs) + return -1 + else: + self.handle_starttag(tag, method, attrs) + return 0 + else: + self.stack.append(tag) + self.handle_starttag(tag, method, attrs) + return 1 + + # Internal -- finish processing of end tag + def finish_endtag(self, tag): + if not tag: + found = len(self.stack) - 1 + if found < 0: + self.unknown_endtag(tag) + return + else: + if tag not in self.stack: + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + self.unknown_endtag(tag) + else: + self.report_unbalanced(tag) + return + found = len(self.stack) + for i in range(found): + if self.stack[i] == tag: found = i + while len(self.stack) > found: + tag = self.stack[-1] + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + method = None + if method: + self.handle_endtag(tag, method) + else: + self.unknown_endtag(tag) + del self.stack[-1] + + # Overridable -- handle start tag + def handle_starttag(self, tag, method, attrs): + method(attrs) + + # Overridable -- handle end tag + def handle_endtag(self, tag, method): + method() + + # Example -- report an unbalanced </...> tag. 
+ def report_unbalanced(self, tag): + if self.verbose: + print '*** Unbalanced </' + tag + '>' + print '*** Stack:', self.stack + + def convert_charref(self, name): + """Convert character reference, may be overridden.""" + try: + n = int(name) + except ValueError: + return + if not 0 <= n <= 127: + return + return self.convert_codepoint(n) + + def convert_codepoint(self, codepoint): + return chr(codepoint) + + def handle_charref(self, name): + """Handle character reference, no need to override.""" + replacement = self.convert_charref(name) + if replacement is None: + self.unknown_charref(name) + else: + self.handle_data(replacement) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + def convert_entityref(self, name): + """Convert entity references. + + As an alternative to overriding this method; one can tailor the + results by setting up the self.entitydefs mapping appropriately. + """ + table = self.entitydefs + if name in table: + return table[name] + else: + return + + def handle_entityref(self, name): + """Handle entity references, no need to override.""" + replacement = self.convert_entityref(name) + if replacement is None: + self.unknown_entityref(name) + else: + self.handle_data(replacement) + + # Example -- handle data, should be overridden + def handle_data(self, data): + pass + + # Example -- handle comment, could be overridden + def handle_comment(self, data): + pass + + # Example -- handle declaration, could be overridden + def handle_decl(self, decl): + pass + + # Example -- handle processing instruction, could be overridden + def handle_pi(self, data): + pass + + # To be overridden -- handlers for unknown objects + def unknown_starttag(self, tag, attrs): pass + def unknown_endtag(self, tag): pass + def unknown_charref(self, ref): pass + def unknown_entityref(self, ref): pass + + +class TestSGMLParser(SGMLParser): + + def __init__(self, verbose=0): + 
self.testdata = "" + SGMLParser.__init__(self, verbose) + + def handle_data(self, data): + self.testdata = self.testdata + data + if len(repr(self.testdata)) >= 70: + self.flush() + + def flush(self): + data = self.testdata + if data: + self.testdata = "" + print 'data:', repr(data) + + def handle_comment(self, data): + self.flush() + r = repr(data) + if len(r) > 68: + r = r[:32] + '...' + r[-32:] + print 'comment:', r + + def unknown_starttag(self, tag, attrs): + self.flush() + if not attrs: + print 'start tag: <' + tag + '>' + else: + print 'start tag: <' + tag, + for name, value in attrs: + print name + '=' + '"' + value + '"', + print '>' + + def unknown_endtag(self, tag): + self.flush() + print 'end tag: </' + tag + '>' + + def unknown_entityref(self, ref): + self.flush() + print '*** unknown entity ref: &' + ref + ';' + + def unknown_charref(self, ref): + self.flush() + print '*** unknown char ref: &#' + ref + ';' + + def unknown_decl(self, data): + self.flush() + print '*** unknown decl: [' + data + ']' + + def close(self): + SGMLParser.close(self) + self.flush() + + +def test(args = None): + import sys + + if args is None: + args = sys.argv[1:] + + if args and args[0] == '-s': + args = args[1:] + klass = SGMLParser + else: + klass = TestSGMLParser + + if args: + file = args[0] + else: + file = 'test.html' + + if file == '-': + f = sys.stdin + else: + try: + f = open(file, 'r') + except IOError, msg: + print file, ":", msg + sys.exit(1) + + data = f.read() + if f is not sys.stdin: + f.close() + + x = klass() + for c in data: + x.feed(c) + x.close() + + +if __name__ == '__main__': + test() diff --git a/LTA/LTAIngest/mechanize/_sockettimeout.py b/LTA/LTAIngest/mechanize/_sockettimeout.py new file mode 100644 index 0000000000000000000000000000000000000000..c22b7346a05f966d3f71eb27e5211393a302dbe6 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_sockettimeout.py @@ -0,0 +1,6 @@ +import socket + +try: + _GLOBAL_DEFAULT_TIMEOUT = socket._GLOBAL_DEFAULT_TIMEOUT +except 
AttributeError: + _GLOBAL_DEFAULT_TIMEOUT = object() diff --git a/LTA/LTAIngest/mechanize/_testcase.py b/LTA/LTAIngest/mechanize/_testcase.py new file mode 100644 index 0000000000000000000000000000000000000000..f372760ef9ee72558e270e9112bc3b228cc2384a --- /dev/null +++ b/LTA/LTAIngest/mechanize/_testcase.py @@ -0,0 +1,162 @@ +import os +import shutil +import subprocess +import tempfile +import unittest + + +class SetupStack(object): + + def __init__(self): + self._on_teardown = [] + + def add_teardown(self, teardown): + self._on_teardown.append(teardown) + + def tear_down(self): + for func in reversed(self._on_teardown): + func() + + +class TearDownConvenience(object): + + def __init__(self, setup_stack=None): + self._own_setup_stack = setup_stack is None + if setup_stack is None: + setup_stack = SetupStack() + self._setup_stack = setup_stack + + # only call this convenience method if no setup_stack was supplied to c'tor + def tear_down(self): + assert self._own_setup_stack + self._setup_stack.tear_down() + + +class TempDirMaker(TearDownConvenience): + + def make_temp_dir(self, dir_=None): + temp_dir = tempfile.mkdtemp(prefix="tmp-%s-" % self.__class__.__name__, + dir=dir_) + def tear_down(): + shutil.rmtree(temp_dir) + self._setup_stack.add_teardown(tear_down) + return temp_dir + + +class MonkeyPatcher(TearDownConvenience): + + Unset = object() + + def monkey_patch(self, obj, name, value): + orig_value = getattr(obj, name) + setattr(obj, name, value) + def reverse_patch(): + setattr(obj, name, orig_value) + self._setup_stack.add_teardown(reverse_patch) + + def _set_environ(self, env, name, value): + if value is self.Unset: + try: + del env[name] + except KeyError: + pass + else: + env[name] = value + + def monkey_patch_environ(self, name, value, env=os.environ): + orig_value = env.get(name, self.Unset) + self._set_environ(env, name, value) + def reverse_patch(): + self._set_environ(env, name, orig_value) + self._setup_stack.add_teardown(reverse_patch) + + +class 
FixtureFactory(object): + + def __init__(self): + self._setup_stack = SetupStack() + self._context_managers = {} + self._fixtures = {} + + def register_context_manager(self, name, context_manager): + self._context_managers[name] = context_manager + + def get_fixture(self, name, add_teardown): + context_manager = self._context_managers[name] + fixture = context_manager.__enter__() + add_teardown(lambda: context_manager.__exit__(None, None, None)) + return fixture + + def get_cached_fixture(self, name): + fixture = self._fixtures.get(name) + if fixture is None: + fixture = self.get_fixture(name, self._setup_stack.add_teardown) + self._fixtures[name] = fixture + return fixture + + def tear_down(self): + self._setup_stack.tear_down() + + +class TestCase(unittest.TestCase): + + def setUp(self): + self._setup_stack = SetupStack() + self._monkey_patcher = MonkeyPatcher(self._setup_stack) + + def tearDown(self): + self._setup_stack.tear_down() + + def register_context_manager(self, name, context_manager): + return self.fixture_factory.register_context_manager( + name, context_manager) + + def get_fixture(self, name): + return self.fixture_factory.get_fixture(name, self.add_teardown) + + def get_cached_fixture(self, name): + return self.fixture_factory.get_cached_fixture(name) + + def add_teardown(self, *args, **kwds): + self._setup_stack.add_teardown(*args, **kwds) + + def make_temp_dir(self, *args, **kwds): + return TempDirMaker(self._setup_stack).make_temp_dir(*args, **kwds) + + def monkey_patch(self, *args, **kwds): + return self._monkey_patcher.monkey_patch(*args, **kwds) + + def monkey_patch_environ(self, *args, **kwds): + return self._monkey_patcher.monkey_patch_environ(*args, **kwds) + + def assert_contains(self, container, containee): + self.assertTrue(containee in container, "%r not in %r" % + (containee, container)) + + def assert_less_than(self, got, expected): + self.assertTrue(got < expected, "%r >= %r" % + (got, expected)) + + +# 
http://lackingrhoticity.blogspot.com/2009/01/testing-using-golden-files-in-python.html + +class GoldenTestCase(TestCase): + + run_meld = False + + def assert_golden(self, dir_got, dir_expect): + assert os.path.exists(dir_expect), dir_expect + proc = subprocess.Popen(["diff", "--recursive", "-u", "-N", + "--exclude=.*", dir_expect, dir_got], + stdout=subprocess.PIPE) + stdout, stderr = proc.communicate() + if len(stdout) > 0: + if self.run_meld: + # Put expected output on the right because that is the + # side we usually edit. + subprocess.call(["meld", dir_got, dir_expect]) + raise AssertionError( + "Differences from golden files found.\n" + "Try running with --meld to update golden files.\n" + "%s" % stdout) + self.assertEquals(proc.wait(), 0) diff --git a/LTA/LTAIngest/mechanize/_urllib2.py b/LTA/LTAIngest/mechanize/_urllib2.py new file mode 100644 index 0000000000000000000000000000000000000000..29b7038dd49d33c82fba2685283f9ba044da1e37 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_urllib2.py @@ -0,0 +1,50 @@ +# urllib2 work-alike interface +# ...from urllib2... 
+from urllib2 import \ + URLError, \ + HTTPError +# ...and from mechanize +from _auth import \ + HTTPProxyPasswordMgr, \ + HTTPSClientCertMgr +from _debug import \ + HTTPResponseDebugProcessor, \ + HTTPRedirectDebugProcessor +# crap ATM +## from _gzip import \ +## HTTPGzipProcessor +from _urllib2_fork import \ + AbstractBasicAuthHandler, \ + AbstractDigestAuthHandler, \ + BaseHandler, \ + CacheFTPHandler, \ + FileHandler, \ + FTPHandler, \ + HTTPBasicAuthHandler, \ + HTTPCookieProcessor, \ + HTTPDefaultErrorHandler, \ + HTTPDigestAuthHandler, \ + HTTPErrorProcessor, \ + HTTPHandler, \ + HTTPPasswordMgr, \ + HTTPPasswordMgrWithDefaultRealm, \ + HTTPRedirectHandler, \ + ProxyBasicAuthHandler, \ + ProxyDigestAuthHandler, \ + ProxyHandler, \ + UnknownHandler +from _http import \ + HTTPEquivProcessor, \ + HTTPRefererProcessor, \ + HTTPRefreshProcessor, \ + HTTPRobotRulesProcessor, \ + RobotExclusionError +import httplib +if hasattr(httplib, 'HTTPS'): + from _urllib2_fork import HTTPSHandler +del httplib +from _opener import OpenerDirector, \ + SeekableResponseOpener, \ + build_opener, install_opener, urlopen +from _request import \ + Request diff --git a/LTA/LTAIngest/mechanize/_urllib2_fork.py b/LTA/LTAIngest/mechanize/_urllib2_fork.py new file mode 100644 index 0000000000000000000000000000000000000000..d0cfe382f80ce8deb96f071d9a79a2f6e9482a07 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_urllib2_fork.py @@ -0,0 +1,1414 @@ +"""Fork of urllib2. + +When reading this, don't assume that all code in here is reachable. Code in +the rest of mechanize may be used instead. + +Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Python +Software Foundation; All Rights Reserved + +Copyright 2002-2009 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +# XXX issues: +# If an authentication error handler that tries to perform +# authentication for some reason but fails, how should the error be +# signalled? The client needs to know the HTTP error code. But if +# the handler knows that the problem was, e.g., that it didn't know +# that hash algo that requested in the challenge, it would be good to +# pass that information along to the client, too. +# ftp errors aren't handled cleanly +# check digest against correct (i.e. non-apache) implementation + +# Possible extensions: +# complex proxies XXX not sure what exactly was meant by this +# abstract factory for opener + +import copy +import base64 +import httplib +import mimetools +import logging +import os +import posixpath +import random +import re +import socket +import sys +import time +import urllib +import urlparse +import bisect + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +try: + import hashlib +except ImportError: + # python 2.4 + import md5 + import sha + def sha1_digest(bytes): + return sha.new(bytes).hexdigest() + def md5_digest(bytes): + return md5.new(bytes).hexdigest() +else: + def sha1_digest(bytes): + return hashlib.sha1(bytes).hexdigest() + def md5_digest(bytes): + return hashlib.md5(bytes).hexdigest() + + +try: + socket._fileobject("fake socket", close=True) +except TypeError: + # python <= 2.4 + create_readline_wrapper = socket._fileobject +else: + def create_readline_wrapper(fh): + return socket._fileobject(fh, close=True) + + +# python 2.4 splithost has a bug in empty path component case +_hostprog = None +def splithost(url): + """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" + global _hostprog + if _hostprog is None: + import re + _hostprog = re.compile('^//([^/?]*)(.*)$') + + match = _hostprog.match(url) + if match: return match.group(1, 2) + return None, url + + +from urllib import (unwrap, unquote, splittype, quote, + addinfourl, splitport, + splitattr, ftpwrapper, 
splituser, splitpasswd, splitvalue) + +# support for FileHandler, proxies via environment variables +from urllib import localhost, url2pathname, getproxies + +from urllib2 import HTTPError, URLError + +import _request +import _rfc3986 +import _sockettimeout + +from _clientcookie import CookieJar +from _response import closeable_response + + +# used in User-Agent header sent +__version__ = sys.version[:3] + +_opener = None +def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + _opener = build_opener() + return _opener.open(url, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener + +# copied from cookielib.py +_cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = urlparse.urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = _cut_port_re.sub("", host, 1) + return host.lower() + +class Request: + + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False): + # unwrap('<URL:type://host/path>') --> 'type://host/path' + self.__original = unwrap(url) + self.type = None + # self.__r_type is what's left after doing the splittype + self.host = None + self.port = None + self._tunnel_host = None + self.data = data + self.headers = {} + for key, value in headers.items(): + self.add_header(key, value) + self.unredirected_hdrs = {} + if origin_req_host is None: + origin_req_host = request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable + + def __getattr__(self, attr): + # XXX this is a fallback mechanism to guard against these + # methods getting called in a non-standard order. this may be + # too complicated and/or unnecessary. 
+ # XXX should the __r_XXX attributes be public? + if attr[:12] == '_Request__r_': + name = attr[12:] + if hasattr(Request, 'get_' + name): + getattr(self, 'get_' + name)() + return getattr(self, attr) + raise AttributeError, attr + + def get_method(self): + if self.has_data(): + return "POST" + else: + return "GET" + + # XXX these helper methods are lame + + def add_data(self, data): + self.data = data + + def has_data(self): + return self.data is not None + + def get_data(self): + return self.data + + def get_full_url(self): + return self.__original + + def get_type(self): + if self.type is None: + self.type, self.__r_type = splittype(self.__original) + if self.type is None: + raise ValueError, "unknown url type: %s" % self.__original + return self.type + + def get_host(self): + if self.host is None: + self.host, self.__r_host = splithost(self.__r_type) + if self.host: + self.host = unquote(self.host) + return self.host + + def get_selector(self): + scheme, authority, path, query, fragment = _rfc3986.urlsplit( + self.__r_host) + if path == "": + path = "/" # RFC 2616, section 3.2.2 + fragment = None # RFC 3986, section 3.5 + return _rfc3986.urlunsplit([scheme, authority, path, query, fragment]) + + def set_proxy(self, host, type): + orig_host = self.get_host() + if self.get_type() == 'https' and not self._tunnel_host: + self._tunnel_host = orig_host + else: + self.type = type + self.__r_host = self.__original + + self.host = host + + def has_proxy(self): + """Private method.""" + # has non-HTTPS proxy + return self.__r_host == self.__original + + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + + def add_header(self, key, val): + # useful for something like authentication + self.headers[key.capitalize()] = val + + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + 
return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return hdrs.items() + +class OpenerDirector: + def __init__(self): + client_version = "Python-urllib/%s" % __version__ + self.addheaders = [('User-agent', client_version)] + # manage the individual handlers + self.handlers = [] + self.handle_open = {} + self.handle_error = {} + self.process_response = {} + self.process_request = {} + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = self.handle_error.get(protocol, {}) + self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = self.handle_open + elif condition == "response": + kind = protocol + lookup = self.process_response + elif condition == "request": + kind = protocol + lookup = self.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + + if added: + # the handlers must work in an specific order, the order + # is specified in a Handler attribute + bisect.insort(self.handlers, handler) + handler.add_parent(self) + + def close(self): + # Only exists for backwards compatibility. 
+ pass + + def _call_chain(self, chain, kind, meth_name, *args): + # Handlers raise an exception if no one else should try to handle + # the request, or return None if they can't but another handler + # could. Otherwise, they return the response. + handlers = chain.get(kind, ()) + for handler in handlers: + func = getattr(handler, meth_name) + + result = func(*args) + if result is not None: + return result + + def _open(self, req, data=None): + result = self._call_chain(self.handle_open, 'default', + 'default_open', req) + if result: + return result + + protocol = req.get_type() + result = self._call_chain(self.handle_open, protocol, protocol + + '_open', req) + if result: + return result + + return self._call_chain(self.handle_open, 'unknown', + 'unknown_open', req) + + def error(self, proto, *args): + if proto in ('http', 'https'): + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = self._call_chain(*args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return self._call_chain(*args) + +# XXX probably also want an abstract factory that knows when it makes +# sense to skip a superclass in favor of a subclass and when it might +# make sense to include both + +def build_opener(*handlers): + """Create an opener object from a list of handlers. + + The opener will use several default handlers, including support + for HTTP, FTP and when applicable, HTTPS. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ """ + import types + def isclass(obj): + return isinstance(obj, (types.ClassType, type)) + + opener = OpenerDirector() + default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, + HTTPDefaultErrorHandler, HTTPRedirectHandler, + FTPHandler, FileHandler, HTTPErrorProcessor] + if hasattr(httplib, 'HTTPS'): + default_classes.append(HTTPSHandler) + skip = set() + for klass in default_classes: + for check in handlers: + if isclass(check): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + + for h in handlers: + if isclass(h): + h = h() + opener.add_handler(h) + return opener + +class BaseHandler: + handler_order = 500 + + def add_parent(self, parent): + self.parent = parent + + def close(self): + # Only exists for backwards compatibility + pass + + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). + return True + return self.handler_order < other.handler_order + + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses. + + The purpose of this handler is to to allow other response processors a + look-in by removing the call to parent.error() from + AbstractHTTPHandler. + + For non-2xx error codes, this just passes the job on to the + Handler.<proto>_error_<code> methods, via the OpenerDirector.error method. + Eventually, HTTPDefaultErrorHandler will raise an HTTPError if no other + handler handles the error. 
+ + """ + handler_order = 1000 # after all other processors + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. + if not (200 <= code < 300): + # hardcoded http is NOT a bug + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + # why these error methods took the code, msg, headers args in the first + # place rather than a response object, I don't know, but to avoid + # multiple wrapping, we're discarding them + + if isinstance(fp, HTTPError): + response = fp + else: + response = HTTPError( + req.get_full_url(), code, msg, hdrs, fp) + assert code == response.code + assert msg == response.msg + assert hdrs == response.hdrs + raise response + +class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop + max_redirections = 10 + + # Implementation notes: + + # To avoid the server sending us into an infinite loop, the request + # object needs to track what URLs we have already seen. Do this by + # adding a handler-specific attribute to the Request object. The value + # of the dict is used to count the number of times the same URL has + # been visited. This is needed because visiting the same URL twice + # does not necessarily imply a loop, thanks to state introduced by + # cookies. + + # Always unhandled redirection codes: + # 300 Multiple Choices: should not handle this here. 
+ # 304 Not Modified: no need to handle here: only of interest to caches + # that do conditional GETs + # 305 Use Proxy: probably not worth dealing with here + # 306 Unused: what was this for in the previous versions of protocol?? + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (code in (301, 302, 303, 307, "refresh") and m in ("GET", "HEAD") + or code in (301, 302, 303, "refresh") and m == "POST"): + # Strictly (according to RFC 2616), 301 or 302 in response + # to a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib2, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + # TODO: really refresh redirections should be visiting; tricky to fix + new = _request.Request( + newurl, + headers=req.headers, + origin_req_host=req.get_origin_req_host(), + unverifiable=True, + visit=False, + timeout=req.timeout) + new._origin_req = getattr(req, "_origin_req", req) + return new + else: + raise HTTPError(req.get_full_url(), code, msg, headers, fp) + + def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. 
+ if 'location' in headers: + newurl = headers.getheaders('location')[0] + elif 'uri' in headers: + newurl = headers.getheaders('uri')[0] + else: + return + newurl = _rfc3986.clean_url(newurl, "latin-1") + newurl = _rfc3986.urljoin(req.get_full_url(), newurl) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + new = self.redirect_request(req, fp, code, msg, headers, newurl) + if new is None: + return + + # loop detection + # .redirect_dict has a key url if url was previously visited. + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): + raise HTTPError(req.get_full_url(), code, + self.inf_msg + msg, headers, fp) + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[newurl] = visited.get(newurl, 0) + 1 + + # Don't close the fp until we are sure that we won't use it + # with HTTPError. + fp.read() + fp.close() + + return self.parent.open(new) + + http_error_301 = http_error_303 = http_error_307 = http_error_302 + http_error_refresh = http_error_302 + + inf_msg = "The HTTP server returned a redirect error that would " \ + "lead to an infinite loop.\n" \ + "The last 30x error message was:\n" + + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. 
+ + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 
+ # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + +class ProxyHandler(BaseHandler): + # Proxies must be in front + handler_order = 100 + + def __init__(self, proxies=None, proxy_bypass=None): + if proxies is None: + proxies = getproxies() + + assert hasattr(proxies, 'has_key'), "proxies must be a mapping" + self.proxies = proxies + for type, url in proxies.items(): + setattr(self, '%s_open' % type, + lambda r, proxy=url, type=type, meth=self.proxy_open: \ + meth(r, proxy, type)) + if proxy_bypass is None: + proxy_bypass = urllib.proxy_bypass + self._proxy_bypass = proxy_bypass + + def proxy_open(self, req, proxy, type): + orig_type = req.get_type() + proxy_type, user, password, hostport = _parse_proxy(proxy) + + if proxy_type is None: + proxy_type = orig_type + + if req.get_host() and self._proxy_bypass(req.get_host()): + return None + + if user and password: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.b64encode(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type or orig_type == 'https': + # let other handlers take care of it + return None + else: + # need to start over, because the other handlers don't + # grok the proxy's URL type + # e.g. 
if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req) + + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, basestring): + uri = [uri] + if not realm in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.iteritems(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urlparse.urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. 
+ """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class AbstractBasicAuthHandler: + + # XXX this allows for multiple auth-schemes, but will stupidly pick + # the last one with a realm specified. + + # allow for double- and single-quoted realm values + # (single quotes are a violation of the RFC, but appear in the wild) + rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' + 'realm=(["\'])(.*?)\\2', re.I) + + # XXX could pre-emptively send auth info already accepted (RFC 2617, + # end of section 2, and section 1.2 immediately after "credentials" + # production). 
+ + def __init__(self, password_mgr=None): + if password_mgr is None: + password_mgr = HTTPPasswordMgr() + self.passwd = password_mgr + self.add_password = self.passwd.add_password + + def http_error_auth_reqed(self, authreq, host, req, headers): + # host may be an authority (without userinfo) or a URL with an + # authority + # XXX could be multiple headers + authreq = headers.get(authreq, None) + if authreq: + mo = AbstractBasicAuthHandler.rx.search(authreq) + if mo: + scheme, quote, realm = mo.groups() + if scheme.lower() == 'basic': + return self.retry_http_basic_auth(host, req, realm) + + def retry_http_basic_auth(self, host, req, realm): + user, pw = self.passwd.find_user_password(realm, host) + if pw is not None: + raw = "%s:%s" % (user, pw) + auth = 'Basic %s' % base64.b64encode(raw).strip() + if req.headers.get(self.auth_header, None) == auth: + return None + newreq = copy.copy(req) + newreq.add_header(self.auth_header, auth) + newreq.visit = False + return self.parent.open(newreq) + else: + return None + + +class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Authorization' + + def http_error_401(self, req, fp, code, msg, headers): + url = req.get_full_url() + return self.http_error_auth_reqed('www-authenticate', + url, req, headers) + + +class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Proxy-authorization' + + def http_error_407(self, req, fp, code, msg, headers): + # http_error_auth_reqed requires that there is no userinfo component in + # authority. Assume there isn't one, since urllib2 does not (and + # should not, RFC 3986 s. 3.2.1) support requests for URLs containing + # userinfo. + authority = req.get_host() + return self.http_error_auth_reqed('proxy-authenticate', + authority, req, headers) + + +def randombytes(n): + """Return n random bytes.""" + # Use /dev/urandom if it is available. Fall back to random module + # if not. 
It might be worthwhile to extend this function to use + # other platform-specific mechanisms for getting random bytes. + if os.path.exists("/dev/urandom"): + f = open("/dev/urandom") + s = f.read(n) + f.close() + return s + else: + L = [chr(random.randrange(0, 256)) for i in range(n)] + return "".join(L) + +class AbstractDigestAuthHandler: + # Digest authentication is specified in RFC 2617. + + # XXX The client does not inspect the Authentication-Info header + # in a successful response. + + # XXX It should be possible to test this implementation against + # a mock server that just generates a static set of challenges. + + # XXX qop="auth-int" supports is shaky + + def __init__(self, passwd=None): + if passwd is None: + passwd = HTTPPasswordMgr() + self.passwd = passwd + self.add_password = self.passwd.add_password + self.retried = 0 + self.nonce_count = 0 + self.last_nonce = None + + def reset_retry_count(self): + self.retried = 0 + + def http_error_auth_reqed(self, auth_header, host, req, headers): + authreq = headers.get(auth_header, None) + if self.retried > 5: + # Don't fail endlessly - if we failed once, we'll probably + # fail a second time. Hm. Unless the Password Manager is + # prompting for the information. Crap. 
This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach <wink> + raise HTTPError(req.get_full_url(), 401, "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(parse_http_list(challenge)) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + newreq = copy.copy(req) + newreq.add_unredirected_header(self.auth_header, auth_val) + newreq.visit = False + return self.parent.open(newreq) + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. 
+ dig = sha1_digest("%s:%s:%s:%s" % (self.nonce_count, nonce, + time.ctime(), randombytes(8))) + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + if user is None: + return None + + # XXX not implemented yet + if req.has_data(): + entdig = self.get_entity_digest(req.get_data(), chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.get_selector()) + if qop == 'auth': + if nonce == self.last_nonce: + self.nonce_count += 1 + else: + self.nonce_count = 1 + self.last_nonce = nonce + + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + logger = logging.getLogger("mechanize.auth") + logger.info("digest auth auth-int qop is not supported, not " + "handling digest authentication") + return None + + # XXX should the partial digests be encoded too? 
+ + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.get_selector(), + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # algorithm should be case-insensitive according to RFC2617 + algorithm = algorithm.upper() + if algorithm == 'MD5': + H = md5_digest + elif algorithm == 'SHA': + H = sha1_digest + # XXX MD5-sess + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. + """ + + auth_header = 'Authorization' + handler_order = 490 # before Basic auth + + def http_error_401(self, req, fp, code, msg, headers): + host = urlparse.urlparse(req.get_full_url())[1] + retry = self.http_error_auth_reqed('www-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + + auth_header = 'Proxy-Authorization' + handler_order = 490 # before Basic auth + + def http_error_407(self, req, fp, code, msg, headers): + host = req.get_host() + retry = self.http_error_auth_reqed('proxy-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request_(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = 
request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + sel_host = host + if request.has_proxy(): + scheme, sel = splittype(request.get_selector()) + sel_host, sel_path = splithost(sel) + + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req): + """Return an addinfourl object for the request, using http_class. + + http_class must implement the HTTPConnection API from httplib. + The addinfourl return value is a file-like object. It also + has methods and attributes including: + - info(): return a mimetools.Message object for the headers + - geturl(): return the original request URL + - code: HTTP status code + """ + host_port = req.get_host() + if not host_port: + raise URLError('no host given') + + try: + h = http_class(host_port, timeout=req.timeout) + except TypeError: + # Python < 2.6, no per-connection timeout support + h = http_class(host_port) + h.set_debuglevel(self._debuglevel) + + headers = dict(req.headers) + headers.update(req.unredirected_hdrs) + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. 
+ headers["Connection"] = "close" + headers = dict( + (name.title(), val) for name, val in headers.items()) + + if req._tunnel_host: + if not hasattr(h, "set_tunnel"): + if not hasattr(h, "_set_tunnel"): + raise URLError("HTTPS through proxy not supported " + "(Python >= 2.6.4 required)") + else: + # python 2.6 + set_tunnel = h._set_tunnel + else: + set_tunnel = h.set_tunnel + set_tunnel(req._tunnel_host) + + try: + h.request(req.get_method(), req.get_selector(), req.data, headers) + r = h.getresponse() + except socket.error, err: # XXX what error? + raise URLError(err) + + # Pick apart the HTTPResponse object to get the addinfourl + # object initialized properly. + + # Wrap the HTTPResponse object in socket's file object adapter + # for Windows. That adapter calls recv(), so delegate recv() + # to read(). This weird wrapping allows the returned object to + # have readline() and readlines() methods. + + # XXX It might be better to extract the read buffering code + # out of socket._fileobject() and into a base class. 
+ + r.recv = r.read + fp = create_readline_wrapper(r) + + resp = closeable_response(fp, r.msg, req.get_full_url(), + r.status, r.reason) + return resp + + +class HTTPHandler(AbstractHTTPHandler): + + def http_open(self, req): + return self.do_open(httplib.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(httplib, 'HTTPS'): + + class HTTPSConnectionFactory: + def __init__(self, key_file, cert_file): + self._key_file = key_file + self._cert_file = cert_file + def __call__(self, hostport): + return httplib.HTTPSConnection( + hostport, + key_file=self._key_file, cert_file=self._cert_file) + + class HTTPSHandler(AbstractHTTPHandler): + + def __init__(self, client_cert_manager=None): + AbstractHTTPHandler.__init__(self) + self.client_cert_manager = client_cert_manager + + def https_open(self, req): + if self.client_cert_manager is not None: + key_file, cert_file = self.client_cert_manager.find_key_cert( + req.get_full_url()) + conn_factory = HTTPSConnectionFactory(key_file, cert_file) + else: + conn_factory = httplib.HTTPSConnection + return self.do_open(conn_factory, req) + + https_request = AbstractHTTPHandler.do_request_ + +class HTTPCookieProcessor(BaseHandler): + """Handle HTTP cookies. 
+ + Public attributes: + + cookiejar: CookieJar instance + + """ + def __init__(self, cookiejar=None): + if cookiejar is None: + cookiejar = CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + +class UnknownHandler(BaseHandler): + def unknown_open(self, req): + type = req.get_type() + raise URLError('unknown url type: %s' % type) + +def parse_keqv_list(l): + """Parse list of key=value strings where keys are not duplicated.""" + parsed = {} + for elt in l: + k, v = elt.split('=', 1) + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + parsed[k] = v + return parsed + +def parse_http_list(s): + """Parse lists as described by RFC 2068 Section 2. + + In particular, parse comma-separated lists where the elements of + the list may include quoted-strings. A quoted-string could + contain a comma. A non-quoted string could have quotes in the + middle. Neither commas nor quotes count if they are escaped. + Only double-quotes count, not single-quotes. 
+ """ + res = [] + part = '' + + escape = quote = False + for cur in s: + if escape: + part += cur + escape = False + continue + if quote: + if cur == '\\': + escape = True + continue + elif cur == '"': + quote = False + part += cur + continue + + if cur == ',': + res.append(part) + part = '' + continue + + if cur == '"': + quote = True + + part += cur + + # append last part + if part: + res.append(part) + + return [part.strip() for part in res] + +class FileHandler(BaseHandler): + # Use local file or FTP depending on form of URL + def file_open(self, req): + url = req.get_selector() + if url[:2] == '//' and url[2:3] != '/': + req.type = 'ftp' + return self.parent.open(req) + else: + return self.open_local_file(req) + + # names for the localhost + names = None + def get_names(self): + if FileHandler.names is None: + try: + FileHandler.names = (socket.gethostbyname('localhost'), + socket.gethostbyname(socket.gethostname())) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) + return FileHandler.names + + # not entirely sure what the rules are here + def open_local_file(self, req): + try: + import email.utils as emailutils + except ImportError: + # python 2.4 + import email.Utils as emailutils + import mimetypes + host = req.get_host() + file = req.get_selector() + localfile = url2pathname(file) + try: + stats = os.stat(localfile) + size = stats.st_size + modified = emailutils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(file)[0] + headers = mimetools.Message(StringIO( + 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + if host: + host, port = splitport(host) + if not host or \ + (not port and socket.gethostbyname(host) in self.get_names()): + return addinfourl(open(localfile, 'rb'), + headers, 'file:'+file) + except OSError, msg: + # urllib2 users shouldn't expect OSErrors coming from urlopen() + raise URLError(msg) + raise URLError('file not on 
local host') + +class FTPHandler(BaseHandler): + def ftp_open(self, req): + import ftplib + import mimetypes + host = req.get_host() + if not host: + raise URLError('ftp error: no host given') + host, port = splitport(host) + if port is None: + port = ftplib.FTP_PORT + else: + port = int(port) + + # username/password handling + user, host = splituser(host) + if user: + user, passwd = splitpasswd(user) + else: + passwd = None + host = unquote(host) + user = unquote(user or '') + passwd = unquote(passwd or '') + + try: + host = socket.gethostbyname(host) + except socket.error, msg: + raise URLError(msg) + path, attrs = splitattr(req.get_selector()) + dirs = path.split('/') + dirs = map(unquote, dirs) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + fp, retrlen = fw.retrfile(file, type) + headers = "" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: + headers += "Content-type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-length: %d\n" % retrlen + sf = StringIO(headers) + headers = mimetools.Message(sf) + return addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors, msg: + raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2] + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + try: + fw = ftpwrapper(user, passwd, host, port, dirs, timeout) + except TypeError: + # Python < 2.6, no per-connection timeout support + fw = ftpwrapper(user, passwd, host, port, dirs) +## fw.ftp.set_debuglevel(1) + return fw + +class CacheFTPHandler(FTPHandler): + # XXX would be nice to have pluggable cache strategies + # XXX this stuff is definitely not thread safe + def __init__(self): + self.cache = {} + 
self.timeout = {} + self.soonest = 0 + self.delay = 60 + self.max_conns = 16 + + def setTimeout(self, t): + self.delay = t + + def setMaxConns(self, m): + self.max_conns = m + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + key = user, host, port, '/'.join(dirs), timeout + if key in self.cache: + self.timeout[key] = time.time() + self.delay + else: + self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) + self.timeout[key] = time.time() + self.delay + self.check_cache() + return self.cache[key] + + def check_cache(self): + # first check for old ones + t = time.time() + if self.soonest <= t: + for k, v in self.timeout.items(): + if v < t: + self.cache[k].close() + del self.cache[k] + del self.timeout[k] + self.soonest = min(self.timeout.values()) + + # then check the size + if len(self.cache) == self.max_conns: + for k, v in self.timeout.items(): + if v == self.soonest: + del self.cache[k] + del self.timeout[k] + break + self.soonest = min(self.timeout.values()) diff --git a/LTA/LTAIngest/mechanize/_useragent.py b/LTA/LTAIngest/mechanize/_useragent.py new file mode 100644 index 0000000000000000000000000000000000000000..ac28bdd7bbfda39efa1e882f4086d3eabef6f6b4 --- /dev/null +++ b/LTA/LTAIngest/mechanize/_useragent.py @@ -0,0 +1,367 @@ +"""Convenient HTTP UserAgent class. + +This is a subclass of urllib2.OpenerDirector. + + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import warnings + +import _auth +import _gzip +import _opener +import _response +import _sockettimeout +import _urllib2 + + +class UserAgentBase(_opener.OpenerDirector): + """Convenient user-agent class. + + Do not use .add_handler() to add a handler for something already dealt with + by this code. 
+ + The only reason at present for the distinction between UserAgent and + UserAgentBase is so that classes that depend on .seek()able responses + (e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass + UserAgent exposes a .set_seekable_responses() method that allows switching + off the adding of a .seek() method to responses. + + Public attributes: + + addheaders: list of (name, value) pairs specifying headers to send with + every request, unless they are overridden in the Request instance. + + >>> ua = UserAgentBase() + >>> ua.addheaders = [ + ... ("User-agent", "Mozilla/5.0 (compatible)"), + ... ("From", "responsible.person@example.com")] + + """ + + handler_classes = { + # scheme handlers + "http": _urllib2.HTTPHandler, + # CacheFTPHandler is buggy, at least in 2.3, so we don't use it + "ftp": _urllib2.FTPHandler, + "file": _urllib2.FileHandler, + + # other handlers + "_unknown": _urllib2.UnknownHandler, + # HTTP{S,}Handler depend on HTTPErrorProcessor too + "_http_error": _urllib2.HTTPErrorProcessor, + "_http_default_error": _urllib2.HTTPDefaultErrorHandler, + + # feature handlers + "_basicauth": _urllib2.HTTPBasicAuthHandler, + "_digestauth": _urllib2.HTTPDigestAuthHandler, + "_redirect": _urllib2.HTTPRedirectHandler, + "_cookies": _urllib2.HTTPCookieProcessor, + "_refresh": _urllib2.HTTPRefreshProcessor, + "_equiv": _urllib2.HTTPEquivProcessor, + "_proxy": _urllib2.ProxyHandler, + "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler, + "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler, + "_robots": _urllib2.HTTPRobotRulesProcessor, + "_gzip": _gzip.HTTPGzipProcessor, # experimental! 
+ + # debug handlers + "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor, + "_debug_response_body": _urllib2.HTTPResponseDebugProcessor, + } + + default_schemes = ["http", "ftp", "file"] + default_others = ["_unknown", "_http_error", "_http_default_error"] + default_features = ["_redirect", "_cookies", + "_refresh", "_equiv", + "_basicauth", "_digestauth", + "_proxy", "_proxy_basicauth", "_proxy_digestauth", + "_robots", + ] + if hasattr(_urllib2, 'HTTPSHandler'): + handler_classes["https"] = _urllib2.HTTPSHandler + default_schemes.append("https") + + def __init__(self): + _opener.OpenerDirector.__init__(self) + + ua_handlers = self._ua_handlers = {} + for scheme in (self.default_schemes+ + self.default_others+ + self.default_features): + klass = self.handler_classes[scheme] + ua_handlers[scheme] = klass() + for handler in ua_handlers.itervalues(): + self.add_handler(handler) + + # Yuck. + # Ensure correct default constructor args were passed to + # HTTPRefreshProcessor and HTTPEquivProcessor. + if "_refresh" in ua_handlers: + self.set_handle_refresh(True) + if "_equiv" in ua_handlers: + self.set_handle_equiv(True) + # Ensure default password managers are installed. 
+ pm = ppm = None + if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers: + pm = _urllib2.HTTPPasswordMgrWithDefaultRealm() + if ("_proxy_basicauth" in ua_handlers or + "_proxy_digestauth" in ua_handlers): + ppm = _auth.HTTPProxyPasswordMgr() + self.set_password_manager(pm) + self.set_proxy_password_manager(ppm) + # set default certificate manager + if "https" in ua_handlers: + cm = _urllib2.HTTPSClientCertMgr() + self.set_client_cert_manager(cm) + + def close(self): + _opener.OpenerDirector.close(self) + self._ua_handlers = None + + # XXX +## def set_timeout(self, timeout): +## self._timeout = timeout +## def set_http_connection_cache(self, conn_cache): +## self._http_conn_cache = conn_cache +## def set_ftp_connection_cache(self, conn_cache): +## # XXX ATM, FTP has cache as part of handler; should it be separate? +## self._ftp_conn_cache = conn_cache + + def set_handled_schemes(self, schemes): + """Set sequence of URL scheme (protocol) strings. + + For example: ua.set_handled_schemes(["http", "ftp"]) + + If this fails (with ValueError) because you've passed an unknown + scheme, the set of handled schemes will not be changed. + + """ + want = {} + for scheme in schemes: + if scheme.startswith("_"): + raise ValueError("not a scheme '%s'" % scheme) + if scheme not in self.handler_classes: + raise ValueError("unknown scheme '%s'") + want[scheme] = None + + # get rid of scheme handlers we don't want + for scheme, oldhandler in self._ua_handlers.items(): + if scheme.startswith("_"): continue # not a scheme handler + if scheme not in want: + self._replace_handler(scheme, None) + else: + del want[scheme] # already got it + # add the scheme handlers that are missing + for scheme in want.keys(): + self._set_handler(scheme, True) + + def set_cookiejar(self, cookiejar): + """Set a mechanize.CookieJar, or None.""" + self._set_handler("_cookies", obj=cookiejar) + + # XXX could use Greg Stein's httpx for some of this instead? + # or httplib2?? 
+ def set_proxies(self, proxies=None, proxy_bypass=None): + """Configure proxy settings. + + proxies: dictionary mapping URL scheme to proxy specification. None + means use the default system-specific settings. + proxy_bypass: function taking hostname, returning whether proxy should + be used. None means use the default system-specific settings. + + The default is to try to obtain proxy settings from the system (see the + documentation for urllib.urlopen for information about the + system-specific methods used -- note that's urllib, not urllib2). + + To avoid all use of proxies, pass an empty proxies dict. + + >>> ua = UserAgentBase() + >>> def proxy_bypass(hostname): + ... return hostname == "noproxy.com" + >>> ua.set_proxies( + ... {"http": "joe:password@myproxy.example.com:3128", + ... "ftp": "proxy.example.com"}, + ... proxy_bypass) + + """ + self._set_handler("_proxy", True, + constructor_kwds=dict(proxies=proxies, + proxy_bypass=proxy_bypass)) + + def add_password(self, url, user, password, realm=None): + self._password_manager.add_password(realm, url, user, password) + def add_proxy_password(self, user, password, hostport=None, realm=None): + self._proxy_password_manager.add_password( + realm, hostport, user, password) + + def add_client_certificate(self, url, key_file, cert_file): + """Add an SSL client certificate, for HTTPS client auth. + + key_file and cert_file must be filenames of the key and certificate + files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS + 12) file to PEM format: + + openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem + openssl pkcs12 -nocerts -in cert.p12 -out key.pem + + + Note that client certificate password input is very inflexible ATM. At + the moment this seems to be console only, which is presumably the + default behaviour of libopenssl. In future mechanize may support + third-party libraries that (I assume) allow more options here. 
+ + """ + self._client_cert_manager.add_key_cert(url, key_file, cert_file) + + # the following are rarely useful -- use add_password / add_proxy_password + # instead + def set_password_manager(self, password_manager): + """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None.""" + self._password_manager = password_manager + self._set_handler("_basicauth", obj=password_manager) + self._set_handler("_digestauth", obj=password_manager) + def set_proxy_password_manager(self, password_manager): + """Set a mechanize.HTTPProxyPasswordMgr, or None.""" + self._proxy_password_manager = password_manager + self._set_handler("_proxy_basicauth", obj=password_manager) + self._set_handler("_proxy_digestauth", obj=password_manager) + def set_client_cert_manager(self, cert_manager): + """Set a mechanize.HTTPClientCertMgr, or None.""" + self._client_cert_manager = cert_manager + handler = self._ua_handlers["https"] + handler.client_cert_manager = cert_manager + + # these methods all take a boolean parameter + def set_handle_robots(self, handle): + """Set whether to observe rules from robots.txt.""" + self._set_handler("_robots", handle) + def set_handle_redirect(self, handle): + """Set whether to handle HTTP 30x redirections.""" + self._set_handler("_redirect", handle) + def set_handle_refresh(self, handle, max_time=None, honor_time=True): + """Set whether to handle HTTP Refresh headers.""" + self._set_handler("_refresh", handle, constructor_kwds= + {"max_time": max_time, "honor_time": honor_time}) + def set_handle_equiv(self, handle, head_parser_class=None): + """Set whether to treat HTML http-equiv headers like HTTP headers. + + Response objects may be .seek()able if this is set (currently returned + responses are, raised HTTPError exception responses are not). 
+ + """ + if head_parser_class is not None: + constructor_kwds = {"head_parser_class": head_parser_class} + else: + constructor_kwds={} + self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds) + def set_handle_gzip(self, handle): + """Handle gzip transfer encoding. + + """ + if handle: + warnings.warn( + "gzip transfer encoding is experimental!", stacklevel=2) + self._set_handler("_gzip", handle) + def set_debug_redirects(self, handle): + """Log information about HTTP redirects (including refreshes). + + Logging is performed using module logging. The logger name is + "mechanize.http_redirects". To actually print some debug output, + eg: + + import sys, logging + logger = logging.getLogger("mechanize.http_redirects") + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.setLevel(logging.INFO) + + Other logger names relevant to this module: + + "mechanize.http_responses" + "mechanize.cookies" + + To turn on everything: + + import sys, logging + logger = logging.getLogger("mechanize") + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.setLevel(logging.INFO) + + """ + self._set_handler("_debug_redirect", handle) + def set_debug_responses(self, handle): + """Log HTTP response bodies. + + See docstring for .set_debug_redirects() for details of logging. + + Response objects may be .seek()able if this is set (currently returned + responses are, raised HTTPError exception responses are not). 
+ + """ + self._set_handler("_debug_response_body", handle) + def set_debug_http(self, handle): + """Print HTTP headers to sys.stdout.""" + level = int(bool(handle)) + for scheme in "http", "https": + h = self._ua_handlers.get(scheme) + if h is not None: + h.set_http_debuglevel(level) + + def _set_handler(self, name, handle=None, obj=None, + constructor_args=(), constructor_kwds={}): + if handle is None: + handle = obj is not None + if handle: + handler_class = self.handler_classes[name] + if obj is not None: + newhandler = handler_class(obj) + else: + newhandler = handler_class( + *constructor_args, **constructor_kwds) + else: + newhandler = None + self._replace_handler(name, newhandler) + + def _replace_handler(self, name, newhandler=None): + # first, if handler was previously added, remove it + if name is not None: + handler = self._ua_handlers.get(name) + if handler: + try: + self.handlers.remove(handler) + except ValueError: + pass + # then add the replacement, if any + if newhandler is not None: + self.add_handler(newhandler) + self._ua_handlers[name] = newhandler + + +class UserAgent(UserAgentBase): + + def __init__(self): + UserAgentBase.__init__(self) + self._seekable = False + + def set_seekable_responses(self, handle): + """Make response objects .seek()able.""" + self._seekable = bool(handle) + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + if self._seekable: + def bound_open(fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return UserAgentBase.open(self, fullurl, data, timeout) + response = _opener.wrapped_open( + bound_open, _response.seek_wrapped_response, fullurl, data, + timeout) + else: + response = UserAgentBase.open(self, fullurl, data) + return response diff --git a/LTA/LTAIngest/mechanize/_util.py b/LTA/LTAIngest/mechanize/_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0a5ebb1f31f75f3c2b5f572b555198b9fe0c7e69 --- /dev/null +++ 
b/LTA/LTAIngest/mechanize/_util.py @@ -0,0 +1,305 @@ +"""Utility functions and date/time routines. + + Copyright 2002-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). +""" + +import re +import time +import warnings + + +class ExperimentalWarning(UserWarning): + pass + +def experimental(message): + warnings.warn(message, ExperimentalWarning, stacklevel=3) +def hide_experimental_warnings(): + warnings.filterwarnings("ignore", category=ExperimentalWarning) +def reset_experimental_warnings(): + warnings.filterwarnings("default", category=ExperimentalWarning) + +def deprecation(message): + warnings.warn(message, DeprecationWarning, stacklevel=3) +def hide_deprecations(): + warnings.filterwarnings("ignore", category=DeprecationWarning) +def reset_deprecations(): + warnings.filterwarnings("default", category=DeprecationWarning) + + +def write_file(filename, data): + f = open(filename, "wb") + try: + f.write(data) + finally: + f.close() + + +def get1(sequence): + assert len(sequence) == 1 + return sequence[0] + + +def isstringlike(x): + try: x+"" + except: return False + else: return True + +## def caller(): +## try: +## raise SyntaxError +## except: +## import sys +## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name + + +from calendar import timegm + +# Date/time conversion routines for formats used by the HTTP protocol. 
+ +EPOCH = 1970 +def my_timegm(tt): + year, month, mday, hour, min, sec = tt[:6] + if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and + (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): + return timegm(tt) + else: + return None + +days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] +months_lower = [] +for month in months: months_lower.append(month.lower()) + + +def time2isoz(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", + representing Universal Time (UTC, aka GMT). An example of this format is: + + 1994-11-24 08:49:37Z + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec = time.gmtime(t)[:6] + return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( + year, mon, mday, hour, min, sec) + +def time2netscape(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. 
+ + The format of the returned string is like this: + + Wed, DD-Mon-YYYY HH:MM:SS GMT + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] + return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( + days[wday], mday, months[mon-1], year, hour, min, sec) + + +UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} + +timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") +def offset_from_tz_string(tz): + offset = None + if UTC_ZONES.has_key(tz): + offset = 0 + else: + m = timezone_re.search(tz) + if m: + offset = 3600 * int(m.group(2)) + if m.group(3): + offset = offset + 60 * int(m.group(3)) + if m.group(1) == '-': + offset = -offset + return offset + +def _str2time(day, mon, yr, hr, min, sec, tz): + # translate month name to number + # month numbers start with 1 (January) + try: + mon = months_lower.index(mon.lower())+1 + except ValueError: + # maybe it's already a number + try: + imon = int(mon) + except ValueError: + return None + if 1 <= imon <= 12: + mon = imon + else: + return None + + # make sure clock elements are defined + if hr is None: hr = 0 + if min is None: min = 0 + if sec is None: sec = 0 + + yr = int(yr) + day = int(day) + hr = int(hr) + min = int(min) + sec = int(sec) + + if yr < 1000: + # find "obvious" year + cur_yr = time.localtime(time.time())[0] + m = cur_yr % 100 + tmp = yr + yr = yr + cur_yr - m + m = m - tmp + if abs(m) > 50: + if m > 0: yr = yr + 100 + else: yr = yr - 100 + + # convert UTC time tuple to seconds since epoch (not timezone-adjusted) + t = my_timegm((yr, mon, day, hr, min, sec, tz)) + + if t is not None: + # adjust time using timezone string, to get absolute time since epoch + if tz is None: + tz = "UTC" + tz = tz.upper() + offset = offset_from_tz_string(tz) + if offset is None: + return None + t = t - offset + + return t + + +strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " + r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") +wkday_re = re.compile( + 
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) +loose_http_re = re.compile( + r"""^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$""", re.X) +def http2time(text): + """Returns time in seconds since epoch of time represented by a string. + + Return value is an integer. + + None is returned if the format of str is unrecognized, the time is outside + the representable range, or the timezone string is not recognized. If the + string contains no timezone, UTC is assumed. + + The timezone in the string may be numerical (like "-0800" or "+0100") or a + string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the + timezone strings equivalent to UTC (zero offset) are known to the function. + + The function loosely parses the following formats: + + Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format + Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format + Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format + 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) + 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) + 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) + + The parser ignores leading and trailing whitespace. The time may be + absent. + + If the year is given with only 2 digits, the function will select the + century that makes the year closest to the current date. + + """ + # fast exit for strictly conforming string + m = strict_re.search(text) + if m: + g = m.groups() + mon = months_lower.index(g[1].lower()) + 1 + tt = (int(g[2]), mon, int(g[0]), + int(g[3]), int(g[4]), float(g[5])) + return my_timegm(tt) + + # No, we need some messy parsing... 
+ + # clean up + text = text.lstrip() + text = wkday_re.sub("", text, 1) # Useless weekday + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = loose_http_re.search(text) + if m is not None: + day, mon, yr, hr, min, sec, tz = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + +iso_re = re.compile( + """^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$""", re.X) +def iso2time(text): + """ + As for http2time, but parses the ISO 8601 formats: + + 1994-02-03 14:15:29 -0100 -- ISO 8601 format + 1994-02-03 14:15:29 -- zone is optional + 1994-02-03 -- only date + 1994-02-03T14:15:29 -- Use T as separator + 19940203T141529Z -- ISO 8601 compact format + 19940203 -- only date + + """ + # clean up + text = text.lstrip() + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = iso_re.search(text) + if m is not None: + # XXX there's an extra bit of the timezone I'm ignoring here: is + # this the right thing to do? 
+ yr, mon, day, hr, min, sec, tz, _ = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) diff --git a/LTA/LTAIngest/mechanize/_version.py b/LTA/LTAIngest/mechanize/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5b07b714725721454ec14497adb7134496d06f --- /dev/null +++ b/LTA/LTAIngest/mechanize/_version.py @@ -0,0 +1,2 @@ +"0.2.5" +__version__ = (0, 2, 5, None, None) diff --git a/LTA/LTAIngest/mom_http.py b/LTA/LTAIngest/mom_http.py new file mode 100755 index 0000000000000000000000000000000000000000..d65730eb0dda184ebaa1ec26bf86bb257ec25fdc --- /dev/null +++ b/LTA/LTAIngest/mom_http.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +import urllib, urllib2, cookielib, os.path, ClientForm, socket + +class client: + """This is an HTTP client that knows how to use the Single Sign On of Mom2. + It is used instead of a SOAP client, because SOAPpy doesn't support + form handling and cookies.""" + def __init__(self, loginUrl, url, logoutUrl): + self._loginurl = loginUrl + self._url = url + self._logouturl = logoutUrl + self._headers = {'User-agent' : 'Mozilla/4.0 (compatible; http_login.py)'} + ## The following is a global setting! + socket.setdefaulttimeout(3600) + + def _login(self): + try: + cookiejar = cookielib.CookieJar() ## Not using any of the more specialised versions as we do not want to save them. 
+ hch = urllib2.HTTPCookieProcessor(cookiejar) + hrh = urllib2.HTTPRedirectHandler() ## we need a redirect handler to handle the funcky stuff MoM does + ## this code is for debugging + ## hh = urllib2.HTTPHandler() + ## hhs = urllib2.HTTPSHandler() + ## hh.set_http_debuglevel(1) + ## hhs.set_http_debuglevel(1) + self.opener = urllib2.build_opener(hrh, hch) + ##self.opener = urllib2.build_opener(hh, hhs, hrh, hch) + + request = urllib2.Request(self._loginurl, None, self._headers) + response = self.opener.open(request) ## get's us a JSESSIONID in a cookie, uses a redirect + + forms = ClientForm.ParseResponse(response) + if len(forms) == 0: + raise Exception('Unable to parse MoM login form or form not available') + form = forms[0] + form['j_username'] = self.username + form['j_password'] = self.password + + request = form.click() + response = self.opener.open(request) ## get's us a JSESSIONIDSSO in a cookie, uses a redirect + except Exception, e: + raise Exception("Logging into MoM failed: " + str(e)) + + def _setStatus(self, exportID, status): + try: + request = urllib2.Request(self._url + '?exportId=' + str(exportID) + '&status=' + str(status), None, self._headers) + response = self.opener.open(request) ## We tell what we want + reply = response.readlines() + if reply == ['ok']: + result = (0, 'http_login updated ' + str(exportID) + ' to ' + str(status)) + else: + result = (1, 'http_login for ' + str(exportID) + ' failed on: ' + str(reply)) + except Exception, e: + return Exception(2, 'http_login failed with exception: ' + str(e)) + else: + return result + + def _getSIP(self, Type, MomId, StorageTicket, FileName, URI, FileSize, MD5Checksum, Adler32Checksum): + try: + xmlcontent = """<?xml version="1.0" encoding="UTF-8"?> + <lofar:%s mom2DPId="%s" xmlns:lofar="http://www.astron.nl/MoM2-Lofar"> + <locations> + <location> + <uri>lta://%s/%s/%s</uri> + </location> + </locations> + <storageTicket>%s</storageTicket> + <fileSize>%s</fileSize> + <checksums> + <checksum> + 
<algorithm>MD5</algorithm> + <value>%s</value> + </checksum> + <checksum> + <algorithm>Adler32</algorithm> + <value>%s</value> + </checksum> + </checksums> + </lofar:%s>""" % (Type, MomId, StorageTicket, FileName, URI, StorageTicket, FileSize, MD5Checksum, Adler32Checksum, Type) + + data = urllib.urlencode({"command" : "get-sip-with-input", "xmlcontent" : xmlcontent}) + # Now get that file-like object again, remembering to mention the data. + request = urllib2.Request(self._url, data, self._headers) + response = self.opener.open(request) + result = response.read() + response.close() + return result + except Exception, e: + raise Exception("getting SIP from MoM failed: " + str(e)) + return '' + + def _logout(self): + try: + request = urllib2.Request(self._logouturl, None, self._headers) + response = self.opener.open(request) ## we get out again + except Exception, e: + raise Exception("Logging out of MoM failed: " + str(e)) + + def setStatus(self, exportID, status): + self._login() + result = self._setStatus(exportID, status) + self._logout() + return result + + def getSIP(self, MomId, StorageTicket, FileName, URI, FileSize, MD5Checksum, Adler32Checksum): + self._login() + #result = self._getSIP("uvDataProduct", MomId, StorageTicket, FileName, URI, FileSize, MD5Checksum, Adler32Checksum) + result = self._getSIP("DataProduct", MomId, StorageTicket, FileName, URI, FileSize, MD5Checksum, Adler32Checksum) + self._logout() + return result + diff --git a/LTA/LTAIngest/multiprocessing/__init__.py b/LTA/LTAIngest/multiprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4fb0bd0bf5c270bd9e2910d997218ca59f2d0e67 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/__init__.py @@ -0,0 +1,272 @@ +# +# Package analogous to 'threading.py' but using processes +# +# multiprocessing/__init__.py +# +# This package is intended to duplicate the functionality (and much of +# the API) of threading.py but uses processes instead of threads. 
A +# subpackage 'multiprocessing.dummy' has the same API but is a simple +# wrapper for 'threading'. +# +# Try calling `multiprocessing.doc.main()` to read the html +# documentation in in a webbrowser. +# +# +# Copyright (c) 2006-2008, R Oudkerk +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of author nor the names of any contributors may be +# used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# + +__version__ = '0.70a1' + +__all__ = [ + 'Process', 'current_process', 'active_children', 'freeze_support', + 'Manager', 'Pipe', 'cpu_count', 'log_to_stderr', 'get_logger', + 'allow_connection_pickling', 'BufferTooShort', 'TimeoutError', + 'Lock', 'RLock', 'Semaphore', 'BoundedSemaphore', 'Condition', + 'Event', 'Queue', 'JoinableQueue', 'Pool', 'Value', 'Array', + 'RawValue', 'RawArray', 'SUBDEBUG', 'SUBWARNING', + ] + +__author__ = 'R. Oudkerk (r.m.oudkerk@gmail.com)' + +# +# Imports +# + +import os +import sys + +from multiprocessing.process import Process, current_process, active_children +from multiprocessing.util import SUBDEBUG, SUBWARNING + +# +# Exceptions +# + +class ProcessError(Exception): + pass + +class BufferTooShort(ProcessError): + pass + +class TimeoutError(ProcessError): + pass + +class AuthenticationError(ProcessError): + pass + +# This is down here because _multiprocessing uses BufferTooShort +import _multiprocessing + +# +# Definitions not depending on native semaphores +# + +def Manager(): + ''' + Returns a manager associated with a running server process + + The managers methods such as `Lock()`, `Condition()` and `Queue()` + can be used to create shared objects. 
+ ''' + from multiprocessing.managers import SyncManager + m = SyncManager() + m.start() + return m + +def Pipe(duplex=True): + ''' + Returns two connection object connected by a pipe + ''' + from multiprocessing.connection import Pipe + return Pipe(duplex) + +def cpu_count(): + ''' + Returns the number of CPUs in the system + ''' + if sys.platform == 'win32': + try: + num = int(os.environ['NUMBER_OF_PROCESSORS']) + except (ValueError, KeyError): + num = 0 + elif 'bsd' in sys.platform or sys.platform == 'darwin': + try: + num = int(os.popen('sysctl -n hw.ncpu').read()) + except ValueError: + num = 0 + else: + try: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, OSError, AttributeError): + num = 0 + + if num >= 1: + return num + else: + raise NotImplementedError('cannot determine number of cpus') + +def freeze_support(): + ''' + Check whether this is a fake forked process in a frozen executable. + If so then run code specified by commandline and exit. + ''' + if sys.platform == 'win32' and getattr(sys, 'frozen', False): + from multiprocessing.forking import freeze_support + freeze_support() + +def get_logger(): + ''' + Return package logger -- if it does not already exist then it is created + ''' + from multiprocessing.util import get_logger + return get_logger() + +def log_to_stderr(level=None): + ''' + Turn on logging and add a handler which prints to stderr + ''' + from multiprocessing.util import log_to_stderr + return log_to_stderr(level) + +def allow_connection_pickling(): + ''' + Install support for sending connections and sockets between processes + ''' + from multiprocessing import reduction + +# +# Definitions depending on native semaphores +# + +def Lock(): + ''' + Returns a non-recursive lock object + ''' + from multiprocessing.synchronize import Lock + return Lock() + +def RLock(): + ''' + Returns a recursive lock object + ''' + from multiprocessing.synchronize import RLock + return RLock() + +def Condition(lock=None): + ''' + Returns a 
condition object + ''' + from multiprocessing.synchronize import Condition + return Condition(lock) + +def Semaphore(value=1): + ''' + Returns a semaphore object + ''' + from multiprocessing.synchronize import Semaphore + return Semaphore(value) + +def BoundedSemaphore(value=1): + ''' + Returns a bounded semaphore object + ''' + from multiprocessing.synchronize import BoundedSemaphore + return BoundedSemaphore(value) + +def Event(): + ''' + Returns an event object + ''' + from multiprocessing.synchronize import Event + return Event() + +def Queue(maxsize=0): + ''' + Returns a queue object + ''' + from multiprocessing.queues import Queue + return Queue(maxsize) + +def JoinableQueue(maxsize=0): + ''' + Returns a queue object + ''' + from multiprocessing.queues import JoinableQueue + return JoinableQueue(maxsize) + +def Pool(processes=None, initializer=None, initargs=()): + ''' + Returns a process pool object + ''' + from multiprocessing.pool import Pool + return Pool(processes, initializer, initargs) + +def RawValue(typecode_or_type, *args): + ''' + Returns a shared object + ''' + from multiprocessing.sharedctypes import RawValue + return RawValue(typecode_or_type, *args) + +def RawArray(typecode_or_type, size_or_initializer): + ''' + Returns a shared array + ''' + from multiprocessing.sharedctypes import RawArray + return RawArray(typecode_or_type, size_or_initializer) + +def Value(typecode_or_type, *args, **kwds): + ''' + Returns a synchronized shared object + ''' + from multiprocessing.sharedctypes import Value + return Value(typecode_or_type, *args, **kwds) + +def Array(typecode_or_type, size_or_initializer, **kwds): + ''' + Returns a synchronized shared array + ''' + from multiprocessing.sharedctypes import Array + return Array(typecode_or_type, size_or_initializer, **kwds) + +# +# +# + +if sys.platform == 'win32': + + def set_executable(executable): + ''' + Sets the path to a python.exe or pythonw.exe binary used to run + child processes on Windows instead of 
sys.executable. + Useful for people embedding Python. + ''' + from multiprocessing.forking import set_executable + set_executable(executable) + + __all__ += ['set_executable'] diff --git a/LTA/LTAIngest/multiprocessing/connection.py b/LTA/LTAIngest/multiprocessing/connection.py new file mode 100644 index 0000000000000000000000000000000000000000..fc42b160a300a142ebee68b9697e7e626add6291 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/connection.py @@ -0,0 +1,439 @@ +# +# A higher level module for using sockets (or Windows named pipes) +# +# multiprocessing/connection.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [ 'Client', 'Listener', 'Pipe' ] + +import os +import sys +import socket +import errno +import time +import tempfile +import itertools + +import _multiprocessing +from multiprocessing import current_process, AuthenticationError +from multiprocessing.util import get_temp_dir, Finalize, sub_debug, debug +from multiprocessing.forking import duplicate, close + + +# +# +# + +BUFSIZE = 8192 +# A very generous timeout when it comes to local connections... +CONNECTION_TIMEOUT = 20. 
+ +_mmap_counter = itertools.count() + +default_family = 'AF_INET' +families = ['AF_INET'] + +if hasattr(socket, 'AF_UNIX'): + default_family = 'AF_UNIX' + families += ['AF_UNIX'] + +if sys.platform == 'win32': + default_family = 'AF_PIPE' + families += ['AF_PIPE'] + + +def _init_timeout(timeout=CONNECTION_TIMEOUT): + return time.time() + timeout + +def _check_timeout(t): + return time.time() > t + +# +# +# + +def arbitrary_address(family): + ''' + Return an arbitrary free address for the given family + ''' + if family == 'AF_INET': + return ('localhost', 0) + elif family == 'AF_UNIX': + return tempfile.mktemp(prefix='listener-', dir=get_temp_dir()) + elif family == 'AF_PIPE': + return tempfile.mktemp(prefix=r'\\.\pipe\pyc-%d-%d-' % + (os.getpid(), _mmap_counter.next())) + else: + raise ValueError('unrecognized family') + + +def address_type(address): + ''' + Return the types of the address + + This can be 'AF_INET', 'AF_UNIX', or 'AF_PIPE' + ''' + if type(address) == tuple: + return 'AF_INET' + elif type(address) is str and address.startswith('\\\\'): + return 'AF_PIPE' + elif type(address) is str: + return 'AF_UNIX' + else: + raise ValueError('address type of %r unrecognized' % address) + +# +# Public functions +# + +class Listener(object): + ''' + Returns a listener object. + + This is a wrapper for a bound socket which is 'listening' for + connections, or for a Windows named pipe. 
+ ''' + def __init__(self, address=None, family=None, backlog=1, authkey=None): + family = family or (address and address_type(address)) \ + or default_family + address = address or arbitrary_address(family) + + if family == 'AF_PIPE': + self._listener = PipeListener(address, backlog) + else: + self._listener = SocketListener(address, family, backlog) + + if authkey is not None and not isinstance(authkey, bytes): + raise TypeError, 'authkey should be a byte string' + + self._authkey = authkey + + def accept(self): + ''' + Accept a connection on the bound socket or named pipe of `self`. + + Returns a `Connection` object. + ''' + c = self._listener.accept() + if self._authkey: + deliver_challenge(c, self._authkey) + answer_challenge(c, self._authkey) + return c + + def close(self): + ''' + Close the bound socket or named pipe of `self`. + ''' + return self._listener.close() + + address = property(lambda self: self._listener._address) + last_accepted = property(lambda self: self._listener._last_accepted) + + +def Client(address, family=None, authkey=None): + ''' + Returns a connection to the address of a `Listener` + ''' + family = family or address_type(address) + if family == 'AF_PIPE': + c = PipeClient(address) + else: + c = SocketClient(address) + + if authkey is not None and not isinstance(authkey, bytes): + raise TypeError, 'authkey should be a byte string' + + if authkey is not None: + answer_challenge(c, authkey) + deliver_challenge(c, authkey) + + return c + + +if sys.platform != 'win32': + + def Pipe(duplex=True): + ''' + Returns pair of connection objects at either end of a pipe + ''' + if duplex: + s1, s2 = socket.socketpair() + # _multiprocessing.Connection read/write semantics do not handle + # non-blocking sockets correctly (issue 6056). 
This bug-fix + # retains current behavior and allows for a default socket timeout + s1.settimeout(None) + s2.settimeout(None) + c1 = _multiprocessing.Connection(os.dup(s1.fileno())) + c2 = _multiprocessing.Connection(os.dup(s2.fileno())) + s1.close() + s2.close() + else: + fd1, fd2 = os.pipe() + c1 = _multiprocessing.Connection(fd1, writable=False) + c2 = _multiprocessing.Connection(fd2, readable=False) + + return c1, c2 + +else: + + from ._multiprocessing import win32 + + def Pipe(duplex=True): + ''' + Returns pair of connection objects at either end of a pipe + ''' + address = arbitrary_address('AF_PIPE') + if duplex: + openmode = win32.PIPE_ACCESS_DUPLEX + access = win32.GENERIC_READ | win32.GENERIC_WRITE + obsize, ibsize = BUFSIZE, BUFSIZE + else: + openmode = win32.PIPE_ACCESS_INBOUND + access = win32.GENERIC_WRITE + obsize, ibsize = 0, BUFSIZE + + h1 = win32.CreateNamedPipe( + address, openmode, + win32.PIPE_TYPE_MESSAGE | win32.PIPE_READMODE_MESSAGE | + win32.PIPE_WAIT, + 1, obsize, ibsize, win32.NMPWAIT_WAIT_FOREVER, win32.NULL + ) + h2 = win32.CreateFile( + address, access, 0, win32.NULL, win32.OPEN_EXISTING, 0, win32.NULL + ) + win32.SetNamedPipeHandleState( + h2, win32.PIPE_READMODE_MESSAGE, None, None + ) + + try: + win32.ConnectNamedPipe(h1, win32.NULL) + except WindowsError, e: + if e.args[0] != win32.ERROR_PIPE_CONNECTED: + raise + + c1 = _multiprocessing.PipeConnection(h1, writable=duplex) + c2 = _multiprocessing.PipeConnection(h2, readable=duplex) + + return c1, c2 + +# +# Definitions for connections based on sockets +# + +class SocketListener(object): + ''' + Representation of a socket which is bound to an address and listening + ''' + def __init__(self, address, family, backlog=1): + self._socket = socket.socket(getattr(socket, family)) + self._socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + # non-blocking sockets fix for issue 6056 + self._socket.settimeout(None) + self._socket.bind(address) + self._socket.listen(backlog) + 
self._address = self._socket.getsockname() + self._family = family + self._last_accepted = None + + if family == 'AF_UNIX': + self._unlink = Finalize( + self, os.unlink, args=(address,), exitpriority=0 + ) + else: + self._unlink = None + + def accept(self): + s, self._last_accepted = self._socket.accept() + # non-blocking sockets fix for issue 6056 + s.settimeout(None) + fd = duplicate(s.fileno()) + conn = _multiprocessing.Connection(fd) + s.close() + return conn + + def close(self): + self._socket.close() + if self._unlink is not None: + self._unlink() + + +def SocketClient(address): + ''' + Return a connection object connected to the socket given by `address` + ''' + family = address_type(address) + s = socket.socket( getattr(socket, family) ) + # non-blocking sockets fix for issue 6056 + s.settimeout(None) + t = _init_timeout() + + while 1: + try: + s.connect(address) + except socket.error, e: + if e.args[0] != errno.ECONNREFUSED or _check_timeout(t): + debug('failed to connect to address %s', address) + raise + time.sleep(0.01) + else: + break + else: + raise + + fd = duplicate(s.fileno()) + conn = _multiprocessing.Connection(fd) + s.close() + return conn + +# +# Definitions for connections based on named pipes +# + +if sys.platform == 'win32': + + class PipeListener(object): + ''' + Representation of a named pipe + ''' + def __init__(self, address, backlog=None): + self._address = address + handle = win32.CreateNamedPipe( + address, win32.PIPE_ACCESS_DUPLEX, + win32.PIPE_TYPE_MESSAGE | win32.PIPE_READMODE_MESSAGE | + win32.PIPE_WAIT, + win32.PIPE_UNLIMITED_INSTANCES, BUFSIZE, BUFSIZE, + win32.NMPWAIT_WAIT_FOREVER, win32.NULL + ) + self._handle_queue = [handle] + self._last_accepted = None + + sub_debug('listener created with address=%r', self._address) + + self.close = Finalize( + self, PipeListener._finalize_pipe_listener, + args=(self._handle_queue, self._address), exitpriority=0 + ) + + def accept(self): + newhandle = win32.CreateNamedPipe( + self._address, 
win32.PIPE_ACCESS_DUPLEX, + win32.PIPE_TYPE_MESSAGE | win32.PIPE_READMODE_MESSAGE | + win32.PIPE_WAIT, + win32.PIPE_UNLIMITED_INSTANCES, BUFSIZE, BUFSIZE, + win32.NMPWAIT_WAIT_FOREVER, win32.NULL + ) + self._handle_queue.append(newhandle) + handle = self._handle_queue.pop(0) + try: + win32.ConnectNamedPipe(handle, win32.NULL) + except WindowsError, e: + if e.args[0] != win32.ERROR_PIPE_CONNECTED: + raise + return _multiprocessing.PipeConnection(handle) + + @staticmethod + def _finalize_pipe_listener(queue, address): + sub_debug('closing listener with address=%r', address) + for handle in queue: + close(handle) + + def PipeClient(address): + ''' + Return a connection object connected to the pipe given by `address` + ''' + t = _init_timeout() + while 1: + try: + win32.WaitNamedPipe(address, 1000) + h = win32.CreateFile( + address, win32.GENERIC_READ | win32.GENERIC_WRITE, + 0, win32.NULL, win32.OPEN_EXISTING, 0, win32.NULL + ) + except WindowsError, e: + if e.args[0] not in (win32.ERROR_SEM_TIMEOUT, + win32.ERROR_PIPE_BUSY) or _check_timeout(t): + raise + else: + break + else: + raise + + win32.SetNamedPipeHandleState( + h, win32.PIPE_READMODE_MESSAGE, None, None + ) + return _multiprocessing.PipeConnection(h) + +# +# Authentication stuff +# + +MESSAGE_LENGTH = 20 + +CHALLENGE = b'#CHALLENGE#' +WELCOME = b'#WELCOME#' +FAILURE = b'#FAILURE#' + +def deliver_challenge(connection, authkey): + import hmac + assert isinstance(authkey, bytes) + message = os.urandom(MESSAGE_LENGTH) + connection.send_bytes(CHALLENGE + message) + digest = hmac.new(authkey, message).digest() + response = connection.recv_bytes(256) # reject large message + if response == digest: + connection.send_bytes(WELCOME) + else: + connection.send_bytes(FAILURE) + raise AuthenticationError('digest received was wrong') + +def answer_challenge(connection, authkey): + import hmac + assert isinstance(authkey, bytes) + message = connection.recv_bytes(256) # reject large message + assert message[:len(CHALLENGE)] 
== CHALLENGE, 'message = %r' % message + message = message[len(CHALLENGE):] + digest = hmac.new(authkey, message).digest() + connection.send_bytes(digest) + response = connection.recv_bytes(256) # reject large message + if response != WELCOME: + raise AuthenticationError('digest sent was rejected') + +# +# Support for using xmlrpclib for serialization +# + +class ConnectionWrapper(object): + def __init__(self, conn, dumps, loads): + self._conn = conn + self._dumps = dumps + self._loads = loads + for attr in ('fileno', 'close', 'poll', 'recv_bytes', 'send_bytes'): + obj = getattr(conn, attr) + setattr(self, attr, obj) + def send(self, obj): + s = self._dumps(obj) + self._conn.send_bytes(s) + def recv(self): + s = self._conn.recv_bytes() + return self._loads(s) + +def _xml_dumps(obj): + return xmlrpclib.dumps((obj,), None, None, None, 1).encode('utf8') + +def _xml_loads(s): + (obj,), method = xmlrpclib.loads(s.decode('utf8')) + return obj + +class XmlListener(Listener): + def accept(self): + global xmlrpclib + import xmlrpclib + obj = Listener.accept(self) + return ConnectionWrapper(obj, _xml_dumps, _xml_loads) + +def XmlClient(*args, **kwds): + global xmlrpclib + import xmlrpclib + return ConnectionWrapper(Client(*args, **kwds), _xml_dumps, _xml_loads) diff --git a/LTA/LTAIngest/multiprocessing/dummy/__init__.py b/LTA/LTAIngest/multiprocessing/dummy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..30b1b20f56af2f690811398a9e7226a6dbb48f64 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/dummy/__init__.py @@ -0,0 +1,126 @@ +# +# Support for the API of the multiprocessing package using threads +# +# multiprocessing/dummy/__init__.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [ + 'Process', 'current_process', 'active_children', 'freeze_support', + 'Lock', 'RLock', 'Semaphore', 'BoundedSemaphore', 'Condition', + 'Event', 'Queue', 'Manager', 'Pipe', 'Pool', 'JoinableQueue' + ] + +# +# Imports +# + +import 
threading +import sys +import weakref +import array +import itertools + +from multiprocessing import TimeoutError, cpu_count +from multiprocessing.dummy.connection import Pipe +from threading import Lock, RLock, Semaphore, BoundedSemaphore +from threading import Event +from Queue import Queue + +# +# +# + +class DummyProcess(threading.Thread): + + def __init__(self, group=None, target=None, name=None, args=(), kwargs={}): + threading.Thread.__init__(self, group, target, name, args, kwargs) + self._pid = None + self._children = weakref.WeakKeyDictionary() + self._start_called = False + self._parent = current_process() + + def start(self): + assert self._parent is current_process() + self._start_called = True + self._parent._children[self] = None + threading.Thread.start(self) + + @property + def exitcode(self): + if self._start_called and not self.is_alive(): + return 0 + else: + return None + +# +# +# + +class Condition(threading._Condition): + notify_all = threading._Condition.notify_all.im_func + +# +# +# + +Process = DummyProcess +current_process = threading.current_thread +current_process()._children = weakref.WeakKeyDictionary() + +def active_children(): + children = current_process()._children + for p in list(children): + if not p.is_alive(): + children.pop(p, None) + return list(children) + +def freeze_support(): + pass + +# +# +# + +class Namespace(object): + def __init__(self, **kwds): + self.__dict__.update(kwds) + def __repr__(self): + items = self.__dict__.items() + temp = [] + for name, value in items: + if not name.startswith('_'): + temp.append('%s=%r' % (name, value)) + temp.sort() + return 'Namespace(%s)' % str.join(', ', temp) + +dict = dict +list = list + +def Array(typecode, sequence, lock=True): + return array.array(typecode, sequence) + +class Value(object): + def __init__(self, typecode, value, lock=True): + self._typecode = typecode + self._value = value + def _get(self): + return self._value + def _set(self, value): + self._value = value + 
value = property(_get, _set) + def __repr__(self): + return '<%r(%r, %r)>'%(type(self).__name__,self._typecode,self._value) + +def Manager(): + return sys.modules[__name__] + +def shutdown(): + pass + +def Pool(processes=None, initializer=None, initargs=()): + from multiprocessing.pool import ThreadPool + return ThreadPool(processes, initializer, initargs) + +JoinableQueue = Queue diff --git a/LTA/LTAIngest/multiprocessing/dummy/connection.py b/LTA/LTAIngest/multiprocessing/dummy/connection.py new file mode 100644 index 0000000000000000000000000000000000000000..4f0a6805de1061b5025bba7af82435489ea22565 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/dummy/connection.py @@ -0,0 +1,61 @@ +# +# Analogue of `multiprocessing.connection` which uses queues instead of sockets +# +# multiprocessing/dummy/connection.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [ 'Client', 'Listener', 'Pipe' ] + +from Queue import Queue + + +families = [None] + + +class Listener(object): + + def __init__(self, address=None, family=None, backlog=1): + self._backlog_queue = Queue(backlog) + + def accept(self): + return Connection(*self._backlog_queue.get()) + + def close(self): + self._backlog_queue = None + + address = property(lambda self: self._backlog_queue) + + +def Client(address): + _in, _out = Queue(), Queue() + address.put((_out, _in)) + return Connection(_in, _out) + + +def Pipe(duplex=True): + a, b = Queue(), Queue() + return Connection(a, b), Connection(b, a) + + +class Connection(object): + + def __init__(self, _in, _out): + self._out = _out + self._in = _in + self.send = self.send_bytes = _out.put + self.recv = self.recv_bytes = _in.get + + def poll(self, timeout=0.0): + if self._in.qsize() > 0: + return True + if timeout <= 0.0: + return False + self._in.not_empty.acquire() + self._in.not_empty.wait(timeout) + self._in.not_empty.release() + return self._in.qsize() > 0 + + def close(self): + pass diff --git 
a/LTA/LTAIngest/multiprocessing/forking.py b/LTA/LTAIngest/multiprocessing/forking.py new file mode 100644 index 0000000000000000000000000000000000000000..7eda99180ac6ee93bd9b8ebaf9ac1149e32c8820 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/forking.py @@ -0,0 +1,473 @@ +# +# Module for starting a process object using os.fork() or CreateProcess() +# +# multiprocessing/forking.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +import os +import sys +import signal + +from multiprocessing import util, process + +__all__ = ['Popen', 'assert_spawning', 'exit', 'duplicate', 'close', 'ForkingPickler'] + +# +# Check that the current thread is spawning a child process +# + +def assert_spawning(self): + if not Popen.thread_is_spawning(): + raise RuntimeError( + '%s objects should only be shared between processes' + ' through inheritance' % type(self).__name__ + ) + +# +# Try making some callable types picklable +# + +from pickle import Pickler +class ForkingPickler(Pickler): + dispatch = Pickler.dispatch.copy() + + @classmethod + def register(cls, type, reduce): + def dispatcher(self, obj): + rv = reduce(obj) + self.save_reduce(obj=obj, *rv) + cls.dispatch[type] = dispatcher + +def _reduce_method(m): + if m.im_self is None: + return getattr, (m.im_class, m.im_func.func_name) + else: + return getattr, (m.im_self, m.im_func.func_name) +ForkingPickler.register(type(ForkingPickler.save), _reduce_method) + +def _reduce_method_descriptor(m): + return getattr, (m.__objclass__, m.__name__) +ForkingPickler.register(type(list.append), _reduce_method_descriptor) +ForkingPickler.register(type(int.__add__), _reduce_method_descriptor) + +#def _reduce_builtin_function_or_method(m): +# return getattr, (m.__self__, m.__name__) +#ForkingPickler.register(type(list().append), _reduce_builtin_function_or_method) +#ForkingPickler.register(type(int().__add__), _reduce_builtin_function_or_method) + +try: + from functools import partial +except ImportError: + pass +else: + def 
_reduce_partial(p): + return _rebuild_partial, (p.func, p.args, p.keywords or {}) + def _rebuild_partial(func, args, keywords): + return partial(func, *args, **keywords) + ForkingPickler.register(partial, _reduce_partial) + +# +# Unix +# + +if sys.platform != 'win32': + import time + + exit = os._exit + duplicate = os.dup + close = os.close + + # + # We define a Popen class similar to the one from subprocess, but + # whose constructor takes a process object as its argument. + # + + class Popen(object): + + def __init__(self, process_obj): + sys.stdout.flush() + sys.stderr.flush() + self.returncode = None + + self.pid = os.fork() + if self.pid == 0: + if 'random' in sys.modules: + import random + random.seed() + code = process_obj._bootstrap() + sys.stdout.flush() + sys.stderr.flush() + os._exit(code) + + def poll(self, flag=os.WNOHANG): + if self.returncode is None: + pid, sts = os.waitpid(self.pid, flag) + if pid == self.pid: + if os.WIFSIGNALED(sts): + self.returncode = -os.WTERMSIG(sts) + else: + assert os.WIFEXITED(sts) + self.returncode = os.WEXITSTATUS(sts) + return self.returncode + + def wait(self, timeout=None): + if timeout is None: + return self.poll(0) + deadline = time.time() + timeout + delay = 0.0005 + while 1: + res = self.poll() + if res is not None: + break + remaining = deadline - time.time() + if remaining <= 0: + break + delay = min(delay * 2, remaining, 0.05) + time.sleep(delay) + return res + + def terminate(self): + if self.returncode is None: + try: + os.kill(self.pid, signal.SIGTERM) + except OSError, e: + if self.wait(timeout=0.1) is None: + raise + + @staticmethod + def thread_is_spawning(): + return False + +# +# Windows +# + +else: + import thread + import msvcrt + import _subprocess + import time + + from ._multiprocessing import win32, Connection, PipeConnection + from .util import Finalize + + #try: + # from cPickle import dump, load, HIGHEST_PROTOCOL + #except ImportError: + from pickle import load, HIGHEST_PROTOCOL + + def 
dump(obj, file, protocol=None): + ForkingPickler(file, protocol).dump(obj) + + # + # + # + + TERMINATE = 0x10000 + WINEXE = (sys.platform == 'win32' and getattr(sys, 'frozen', False)) + + exit = win32.ExitProcess + close = win32.CloseHandle + + # + # _python_exe is the assumed path to the python executable. + # People embedding Python want to modify it. + # + + if sys.executable.lower().endswith('pythonservice.exe'): + _python_exe = os.path.join(sys.exec_prefix, 'python.exe') + else: + _python_exe = sys.executable + + def set_executable(exe): + global _python_exe + _python_exe = exe + + # + # + # + + def duplicate(handle, target_process=None, inheritable=False): + if target_process is None: + target_process = _subprocess.GetCurrentProcess() + return _subprocess.DuplicateHandle( + _subprocess.GetCurrentProcess(), handle, target_process, + 0, inheritable, _subprocess.DUPLICATE_SAME_ACCESS + ).Detach() + + # + # We define a Popen class similar to the one from subprocess, but + # whose constructor takes a process object as its argument. 
+ # + + class Popen(object): + ''' + Start a subprocess to run the code of a process object + ''' + _tls = thread._local() + + def __init__(self, process_obj): + # create pipe for communication with child + rfd, wfd = os.pipe() + + # get handle for read end of the pipe and make it inheritable + rhandle = duplicate(msvcrt.get_osfhandle(rfd), inheritable=True) + os.close(rfd) + + # start process + cmd = get_command_line() + [rhandle] + cmd = ' '.join('"%s"' % x for x in cmd) + hp, ht, pid, tid = _subprocess.CreateProcess( + _python_exe, cmd, None, None, 1, 0, None, None, None + ) + ht.Close() + close(rhandle) + + # set attributes of self + self.pid = pid + self.returncode = None + self._handle = hp + + # send information to child + prep_data = get_preparation_data(process_obj._name) + to_child = os.fdopen(wfd, 'wb') + Popen._tls.process_handle = int(hp) + try: + dump(prep_data, to_child, HIGHEST_PROTOCOL) + dump(process_obj, to_child, HIGHEST_PROTOCOL) + finally: + del Popen._tls.process_handle + to_child.close() + + @staticmethod + def thread_is_spawning(): + return getattr(Popen._tls, 'process_handle', None) is not None + + @staticmethod + def duplicate_for_child(handle): + return duplicate(handle, Popen._tls.process_handle) + + def wait(self, timeout=None): + if self.returncode is None: + if timeout is None: + msecs = _subprocess.INFINITE + else: + msecs = max(0, int(timeout * 1000 + 0.5)) + + res = _subprocess.WaitForSingleObject(int(self._handle), msecs) + if res == _subprocess.WAIT_OBJECT_0: + code = _subprocess.GetExitCodeProcess(self._handle) + if code == TERMINATE: + code = -signal.SIGTERM + self.returncode = code + + return self.returncode + + def poll(self): + return self.wait(timeout=0) + + def terminate(self): + if self.returncode is None: + try: + _subprocess.TerminateProcess(int(self._handle), TERMINATE) + except WindowsError: + if self.wait(timeout=0.1) is None: + raise + + # + # + # + + def is_forking(argv): + ''' + Return whether commandline 
indicates we are forking + ''' + if len(argv) >= 2 and argv[1] == '--multiprocessing-fork': + assert len(argv) == 3 + return True + else: + return False + + + def freeze_support(): + ''' + Run code for process object if this in not the main process + ''' + if is_forking(sys.argv): + main() + sys.exit() + + + def get_command_line(): + ''' + Returns prefix of command line used for spawning a child process + ''' + if process.current_process()._identity==() and is_forking(sys.argv): + raise RuntimeError(''' + Attempt to start a new process before the current process + has finished its bootstrapping phase. + + This probably means that you are on Windows and you have + forgotten to use the proper idiom in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce a Windows executable.''') + + if getattr(sys, 'frozen', False): + return [sys.executable, '--multiprocessing-fork'] + else: + prog = 'from multiprocessing.forking import main; main()' + return [_python_exe, '-c', prog, '--multiprocessing-fork'] + + + def main(): + ''' + Run code specifed by data received over pipe + ''' + assert is_forking(sys.argv) + + handle = int(sys.argv[-1]) + fd = msvcrt.open_osfhandle(handle, os.O_RDONLY) + from_parent = os.fdopen(fd, 'rb') + + process.current_process()._inheriting = True + preparation_data = load(from_parent) + prepare(preparation_data) + self = load(from_parent) + process.current_process()._inheriting = False + + from_parent.close() + + exitcode = self._bootstrap() + exit(exitcode) + + + def get_preparation_data(name): + ''' + Return info about parent needed by child to unpickle process object + ''' + from .util import _logger, _log_to_stderr + + d = dict( + name=name, + sys_path=sys.path, + sys_argv=sys.argv, + log_to_stderr=_log_to_stderr, + orig_dir=process.ORIGINAL_DIR, + authkey=process.current_process().authkey, + ) + + if _logger is not None: + 
d['log_level'] = _logger.getEffectiveLevel() + + if not WINEXE: + main_path = getattr(sys.modules['__main__'], '__file__', None) + if not main_path and sys.argv[0] not in ('', '-c'): + main_path = sys.argv[0] + if main_path is not None: + if not os.path.isabs(main_path) and \ + process.ORIGINAL_DIR is not None: + main_path = os.path.join(process.ORIGINAL_DIR, main_path) + d['main_path'] = os.path.normpath(main_path) + + return d + + # + # Make (Pipe)Connection picklable + # + + def reduce_connection(conn): + if not Popen.thread_is_spawning(): + raise RuntimeError( + 'By default %s objects can only be shared between processes\n' + 'using inheritance' % type(conn).__name__ + ) + return type(conn), (Popen.duplicate_for_child(conn.fileno()), + conn.readable, conn.writable) + + ForkingPickler.register(Connection, reduce_connection) + ForkingPickler.register(PipeConnection, reduce_connection) + +# +# Prepare current process +# + +old_main_modules = [] + +def prepare(data): + ''' + Try to get current process ready to unpickle process object + ''' + old_main_modules.append(sys.modules['__main__']) + + if 'name' in data: + process.current_process().name = data['name'] + + if 'authkey' in data: + process.current_process()._authkey = data['authkey'] + + if 'log_to_stderr' in data and data['log_to_stderr']: + util.log_to_stderr() + + if 'log_level' in data: + util.get_logger().setLevel(data['log_level']) + + if 'sys_path' in data: + sys.path = data['sys_path'] + + if 'sys_argv' in data: + sys.argv = data['sys_argv'] + + if 'dir' in data: + os.chdir(data['dir']) + + if 'orig_dir' in data: + process.ORIGINAL_DIR = data['orig_dir'] + + if 'main_path' in data: + main_path = data['main_path'] + main_name = os.path.splitext(os.path.basename(main_path))[0] + if main_name == '__init__': + main_name = os.path.basename(os.path.dirname(main_path)) + + if main_name != 'ipython': + import imp + + if main_path is None: + dirs = None + elif 
os.path.basename(main_path).startswith('__init__.py'): + dirs = [os.path.dirname(os.path.dirname(main_path))] + else: + dirs = [os.path.dirname(main_path)] + + assert main_name not in sys.modules, main_name + file, path_name, etc = imp.find_module(main_name, dirs) + try: + # We would like to do "imp.load_module('__main__', ...)" + # here. However, that would cause 'if __name__ == + # "__main__"' clauses to be executed. + main_module = imp.load_module( + '__parents_main__', file, path_name, etc + ) + finally: + if file: + file.close() + + sys.modules['__main__'] = main_module + main_module.__name__ = '__main__' + + # Try to make the potentially picklable objects in + # sys.modules['__main__'] realize they are in the main + # module -- somewhat ugly. + for obj in main_module.__dict__.values(): + try: + if obj.__module__ == '__parents_main__': + obj.__module__ = '__main__' + except Exception: + pass diff --git a/LTA/LTAIngest/multiprocessing/heap.py b/LTA/LTAIngest/multiprocessing/heap.py new file mode 100644 index 0000000000000000000000000000000000000000..7e596ca70fa78e67576e38534f265c9b94112ba7 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/heap.py @@ -0,0 +1,201 @@ +# +# Module which supports allocation of memory from an mmap +# +# multiprocessing/heap.py +# +# Copyright (c) 2007-2008, R Oudkerk --- see COPYING.txt +# + +import bisect +import mmap +import tempfile +import os +import sys +import threading +import itertools + +import _multiprocessing +from multiprocessing.util import Finalize, info +from multiprocessing.forking import assert_spawning + +__all__ = ['BufferWrapper'] + +# +# Inheirtable class which wraps an mmap, and from which blocks can be allocated +# + +if sys.platform == 'win32': + + from ._multiprocessing import win32 + + class Arena(object): + + _counter = itertools.count() + + def __init__(self, size): + self.size = size + self.name = 'pym-%d-%d' % (os.getpid(), Arena._counter.next()) + self.buffer = mmap.mmap(-1, self.size, 
tagname=self.name) + assert win32.GetLastError() == 0, 'tagname already in use' + self._state = (self.size, self.name) + + def __getstate__(self): + assert_spawning(self) + return self._state + + def __setstate__(self, state): + self.size, self.name = self._state = state + self.buffer = mmap.mmap(-1, self.size, tagname=self.name) + assert win32.GetLastError() == win32.ERROR_ALREADY_EXISTS + +else: + + class Arena(object): + + def __init__(self, size): + self.buffer = mmap.mmap(-1, size) + self.size = size + self.name = None + +# +# Class allowing allocation of chunks of memory from arenas +# + +class Heap(object): + + _alignment = 8 + + def __init__(self, size=mmap.PAGESIZE): + self._lastpid = os.getpid() + self._lock = threading.Lock() + self._size = size + self._lengths = [] + self._len_to_seq = {} + self._start_to_block = {} + self._stop_to_block = {} + self._allocated_blocks = set() + self._arenas = [] + + @staticmethod + def _roundup(n, alignment): + # alignment must be a power of 2 + mask = alignment - 1 + return (n + mask) & ~mask + + def _malloc(self, size): + # returns a large enough block -- it might be much larger + i = bisect.bisect_left(self._lengths, size) + if i == len(self._lengths): + length = self._roundup(max(self._size, size), mmap.PAGESIZE) + self._size *= 2 + info('allocating a new mmap of length %d', length) + arena = Arena(length) + self._arenas.append(arena) + return (arena, 0, length) + else: + length = self._lengths[i] + seq = self._len_to_seq[length] + block = seq.pop() + if not seq: + del self._len_to_seq[length], self._lengths[i] + + (arena, start, stop) = block + del self._start_to_block[(arena, start)] + del self._stop_to_block[(arena, stop)] + return block + + def _free(self, block): + # free location and try to merge with neighbours + (arena, start, stop) = block + + try: + prev_block = self._stop_to_block[(arena, start)] + except KeyError: + pass + else: + start, _ = self._absorb(prev_block) + + try: + next_block = 
self._start_to_block[(arena, stop)] + except KeyError: + pass + else: + _, stop = self._absorb(next_block) + + block = (arena, start, stop) + length = stop - start + + try: + self._len_to_seq[length].append(block) + except KeyError: + self._len_to_seq[length] = [block] + bisect.insort(self._lengths, length) + + self._start_to_block[(arena, start)] = block + self._stop_to_block[(arena, stop)] = block + + def _absorb(self, block): + # deregister this block so it can be merged with a neighbour + (arena, start, stop) = block + del self._start_to_block[(arena, start)] + del self._stop_to_block[(arena, stop)] + + length = stop - start + seq = self._len_to_seq[length] + seq.remove(block) + if not seq: + del self._len_to_seq[length] + self._lengths.remove(length) + + return start, stop + + def free(self, block): + # free a block returned by malloc() + assert os.getpid() == self._lastpid + self._lock.acquire() + try: + self._allocated_blocks.remove(block) + self._free(block) + finally: + self._lock.release() + + def malloc(self, size): + # return a block of right size (possibly rounded up) + assert 0 <= size < sys.maxint + if os.getpid() != self._lastpid: + self.__init__() # reinitialize after fork + self._lock.acquire() + try: + size = self._roundup(max(size,1), self._alignment) + (arena, start, stop) = self._malloc(size) + new_stop = start + size + if new_stop < stop: + self._free((arena, new_stop, stop)) + block = (arena, start, new_stop) + self._allocated_blocks.add(block) + return block + finally: + self._lock.release() + +# +# Class representing a chunk of an mmap -- can be inherited +# + +class BufferWrapper(object): + + _heap = Heap() + + def __init__(self, size): + assert 0 <= size < sys.maxint + block = BufferWrapper._heap.malloc(size) + self._state = (block, size) + Finalize(self, BufferWrapper._heap.free, args=(block,)) + + def get_address(self): + (arena, start, stop), size = self._state + address, length = _multiprocessing.address_of_buffer(arena.buffer) + 
assert size <= length + return address + start + + def get_size(self): + return self._state[1] diff --git a/LTA/LTAIngest/multiprocessing/managers.py b/LTA/LTAIngest/multiprocessing/managers.py new file mode 100644 index 0000000000000000000000000000000000000000..e331116c89dceaa4f2978eb8c780cbe13f6fd9cd --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/managers.py @@ -0,0 +1,1083 @@ +# +# Module providing the `SyncManager` class for dealing +# with shared objects +# +# multiprocessing/managers.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [ 'BaseManager', 'SyncManager', 'BaseProxy', 'Token' ] + +# +# Imports +# + +import os +import sys +import weakref +import threading +import array +import Queue + +from traceback import format_exc +from multiprocessing import Process, current_process, active_children, Pool, util, connection +from multiprocessing.process import AuthenticationString +from multiprocessing.forking import exit, Popen, assert_spawning, ForkingPickler +from multiprocessing.util import Finalize, info + +try: + from cPickle import PicklingError +except ImportError: + from pickle import PicklingError + +# +# Register some things for pickling +# + +def reduce_array(a): + return array.array, (a.typecode, a.tostring()) +ForkingPickler.register(array.array, reduce_array) + +view_types = [type(getattr({}, name)()) for name in ('items','keys','values')] + +# +# Type for identifying shared objects +# + +class Token(object): + ''' + Type to uniquely indentify a shared object + ''' + __slots__ = ('typeid', 'address', 'id') + + def __init__(self, typeid, address, id): + (self.typeid, self.address, self.id) = (typeid, address, id) + + def __getstate__(self): + return (self.typeid, self.address, self.id) + + def __setstate__(self, state): + (self.typeid, self.address, self.id) = state + + def __repr__(self): + return 'Token(typeid=%r, address=%r, id=%r)' % \ + (self.typeid, self.address, self.id) + +# +# Function for communication with 
a manager's server process +# + +def dispatch(c, id, methodname, args=(), kwds={}): + ''' + Send a message to manager using connection `c` and return response + ''' + c.send((id, methodname, args, kwds)) + kind, result = c.recv() + if kind == '#RETURN': + return result + raise convert_to_error(kind, result) + +def convert_to_error(kind, result): + if kind == '#ERROR': + return result + elif kind == '#TRACEBACK': + assert type(result) is str + return RemoteError(result) + elif kind == '#UNSERIALIZABLE': + assert type(result) is str + return RemoteError('Unserializable message: %s\n' % result) + else: + return ValueError('Unrecognized message type') + +class RemoteError(Exception): + def __str__(self): + return ('\n' + '-'*75 + '\n' + str(self.args[0]) + '-'*75) + +# +# Functions for finding the method names of an object +# + +def all_methods(obj): + ''' + Return a list of names of methods of `obj` + ''' + temp = [] + for name in dir(obj): + func = getattr(obj, name) + if hasattr(func, '__call__'): + temp.append(name) + return temp + +def public_methods(obj): + ''' + Return a list of names of methods of `obj` which do not start with '_' + ''' + return [name for name in all_methods(obj) if name[0] != '_'] + +# +# Server which is run in a process controlled by a manager +# + +class Server(object): + ''' + Server class which runs in a process controlled by a manager object + ''' + public = ['shutdown', 'create', 'accept_connection', 'get_methods', + 'debug_info', 'number_of_objects', 'dummy', 'incref', 'decref'] + + def __init__(self, registry, address, authkey, serializer): + assert isinstance(authkey, bytes) + self.registry = registry + self.authkey = AuthenticationString(authkey) + Listener, Client = listener_client[serializer] + + # do authentication later + self.listener = Listener(address=address, backlog=5) + self.address = self.listener.address + + self.id_to_obj = {0: (None, ())} + self.id_to_refcount = {} + self.mutex = threading.RLock() + self.stop = 0 + + 
def serve_forever(self): + ''' + Run the server forever + ''' + current_process()._manager_server = self + try: + try: + while 1: + try: + c = self.listener.accept() + except (OSError, IOError): + continue + t = threading.Thread(target=self.handle_request, args=(c,)) + t.daemon = True + t.start() + except (KeyboardInterrupt, SystemExit): + pass + finally: + self.stop = 999 + self.listener.close() + + def handle_request(self, c): + ''' + Handle a new connection + ''' + funcname = result = request = None + try: + connection.deliver_challenge(c, self.authkey) + connection.answer_challenge(c, self.authkey) + request = c.recv() + ignore, funcname, args, kwds = request + assert funcname in self.public, '%r unrecognized' % funcname + func = getattr(self, funcname) + except Exception: + msg = ('#TRACEBACK', format_exc()) + else: + try: + result = func(c, *args, **kwds) + except Exception: + msg = ('#TRACEBACK', format_exc()) + else: + msg = ('#RETURN', result) + try: + c.send(msg) + except Exception, e: + try: + c.send(('#TRACEBACK', format_exc())) + except Exception: + pass + util.info('Failure to send message: %r', msg) + util.info(' ... request was %r', request) + util.info(' ... 
exception was %r', e) + + c.close() + + def serve_client(self, conn): + ''' + Handle requests from the proxies in a particular process/thread + ''' + util.debug('starting server thread to service %r', + threading.current_thread().name) + + recv = conn.recv + send = conn.send + id_to_obj = self.id_to_obj + + while not self.stop: + + try: + methodname = obj = None + request = recv() + ident, methodname, args, kwds = request + obj, exposed, gettypeid = id_to_obj[ident] + + if methodname not in exposed: + raise AttributeError( + 'method %r of %r object is not in exposed=%r' % + (methodname, type(obj), exposed) + ) + + function = getattr(obj, methodname) + + try: + res = function(*args, **kwds) + except Exception, e: + msg = ('#ERROR', e) + else: + typeid = gettypeid and gettypeid.get(methodname, None) + if typeid: + rident, rexposed = self.create(conn, typeid, res) + token = Token(typeid, self.address, rident) + msg = ('#PROXY', (rexposed, token)) + else: + msg = ('#RETURN', res) + + except AttributeError: + if methodname is None: + msg = ('#TRACEBACK', format_exc()) + else: + try: + fallback_func = self.fallback_mapping[methodname] + result = fallback_func( + self, conn, ident, obj, *args, **kwds + ) + msg = ('#RETURN', result) + except Exception: + msg = ('#TRACEBACK', format_exc()) + + except EOFError: + util.debug('got EOF -- exiting thread serving %r', + threading.current_thread().name) + sys.exit(0) + + except Exception: + msg = ('#TRACEBACK', format_exc()) + + try: + try: + send(msg) + except Exception, e: + send(('#UNSERIALIZABLE', repr(msg))) + except Exception, e: + util.info('exception in thread serving %r', + threading.current_thread().name) + util.info(' ... message was %r', msg) + util.info(' ... 
exception was %r', e) + conn.close() + sys.exit(1) + + def fallback_getvalue(self, conn, ident, obj): + return obj + + def fallback_str(self, conn, ident, obj): + return str(obj) + + def fallback_repr(self, conn, ident, obj): + return repr(obj) + + fallback_mapping = { + '__str__':fallback_str, + '__repr__':fallback_repr, + '#GETVALUE':fallback_getvalue + } + + def dummy(self, c): + pass + + def debug_info(self, c): + ''' + Return some info --- useful to spot problems with refcounting + ''' + self.mutex.acquire() + try: + result = [] + keys = self.id_to_obj.keys() + keys.sort() + for ident in keys: + if ident != 0: + result.append(' %s: refcount=%s\n %s' % + (ident, self.id_to_refcount[ident], + str(self.id_to_obj[ident][0])[:75])) + return '\n'.join(result) + finally: + self.mutex.release() + + def number_of_objects(self, c): + ''' + Number of shared objects + ''' + return len(self.id_to_obj) - 1 # don't count ident=0 + + def shutdown(self, c): + ''' + Shutdown this process + ''' + try: + try: + util.debug('manager received shutdown message') + c.send(('#RETURN', None)) + + if sys.stdout != sys.__stdout__: + util.debug('resetting stdout, stderr') + sys.stdout = sys.__stdout__ + sys.stderr = sys.__stderr__ + + util._run_finalizers(0) + + for p in active_children(): + util.debug('terminating a child process of manager') + p.terminate() + + for p in active_children(): + util.debug('terminating a child process of manager') + p.join() + + util._run_finalizers() + util.info('manager exiting with exitcode 0') + except: + import traceback + traceback.print_exc() + finally: + exit(0) + + def create(self, c, typeid, *args, **kwds): + ''' + Create a new shared object and return its id + ''' + self.mutex.acquire() + try: + callable, exposed, method_to_typeid, proxytype = \ + self.registry[typeid] + + if callable is None: + assert len(args) == 1 and not kwds + obj = args[0] + else: + obj = callable(*args, **kwds) + + if exposed is None: + exposed = public_methods(obj) + if 
method_to_typeid is not None: + assert type(method_to_typeid) is dict + exposed = list(exposed) + list(method_to_typeid) + + ident = '%x' % id(obj) # convert to string because xmlrpclib + # only has 32 bit signed integers + util.debug('%r callable returned object with id %r', typeid, ident) + + self.id_to_obj[ident] = (obj, set(exposed), method_to_typeid) + if ident not in self.id_to_refcount: + self.id_to_refcount[ident] = 0 + # increment the reference count immediately, to avoid + # this object being garbage collected before a Proxy + # object for it can be created. The caller of create() + # is responsible for doing a decref once the Proxy object + # has been created. + self.incref(c, ident) + return ident, tuple(exposed) + finally: + self.mutex.release() + + def get_methods(self, c, token): + ''' + Return the methods of the shared object indicated by token + ''' + return tuple(self.id_to_obj[token.id][1]) + + def accept_connection(self, c, name): + ''' + Spawn a new thread to serve this connection + ''' + threading.current_thread().name = name + c.send(('#RETURN', None)) + self.serve_client(c) + + def incref(self, c, ident): + self.mutex.acquire() + try: + self.id_to_refcount[ident] += 1 + finally: + self.mutex.release() + + def decref(self, c, ident): + self.mutex.acquire() + try: + assert self.id_to_refcount[ident] >= 1 + self.id_to_refcount[ident] -= 1 + if self.id_to_refcount[ident] == 0: + del self.id_to_obj[ident], self.id_to_refcount[ident] + util.debug('disposing of obj with id %r', ident) + finally: + self.mutex.release() + +# +# Class to represent state of a manager +# + +class State(object): + __slots__ = ['value'] + INITIAL = 0 + STARTED = 1 + SHUTDOWN = 2 + +# +# Mapping from serializer name to Listener and Client types +# + +listener_client = { + 'pickle' : (connection.Listener, connection.Client), + 'xmlrpclib' : (connection.XmlListener, connection.XmlClient) + } + +# +# Definition of BaseManager +# + +class BaseManager(object): + ''' + Base 
class for managers + ''' + _registry = {} + _Server = Server + + def __init__(self, address=None, authkey=None, serializer='pickle'): + if authkey is None: + authkey = current_process().authkey + self._address = address # XXX not final address if eg ('', 0) + self._authkey = AuthenticationString(authkey) + self._state = State() + self._state.value = State.INITIAL + self._serializer = serializer + self._Listener, self._Client = listener_client[serializer] + + def __reduce__(self): + return type(self).from_address, \ + (self._address, self._authkey, self._serializer) + + def get_server(self): + ''' + Return server object with serve_forever() method and address attribute + ''' + assert self._state.value == State.INITIAL + return Server(self._registry, self._address, + self._authkey, self._serializer) + + def connect(self): + ''' + Connect manager object to the server process + ''' + Listener, Client = listener_client[self._serializer] + conn = Client(self._address, authkey=self._authkey) + dispatch(conn, None, 'dummy') + self._state.value = State.STARTED + + def start(self): + ''' + Spawn a server process for this manager object + ''' + assert self._state.value == State.INITIAL + + # pipe over which we will retrieve address of server + reader, writer = connection.Pipe(duplex=False) + + # spawn process which runs a server + self._process = Process( + target=type(self)._run_server, + args=(self._registry, self._address, self._authkey, + self._serializer, writer), + ) + ident = ':'.join(str(i) for i in self._process._identity) + self._process.name = type(self).__name__ + '-' + ident + self._process.start() + + # get address of server + writer.close() + self._address = reader.recv() + reader.close() + + # register a finalizer + self._state.value = State.STARTED + self.shutdown = util.Finalize( + self, type(self)._finalize_manager, + args=(self._process, self._address, self._authkey, + self._state, self._Client), + exitpriority=0 + ) + + @classmethod + def _run_server(cls, 
registry, address, authkey, serializer, writer): + ''' + Create a server, report its address and run it + ''' + # create server + server = cls._Server(registry, address, authkey, serializer) + + # inform parent process of the server's address + writer.send(server.address) + writer.close() + + # run the manager + util.info('manager serving at %r', server.address) + server.serve_forever() + + def _create(self, typeid, *args, **kwds): + ''' + Create a new shared object; return the token and exposed tuple + ''' + assert self._state.value == State.STARTED, 'server not yet started' + conn = self._Client(self._address, authkey=self._authkey) + try: + id, exposed = dispatch(conn, None, 'create', (typeid,)+args, kwds) + finally: + conn.close() + return Token(typeid, self._address, id), exposed + + def join(self, timeout=None): + ''' + Join the manager process (if it has been spawned) + ''' + self._process.join(timeout) + + def _debug_info(self): + ''' + Return some info about the servers shared objects and connections + ''' + conn = self._Client(self._address, authkey=self._authkey) + try: + return dispatch(conn, None, 'debug_info') + finally: + conn.close() + + def _number_of_objects(self): + ''' + Return the number of shared objects + ''' + conn = self._Client(self._address, authkey=self._authkey) + try: + return dispatch(conn, None, 'number_of_objects') + finally: + conn.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.shutdown() + + @staticmethod + def _finalize_manager(process, address, authkey, state, _Client): + ''' + Shutdown the manager process; will be registered as a finalizer + ''' + if process.is_alive(): + util.info('sending shutdown message to manager') + try: + conn = _Client(address, authkey=authkey) + try: + dispatch(conn, None, 'shutdown') + finally: + conn.close() + except Exception: + pass + + process.join(timeout=0.2) + if process.is_alive(): + util.info('manager still alive') + if 
hasattr(process, 'terminate'): + util.info('trying to `terminate()` manager process') + process.terminate() + process.join(timeout=0.1) + if process.is_alive(): + util.info('manager still alive after terminate') + + state.value = State.SHUTDOWN + try: + del BaseProxy._address_to_local[address] + except KeyError: + pass + + address = property(lambda self: self._address) + + @classmethod + def register(cls, typeid, callable=None, proxytype=None, exposed=None, + method_to_typeid=None, create_method=True): + ''' + Register a typeid with the manager type + ''' + if '_registry' not in cls.__dict__: + cls._registry = cls._registry.copy() + + if proxytype is None: + proxytype = AutoProxy + + exposed = exposed or getattr(proxytype, '_exposed_', None) + + method_to_typeid = method_to_typeid or \ + getattr(proxytype, '_method_to_typeid_', None) + + if method_to_typeid: + for key, value in method_to_typeid.items(): + assert type(key) is str, '%r is not a string' % key + assert type(value) is str, '%r is not a string' % value + + cls._registry[typeid] = ( + callable, exposed, method_to_typeid, proxytype + ) + + if create_method: + def temp(self, *args, **kwds): + util.debug('requesting creation of a shared %r object', typeid) + token, exp = self._create(typeid, *args, **kwds) + proxy = proxytype( + token, self._serializer, manager=self, + authkey=self._authkey, exposed=exp + ) + conn = self._Client(token.address, authkey=self._authkey) + dispatch(conn, None, 'decref', (token.id,)) + return proxy + temp.__name__ = typeid + setattr(cls, typeid, temp) + +# +# Subclass of set which get cleared after a fork +# + +class ProcessLocalSet(set): + def __init__(self): + util.register_after_fork(self, lambda obj: obj.clear()) + def __reduce__(self): + return type(self), () + +# +# Definition of BaseProxy +# + +class BaseProxy(object): + ''' + A base for proxies of shared objects + ''' + _address_to_local = {} + _mutex = util.ForkAwareThreadLock() + + def __init__(self, token, serializer, 
manager=None, + authkey=None, exposed=None, incref=True): + BaseProxy._mutex.acquire() + try: + tls_idset = BaseProxy._address_to_local.get(token.address, None) + if tls_idset is None: + tls_idset = util.ForkAwareLocal(), ProcessLocalSet() + BaseProxy._address_to_local[token.address] = tls_idset + finally: + BaseProxy._mutex.release() + + # self._tls is used to record the connection used by this + # thread to communicate with the manager at token.address + self._tls = tls_idset[0] + + # self._idset is used to record the identities of all shared + # objects for which the current process owns references and + # which are in the manager at token.address + self._idset = tls_idset[1] + + self._token = token + self._id = self._token.id + self._manager = manager + self._serializer = serializer + self._Client = listener_client[serializer][1] + + if authkey is not None: + self._authkey = AuthenticationString(authkey) + elif self._manager is not None: + self._authkey = self._manager._authkey + else: + self._authkey = current_process().authkey + + if incref: + self._incref() + + util.register_after_fork(self, BaseProxy._after_fork) + + def _connect(self): + util.debug('making connection to manager') + name = current_process().name + if threading.current_thread().name != 'MainThread': + name += '|' + threading.current_thread().name + conn = self._Client(self._token.address, authkey=self._authkey) + dispatch(conn, None, 'accept_connection', (name,)) + self._tls.connection = conn + + def _callmethod(self, methodname, args=(), kwds={}): + ''' + Try to call a method of the referrent and return a copy of the result + ''' + try: + conn = self._tls.connection + except AttributeError: + util.debug('thread %r does not own a connection', + threading.current_thread().name) + self._connect() + conn = self._tls.connection + + conn.send((self._id, methodname, args, kwds)) + kind, result = conn.recv() + + if kind == '#RETURN': + return result + elif kind == '#PROXY': + exposed, token = 
result + proxytype = self._manager._registry[token.typeid][-1] + proxy = proxytype( + token, self._serializer, manager=self._manager, + authkey=self._authkey, exposed=exposed + ) + conn = self._Client(token.address, authkey=self._authkey) + dispatch(conn, None, 'decref', (token.id,)) + return proxy + raise convert_to_error(kind, result) + + def _getvalue(self): + ''' + Get a copy of the value of the referent + ''' + return self._callmethod('#GETVALUE') + + def _incref(self): + conn = self._Client(self._token.address, authkey=self._authkey) + dispatch(conn, None, 'incref', (self._id,)) + util.debug('INCREF %r', self._token.id) + + self._idset.add(self._id) + + state = self._manager and self._manager._state + + self._close = util.Finalize( + self, BaseProxy._decref, + args=(self._token, self._authkey, state, + self._tls, self._idset, self._Client), + exitpriority=10 + ) + + @staticmethod + def _decref(token, authkey, state, tls, idset, _Client): + idset.discard(token.id) + + # check whether manager is still alive + if state is None or state.value == State.STARTED: + # tell manager this process no longer cares about referent + try: + util.debug('DECREF %r', token.id) + conn = _Client(token.address, authkey=authkey) + dispatch(conn, None, 'decref', (token.id,)) + except Exception, e: + util.debug('... 
decref failed %s', e) + + else: + util.debug('DECREF %r -- manager already shutdown', token.id) + + # check whether we can close this thread's connection because + # the process owns no more references to objects for this manager + if not idset and hasattr(tls, 'connection'): + util.debug('thread %r has no more proxies so closing conn', + threading.current_thread().name) + tls.connection.close() + del tls.connection + + def _after_fork(self): + self._manager = None + try: + self._incref() + except Exception, e: + # the proxy may just be for a manager which has shutdown + util.info('incref failed: %s' % e) + + def __reduce__(self): + kwds = {} + if Popen.thread_is_spawning(): + kwds['authkey'] = self._authkey + + if getattr(self, '_isauto', False): + kwds['exposed'] = self._exposed_ + return (RebuildProxy, + (AutoProxy, self._token, self._serializer, kwds)) + else: + return (RebuildProxy, + (type(self), self._token, self._serializer, kwds)) + + def __deepcopy__(self, memo): + return self._getvalue() + + def __repr__(self): + return '<%s object, typeid %r at %s>' % \ + (type(self).__name__, self._token.typeid, '0x%x' % id(self)) + + def __str__(self): + ''' + Return representation of the referent (or a fall-back if that fails) + ''' + try: + return self._callmethod('__repr__') + except Exception: + return repr(self)[:-1] + "; '__str__()' failed>" + +# +# Function used for unpickling +# + +def RebuildProxy(func, token, serializer, kwds): + ''' + Function used for unpickling proxy objects. + + If possible the shared object is returned, or otherwise a proxy for it. 
+ ''' + server = getattr(current_process(), '_manager_server', None) + + if server and server.address == token.address: + return server.id_to_obj[token.id][0] + else: + incref = ( + kwds.pop('incref', True) and + not getattr(current_process(), '_inheriting', False) + ) + return func(token, serializer, incref=incref, **kwds) + +# +# Functions to create proxies and proxy types +# + +def MakeProxyType(name, exposed, _cache={}): + ''' + Return an proxy type whose methods are given by `exposed` + ''' + exposed = tuple(exposed) + try: + return _cache[(name, exposed)] + except KeyError: + pass + + dic = {} + + for meth in exposed: + exec '''def %s(self, *args, **kwds): + return self._callmethod(%r, args, kwds)''' % (meth, meth) in dic + + ProxyType = type(name, (BaseProxy,), dic) + ProxyType._exposed_ = exposed + _cache[(name, exposed)] = ProxyType + return ProxyType + + +def AutoProxy(token, serializer, manager=None, authkey=None, + exposed=None, incref=True): + ''' + Return an auto-proxy for `token` + ''' + _Client = listener_client[serializer][1] + + if exposed is None: + conn = _Client(token.address, authkey=authkey) + try: + exposed = dispatch(conn, None, 'get_methods', (token,)) + finally: + conn.close() + + if authkey is None and manager is not None: + authkey = manager._authkey + if authkey is None: + authkey = current_process().authkey + + ProxyType = MakeProxyType('AutoProxy[%s]' % token.typeid, exposed) + proxy = ProxyType(token, serializer, manager=manager, authkey=authkey, + incref=incref) + proxy._isauto = True + return proxy + +# +# Types/callables which we will register with SyncManager +# + +class Namespace(object): + def __init__(self, **kwds): + self.__dict__.update(kwds) + def __repr__(self): + items = self.__dict__.items() + temp = [] + for name, value in items: + if not name.startswith('_'): + temp.append('%s=%r' % (name, value)) + temp.sort() + return 'Namespace(%s)' % str.join(', ', temp) + +class Value(object): + def __init__(self, typecode, 
value, lock=True): + self._typecode = typecode + self._value = value + def get(self): + return self._value + def set(self, value): + self._value = value + def __repr__(self): + return '%s(%r, %r)'%(type(self).__name__, self._typecode, self._value) + value = property(get, set) + +def Array(typecode, sequence, lock=True): + return array.array(typecode, sequence) + +# +# Proxy types used by SyncManager +# + +class IteratorProxy(BaseProxy): + # XXX remove methods for Py3.0 and Py2.6 + _exposed_ = ('__next__', 'next', 'send', 'throw', 'close') + def __iter__(self): + return self + def __next__(self, *args): + return self._callmethod('__next__', args) + def next(self, *args): + return self._callmethod('next', args) + def send(self, *args): + return self._callmethod('send', args) + def throw(self, *args): + return self._callmethod('throw', args) + def close(self, *args): + return self._callmethod('close', args) + + +class AcquirerProxy(BaseProxy): + _exposed_ = ('acquire', 'release') + def acquire(self, blocking=True): + return self._callmethod('acquire', (blocking,)) + def release(self): + return self._callmethod('release') + def __enter__(self): + return self._callmethod('acquire') + def __exit__(self, exc_type, exc_val, exc_tb): + return self._callmethod('release') + + +class ConditionProxy(AcquirerProxy): + # XXX will Condition.notfyAll() name be available in Py3.0? 
+ _exposed_ = ('acquire', 'release', 'wait', 'notify', 'notify_all') + def wait(self, timeout=None): + return self._callmethod('wait', (timeout,)) + def notify(self): + return self._callmethod('notify') + def notify_all(self): + return self._callmethod('notify_all') + +class EventProxy(BaseProxy): + _exposed_ = ('is_set', 'set', 'clear', 'wait') + def is_set(self): + return self._callmethod('is_set') + def set(self): + return self._callmethod('set') + def clear(self): + return self._callmethod('clear') + def wait(self, timeout=None): + return self._callmethod('wait', (timeout,)) + +class NamespaceProxy(BaseProxy): + _exposed_ = ('__getattribute__', '__setattr__', '__delattr__') + def __getattr__(self, key): + if key[0] == '_': + return object.__getattribute__(self, key) + callmethod = object.__getattribute__(self, '_callmethod') + return callmethod('__getattribute__', (key,)) + def __setattr__(self, key, value): + if key[0] == '_': + return object.__setattr__(self, key, value) + callmethod = object.__getattribute__(self, '_callmethod') + return callmethod('__setattr__', (key, value)) + def __delattr__(self, key): + if key[0] == '_': + return object.__delattr__(self, key) + callmethod = object.__getattribute__(self, '_callmethod') + return callmethod('__delattr__', (key,)) + + +class ValueProxy(BaseProxy): + _exposed_ = ('get', 'set') + def get(self): + return self._callmethod('get') + def set(self, value): + return self._callmethod('set', (value,)) + value = property(get, set) + + +BaseListProxy = MakeProxyType('BaseListProxy', ( + '__add__', '__contains__', '__delitem__', '__delslice__', + '__getitem__', '__getslice__', '__len__', '__mul__', + '__reversed__', '__rmul__', '__setitem__', '__setslice__', + 'append', 'count', 'extend', 'index', 'insert', 'pop', 'remove', + 'reverse', 'sort', '__imul__' + )) # XXX __getslice__ and __setslice__ unneeded in Py3.0 +class ListProxy(BaseListProxy): + def __iadd__(self, value): + self._callmethod('extend', (value,)) + return 
self + def __imul__(self, value): + self._callmethod('__imul__', (value,)) + return self + + +DictProxy = MakeProxyType('DictProxy', ( + '__contains__', '__delitem__', '__getitem__', '__len__', + '__setitem__', 'clear', 'copy', 'get', 'has_key', 'items', + 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values' + )) + + +ArrayProxy = MakeProxyType('ArrayProxy', ( + '__len__', '__getitem__', '__setitem__', '__getslice__', '__setslice__' + )) # XXX __getslice__ and __setslice__ unneeded in Py3.0 + + +PoolProxy = MakeProxyType('PoolProxy', ( + 'apply', 'apply_async', 'close', 'imap', 'imap_unordered', 'join', + 'map', 'map_async', 'terminate' + )) +PoolProxy._method_to_typeid_ = { + 'apply_async': 'AsyncResult', + 'map_async': 'AsyncResult', + 'imap': 'Iterator', + 'imap_unordered': 'Iterator' + } + +# +# Definition of SyncManager +# + +class SyncManager(BaseManager): + ''' + Subclass of `BaseManager` which supports a number of shared object types. + + The types registered are those intended for the synchronization + of threads, plus `dict`, `list` and `Namespace`. + + The `multiprocessing.Manager()` function creates started instances of + this class. 
+ ''' + +SyncManager.register('Queue', Queue.Queue) +SyncManager.register('JoinableQueue', Queue.Queue) +SyncManager.register('Event', threading.Event, EventProxy) +SyncManager.register('Lock', threading.Lock, AcquirerProxy) +SyncManager.register('RLock', threading.RLock, AcquirerProxy) +SyncManager.register('Semaphore', threading.Semaphore, AcquirerProxy) +SyncManager.register('BoundedSemaphore', threading.BoundedSemaphore, + AcquirerProxy) +SyncManager.register('Condition', threading.Condition, ConditionProxy) +SyncManager.register('Pool', Pool, PoolProxy) +SyncManager.register('list', list, ListProxy) +SyncManager.register('dict', dict, DictProxy) +SyncManager.register('Value', Value, ValueProxy) +SyncManager.register('Array', Array, ArrayProxy) +SyncManager.register('Namespace', Namespace, NamespaceProxy) + +# types returned by methods of PoolProxy +SyncManager.register('Iterator', proxytype=IteratorProxy, create_method=False) +SyncManager.register('AsyncResult', create_method=False) diff --git a/LTA/LTAIngest/multiprocessing/pool.py b/LTA/LTAIngest/multiprocessing/pool.py new file mode 100644 index 0000000000000000000000000000000000000000..9da27d48edfff5d35bb8acc9c00ebf36ee4c8fc1 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/pool.py @@ -0,0 +1,596 @@ +# +# Module providing the `Pool` class for managing a process pool +# +# multiprocessing/pool.py +# +# Copyright (c) 2007-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = ['Pool'] + +# +# Imports +# + +import threading +import Queue +import itertools +import collections +import time + +from multiprocessing import Process, cpu_count, TimeoutError +from multiprocessing.util import Finalize, debug + +# +# Constants representing the state of a pool +# + +RUN = 0 +CLOSE = 1 +TERMINATE = 2 + +# +# Miscellaneous +# + +job_counter = itertools.count() + +def mapstar(args): + return map(*args) + +# +# Code run by worker processes +# + +def worker(inqueue, outqueue, initializer=None, initargs=()): + put = 
outqueue.put + get = inqueue.get + if hasattr(inqueue, '_writer'): + inqueue._writer.close() + outqueue._reader.close() + + if initializer is not None: + initializer(*initargs) + + while 1: + try: + task = get() + except (EOFError, IOError): + debug('worker got EOFError or IOError -- exiting') + break + + if task is None: + debug('worker got sentinel -- exiting') + break + + job, i, func, args, kwds = task + try: + result = (True, func(*args, **kwds)) + except Exception, e: + result = (False, e) + put((job, i, result)) + +# +# Class representing a process pool +# + +class Pool(object): + ''' + Class which supports an async version of the `apply()` builtin + ''' + Process = Process + + def __init__(self, processes=None, initializer=None, initargs=()): + self._setup_queues() + self._taskqueue = Queue.Queue() + self._cache = {} + self._state = RUN + + if processes is None: + try: + processes = cpu_count() + except NotImplementedError: + processes = 1 + + self._pool = [] + for i in range(processes): + w = self.Process( + target=worker, + args=(self._inqueue, self._outqueue, initializer, initargs) + ) + self._pool.append(w) + w.name = w.name.replace('Process', 'PoolWorker') + w.daemon = True + w.start() + + self._task_handler = threading.Thread( + target=Pool._handle_tasks, + args=(self._taskqueue, self._quick_put, self._outqueue, self._pool) + ) + self._task_handler.daemon = True + self._task_handler._state = RUN + self._task_handler.start() + + self._result_handler = threading.Thread( + target=Pool._handle_results, + args=(self._outqueue, self._quick_get, self._cache) + ) + self._result_handler.daemon = True + self._result_handler._state = RUN + self._result_handler.start() + + self._terminate = Finalize( + self, self._terminate_pool, + args=(self._taskqueue, self._inqueue, self._outqueue, self._pool, + self._task_handler, self._result_handler, self._cache), + exitpriority=15 + ) + + def _setup_queues(self): + from .queues import SimpleQueue + self._inqueue = 
SimpleQueue() + self._outqueue = SimpleQueue() + self._quick_put = self._inqueue._writer.send + self._quick_get = self._outqueue._reader.recv + + def apply(self, func, args=(), kwds={}): + ''' + Equivalent of `apply()` builtin + ''' + assert self._state == RUN + return self.apply_async(func, args, kwds).get() + + def map(self, func, iterable, chunksize=None): + ''' + Equivalent of `map()` builtin + ''' + assert self._state == RUN + return self.map_async(func, iterable, chunksize).get() + + def imap(self, func, iterable, chunksize=1): + ''' + Equivalent of `itertools.imap()` -- can be MUCH slower than `Pool.map()` + ''' + assert self._state == RUN + if chunksize == 1: + result = IMapIterator(self._cache) + self._taskqueue.put((((result._job, i, func, (x,), {}) + for i, x in enumerate(iterable)), result._set_length)) + return result + else: + assert chunksize > 1 + task_batches = Pool._get_tasks(func, iterable, chunksize) + result = IMapIterator(self._cache) + self._taskqueue.put((((result._job, i, mapstar, (x,), {}) + for i, x in enumerate(task_batches)), result._set_length)) + return (item for chunk in result for item in chunk) + + def imap_unordered(self, func, iterable, chunksize=1): + ''' + Like `imap()` method but ordering of results is arbitrary + ''' + assert self._state == RUN + if chunksize == 1: + result = IMapUnorderedIterator(self._cache) + self._taskqueue.put((((result._job, i, func, (x,), {}) + for i, x in enumerate(iterable)), result._set_length)) + return result + else: + assert chunksize > 1 + task_batches = Pool._get_tasks(func, iterable, chunksize) + result = IMapUnorderedIterator(self._cache) + self._taskqueue.put((((result._job, i, mapstar, (x,), {}) + for i, x in enumerate(task_batches)), result._set_length)) + return (item for chunk in result for item in chunk) + + def apply_async(self, func, args=(), kwds={}, callback=None): + ''' + Asynchronous equivalent of `apply()` builtin + ''' + assert self._state == RUN + result = 
ApplyResult(self._cache, callback) + self._taskqueue.put(([(result._job, None, func, args, kwds)], None)) + return result + + def map_async(self, func, iterable, chunksize=None, callback=None): + ''' + Asynchronous equivalent of `map()` builtin + ''' + assert self._state == RUN + if not hasattr(iterable, '__len__'): + iterable = list(iterable) + + if chunksize is None: + chunksize, extra = divmod(len(iterable), len(self._pool) * 4) + if extra: + chunksize += 1 + + task_batches = Pool._get_tasks(func, iterable, chunksize) + result = MapResult(self._cache, chunksize, len(iterable), callback) + self._taskqueue.put((((result._job, i, mapstar, (x,), {}) + for i, x in enumerate(task_batches)), None)) + return result + + @staticmethod + def _handle_tasks(taskqueue, put, outqueue, pool): + thread = threading.current_thread() + + for taskseq, set_length in iter(taskqueue.get, None): + i = -1 + for i, task in enumerate(taskseq): + if thread._state: + debug('task handler found thread._state != RUN') + break + try: + put(task) + except IOError: + debug('could not put task on queue') + break + else: + if set_length: + debug('doing set_length()') + set_length(i+1) + continue + break + else: + debug('task handler got sentinel') + + + try: + # tell result handler to finish when cache is empty + debug('task handler sending sentinel to result handler') + outqueue.put(None) + + # tell workers there is no more work + debug('task handler sending sentinel to workers') + for p in pool: + put(None) + except IOError: + debug('task handler got IOError when sending sentinels') + + debug('task handler exiting') + + @staticmethod + def _handle_results(outqueue, get, cache): + thread = threading.current_thread() + + while 1: + try: + task = get() + except (IOError, EOFError): + debug('result handler got EOFError/IOError -- exiting') + return + + if thread._state: + assert thread._state == TERMINATE + debug('result handler found thread._state=TERMINATE') + break + + if task is None: + 
debug('result handler got sentinel') + break + + job, i, obj = task + try: + cache[job]._set(i, obj) + except KeyError: + pass + + while cache and thread._state != TERMINATE: + try: + task = get() + except (IOError, EOFError): + debug('result handler got EOFError/IOError -- exiting') + return + + if task is None: + debug('result handler ignoring extra sentinel') + continue + job, i, obj = task + try: + cache[job]._set(i, obj) + except KeyError: + pass + + if hasattr(outqueue, '_reader'): + debug('ensuring that outqueue is not full') + # If we don't make room available in outqueue then + # attempts to add the sentinel (None) to outqueue may + # block. There is guaranteed to be no more than 2 sentinels. + try: + for i in range(10): + if not outqueue._reader.poll(): + break + get() + except (IOError, EOFError): + pass + + debug('result handler exiting: len(cache)=%s, thread._state=%s', + len(cache), thread._state) + + @staticmethod + def _get_tasks(func, it, size): + it = iter(it) + while 1: + x = tuple(itertools.islice(it, size)) + if not x: + return + yield (func, x) + + def __reduce__(self): + raise NotImplementedError( + 'pool objects cannot be passed between processes or pickled' + ) + + def close(self): + debug('closing pool') + if self._state == RUN: + self._state = CLOSE + self._taskqueue.put(None) + + def terminate(self): + debug('terminating pool') + self._state = TERMINATE + self._terminate() + + def join(self): + debug('joining pool') + assert self._state in (CLOSE, TERMINATE) + self._task_handler.join() + self._result_handler.join() + for p in self._pool: + p.join() + + @staticmethod + def _help_stuff_finish(inqueue, task_handler, size): + # task_handler may be blocked trying to put items on inqueue + debug('removing tasks from inqueue until task handler finished') + inqueue._rlock.acquire() + while task_handler.is_alive() and inqueue._reader.poll(): + inqueue._reader.recv() + time.sleep(0) + + @classmethod + def _terminate_pool(cls, taskqueue, inqueue, 
outqueue, pool, + task_handler, result_handler, cache): + # this is guaranteed to only be called once + debug('finalizing pool') + + task_handler._state = TERMINATE + taskqueue.put(None) # sentinel + + debug('helping task handler/workers to finish') + cls._help_stuff_finish(inqueue, task_handler, len(pool)) + + assert result_handler.is_alive() or len(cache) == 0 + + result_handler._state = TERMINATE + outqueue.put(None) # sentinel + + if pool and hasattr(pool[0], 'terminate'): + debug('terminating workers') + for p in pool: + p.terminate() + + debug('joining task handler') + task_handler.join(1e100) + + debug('joining result handler') + result_handler.join(1e100) + + if pool and hasattr(pool[0], 'terminate'): + debug('joining pool workers') + for p in pool: + p.join() + +# +# Class whose instances are returned by `Pool.apply_async()` +# + +class ApplyResult(object): + + def __init__(self, cache, callback): + self._cond = threading.Condition(threading.Lock()) + self._job = job_counter.next() + self._cache = cache + self._ready = False + self._callback = callback + cache[self._job] = self + + def ready(self): + return self._ready + + def successful(self): + assert self._ready + return self._success + + def wait(self, timeout=None): + self._cond.acquire() + try: + if not self._ready: + self._cond.wait(timeout) + finally: + self._cond.release() + + def get(self, timeout=None): + self.wait(timeout) + if not self._ready: + raise TimeoutError + if self._success: + return self._value + else: + raise self._value + + def _set(self, i, obj): + self._success, self._value = obj + if self._callback and self._success: + self._callback(self._value) + self._cond.acquire() + try: + self._ready = True + self._cond.notify() + finally: + self._cond.release() + del self._cache[self._job] + +# +# Class whose instances are returned by `Pool.map_async()` +# + +class MapResult(ApplyResult): + + def __init__(self, cache, chunksize, length, callback): + ApplyResult.__init__(self, cache, 
callback) + self._success = True + self._value = [None] * length + self._chunksize = chunksize + if chunksize <= 0: + self._number_left = 0 + self._ready = True + else: + self._number_left = length//chunksize + bool(length % chunksize) + + def _set(self, i, success_result): + success, result = success_result + if success: + self._value[i*self._chunksize:(i+1)*self._chunksize] = result + self._number_left -= 1 + if self._number_left == 0: + if self._callback: + self._callback(self._value) + del self._cache[self._job] + self._cond.acquire() + try: + self._ready = True + self._cond.notify() + finally: + self._cond.release() + + else: + self._success = False + self._value = result + del self._cache[self._job] + self._cond.acquire() + try: + self._ready = True + self._cond.notify() + finally: + self._cond.release() + +# +# Class whose instances are returned by `Pool.imap()` +# + +class IMapIterator(object): + + def __init__(self, cache): + self._cond = threading.Condition(threading.Lock()) + self._job = job_counter.next() + self._cache = cache + self._items = collections.deque() + self._index = 0 + self._length = None + self._unsorted = {} + cache[self._job] = self + + def __iter__(self): + return self + + def next(self, timeout=None): + self._cond.acquire() + try: + try: + item = self._items.popleft() + except IndexError: + if self._index == self._length: + raise StopIteration + self._cond.wait(timeout) + try: + item = self._items.popleft() + except IndexError: + if self._index == self._length: + raise StopIteration + raise TimeoutError + finally: + self._cond.release() + + success, value = item + if success: + return value + raise value + + __next__ = next # XXX + + def _set(self, i, obj): + self._cond.acquire() + try: + if self._index == i: + self._items.append(obj) + self._index += 1 + while self._index in self._unsorted: + obj = self._unsorted.pop(self._index) + self._items.append(obj) + self._index += 1 + self._cond.notify() + else: + self._unsorted[i] = obj + + 
if self._index == self._length: + del self._cache[self._job] + finally: + self._cond.release() + + def _set_length(self, length): + self._cond.acquire() + try: + self._length = length + if self._index == self._length: + self._cond.notify() + del self._cache[self._job] + finally: + self._cond.release() + +# +# Class whose instances are returned by `Pool.imap_unordered()` +# + +class IMapUnorderedIterator(IMapIterator): + + def _set(self, i, obj): + self._cond.acquire() + try: + self._items.append(obj) + self._index += 1 + self._cond.notify() + if self._index == self._length: + del self._cache[self._job] + finally: + self._cond.release() + +# +# +# + +class ThreadPool(Pool): + + from .dummy import Process + + def __init__(self, processes=None, initializer=None, initargs=()): + Pool.__init__(self, processes, initializer, initargs) + + def _setup_queues(self): + self._inqueue = Queue.Queue() + self._outqueue = Queue.Queue() + self._quick_put = self._inqueue.put + self._quick_get = self._outqueue.get + + @staticmethod + def _help_stuff_finish(inqueue, task_handler, size): + # put sentinels at head of inqueue to make workers finish + inqueue.not_empty.acquire() + try: + inqueue.queue.clear() + inqueue.queue.extend([None] * size) + inqueue.not_empty.notify_all() + finally: + inqueue.not_empty.release() diff --git a/LTA/LTAIngest/multiprocessing/process.py b/LTA/LTAIngest/multiprocessing/process.py new file mode 100644 index 0000000000000000000000000000000000000000..56719d9c9cde17e11030405fdaaaf55d091f1841 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/process.py @@ -0,0 +1,297 @@ +# +# Module providing the `Process` class which emulates `threading.Thread` +# +# multiprocessing/process.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = ['Process', 'current_process', 'active_children'] + +# +# Imports +# + +import os +import sys +import signal +import itertools + +# +# +# + +try: + ORIGINAL_DIR = os.path.abspath(os.getcwd()) +except OSError: 
+ ORIGINAL_DIR = None + +# +# Public functions +# + +def current_process(): + ''' + Return process object representing the current process + ''' + return _current_process + +def active_children(): + ''' + Return list of process objects corresponding to live child processes + ''' + _cleanup() + return list(_current_process._children) + +# +# +# + +def _cleanup(): + # check for processes which have finished + for p in list(_current_process._children): + if p._popen.poll() is not None: + _current_process._children.discard(p) + +# +# The `Process` class +# + +class Process(object): + ''' + Process objects represent activity that is run in a separate process + + The class is analagous to `threading.Thread` + ''' + _Popen = None + + def __init__(self, group=None, target=None, name=None, args=(), kwargs={}): + assert group is None, 'group argument must be None for now' + count = _current_process._counter.next() + self._identity = _current_process._identity + (count,) + self._authkey = _current_process._authkey + self._daemonic = _current_process._daemonic + self._tempdir = _current_process._tempdir + self._parent_pid = os.getpid() + self._popen = None + self._target = target + self._args = tuple(args) + self._kwargs = dict(kwargs) + self._name = name or type(self).__name__ + '-' + \ + ':'.join(str(i) for i in self._identity) + + def run(self): + ''' + Method to be run in sub-process; can be overridden in sub-class + ''' + if self._target: + self._target(*self._args, **self._kwargs) + + def start(self): + ''' + Start child process + ''' + assert self._popen is None, 'cannot start a process twice' + assert self._parent_pid == os.getpid(), \ + 'can only start a process object created by current process' + assert not _current_process._daemonic, \ + 'daemonic processes are not allowed to have children' + _cleanup() + if self._Popen is not None: + Popen = self._Popen + else: + from .forking import Popen + self._popen = Popen(self) + _current_process._children.add(self) + + def 
terminate(self): + ''' + Terminate process; sends SIGTERM signal or uses TerminateProcess() + ''' + self._popen.terminate() + + def join(self, timeout=None): + ''' + Wait until child process terminates + ''' + assert self._parent_pid == os.getpid(), 'can only join a child process' + assert self._popen is not None, 'can only join a started process' + res = self._popen.wait(timeout) + if res is not None: + _current_process._children.discard(self) + + def is_alive(self): + ''' + Return whether process is alive + ''' + if self is _current_process: + return True + assert self._parent_pid == os.getpid(), 'can only test a child process' + if self._popen is None: + return False + self._popen.poll() + return self._popen.returncode is None + + @property + def name(self): + return self._name + + @name.setter + def name(self, name): + assert isinstance(name, basestring), 'name must be a string' + self._name = name + + @property + def daemon(self): + ''' + Return whether process is a daemon + ''' + return self._daemonic + + @daemon.setter + def daemon(self, daemonic): + ''' + Set whether process is a daemon + ''' + assert self._popen is None, 'process has already started' + self._daemonic = daemonic + + @property + def authkey(self): + return self._authkey + + @authkey.setter + def authkey(self, authkey): + ''' + Set authorization key of process + ''' + self._authkey = AuthenticationString(authkey) + + @property + def exitcode(self): + ''' + Return exit code of process or `None` if it has yet to stop + ''' + if self._popen is None: + return self._popen + return self._popen.poll() + + @property + def ident(self): + ''' + Return indentifier (PID) of process or `None` if it has yet to start + ''' + if self is _current_process: + return os.getpid() + else: + return self._popen and self._popen.pid + + pid = ident + + def __repr__(self): + if self is _current_process: + status = 'started' + elif self._parent_pid != os.getpid(): + status = 'unknown' + elif self._popen is None: + 
status = 'initial' + else: + if self._popen.poll() is not None: + status = self.exitcode + else: + status = 'started' + + if type(status) is int: + if status == 0: + status = 'stopped' + else: + status = 'stopped[%s]' % _exitcode_to_name.get(status, status) + + return '<%s(%s, %s%s)>' % (type(self).__name__, self._name, + status, self._daemonic and ' daemon' or '') + + ## + + def _bootstrap(self): + from . import util + global _current_process + + try: + self._children = set() + self._counter = itertools.count(1) + try: + sys.stdin.close() + sys.stdin = open(os.devnull) + except (OSError, ValueError): + pass + _current_process = self + util._finalizer_registry.clear() + util._run_after_forkers() + util.info('child process calling self.run()') + try: + self.run() + exitcode = 0 + finally: + util._exit_function() + except SystemExit, e: + if not e.args: + exitcode = 1 + elif type(e.args[0]) is int: + exitcode = e.args[0] + else: + sys.stderr.write(e.args[0] + '\n') + sys.stderr.flush() + exitcode = 1 + except: + exitcode = 1 + import traceback + sys.stderr.write('Process %s:\n' % self.name) + sys.stderr.flush() + traceback.print_exc() + + util.info('process exiting with exitcode %d' % exitcode) + return exitcode + +# +# We subclass bytes to avoid accidental transmission of auth keys over network +# + +class AuthenticationString(bytes): + def __reduce__(self): + from .forking import Popen + if not Popen.thread_is_spawning(): + raise TypeError( + 'Pickling an AuthenticationString object is ' + 'disallowed for security reasons' + ) + return AuthenticationString, (bytes(self),) + +# +# Create object representing the main process +# + +class _MainProcess(Process): + + def __init__(self): + self._identity = () + self._daemonic = False + self._name = 'MainProcess' + self._parent_pid = None + self._popen = None + self._counter = itertools.count(1) + self._children = set() + self._authkey = AuthenticationString(os.urandom(32)) + self._tempdir = None + +_current_process = 
_MainProcess() +del _MainProcess + +# +# Give names to some return codes +# + +_exitcode_to_name = {} + +for name, signum in signal.__dict__.items(): + if name[:3]=='SIG' and '_' not in name: + _exitcode_to_name[-signum] = name diff --git a/LTA/LTAIngest/multiprocessing/queues.py b/LTA/LTAIngest/multiprocessing/queues.py new file mode 100644 index 0000000000000000000000000000000000000000..ea279911b6c12fd6ae0d4c8f815da2707ecb2b37 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/queues.py @@ -0,0 +1,369 @@ +# +# Module implementing queues +# +# multiprocessing/queues.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = ['Queue', 'SimpleQueue', 'JoinableQueue'] + +import sys +import os +import threading +import collections +import time +import atexit +import weakref + +from Queue import Empty, Full +import _multiprocessing +from multiprocessing import Pipe +from multiprocessing.synchronize import Lock, BoundedSemaphore, Semaphore, Condition +from multiprocessing.util import debug, info, Finalize, register_after_fork +from multiprocessing.forking import assert_spawning + +# +# Queue type using a pipe, buffer and thread +# + +class Queue(object): + + def __init__(self, maxsize=0): + if maxsize <= 0: + maxsize = _multiprocessing.SemLock.SEM_VALUE_MAX + self._maxsize = maxsize + self._reader, self._writer = Pipe(duplex=False) + self._rlock = Lock() + self._opid = os.getpid() + if sys.platform == 'win32': + self._wlock = None + else: + self._wlock = Lock() + self._sem = BoundedSemaphore(maxsize) + + self._after_fork() + + if sys.platform != 'win32': + register_after_fork(self, Queue._after_fork) + + def __getstate__(self): + assert_spawning(self) + return (self._maxsize, self._reader, self._writer, + self._rlock, self._wlock, self._sem, self._opid) + + def __setstate__(self, state): + (self._maxsize, self._reader, self._writer, + self._rlock, self._wlock, self._sem, self._opid) = state + self._after_fork() + + def _after_fork(self): + 
debug('Queue._after_fork()') + self._notempty = threading.Condition(threading.Lock()) + self._buffer = collections.deque() + self._thread = None + self._jointhread = None + self._joincancelled = False + self._closed = False + self._close = None + self._send = self._writer.send + self._recv = self._reader.recv + self._poll = self._reader.poll + + def put(self, obj, block=True, timeout=None): + assert not self._closed + if not self._sem.acquire(block, timeout): + raise Full + + self._notempty.acquire() + try: + if self._thread is None: + self._start_thread() + self._buffer.append(obj) + self._notempty.notify() + finally: + self._notempty.release() + + def get(self, block=True, timeout=None): + if block and timeout is None: + self._rlock.acquire() + try: + res = self._recv() + self._sem.release() + return res + finally: + self._rlock.release() + + else: + if block: + deadline = time.time() + timeout + if not self._rlock.acquire(block, timeout): + raise Empty + try: + if not self._poll(block and (deadline-time.time()) or 0.0): + raise Empty + res = self._recv() + self._sem.release() + return res + finally: + self._rlock.release() + + def qsize(self): + # Raises NotImplementedError on Mac OSX because of broken sem_getvalue() + return self._maxsize - self._sem._semlock._get_value() + + def empty(self): + return not self._poll() + + def full(self): + return self._sem._semlock._is_zero() + + def get_nowait(self): + return self.get(False) + + def put_nowait(self, obj): + return self.put(obj, False) + + def close(self): + self._closed = True + self._reader.close() + if self._close: + self._close() + + def join_thread(self): + debug('Queue.join_thread()') + assert self._closed + if self._jointhread: + self._jointhread() + + def cancel_join_thread(self): + debug('Queue.cancel_join_thread()') + self._joincancelled = True + try: + self._jointhread.cancel() + except AttributeError: + pass + + def _start_thread(self): + debug('Queue._start_thread()') + + # Start thread which 
transfers data from buffer to pipe + self._buffer.clear() + self._thread = threading.Thread( + target=Queue._feed, + args=(self._buffer, self._notempty, self._send, + self._wlock, self._writer.close), + name='QueueFeederThread' + ) + self._thread.daemon = True + + debug('doing self._thread.start()') + self._thread.start() + debug('... done self._thread.start()') + + # On process exit we will wait for data to be flushed to pipe. + # + # However, if this process created the queue then all + # processes which use the queue will be descendants of this + # process. Therefore waiting for the queue to be flushed + # is pointless once all the child processes have been joined. + created_by_this_process = (self._opid == os.getpid()) + if not self._joincancelled and not created_by_this_process: + self._jointhread = Finalize( + self._thread, Queue._finalize_join, + [weakref.ref(self._thread)], + exitpriority=-5 + ) + + # Send sentinel to the thread queue object when garbage collected + self._close = Finalize( + self, Queue._finalize_close, + [self._buffer, self._notempty], + exitpriority=10 + ) + + @staticmethod + def _finalize_join(twr): + debug('joining queue thread') + thread = twr() + if thread is not None: + thread.join() + debug('... queue thread joined') + else: + debug('... 
queue thread already dead') + + @staticmethod + def _finalize_close(buffer, notempty): + debug('telling queue thread to quit') + notempty.acquire() + try: + buffer.append(_sentinel) + notempty.notify() + finally: + notempty.release() + + @staticmethod + def _feed(buffer, notempty, send, writelock, close): + debug('starting thread to feed data to pipe') + from .util import is_exiting + + nacquire = notempty.acquire + nrelease = notempty.release + nwait = notempty.wait + bpopleft = buffer.popleft + sentinel = _sentinel + if sys.platform != 'win32': + wacquire = writelock.acquire + wrelease = writelock.release + else: + wacquire = None + + try: + while 1: + nacquire() + try: + if not buffer: + nwait() + finally: + nrelease() + try: + while 1: + obj = bpopleft() + if obj is sentinel: + debug('feeder thread got sentinel -- exiting') + close() + return + + if wacquire is None: + send(obj) + else: + wacquire() + try: + send(obj) + finally: + wrelease() + except IndexError: + pass + except Exception, e: + # Since this runs in a daemon thread the resources it uses + # may be become unusable while the process is cleaning up. + # We ignore errors which happen after the process has + # started to cleanup. + try: + if is_exiting(): + info('error in queue thread: %s', e) + else: + import traceback + traceback.print_exc() + except Exception: + pass + +_sentinel = object() + +# +# A queue type which also supports join() and task_done() methods +# +# Note that if you do not call task_done() for each finished task then +# eventually the counter's semaphore may overflow causing Bad Things +# to happen. 
+# + +class JoinableQueue(Queue): + + def __init__(self, maxsize=0): + Queue.__init__(self, maxsize) + self._unfinished_tasks = Semaphore(0) + self._cond = Condition() + + def __getstate__(self): + return Queue.__getstate__(self) + (self._cond, self._unfinished_tasks) + + def __setstate__(self, state): + Queue.__setstate__(self, state[:-2]) + self._cond, self._unfinished_tasks = state[-2:] + + def put(self, obj, block=True, timeout=None): + assert not self._closed + if not self._sem.acquire(block, timeout): + raise Full + + self._notempty.acquire() + self._cond.acquire() + try: + if self._thread is None: + self._start_thread() + self._buffer.append(obj) + self._unfinished_tasks.release() + self._notempty.notify() + finally: + self._cond.release() + self._notempty.release() + + def task_done(self): + self._cond.acquire() + try: + if not self._unfinished_tasks.acquire(False): + raise ValueError('task_done() called too many times') + if self._unfinished_tasks._semlock._is_zero(): + self._cond.notify_all() + finally: + self._cond.release() + + def join(self): + self._cond.acquire() + try: + if not self._unfinished_tasks._semlock._is_zero(): + self._cond.wait() + finally: + self._cond.release() + +# +# Simplified Queue type -- really just a locked pipe +# + +class SimpleQueue(object): + + def __init__(self): + self._reader, self._writer = Pipe(duplex=False) + self._rlock = Lock() + if sys.platform == 'win32': + self._wlock = None + else: + self._wlock = Lock() + self._make_methods() + + def empty(self): + return not self._reader.poll() + + def __getstate__(self): + assert_spawning(self) + return (self._reader, self._writer, self._rlock, self._wlock) + + def __setstate__(self, state): + (self._reader, self._writer, self._rlock, self._wlock) = state + self._make_methods() + + def _make_methods(self): + recv = self._reader.recv + racquire, rrelease = self._rlock.acquire, self._rlock.release + def get(): + racquire() + try: + return recv() + finally: + rrelease() + self.get 
= get + + if self._wlock is None: + # writes to a message oriented win32 pipe are atomic + self.put = self._writer.send + else: + send = self._writer.send + wacquire, wrelease = self._wlock.acquire, self._wlock.release + def put(obj): + wacquire() + try: + return send(obj) + finally: + wrelease() + self.put = put diff --git a/LTA/LTAIngest/multiprocessing/reduction.py b/LTA/LTAIngest/multiprocessing/reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..1813729e64eb2f1f178c57734105199690fec2a5 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/reduction.py @@ -0,0 +1,189 @@ +# +# Module to allow connection and socket objects to be transferred +# between processes +# +# multiprocessing/reduction.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [] + +import os +import sys +import socket +import threading + +import _multiprocessing +from multiprocessing import current_process +from multiprocessing.forking import Popen, duplicate, close, ForkingPickler +from multiprocessing.util import register_after_fork, debug, sub_debug +from multiprocessing.connection import Client, Listener + + +# +# +# + +if not(sys.platform == 'win32' or hasattr(_multiprocessing, 'recvfd')): + raise ImportError('pickling of connections not supported') + +# +# Platform specific definitions +# + +if sys.platform == 'win32': + import _subprocess + from ._multiprocessing import win32 + + def send_handle(conn, handle, destination_pid): + process_handle = win32.OpenProcess( + win32.PROCESS_ALL_ACCESS, False, destination_pid + ) + try: + new_handle = duplicate(handle, process_handle) + conn.send(new_handle) + finally: + close(process_handle) + + def recv_handle(conn): + return conn.recv() + +else: + def send_handle(conn, handle, destination_pid): + _multiprocessing.sendfd(conn.fileno(), handle) + + def recv_handle(conn): + return _multiprocessing.recvfd(conn.fileno()) + +# +# Support for a per-process server thread which caches pickled handles +# 
+ +_cache = set() + +def _reset(obj): + global _lock, _listener, _cache + for h in _cache: + close(h) + _cache.clear() + _lock = threading.Lock() + _listener = None + +_reset(None) +register_after_fork(_reset, _reset) + +def _get_listener(): + global _listener + + if _listener is None: + _lock.acquire() + try: + if _listener is None: + debug('starting listener and thread for sending handles') + _listener = Listener(authkey=current_process().authkey) + t = threading.Thread(target=_serve) + t.daemon = True + t.start() + finally: + _lock.release() + + return _listener + +def _serve(): + from .util import is_exiting, sub_warning + + while 1: + try: + conn = _listener.accept() + handle_wanted, destination_pid = conn.recv() + _cache.remove(handle_wanted) + send_handle(conn, handle_wanted, destination_pid) + close(handle_wanted) + conn.close() + except: + if not is_exiting(): + import traceback + sub_warning( + 'thread for sharing handles raised exception :\n' + + '-'*79 + '\n' + traceback.format_exc() + '-'*79 + ) + +# +# Functions to be used for pickling/unpickling objects with handles +# + +def reduce_handle(handle): + if Popen.thread_is_spawning(): + return (None, Popen.duplicate_for_child(handle), True) + dup_handle = duplicate(handle) + _cache.add(dup_handle) + sub_debug('reducing handle %d', handle) + return (_get_listener().address, dup_handle, False) + +def rebuild_handle(pickled_data): + address, handle, inherited = pickled_data + if inherited: + return handle + sub_debug('rebuilding handle %d', handle) + conn = Client(address, authkey=current_process().authkey) + conn.send((handle, os.getpid())) + new_handle = recv_handle(conn) + conn.close() + return new_handle + +# +# Register `_multiprocessing.Connection` with `ForkingPickler` +# + +def reduce_connection(conn): + rh = reduce_handle(conn.fileno()) + return rebuild_connection, (rh, conn.readable, conn.writable) + +def rebuild_connection(reduced_handle, readable, writable): + handle = 
rebuild_handle(reduced_handle) + return _multiprocessing.Connection( + handle, readable=readable, writable=writable + ) + +ForkingPickler.register(_multiprocessing.Connection, reduce_connection) + +# +# Register `socket.socket` with `ForkingPickler` +# + +def fromfd(fd, family, type_, proto=0): + s = socket.fromfd(fd, family, type_, proto) + if s.__class__ is not socket.socket: + s = socket.socket(_sock=s) + return s + +def reduce_socket(s): + reduced_handle = reduce_handle(s.fileno()) + return rebuild_socket, (reduced_handle, s.family, s.type, s.proto) + +def rebuild_socket(reduced_handle, family, type_, proto): + fd = rebuild_handle(reduced_handle) + _sock = fromfd(fd, family, type_, proto) + close(fd) + return _sock + +ForkingPickler.register(socket.socket, reduce_socket) + +# +# Register `_multiprocessing.PipeConnection` with `ForkingPickler` +# + +if sys.platform == 'win32': + + def reduce_pipe_connection(conn): + rh = reduce_handle(conn.fileno()) + return rebuild_pipe_connection, (rh, conn.readable, conn.writable) + + def rebuild_pipe_connection(reduced_handle, readable, writable): + handle = rebuild_handle(reduced_handle) + return _multiprocessing.PipeConnection( + handle, readable=readable, writable=writable + ) + + ForkingPickler.register(_multiprocessing.PipeConnection, reduce_pipe_connection) diff --git a/LTA/LTAIngest/multiprocessing/sharedctypes.py b/LTA/LTAIngest/multiprocessing/sharedctypes.py new file mode 100644 index 0000000000000000000000000000000000000000..76b5e94b65bd3cf0140614beaa2ded9cbce0a9c1 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/sharedctypes.py @@ -0,0 +1,238 @@ +# +# Module which supports allocation of ctypes objects from shared memory +# +# multiprocessing/sharedctypes.py +# +# Copyright (c) 2007-2008, R Oudkerk --- see COPYING.txt +# + +import sys +import ctypes +import weakref + +from multiprocessing import heap, RLock +from multiprocessing.forking import assert_spawning, ForkingPickler + +__all__ = ['RawValue', 'RawArray', 
'Value', 'Array', 'copy', 'synchronized'] + +# +# +# + +typecode_to_type = { + 'c': ctypes.c_char, 'u': ctypes.c_wchar, + 'b': ctypes.c_byte, 'B': ctypes.c_ubyte, + 'h': ctypes.c_short, 'H': ctypes.c_ushort, + 'i': ctypes.c_int, 'I': ctypes.c_uint, + 'l': ctypes.c_long, 'L': ctypes.c_ulong, + 'f': ctypes.c_float, 'd': ctypes.c_double + } + +# +# +# + +def _new_value(type_): + size = ctypes.sizeof(type_) + wrapper = heap.BufferWrapper(size) + return rebuild_ctype(type_, wrapper, None) + +def RawValue(typecode_or_type, *args): + ''' + Returns a ctypes object allocated from shared memory + ''' + type_ = typecode_to_type.get(typecode_or_type, typecode_or_type) + obj = _new_value(type_) + ctypes.memset(ctypes.addressof(obj), 0, ctypes.sizeof(obj)) + obj.__init__(*args) + return obj + +def RawArray(typecode_or_type, size_or_initializer): + ''' + Returns a ctypes array allocated from shared memory + ''' + type_ = typecode_to_type.get(typecode_or_type, typecode_or_type) + if isinstance(size_or_initializer, int): + type_ = type_ * size_or_initializer + return _new_value(type_) + else: + type_ = type_ * len(size_or_initializer) + result = _new_value(type_) + result.__init__(*size_or_initializer) + return result + +def Value(typecode_or_type, *args, **kwds): + ''' + Return a synchronization wrapper for a Value + ''' + lock = kwds.pop('lock', None) + if kwds: + raise ValueError('unrecognized keyword argument(s): %s' % kwds.keys()) + obj = RawValue(typecode_or_type, *args) + if lock is False: + return obj + if lock in (True, None): + lock = RLock() + if not hasattr(lock, 'acquire'): + raise AttributeError("'%r' has no method 'acquire'" % lock) + return synchronized(obj, lock) + +def Array(typecode_or_type, size_or_initializer, **kwds): + ''' + Return a synchronization wrapper for a RawArray + ''' + lock = kwds.pop('lock', None) + if kwds: + raise ValueError('unrecognized keyword argument(s): %s' % kwds.keys()) + obj = RawArray(typecode_or_type, size_or_initializer) + if lock is 
False: + return obj + if lock in (True, None): + lock = RLock() + if not hasattr(lock, 'acquire'): + raise AttributeError("'%r' has no method 'acquire'" % lock) + return synchronized(obj, lock) + +def copy(obj): + new_obj = _new_value(type(obj)) + ctypes.pointer(new_obj)[0] = obj + return new_obj + +def synchronized(obj, lock=None): + assert not isinstance(obj, SynchronizedBase), 'object already synchronized' + + if isinstance(obj, ctypes._SimpleCData): + return Synchronized(obj, lock) + elif isinstance(obj, ctypes.Array): + if obj._type_ is ctypes.c_char: + return SynchronizedString(obj, lock) + return SynchronizedArray(obj, lock) + else: + cls = type(obj) + try: + scls = class_cache[cls] + except KeyError: + names = [field[0] for field in cls._fields_] + d = dict((name, make_property(name)) for name in names) + classname = 'Synchronized' + cls.__name__ + scls = class_cache[cls] = type(classname, (SynchronizedBase,), d) + return scls(obj, lock) + +# +# Functions for pickling/unpickling +# + +def reduce_ctype(obj): + assert_spawning(obj) + if isinstance(obj, ctypes.Array): + return rebuild_ctype, (obj._type_, obj._wrapper, obj._length_) + else: + return rebuild_ctype, (type(obj), obj._wrapper, None) + +def rebuild_ctype(type_, wrapper, length): + if length is not None: + type_ = type_ * length + ForkingPickler.register(type_, reduce_ctype) + obj = type_.from_address(wrapper.get_address()) + obj._wrapper = wrapper + return obj + +# +# Function to create properties +# + +def make_property(name): + try: + return prop_cache[name] + except KeyError: + d = {} + exec template % ((name,)*7) in d + prop_cache[name] = d[name] + return d[name] + +template = ''' +def get%s(self): + self.acquire() + try: + return self._obj.%s + finally: + self.release() +def set%s(self, value): + self.acquire() + try: + self._obj.%s = value + finally: + self.release() +%s = property(get%s, set%s) +''' + +prop_cache = {} +class_cache = weakref.WeakKeyDictionary() + +# +# Synchronized wrappers +# 
+ +class SynchronizedBase(object): + + def __init__(self, obj, lock=None): + self._obj = obj + self._lock = lock or RLock() + self.acquire = self._lock.acquire + self.release = self._lock.release + + def __reduce__(self): + assert_spawning(self) + return synchronized, (self._obj, self._lock) + + def get_obj(self): + return self._obj + + def get_lock(self): + return self._lock + + def __repr__(self): + return '<%s wrapper for %s>' % (type(self).__name__, self._obj) + + +class Synchronized(SynchronizedBase): + value = make_property('value') + + +class SynchronizedArray(SynchronizedBase): + + def __len__(self): + return len(self._obj) + + def __getitem__(self, i): + self.acquire() + try: + return self._obj[i] + finally: + self.release() + + def __setitem__(self, i, value): + self.acquire() + try: + self._obj[i] = value + finally: + self.release() + + def __getslice__(self, start, stop): + self.acquire() + try: + return self._obj[start:stop] + finally: + self.release() + + def __setslice__(self, start, stop, values): + self.acquire() + try: + self._obj[start:stop] = values + finally: + self.release() + + +class SynchronizedString(SynchronizedArray): + value = make_property('value') + raw = make_property('raw') diff --git a/LTA/LTAIngest/multiprocessing/synchronize.py b/LTA/LTAIngest/multiprocessing/synchronize.py new file mode 100644 index 0000000000000000000000000000000000000000..dacf45acca71dc0a3cac8c97881ff5dda6c0b5e1 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/synchronize.py @@ -0,0 +1,305 @@ +# +# Module implementing synchronization primitives +# +# multiprocessing/synchronize.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +__all__ = [ + 'Lock', 'RLock', 'Semaphore', 'BoundedSemaphore', 'Condition', 'Event' + ] + +import threading +import os +import sys + +from time import time as _time, sleep as _sleep + +import _multiprocessing +from multiprocessing.process import current_process +from multiprocessing.util import Finalize, 
register_after_fork, debug +from multiprocessing.forking import assert_spawning, Popen + +# Try to import the mp.synchronize module cleanly, if it fails +# raise ImportError for platforms lacking a working sem_open implementation. +# See issue 3770 +try: + from _multiprocessing import SemLock +except (ImportError): + raise ImportError("This platform lacks a functioning sem_open" + + " implementation, therefore, the required" + + " synchronization primitives needed will not" + + " function, see issue 3770.") + +# +# Constants +# + +RECURSIVE_MUTEX, SEMAPHORE = range(2) +SEM_VALUE_MAX = _multiprocessing.SemLock.SEM_VALUE_MAX + +# +# Base class for semaphores and mutexes; wraps `_multiprocessing.SemLock` +# + +class SemLock(object): + + def __init__(self, kind, value, maxvalue): + sl = self._semlock = _multiprocessing.SemLock(kind, value, maxvalue) + debug('created semlock with handle %s' % sl.handle) + self._make_methods() + + if sys.platform != 'win32': + def _after_fork(obj): + obj._semlock._after_fork() + register_after_fork(self, _after_fork) + + def _make_methods(self): + self.acquire = self._semlock.acquire + self.release = self._semlock.release + self.__enter__ = self._semlock.__enter__ + self.__exit__ = self._semlock.__exit__ + + def __getstate__(self): + assert_spawning(self) + sl = self._semlock + return (Popen.duplicate_for_child(sl.handle), sl.kind, sl.maxvalue) + + def __setstate__(self, state): + self._semlock = _multiprocessing.SemLock._rebuild(*state) + debug('recreated blocker with handle %r' % state[0]) + self._make_methods() + +# +# Semaphore +# + +class Semaphore(SemLock): + + def __init__(self, value=1): + SemLock.__init__(self, SEMAPHORE, value, SEM_VALUE_MAX) + + def get_value(self): + return self._semlock._get_value() + + def __repr__(self): + try: + value = self._semlock._get_value() + except Exception: + value = 'unknown' + return '<Semaphore(value=%s)>' % value + +# +# Bounded semaphore +# + +class BoundedSemaphore(Semaphore): + + def 
__init__(self, value=1): + SemLock.__init__(self, SEMAPHORE, value, value) + + def __repr__(self): + try: + value = self._semlock._get_value() + except Exception: + value = 'unknown' + return '<BoundedSemaphore(value=%s, maxvalue=%s)>' % \ + (value, self._semlock.maxvalue) + +# +# Non-recursive lock +# + +class Lock(SemLock): + + def __init__(self): + SemLock.__init__(self, SEMAPHORE, 1, 1) + + def __repr__(self): + try: + if self._semlock._is_mine(): + name = current_process().name + if threading.current_thread().name != 'MainThread': + name += '|' + threading.current_thread().name + elif self._semlock._get_value() == 1: + name = 'None' + elif self._semlock._count() > 0: + name = 'SomeOtherThread' + else: + name = 'SomeOtherProcess' + except Exception: + name = 'unknown' + return '<Lock(owner=%s)>' % name + +# +# Recursive lock +# + +class RLock(SemLock): + + def __init__(self): + SemLock.__init__(self, RECURSIVE_MUTEX, 1, 1) + + def __repr__(self): + try: + if self._semlock._is_mine(): + name = current_process().name + if threading.current_thread().name != 'MainThread': + name += '|' + threading.current_thread().name + count = self._semlock._count() + elif self._semlock._get_value() == 1: + name, count = 'None', 0 + elif self._semlock._count() > 0: + name, count = 'SomeOtherThread', 'nonzero' + else: + name, count = 'SomeOtherProcess', 'nonzero' + except Exception: + name, count = 'unknown', 'unknown' + return '<RLock(%s, %s)>' % (name, count) + +# +# Condition variable +# + +class Condition(object): + + def __init__(self, lock=None): + self._lock = lock or RLock() + self._sleeping_count = Semaphore(0) + self._woken_count = Semaphore(0) + self._wait_semaphore = Semaphore(0) + self._make_methods() + + def __getstate__(self): + assert_spawning(self) + return (self._lock, self._sleeping_count, + self._woken_count, self._wait_semaphore) + + def __setstate__(self, state): + (self._lock, self._sleeping_count, + self._woken_count, self._wait_semaphore) = state + 
self._make_methods() + + def _make_methods(self): + self.acquire = self._lock.acquire + self.release = self._lock.release + self.__enter__ = self._lock.__enter__ + self.__exit__ = self._lock.__exit__ + + def __repr__(self): + try: + num_waiters = (self._sleeping_count._semlock._get_value() - + self._woken_count._semlock._get_value()) + except Exception: + num_waiters = 'unkown' + return '<Condition(%s, %s)>' % (self._lock, num_waiters) + + def wait(self, timeout=None): + assert self._lock._semlock._is_mine(), \ + 'must acquire() condition before using wait()' + + # indicate that this thread is going to sleep + self._sleeping_count.release() + + # release lock + count = self._lock._semlock._count() + for i in xrange(count): + self._lock.release() + + try: + # wait for notification or timeout + self._wait_semaphore.acquire(True, timeout) + finally: + # indicate that this thread has woken + self._woken_count.release() + + # reacquire lock + for i in xrange(count): + self._lock.acquire() + + def notify(self): + assert self._lock._semlock._is_mine(), 'lock is not owned' + assert not self._wait_semaphore.acquire(False) + + # to take account of timeouts since last notify() we subtract + # woken_count from sleeping_count and rezero woken_count + while self._woken_count.acquire(False): + res = self._sleeping_count.acquire(False) + assert res + + if self._sleeping_count.acquire(False): # try grabbing a sleeper + self._wait_semaphore.release() # wake up one sleeper + self._woken_count.acquire() # wait for the sleeper to wake + + # rezero _wait_semaphore in case a timeout just happened + self._wait_semaphore.acquire(False) + + def notify_all(self): + assert self._lock._semlock._is_mine(), 'lock is not owned' + assert not self._wait_semaphore.acquire(False) + + # to take account of timeouts since last notify*() we subtract + # woken_count from sleeping_count and rezero woken_count + while self._woken_count.acquire(False): + res = self._sleeping_count.acquire(False) + assert res 
+ + sleepers = 0 + while self._sleeping_count.acquire(False): + self._wait_semaphore.release() # wake up one sleeper + sleepers += 1 + + if sleepers: + for i in xrange(sleepers): + self._woken_count.acquire() # wait for a sleeper to wake + + # rezero wait_semaphore in case some timeouts just happened + while self._wait_semaphore.acquire(False): + pass + +# +# Event +# + +class Event(object): + + def __init__(self): + self._cond = Condition(Lock()) + self._flag = Semaphore(0) + + def is_set(self): + self._cond.acquire() + try: + if self._flag.acquire(False): + self._flag.release() + return True + return False + finally: + self._cond.release() + + def set(self): + self._cond.acquire() + try: + self._flag.acquire(False) + self._flag.release() + self._cond.notify_all() + finally: + self._cond.release() + + def clear(self): + self._cond.acquire() + try: + self._flag.acquire(False) + finally: + self._cond.release() + + def wait(self, timeout=None): + self._cond.acquire() + try: + if self._flag.acquire(False): + self._flag.release() + else: + self._cond.wait(timeout) + finally: + self._cond.release() diff --git a/LTA/LTAIngest/multiprocessing/util.py b/LTA/LTAIngest/multiprocessing/util.py new file mode 100644 index 0000000000000000000000000000000000000000..632adb1d7c8586f051731e7bf6d1558115871970 --- /dev/null +++ b/LTA/LTAIngest/multiprocessing/util.py @@ -0,0 +1,291 @@ +# +# Module providing various facilities to other parts of the package +# +# multiprocessing/util.py +# +# Copyright (c) 2006-2008, R Oudkerk --- see COPYING.txt +# + +import itertools +import weakref +import atexit +import threading # we want threading to install it's + # cleanup function before multiprocessing does + +from multiprocessing.process import current_process, active_children + +__all__ = [ + 'sub_debug', 'debug', 'info', 'sub_warning', 'get_logger', + 'log_to_stderr', 'get_temp_dir', 'register_after_fork', + 'is_exiting', 'Finalize', 'ForkAwareThreadLock', 'ForkAwareLocal', + 'SUBDEBUG', 
'SUBWARNING', + ] + +# +# Logging +# + +NOTSET = 0 +SUBDEBUG = 5 +DEBUG = 10 +INFO = 20 +SUBWARNING = 25 + +LOGGER_NAME = 'multiprocessing' +DEFAULT_LOGGING_FORMAT = '[%(levelname)s/%(processName)s] %(message)s' + +_logger = None +_log_to_stderr = False + +def sub_debug(msg, *args): + if _logger: + _logger.log(SUBDEBUG, msg, *args) + +def debug(msg, *args): + if _logger: + _logger.log(DEBUG, msg, *args) + +def info(msg, *args): + if _logger: + _logger.log(INFO, msg, *args) + +def sub_warning(msg, *args): + if _logger: + _logger.log(SUBWARNING, msg, *args) + +def get_logger(): + ''' + Returns logger used by multiprocessing + ''' + global _logger + import logging, atexit + + logging._acquireLock() + try: + if not _logger: + + _logger = logging.getLogger(LOGGER_NAME) + _logger.propagate = 0 + logging.addLevelName(SUBDEBUG, 'SUBDEBUG') + logging.addLevelName(SUBWARNING, 'SUBWARNING') + + # XXX multiprocessing should cleanup before logging + if hasattr(atexit, 'unregister'): + atexit.unregister(_exit_function) + atexit.register(_exit_function) + else: + atexit._exithandlers.remove((_exit_function, (), {})) + atexit._exithandlers.append((_exit_function, (), {})) + + finally: + logging._releaseLock() + + return _logger + +def log_to_stderr(level=None): + ''' + Turn on logging and add a handler which prints to stderr + ''' + global _log_to_stderr + import logging + + logger = get_logger() + formatter = logging.Formatter(DEFAULT_LOGGING_FORMAT) + handler = logging.StreamHandler() + handler.setFormatter(formatter) + logger.addHandler(handler) + + if level: + logger.setLevel(level) + _log_to_stderr = True + return _logger + +# +# Function returning a temp directory which will be removed on exit +# + +def get_temp_dir(): + # get name of a temp directory which will be automatically cleaned up + if current_process()._tempdir is None: + import shutil, tempfile + tempdir = tempfile.mkdtemp(prefix='pymp-') + info('created temp directory %s', tempdir) + Finalize(None, shutil.rmtree, 
args=[tempdir], exitpriority=-100) + current_process()._tempdir = tempdir + return current_process()._tempdir + +# +# Support for reinitialization of objects when bootstrapping a child process +# + +_afterfork_registry = weakref.WeakValueDictionary() +_afterfork_counter = itertools.count() + +def _run_after_forkers(): + items = list(_afterfork_registry.items()) + items.sort() + for (index, ident, func), obj in items: + try: + func(obj) + except Exception, e: + info('after forker raised exception %s', e) + +def register_after_fork(obj, func): + _afterfork_registry[(_afterfork_counter.next(), id(obj), func)] = obj + +# +# Finalization using weakrefs +# + +_finalizer_registry = {} +_finalizer_counter = itertools.count() + + +class Finalize(object): + ''' + Class which supports object finalization using weakrefs + ''' + def __init__(self, obj, callback, args=(), kwargs=None, exitpriority=None): + assert exitpriority is None or type(exitpriority) is int + + if obj is not None: + self._weakref = weakref.ref(obj, self) + else: + assert exitpriority is not None + + self._callback = callback + self._args = args + self._kwargs = kwargs or {} + self._key = (exitpriority, _finalizer_counter.next()) + + _finalizer_registry[self._key] = self + + def __call__(self, wr=None): + ''' + Run the callback unless it has already been called or cancelled + ''' + try: + del _finalizer_registry[self._key] + except KeyError: + sub_debug('finalizer no longer registered') + else: + sub_debug('finalizer calling %s with args %s and kwargs %s', + self._callback, self._args, self._kwargs) + res = self._callback(*self._args, **self._kwargs) + self._weakref = self._callback = self._args = \ + self._kwargs = self._key = None + return res + + def cancel(self): + ''' + Cancel finalization of the object + ''' + try: + del _finalizer_registry[self._key] + except KeyError: + pass + else: + self._weakref = self._callback = self._args = \ + self._kwargs = self._key = None + + def still_active(self): + ''' + 
Return whether this finalizer is still waiting to invoke callback + ''' + return self._key in _finalizer_registry + + def __repr__(self): + try: + obj = self._weakref() + except (AttributeError, TypeError): + obj = None + + if obj is None: + return '<Finalize object, dead>' + + x = '<Finalize object, callback=%s' % \ + getattr(self._callback, '__name__', self._callback) + if self._args: + x += ', args=' + str(self._args) + if self._kwargs: + x += ', kwargs=' + str(self._kwargs) + if self._key[0] is not None: + x += ', exitprority=' + str(self._key[0]) + return x + '>' + + +def _run_finalizers(minpriority=None): + ''' + Run all finalizers whose exit priority is not None and at least minpriority + + Finalizers with highest priority are called first; finalizers with + the same priority will be called in reverse order of creation. + ''' + if minpriority is None: + f = lambda p : p[0][0] is not None + else: + f = lambda p : p[0][0] is not None and p[0][0] >= minpriority + + items = [x for x in _finalizer_registry.items() if f(x)] + items.sort(reverse=True) + + for key, finalizer in items: + sub_debug('calling %s', finalizer) + try: + finalizer() + except Exception: + import traceback + traceback.print_exc() + + if minpriority is None: + _finalizer_registry.clear() + +# +# Clean up on exit +# + +def is_exiting(): + ''' + Returns true if the process is shutting down + ''' + return _exiting or _exiting is None + +_exiting = False + +def _exit_function(): + global _exiting + + info('process shutting down') + debug('running all "atexit" finalizers with priority >= 0') + _run_finalizers(0) + + for p in active_children(): + if p._daemonic: + info('calling terminate() for daemon %s', p.name) + p._popen.terminate() + + for p in active_children(): + info('calling join() for process %s', p.name) + p.join() + + debug('running the remaining "atexit" finalizers') + _run_finalizers() + +atexit.register(_exit_function) + +# +# Some fork aware types +# + +class 
ForkAwareThreadLock(object): + def __init__(self): + self._lock = threading.Lock() + self.acquire = self._lock.acquire + self.release = self._lock.release + register_after_fork(self, ForkAwareThreadLock.__init__) + +class ForkAwareLocal(threading.local): + def __init__(self): + register_after_fork(self, lambda obj : obj.__dict__.clear()) + def __reduce__(self): + return type(self), () diff --git a/LTA/LTAIngest/simple_server.py b/LTA/LTAIngest/simple_server.py new file mode 100644 index 0000000000000000000000000000000000000000..13822494c7f699fbde98e4581832af94f50d3d54 --- /dev/null +++ b/LTA/LTAIngest/simple_server.py @@ -0,0 +1,7 @@ +import SimpleXMLRPCServer +s = SimpleXMLRPCServer.SimpleXMLRPCServer(('10.178.1.2', 2009)) +def boo(): + return "hello" +s.register_introspection_functions() +s.register_function(boo, 'boo') +s.serve_forever() diff --git a/LTA/LTAIngest/sitecustomize.py b/LTA/LTAIngest/sitecustomize.py new file mode 100644 index 0000000000000000000000000000000000000000..168fbdddc98912f6c2e0aa81e0a22aba676d08ad --- /dev/null +++ b/LTA/LTAIngest/sitecustomize.py @@ -0,0 +1,13 @@ +import sys, os.path +dir = os.path.split(__file__)[0] +if os.path.isdir(dir + '/ClientForm-0.1.17'): + sys.path.insert(1, dir + '/ClientForm-0.1.17') +if os.path.isdir(dir + '/SOAPpy-0.12.0'): + sys.path.insert(1, dir + '/SOAPpy-0.12.0') +if os.path.isdir(dir + '/fpconst-0.7.0'): + sys.path.insert(1, dir + '/fpconst-0.7.0') +#if os.path.isdir(dir + '/dav'): +# sys.path.insert(1, dir + '/dav') +if os.path.isdir(dir + '/multiprocessing'): + sys.path.insert(1, dir + '/multiprocessing') + diff --git a/LTA/LTAIngest/slave.py b/LTA/LTAIngest/slave.py new file mode 100755 index 0000000000000000000000000000000000000000..f72eaccabaac3d6ccae488bd2318a9a62b54f69d --- /dev/null +++ b/LTA/LTAIngest/slave.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python +from multiprocessing import Process, Queue, Value +from Queue import Empty as QueueEmpty +from multiprocessing.managers import SyncManager +from 
job_parser import JobRetry, JobError, JobHold, JobScheduled, JobProducing, JobProduced +import os, time, sys +from ingestpipeline import IngestPipeline, PipelineError, PipelineJobFailedError +from ingestpipeline import PipelineNoSourceError, PipelineAlreadyInLTAError, PipelineAlreadyInLTAError, PipelineNoProjectInLTAError + +##--------------------- MoM talker ----------------------------- + +class momTalker(Process): + def __init__(self, logger, client, count, maxTalkQueue): + self.logger = logger + self.jobs = Queue(maxTalkQueue) + self.exportClient = client + self.retryCount = count + super(momTalker, self).__init__() + logger.info('momTalker initialzed') + + def getQueue(self): + return self.jobs + + ## This function also exists in the master, should be refactored at some point + def communicateJob(self, job): + """function to write to log and communicate with GUI""" + if job['Status'] == JobError: self.logger.info('Job:' + str(job['ExportID']) + ' Failed') + elif job['Status'] == JobHold: self.logger.info('Job:' + str(job['ExportID']) + ' is on Hold') + elif job['Status'] == JobScheduled: self.logger.info('Job:' + str(job['ExportID']) + ' Scheduled') + elif job['Status'] == JobProducing: self.logger.info('Job:' + str(job['ExportID']) + ' Started') + elif job['Status'] == JobProduced: self.logger.info('Job:' + str(job['ExportID']) + ' Produced') + try: + if job['Status'] == JobRetry: + self.logger.info('Job:' + str(job['ExportID']) + ' retry state not communicated to MoM') + return + if not job['Type'] == 'MoM': + self.logger.info('Job:' + str(job['ExportID']) + ' not communicated to MoM') + return + (status, message) = self.exportClient.setStatus(str(job['ExportID']), str(job['Status'])) + if status: ## we retry, because the client does not do an internal retry, but only reports the problem + self.logger.warning("Problem communicating with MoM, retrying " + str(job['ExportID']) + ": " + message) + count = 1 + while (status and (count < self.retryCount)): + 
time.sleep(60 * count) + (status, message) = self.exportClient.setStatus(str(job['ExportID']), str(job['Status'])) + count += 1 + if status: + self.logger.warning(message) + self.logger.info(message) + except: + self.logger.exception('Could not update job %s status to %s.' % (str(job['ExportID']), str(job['Status']))) + + def run(self): + self.logger.info('momTalker started') + while True: + try: + job = self.jobs.get(True, 10) + except QueueEmpty: + job = None + if job: + self.communicateJob(job) + +## ----------------- Job executer ---------------------------------------- + +class executer(Process): + def __init__(self, logger, logdir, job, talker, jobs, momClient, ltaClient, host, port, mailCommand, manager, pipelineRetry, momRetry, ltaRetry, srmRetry, srmInit): + self.logger = logger + self.logdir = logdir + self.job = job + self.talker = talker + self.jobs = jobs + self.momClient = momClient + self.ltaClient = ltaClient + self.host = host + self.ltacpport = port + self.mailCommand = mailCommand + self.manager = manager + self.pipelineRetry = pipelineRetry + self.momRetry = momRetry + self.ltaRetry = ltaRetry + self.srmRetry = srmRetry + self.srmInit = srmInit + self.result = None + super(executer, self).__init__() + logger.info('Executer initialzed for %s (pid: %i)' % (job['ExportID'], os.getpid())) + + def run(self): + self.job['Status'] = JobProducing + self.talker.put(self.job) + pipeline = IngestPipeline(self.logdir, self.job, self.momClient, self.ltaClient, self.host, self.ltacpport, self.mailCommand, self.momRetry, self.ltaRetry, self.srmRetry, self.srmInit) + try: + pipeline.run() + self.logger.debug('Slave found no Error for %s' % self.job['ExportID']) + self.job['Status'] = JobProduced + except PipelineError as e: + self.logger.info('The Ingest Pipeline failed for %s' % self.job['ExportID']) + if e.type == PipelineNoSourceError: + self.logger.debug('Slave found PipelineNoSource Error for %s' % self.job['ExportID']) + ## It is not useful to mark the 
job as failed as this dataproduct is simply nonexistent + ## Todo: handle with a separate status that can be communicated with MoM? + self.job['Status'] = JobProduced + elif e.type == PipelineAlreadyInLTAError: + self.logger.debug('Slave found PipelineAlreadyInLTA Error for %s' % self.job['ExportID']) + ## It is not useful to mark the job as failed as this dataproduct is simply already done + ## Todo: handle with a separate status that can be communicated with MoM? + self.job['Status'] = JobProduced + elif e.type == PipelineNoProjectInLTAError: + self.logger.debug('Slave found PipelineNoProjectInLta Error for %s' % self.job['ExportID']) + ## We do want this status to be marked as failed + ## Todo: handle with a separate status that can be communicated with MoM? + self.job['Status'] = JobError + self.result = ("Project not in LTA","ingest") + else: + self.logger.debug('Slave found PipelineFailedError for %s' % self.job['ExportID']) + self.job['Status'] = JobError + self.result = (e.args[0], e.source) + except Exception as e: + self.logger.debug('Slave found unexpected Error for %s' % self.job['ExportID']) + self.logger.warning('The Ingest Pipeline failed for %s' % self.job['ExportID']) + self.job['Status'] = JobError + self.result = (str(e), 'ingestpipeline') + ## Only communicate failure with MoM if we have given up retrying + ## Todo: More elegant to handle all decisions about failed/successful jobs in the manager? 
+ if self.job['Status'] == JobError: + self.job['retry'] += 1 + if self.job['retry'] < self.pipelineRetry: + self.job['Status'] = JobRetry + if (self.job['Status'] == JobProduced) or (self.job['Status'] == JobError): + self.talker.put(self.job) + self.manager.slave_done(self.job, self.result, pipeline.FileType) + with self.jobs.get_lock(): + self.jobs.value -= 1 + +## ---------------- LTA Slave -------------------------------------------- +class ltaSlave(): + def __init__(self, config): + configFile = config + try: + self.readConfig(configFile) + except Exception as e: + print ('\n%s' % e) + print('The Configuration is incomplete, exiting') + exit(2) + + self.jobs = Value('i', 0) + self.logger.info('Slave %s initialized' % self.host) + + def readConfig(self, configFile): + exec(eval("'from %s import *' % configFile")) + self.host = host + self.ltacpport = ltacpport + self.mailSlCommand = mailSlCommand + self.jobsdir = jobsdir + self.logger = logger + self.logdir = logdir + self.ltaClient = ltaClient + self.exportClient = exportClient + self.momClient = momClient + self.pipelineRetry = pipelineRetry + self.momRetry = momRetry + self.ltaRetry = ltaRetry + self.srmRetry = srmRetry + self.srmInit = srmInit + self.momServer = momServer + self.masterAddress = masterAddress + self.masterPort = masterPort + self.masterAuth = masterAuth + self.maxTalkQueue = maxSlaveTalkerQueue + self.parallelJobs = parallelJobs + + def serve(self): + class Manager(SyncManager): pass + Manager.register('add_slave') + Manager.register('remove_slave') + Manager.register('slave_done') + self.manager = Manager(address=(self.masterAddress, self.masterPort), authkey=self.masterAuth) + self.manager.connect() + self.logger.debug('Master found') + self.queue = self.manager.add_slave(self.host) + + self.momTalker = momTalker(self.logger, self.exportClient, self.momRetry, self.maxTalkQueue) + self.momTalker.start() + talker = self.momTalker.getQueue() + + self.logger.info('Slave %s started' % 
self.host) + while True: + if self.jobs.value < self.parallelJobs: + try: + job = self.queue.get(True, 10) + except QueueEmpty: + job = None + if job: + with self.jobs.get_lock(): + self.jobs.value += 1 + runner = executer(self.logger, self.logdir, job, talker, self.jobs, self.momClient, self.ltaClient, self.host, self.ltacpport, self.mailSlCommand, self.manager, self.pipelineRetry, self.momRetry, self.ltaRetry, self.srmRetry, self.srmInit) + runner.start() + else: + time.sleep(10) + +## Stand alone execution code ------------------------------------------ +if __name__ == '__main__': + usage = """Usage: + slave.py <config> + config Something like ingest_config (without the .py)""" + + if len(sys.argv) < 2: + print usage + exit(1) + config = sys.argv[1] + standalone = ltaSlave(config) + standalone.serve() diff --git a/LTA/LTAIngest/ssh_check.sh b/LTA/LTAIngest/ssh_check.sh new file mode 100755 index 0000000000000000000000000000000000000000..4237b7a7be349d8c6759cf9506318b1034b62b79 --- /dev/null +++ b/LTA/LTAIngest/ssh_check.sh @@ -0,0 +1,14 @@ +createTunnel() { + /usr/bin/ssh -f -N -L2010:lexar002:2010 -L2022:lexar002:22 momingest@portal.lofar.eu + if [[ $? -eq 0 ]]; then + echo Tunnel to lexar002 created successfully + else + echo An error occurred creating a tunnel to lexar002 RC was $? + fi +} +## Run the 'ls' command remotely. If it returns non-zero, then create a new connection +/usr/bin/ssh -p 2022 momingest@localhost ls +if [[ $? -ne 0 ]]; then + echo Creating new tunnel connection + createTunnel +fi diff --git a/LTA/LTAIngest/startup.csh b/LTA/LTAIngest/startup.csh new file mode 100755 index 0000000000000000000000000000000000000000..461119c4d3b0a270ddeb227373d9ef58ec041384 --- /dev/null +++ b/LTA/LTAIngest/startup.csh @@ -0,0 +1,17 @@ +#!/bin/csh -f +#starting ingest pipeline +#ar: 15 may 2013 +cd /globalhome/ingest/LTAIngest +setenv PYTHONPATH /globalhome/ingest/LTAIngest +if (! 
## NOTE(review): this span of the mangled diff opened with the tail of
## startup.csh, whose head lies in the previous span; that split fragment is
## omitted here to avoid duplicating a split file.  The complete files below
## are reconstructed with line breaks restored.

## ===== LTA/LTAIngest/startup_slave.csh (new file, mode 100755) =====
#!/bin/csh -f
#starting ingest pipeline
#ar: 15 may 2013
cd /globalhome/ingest/LTAIngest
setenv PYTHONPATH /globalhome/ingest/LTAIngest
if (! `ps uxf | grep -v grep | grep -c slave.py` ) then
  nohup slave.py ingest_config >& nohup_slave.out &
else
  echo "Slave is running already"
endif
cd /globalhome/ingest

## ===== LTA/LTAIngest/startup_target.csh (new file, mode 100755) =====
#!/bin/csh -f
#starting ingest pipeline
#ar: 28 august 2014
cd /home/lofarlocal/LTAIngest
setenv PYTHONPATH /home/lofarlocal/LTAIngest
if (! `ps uxf | grep -v grep | grep -c slave.py` ) then
  nohup ./slave.py ingest_config >& nohup_slave.out &
else
  echo "Slave is running already"
endif

## ===== LTA/LTAIngest/startup_target_test.csh (new file, mode 100755) =====
#!/bin/csh -f
#starting ingest pipeline
#ar: 28 august 2014
cd /home/lofarlocal/LTAIngest
setenv PYTHONPATH /home/lofarlocal/LTAIngest
if (! `ps uxf | grep -v grep | grep -c slave.py` ) then
  nohup ./slave.py ingest_config_test >& nohup_slave.out &
else
  echo "Slave is running already"
endif

## ===== LTA/LTAIngest/startup_test.csh (new file, mode 100755) =====
#!/bin/csh -f
#starting ingest pipeline
#ar: 15 may 2013
cd /globalhome/ingesttest/LTAIngest
setenv PYTHONPATH /globalhome/ingesttest/LTAIngest
if (! `ps uxf | grep -v grep | grep -c master.py` ) then
  nohup ./master.py ingest_config_test >& nohup.out &
else
  echo "Master running already"
endif
sleep 1
if (! `ps uxf | grep -v grep | grep -c slave.py` ) then
  nohup ./slave.py ingest_config_test >& nohup_slave.out &
else
  echo "Slave is running already"
endif

## ===== LTA/LTAIngest/test.xmlrpc (new file; example session transcript) =====
## NOTE(review): this transcript embeds live-looking credentials
## ('awtier0:tier123') in the URL -- they are now in version control and
## should be rotated/removed.
import xmlrpclib
url = 'https://awtier0:tier123@lofar-ingest-test.target.rug.nl:19443'
client = xmlrpclib.ServerProxy(url)
client.GetStorageTicket('MSSS','test.filename.MS',1,'1','TEST_0001','1234',False)
{'primary_uri_rnd': 'srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/test/msss/1234/test.filename_1de8b275.MS', 'primary_uri': 'srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/test/msss/1234', 'result': 'ok', 'error': '', 'secondary_uri': '', 'ticket': 'BF6FE08B8D775266E043C416A9C31B5D', 'secondary_uri_rnd': ''}
client.UpdateUriState('MSSS','BF6FE08B8D775266E043C416A9C31B5D','srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/test/msss/1234',-10)
{'primary_uri_rnd': '', 'primary_uri': '', 'result': '', 'error': 'LTA Ingest Service; an exception was raised in UpdateUriState; No StorageTicketResource found for ticket BF6FE08B8D775266E043C416A9C31B5D', 'secondary_uri': '', 'ticket': '', 'secondary_uri_rnd': ''}
BF6FE08B8D775266E043C416A9C31B5D', 'secondary_uri': '', 'ticket': '', 'secondary_uri_rnd': ''} diff --git a/LTA/LTAIngest/test/__init__.py b/LTA/LTAIngest/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LTA/LTAIngest/test/test_ingest_logging.py b/LTA/LTAIngest/test/test_ingest_logging.py new file mode 100755 index 0000000000000000000000000000000000000000..462cf7e6e15bb90a1d38ad3c43b3ef2d9ee8c709 --- /dev/null +++ b/LTA/LTAIngest/test/test_ingest_logging.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +import unittest +import logging +import os +import os.path +from LTAIngest.ingest_config import logger + +class TestIngestLogging(unittest.TestCase): + """Tests for logging part of Ingest""" + + def testBasicLogging(self): + """Do some basic testing on writing to the ingest.logger""" + + #default log handler should be a FileHandler + handler = logger.handlers[0] if logger.handlers else logger.root.handlers[0] + self.assertTrue(isinstance(handler, logging.FileHandler)) + + #check if logfile exists + #should be the case, since ingest_config logged that initialization was done + logpath = handler.baseFilename + self.assertTrue(os.path.isfile(logpath)) + + #test writing and reading a log line + testlogline = 'unittest log line' + logger.debug(testlogline) + logfile = open(logpath, 'r') + lines = logfile.readlines() + self.assertTrue(lines[-1].strip().endswith(testlogline)) + + def testLogRotate(self): + """Test what happens if an external program (logrotate) moves a logfile + The logger should automatically recreate the logfile and write to it if it was moved. 
+ """ + + handler = logger.handlers[0] if logger.handlers else logger.root.handlers[0] + logpath = handler.baseFilename + + #move (rename) log file (which is what logrotate does) + logpath_moved = logpath.replace('.log', '.log.old') + os.rename(logpath, logpath_moved) + + #check if it is gone + self.assertFalse(os.path.isfile(logpath)) + + #check new location + self.assertTrue(os.path.isfile(logpath_moved)) + + #log a line and check if the log file reappears + testlogline = 'unittest log line' + logger.debug(testlogline) + self.assertTrue(os.path.isfile(logpath), 'log file %s (no longer) exists' % logpath) + + #test reading the written log line + logfile = open(logpath, 'r') + lines = logfile.readlines() + self.assertTrue(lines[-1].strip().endswith(testlogline)) + +#run tests if main +if __name__ == '__main__': + unittest.main(verbosity=2) diff --git a/LTA/LTAIngest/testjob2.xml b/LTA/LTAIngest/testjob2.xml new file mode 100644 index 0000000000000000000000000000000000000000..851e063491086504ab65bdf87281f256b0667604 --- /dev/null +++ b/LTA/LTAIngest/testjob2.xml @@ -0,0 +1,12 @@ +<?xml version="1.0" encoding="UTF-8"?> +<exportjob exportID="A_10_10_87"> + <scriptname>IngestPipeline</scriptname> + <input name="DataProduct">L1_B0_SB0.MS</input> + <input name="Project">test-lofar</input> + <input name="JobId">A_10_10_87</input> + <input name="MomId">466</input> + <repository> + <server>webdav_lofar_repository2</server> + <resultdir>/mom2/test-lofar/80/ArchiveLogs</resultdir> + </repository> +</exportjob> diff --git a/LTA/LTAIngest/unspecifiedSIP.py b/LTA/LTAIngest/unspecifiedSIP.py new file mode 100755 index 0000000000000000000000000000000000000000..7cd0c6b698658c0c7f42def71af7f2e012d4991b --- /dev/null +++ b/LTA/LTAIngest/unspecifiedSIP.py @@ -0,0 +1,81 @@ +#!/usr/bin/python + +genericSIP = '''<?xml version="1.0" encoding="UTF-8"?> +<sip:ltaSip xmlns:sip="http://www.astron.nl/SIP-Lofar" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
## ===== LTA/LTAIngest/unspecifiedSIP.py (new file, mode 100755) =====
## NOTE(review): the opening lines of this module (shebang and the first lines
## of the genericSIP template) and its closing lines lie just outside this
## span of the mangled diff; the module is reconstructed here in full.
#!/usr/bin/python

## Template for a minimal "unspecified" SIP document; the %s slots are filled
## by makeSIP() in this order: Project, Type, MomId, FileName, ticket,
## FileSize, MD5, Adler32, FileName, fileFormat, ObsId (x5).
genericSIP = '''<?xml version="1.0" encoding="UTF-8"?>
<sip:ltaSip xmlns:sip="http://www.astron.nl/SIP-Lofar"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.astron.nl/SIP-Lofar LTA-SIP-2.3.0.xsd">
  <sipGeneratorVersion>Ingest (06-06-2013)</sipGeneratorVersion>
  <project>
    <projectCode>%s</projectCode>
    <primaryInvestigator>Unknown</primaryInvestigator>
    <coInvestigator>Unknown</coInvestigator>
    <contactAuthor>Unknown</contactAuthor>
    <telescope>LOFAR</telescope>
    <projectDescription>Unknown</projectDescription>
  </project>
  <dataProduct xsi:type="sip:UnspecifiedDataProduct">
    <dataProductType>Unknown</dataProductType>
    <dataProductIdentifier>
      <source>%s</source>
      <identifier>%s</identifier>
      <name>%s</name>
    </dataProductIdentifier>
    <storageTicket>%s</storageTicket>
    <size>%s</size>
    <checksum>
      <algorithm>MD5</algorithm>
      <value>%s</value>
    </checksum>
    <checksum>
      <algorithm>Adler32</algorithm>
      <value>%s</value>
    </checksum>
    <fileName>%s</fileName>
    <fileFormat>%s</fileFormat>
    <processIdentifier>
      <source>SAS</source>
      <identifier>%s</identifier>
      <name>Unknown process %s</name>
    </processIdentifier>
  </dataProduct>
  <unspecifiedProcess xsi:type="sip:UnspecifiedProcess">
    <processIdentifier>
      <source>SAS</source>
      <identifier>%s</identifier>
      <name>Unknown process %s</name>
    </processIdentifier>
    <observationId>
      <source>SAS</source>
      <identifier>%s</identifier>
    </observationId>
    <strategyName>Unknown</strategyName>
    <strategyDescription>Unknown</strategyDescription>
    <startTime>2010-01-01T00:00:00</startTime>
    <duration>PT0S</duration>
    <observingMode>Unknown</observingMode>
    <description>Unknown process, information generated by Ingest</description>
  </unspecifiedProcess>
</sip:ltaSip>
'''


def makeSIP(Project, ObsId, MomId, ticket, FileName, FileSize, MD5Checksum, Adler32Checksum, Type):
    """Fill the genericSIP template for one dataproduct.

    The file format is guessed from the file name extension: '.MS'
    (case-sensitive) -> CASA, '.h5' -> HDF5, '.fits' -> FITS, anything
    else -> PULP.  Returns the SIP XML as a string.
    """
    if FileName[-3:] == '.MS':
        fileFormat = 'AIPS++/CASA'
    elif FileName[-3:].lower() == '.h5':
        fileFormat = 'HDF5'
    elif FileName[-5:].lower() == '.fits':
        fileFormat = 'FITS'
    else:  ## Maybe we need an 'Unknown' in the future?
        fileFormat = 'PULP'
    return genericSIP % (Project, Type, MomId, FileName, ticket, FileSize, MD5Checksum, Adler32Checksum, FileName, fileFormat, ObsId, ObsId, ObsId, ObsId, ObsId)

## Stand alone execution code ------------------------------------------
if __name__ == '__main__':
    usage = """Usage:
    unspecifiedSIP.py <config>
    TBD"""

#    if len(sys.argv) < 2:
#        print usage
#        exit(1)
    ## FIX(review): the original demo call passed 8 arguments to the
    ## 9-parameter makeSIP() and raised a TypeError -- an Adler32 checksum
    ## value was missing; 'Test' is kept as the Type argument.  The call is
    ## also written with parentheses so it parses under both py2 and py3.
    print(makeSIP('test-lofar', '12345', '43213', 'VSN3FUNSP98N4F3NLSIWDUALFU3WDF', 'Bla.FITS', 378964322, 'Hoeba', 'N/A', 'Test'))
+ fileFormat = 'PULP' + return genericSIP % (Project, Type, MomId, FileName, ticket, FileSize, MD5Checksum, Adler32Checksum, FileName, fileFormat, ObsId, ObsId, ObsId, ObsId, ObsId) + +## Stand alone execution code ------------------------------------------ +if __name__ == '__main__': + usage = """Usage: + unspecifiedSIP.py <config> + TBD""" + +# if len(sys.argv) < 2: +# print usage +# exit(1) + print makeSIP('test-lofar','12345','43213','VSN3FUNSP98N4F3NLSIWDUALFU3WDF','Bla.FITS',378964322,'Hoeba','Test') diff --git a/LTA/LTAIngest/user_ingest.py b/LTA/LTAIngest/user_ingest.py new file mode 100755 index 0000000000000000000000000000000000000000..0e6e564ee39db291d38c991e8f87d2a8a5a31005 --- /dev/null +++ b/LTA/LTAIngest/user_ingest.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +from multiprocessing import Process, Queue, Manager, Value +from multiprocessing.managers import SyncManager +from Queue import Empty +import SimpleXMLRPCServer, sys, os + +class MkdirServer(Process): + def makeDirectory(self, srmpath=""): + if srmpath: + targetpath = 'srm://srm.target.rug.nl:8444/lofar/user/disk/ingest' + if targetpath in srmpath: + newdir = '/' + srmpath.split('/')[-1] + try: + uploaddir = '/target/gpfs2/lofar/home/ingestmantest/test' + os.makedirs(uploaddir + newdir) + except Exception as e: + print e + return ('error', -30, 'Could not create directory') + if os.path.isdir(uploaddir + newdir): + print "New Ingest job request recieved: %s created" % (uploaddir + newdir) + return ('ok', 0, 'Directory created') + else: + return ('error', -30, 'Could not create directory') + else: + return ('error', -20, 'Illegal LTA location') + return ('error', -10, 'No srmpath given') + + def run(self): + mkdir_server= SimpleXMLRPCServer.SimpleXMLRPCServer(('192.168.210.188', 2013)) + mkdir_server.register_introspection_functions() + mkdir_server.register_function(self.makeDirectory, 'makeDirectory') + mkdir_server.serve_forever() + +class NewJobsServer(Process): + def newJobs(self, 
ingestId="", contactAuthor="", projectId="", observationIds=[], locations=[], dataProductIds=[], jobIds=[]): + if ingestId and contactAuthor and projectId and observationIds and locations and dataProductIds and jobIds: + try: + print "recieved new Ingest job %s for project %s with contact %s" % (ingestId, projectId, contactAuthor) + size = len(observationIds) + for i in range(size): + print i + print observationIds[i] + print locations[i] + print dataProductIds[i] + print jobIds[i] + return ('ok', 0, 'Valid Ingest recieved') + except Exception as e: + print e + return ('error', -10, 'Pie in the sky') + + def run(self): + mkdir_server= SimpleXMLRPCServer.SimpleXMLRPCServer(('192.168.210.188', 2015)) + mkdir_server.register_introspection_functions() + mkdir_server.register_function(self.newJobs, 'newJobs') + mkdir_server.serve_forever() + +class UserIngest(): + def __init__(self, config): + print 'Initializing' + self.config = config + + def serve(self): + print 'Starting' + self.mkdir = MkdirServer() + self.mkdir.start() + + self.newJobs = NewJobsServer() + self.newJobs.start() + print 'Running' + + +## Stand alone execution code ------------------------------------------ +if __name__ == '__main__': + usage = """Usage: + master.py <config> + config Something like 'ingest_config' (without the .py)""" + + if len(sys.argv) < 2: + print usage + exit(1) + config = sys.argv[1] + standalone = UserIngest(config) + standalone.serve() + diff --git a/LTA/LTAIngest/user_ingest_example b/LTA/LTAIngest/user_ingest_example new file mode 100644 index 0000000000000000000000000000000000000000..99e2cb3f9e955c51ea79a741ebab16f3aab6186d --- /dev/null +++ b/LTA/LTAIngest/user_ingest_example @@ -0,0 +1,7 @@ +import xmlrpclib +url = "http://192.168.210.188:2013" +client = xmlrpclib.ServerProxy(url) +client.makeDirectory("srm://srm.target.rug.nl:8444/lofar/user/disk/ingest/Upload1234") +url2 = "http://192.168.210.188:2015" +client2 = xmlrpclib.ServerProxy(url2) +client2.newJobs("256789", 
"renting@astron.nl", "LC0_001", ['197777','197777'], ['file1.tar','file2.tar'], ['123','124'], ['j034', 'j035'])