Difference between revisions of "Bulk Patent Assignee Processing"

From edegan.com
Jump to navigation Jump to search
Line 189: Line 189:
 
Here is the DTD specified by the USPTO, which specifies optional fields and :
 
Here is the DTD specified by the USPTO, which specifies optional fields and :
 
      
 
      
<?xml version="1.0" encoding="utf-8"?>  
+
<?xml version="1.0" encoding="utf-8"?> <br>
<!DOCTYPE us-patent-assignments [<!ELEMENT us-patent-assignments (action-key-code, transaction-date, patent-assignments)>  
+
<!DOCTYPE us-patent-assignments [<!ELEMENT us-patent-assignments (action-key-code, transaction-date, patent-assignments)> <br>
<!ATTLIST us-patent-assignments  dtd-version  CDATA  #IMPLIED  
+
<!ATTLIST us-patent-assignments  dtd-version  CDATA  #IMPLIED <br>
date-produced CDATA  #IMPLIED>  
+
date-produced CDATA  #IMPLIED> <br>
<!ELEMENT action-key-code (#PCDATA)>  
+
<!ELEMENT action-key-code (#PCDATA)> <br>
<!ELEMENT transaction-date (date)>
+
<!ELEMENT transaction-date (date)><br>
<!ELEMENT patent-assignments (data-available-code | patent-assignment+)>  
+
<!ELEMENT patent-assignments (data-available-code | patent-assignment+)> <br>
<!ELEMENT date (#PCDATA)>  
+
<!ELEMENT date (#PCDATA)> <br>
<!ELEMENT data-available-code (#PCDATA)>  
+
<!ELEMENT data-available-code (#PCDATA)> <br>
<!ELEMENT patent-assignment (assignment-record, patent-assignors, patent-assignees, patent-properties)>  
+
<!ELEMENT patent-assignment (assignment-record, patent-assignors, patent-assignees, patent-properties)> <br>
<!ELEMENT assignment-record (reel-no, frame-no, last-update-date, purge-indicator, recorded-date, page-count?, correspondent, conveyance-text)>  
+
<!ELEMENT assignment-record (reel-no, frame-no, last-update-date, purge-indicator, recorded-date, page-count?, correspondent, conveyance-text)> <br>
<!ELEMENT patent-assignors (patent-assignor+)>  
+
<!ELEMENT patent-assignors (patent-assignor+)> <br>
<!ELEMENT patent-assignees (patent-assignee+)>  
+
<!ELEMENT patent-assignees (patent-assignee+)> <br>
<!ELEMENT patent-properties (patent-property+)>  
+
<!ELEMENT patent-properties (patent-property+)> <br>
<!ELEMENT reel-no (#PCDATA)>  
+
<!ELEMENT reel-no (#PCDATA)> <br>
<!ELEMENT frame-no (#PCDATA)>  
+
<!ELEMENT frame-no (#PCDATA)> <br>
<!ELEMENT last-update-date (date)>  
+
<!ELEMENT last-update-date (date)> <br>  
<!ELEMENT purge-indicator (#PCDATA)>  
+
<!ELEMENT purge-indicator (#PCDATA)> <br>  
<!ELEMENT recorded-date (date)>  
+
<!ELEMENT recorded-date (date)> <br>
<!ELEMENT page-count (#PCDATA)>
+
<!ELEMENT page-count (#PCDATA)> <br>
<!ELEMENT correspondent (name, address-1?, address-2?, address-3?, address-4?)>
+
<!ELEMENT correspondent (name, address-1?, address-2?, address-3?, address-4?)> <br>
<!ELEMENT conveyance-text (#PCDATA)>
+
<!ELEMENT conveyance-text (#PCDATA)> <br>
<!ELEMENT patent-assignor (name, execution-date?, date-acknowledged?)>
+
<!ELEMENT patent-assignor (name, execution-date?, date-acknowledged?)> <br>
<!ELEMENT patent-assignee (name, address-1?, address-2?, city?, state?, country-name?, postcode?)>
+
<!ELEMENT patent-assignee (name, address-1?, address-2?, city?, state?, country-name?, postcode?)> <br>
<!ELEMENT patent-property (document-id*, invention-title?)>
+
<!ELEMENT patent-property (document-id*, invention-title?)> <br>
<!ELEMENT name (#PCDATA)>
+
<!ELEMENT name (#PCDATA)> <br>
<!ATTLIST name name-type (natural | legal)  #IMPLIED>
+
<!ATTLIST name name-type (natural | legal)  #IMPLIED> <br>
<!ELEMENT address-1 (#PCDATA)>
+
<!ELEMENT address-1 (#PCDATA)> <br>
<!ELEMENT address-2 (#PCDATA)>
+
<!ELEMENT address-2 (#PCDATA)> <br>
<!ELEMENT address-3 (#PCDATA)>
+
<!ELEMENT address-3 (#PCDATA)> <br>
<!ELEMENT address-4 (#PCDATA)>
+
<!ELEMENT address-4 (#PCDATA)> <br>
<!ELEMENT execution-date (date)>
+
<!ELEMENT execution-date (date)> <br>
<!ELEMENT date-acknowledged (date)>
+
<!ELEMENT date-acknowledged (date)> <br>
<!ELEMENT city (#PCDATA)>
+
<!ELEMENT city (#PCDATA)> <br>
<!ELEMENT state (#PCDATA)>
+
<!ELEMENT state (#PCDATA)> <br>
<!ELEMENT country-name (#PCDATA)>
+
<!ELEMENT country-name (#PCDATA)> <br>
<!ELEMENT postcode (#PCDATA)>
+
<!ELEMENT postcode (#PCDATA)> <br>
<!ELEMENT document-id (country, doc-number, kind?, name?, date?)>
+
<!ELEMENT document-id (country, doc-number, kind?, name?, date?)> <br>
<!ELEMENT invention-title (#PCDATA | b | i | u | sup | sub)*>
+
<!ELEMENT invention-title (#PCDATA | b | i | u | sup | sub)*> <br>
<!ATTLIST invention-title  id  ID    #IMPLIED
+
<!ATTLIST invention-title  id  ID    #IMPLIED <br>
  lang CDATA  #REQUIRED>
+
  lang CDATA  #REQUIRED> <br>
<!ELEMENT country (#PCDATA)>
+
<!ELEMENT country (#PCDATA)><br>
<!ELEMENT doc-number (#PCDATA)>
+
<!ELEMENT doc-number (#PCDATA)><br>
<!ELEMENT kind (#PCDATA)>
+
<!ELEMENT kind (#PCDATA)><br>
<!--bold formatting for text-->
+
<!--bold formatting for text--><br>
<!ELEMENT b (#PCDATA | i | u | smallcaps)*>
+
<!ELEMENT b (#PCDATA | i | u | smallcaps)*><br>
<!--italic formatting for text-->
+
<!--italic formatting for text--><br>
<!ELEMENT i (#PCDATA | b | u | smallcaps)*>
+
<!ELEMENT i (#PCDATA | b | u | smallcaps)*><br>
<!--underscore: style - single is default-->
+
<!--underscore: style - single is default--><br>
<!ELEMENT u (#PCDATA | b | i | smallcaps)*>
+
<!ELEMENT u (#PCDATA | b | i | smallcaps)*><br>
<!ATTLIST u  style  (single | double | dash | dots )  'single' >
+
<!ATTLIST u  style  (single | double | dash | dots )  'single' ><br>
<!--superscripted text-->
+
<!--superscripted text--><br>
<!ELEMENT sup (#PCDATA | b | u | i)*>
+
<!ELEMENT sup (#PCDATA | b | u | i)*><br>
<!--subscripted text-->
+
<!--subscripted text--><br>
<!ELEMENT sub (#PCDATA | b | u | i)*>
+
<!ELEMENT sub (#PCDATA | b | u | i)*><br>
<!--small capitals-->
+
<!--small capitals--><br>
<!ELEMENT smallcaps (#PCDATA | b | u | i)*>
+
<!ELEMENT smallcaps (#PCDATA | b | u | i)*><br>
]>
+
]><br>
  
 
===Inserting Extracted Data into Tables ===
 
===Inserting Extracted Data into Tables ===
  
 
===Clean Up ===
 
===Clean Up ===

Revision as of 12:00, 1 July 2016

USPTO Assignees Data

We would like to download and absorb data from this location on the USPTO website into our tables. The objective is to determine whether this dataset is better than the current version of our patent data (a combination of the data in the patent_2015 and patentdata databases).

Steps Followed to Extract the Data

Extracting Data from XML Files

All the historical USPTO data is available as XML files. Here is the tree structure for the XML files:

<patent-assignment>
       +<assignment-record>
       +<patent-assignors>
       +<patent-assignees>
       +<patent-properties>
</patent-assignment>

Each of the above internal nodes is mandatory, and is a logical grouping of information fields. Each node has a corresponding table created with more or less the same fields as the XML elements.

Corresponding tables are:

  • assignment-record : assignment
  • patent-assignors : assignors
  • patent-assignees : assignees
  • patent-properties : properties

Additionally, for each file that is downloaded, there are some associated specs. All of these are stored in the PatentAssignment table. Here is the data model diagram.

Assignment Records

The fields in the assignment record are:

  • last_update_date
  • purge_indicator
  • recorded_date
  • correspondent_name
  • correspondent_address_1
  • correspondent_address_2
  • correspondent_address_3
  • correspondent_address_4
  • conveyance_text

Here is the corresponding XML that we are mapping:

  -<assignment-record>
      <reel-no>27132</reel-no>
      <frame-no>841</frame-no>
     -<last-update-date>
         <date>20160122</date>
      </last-update-date>
      <purge-indicator>N</purge-indicator>
         -<recorded-date>
             <date>20111027</date>
          </recorded-date>
        <page-count>2</page-count>
     -<correspondent>
          <name>DOUGLAS B. MCKNIGHT</name>
          <address-1>595 MINER ROAD</address-1>
          <address-2>INTELLECTUAL PROPERTY &amp; STANDARDS</address-2>
          <address-3>CLEVELAND, OH 44143</address-3>
       </correspondent>
       <conveyance-text>ASSIGNMENT OF ASSIGNORS INTEREST (SEE DOCUMENT FOR DETAILS).</conveyance-text>
 </assignment-record>


Assignors

Here are the columns in the assignors table:

  • reel_no
  • frame_no
  • assignor_name
  • execution_date

The corresponding XML node is :

-<patent-assignors>
   -<patent-assignor>
      <name>WALKER, MATTHEW J.</name>
     -<execution-date>
         <date>20090512</date>
      </execution-date>
    </patent-assignor>
   -<patent-assignor>
      <name>OLSZEWSKI, MARK E.</name>
     -<execution-date>
         <date>20090512</date>
      </execution-date>
    </patent-assignor>
  </patent-assignors>

Assignees

Here are the columns in the assignees table:

  • reel_no
  • frame_no
  • assignee_name
  • assignee_address_1
  • assignee_address_2
  • assignee_city
  • assignee_state
  • assignee_country
  • assignee_postcode

The corresponding XML nodes are:

 -<patent-assignees>
   -<patent-assignee>
       <name>KONINKLIJKE PHILIPS ELECTRONICS N V</name>
       <address-1>GROENEWOUDSEWEG 1</address-1>
       <city>EINDHOVEN</city>
       <country-name>NETHERLANDS</country-name>
       <postcode>5621 BA</postcode>
     </patent-assignee>
   </patent-assignees>

Patent Properties

Here are the columns in the properties table:

  • reel_no
  • frame_no
  • documentid
  • country
  • kind
  • filingdate
  • invention_title

The corresponding XML segment would be:

 -<patent-properties>
   -<patent-property>
     -<document-id>
         <country>US</country>
         <doc-number>14143589</doc-number>
         <kind>X0</kind>
         <date>20131230</date>
      </document-id>
     -<document-id>
         <country>US</country>
         <doc-number>20140260305</doc-number>
         <kind>A1</kind>
         <date>20140918</date>
      </document-id>
     <invention-title lang="en">LEAN AZIMUTHAL FLAME COMBUSTOR</invention-title>
   </patent-property>
 </patent-properties>

Patent properties have a many-to-one relationship: one patent can have more than one property.

Note: We are not sure what a document with kind 'X0' represents.


Patent Assignment

Every XML file download has some fields associated with it, in addition to a number of patent assignment nodes.

Here are the columns in the table:

  • reel_no
  • frame_no
  • action_key_code
  • USPTO_Transaction_Date
  • USPTO_Date_Produced
  • version

Here is what the XML in a downloaded file looks like:

 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE us-patent-assignments>
-<us-patent-assignments date-produced="20131101" dtd-version="1.0">
    <action-key-code>DA</action-key-code>
   -<transaction-date>
       <date>20160122</date>
    </transaction-date>
   -<patent-assignments>
       +<patent-assignment>
       +<patent-assignment>
       +<patent-assignment>
       +<patent-assignment>
       +<patent-assignment>
       +<patent-assignment>
            .
            .
            .
     </patent-assignments>
 </us-patent-assignments>


DTD

Here is the DTD specified by the USPTO, which specifies which fields are optional:

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE us-patent-assignments [<!ELEMENT us-patent-assignments (action-key-code, transaction-date, patent-assignments)>
<!ATTLIST us-patent-assignments dtd-version CDATA #IMPLIED
date-produced CDATA #IMPLIED>
<!ELEMENT action-key-code (#PCDATA)>
<!ELEMENT transaction-date (date)>
<!ELEMENT patent-assignments (data-available-code | patent-assignment+)>
<!ELEMENT date (#PCDATA)>
<!ELEMENT data-available-code (#PCDATA)>
<!ELEMENT patent-assignment (assignment-record, patent-assignors, patent-assignees, patent-properties)>
<!ELEMENT assignment-record (reel-no, frame-no, last-update-date, purge-indicator, recorded-date, page-count?, correspondent, conveyance-text)>
<!ELEMENT patent-assignors (patent-assignor+)>
<!ELEMENT patent-assignees (patent-assignee+)>
<!ELEMENT patent-properties (patent-property+)>
<!ELEMENT reel-no (#PCDATA)>
<!ELEMENT frame-no (#PCDATA)>
<!ELEMENT last-update-date (date)>
<!ELEMENT purge-indicator (#PCDATA)>
<!ELEMENT recorded-date (date)>
<!ELEMENT page-count (#PCDATA)>
<!ELEMENT correspondent (name, address-1?, address-2?, address-3?, address-4?)>
<!ELEMENT conveyance-text (#PCDATA)>
<!ELEMENT patent-assignor (name, execution-date?, date-acknowledged?)>
<!ELEMENT patent-assignee (name, address-1?, address-2?, city?, state?, country-name?, postcode?)>
<!ELEMENT patent-property (document-id*, invention-title?)>
<!ELEMENT name (#PCDATA)>
<!ATTLIST name name-type (natural | legal) #IMPLIED>
<!ELEMENT address-1 (#PCDATA)>
<!ELEMENT address-2 (#PCDATA)>
<!ELEMENT address-3 (#PCDATA)>
<!ELEMENT address-4 (#PCDATA)>
<!ELEMENT execution-date (date)>
<!ELEMENT date-acknowledged (date)>
<!ELEMENT city (#PCDATA)>
<!ELEMENT state (#PCDATA)>
<!ELEMENT country-name (#PCDATA)>
<!ELEMENT postcode (#PCDATA)>
<!ELEMENT document-id (country, doc-number, kind?, name?, date?)>
<!ELEMENT invention-title (#PCDATA | b | i | u | sup | sub)*>
<!ATTLIST invention-title id ID #IMPLIED
lang CDATA #REQUIRED>
<!ELEMENT country (#PCDATA)>
<!ELEMENT doc-number (#PCDATA)>
<!ELEMENT kind (#PCDATA)>

<!ELEMENT b (#PCDATA | i | u | smallcaps)*>

<!ELEMENT i (#PCDATA | b | u | smallcaps)*>

<!ELEMENT u (#PCDATA | b | i | smallcaps)*>
<!ATTLIST u style (single | double | dash | dots ) 'single' >

<!ELEMENT sup (#PCDATA | b | u | i)*>

<!ELEMENT sub (#PCDATA | b | u | i)*>

<!ELEMENT smallcaps (#PCDATA | b | u | i)*>
]>

Inserting Extracted Data into Tables

Clean Up