Reference Number: xxxxx
Intel Restricted Secret

INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL® PRODUCTS. NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER, AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY RELATING TO SALE AND/OR USE OF INTEL PRODUCTS, INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT, OR OTHER INTELLECTUAL PROPERTY RIGHT. Intel products are not intended for use in medical, life-saving, or life-sustaining applications.

Intel may make changes to specifications and product descriptions at any time, without notice. Designers must not rely on the absence or characteristics of any features or instructions marked “reserved” or “undefined.” Intel reserves these for future definition and shall have no responsibility whatsoever for conflicts or incompatibilities arising from future changes to them.

The Itanium processor may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request. Contact your local Intel sales office or your distributor to obtain the latest specifications and before placing your product order. Copies of documents which have an order number and are referenced in this document, or other Intel literature, may be obtained by calling 1-800-548-4725 or by visiting Intel's web site.

Intel, Itanium, and Xeon are trademarks or registered trademarks of Intel Corporation or its subsidiaries in the United States and other countries. *Other names and brands may be claimed as the property of others.

Copyright © 1999-2004, Intel Corporation. All rights reserved.
Contents

1 Introduction..........23
1.1 Preface..........23
1.2 CSI Layers..........23
1.2.1 Physical Layer..........23
1.2.2 Link Layer..........24
1.2.3 Routing Layer..........24
1.2.4 Transport Layer..........25
1.2.5 Protocol Layer..........25
1.2.6 Communication Granularity Between Layers..........26
1.3 Notes..........26
1.4 Definition of Terms..........26
2 Platform Scope..........29
2.1 Desktop/Mobile Systems..........29
2.2 Dual Processor Systems..........30
2.3 Quad-Socket and 8-Socket Systems..........31
2.4 Large Scale System Architectures..........32
2.5 Profiles..........33
3 Physical Layer..........35
3.1 Physical Layer Overview..........35
3.2 Physical Layer Features for Desktop/Mobile Systems - UP Profile..........36
3.3 Physical Layer Features for Dual Processor Systems - DP Profile..........37
3.4 Physical Layer Features for 4 and 8 Socket Systems - Small MP Profile..........37
3.5 Physical Layer Features for Large Scale Systems - Large MP Profile..........38
3.6 Summary of Physical Layer Features..........39
3.7 Physical Layer Reset..........40
3.7.1 Link Power Up and Initialization Sequence..........40
3.7.2 Link-Up Identifier..........42
3.7.3 Physical Layer Clocking..........42
3.7.4 Cold Reset..........43
3.7.5 Inband Reset..........43
3.7.6 Soft Reset..........45
3.7.7 Two Stage Initialization..........45
3.7.8 Automatic Test Equipment (ATE) Initialization Mode..........47
3.8 Interface Between Physical Layer and Link Layer..........49
3.9 Logical Sub-Block..........50
3.9.1 Supported Link Widths..........50
3.9.2 Link Training Basics..........62
3.9.3 Logical Sub-block Finite State Machine..........68
3.9.4 Optional Low Power Modes – An Overview..........83
3.9.5 Link Low Power Modes..........86
3.9.6 Physical Layer Determinism Requirements..........98
3.9.7 Periodic Link Retraining..........100
3.9.8 Forwarded Clock Fail-Safe Mode – Small MP and Large MP Profiles..........100
3.9.9 Link Self-Healing – Large MP Profiles..........101
3.9.10 Support for Hot Detect – Small MP and Large MP Profiles..........101
3.9.11 Lane Reversal..........101
3.10 Physical Layer Register Interface..........105
3.10.1 CSI Physical Layer Mandatory Registers..........106
3.10.2 Optional Registers..........121
3.10.3 Electrical Parameter Registers (Examples Only)..........125
3.10.4 Testability Tool-box Registers (Examples Only)..........127
3.11 Electrical Sub-Block Specifications and Budgets..........130
3.12 Definition of Terms..........131
4 CSI Link Layer..........135
4.1 Message Class..........135
4.1.1 Required Base Message Classes..........137
4.2 Virtual Networks..........137
4.2.1 Base Virtual Network Requirements..........138
4.3 Credit/Debit Flow Control..........138
4.4 Link Layer Buffer/Credit Management..........139
4.5 Support For Link Layer Reliable Transmission..........139
4.6 Packet Definition..........140
4.6.1 Packet Format..........140
4.6.2 Packet Fields..........159
4.6.3 Mapping of the Protocol Layer to the Link Layer..........169
4.6.4 Width Reduction..........172
4.6.5 Organization of Packets on the Physical Layer..........173
4.7 Link Layer Control Messages..........173
4.7.1 Special Packet Format..........173
4.7.2 Null Ctrl Flit..........174
4.7.3 Link Level Retry Ctrl Flit..........175
4.7.4 Power Management Ctrl Flit..........175
4.7.5 System Management Ctrl Flit..........176
4.7.6 Parameter Exchange Ctrl Flit..........176
4.7.7 Sync Flit..........180
4.7.8 Error Indication..........180
4.7.9 Debug..........180
4.7.10 Idle Flit..........187
4.8 Flit Interleave..........187
4.8.1 Command Insert..........188
4.8.2 Scheduled Data Interleave (SDI)..........189
4.9 Transmission Error Handling..........189
4.9.1 Error Detection..........189
4.9.2 Error Recovery..........193
4.10 Link Layer Initialization..........200
4.11 Link Layer Required Registers..........203
4.11.1 CSILCP - CSI Link Capability Register..........203
4.11.2 CSILCL - CSI Link Control Register..........204
4.11.3 CSILS - CSI Link Status Register..........205
4.11.4 CSILP0 - CSI Link Parameter 0 Register..........206
4.11.5 CSILP1 - CSI Link Parameter 1 Register..........206
4.11.6 CSILP2 - CSI Link Parameter 2 Register..........206
4.11.7 CSILP3 - CSI Link Parameter 3 Register..........207
4.11.8 CSILP4 - CSI Link Parameter 4 Register..........207
4.12 Link Layer Rules and Requirements..........207
4.13 Open Issues..........207
5 Routing Layer..........209
5.1 Introduction..........209
5.2 Routing Rules..........209
5.3 Routing Step..........210
5.3.1 Router Table Simplifications..........212
5.4 Routing Algorithm..........213
5.5 Routing at Source and Destination Agents..........213
5.6 Routing Broadcast of Snoops..........213
5.7 Usage Models..........215
5.7.1 Flexible Interconnect Topologies..........215
5.7.2 Flexible Partition Management..........216
5.8 CSI Components’ Compatibility..........217
5.9 Configuration Space and Associated Registers..........217
5.10 Routing Packets Before Routing Table Setup..........218
5.11 Routing Table Setup after System Reset/Bootup..........218
5.12 Route Table Setup after Partition Reset..........221
5.12.1 Single Partition..........221
5.12.2 Partition with Route Through Components..........221
5.13 Implementation Notes..........221
5.14 Open Issues..........221
6 CSI Protocol Overview..........223
6.1 Protocol Messages..........223
6.2 Protocol Agents..........223
6.3 Transaction IDs..........224
6.4 Open Issues..........225
7 Address Decode..........227
7.1 CSI Addressing Model..........227
7.1.1 Types of Addresses..........227
7.1.2 Addressing Mechanism..........227
7.1.3 Classification of Address Regions..........229
7.1.4 Relationship Between Memory Attribute, Region Attribute and CSI Transactions..........232
7.1.5 Assumptions and Requirements on System Address Map..........233
7.1.6 CSI Addressing Model..........234
7.1.7 Addressing Model in a Partitioned System..........235
7.2 Address Decoder..........236
7.2.1 Generic Source Address Decoder..........237
7.2.2 Target Address Decoder at the Memory Agent..........240
7.3 NodeID Assignment and Address Subdivision..........241
7.3.1 NodeID Assignment..........241
7.3.2 Caching Agent Address Subdivision..........241
7.3.3 Home Agent Address Subdivision..........242
7.4 Address Decode Configurations..........242
7.5 Support for Advanced RAS Features..........243
8 CSI Cache Coherence Protocol..........245
8.1 Protocol Architecture..........245
8.1.1 Caching Agent..........246
8.1.2 Home Agent..........247
8.2 Protocol Semantics..........247
8.2.1 Coherent Protocol Messages..........247
8.2.2 Protocol Dependencies..........253
8.3 Caching Agent Interface..........256
8.3.1 Transaction Phases..........257
8.3.2 Coherence Domain..........257
8.3.3 Cache States..........258
8.3.4 Peer Caching Agent Responses to an Incoming Snoop During the Null Phase..........258
8.3.5 Peer Caching Agent’s Response to a Conflicting Snoop During the Request and Writeback Phases..........260
8.3.6 Peer Caching Agent’s Response to a Conflicting Incoming Snoop During the AckCnflt Phase..........260
8.3.7 Responding to Cmp_Fwd* or Cmp to End the AckCnflt Phase..........261
8.4 Source Broadcast Home Agent Algorithm..........262
8.4.1 Home Agent Architected State..........262
8.4.2 Interpreting Protocol Flow Diagrams..........263
8.4.3 Protocol Flows Illuminated..........263
8.4.4 Protocol Invariants..........272
8.4.5 Capturing Ordering..........274
8.4.6 Managing Conflict Lists..........276
8.4.7 Summary of the Home Agent Algorithm..........281
8.5 Scaling CSI With an Out-of-Order Network..........282
8.5.1 Directory Structure Requirements..........283
8.5.2 Home Agent Microarchitectural Constraints..........284
8.5.3 Simple Protocol Flows..........285
8.5.4 Home Agent Algorithm Overview..........287
8.5.5 Using Coarse Sharing Lists..........289
8.5.6 Protocol English Flows..........291
8.6 Application Notes..........296
8.6.1 Global Observation..........296
8.6.2 Flush Cache Operation..........296
8.6.3 Partial Write to Coherent Space..........296
8.7 Coherence Protocol Open Issues..........298
8.7.1 Arbitrary AckCnflt’s..........298
9 Non-Coherent Protocol..........299
9.1 Transaction List..........299
9.2 Protocol Layer Dependencies..........300
9.2.1 Requester Rules..........300
9.2.2 Target Rules..........302
9.3 Non-Coherent Memory Transactions..........303
9.3.1 Non-Coherent Write Transaction Flow..........303
9.3.2 Non-Coherent Read Transaction Flow..........306
9.3.3 “Don’t Snoop” Transaction Flow..........307
9.3.4 Length and Alignment Rules..........308
9.4 Peer-to-Peer Transactions..........309
9.5 Legacy I/O Transactions..........310
9.5.1 Legacy I/O Write Transaction Flow..........310
9.5.2 Legacy I/O Read Transaction Flow..........310
9.5.3 Addressing, Length and Alignment Rules..........311
9.6 Configuration Transactions..........311
9.6.1 Configuration Write Transaction Flow..........312
9.6.2 Configuration Read Transaction Flow..........313
9.6.3 Addressing, Length and Alignment Rules..........314
9.7 Secure Non-Coherent Transactions..........314
9.8 Broadcast Non-Coherent Transactions..........314
9.8.1 Broadcast Dependency Lists..........315
9.8.2 Broadcast Mechanism..........316
9.8.3 Broadcast Ordering..........316
9.8.4 Scaling to Large Systems..........316
9.9 Interrupts and Related Transactions..........317
9.10 Non-Coherent Messages..........317
9.10.1 Legacy Platform Interrupt Support..........320
9.10.2 Power Management Support..........321
9.10.3 Synchronization Messages..........321
9.10.4 Virtual Legacy Wire (VLW) Transactions..........323
9.10.5 Special Cycle Transactions..........326
9.10.6 Atomic Access (Lock)..........327
9.11 Non-Coherent Registers List..........332
10 Interrupt and Related Operations..........335
10.1 Overview..........335
10.1.1 Interrupt Model for Itanium®-Based Systems..........336
10.1.2 Interrupt Model for IA-32 Processor Family-Based Systems..........336
10.2 Interrupt Delivery..........339
10.2.1 Interrupt Delivery Assumptions..........342
10.2.2 Interrupt Redirection..........343
10.2.3 Interrupt Delivery for Itanium® Processor-Based Systems..........345
10.2.4 Interrupt Delivery for IA-32-Based Systems..........346
10.3 Level Sensitive Interrupt and End Of Interrupt..........349
10.4 Miscellaneous Interrupts and Events..........350
10.4.1 8259A Support..........350
10.4.2 INIT..........350
10.4.3 NMI..........350
10.4.4 SMI..........350
10.4.5 PMI..........350
10.4.6 PCI INTA - INTD and PME..........351
10.5 Interrupt Related Configuration..........351
10.6 Reference Documents..........351
11 Fault Handling..........353
11.1 Definitions..........353
11.2 Error Classification..........353
11.3 Error Reporting..........354
11.3.1 Error Reporting Mechanisms..........354
11.3.2 Error Reporting Priority..........358
11.4 Fault Diagnosis..........358
11.4.1 Hierarchical Transaction Timeout..........358
11.4.2 Error Logging Guidelines..........361
11.5 Error Containment in Partitioned Systems..........361
11.5.1 Error Propagation in Partitioned Systems..........361
11.5.2 Error Containment Through Packet Elimination..........362
12 ..........367
12.1 Introduction..........367
12.2 CSI Reset Domains..........367
12.2.1 CSI Physical Layer and Lower Link Layer Reset Domain..........368
12.2.2 CSI Upper Link Layer Reset Domains..........369
12.2.3 Routing Layer or Crossbar Reset Domain..........370
12.3 Signals Involved in Reset..........372
12.3.1 PWRGOOD Signal..........372
12.3.2 RESET Signal..........372
12.3.3 CLOCK Signals..........372
12.3.4 Other Configuration Signals..........373
12.4 Initialization Timeline..........374
12.5 Firmware Classification..........375
12.5.1 Routing of Firmware Accesses..........375
12.6 Link Initialization..........376
12.6.1 Link Initialization Options..........376
12.6.2 Exchange of System/Socket Level Parameters..........377
12.7 System BSP Determination..........378
12.8 CSI Component Initialization Requirements..........379
12.8.1 Support for Fabric Initialization..........379
12.8.2 Programming of CSI Structures..........380
12.9 Support for On-Line Addition..........383
12.10 Support for Partition Reset..........384
12.11 Hardware Requirements..........384
12.12 Configuration Space and Associated Registers..........385
13 System Management Support..........387
13.1 Introduction..........387
13.2 Configuration Address Space..........388
13.3 Configuration Access Mechanisms..........388
13.3.1 CSI Configuration Agent..........389
13.3.2 JTAG and SMBus..........389
13.3.3 MMCFG and CF8/CFC..........390
13.4 Protected Firmware..........390
13.4.1 Configuration Management Mode (CM Mode)..........391
13.4.2 IA-32 Processor System Management Mode (SMM)..........396
14 Dynamic Reconfiguration..........399
14.1 Introduction..........399
14.2 Partitioning Models..........399
14.2.1 Hard Physical Partitioning (HPPAR)..........400
14.2.2 Firm Physical Partitioning (FPPAR)..........400
14.2.3 Logical or Software Partitioning (LPAR)..........401
14.2.4 Virtual Partitioning (VPAR)..........402
14.3 OL_* Support..........402
14.3.1 Implementation Dependent Quiescence/De-Quiescence..........403
14.3.2 Flows..........404
14.3.3 Assumptions/Requirements..........407
14.3.4 Configuration Space and Associated Registers..........408
14.3.5 Need for a Quiesce During OL_* Events..........409
14.4 Use of System Service Processor during OL_* Operations..........409
14.5 On Line Addition of a Node..........
14.5.1 Online Addition of a Processor Node (With or Without Other Agents)..........412
14.5.2 Online Addition of a Memory only Node..........414
14.5.3 Online Addition of an I/O Hub Node only..........415
14.6 On Line Deletion of a Node..........416
14.6.1 On Line Deletion of a Processor Node..........416
14.6.2 On Line Deletion of a Memory Node..........418
14.6.3 On Line Deletion of an I/O Hub Node..........419
14.7 Multi-Partition Management with Shared Interconnect..........420
14.7.1 Restricted Option..........420
14.7.2 Restricted Option - Variant..........421
14.7.3 Flexible Option..........422
14.8 Support for Sub-Socket Partitioning..........424
14.8.1 Sub-Socket Partitioning via Node ids..........424
14.8.2 Sub-Socket Partitioning via Firm Partition ID..........424
14.9 Memory RAS..........425
14.9.1 Memory Migration..........425
14.9.2 Memory Mirroring..........428
14.10 Hardware Requirements, Etc...........431
14.11 Implementation Notes..........432
14.12 Open Issues/Notes..........432
14.13 List of Acronyms Used..........433
15 Power Management..........435
15.1 Link Power Management..........435
15.1.1 Link Power States..........435
15.1.2 L0s Link State..........436
15.1.3 L1 Link State..........438
15.1.4 L2 Link State..........439
15.1.5 Link Width Modulation..........439
15.2 Platform Power Management..........441
15.2.1 Platform Power States..........441
15.2.2 P, T, and C-State Coordination..........442
15.2.3 S-State Coordination..........451
15.3 Power Management Related Messages..........454
15.3.1 Platform Power Management Messages..........454
15.3.2 Link Power Management Messages..........455
16 Quality of Service and Isochronous Operations..........459
16.1 Quality of Service (QoS)/Isochronous Platform Requirements..........459
16.1.1 Legacy ISOC..........459
16.1.2 PCI-Express* ISOC..........459
16.1.3 Integrated Graphics ISOC Services..........460
16.1.4 QoS Extensions - Compatible w/ PCI-Express..........461
16.2 ISOC - Message Classes, and Traffic Classes..........461
16.2.1 Message Class Definition..........461
16.2.2 Traffic Class Definition..........461
16.2.3 Mapping ISOC Transactions to ICS and IDS..........462
16.3 Link Layer Packet Fields and ISOC Support..........462
16.4 Link Layer - QoS Packet Extensions..........463
16.5 Usage Models of Isochronous Traffic in Current Platforms..........464
16.6 ISOC/QoS Support Restrictions..........465
17 ..........467
17.1 LaGrande Technology Background Information..........467
17.2 Secure Launch in CSI Systems..........468
17.2.1 Simple CSI Systems..........468
17.2.2 Complex CSI Systems..........469
17.3 Link Initialization Parameters..........469
17.4 Interprocessor Communication: LT Link Layer Messages..........469
17.5 Processor-to-Chipset Communication: Protocol Layer Messages..........470
18 Design for Test and Debug..........473
18.1 Introduction..........473
18.2 Design For ATE-Based Testing and Debugging Through CSI..........473
18.2.1 Tester Assumptions..........473
18.2.2 Basic Requirement: Determinism..........474
18.2.3 Supporting the HVM Test Flow and Tester Fleet..........476
18.2.4 Debug “Through” CSI – Debugging Processor or Chipset via CSI Interface..........477
18.2.5 Debug and Test of the Logic Associated with CSI..........478
18.2.6 Desktop Processor Specific Requirements..........478
18.2.7 Debug of HVM Patterns..........478
18.2.8 Summary..........479
18.3 Component and System DV/EV/AnV..........479
18.3.1 CSI Component and System DV/EV/AnV Requirements..........480
18.3.2 Tx Characterization..........481
18.3.3 Rx Characterization..........481
18.3.4 Interconnect Characterization..........482
18.3.5 Link Characterization..........482
18.3.6 CSI Link Debug for DV/EV/AnV..........483
18.4 CSI Phy Layer DFx Tools..........483
18.4.1 Introduction..........483
18.4.2 Definitions..........484
18.4.3 Reset Sequence..........485
18.4.4 CSI Loopback..........485
18.4.5 Loopback Modes..........486
18.4.6 Local vs. Remote Loopback..........488
18.4.7 Loopback Test Sequence..........489
18.4.8 Loopback Entry..........489
18.4.9 Loopback Control Register..........491
18.4.10 Loopback Status Register..........495
18.4.11 Loopback Exit..........495
18.4.12 CSI Determinism..........497
18.4.13 Repeater Requirements..........500
18.4.14 CSI Eye Margining..........501
18.4.15 Eye Width Adjust – Transmitter..........504
18.4.16 Eye Height Adjust – Receiver..........506
4.0.1 Eye Width Adjust – Receiver..........507
18.4.17 Structural Tests..........508
18.5 Pin Leakage Testing - Transmitter and Receiver..........509
18.6 CSI Post-Si System Debug Requirements..........509
18.6.1 System Debug Requirements..........509
A Glossary..........515
A.2 List of Acronyms..........516
B CSI Profile Attributes..........519
B.1 CSI Profile Attributes..........519
C Future Extensions - Transport Layer..........525
C.1 Introduction..........525
C.2 Reliable, End-to-End Transmission..........525
C.3 CSI Support for Reliable Transmission..........526
C.3.1 Routing..........527
C.3.2 Sequence Number..........527
C.3.3 Transport Layer CSI Transactions..........528
C.3.4 Sender Node ID..........528
C.3.5 No Time-Out Field..........529
C.4 Usage Models..........529
C.5 CSI Components’ Responsibilities and Other Implementation Issues..........530
C.6 Notes, Comments for Later Revisions..........531
D Future Extensions - PTC.G..........533
D.1 PurgeTC Special Transaction..........533
D.1.1 Purge TC Messages and Resource Requirements..........533
D.1.2 Purge TC Transaction Flow..........534
D.2 CSI Component Initialization Requirements..........535
D.2.1 Programming of CSI Structures..........536
D.2.2 Online Addition of a Processor Node (With or Without Other Agents)..........536
D.2.3 On Line Deletion of a Processor Node..........536
D.3 Open Issues/Notes..........536
E Post Silicon Validation..........537
E.1 Post-Si Validation for CSI..........537
E.1.1 Summarized List of CSI Post-Si Validation Requirements..........537
E.1.2 CSI Monitoring Events..........538
E.1.3 Event Counters..........540
E.1.4 Error Injection...................................................................................................541 E.1.5 Diagnostic Information Registers ...................................................................542 E.1.6 Programmable Configuration Overrides .........................................................543 E.1.7 Programmable Timer/Counter Values ............................................................543 E.1.8 Event Injection .................................................................................................543 E.1.9 CSI HUB-Based System Validation Concept...................................................544 E.2 Further Information (for Intel use only) .........................................................................550 E.3 DF Manufacturing Reference..........................................................................................551 E.4 Tester DV Further information........................................................................................551 F An Implementation Agnostic Model of CSI 2-Hop Source Broadcast Coherence .....................................................................................................................................553 F.1 Introduction .....................................................................................................................553 F.2 What CSI-IAM does and does not cover.........................................................................554 F.3 Components of CSI-IAM ................................................................................................554 F.4 Data Type Declarations ...................................................................................................554 F.5 The Initial State of the System ........................................................................................558 F.6 Invariants .........................................................................................................................559 Ref No xxxxx Intel Restricted Secret 1 F.8 Protocol Tables and Their Semantic Mappings.............................................................. 561 F.9 Utility Sub-Routines ....................................................................................................... 588 F.10 A C Reference Model Derived from CSI-IAM .............................................................. 595 F.10.1 Configuration parameters................................................................................. 595 F.10.2 Data Type Declarations.................................................................................... 596 F.10.3 API Functions................................................................................................... 597 G An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence..................................................................................................................... 601 G.1 Introduction .................................................................................................................... 601 G.2 What CSI-IAM Does and Does Not Cover .................................................................... 602 G.3 Components of CSI-IAM ............................................................................................... 602 G.3.1 IAM Component Details .................................................................................. 
602 G.4 Data Type Declaration.................................................................................................... 604 G.5 The Initial State of the System ....................................................................................... 608 G.6 The Invariants ................................................................................................................. 611 G.7 Actions and Their Parameters......................................................................................... 612 G.8 Utility Sub-Routines ....................................................................................................... 662 Figures 1-1 Hierarchical Ordering of CSI Interface Layers ................................................................ 23 1-2 CSI Interface Layer Details (Routing and Transport Layers Not Shown) ....................... 24 1-3 CSI Interface Layer Details (Transport Layer Not Shown).............................................. 25 2-1 Schematic of an Intel® Itanium® Processor with CSI-Based Links Interface.................. 29 2-2 CSI-Based Uniprocessor Systems .................................................................................... 30 2-3 CSI-Based Dual Processor Systems ................................................................................. 30 2-4 4-Socket CSI-Based Platform........................................................................................... 31 2-5 4-Socket and 8-Socket CSI Systems ................................................................................ 31 2-6 Large Scale “Flat” Architecture........................................................................................ 32 2-7 Scalable Hierarchical System with OEM “Node Controllers”......................................... 33 3-1 CSI Layer Hierarchy......................................................................................................... 35 3-2 Physical Layer Power Up and Initialization Sequence – An Example............................. 41 3-3 Inband Reset Sequence Initiated by Port A to Port B....................................................... 44 3-4 Relationship between Phase Interpolator Training Pattern and Forwarded Clock Phase dur ing First Initialization Stage46 3-5 Interface Between Physical Layer and Link Layer – An Example .................................. 49 3-6 Mux Scheme for Link Width Support .............................................................................. 53 3-7 Physical Bit Swizzling...................................................................................................... 55 3-8 Sequence of Events for Acquiring Handshake Attributes ................................................ 64 3-9 State Transition Using Handshake Attributes .................................................................. 65 3-10 Logical Sub-block State Diagram..................................................................................... 68 3-11 Detect Sub-States.............................................................................................................. 70 3-12 Polling Sub-states ............................................................................................................. 74 3-13 Computing Lane-to-Lane Deskew – An Example ........................................................... 75 3-14 Config Sub-States ............................................................................................................. 
79 3-15 Logical Sub-block State Diagram with Optional Low Power Modes .............................. 83 3-16 L0s Entry Sequence ..........................................................................................................87 3-17 L0s Exit Sequence ............................................................................................................ 89 3-18 Link Width Modulation Sequence.................................................................................... 93 Ref No xxxxx Intel Restricted Secret 7 3-20 Link Formed with a Straight Connection (No Lane Reversal Required)........................102 3-21 Daughter Card Topology - An Example .........................................................................102 3-22 Lane Reversal – An Example..........................................................................................103 3-23 Routing Guidelines for a Bifurcated Port using Lane Reversal on Both Halves ............104 3-24 Routing Guidelines for a Bifurcated Port Using Straight Connections on Both Halves.105 4-1 Special Packet Interleave Example .................................................................................188 4-2 Command Insert Interleave Example ..............................................................................189 4-3 Rolling CRC Scheme ......................................................................................................191 4-4 Error Detection on the Received flit Using Rolling CRC ...............................................191 4-5 Retry Queue and Related Pointers...................................................................................195 5-1 Routing Layer Functionality – 1......................................................................................211 5-2 Routing Layer Functionality – 2......................................................................................212 5-3 Abstract Structure of the Routing Table..........................................................................212 5-4 Illustrating Firmware Hub Connectivity Options............................................................219 5-5 Route Table Setup Using Breadth First Order ................................................................220 7-1 View of Types of Addresses in the System.....................................................................228 7-2 Itanium® Processor and IA-32 Addressing Models ........................................................234 7-3 Source Address Decoder at Requesting Agent................................................................237 7-4 Target Address Decoder at a Memory Agent..................................................................240 8-1 Protocol Architecture.......................................................................................................245 8-2 Caching Agent Architected State ....................................................................................246 8-3 A Visual Representation of Dependencies Within a Protocol Channel ..........................254 8-4 Home Agent Architected State........................................................................................262 8-5 Protocol Flow Legend .....................................................................................................263 8-6 Uncached RdData Request..............................................................................................264 8-7 Cached RdInvOwn Request 
............................................................................................265 8-8 Standard Writeback Flow................................................................................................265 8-9 Generating a RspCnflt on a conflicting incoming Snoop................................................266 8-10 Sending an AckCnflt Due to a Conflicting Snoop ..........................................................267 8-11 Conflict Case Requiring FrcAckCnflt Flow....................................................................268 8-12 Conflict Case Continued from Figure 8-9 and Figure 8-10 ............................................269 8-13 WbMtoE Conflict ............................................................................................................270 8-14 WbMtoI Conflict .............................................................................................................271 8-15 Buried HITM Flow..........................................................................................................272 8-16 RspFwd Ordering Required.............................................................................................275 8-17 Writeback Ordering At the Home Agent.........................................................................276 8-18 Case Requiring a FrcAckCnflt to Resolve ......................................................................278 8-19 RdData Request Fetching an E-State Line and Setting Dir State....................................285 8-20 RdInvOwn Causing Invalidation of S-State Copies........................................................286 8-21 RdInvOwn Request HITM ..............................................................................................286 8-22 WbIData Arriving – We Discard Any WbMto* Message ..............................................287 8-23 Early Conflict Resolved by Detecting Request from Agent on Sharing List..................288 8-24 Late Conflict Resolved by Waiting for an AckCnflt.......................................................288 8-25 Buried HITM Case ..........................................................................................................289 8-26 Using the FrcAckCnflt/AckCnflt Handshake for a RdCode in Coarse Sharing .............290 8-27 Transiting from Explicit Sharers to Coarse Sharing........................................................291 8-28 Partial write to coherent space, Hit M.............................................................................297 8-29 Partial Write to Coherent Space, Conflict Case ..............................................................298 9-1 Non-Coherent Write Transaction Flow...........................................................................303 9-2 Non-Coherent Write Combinable Write Transaction Flow ............................................306 9-4 Legacy I/O Write Transaction Flow............................................................................... 310 9-5 Legacy I/O Read Transaction Flow................................................................................ 311 9-6 Configuration Write Transaction Flow........................................................................... 313 9-7 Configuration Read Transaction Flow ........................................................................... 313 9-8 Non-coherent Broadcast Example (IntPhysical) ............................................................
316 9-9 Example Lock Flow........................................................................................................ 328 10-1 Interrupt Architecture Overview..................................................................................... 335 10-2 Address encoding in IntPhysical and IntLogical Requests ............................................ 340 10-3 Data field of IntPhysical and IntLogical Requests ......................................................... 340 10-4 Address field of IntPrioUpd Request.............................................................................. 344 10-5 Data field of IntPrioUpd Request ................................................................................... 345 10-6 Data field of NcMsgBEOI Request ................................................................................. 350 11-1 Illustration of Error Propagation Across Partitions ........................................................ 362 11-2 CSI Message Class Hierarchy ........................................................................................ 363 12-1 Reset Domains in CSI Components ............................................................................... 368 12-2 Example System Topology Diagram.............................................................................. 380 13-1 Logical View of Access Paths to CSI Configuration Registers ..................................... 387 13-2 Address Conversion Rules between Core & CSI Addresses (Small MP) ...................... 393 13-3 Address Conversion Rules between Core & CSI Addresses (Large MP)...................... 394 13-4 Legacy SMM Memory Layout ....................................................................................... 396 13-5 IA-32 SMM Memory Layout in a CSI-Based System ................................................... 397 14-1 Hard Physical Partitioning Example............................................................................... 400 14-2 Firm Physical Partitioning Example............................................................................... 401 14-3 Logical Partitioning Example......................................................................................... 401 14-4 Virtual Partitioning Example.......................................................................................... 402 14-5 Illustrating Addition of a Node to a Running System .................................................... 411 14-6 Illustrating Removal of a Node from a Running System ............................................... 416 14-7 Multi-Partition Management - Restricted Option........................................................... 421 14-8 Multi-Partition Management - Restricted Option-Variant ............................................. 422 14-9 Multi-Partition Management - Flexible Option.............................................................. 423 14-10 Mirroring Support for Migration: Wt-Wt and Rd-Wt Mirroring ................................... 427 14-11 PMI/SMI Generation Sequence During OL_A Events .................................................. 432 15-1 Simple Lower Power State Example (Incomplete) ........................................................ 444 15-2 Lowering Power State Attempt With 1 Node Retry....................................................... 446 15-3 Lowering Power State With 2 Node Retry.....................................................................
447 15-4 Increasing from C4 to C0 State and Induced Retries on 2 Nodes .................................. 448 15-5 Conflict Example - Request Passes Own Response........................................................ 450 15-6 S-State Entry Example.................................................................................................... 452 17-1 Transitive Trust Model ................................................................................................... 468 17-2 LT Link Layer Messages................................................................................................ 470 18-1 CSI Link Generic Diagram............................................................................................. 484 18-2 Segregated vs. Integrated Transceiver Floor Plans in Silicon ........................................ 486 18-3 Loopback Modes in CSI ................................................................................................. 487 18-4 Local vs. Remote Loopback in CSI................................................................................ 488 18-5 Loopback Entry Timing Diagram................................................................................... 490 18-6 Loopback Entry Flow Diagram ...................................................................................... 491 18-7 Slave Agent – Receiver Input Common Mode Override ............................................... 492 18-8 Master Agent – Receiver Strobe Override...................................................................... 492 18-9 Slave Agent – Receiver Strobe Override........................................................................ 493 18-10 Master Agent – Transmitter Driver Current Override....................................................
493 18-11 Slave Agent – Transmitter Driver Current Override ......................................................494 18-12 A Basic And Minimal Pattern Buffer Architecture.........................................................495 18-13 Loopback Exit Timing Diagram......................................................................................496 18-14 Loopback Exit Flow Diagram .........................................................................................497 18-15 Example of Clock Synthesis............................................................................................498 18-16 System Level Determinism Using Counters ...................................................................499 18-17 CSI Flit Synchronization to the Tester ............................................................................500 18-18 Transmitter Eye Height Adjust Using the Equalizer.......................................................503 18-19 Transmitter Eye Height Adjust Using I-Comp Settings..................................................504 18-20 Transmitter Eye Width Adjust Using “Jitter Injection” ..................................................505 18-21 Transmitter Eye Width Adjust Using “Jitter Injection” Control Register.......................506 18-22 Receiver Eye Height Adjust Control Register.................................................................507 18-23 Receiver Eye Width Adjust by Overriding the PI Control Register ...............................508 C-1 Concept of Transport Layer Retry...................................................................................526 C-2 Interfacing of Components with and without Transport Layer.......................................530 D-1 Purge TC Transaction Flow.............................................................................................535 E-1 Histogram for FSB In-Order Queue................................................................................539 E-2 General Validation Structure...........................................................................................545 E-3 HVA Layered Architecture .............................................................................................546 E-4 HVA Data Link Level Traffic .........................................................................................547 E-5 HVA Data Link Layer Structure .....................................................................................548 E-6 HVA PHY Initialization Behavior ..................................................................................549 E-7 HVA in the Multi-linked System ....................................................................................549 E-8 HVA Implementation Structure ......................................................................................550 Tables 3-1 Physical Layer Features Supported in each CSI Profile....................................................39 3-2 Inband Reset Events for Figure 3-3...................................................................................44 3-3 ATE Initialization Mode - ATE Tx and DUT Rx .............................................................48 3-4 ATE Initialization Mode - ATE Rx and DUT Tx .............................................................48 3-5 Flit Format.........................................................................................................................50 3-6 Flit Format and Phit Order – Full Width Link
..................................................51 3-7 Flit Format and Phit Order – Half Width Link..................................................................51 3-8 Flit Format and Phit Order – Quarter Width Link.............................................................51 3-9 Physical Pin Numbering and Clock Position on a Link with 20 Lanes.............................55 3-10 Link Map for Supported Link Widths...............................................................................56 3-11 Examples of Width Capability Indicator (WCI) ...............................................................57 3-12 CRC and Side-band Fields – Full Width Link ..................................................................58 3-13 CRC and Side-band Fields – Half Width Link ................................................................58 3-14 CRC and Side-band Fields – Quarter Width Link.............................................................58 3-15 Pins Depopulated on Narrow Physical Interfaces .............................................................59 3-16 Narrow Physical Interface - Pin Map and Internal Representation...................................60 3-17 Summary of Narrow Physical Interfaces...........................................................................60 3-18 Physical Pin Numbering and Clock Position on a Link with 10 Lanes.............................61 3-19 Pin Map for Implementations Supporting Port Bifurcation ..............................................61 3-20 Training Sequence (TSx) Format ......................................................................................62 3-21 Summary of Handshake Attributes ...................................................................................63 3-22 Link Initialization Time Out Values..................................................................................67 3-23 Summary of "Disable/Start" state......................................................................................69 3-24 Summary of Detect.1 Sub-State ........................................................................................71 3-26 Summary of Detect.3 Sub-State ....................................................................................... 73 3-27 Summary of Polling.1 Sub-State ...................................................................................... 74 3-28 Description of TS2 Training Sequence ............................................................................ 75 3-29 Summary of Polling.2 Sub-State ...................................................................................... 76 3-30 Description of TS3 Training Sequence ............................................................................ 76 3-31 Summary of Polling.3 Sub-State ...................................................................................... 78 3-32 Description of TS4 Training Sequence ............................................................................ 79 3-33 Summary of “Config.1” State........................................................................................... 80 3-34 Summary of “Config.2” State........................................................................................... 81 3-35 Description of TS5 Training Sequence ............................................................................
82 3-36 Summary of “L0” State .................................................................................................... 83 3-37 Summary of Extended L0 State with Low Power Support .............................................. 84 3-38 Summary of L0s State ...................................................................................................... 85 3-39 Summary of L1 State........................................................................................................ 86 3-40 L0s Entry Events and Timers ........................................................................................... 87 3-41 L0s Exit Events and Timers.............................................................................................. 90 3-42 Link Width Modulation Events and Timers ..................................................................... 94 3-43 L1 Entry and Exit Events/Timers ..................................................................................... 97 3-44 Register Attribute Definitions ........................................................................................ 106 3-45 CSIPHCPR0: Physical Layer Capability Register 0 ...................................................... 106 3-46 CSIPHCPR1: Physical Layer Capability Register 1 ...................................................... 107 3-47 CSIPHCTR: Physical Layer Control Register................................................................ 108 3-48 CSIPHTDC: Tx Data Lane Control Register ................................................................. 109 3-49 CSIPHTDS: Tx Data Lane Termination Detection Status Register............................... 110 3-50 CSIPHRDC: Rx Data Lane Control Register................................................................. 110 3-51 CSIPHRDS: Rx Data Lane RxReady Status Register.................................................... 111 3-52 CSIPHPIS: Physical Layer Initialization Status Register............................................... 111 3-53 CSIPHPPS: Physical Layer Previous Initialization Status Register............................... 113 3-54 State Tracker Encoding .................................................................................................. 115 3-55 CSIPHWCI: Width Capability Indicator (WCI) Register .............................................. 116 3-56 CSIPHLMS: Lane Map Status Register ......................................................................... 116 3-57 CSIPHPLS: Physical Layer Link Status Register .......................................................... 116 3-58 CSIPHITV0: Initialization Time-Out Value Register 0 ................................................. 117 3-59 CSIPHITV1: Initialization Time-Out Value Register 1 ................................................. 118 3-60 CSIPHITV2: Initialization Time-Out Value Register 2.................................................. 118 3-61 CSIPHITV3: Initialization Time-Out Value Register 3 ................................................. 118 3-62 CSIPHITV4: Initialization Time-Out Value Register 4 ................................................. 119 3-63 CSIPHLDC: Link Determinism Control Register.......................................................... 119 3-64 CSIPHLDS: Link Determinism Status Register ............................................................. 120 3-65 CSIPHPRT: Periodic Retraining Timer Register ...........................................................
120 3-66 CSIPHDDS: Link Determinism Drift Buffer Status Register ........................................ 121 3-67 CSIPHPMR0: Power Management Register 0............................................................... 121 3-68 CSIPHPMR1: Power Management Register 1............................................................... 122 3-69 CSIPHPMR2: Power Management Register 2 ............................................................... 123 3-70 CSIPHPMR3: Power Management Register 3............................................................... 124 3-71 CSIPHPMR4: Power Management Register 4............................................................... 125 3-72 CSITCR: Termination Control Register......................................................................... 125 3-73 CSIETE: Equalization Tap Enable Register................................................................... 126 3-74 CSIECR0: Equalization Coefficient Register 0.............................................................. 126 3-75 CSIECR1: Equalization Coefficient Register 1.............................................................. 126 3-77 CSIRLR[0-19]: RX Lane Register n ...............................................................................127 3-78 CSILCR: Loopback Control Register .............................................................................127 3-79 CSILLMC: Loop-Back Lane Mask Control Register .....................................................128 3-80 CSILMRC: Loop-Back Master Receiver Control Register.............................................128 3-81 CSILMTC: Loop-Back Master Transmitter Control Register ........................................128 3-82 CSILSRC: Loop-Back Slave Receiver Control Register ................................................128 3-83 CSILSTC: Loop-Back Slave Transmitter Control Register............................................129 3-84 CSILPR0: Loop-Back Pattern Register 0........................................................................129 3-85 CSILPR1: Loop-Back Pattern Register 1........................................................................129 3-86 CSILPI: Loop-Back Pattern Invert Register....................................................................129 3-87 CSILSR: Loop-Back Status Register...............................................................................129 3-88 CSILSP0: Loop-Back Status Pattern Register 0 .............................................................130 3-89 CSILSP1: Loop-Back Status Pattern Register 1 .............................................................130 3-90 CSILSLF: Loop-Back Status Lane Failure Register.......................................................130 3-91 Physical Layer Glossary..................................................................................................131 4-1 Message Classes, Abbreviations and Ordering Requirements........................................136 4-2 Standard Address, SA UP/DP .........................................................................................140 4-3 Standard Address, SA SMP.............................................................................................141 4-4 Standard Coherence Address, SCA UP/DP.....................................................................141 4-5 Standard Coherence Address, SCA SMP........................................................................141 4-6 Standard Coherence, SCC
UP/DP...................................................................................142 4-7 Standard Coherence, SCC SMP ......................................................................................142 4-8 Standard Complete With Data, SCD UP/DP...................................................................142 4-9 Standard Complete With Data, SCD SMP......................................................................143 4-10 Extended Address, EA UP/DP ........................................................................................143 4-11 Extended Address, EA SMP............................................................................................144 4-12 Extended Address, EA LMP ...........................................................................................144 4-13 Extended Coherence Address, ECA LMP.......................................................................145 4-14 Extended Coherence No Address, ECC LMP.................................................................145 4-15 Extended Complete with Data LMP................................................................................146 4-16 Non-Coherent Message, NCM UP/DP............................................................................147 4-17 Non-Coherent Message, NCM SMP ...............................................................................148 4-18 Non-Coherent Message, NCM LMP...............................................................................149 4-19 3 Flit EIC format UP/DP .................................................................................................150 4-20 3 Flit EIC format SMP ....................................................................................................151 4-21 3 Flit EIC format LMP ....................................................................................................152 4-22 Standard Data Response, SDR UP/DP............................................................................152 4-23 Standard Data Response, SDR SMP ...............................................................................153 4-24 Standard Data Write, SDW UP/DP.................................................................................153 4-25 Standard Data Write, SDW SMP ....................................................................................153 4-26 Extended Data Response, EDR LMP..............................................................................154 4-27 Extended Data Write, EDW LMP...................................................................................154 4-28 Extended Byte Enable Data Write, EBDW UP/DP.........................................................155 4-29 Extended Byte Enable Data Write, EBDW SMP............................................................156 4-30 Extended Byte Enable Data Write, EBDW LMP............................................................157 4-31 Data Flit Format, DF .......................................................................................................157 4-32 Peer-to-Peer Tunnel SMP................................................................................................158 4-33 Peer-to-Peer Tunnel LMP................................................................................................159 4-34 Packet Length Encoding UP/DP/SMP ............................................................................160 4-35 Packet Length Encoding 
LMP.........................................................................................160 4-37 Message Class Encoding SMP/LMP.............................................................................. 162 4-38 Virtual Network Encoding.............................................................................................. 162 4-39 VC Credit Field Encoding UP/DP.................................................................................. 163 4-40 VC Credit Field Encoding SMP/LMP............................................................................ 164 4-41 Ack Field Encoding ........................................................................................................ 165 4-42 Scheduled Data Interleave Encoding.............................................................................. 166 4-43 Transfer Size Encoding .................................................................................................. 166 4-44 Special Cycle Encoding - 6b -PL ................................................................................... 167 4-45 Response Status - 2b -PL................................................................................................ 168 4-46 Response Data State - 4b - PL ........................................................................................ 168 4-47 Response Data State Encoding ....................................................................................... 168 4-48 Mapping of the Protocol Layer to the Link Layer UP/DP/SMP/LMP ........................... 169 4-49 Generic form for Special Packet, ISP............................................................................. 173 4-50 Opcode Encoding for Special Packet ............................................................................. 174 4-51 Null Ctrl Flit ................................................................................................................... 174 4-52 Link Level Retry Messages ............................................................................................ 175 4-53 Power Management Ctrl Flit .......................................................................................... 175 4-54 Power Management Link Messages ............................................................................... 176 4-55 Parameter Exchange Messages....................................................................................... 176 4-56 PE.Parameter0 ................................................................................................................ 177 4-57 PE.Parameter1 ................................................................................................................ 177 4-58 PE.Parameter2 ................................................................................................................ 178 4-59 PE.Parameter3 ................................................................................................................ 179 4-60 PE.Parameter4 ................................................................................................................ 180 4-61 Standard Debug Messages.............................................................................................. 181 4-62 Generic Debug Ctrl Flit ..................................................................................................
181 4-63 Inband Debug Event Ctrl Flit ......................................................................................... 182 4-64 Debug Relative Timing Exposure Ctrl Flit..................................................................... 185 4-65 Idle Special Packet, ISP.................................................................................................. 187 4-66 CRC Computation - Full Width...................................................................................... 192 4-67 CRC Computation - Half Width..................................................................................... 192 4-68 CRC Computation - Quarter Width................................................................................ 192 4-69 Control Messages and Their Effect on Sender and Receiver States............................... 196 4-70 Remote Retry State Transitions...................................................................................... 196 4-71 Local Retry State Transitions ......................................................................................... 198 4-72 Description of Send Controller....................................................................................... 199 4-73 Processing of Received Flit ............................................................................................ 200 4-74 Link Init and Parameter Exchange State Machine ......................................................... 201 4-75 CSILCP Format .............................................................................................................. 203 4-76 CSILCL .......................................................................................................................... 204 4-77 CSILS ............................................................................................................................. 205 4-78 CSILP0 ........................................................................................................................... 206 4-79 CSILP1 ........................................................................................................................... 206 4-80 CSILP2 ........................................................................................................................... 207 4-81 CSILP3 ........................................................................................................................... 207 4-82 CSILP4 ........................................................................................................................... 207 5-1 Combinations of Protocol Options ................................................................................. 215 5-2 Routing Layer Needs for Different Usage Models......................................................... 216 5-3 Interfacing CSI Components with Different VNs .......................................................... 217 5-4 CSI Control and Status Registers Needed by the Routing Layer ................................... 
217 7-1 Characteristics of CSI Address Regions..........................................................................231 7-2 Allowed Attribute Combinations for Decode Register Entries.......................................232 8-1 Message Name Abbreviations.........................................................................................248 8-2 Message Field Explanations............................................................................................248 8-3 Snoop Channel Messages................................................................................................248 8-4 Home Channel Request Messages...................................................................................249 8-5 Home Channel Writeback Messages...............................................................................249 8-6 Home Channel Snoop Responses....................................................................................250 8-7 Home Channel AckCnflt Message ..................................................................................251 8-8 Response Channel Data Messages ..................................................................................252 8-9 Response Channel Grant Messages.................................................................................252 8-10 Response Channel Completions and Forces....................................................................253 8-11 Permitted Message Dependencies in CSI........................................................................255 8-12 Cache States.....................................................................................................................258 8-13 Required Cache State for Request Types ........................................................................258 8-14 A Peer Caching Agent’s Response to an Incoming Snoop .............................................259 8-15 Peer Caching Agent’s Response to a Conflicting Incoming Snoop During Request Phase, before DataC_*/GntE Response ............................................260 8-16 Cmp_Fwd* State Transitions ..........................................................................................261 8-17 Useful definitions ............................................................................................................272 8-18 Home Agent Responses, No Implicit Forward, Null Conflict List .................................279 8-19 Home Agent Responses, No Implicit Forward, Non-Null Conflict List.........................279 8-20 Cmp_Fwd* Types Sent to the Owner .............................................................................280 8-21 Example Directory Format..............................................................................................283 9-1 Non-Coherent Message Name Abbreviations.................................................................299 9-2 Non-Coherent Requests...................................................................................................299 9-3 Example Read Completion Formatting...........................................................................308 9-4 Peer-to-Peer Transactions................................................................................................309 9-5 Broadcast Non-Coherent Transactions............................................................................314 9-6 Target Agent Lists for Broadcast Transactions...............................................................315 9-7 Non-coherent Message Encodings (all
use Message Header Type) ...............................317 9-8 NcMsg Parameter Encoding............................................................................................319 9-9 CmpD Parameter Encoding (uses SCC Header) .............................................................320 9-10 Legacy Pins Descriptions and CSI Handling ..................................................................323 9-11 Legacy Pin Signalling......................................................................................................324 9-12 VLW Value Field Bits (10:0) Definition.........................................................................325 9-13 VLW Value Change Bits (10:0) Definition.....................................................................326 9-14 IA-32 Special Cycles.......................................................................................................326 9-15 Lock Types ......................................................................................................................327 9-16 Non-Coherent Logical Register List ...............................................................................333 10-1 Setting of A[51:2] in IntPhysical Requests for Itanium® Processors..............................340 10-2 Setting of A[51:2] in IntPhysical and IntLogical Requests for IA-32 Processors ..........340 10-3 Setting of Data[31:0] in IntPhysical Requests for Itanium® Processors.........................341 10-4 Setting of Data[31:0] in IntPhysical and IntLogical Requests for IA-32 Processors......341 10-5 CSI Interrupt Modes........................................................................................................342 10-6 Setting of A[51:2] in IntPrioUpd Request for Itanium® Processors ...............................344 10-7 Setting of A[51:2] in IntPrioUpd Request for IA-32 Processors ....................................344 10-8 Interrupt Delivery in IA-32 Processor-Based Systems ...................................................348 11-1 Timeout Levels for CSI Requests with Source Broadcast ..............................................359 12-1 Justification for Reset Domain Separation......................................................................368 12-3 Features of CSI Upper Link Layer Reset Domain ......................................................... 369 12-4 Features of CSI Routing Layer or Crossbar Reset Domain............................................ 371 12-5 Features of CSI Protocol Agent Reset Domain .............................................................. 371 12-6 Node Identifier Options .................................................................................................. 373 12-7 System Type Values ....................................................................................................... 377 12-8 CSI Control and Status Registers Needed for Reset and Initialization .......................... 385 13-1 Division of Protected Resources for Isolation................................................................ 392 13-2 Sub-Regions of the Protected Region............................................................................. 392 13-3 Protected and PAL Mode Access Privileges .................................................................. 395 13-4 CSEG Operating Parameters ..........................................................................................
398 14-1 Control and Status Registers Needed for Quiesce/De-Quiesce...................................... 403 14-2 CSI Control and Status Registers Needed for Dynamic Reconfiguration Operations ... 408 15-1 Link State Overview....................................................................................................... 435 15-2 PMReq Data Field Mapping........................................................................................... 454 15-3 PMReq State_Type Field Encoding ............................................................................... 454 15-4 Power Management Transition Response Data Field Mapping ..................................... 455 15-5 CmpD State_Type Field Encoding for Power Management .......................................... 455 15-6 PM.LinkL0sConfig Data Field Mapping........................................................................ 456 15-7 PM.LinkWidthConfig Data Field Mapping.................................................................... 456 16-1 Isochronous Command and Data.................................................................................... 462 16-2 ISOC Request Attributes ................................................................................................ 462 16-3 Mapping of Traffic-class examples to CSI Request Attributes...................................... 463 B-1 CSI Profile Attributes ..................................................................................................... 519 D-2 CSI Profile Attributes ..................................................................................................... 536 F-3 Actions of CSI-IAM ....................................................................................................... 561 F-4 Action CacheNewReqInt................................................................................................ 562 F-5 Action CacheNewReqExt............................................................................................... 564 F-6 Action CacheRecvData................................................................................................... 566 F-7 Action CacheRecvCmp .................................................................................................. 567 F-8 Action CacheRecvFwd ................................................................................................... 569 F-9 Action CacheSnpOrbMiss .............................................................................................. 572 F-10 Action CacheSnpOrbHit................................................................................................. 576 F-11 Action HomeRecvReq.................................................................................................... 577 F-12 Action HomeRecvRsp .................................................................................................... 579 F-13 Action HomeRecvAckCmp............................................................................................ 581 F-14 Action HomeRecvAckFwd............................................................................................. 582 F-15 Action HomeRecvWbData ............................................................................................. 583 F-16 Action HomeSendDataCmp ........................................................................................... 585 G-2 Action CacheNewReqInt................................................................................................
614 G-3 Action CacheNewReqExt............................................................................................... 615 G-4 Action CacheRecvData................................................................................................... 617 G-5 Action CacheRecvCmp .................................................................................................. 619 G-6 Action CacheRecvFwd ................................................................................................... 620 G-7 Action CacheSnpOrbMiss ............................................................................................. 622 G-8 Action CacheSnpOrbHit................................................................................................. 625 G-9 Action HomeRecvReq.................................................................................................... 627 G-10 Action HomeRecvExplicitWbReq ................................................................................. 630 G-11 Action HomePRBtoSPTNoCDM ................................................................................... 633 G-12 Action HomePRBtoSPTCDM........................................................................................ 636 G-13 Action HomeRecvSnpRspNoCDM................................................................................ 637 G-15 Action HomeRecvWbSnpRsp.........................................................................................643 G-16 Action HomeRecvImplicitWbData .................................................................................648 G-17 Action HomeRecvRspCnfltNoCDM...............................................................................652 G-18 Action HomeRecvRspCnfltCDM....................................................................................656 G-19 Action HomeRecvAckCnflt ............................................................................................657 G-20 Action HomeSPTReadyToRespondNoCDM..................................................................659 G-21 Action HomeSPTReadyToRespondCDM.......................................................................661

Information contained in this document is subject to change.

Revision Number Description Date
0.0 • This is a first version of the CSI Specification, for review purposes only. Do not use this version of the specification for design; it requires team review. • This version shows the draft of the Link Layer, the Cache Coherence Protocol, and the Non-Coherent and Interrupt transactions (along with the Introduction). March 2003
0.1 • Updated all the chapters. April 2003
0.3 • All chapters were updated. May/June 2003
0.5 • Major changes have been made to most of the chapters. The ones without changes are Introduction, Physical layer, Power Management, Fault Handling, and Security. August 2003
0.55 • The Physical layer, Power Management, and Dynamic Reconfiguration chapters were updated in this revision. The Implementation Agnostic Model appendix has been removed from the document; it will be published separately. August 2003
0.7 • Added the concept of profiles to the document. Used conditional text to identify UP, DP, small MP (SMP), large MP (LMP), IA-32, and Itanium processor family profiles. • All chapters have been updated; the UP Appendix has been removed, and the glossary and agnostic models have been added as appendices. October 2003
0.75 • All chapters have been updated.
Protocol Overview chapter added. Post-silicon validation appendix added. PTC.G appendix added. April 2004

1.1 Preface

This document is the specification of Intel’s CSI - a cache-coherent, link-based interconnect specification for processor, chipset, and I/O bridge components. CSI can be used in a wide variety of desktop, mobile, and server platforms spanning IA-32 and Intel® Itanium® architectures. CSI also provides support for high-performance I/O transfer between I/O nodes. It allows connection to standard I/O buses such as PCI Express*, PCI-X, PCI (including peer-to-peer communication support), AGP, etc. through appropriate bridges.

1.2 CSI Layers

The functionality of CSI is partitioned into five layers, one or more of which are optional for certain platform options. Each layer performs a well-defined set of non-overlapping functions. This layering results in a modular architecture that is easier to specify, implement, and validate. It also allows for easier future upgrades to the interface by allowing fairly independent optimizations at each layer. The layers, shown in Figure 1-1, from bottom to top, are: Physical, Link, Routing, Transport, and Protocol.

Figure 1-1. Hierarchical Ordering of CSI Interface Layers (figure: the five layers stacked from bottom to top - Physical, Link, Routing, Transport, Protocol - with the Routing and Transport layers marked optional)

The Transport and Routing layers, shown dotted in Figure 1-1, are optional and needed for certain platform options only. In desktop/mobile and dual processor systems, for example, the functionality of the Routing layer is embedded in the Link layer - hence, this layer is not separate in such systems.

1.2.1 Physical Layer

The Physical layer is responsible for fast electrical transfer of information on the physical medium. The physical link is point to point between two Link layer CSI agents and uses a differential signaling scheme called Scalable Copper Interconnect Differential (SCID).

1.2.2 Link Layer

The Link layer abstracts the Physical layer from the upper layers and provides the following services: reliable data transfer and flow control between two directly connected CSI agents, virtualization of the physical channel into multiple virtual channels and message classes. The virtual channels can be viewed as multiple virtual networks for use by the Routing, Transport, and Protocol layers. The Protocol layer relies on the message class abstraction to map a protocol message into a message class and, hence, to one or more virtual channels.

1.2.3 Routing Layer

This layer provides a flexible and distributed way to route CSI packets from a source to a destination. The routing is based on the destination. In some platform options (e.g., uniprocessor and dual processor systems), this layer may not be explicit but could be part of the Link layer; in such a case, this layer is optional. It relies on the virtual channel and message class abstraction provided by the Link layer to specify one or more pairs to route the packet on. The mechanism for routing is defined through implementation-specific routing tables. Such a definition allows a variety of usage models, which are described in the specification.

Figure 1-2. CSI Interface Layer Details (Routing and Transport Layers Not Shown) (figure: two CSI agents, each containing protocol engines for coherence, ordering, interrupt, and I/O, exchanging CSI packets over Link layers that provide buffered flow control across an electrical transfer interface; Flit = F * Phit, Packet = P * Flit)
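Since Section 1.2.3 describes routing as a table-driven step over the virtual-network and message-class abstractions, a minimal C sketch may make the idea concrete. The table layout, the vnet_for_class mapping, and the size constants below are assumptions of this illustration only; CSI deliberately leaves routing tables implementation specific.

```c
#include <stdint.h>

#define MAX_NODES 64          /* assumed platform limit for this sketch  */
#define NUM_VNETS  3          /* assumed number of virtual networks      */

typedef struct {
    uint8_t dest_node;        /* destination node id carried in the packet */
    uint8_t msg_class;        /* protocol message class                    */
    uint8_t vnet;             /* virtual network chosen for that class     */
} packet_hdr_t;

/* Link layer abstraction: map each message class onto a virtual network
 * (the concrete mapping here is invented for the example). */
static const uint8_t vnet_for_class[8] = { 0, 0, 1, 1, 2, 2, 0, 1 };

/* Per-router table: one output port per (destination node, virtual network). */
static uint8_t route_table[MAX_NODES][NUM_VNETS];

/* One routing step: pick this router's output port for the packet. */
static uint8_t route_step(const packet_hdr_t *hdr)
{
    return route_table[hdr->dest_node][hdr->vnet];
}
```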
Figure 1-3. CSI Interface Layer Details (Transport Layer Not Shown) (figure: as in Figure 1-2, with a Routing layer and its routing tables added between the Link and Protocol layers of each agent; Flit = F * Phit, Packet = P * Flit)

1.2.4 Transport Layer

This layer provides support for end-to-end reliable transmission between two CSI agents that each have this layer’s capability. It relies on the services provided by the Routing layer below it, while in turn providing reliable transmission support to the Protocol layer above it. The Transport layer is optional and is provided for systems which desire a higher degree of reliability, usually at the cost of perhaps lower performance and increased bandwidth utilization. In such systems, the Transport layer functionality may be isolated to a few CSI components; in such a case, the sub-fields in the CSI packet related to this layer are defined in these components only. Since this layer is optional, it is possible to have a platform architecture with no CSI agent implementing it. Further, it does not follow the hierarchical layering of CSI from an implementation viewpoint (see Appendix C, “Future Extensions - Transport Layer”). In the rest of this specification, the Transport layer is not shown or assumed, unless explicitly mentioned.

1.2.5 Protocol Layer

This layer implements the higher-level communication protocol between nodes, such as cache coherence (reads, writes, invalidations), ordering, peer-to-peer I/O, and interrupt delivery. CSI provides a flexible protocol which can scale from small to large systems. The write-invalidate protocol implements the MESIF states, where the MESI states have the usual connotation (Modified, Exclusive, Shared, Invalid) and the F state indicates a read-only forwarding state. The CSI protocol allows for source snooping (the requester initiates snoops of the caching agents), home snooping (the home initiates snoops of the caching agents), or a combination of the two. It is permissible for the F state not to be used (for example, in home-snooping-based systems). The exact functionality of this layer depends on the platform architecture. The Protocol layer is bypassed in pure routing agents, resulting in low-latency transfer from sender to receiver through the interconnection network (please see Figure 1-2).

1.2.6 Communication Granularity Between Layers

The data transfer unit at the Physical layer is called a phit (physical unit). The Link layers of two CSI agents communicate at a higher granularity called a flit (flow control unit). A flit is the smallest granularity for flow control. A flit is made up of multiple phits. The Protocol, Transport, and Routing layers communicate at the granularity of a packet.
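As a worked example of these units, the following sketch computes packet sizes from the Flit = F * Phit and Packet = P * Flit relations above. The concrete widths (20 bits per phit on a full-width link, four phits per flit) are values assumed for this illustration, not normative parameters of the specification.

```c
#include <stdint.h>

#define PHIT_BITS       20    /* assumed: one phit spans a 20-lane full-width link */
#define PHITS_PER_FLIT   4    /* assumed F in: Flit = F * Phit                     */
#define FLIT_BITS       (PHITS_PER_FLIT * PHIT_BITS)   /* 80 bits per flit         */

/* Packet = P * Flit: one or more header flits optionally followed by a
 * data payload of multiple flits. */
static inline uint32_t packet_bits(uint32_t header_flits, uint32_t data_flits)
{
    uint32_t p = header_flits + data_flits;   /* P, the flit count of the packet */
    return p * FLIT_BITS;
}
```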
Each packet consists of one to many flits, depending on the packet type and the system configuration; thus, it may consist of one or more header flits optionally followed by a data payload consisting of multiple flits (please see Figure 1-2). In the rest of the specification, a CSI agent always refers to a protocol agent, unless explicitly mentioned otherwise.

1.3 Notes

Conditional text tags have been used in the document to distinguish between the various system profiles. System profiles are defined in the following chapter. System profiles are marked with conditional text and specific colors. The following list describes the conditional text used to identify profiles:
• Sample of the text for UP description
• Sample of the text for DP description
• Sample of the text for SMP description
• Sample of the text for LMP description
• Sample of the text for IA-32 description
• Sample of the text for Itanium processor family description
• Sample of text using multiple conditional tags (ex. UP, DP)

1.4 Definition of Terms

The terms defined in this section are frequently used in subsequent chapters of the specification. Additional terms are defined in the following chapters of the document to better describe the content of the material. The definition of terms will be consolidated in a future revision of the CSI specification. The complete list of definitions is provided in Appendix A, “Glossary.”

Device Address: This is the address generated by the target node of a CSI transaction to access the physical memory or device location. This address is used on the I/O buses or on the memory interface. This address may be the same as the physical address part of the system address, or it may be translated through some (optional) mapping mechanism at the target.

Caching Agent: A protocol agent type which can perform reads & writes into coherent memory space.

Configuration Agent: The logical owner of all platform configuration registers on a CSI agent or component. A component may define a separate CA for each CSI agent on the die, or it may define a single CA to represent all the CSI agents on the die. In the latter case, configuration transactions destined to CSRs in other CSI agents are logically targeted to the CA, which in turn completes the access within the die via implementation-specific mechanisms.

Firmware Agent: A CSI agent capable of supplying boot firmware to processor cores.

Home Agent: A protocol agent type which is responsible for guarding access to a piece of coherent memory.

I/O Agent: A protocol agent type which is responsible for non-CSI I/O devices behind it. As a CSI initiator, an I/O Agent makes CSI requests on behalf of I/O devices and returns responses back to the I/O device. As a target, an I/O Agent is responsible for translating CSI requests to the protocol native to its I/O interface and returns I/O responses back to the CSI requester.

Physical Address: This is the operating system’s view of the address space in a partition. It is obtained by translating a virtual address through the operating system page translation mechanism. It is also the address used by the cache coherency mechanism, which puts certain requirements on the mapping of coherent shared address space within and across partitions.

Processor Agent: The CSI interface to a logical processor. (This definition needs to be revised and will need to change as we better understand how interrupts, VLWs, etc. are partitioned in designs.)
Routing Agent: A CSI agent which implements a routing step, routing a CSI packet from the input port of a router to the destination port based on the destination node id contained in the packet. A packet is routed from its source to its destination through a series of routing steps (a minimal sketch of one routing step follows at the end of this section).

System Address: The system address is represented by the physical address and the target (home) node identifier, which together point to a unique device address in the system. The addressing model allows the same physical address from different source agents to map to different system addresses (e.g., private firmware space per processor agent) or to the same system address (e.g., shared memory space in a partition or across partitions), irrespective of partition boundaries. The system address also comprehends the scope of hardware cache coherency: for example, a system may have identical physical memory addresses in different partitions, but with different home nodes and different scopes of coherency, and therefore distinct system addresses. Also note that in the source broadcast based cache coherency scheme, the home node identifier does not play a role in specifying the scope of coherency.

Virtual Address: The address used by applications, device drivers, and devices (if the I/O agents support paging).
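As promised above, here is a minimal C sketch of a single routing step. The flat lookup table, the node-id width, and all names are illustrative assumptions; the actual mechanisms are defined in the Routing layer chapter.

    /* One routing step: pick an output port from the destination node id
     * carried in the packet.  A flat lookup table is assumed here purely
     * for illustration. */
    #include <stdint.h>

    #define MAX_NODES 64              /* illustrative table size */

    struct router {
        uint8_t out_port[MAX_NODES];  /* destination node id -> output port */
    };

    static uint8_t routing_step(const struct router *r, uint8_t dest_nid)
    {
        return r->out_port[dest_nid]; /* one hop along the route */
    }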
2 Platform Scope
This chapter outlines the flexible platform architectural options that are possible with a CSI-based interconnect. CSI can be used in a wide variety of desktop, mobile, and server platforms spanning the IA-32 and Itanium architectures.

Figure 2-1. Schematic of an Intel® Itanium® Processor with CSI-Based Links. (Figure: processor cores with split/shared caches, an optional memory controller and memory interface, a crossbar router, and a non-routing global links interface to the CSI links.)

Figure 2-1 shows a schematic of a processor with external CSI-based link interfaces. The processor may have one or more cores. In case multiple cores are present, they may share caches or have separate caches. The processor may also support optional integrated memory controller(s). In addition, based on the level of scalability support in the processor, it may include an integrated crossbar router and one or more external CSI link interfaces. In the rest of the chapter, we discuss the various system profiles that may be supported by different processor implementations.

2.1 Desktop/Mobile Systems
Figure 2-2 shows two example configurations, each with a single socket. In each case the processor is directly connected to the chipset through a single CSI link. In the first configuration, the processor's main memory is supported through a memory controller on the chipset (which also has graphics-related functionality). In the second configuration, the processor's memory is directly connected to the processor socket and the processor is assumed to have an integrated memory controller on die; the chipset primarily supports graphics-related functionality. Both configurations have I/O connectivity and firmware through other chipsets, with connectivity as shown in Figure 2-2.

Figure 2-2. CSI-Based Uniprocessor Systems. (Figure: (a) an IA processor connected over a CSI link to a chipset integrating graphics and the memory controller; (b) an IA processor with integrated memory controller (iMC) connected to a graphics chipset; both attach an ICH and firmware hub over LPC bus and DMI, with PCI Express links for I/O.)

To keep the focus primarily on the CSI-related parts of platform configurations, most other platform components are not shown in later sections.

2.2 Dual Processor Systems

Figure 2-3. CSI-Based Dual Processor Systems. (Figure: (a) two IA processors linked by CSI to each other and to a chipset integrating graphics and the memory controller, with PCI Express links; (b) two IA processors with iMC and memory, an optional direct processor-to-processor CSI link, and an I/O hub attached via CSI and DMI, with PCI Express links.)

The dual processor options shown in Figure 2-3 represent two of several possible platform options. The first option has centralized main memory connected to the graphics controller; each processor socket has two CSI links, one connecting it to the graphics-and-memory controller and the other to the second processor socket. The second option assumes a distributed memory platform, with each processor having part of the main memory directly connected to it. This option shows an I/O hub connected to the processor sockets instead of the graphics controller, and represents yet another possible variation amongst dual processor platforms. In both configurations shown, the optional direct processor-to-processor link helps provide additional network bandwidth as well as reduce the latency of snooping caches on the other processor and of direct cache-to-cache transfers of instructions/data. In each of the single processor (desktop) and dual processor platform configurations, the processors need not support any special routing capability.

2.3 Quad-Socket and 8-Socket Systems
Figure 2-4 shows a 4-socket platform configuration where each processor socket is connected to every other processor socket (“fully-connected”) and also to a single I/O Hub through CSI links. This platform also has a fully-distributed memory architecture. This architecture has high performance because of its rich interconnectivity, which permits quick snoop resolution, fast memory and cache-to-cache transfers. Variants of this architecture could include the use of multiple I/O Hubs. A different version of the 4-socket platform (not shown here) could be a cost optimized one with a “square” interconnect between the processors, such that it requires one fewer CSI link per processor socket. Once again, multiple I/O Hub based solutions are also possible in this configuration.

Figure 2-4. 4-Socket CSI-Based Platform. (Figure: four IA processors with iMC and memory, fully connected by full or half width CSI links, with one socket also connected to an I/O hub via CSI and DMI, and PCI Express links for I/O.)

In Figure 2-5, abstract CSI-based platform topologies for 4-socket and 8-socket platforms are shown.

Figure 2-5. 4-Socket and 8-Socket CSI Systems. (Figure: a 4-socket arrangement, either as IA processors with disabled iMC plus external memory controllers (XMC) or as IA processors with iMC, sockets numbered 0-3; and an 8-socket cube topology with sockets numbered 0-7.)

In particular, the 4-socket abstraction represents the inter-processor interconnect for the platform shown in Figure 2-5, ignoring the interconnects to I/O and other platform components. The 8-socket topology of Figure 2-5 shows a cube topology which utilizes 3 CSI links for inter-socket communication. Additional connectivity between processor sockets is possible with more CSI links supported by a given Itanium processor implementation, as shown by the option on the right.
This would improve the overall bandwidth available in the platform and improve its latency characteristics. It is feasible that 8-socket systems could be built with 4 or 5 inter-processor CSI links supported by a particular Itanium processor implementation. Such richer connectivity leads to higher performance and better RAS features.

2.4 Large Scale System Architectures
Platform architectures that scale beyond 4 or 8 Itanium architecture-based processors can be enabled through OEM chipsets.

Figure 2-6. Large Scale “Flat” Architecture. (Figure: IA processors with iMC attached through a CSI interconnection network to external memory controllers (XMC) and to an I/O hub with PCI Express links.)

One approach to building a general large-scale CSI-based platform is shown in Figure 2-6. Here each processor interfaces to two (or more) external memory controller (XMC) chips through CSI links. The XMC is an OEM component which utilizes a scaled version of CSI with directory support and other features to enhance performance (e.g., a directory cache) and RAS. The interconnect network topology can be general, especially with additional OEM components such as routers and cable drivers. As opposed to the “flat” architecture of Figure 2-6, another way to build large scale systems is to use a hierarchical approach: the basic building block comprises an n-socket node (where n is some small number); such nodes are, in turn, interconnected through OEM node controllers. The building block uses CSI interfaces, while the node controllers could use CSI links or the OEM's proprietary interfaces. Figure 2-7 shows an example of a node-controller based 4-socket platform architecture. Such an OEM designed node controller could optionally support a remote memory cache, a partial or full directory, and a directory cache for scalability, performance, and RAS reasons. Itanium processors which are used in large scale systems will have an internal router to support through-routing.

Figure 2-7. Scalable Hierarchical System with OEM “Node Controllers”. (Figure: four IA processors with iMC and memory, connected over CSI links to node controllers, which in turn attach to an interconnection network that is either CSI-based or a proprietary interconnect for scale-up.)

2.5 Profiles
A central notion in CSI is that of a profile. Since CSI targets a range of architectural platforms, the interface definition identifies the essential features that are common across this range as well as features that are specific to a particular platform or set of platforms. These specific features form the profile. For example, the CSI features of a desktop profile would optimize cost and performance, while those for a large system profile would target scalability and RASUM (reliability, availability, serviceability, usability, manageability). CSI has been carefully designed to permit both unification across profiles and optimizations targeted to each profile. The CSI profiles fall roughly in line with the architectural options introduced in this chapter: uniprocessor systems, including both desktop and mobile platforms; dual processor systems; small scale systems (4-8 sockets); and large scale systems with typically more than 8 sockets.
It should be noted, however, that profile dependent fields permit certain features and, correspondingly, restrict other features; processor and chipset implementations targeting specific platforms utilize the combination of profile dependent fields they need, so there is not a strict mapping of profile dependent fields onto exact architectural options. At the highest level, CSI packet headers have a standard format (1 flit) and an extended header format (2 flits). The standard format is expected to be used for all profiles except the large scale systems. The extended format permits, for example, larger addressability and a higher number of CSI agents in the system, at the expense of additional interconnect bandwidth; the standard format permits limited addressability, a limited number of CSI agents, etc. In addition, profile dependent fields in the header exploit particular optimizations for each profile. These profile dependent fields permit, for example, optimizations for memory access and fast data return through specification of hints, a specialized interleaving scheme, critical chunk delivery, etc. - optimizations of critical importance to desktop systems. Dual processor and low end server systems may exploit critical chunk ordering at the expense of addressability. In addition, some profiles may implement only certain features: for example, some CSI transactions related primarily to support for dynamic reconfiguration may not be implemented by the desktop and mobile profiles; a dual processor system may not implement the full range of virtual networks supported by CSI; a server system may not implement support for isochronous traffic.
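A minimal sketch of the header-format trade-off just described. The enum values encode only the header flit counts, and the bit arithmetic assumes the 80-bit flit of Section 3.9.1; field layouts are not part of this sketch.

    /* Standard (1 flit) versus extended (2 flit) packet headers: the
     * extended header buys larger addressability and more addressable
     * CSI agents at the cost of one extra flit per packet header. */
    enum header_format {
        HDR_STANDARD = 1,   /* flits; all profiles except large scale */
        HDR_EXTENDED = 2    /* flits; large scale systems             */
    };

    static int header_bits(enum header_format f)
    {
        return (int)f * 80; /* 80 bits per flit (Section 3.9.1) */
    }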
3 Physical Layer
3.1 Physical Layer Overview
The Physical layer is responsible for providing a means of communication between two CSI ports over a physical interconnect consisting of two uni-directional links, as shown in Figure 3-1. The Physical layer is at the lowest level of the CSI hierarchy, and isolates the higher CSI layers from electrical and physical implementation details. The Physical layer directly interacts with the Link layer only, and can be viewed as two distinct blocks: a logical sub-block and an electrical sub-block.

Figure 3-1. CSI Layer Hierarchy. (Figure: at each end of the physical interconnect, a Physical layer consisting of a logical sub-block and an electrical sub-block with Rx/Tx circuits sits below the Link layer and the other CSI layers; see Chapter 1.)

The logical sub-block is primarily responsible for Physical layer initialization, for controlling the electrical sub-block during normal link operation, and for providing Physical layer test and debug hooks. After Physical layer initialization is completed, the logical sub-block works under the direction of the Link layer, which is responsible for flow control. From this point onwards, the logical sub-block communicates with the Link layer at a flit granularity and transfers flits across the link at a phit granularity. A flit is composed of an integral number of phits, where a phit is defined as the number of bits transmitted in one unit interval (UI). For instance, a full width link transmits and receives a complete flit using four phits.

The electrical sub-block defines the electrical signaling technology for high-speed data transfer across the link. Included in the electrical sub-block are the front-end driver and receiver circuits, clock circuitry, analog circuitry for calibrating I/O, etc. The electrical sub-block is transparent to the Link layer and only interfaces with the logical sub-block. The following is a list of key electrical properties of the Physical layer:
1. A forwarded clock is sent by the transmit side of the local port to the receive side of the remote port, and vice-versa, to maintain a timing reference between the Physical layers at either end of the link.
2. Clocks between connected CSI ports are mesochronous, implying that the link clocks on both ports need to be at the same frequency (0 ppm) but may differ by a fixed phase.
3. DC coupling between a connected transceiver pair, and hence the use of a non-encoded link.
4. Ground-referenced Tx and Rx terminations.
5. No error detection mechanism exists inside the Physical layer; data integrity across the link is ensured through a CRC check by the Link layer.

3.2 Physical Layer Features for Desktop/Mobile Systems - UP Profile
1. Automatically detect the presence of an active CSI port at the other end of the link.
2. Automatically distinguish between an active CSI port and a 50 Ω passive test probe, and output a compliance test pattern when a test probe is detected.
3. An Inband Reset mechanism, where a port at one end of the link can force the port at the other end to re-initialize the Physical layer without resetting higher layers.
4. Symmetric physical interface with identical physical link widths in both directions.
5. Default link consists of 20 physical lanes, referred to as a full width link. Some implementations may support half width links, with 10 physical lanes.
6. Optional support to configure a link in half width mode with 10 active lanes or in quarter width mode with five active lanes. There is no dependency between the numbers of active lanes in the two directions of the link. The desired number of active lanes should be configured prior to link initialization, either through Physical layer CSRs or through pin straps. Note the difference between a physical lane and an active lane: a physical lane corresponds to an instantiated pin, so the maximum link width is equal to the number of physical lanes (or instantiated pins). A link can be formed using a subset of these physical lanes, at a narrower width, in which case the lanes forming the link are referred to as active lanes. To satisfy the interface symmetry requirements (see #4 above), links in either direction are required to have the same number of physical lanes, but they may have different numbers of active lanes.
7. Optional support to turn off either CRC or sideband signals, but not both, resulting in a link with 18 active lanes in full width mode, 9 active lanes in half width mode, or 5 active lanes in quarter width mode. For interoperability, these implementations are still required to instantiate 20 physical lanes (or 10 in some implementations; see #5 above) in each direction. The signals to be turned off should be configured prior to link initialization, either through Physical layer CSRs or through pin straps. CRC or side-band signals can be turned off in one direction independent of the other; thus, it is acceptable to turn off CRC signals in one direction and sideband signals in the other. Conversely, a link in one direction may choose to use a full 20 lane interface while the link in the other direction turns off either CRC or sideband signals.
8. Support for Loop Back test mode, where one port, acting as Loop Back Master, transmits and checks test patterns, and the other port, acting as Loop Back Slave, echoes incoming test patterns to the Loop Back Master.
9. Support for tester determinism to ensure link repeatability across all operating conditions.

3.3 Physical Layer Features for Dual Processor Systems - DP Profile
1. Automatically detect the presence of an active CSI port at the other end of the link.
2. Automatically distinguish between an active CSI port and a 50 Ω passive test probe, and output a compliance test pattern when a test probe is detected.
3. An Inband Reset mechanism, where a port at one end of the link can force the port at the other end to re-initialize the Physical layer without resetting higher layers.
4. Physical interface consists of two full width links, each consisting of 20 lanes.
5. Support for Loop Back test mode, where one port, acting as Loop Back Master, transmits and checks test patterns, and the other port, acting as Loop Back Slave, echoes incoming test patterns to the Loop Back Master.
6. Support for tester determinism to ensure link repeatability across all operating conditions.
7. Optional support for Lane Reversal to reduce board layout complexity.
8. Optional support for Polarity Inversion to reduce board layout complexity.

3.4 Physical Layer Features for 4 and 8 Socket Systems - Small MP Profile
1. Automatically detect the presence of an active CSI port at the other end of the link.
2. Automatically distinguish between an active CSI port and a 50 Ω passive test probe, and output a compliance test pattern when a test probe is detected.
3. An Inband Reset mechanism, where a port at one end of the link can force the port at the other end to re-initialize the Physical layer without resetting higher layers.
4. Physical interface consists of two full width links, each consisting of 20 lanes.
5. Ability to configure a full width link as a half width link with 10 active lanes or as a quarter width link with five active lanes. A link can be configured to operate in a narrower width mode independent of the link width in the other direction.
6. Support for Loop Back test mode, where one port, acting as Loop Back Master, transmits and checks test patterns, and the other port, acting as Loop Back Slave, echoes incoming test patterns to the Loop Back Master.
7. Support for tester determinism to ensure link repeatability across all operating conditions.
8. Lane Reversal support to reduce board layout complexity.
9. Polarity Inversion support to reduce board layout complexity.
10. Hot Plug support.
11. Optional support for link Self-healing, where faulty data lanes are identified, resulting in downgrading the link width in the direction of the failure. Downgrading a link in one direction due to faulty data lanes does not impact the link width in the other direction.
12. Optional support for Clock Fail-safe Mode, where a faulty forwarded clock can be replaced with pre-designated data lanes that act as a back-up clock. Clock fail-safe mode results in downgrading the link to a narrower width in the direction of the clock failure. A faulty clock in one direction of the link does not impact the link width in the other direction.
13. Optional support for Port Bifurcation, where a single full width link can be configured as two independent half width links. Port Bifurcation is a static link configuration set prior to link initialization.
Self-healing or Clock Fail-safe Mode are supported in each independent half width link. A bifurcated port is not guaranteed to tolerate both a faulty clock and a faulty data lane. Additionally, each half width link of a bifurcated port supports Lane Reversal independent of the other half width link.
14. Optional support for Lockstep operation, where a constant link latency (in UI) can be maintained by programming a latency offset using Physical layer CSRs.

3.5 Physical Layer Features for Large Scale Systems - Large MP Profile
1. Automatically detect the presence of an active CSI port at the other end of the link.
2. Automatically distinguish between an active CSI port and a 50 Ω passive test probe, and output a compliance test pattern when a test probe is detected.
3. An Inband Reset mechanism, where a port at one end of the link can force the port at the other end to re-initialize the Physical layer without resetting higher layers.
4. Physical interface consists of two full width links, each consisting of 20 lanes.
5. Ability to configure a full width link as a half width link with 10 active lanes or as a quarter width link with five active lanes (see the sketch after this list). A link can be configured to operate in a narrower width mode independent of the link width in the other direction.
6. Support for Loop Back test mode, where one port, acting as Loop Back Master, transmits and checks test patterns, and the other port, acting as Loop Back Slave, echoes incoming test patterns to the Loop Back Master.
7. Support for tester determinism to ensure link repeatability across all operating conditions.
8. Lane Reversal support to reduce board layout complexity.
9. Polarity Inversion support to reduce board layout complexity.
10. Hot Plug support.
11. Link Self-healing support, where faulty data lanes are identified, resulting in downgrading the link width in the direction of the failure. Downgrading a link in one direction due to faulty data lanes does not impact the link width in the other direction.
12. Support for Clock Fail-safe Mode, where a faulty forwarded clock can be replaced with pre-designated data lanes that act as a back-up clock. Clock fail-safe mode results in downgrading the link to a narrower width in the direction of the clock failure. A faulty clock in one direction of the link does not impact the link width in the other direction.
13. Optional support for Port Bifurcation, where a single full width link can be configured as two independent half width links. Port Bifurcation is a static link configuration set prior to link initialization. Self-healing or Clock Fail-safe Mode are supported in each independent half width link. A bifurcated port is not guaranteed to tolerate both a faulty clock and a faulty data lane. Additionally, each half width link of a bifurcated port supports Lane Reversal independent of the other half width link.
14. Optional support for Lockstep operation, where a constant link latency (in UI) can be maintained by programming a latency offset using Physical layer CSRs.
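As referenced in item 5 above, the width rules common to the profile lists can be collected into a small validity check. This is a minimal C sketch under the assumptions stated in the comments; the function and parameter names are illustrative only.

    /* Link width rules gathered from the profile feature lists: 20
     * instantiated physical lanes per direction (10 in some half width
     * implementations), an active width of 20, 10, or 5 lanes selected
     * prior to initialization, identical physical lane counts in both
     * directions, and (where supported) independent active widths per
     * direction. */
    #include <stdbool.h>

    static bool valid_active_width(int lanes)
    {
        return lanes == 20 || lanes == 10 || lanes == 5;
    }

    static bool valid_link_config(int phys_lanes_tx, int phys_lanes_rx,
                                  int active_tx, int active_rx)
    {
        return phys_lanes_tx == phys_lanes_rx   /* symmetric pin counts */
            && active_tx <= phys_lanes_tx
            && active_rx <= phys_lanes_rx
            && valid_active_width(active_tx)    /* directions may differ */
            && valid_active_width(active_rx);
    }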
3.6 Summary of Physical Layer Features
Table 3-1 summarizes the Physical layer features supported in each CSI profile. Legend for Table 3-1: R - Required feature; O - Optional feature; X - Feature not supported. SMP and LMP denote the Small MP and Large MP profiles, and the relevant sections are listed for further reading.

Table 3-1. Physical Layer Features Supported in each CSI Profile

Feature                                                        UP   DP   SMP  LMP   Relevant Sections
Automatic detection of CSI port at far-end of the link         R    R    R    R     3.9.3.2, 3.9.3.2.1
Distinguish between a CSI port and a 50 Ω passive termination  R    R    R    R     3.9.3.2.1
Inband Reset to localize link reset to the Physical layer      R    R    R    R     3.7.5
Periodic link re-training                                      R    R    R    R     3.9.7
Number of lanes in a full width link                           20   20   20   20    3.9.1
Half width link implementation (10 physical lanes or
  pins in each direction)                                      O    X    X    X     3.9.1.7
Configure a full width link as a half width link (10
  lanes) or quarter width link (5 lanes)                       O    X    R    R     3.9.1.3, 3.9.1.7, 3.9.3.4.1
Asymmetric link width (logical only: the configured link
  width can be independent in each direction, but the
  transmit/receive portions of a port must have identical
  numbers of instantiated pins)                                O    X    R    R     3.9.1.3, 3.9.1.7, 3.9.3.4.1, 3.10
Turning off either CRC or side-band signals                    O    X    X    X     3.9.1.4, 3.9.3.3.3
Loopback                                                       R    R    R    R     3.9.3.3.3, 3.9.3.6
Tester determinism                                             R    R    R    R     3.9.6
Lane reversal                                                  X    O    R    R     3.9.1, 3.9.3.3.3, 3.9.11
Polarity inversion                                             X    O    R    R     3.9.3.2.5, 3.9.3.2.6
Hot plug                                                       X    X    R    R     3.9.10
Link self-healing                                              X    X    O    R     3.9.3.4.1, 3.9.3.4.2, 3.9.9
Clock fail-safe operation                                      X    X    O    R     3.9.3.2.2, 3.9.3.2.3, 3.9.3.2.4, 3.9.8
Port bifurcation                                               X    X    O    O     3.9.1.8, 3.9.12
Lockstep determinism                                           X    X    O    O     3.9.6

3.7 Physical Layer Reset
An overview of a generic Physical layer reset sequence and the reset types supported by the Physical layer are outlined in this section. Refer to a platform EMTS or an equivalent document for exact implementation details on a given platform.

3.7.1 Link Power Up and Initialization Sequence
The link initialization performs the following key functions, in the order listed below. These steps are explained in detail in later sections.
1. Calibrate analog circuitry. Calibration is done during the Disable/Start state of the initialization sequence, as explained in Section 3.9.3.1.
2. Detect the presence of an active CSI port or a 50 Ω passive test probe at the other end of the link. This initialization phase corresponds to the Detect.1 state, explained in Section 3.9.3.2.1.
3. Activate the forwarded clock lane and lock to the received clock when an active CSI port is detected. The forwarded clock is sent out during Detect.2, explained in Section 3.9.3.2.3.
4. Establish bit lock, aligning the received clock to the center of the data eye. This is done during Polling.1 of the initialization sequence, as explained in Section 3.9.3.3.1.
5. Perform lane deskew to match delay across all lanes. Lanes are deskewed in the Polling.2 state, explained in Section 3.9.3.3.2.
6. Exchange Physical layer parameters. Physical layer parameter exchange is done during the Polling.3 state, explained in Section 3.9.3.3.3.
7. Negotiate an acceptable link width in each direction. The link width negotiation phase corresponds to the Config.1 state of the initialization state machine, as explained in Section 3.9.3.4.1.
8. Establish the flit boundary.
This step is done in the Config.2 state of the initialization sequence, explained in Section 3.9.3.4.3.
9. Signal to the Link layer that a configured link is established. This step is also done in the Config.2 state, explained in Section 3.9.3.4.3.
10. Transfer control of the link to the Link layer. This corresponds to the L0 state of the link, explained in Section 3.9.3.7.

Figure 3-2 illustrates the sequence of events during Physical layer initialization, with text balloons indicating the initialization steps mentioned earlier. The initialization states of the logical sub-block state machine are also shown in the figure. After power on, the Physical layer waits for the LinkClockStable signal (Section 3.7.3.1) before starting internal calibration. Internal calibration is done only during Cold Reset and, by default, is bypassed for other types of Physical layer reset (see Section 3.7.4 through Section 3.7.8). However, the Physical layer provides an option to force calibration for all reset types, through the Physical layer Control and Status Registers (CSRs).

To facilitate the ability to test and debug the Physical layer, it is important to synchronize the progress of link initialization to external events. The Physical layer defines a signal called PhyInitBegin for triggering link initialization. No communication occurs between the two connected CSI ports until the PhyInitBegin signal is received by the Physical layer. The exact mechanism for generating PhyInitBegin is platform dependent; for instance, a platform might choose to hardwire a system signal to PhyInitBegin, or might choose to control this signal using firmware. The Physical layer indicates successful completion of link initialization using CSRs; this event is indicated as the PhyInitComplete signal in Figure 3-2. After completing initialization, the Physical layer continuously transmits Null Ctrl flits (see the Link layer chapter for a definition of a Null Ctrl flit) until the Link layer takes control of the link. The mechanism for link hand-off between the Physical layer and the Link layer is explained in Section 3.8.

The time scale shown in Figure 3-2 is solely intended to illustrate approximate Physical layer initialization time. The estimates assume no power-up skew between the connected ports, which would extend the detect phase by the amount of this skew. The calibrate phase might have significant variation across implementations. It is also assumed that the link time of flight is negligible (<= 64 UI in each direction). A link transfer rate of 5 Gb/s is used for these estimates; a higher link transfer rate might reduce the initialization time. Refer to the platform specification for the exact initialization times for a given platform.

Figure 3-2. Physical Layer Power Up and Initialization Sequence – An Example. (Figure: timeline through the Disable/Start, Detect, Polling, and Config initialization states, with text balloons for power-on, LinkClockStable, calibrate and detect (platform dependent), PhyInitBegin, forwarded clock, bit lock, lane deskew, parameter exchange, link width negotiation, set flit boundary, send/receive NULL flits, and HandoverToLinkLayer; annotated per-phase times range from tens of nanoseconds to <= 2 milliseconds, with the approximate init time platform specific.)
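The states traversed by the ten steps above can be summarized in a short sketch. This is a minimal C rendering of the state names used by the logical sub-block (Sections 3.9.3.1 through 3.9.3.7); the enum itself is illustrative, not a specified interface.

    /* Logical sub-block initialization states, in traversal order, with
     * the steps of Section 3.7.1 that execute in each of them. */
    enum phy_init_state {
        PHY_DISABLE_START, /* step 1: internal calibration                */
        PHY_DETECT,        /* steps 2-3: detect far end, forwarded clock  */
        PHY_POLLING,       /* steps 4-6: bit lock, deskew, param exchange */
        PHY_CONFIG,        /* steps 7-9: width, flit boundary, link up    */
        PHY_L0             /* step 10: control handed to the Link layer   */
    };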
3.7.2 Link-Up Identifier
The Physical layer uses a flag called the Link-up Identifier to track link history. This flag is set (to 1) by the Physical layer once link initialization is complete and the Physical layer enters L0, and is cleared (to 0) by the Physical layer during Cold Reset. The Link-up Identifier status is used as a secondary condition to distinguish between a CSI port and a 50 Ω passive test probe (Section 3.9.3.2.1). The status of this flag is also used to identify any inadvertent failure of the physical interface, such as a Physical layer power glitch, and to pass this information on to the Link layer for an appropriate action.

A pair of connected CSI ports exchange their Link-up Identifiers during Physical layer initialization. A mismatch in these flags indicates that one port is re-initializing the link while the other is going through a Cold Reset, possibly due to a glitch that resembled a power-on of that port. The former port, with its Link-up Identifier set, configures itself to go through a Cold Reset, which results in this port resetting its Link-up Identifier in addition to loading power-on default values in all its registers. Link initialization is then started all over again through an Inband Reset, which is initiated by the port with its Link-up Identifier set. Following the Inband Reset, the initiating port communicates initialization status to its Link layer and also provides an indication of the Link-up Identifier mismatch encountered during the most recent initialization. The Link layer, upon receiving information on a Link-up Identifier mismatch, may choose to take additional steps, such as starting a Link layer initialization sequence or resetting its retry buffer pointers; of course, the Link layer also has the freedom to ignore a Link-up Identifier mismatch. In other words, the Physical layer only provides a hint to the Link layer about a potential glitch on the link but does not enforce any specific action to be taken by the Link layer. (Author's Note: the Link-up Identifier cannot be exchanged in two-stage initialization, as the port seeing a power glitch goes through two-stage initialization while the other port goes through a single-stage initialization. The Link-up Identifier scheme needs to be redefined in the context of two-stage initialization. This is a WIP.)
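A minimal C sketch of the Link-up Identifier rules above: the flag is set on entering L0, cleared on Cold Reset, and compared after exchange. The names are illustrative, and the mismatch result is advisory to the Link layer, exactly as described.

    /* Link-up Identifier: 1 after a successful initialization (L0),
     * 0 after a Cold Reset.  Identifiers are exchanged during
     * initialization; a mismatch is only a hint to the Link layer. */
    #include <stdbool.h>

    struct phy_port {
        bool link_up_id;
    };

    static void on_enter_l0(struct phy_port *p)   { p->link_up_id = true;  }
    static void on_cold_reset(struct phy_port *p) { p->link_up_id = false; }

    /* True when one side re-initialized while the other saw a possible
     * power glitch; the Link layer may act on this or ignore it. */
    static bool linkup_id_mismatch(bool local_id, bool remote_id)
    {
        return local_id != remote_id;
    }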
3.7.3 Physical Layer Clocking
3.7.3.1 Link Clock
The Physical layer operates in a link clock domain derived from the system reference clock. The Physical layer allows an implementation the flexibility to choose an appropriate means of deriving the link clock from the system reference clock, and hence is not involved in generating the link clock. However, all implementations are required to provide a LinkClockStable signal to the Physical layer, as shown in Figure 3-2. This signal is an indication that the link clock derived from the system reference clock is stable and that the Physical layer can start using the link clock for Physical layer initialization. The signal is used by the Physical layer only during initialization, and the Physical layer should not use it as an indicator of link clock stability thereafter: an unstable link clock would manifest as an unstable forwarded clock (Section 3.7.3.2), resulting in an Inband Reset (Section 3.7.5). Hence, no hooks are required by the Physical layer to monitor the stability of the link clock. The CSI Physical layer requires mesochronous clocks between connected ports, implying that the link clocks on ports connected over a link are required to have identical frequency (0 ppm) but may have a constant phase difference.

3.7.3.2 Forwarded and Received Clocks
Each CSI port sends its link clock to the remote port using a clock lane that is part of the CSI physical interface. The clock thus sent is referred to as the forwarded clock on the local port, and the received clock on the remote port. Connected transceiver pairs across the link use this forwarded/received clock as a common timing reference. The receiver portion of a port uses the received clock to strobe data. The synchronization between the received clock domain and the link clock domain at the receiver is implementation specific and is not a part of this specification.

3.7.3.3 Received Clock Status Indicator
Each port is required to continuously monitor the presence of its received clock, although the exact mechanism is implementation specific. A received clock that is deemed unusable by a port, even for 1 UI, should be treated as equivalent to a lost clock. A lost received clock should be interpreted as an Inband Reset, and a port losing its received clock should follow the Inband Reset sequence described in Section 3.7.5.

3.7.4 Cold Reset
Cold Reset is the equivalent of a Physical layer power-on reset. All Physical layer parameters and CSRs are set to power-on default values. The Physical layer starts internal calibration after receiving the LinkClockStable signal and initializes the link using the default Physical layer parameters. The Physical layer initialization sequence for Cold Reset is shown in Figure 3-2. The Cold Reset sequence is followed by the Physical layer at power on. Physical layer CSRs allow an option to force a Cold Reset to re-initialize the Physical layer, in which case all CSRs are reverted to their power-on default values prior to starting the initialization sequence.

3.7.5 Inband Reset
Inband Reset is a mechanism used by the Physical layer on the local port to communicate a reset event to the remote port; it is done by stopping the forwarded clock. Inband Reset is used by the Link layer to re-initialize the Physical layer if the former cannot recover from CRC errors beyond the retry threshold. Inband Reset is also used to configure the Physical layer by overriding power-on default values through Soft Reset (Section 3.7.6). Additionally, the Physical layer uses Inband Reset in the event of errors encountered during Physical layer initialization, which is an indication to either re-initialize the link or to abandon the initialization sequence. The Physical layer has an option of specifying a retry threshold, where Physical layer initialization is retried until this threshold is reached before abandoning the link initialization process; the Physical layer register interface provides details on initialization status and the number of initialization attempts.

Inband Reset largely follows the initialization sequence shown in Figure 3-2, with a few exceptions. The PhyInitBegin signal, in the case of Inband Reset, is self-generated by the Physical layer using the TINBAND_RESET_INIT timer shown in Table 3-22. By default, internal calibration is bypassed during Inband Reset, unless explicitly forced through CSRs. It should be noted that forcing calibration on one port does not affect the default behavior on the remote port: the remote port bypasses internal calibration unless the CSRs on the remote port are also configured accordingly. Physical layer initialization is practically not impacted if one port goes through the calibration phase and the other does not; any initialization skew thus created is negated when both ports synchronize in the Detect state.
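A minimal C sketch of the received-clock rule from Sections 3.7.3.3 and 3.7.5. The one-UI threshold is taken from the text; the monitor interface itself is an illustrative assumption, since the actual detection mechanism is implementation specific.

    /* A received clock unusable for even 1 UI is treated as lost, and a
     * lost received clock is interpreted as an Inband Reset. */
    #include <stdbool.h>

    /* clock_absent_ui: duration (in UI) for which the received clock has
     * been unusable, as measured by an implementation-specific monitor. */
    static bool inband_reset_detected(unsigned clock_absent_ui)
    {
        return clock_absent_ui >= 1;  /* then follow the Section 3.7.5 flow */
    }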
If calibration is bypassed, a port stays in the Disable/Start state (Section 3.9.3.1) for a period of TINBAND_RESET_INIT before self-generating a PhyInitBegin signal. If, on the other hand, calibration is performed, the internal counter starts after completing calibration, and a PhyInitBegin signal is generated TINBAND_RESET_INIT after calibration is completed.

Figure 3-3. Inband Reset Sequence Initiated by Port A to Port B. (Figure: Port A passes through clock-active states A1-A3, stops its forwarded clock (StopForwardedClock), enters Disable/Start at A4, and reaches Detect at A5 after >= TINBAND_RESET_INIT; Port B passes through clock-active states B1-B2, enters Disable/Start at B3 on losing the forwarded clock, and reaches Detect at B4 after >= TINBAND_RESET_INIT, with B5 the earliest point at which it detects Port A.)

Figure 3-3 shows an Inband Reset sequence where Port A sends an Inband Reset to Port B. The various events, A# and B#, shown on the vertical time axis are explained in Table 3-2; the different states mentioned in Figure 3-3 are described in Section 3.9.3.

Table 3-2. Inband Reset Events for Figure 3-3
A1: Port A is in a clock active state (a state other than Disable/Start or Detect.1). The forwarded clock is currently being transmitted and/or the received clock is being received.
B1: Port B is in a state other than Disable/Start or Detect.1. The forwarded clock is currently being transmitted and/or the received clock is being received.
A2: Port A sends an Inband Reset to Port B by stopping its forwarded clock. Simultaneously, Port A stops driving its data lanes as well, but the receive side on Port A continues to see the received clock and accepts incoming data.
B2: Port B is still in a clock active state, as the Inband Reset is in flight. It continues to send the forwarded clock (and data) to Port A.
A3: Same as A2; Port A continues to see the received clock.
B3: Port B loses the received clock and interprets this as an Inband Reset. It immediately stops driving its forwarded clock and data lanes and enters the Disable/Start state.
A4: Port A loses the received clock from Port B and uses this event as the handshake to the Inband Reset it initiated. Port A now enters the Disable/Start state.
B4: Port B waits for at least a period of TINBAND_RESET_INIT from the B3 event and then automatically generates an internal PhyInitBegin signal (see Figure 3-2), advancing Port B to the Detect.1 state. If internal calibration is bypassed, Port B waits for a period of TINBAND_RESET_INIT to generate the PhyInitBegin signal; if internal calibration is forced, Port B waits for a period of TINBAND_RESET_INIT after completing internal calibration to generate it.
A5: Port A waits for at least a period of TINBAND_RESET_INIT from the A4 event and then automatically generates an internal PhyInitBegin signal (see Figure 3-2), advancing Port A to the Detect.1 state. If internal calibration is bypassed, Port A waits for a period of TINBAND_RESET_INIT to generate the PhyInitBegin signal; if internal calibration is forced, Port A waits for a period of TINBAND_RESET_INIT after completing internal calibration to generate it. This is the earliest time Port A can detect Port B, and Port A resumes driving its forwarded clock when Port B is detected.
B5: This is the earliest time Port B detects Port A, and Port B resumes driving its forwarded clock when Port A is detected. Between B4 and B5, Port B continues to wait in Detect.1 for Port A.

Note: The parameter TINBAND_RESET_INIT is defined to be much longer than the link time of flight, so Port A is guaranteed to be in Disable/Start by the time Port B advances to Detect.1. This avoids any false detection of Port A (by Port B) while the Inband Reset is in flight.
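The B4/A5 timing rule in Table 3-2 can be written down as a one-line computation. This is an illustrative C sketch; the function name and the time representation are assumptions, while TINBAND_RESET_INIT itself is the Table 3-22 parameter referenced above.

    /* Earliest internal PhyInitBegin after the Inband Reset handshake
     * (events A4/B3 of Table 3-2): at least TINBAND_RESET_INIT after the
     * handshake, plus the calibration time when calibration is forced. */
    static long phyinitbegin_time(long handshake_event,    /* A4 or B3 */
                                  long calibration_time,   /* 0 if bypassed */
                                  long tinband_reset_init)
    {
        return handshake_event + calibration_time + tinband_reset_init;
    }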
3.7.6 Soft Reset
Soft Reset is a mechanism used by firmware, test tools, and possibly the Link layer to reset the Physical layer. A Soft Reset sequence involves optionally configuring link parameters at both ports and initializing the Physical layer at the local end of the link. The local port communicates the reset event to the remote port using an Inband Reset, and both local and remote ports then re-initialize the link using the parameters currently defined in CSRs. Soft Reset comes in two flavors: in the most common usage, software configures Physical layer parameters on both sides and starts a new link initialization sequence; the second flavor of Soft Reset is to force a Cold Reset of the Physical layer, where all Physical layer CSRs are reverted to power-on default values.

3.7.7 Two Stage Initialization
The Physical layer is initialized in two stages by using the basic reset modes described in Section 3.7.4 through Section 3.7.6. Two-stage initialization relies on firmware to program certain high-speed electrical parameters (equalization coefficients, for instance) which cannot be easily determined by hardware. The first stage of initialization is done at a CSI-mandated low frequency, which is defined to be one-fourth of the system reference clock frequency. For example, a platform using a 200 MHz system reference clock is required to perform the first stage of initialization using a 50 MHz forwarded clock, resulting in a link transfer rate of 100 MT/s. Once the link is initialized at low frequency, the high-speed electrical parameters are programmed in Physical layer registers by firmware, and the link is re-initialized (second initialization stage) using Soft Reset. The second stage of initialization is done at the link operational frequency. Two-stage initialization does not preclude a single stage initialization, where the link is initialized in one pass at the link operational frequency; forcing single stage initialization is described in Section 3.7.7.1.

Two-stage initialization can be viewed as repeating the initialization flow shown in Figure 3-2, first using a low frequency corresponding to one-fourth of the system reference clock, and then re-initializing at the link operational frequency. Some minor differences exist between the two stages, outlined below.
1. The calibrate phase shown in Figure 3-2 is bypassed for the second stage by default. However, firmware can force calibration in the second stage by configuring Physical layer registers prior to starting the second initialization stage.
2. The Physical layer specification does not impose an implementation style for deriving the low frequency forwarded clock used in the first stage of initialization, consistent with the philosophy of keeping clock generation external to the Physical layer. For instance, an implementation may choose to have two clock inputs to the Physical layer, with the Physical layer choosing a clock based on the initialization stage; such an implementation would internally operate at low frequency as well.
Conversely, a different implementation may choose to operate internally at the link operational frequency for the first stage also, but derive the forwarded clock by dividing the high frequency internal clock. This implementation would mimic low frequency operation on the interface by repeating each data bit N times, where

N = 4 × (link operational frequency) ÷ (system reference clock frequency)

3. Normal link operation and the second stage of initialization have aligned clock and data edges at the transmitting port. To permit low frequency operation, the transmitting port is required to shift the forwarded clock during the first stage of initialization such that the clock edges are centered w.r.t. the data at the transmitter output (a 90 degree shift of the forwarded clock in the first initialization stage compared to the second initialization stage or normal link operation).
4. The phase interpolator training pattern (Section 3.9.3.3.1), ...1010...., is transmitted such that the rising edge of the forwarded clock is centered w.r.t. a logic 0 sent on each data lane (the falling clock edge is centered w.r.t. a logic 1), as shown in Figure 3-4.

Figure 3-4. Relationship between Phase Interpolator Training Pattern and Forwarded Clock Phase during the First Initialization Stage. (Figure: DATA/DATA# and CLOCK/CLOCK# waveforms, with rising clock edges centered on logic 0 and falling edges centered on logic 1.)

Two-stage initialization is not required for subsequent re-initialization sequences, as the high-speed parameters are by then already known. To ensure interoperability, all implementations are required to perform two-stage initialization during Cold Reset. For any other kind of reset, the low frequency initialization is bypassed and the link is initialized in one pass at the link operational frequency, identically to the second stage of the two-stage initialization sequence.

3.7.7.1 Forcing Single Stage Initialization
The Physical layer provides a configuration hook, through the register interface, to force single stage initialization on Cold Reset. This feature is attractive in a test/debug environment, in which case the tester needs to program the appropriate register fields through an out-of-band interface (JTAG or ITP, for instance), ensuring the Physical layer registers are configured to initialize at high frequency. The Physical layer register interface provides the ability to either force or bypass calibration; for single stage initialization, the registers should be configured to force calibration. Single stage link initialization thus configured is identical to the second stage of two-stage initialization, with the exception that the calibrate phase shown in Figure 3-2 is not bypassed.
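The bit-repetition rule of item 2 above reduces to a small computation. A minimal C sketch follows; the frequency values and the worked example in the comments are illustrative, assuming "link operational frequency" denotes the operational forwarded clock frequency (the 200 MHz reference clock matches the example in Section 3.7.7).

    /* Low frequency mimicry for first-stage initialization: each data
     * bit is repeated N times, with
     *   N = 4 * f_link_operational / f_system_reference. */
    static int repeat_factor(int f_link_mhz, int f_ref_mhz)
    {
        return 4 * f_link_mhz / f_ref_mhz;
    }

    /* Illustrative example: a 2500 MHz operational forwarded clock with
     * a 200 MHz reference gives N = 50, so the pins toggle at the same
     * 100 MT/s (50 MHz forwarded clock) as the external-divider case. */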
3.7.8 Automatic Test Equipment (ATE) Initialization Mode
The Physical layer provides hooks to alter the initialization flow in a test/debug environment, as described in this section. A Physical layer control register can be used to configure a Device Under Test (DUT) to initialize the link in ATE mode. In ATE initialization mode, the DUT is required to exhibit a specific behavior in the Detect state of the initialization flow. Before proceeding further, the reader is advised to have a good understanding of the default link initialization sequence (Section 3.7.4 through Section 3.7.7) and of the link detect sequence described in Section 3.9.3.2. The following tables show the modified detect sequence in ATE initialization mode, which all CSI implementations (DUTs) are required to support when ATE initialization mode is chosen in the Physical layer control register.

Table 3-3 shows the Detect sequence when the ATE is acting as a transmitter and the DUT is acting as a receiver.

Table 3-3. ATE Initialization Mode - ATE Tx and DUT Rx

Detect.1:
• ATE Tx: asserts the PhyInitBegin signal illustrated in Figure 3-2, which is used by the DUT as an indication to advance to Detect.1. Does not check for the DUT's terminations; advances to Detect.2 after asserting PhyInitBegin.
• DUT Rx: turns on both clock and data terminations (only clock terminations are turned on in normal mode) upon seeing the PhyInitBegin signal; advances to Detect.2 after turning on terminations.

Detect.2:
• ATE Tx: starts sending the forwarded clock; stays in Detect.2 for a period of TDETECT.2 and advances to Detect.3. Does not check data terminations on the DUT; all DUT Rx are assumed functional, and any DUT Rx failures will be captured in the Config.1 state.
• DUT Rx: locks to the received clock; stays in Detect.2 for a period of TDETECT.2 and advances to Detect.3. (In normal mode, if the device times out in Detect.2, initialization is aborted. In ATE initialization mode the device advances to Detect.3 instead of aborting initialization; however, the device may still abort initialization on timeout if it cannot receive a stable received clock by the end of this time period.)

Detect.3 and beyond:
• ATE Tx: no change to the initialization sequence; follows the normal flow.
• DUT Rx: no change to the initialization sequence; follows the normal flow.

Table 3-4 shows the Detect sequence when the ATE is acting as a receiver and the DUT is acting as a transmitter.

Table 3-4. ATE Initialization Mode - ATE Rx and DUT Tx

Detect.1:
• ATE Rx: always has clock and data terminations turned on; advances to Detect.2 after asserting the PhyInitBegin signal illustrated in Figure 3-2.
• DUT Tx: enters Detect.1 upon seeing PhyInitBegin, and continuously monitors the ATE clock and data terminations in Detect.1, which are immediately detected (ATE terminations are always on). When clock termination is seen, the DUT enters Detect.2; the DUT ignores data terminations. (In normal mode, the device enters compliance mode when both clock and data terminations are detected in Detect.1; ATE initialization mode bypasses compliance mode.)

Detect.2:
• ATE Rx: locks to the received clock; stays in Detect.2 for a period of TDETECT.2 and advances to Detect.3.
• DUT Tx: starts sending the forwarded clock; stays in Detect.2 for a period of TDETECT.2 and advances to Detect.3. Does not check data terminations. (In normal mode, the device checks for data terminations, which are used to handshake received clock stability.)

Detect.3 and beyond:
• ATE Rx: no change to the initialization sequence; follows the normal flow.
• DUT Tx: no change to the initialization sequence; follows the normal flow.

Additionally, the ATE may also initialize the DUT using single stage initialization, as outlined in Section 3.7.7.1.

3.8 Interface Between Physical Layer and Link Layer
The example shown in Figure 3-5 illustrates the interface between the Physical layer and the Link layer, and is not intended to dictate an implementation style. Data between the Physical layer and the Link layer is exchanged at flit granularity, as indicated by the Tx and Rx datapaths between these two layers. The Physical layer transfers data on the link at phit granularity: a phit represents the data transferred in one Unit Interval (UI). The link transfer ratio, expressed as the number of phits per flit, is 4, 8 or 16 for a full width, half width or quarter width link, respectively.
Figure 3-5. Interface Between Physical Layer and Link Layer – An Example. (Figure: the Physical layer Tx and Rx blocks exchange flits with the Link layer over Tx/Rx datapaths gated by the PhyTxRdy/LinkTxRdy and PhyRxRdy/LinkRxRdy signals, exchange commands over the Cmd/Rsp interface, and exchange phits with the remote port's Rx and Tx.)

The Link layer and the Physical layer communicate commands over the Cmd/Rsp interface. The signals PhyTxRdy, PhyRxRdy, LinkTxRdy and LinkRxRdy are referred to as beats and control the data transfer between the Link layer and the Physical layer. The Physical layer asserts PhyRxRdy to transfer a flit to the Link layer, and asserts PhyTxRdy to accept a flit from the Link layer. Likewise, the Link layer asserts the LinkTxRdy and LinkRxRdy beats to transmit flits to and receive flits from the Physical layer. The Physical layer can control the link transfer ratio by controlling the PhyTxRdy and PhyRxRdy beats; for instance, the Physical layer lowers the beat rate when a full width link is configured as a half width link. The Physical layer is also allowed to introduce bubbles into the Link layer by temporarily halting these beats under some special circumstances, such as link re-initialization or link retraining (Section 3.9.7).

This specification of the Physical layer assumes that the Link layer continuously transmits/receives data to/from the Physical layer, implying that the Link layer is not allowed to introduce bubbles into the Physical layer. In case of inactivity, the Link layer is required to transmit Null Ctrl flits. An exception to this rule occurs right after Physical layer initialization, when the Physical layer is waiting for the Link layer to take control of the link: after Physical layer initialization is completed, the Physical layer transfers Null Ctrl flits on the link until the Link layer asserts the LinkTxRdy beat, at which point the Physical layer transfers control to the Link layer. During this time, the remote port (receive side) discards incoming data if its LinkRxRdy beat is turned off. The duration of Null Ctrl flits sent right after initialization is the time difference between the PhyInitComplete and HandoverToLinkLayer events shown in Figure 3-2, and is allowed to vary between implementations; the Physical layer does not depend on a specific value or range of values and can continue to send/receive Null Ctrl flits until the Link layer assumes control of the link.
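The beat handshake just described amounts to a pair of AND conditions. A minimal C sketch follows; the structure and function names are illustrative, not a specified interface.

    /* Flow control beats between the Physical and Link layers: a flit
     * moves toward the pins only when both PhyTxRdy and LinkTxRdy are
     * asserted, and toward the Link layer only when both PhyRxRdy and
     * LinkRxRdy are asserted. */
    #include <stdbool.h>

    struct beats {
        bool phy_tx_rdy, phy_rx_rdy;    /* driven by the Physical layer */
        bool link_tx_rdy, link_rx_rdy;  /* driven by the Link layer     */
    };

    static bool flit_accepted_for_tx(const struct beats *b)
    {
        return b->phy_tx_rdy && b->link_tx_rdy;
    }

    static bool flit_delivered_to_link(const struct beats *b)
    {
        return b->phy_rx_rdy && b->link_rx_rdy;
    }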
3.9 Logical Sub-Block
This section describes the features of the Physical layer logical sub-block, and explains the initialization sequence required to support these features and to initialize a link.

3.9.1 Supported Link Widths
The logical sub-block uses a base width of 20 lanes per link, referred to as the full width of the link. The logical sub-block also allows for a half width link with 10 lanes and a quarter width link with 5 lanes. Thus, a flit consisting of 80 bits is transmitted using 4 phits on a full width link, 8 phits on a half width link, and 16 phits on a quarter width link. To support multiple link widths, a link is logically partitioned into four quadrants, each quadrant consisting of 5 bits. The four quadrants representing a link are referred to as Q0, Q1, Q2 and Q3. A link of the desired width can be formed by using a combination of one or more quadrants: a full width link requires all four quadrants {Q3, Q2, Q1, Q0}, a half width link requires any two quadrants {Qy, Qx}, and a quarter width link requires any one quadrant {Qx}. A combination of muxing and bit swizzling is used to support these link widths at quadrant granularity. For interoperability, all CSI implementations are required to implement the mux and swizzle schemes described in this section.

Table 3-5 shows an 80-bit flit divided into 4 chunks. The top row, indicating the column number, corresponds to the bit position within each chunk. Refer to Chapter 4 on the Link layer for an explanation of the fields within a flit, shown in the last four rows of Table 3-5. The transmission of chunks and of bits within a chunk is required to follow a specific order to make effective use of the Link layer CRC burst error detection capabilities, and this transmission order depends on the link width in use.

Table 3-5. Flit Format
Column:  19  18  17  16  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
chunk 0: I68 I64 I60 I56 I52 I48 I44 I40 I36 I32 I28 I24 I20 I16 I12 I8  I4  I0  C4  C0
chunk 1: I69 I65 I61 I57 I53 I49 I45 I41 I37 I33 I29 I25 I21 I17 I13 I9  I5  I1  C5  C1
chunk 2: I70 I66 I62 I58 I54 I50 I46 I42 I38 I34 I30 I26 I22 I18 I14 I10 I6  I2  C6  C2
chunk 3: I71 I67 I63 I59 I55 I51 I47 I43 I39 I35 I31 I27 I23 I19 I15 I11 I7  I3  C7  C3

3.9.1.1 Muxing Scheme for Supporting Different Link Widths
The logical sub-block internally represents each bit using an ordered pair <q, o>, where 'q' is the quadrant number the bit belongs to and 'o' is the offset of the bit within quadrant 'q'. Thus, the highest bit in quadrant Q0 is represented as <0, 4> and the lowest bit in quadrant Q3 is represented as <3, 0>. The flit format and phit order for full-, half- and quarter width links are shown in Table 3-6, Table 3-7 and Table 3-8, respectively. All implementations are required to maintain the exact location of bits in these tables, to meet the CRC requirements and to maintain interoperability.

The flit format for a full width link, shown in Table 3-6, is similar to the one shown in Table 3-5, except for the notation used to indicate the fields within a flit. The top row in Table 3-6 shows the column number of a bit within each phit; the next four rows show the phits in the order they are transmitted. Comparing Table 3-6 to Table 3-5, fields of the flit are represented using a combination of chunk number and column number: the chunk number, preceding ':', corresponds to the chunk the bit belongs to in Table 3-5, and the number following ':' is a positional value indicating the column number of the bit. The last two rows in Table 3-6 show the internal representation of the bits using ordered pairs.

Table 3-6. Flit Format and Phit Order – Full Width Link
Column:     19   18   17   16   15   14   13   12   11   10    9    8    7    6    5    4    3    2    1    0
phit 0:   0:19 0:18 0:17 0:16 0:15 0:14 0:13 0:12 0:11 0:10  0:9  0:8  0:7  0:6  0:5  0:4  0:3  0:2  0:1  0:0
phit 1:   1:19 1:18 1:17 1:16 1:15 1:14 1:13 1:12 1:11 1:10  1:9  1:8  1:7  1:6  1:5  1:4  1:3  1:2  1:1  1:0
phit 2:   2:19 2:18 2:17 2:16 2:15 2:14 2:13 2:12 2:11 2:10  2:9  2:8  2:7  2:6  2:5  2:4  2:3  2:2  2:1  2:0
phit 3:   3:19 3:18 3:17 3:16 3:15 3:14 3:13 3:12 3:11 3:10  3:9  3:8  3:7  3:6  3:5  3:4  3:3  3:2  3:1  3:0
quadrant:    3    2    1    0    3    2    1    0    3    2    1    0    3    2    1    0    3    2    1    0
offset:      4    4    4    4    3    3    3    3    2    2    2    2    1    1    1    1    0    0    0    0
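The quadrant/offset rows of Table 3-6 define a fixed mapping between a column number and the ordered pair <q, o>. A minimal C sketch of that mapping follows; the function names are illustrative.

    /* Column number (0..19 within a chunk, Table 3-5) <-> ordered pair
     * <q, o> (quadrant 0..3, offset 0..4), per the last two rows of
     * Table 3-6: quadrant = column % 4, offset = column / 4. */
    struct ordered_pair { int q, o; };

    static struct ordered_pair column_to_pair(int column)   /* 0..19 */
    {
        struct ordered_pair p = { column % 4, column / 4 };
        return p;   /* e.g. column 16 -> <0, 4>, column 3 -> <3, 0> */
    }

    static int pair_to_column(struct ordered_pair p)
    {
        return 4 * p.o + p.q;
    }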
An implementation has the flexibility to designate quadrants of choice to support half- and quarter width links.

Table 3-7. Flit Format and Phit Order – Half Width Link
Column Number: 9 8 7 6 5 4 3 2 1 0
phit 0: 0:18 0:16 0:14 0:12 0:10 0:8 0:6 0:4 0:2 0:0
phit 1: 1:18 1:16 1:14 1:12 1:10 1:8 1:6 1:4 1:2 1:0
phit 2: 0:19 0:17 0:15 0:13 0:11 0:9 0:7 0:5 0:3 0:1
phit 3: 1:19 1:17 1:15 1:13 1:11 1:9 1:7 1:5 1:3 1:1
phit 4: 2:18 2:16 2:14 2:12 2:10 2:8 2:6 2:4 2:2 2:0
phit 5: 3:18 3:16 3:14 3:12 3:10 3:8 3:6 3:4 3:2 3:0
phit 6: 2:19 2:17 2:15 2:13 2:11 2:9 2:7 2:5 2:3 2:1
phit 7: 3:19 3:17 3:15 3:13 3:11 3:9 3:7 3:5 3:3 3:1
quadrant: y x y x y x y x y x
offset: 4 4 3 3 2 2 1 1 0 0

Table 3-8. Flit Format and Phit Order – Quarter Width Link
Column Number: 4 3 2 1 0
phit 0: 0:16 0:12 0:8 0:4 0:0
phit 1: 1:16 1:12 1:8 1:4 1:0
phit 2: 0:18 0:14 0:10 0:6 0:2
phit 3: 1:18 1:14 1:10 1:6 1:2
phit 4: 0:17 0:13 0:9 0:5 0:1
phit 5: 1:17 1:13 1:9 1:5 1:1
phit 6: 0:19 0:15 0:11 0:7 0:3
phit 7: 1:19 1:15 1:11 1:7 1:3
phit 8: 2:16 2:12 2:8 2:4 2:0
phit 9: 3:16 3:12 3:8 3:4 3:0
phit 10: 2:18 2:14 2:10 2:6 2:2
phit 11: 3:18 3:14 3:10 3:6 3:2
phit 12: 2:17 2:13 2:9 2:5 2:1
phit 13: 3:17 3:13 3:9 3:5 3:1
phit 14: 2:19 2:15 2:11 2:7 2:3
phit 15: 3:19 3:15 3:11 3:7 3:3
quadrant: x x x x x
offset: 4 3 2 1 0

The flit formats for different link widths, shown in Table 3-6, Table 3-7 and Table 3-8, have the following properties (a sketch of the resulting phit ordering follows this list).
1. Even chunks (0 and 2) are sent as even phits and odd chunks (1 and 3) are sent as odd phits. Chunks 0 and 1 are transmitted completely before transmitting chunks 2 and 3. For half- and quarter width links, this requires interleaving chunks 0 and 1, until they are transmitted completely, followed by interleaving chunks 2 and 3.
2. Once the chunk order is established for a given link width, the bits within a chunk are required to follow a specific order. For a full width link, all bits of a chunk are transmitted in one phit, and hence follow the order shown in Table 3-6.
3. A half width link transmits a flit as 8 phits by choosing every other column of a full width link shown in Table 3-6. Phit 0 transmits even columns of chunk 0 and phit 1 transmits even columns of chunk 1. The next 2 phits send the odd columns of chunks 0 and 1, respectively. Thus, the first 4 phits are used to completely transmit chunks 0 and 1. The next four phits are formed by repeating these steps using chunks 2 and 3. An implementation may choose any two arbitrary quadrants in half width mode. In this case, the quadrant with the lower value must transmit the bit with the lower column number.
4. A quarter width link transmits a flit using 16 phits, each consisting of 5 bits. These 5 bits are formed by taking each row of a half width link shown in Table 3-7 and transmitting every other bit. Referring to the full width flit format in Table 3-6, phit 0 transmits 5 bits from chunk 0, starting with column 0 and transmitting every 4th column (columns 0, 4, 8, 12 and 16). Phit 1 transmits 5 bits from chunk 1, starting with column 0 and transmitting every 4th column. The next 6 phits interleave chunks 0 and 1, transmitting 5 bits per chunk starting with columns 2, 1 and 3 in that order, and selecting every 4th column. Thus, the first 8 phits completely transmit chunks 0 and 1. The next 8 phits are formed in an identical fashion using chunks 2 and 3.
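The half width ordering of properties 1-3 can be reproduced mechanically. The sketch below regenerates the (chunk:column) sequence of Table 3-7; the chunk/parity sequences are my restatement of the interleaving rules, not tables from the specification.

```c
#include <stdio.h>

/* Print the (chunk:column) transmission order of a half width link:
 * even columns of chunks 0 and 1 first, then their odd columns, then the
 * same pattern for chunks 2 and 3 (compare with Table 3-7). */
static void half_width_order(void)
{
    static const int chunk_seq[8]  = {0, 1, 0, 1, 2, 3, 2, 3};
    static const int parity_seq[8] = {0, 0, 1, 1, 0, 0, 1, 1}; /* even/odd columns */

    for (int phit = 0; phit < 8; phit++) {
        printf("phit %d:", phit);
        for (int col = 18 + parity_seq[phit]; col >= 0; col -= 2)
            printf(" %d:%d", chunk_seq[phit], col);
        printf("\n");
    }
}

int main(void) { half_width_order(); return 0; }
```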
The muxing scheme to satisfy the above flit transmission properties is shown in Figure 3-6. Each chunk is divided into 5 nibbles. A bit with an offset ‘k’ in any quadrant always muxes into one of the 4 bits of nibble ‘k’, using nibble muxes. The chunk mux is used to interleave even and odd chunks. Muxes are shown only for bits with offset ‘0’ in each quadrant. The remaining bits use a similar muxing scheme and hence are abstracted using dotted arrows. The link transmission properties enumerated above are explained below using nibble 0 of Figure 3-6 as an example.

[Figure 3-6. Mux Scheme for Link Width Support: each 20-bit chunk is divided into five 4-bit nibbles, where nibble ‘k’ holds the four ordered pairs <q, k>; a chunk mux interleaves the even and odd chunks, and nibble muxes select the bits onto the transmitted phits.]

For a full width link, mux input selection is straightforward. Columns 0, 1, 2 and 3 of nibble 0 are connected to the bits with offset 0 in each quadrant. The entire nibble 0 of chunk 0 is transmitted as phit 0 and the entire nibble 0 of chunk 1 is transmitted as phit 1. This step is repeated with chunks 2 and 3 for the next two phits.

In half width mode using quadrants {Qy, Qx}, nibble 0 of chunks 0 and 1 is transmitted during the first 4 phits, using bits <x, 0> and <y, 0>. In phit 0, Qx and Qy transmit columns 0 and 2 of chunk 0, respectively. In phit 1, Qx and Qy switch to chunk 1 and transmit columns 0 and 2, respectively. This process is repeated with columns 1 and 3 of chunks 0 and 1 for a total of 4 phits. The next 4 phits follow the same steps, but replace chunks 0 and 1 with chunks 2 and 3, respectively. For a chosen quadrant pair {Qy, Qx}, it is required that x is less than y. For instance, if quadrants {Q1, Q0} are used to form a half width link, Q0 sends columns 0 and 1 of each chunk in successive phits and Q1 sends columns 2 and 3 (in successive phits). Similarly, if quadrants {Q2, Q1} are used to form a half width link, Q1 sends columns 0 and 1 of each chunk and Q2 sends columns 2 and 3.

In quarter width mode using a quadrant Qx, it takes a total of 8 phits to transmit nibble 0 of chunks 0 and 1 using bit <x, 0>. Column 0 of chunk 0 is transmitted in phit 0 and column 0 of chunk 1 is transmitted in phit 1. This process is repeated 3 more times, using columns 2, 1 and 3 while interleaving the two chunks, for a total of 8 phits. The remaining 8 phits follow a similar sequence by replacing chunks 0 and 1 with chunks 2 and 3, respectively.

The mux scheme discussed thus far applies to the transmit side of a CSI port. The receiver at the remote port is required to implement a de-mux scheme that does the exact opposite of the mux scheme described here. For interoperability, all CSI implementations are required to implement the mux scheme described in this specification.

3.9.1.2 Swizzling Function for Connecting Logical Bits to Physical Lanes

The previous section described the mux scheme used to support different link widths. To take advantage of nibble muxing, quadrants are interleaved as shown in the last two rows of Table 3-6 through Table 3-8. As such, a half- or quarter width link does not use contiguous physical lanes to transmit a phit across the link.
This discontinuity in physical lanes is addressed through a swizzling scheme which maps the columns in Table 3-6 through Table 3-8 onto physical pins at the interface, such that a quadrant is connected to a contiguous set of pins. A bit swizzling layer is introduced between the internal logic and the physical lanes (physical pins) to force quadrants onto contiguous lanes, as shown in Figure 3-7. Bit swizzling is accomplished through on-die hard wiring, and hence does not require additional logic. A bit represented internally using an ordered pair is mapped onto a physical lane ‘n’ using the swizzling equation shown below.

  n = (NL/4)*(1+q) - o - 1, for q < 2
  n = (NL/4)*(5-q) + o,     for q >= 2

where ‘n’ is the lane number (0 through NL-1), ‘NL’ is the number of lanes in a full width link (20 for the current CSI specification), ‘q’ is the quadrant number (0 through 3), and ‘o’ is the bit offset within quadrant ‘q’ (0 through 4).

The bit swizzling scheme described above applies to the transmit side of a CSI port. The receive side at the remote port is required to implement a de-swizzling scheme that does the exact opposite of this swizzle scheme. For interoperability, all CSI implementations are required to implement the swizzle scheme described in this specification. Note the order of quadrants after swizzling, shown as Swizzled Ordered Pair in Figure 3-7 - swizzling does not result in a sequential quadrant ordering at the physical pins.

[Figure 3-7. Physical Bit Swizzling: the 20 ordered pairs <0, 0> through <3, 4> are wired onto physical pins 0 through 19 so that pins 0-4 carry quadrant Q0, pins 5-9 quadrant Q1, pins 10-14 quadrant Q3 and pins 15-19 quadrant Q2; the swizzled ordered pair sequence at the pins is not in sequential quadrant order.]

The clock lane is required to be in the center of the physical interface, between pin 9 and pin 10, as shown in Table 3-9. The ordered pair representation, using quadrant number and lane offset, is also shown for each pin. Note that the clock pin is not assigned to any quadrant, as this lane is transparent to the mux and swizzle logic described earlier.

Table 3-9. Physical Pin Numbering and Clock Position on a Link with 20 Lanes
Physical Pin: 19 18 17 16 15 14 13 12 11 10 CLK 9 8 7 6 5 4 3 2 1 0
Quadrant:     2 2 2 2 2 3 3 3 3 3 N/A 1 1 1 1 1 0 0 0 0 0
Offset:       4 3 2 1 0 4 3 2 1 0 N/A 0 1 2 3 4 0 1 2 3 4
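The swizzling equation above is directly executable. The sketch below evaluates it for NL = 20 and spot-checks the result against the quadrant groupings of Table 3-9; the function name is hypothetical.

```c
#include <assert.h>

/* Physical lane for the logical bit <q, o>, per the swizzling equation
 * of Section 3.9.1.2 (NL = 20 for a full width link). */
static int swizzle_lane(int q, int o, int nl)
{
    return (q < 2) ? (nl / 4) * (1 + q) - o - 1
                   : (nl / 4) * (5 - q) + o;
}

int main(void)
{
    /* Spot checks against Table 3-9: pins 0-4 carry quadrant Q0, pins 5-9
     * carry Q1, pins 10-14 carry Q3 and pins 15-19 carry Q2. */
    assert(swizzle_lane(0, 0, 20) == 4);   /* <0, 0> -> pin 4  */
    assert(swizzle_lane(1, 4, 20) == 5);   /* <1, 4> -> pin 5  */
    assert(swizzle_lane(3, 0, 20) == 10);  /* <3, 0> -> pin 10 */
    assert(swizzle_lane(2, 4, 20) == 19);  /* <2, 4> -> pin 19 */
    return 0;
}
```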
3.9.1.3 Link Map and Width Capability Indicator

A link formed using a combination of any of the 4 logical quadrants, Q0 through Q3, is internally represented using a 4-bit field called a Link Map (LM). The LSB of the LM corresponds to quadrant Q0 and the MSB corresponds to quadrant Q3. A value of 1 for a bit position in the LM indicates that the corresponding quadrant is active, and a value of 0 indicates that the corresponding quadrant is not a part of the link. Table 3-10 shows the Link Map for the link widths supported using all possible quadrant combinations.

Table 3-10. Link Map for Supported Link Widths
Link Width | Quadrants Used | Link Map | Link Map Index
Full Width | {Q3, Q2, Q1, Q0} | 1111 | 0
Half Width | {Q1, Q0} | 0011 | 1
Half Width | {Q2, Q0} | 0101 | 2
Half Width | {Q3, Q0} | 1001 | 3
Half Width | {Q2, Q1} | 0110 | 4
Half Width | {Q3, Q1} | 1010 | 5
Half Width | {Q3, Q2} | 1100 | 6
Quarter Width | {Q0} | 0001 | 7
Quarter Width | {Q1} | 0010 | 8
Quarter Width | {Q2} | 0100 | 9
Quarter Width | {Q3} | 1000 | 10

As shown in Table 3-10, there are eleven possible ways of forming a valid link - a unique combination of quadrants to form a full width link, six possible quadrant combinations to form a half width link and four possible ways to form a quarter width link. The last column in Table 3-10 is used to index a Link Map, and should be consistent across all CSI implementations. An implementation is not required to support all eleven possible combinations. The initialization algorithm allows for such flexibility and chooses a Link Map Index that is common to both ports. The Link Maps supported by an implementation are represented using an 11-bit field called the Width Capability Indicator (WCI). Each bit in the WCI corresponds to one of the indices shown in the Link Map Index column of Table 3-10. Thus, bit 0 of the WCI corresponds to index 0, bit 1 of the WCI corresponds to index 1 and so on. A value of 1 for a WCI bit indicates that the LM corresponding to this index can be used to form a link width. During link initialization, ports exchange their corresponding WCI, which is implementation specific, and agree on an LM that is common to both ports. The LM thus agreed upon is referred to as the Common Link Map (CLM). The order of precedence for selecting a CLM is from the lowest bit to the highest bit in the WCI. For instance, if two ports supporting all LMs in Table 3-10 are configured to form a half width link, they will use {Q1, Q0} to form the link, as this quadrant combination has a lower bit position in the WCI compared to all other half width quadrant combinations.

Table 3-11 shows a few example implementations with widely varying link width support capabilities. The WCI fields for each of these examples are also shown. For instance, if two implementations shown in Example 1 were configured to form a half width link, they will use quadrants {Q1, Q0}, as this quadrant combination takes precedence over the other half width quadrant combinations. Likewise, if the implementations shown in Examples 1 and 2 are connected to form a half width link, they will use quadrants {Q3, Q2}, as this is the only common quadrant combination that can support a half width link. Conversely, if the implementations shown in Examples 1 and 3 are connected together and configured to form a half width link, a link initialization error occurs, as these implementations do not have a common LM to support a half width link.

Table 3-11. Examples of Width Capability Indicator (WCI)
Example | Link Widths Supported | WCI (bits 10..0)
1 | Full, half and quarter width using all possible quadrant combinations | 1 1 1 1 1 1 1 1 1 1 1
2 | Full width; half width using only quadrants Q3 and Q2; quarter width using quadrant Q3 only | 1 0 0 0 1 0 0 0 0 0 1
3 | Full width support only | 0 0 0 0 0 0 0 0 0 0 1
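The CLM selection rule (lowest common set bit of the exchanged WCIs wins) is easy to capture in code. This is a minimal sketch; the 'allowed' mask restricting the search to the indices valid for the width being formed is my addition for illustration, and the function name is hypothetical.

```c
/* Select a Common Link Map (CLM) index from two exchanged WCI values.
 * Per Section 3.9.1.3, precedence runs from the lowest WCI bit to the
 * highest, so the lowest set bit common to both ports wins. */
static int select_clm(unsigned wci_local, unsigned wci_remote, unsigned allowed)
{
    unsigned common = wci_local & wci_remote & allowed & 0x7FFu; /* 11 LM indices */

    if (common == 0)
        return -1;                       /* no common LM: link initialization error */
    for (int idx = 0; idx <= 10; idx++)
        if (common & (1u << idx))
            return idx;                  /* lowest index has precedence */
    return -1;                           /* unreachable */
}
```

With the half width indices 1-6 as the 'allowed' mask, two Example 1 ports yield index 1 ({Q1, Q0}), and Example 1 paired with Example 2 yields index 6 ({Q3, Q2}), matching the cases discussed above.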
3.9.1.4 Virtual Lanes – UP Profile

Some CSI components may optimize link power consumption by turning-off electrical I/O circuits on lanes carrying implicit information. The internal logic still operates using 20/10/5 bits for full/half/quarter width links. Logic bits connected to disabled physical lanes are assigned a virtual lane attribute to recover the implicit information at the receiving port. The transmitting port continues to drive these implicit bits, but these bits are not transmitted across the link as they are connected to disabled lanes. The receiving port populates the bits tagged with the virtual lane property in each phit with the corresponding implicit value before forwarding a flit to the Link layer.

During Physical layer initialization, the transmit side of a port tells the remote port the lanes to be virtualized and their implicit value. The remote port can turn-off the receiver circuitry connected to these virtual lanes and stuffs these bit positions with the implicit value received from the transmitter during initialization, before forwarding a flit to the Link layer. For example, some implementations might not use the Profile Dependent Fields or the Interleave Bit occupying columns 18 and 19 of Table 3-6, in which case these columns always have a value of 0. Refer to Chapter 4 on the Link layer for a description of these fields. Fields in these columns are referred to as sideband signals in this chapter. Thus, lanes carrying sideband signals can be turned-off, and a receiving port can reconstruct this information by tagging the corresponding logic bits with the virtual lane property, and populating a value of 0 in these bit positions before forwarding an 80-bit flit to the Link layer.

3.9.1.5 Forming an 18-Bit Wide Link using a 20-Bit Wide Link – UP Profile

CSI implementations (in the UP profile) have 20 physical pins on a full width link but may take advantage of lane virtualization (Section 3.9.1.4) by disabling either CRC or sideband signals. This results in a full width link with 18 active lanes across all four quadrants. A half width link has 9 active lanes in two quadrants, and a quarter width link consists of 5 active lanes in a quadrant. Thus, these configurations have 2 virtual lanes on a full width link and 1 virtual lane on a half width link. CRC bits occupy columns 0 and 1 of each chunk, shown in Table 3-5, and sideband signals occupy columns 18 and 19. The positions of these fields in full-, half- and quarter width links are shown in Table 3-12, Table 3-13 and Table 3-14, respectively. For example, on a platform configured to disable sideband signals on a full width link, the receiving port automatically generates the implicit value for sideband signals on logic bits <2, 4> and <3, 4> (ordered pair representation) of each phit before forwarding an 80-bit flit to the Link layer. Similarly, on a half width link formed using quadrants {Qy, Qx}, the receiving port internally generates the implicit value for the sideband logic bit (<2, 4> or <3, 4>) of each phit, before forwarding an 80-bit flit to the Link layer. No action is required by the receiving port in quarter width mode consisting of 5 lanes, as the implicit values are transmitted across the link.

Table 3-12. CRC and Side-band Fields – Full Width Link
Column Number: 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
phit 0: 0:19 0:18 0:17 0:16 0:15 0:14 0:13 0:12 0:11 0:10 0:9 0:8 0:7 0:6 0:5 0:4 0:3 0:2 0:1 0:0
phit 1: 1:19 1:18 1:17 1:16 1:15 1:14 1:13 1:12 1:11 1:10 1:9 1:8 1:7 1:6 1:5 1:4 1:3 1:2 1:1 1:0
phit 2: 2:19 2:18 2:17 2:16 2:15 2:14 2:13 2:12 2:11 2:10 2:9 2:8 2:7 2:6 2:5 2:4 2:3 2:2 2:1 2:0
phit 3: 3:19 3:18 3:17 3:16 3:15 3:14 3:13 3:12 3:11 3:10 3:9 3:8 3:7 3:6 3:5 3:4 3:3 3:2 3:1 3:0
quadrant: 3 2 1 0 3 2 1 0 3 2 1 0 3 2 1 0 3 2 1 0
offset: 4 4 4 4 3 3 3 3 2 2 2 2 1 1 1 1 0 0 0 0
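The receive-side reconstruction of virtualized sideband bits reduces to stuffing a known value into fixed columns of each chunk before handing the flit up. A minimal sketch, assuming a chunk is held in the low 20 bits of a word with bit index equal to column number:

```c
#include <stdint.h>

/* Stuff the implicit value 0 into the sideband columns (18 and 19 of each
 * chunk, per Section 3.9.1.5) of an 80-bit flit held as 4 x 20-bit chunks,
 * before forwarding the flit to the Link layer. Illustrative only. */
void stuff_sideband(uint32_t chunk[4])
{
    for (int c = 0; c < 4; c++)
        chunk[c] &= ~((1u << 18) | (1u << 19));
}
```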
Table 3-13. CRC and Side-band Fields – Half Width Link
Column Number: 9 8 7 6 5 4 3 2 1 0
phit 0: 0:18 0:16 0:14 0:12 0:10 0:8 0:6 0:4 0:2 0:0
phit 1: 1:18 1:16 1:14 1:12 1:10 1:8 1:6 1:4 1:2 1:0
phit 2: 0:19 0:17 0:15 0:13 0:11 0:9 0:7 0:5 0:3 0:1
phit 3: 1:19 1:17 1:15 1:13 1:11 1:9 1:7 1:5 1:3 1:1
phit 4: 2:18 2:16 2:14 2:12 2:10 2:8 2:6 2:4 2:2 2:0
phit 5: 3:18 3:16 3:14 3:12 3:10 3:8 3:6 3:4 3:2 3:0
phit 6: 2:19 2:17 2:15 2:13 2:11 2:9 2:7 2:5 2:3 2:1
phit 7: 3:19 3:17 3:15 3:13 3:11 3:9 3:7 3:5 3:3 3:1
quadrant: y x y x y x y x y x
offset: 4 4 3 3 2 2 1 1 0 0

Table 3-14. CRC and Side-band Fields – Quarter Width Link
Column Number: 4 3 2 1 0
phit 0: 0:16 0:12 0:8 0:4 0:0
phit 1: 1:16 1:12 1:8 1:4 1:0
phit 2: 0:18 0:14 0:10 0:6 0:2
phit 3: 1:18 1:14 1:10 1:6 1:2
phit 4: 0:17 0:13 0:9 0:5 0:1
phit 5: 1:17 1:13 1:9 1:5 1:1
phit 6: 0:19 0:15 0:11 0:7 0:3
phit 7: 1:19 1:15 1:11 1:7 1:3
phit 8: 2:16 2:12 2:8 2:4 2:0
phit 9: 3:16 3:12 3:8 3:4 3:0
phit 10: 2:18 2:14 2:10 2:6 2:2
phit 11: 3:18 3:14 3:10 3:6 3:2
phit 12: 2:17 2:13 2:9 2:5 2:1
phit 13: 3:17 3:13 3:9 3:5 3:1
phit 14: 2:19 2:15 2:11 2:7 2:3
phit 15: 3:19 3:15 3:11 3:7 3:3
quadrant: x x x x x
offset: 4 3 2 1 0

3.9.1.6 Narrow Physical Interfaces (Optional) – UP Profile

Some CSI implementations may take advantage of lane virtualization to reduce both link power and pin count. These implementations do not instantiate the physical pins corresponding to either one or both sideband signals. Thus, implementations can have a physical interface with either 19 or 18 physical pins. However, the internal logic still operates using a base width of 20 bits to represent the physical interface. A 19 pin physical interface has two variants, depending on the depopulated sideband signal pin. In one case, the pin corresponding to the higher sideband signal (column 19 of each chunk in Table 3-5) is depopulated, and in the other, the pin corresponding to the lower sideband signal (column 18 of each chunk in Table 3-5) is depopulated. These two 19 pin variants are not interchangeable, implying that a link with 19 active lanes cannot be established by connecting a port depopulating the higher sideband pin to a port depopulating the lower sideband pin. Table 3-15 shows the physical pins depopulated on narrow physical interfaces. Table 3-16 shows the complete pin map for narrow interfaces and the corresponding ordered pair representation for the physical pins. Having fewer than 20 pins does not renumber the physical pin numbers. For instance, an implementation with a depopulated higher sideband signal is required to have physical pins from 0 through 13 and from 15 through 19 - pin 14 is considered missing, and hence the pins are not renumbered from 0 through 18.

Table 3-15. Pins Depopulated on Narrow Physical Interfaces
Configuration | Depopulated Physical Pin #s
Missing Higher Sideband Signal | 14
Missing Lower Sideband Signal | 19
Missing Both Sideband Signals | 14 and 19
Table 3-16. Narrow Physical Interface - Pin Map and Internal Representation
Narrow interface, higher sideband depopulated: 19 18 17 16 15 X 13 12 11 10 CLK 9 8 7 6 5 4 3 2 1 0
Narrow interface, lower sideband depopulated:  X 18 17 16 15 14 13 12 11 10 CLK 9 8 7 6 5 4 3 2 1 0
Narrow interface, both sidebands depopulated:  X 18 17 16 15 X 13 12 11 10 CLK 9 8 7 6 5 4 3 2 1 0
Quadrant: 2 2 2 2 2 3 3 3 3 3 N/A 1 1 1 1 1 0 0 0 0 0
Offset:   4 3 2 1 0 4 3 2 1 0 N/A 0 1 2 3 4 0 1 2 3 4

Narrow physical interfaces adhere to the mux and swizzle schemes described in Section 3.9.1.1 and Section 3.9.1.2, respectively. Although the physical interface has fewer than 20 pins, the swizzling function described in Section 3.9.1.2 still uses a value of 20 for NL. These implementations can achieve further power reductions by virtualizing CRC bits independently of the sideband signals. Hence, these implementations can support links with 16, 17, 18 or 19 active lanes. Table 3-17 summarizes the number of active lanes for each of these narrow link width options, in full-, half- and quarter width modes.

Table 3-17. Summary of Narrow Physical Interfaces
Number of Lanes (Full / Half / Quarter) | Configuration Notes
16 / 8 / 5 | CRC and both sideband signals disabled. Configured by virtualizing either an 18 pin interface or one of the two 19 pin interface variants.
17 / 9 / 5 | CRC and one sideband signal disabled. Configured by virtualizing either 19 pin interface variant.
18 / 9 / 5 | Both sideband signals disabled. An 18 pin interface, or configured by virtualizing either 19 pin interface variant.
19 / 10 / 5 | One sideband signal disabled. Either 19 pin interface; no lane virtualized.

A link with 16-19 lanes can be formed by connecting a narrow physical interface to a full physical interface with 20 pins. In this case, the 20 pin implementation is required to support lane virtualization, and is required to form a half width link using quadrants {Q1, Q0}, and a quarter width link using either {Q1} or {Q0}. The 20 pin part may choose to support other quadrant combinations. Pins on the full width physical interface corresponding to missing pins on the narrow physical interface should not be used to form a link. Thus, a link formed between an 18 pin narrow interface and a 20 pin full width interface would not use pins 14 and 19 on the latter. Unused pins on the full width interface may be left unconnected or may be hard wired to either Vcc or Vss, as required by the implementation. This specification does not require unused pins to have known logic values.

3.9.1.7 Designing a Half Width Link (Optional) – UP Profile

An implementation can be designed for a half width link with 10 physical pins. These implementations also support a quarter width link consisting of 5 lanes, but do not support lane virtualization. An implementation designed for a half width link is required to follow the mux and swizzle schemes described in Section 3.9.1.1 and Section 3.9.1.2, respectively. This implementation has only two quadrants, which are numbered Q0 and Q1, and the physical pins should be numbered from 0 through 9. The swizzling function still uses a value of 20 for NL, but only the top half of the swizzling equation applies to half width link implementations. The clock lane should be at one end of the physical interface, adjacent to pin number 9, as shown in Table 3-18. Physical pins corresponding to quadrants Q3 and Q2 do not exist for implementations designed for a half width link.
Table 3-18. Physical Pin Numbering and Clock Position on a Link with 10 Lanes
Physical Pin: [non-existent pins for Q2/Q3] CLK 9 8 7 6 5 4 3 2 1 0
Quadrant:     2 2 2 2 2 3 3 3 3 3 N/A 1 1 1 1 1 0 0 0 0 0
Offset:       4 3 2 1 0 4 3 2 1 0 N/A 0 1 2 3 4 0 1 2 3 4

Half width link implementations are interoperable with a full width link implementation consisting of 20 physical pins. However, a link can be formed only by connecting physical pins 0 through 9 on the half width link implementation to physical pins 0 through 9, respectively, on the full width link implementation. In this configuration, the full width link implementation is required to support, at a minimum, a half width link using quadrants {Q1, Q0}, and a quarter width link using either {Q1} or {Q0}. The full width implementation may choose to support other quadrant combinations. Unused pins on the full width link implementation may be left unconnected or may be hard wired to Vcc or Vss, as required by the implementation. This specification does not require unused pins to have known logic values.

3.9.1.8 Port Bifurcation (Optional) – Small and Large MP Profiles

Some CSI components may optionally implement a port bifurcation feature, where a full width link can operate as two independent half width links. An implementation supporting port bifurcation is required to have two clock lanes, at the center of the pin field, as shown in Table 3-19.

Table 3-19. Pin Map for Implementations Supporting Port Bifurcation
Physical Pin: 19 18 17 16 15 14 13 12 11 10 CLK2 CLK1 9 8 7 6 5 4 3 2 1 0
Quadrant:     2 2 2 2 2 3 3 3 3 3 - - 1 1 1 1 1 0 0 0 0 0
Offset:       4 3 2 1 0 4 3 2 1 0 - - 0 1 2 3 4 0 1 2 3 4

Port bifurcation is a static configuration, which is set prior to link initialization through pin straps or other means. An implementation supporting port bifurcation should also have the capability to operate as a single full width link, in which case either CLK1 or CLK2 can be designated as the primary clock lane. An implementation may leave the unused clock pin unconnected or may hard wire the unused clock pin to either Vcc or Vss, as required by the implementation. This specification does not require the unused clock lane to be in a specific state.

A bifurcated port is required to follow the mux and swizzle schemes described in Section 3.9.1.1 and Section 3.9.1.2, respectively. A bifurcated port should maintain the same physical pin numbers and quadrant numbers as an otherwise full width port. This implies that one half of a bifurcated port will have physical pin numbers 0 through 9, corresponding to quadrants Q0 and Q1, and the other half will have physical pin numbers 10 through 19, corresponding to quadrants Q3 and Q2. Because the quadrant numbers and physical pin numbers are identical to those of a full width port, the swizzling function for a bifurcated port is identical to that of a single full width port. Of note, even though a port is bifurcated, swizzling is accomplished on all lanes, spanning both ports, using a value of 20 for NL. Each half of a bifurcated port can be connected to a non-bifurcated full width port, forming a half width link. The physical pin numbers on the ports across each lane are required to be identical. Thus, one half of a bifurcated port connects to a non-bifurcated full width port using pins 0 through 9 on both ports, and the other half of a bifurcated port connects to a non-bifurcated full width port using pins 10 through 19 on both ports.
It is not permissible to form a half width link between a bifurcated port and a non-bifurcated port using pins 0 through 9 on one port and pins 10 through 19 on the other. Unused pins on the full width port may be left unconnected or may be hard wired to Vcc or Vss, as required by the implementation. This specification does not require unused lanes to have known logic values. A full width link implementation supporting a half width link connection to a bifurcated port should, at a minimum, support link widths using the following quadrant combinations:
1. Half width link using {Q1, Q0} and {Q3, Q2}.
2. Quarter width link using either {Q1} or {Q0}.
3. Quarter width link using either {Q3} or {Q2}.

3.9.2 Link Training Basics

A pair of connected ports, after detecting each other, interactively train through a series of states to form a link. Ports advance these training states using a handshake mechanism which, simply stated, involves each port indicating its ability to advance states. A state transition occurs when a port indicates its ability to advance states and receives a similar indication from the remote port. The indication sent by the local port is called a local ACK, and the indication received from the remote port is called a remote ACK.

3.9.2.1 Training Sequences (TSx)

A link is established through a series of training states. Each training state has a training sequence that is unique to that training state. Table 3-20 shows a generic TSx format. TSx are transmitted serially on each lane, starting with the LSB. This specification uses a little-endian convention to represent TSx. Thus, the first TSx bit transmitted corresponds to the LSB of the Header field.

Table 3-20. Training Sequence (TSx) Format
Byte | Field | Description
0 | Header | A unique signature for a given training sequence, used by a receiving port to detect a training sequence.
1 | ACK Field | Handshake field used for advancing training states.
2 - 7 | Payload | A training state specific field. Can have multiple subfields of varying lengths. Values of these subfields can be static or dynamic. Examples include Link-up Identifier, Loop Back control, width capabilities, etc.

3.9.2.2 Link Handshake Mechanism

Assumption based state transitions are difficult to validate and debug, and hence ports use a handshake mechanism to minimize assumption based state transitions. Within a given training state, each port sets attributes based on the intermediate training status and uses these attributes to advance to the next state. The handshake involves tracking these attributes and using training sequences to signal to the remote port when the local port is ready to advance to the next state. The handshake attributes are outlined in Table 3-21.

Table 3-21. Summary of Handshake Attributes
Handshake Attribute | Conditions | Attribute Scope | Actions
RxReady | Rx received at least two consecutive TSx patterns and completed processing any one of these TSx patterns. | Each Rx | None.
LocalPortReady | All local Rx have the RxReady attribute set. | Entire port | Set the ACK field in the outbound TSx training sequence of all Tx.
Local ACK | LocalPortReady attribute set. | Entire port | Compile local port information for transmitting on outbound TSx. Local ACK can vary from 1 to 8 bits in length, depending on the TSx.
RemotePortReady | A local Rx has received two consecutive TSx patterns with ACK. | Each Rx | None.
Remote ACK | At least one local Rx has the RemotePortReady attribute. | Entire port | None.
Advance | Remote ACK attribute set and at least 4 TSx with local ACK have been sent. | Entire port | Advance to the next state.
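The attribute relationships in Table 3-21 compose into a simple advance condition. The sketch below is an illustrative restatement of that table, not the specification's state machine; the struct and function names, and the fixed lane count, are assumptions.

```c
#include <stdbool.h>

#define NUM_LANES 20

struct port_state {
    bool rx_ready[NUM_LANES];          /* two consecutive TSx received and processed */
    bool remote_port_ready[NUM_LANES]; /* two consecutive TSx with ACK received */
    int  tsx_sent_with_local_ack;      /* outbound TSx carrying the local ACK */
};

/* LocalPortReady: every local Rx must have the RxReady attribute. */
static bool local_port_ready(const struct port_state *p)
{
    for (int i = 0; i < NUM_LANES; i++)
        if (!p->rx_ready[i])
            return false;
    return true;
}

/* Remote ACK: at least one local Rx has the RemotePortReady attribute. */
static bool remote_ack(const struct port_state *p)
{
    for (int i = 0; i < NUM_LANES; i++)
        if (p->remote_port_ready[i])
            return true;
    return false;
}

/* Advance: Remote ACK set and >= 4 TSx with local ACK sent (local ACK is
 * only transmitted once LocalPortReady holds). */
static bool may_advance_state(const struct port_state *p)
{
    return local_port_ready(p) && remote_ack(p)
        && p->tsx_sent_with_local_ack >= 4;
}
```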
The sequence of steps involved in setting the handshake attributes is shown in Figure 3-8.

[Figure 3-8. Sequence of Events for Acquiring Handshake Attributes: local and remote ports exchange TSx while acquiring, in order, RxReady, LocalPortReady and local ACK, RemotePortReady, and the advance condition; the numbered steps are described below.]

1. Initially, both ports transmit and receive TSx with the ACK fields cleared. The Rx on each lane sifts through the incoming bit stream to match a training sequence header. No handshake attributes are acquired yet.
2. The RxReady attribute is set on a local Rx when this Rx interprets at least two consecutive training sequences correctly, and completes processing any of these training sequences. Checking for at least two identical consecutive training sequences avoids miscommunication between ports due to transient errors. Once all Rx on the local port have the RxReady attribute set, the local port is ready to advance to the next state. The LocalPortReady attribute is set on this port, and all subsequent TSx transmitted by the local port will contain the local ACK in the ACK field of the TSx. The remote port follows a similar sequence to set its attributes.
3. A local Rx receives two consecutive TSx with the ACK field set. This Rx gets the RemotePortReady attribute.
4. The local port advances to the next state when the RemotePortReady attribute is set on at least one local Rx and at least 4 TSx have been sent with local ACK. The remote port also looks for two TSx with the ACK field set to set its attributes, and hence sending at least 4 TSx with local ACK guarantees that the remote port receives at least two consecutive TSx with ACK fields set, even in case of transient errors in one TSx. The assumption made here is that the recurrence interval of transient errors on a lane is longer than the length of two TSx.

Figure 3-9 shows how the handshake attributes of a port are used to advance to the next state. A Lane FSM corresponds to the portion of the state machine that manages serial communication on each lane (either local Tx or Rx), and the Link FSM corresponds to the portion of the state machine that manages all the Lane FSMs.

[Figure 3-9. State Transition Using Handshake Attributes: the Rx-side Lane FSMs supply RxReady and RemotePortReady, the Tx-side Lane FSMs carry the local ACK, and the Link FSM combines Remote ACK and RemotePortReady into the AdvanceToNextState state transition trigger.]

When the AdvanceToNextState signal is asserted, all local Tx advance to the next state simultaneously, after completely transmitting the current training sequence. Likewise, all local Rx advance to the next state simultaneously after completely receiving the current training sequence.
Thus, the transmit and receive sides of a port advance to the next state independently of each other, as there is no correlation between outbound and inbound TSx boundaries.

3.9.2.3 Deadlock Avoidance due to Faulty Lanes

An Rx connected to a faulty lane will not acquire the RxReady attribute, as it does not receive TSx correctly. Local ACK cannot be transmitted by this port, since this port cannot acquire the LocalPortReady attribute, which requires all Rx to have the RxReady attribute. This results in a training deadlock, as neither side can advance states. A timeout based secondary condition is used to advance states, to prevent deadlock caused by faulty lanes. Each training state is assigned a pre-determined timeout value, which corresponds to the maximum duration a port stays in this state. When the local port times out in a state, all local Rx that failed to acquire the RxReady attribute are marked bad and the port advances to the next state. The RxReady attribute is then always set on bad lanes for the remainder of the initialization sequence; thus a bad lane will force a state to time out only once, in the state in which the bad lane is detected. For all subsequent states, state advancement follows the handshake protocol, as bad lanes have the RxReady attribute set even though they do not receive any training sequences. However, if all Rx fail to acquire the RxReady attribute, the local port abandons link initialization by issuing an Inband Reset to the remote port. Information on bad Rx lanes is exchanged during the end of initialization, after which point these bad lanes may be disabled (powered-down). Bad lanes will not be used for the Link Map computation described in Section 3.9.1.3. The width negotiation algorithm described in Section 3.9.1.3 will find an optimal link width using the available set of good lanes.

3.9.2.4 Link Training Rules

1. All Tx within an agent transmit TSx simultaneously. TSx are transmitted serially on each lane, starting with the LSB. TSx are required to be sent back-to-back - i.e., no gap is allowed between two consecutive TSx, even if they belong to two different states.
2. An Rx shall have the RxReady attribute set after it has received at least two identical consecutive TSx and it has completed processing any of these TSx. Merely receiving two or more consecutive identical TSx without processing them is not adequate to acquire the RxReady attribute. An Rx may choose to compare only portions of two or more consecutive TSx to acquire the RxReady attribute. However, the TSx fields chosen for comparison across consecutive TSx should unambiguously indicate the readiness of the Rx to advance to the next state. Thus, an Rx may choose to process the header field only if payload fields are not used in the current TSx, or may choose to sequentially process TSx fields, potentially at the expense of a longer time required to acquire the RxReady attribute.
3. Once a port has acquired the LocalPortReady attribute, it will continue sending TSx with local ACK for the remainder of the current state.
4. Likewise, once the RemotePortReady attribute is set, it will not be reset for the remainder of the current state. This is true even if subsequent incoming TSx of the current state have ACK fields cleared.
5. It is possible for an Rx on an agent to see the first set of consecutive identical TSx with the ACK field set. In this case it is permissible for this lane to acquire the RxReady and RemotePortReady attributes simultaneously.
It is not required for a lane to acquire the RxReady attribute before acquiring the RemotePortReady attribute, although such an implementation style is not precluded by the handshake mechanism.
6. An Rx ignores a TSx with a header that does not match the expected TSx header of the current state. This unexpected TSx header shall not cause the Rx to renounce its current attributes.
7. All Tx within a port advance to the next state after completely transmitting the current TSx. All Rx within an agent advance to the next state after completely receiving the current TSx. No timing dependency is assumed between the state advancement of the transmit and receive sections of a port. The receive side may advance to the next state ahead of the transmit side, or vice-versa.
8. Two connected ports advance training states at approximately the same time due to the handshake mechanism. However, state transitions between these ports may not be synchronized (to the exact UI, for instance).
9. There is no ordering requirement between local ACKs and remote ACKs. For instance, if a port has already sent 4 local ACKs by the time it received a remote ACK, this port can immediately advance to the next state.
10. When a port times out in a state, it shall advance to the next state even if the local ACK is not sent and/or the remote ACK is not received. However, a port is allowed to abort initialization under the following exceptions:
a. No Rx within the port acquired the RxReady attribute.
b. A CLM cannot be identified using the exchanged WCI, and hence a link cannot be established.
c. Failure to establish a flit boundary (Section 3.9.3.4.3).

3.9.2.5 Link Timeout Values

Link training uses timeout based secondary exit conditions to avoid deadlock, as described in Section 3.9.2.3. In cases where the handshake mechanism is not applicable, timeouts are used as primary exit conditions. The different timeout values used by the logical sub-block are shown in Table 3-22.

Table 3-22. Link Initialization Timeout Values (a)
Timeout | Relevant States | Default Value | Value in UI | Timeout Based Exit Criterion | Notes
TCONFIG.1 | Config.1 | 0x7F | 8192 | Secondary | See Section 3.9.3.4.1 for details.
TCONFIG.2 | Config.2 | 0x7F | 8192 | Secondary | See Section 3.9.3.4.3 for details.
TDEBOUNCE | Detect | b’01 | 128 | Primary | See Section 3.9.3.2 for details.
TDETECT.2 | Detect.2 | 0x2F | 32K | Secondary | Each tick in this field corresponds to 1024 UI. Timeout value is (count + 1) * 1024 UI.
TDETECT.3 | Detect.3 | 0x7F | 8192 | Secondary | See Section 3.9.3.2.5 for details.
TINBAND_RESET_INIT | Various | 0x7F | 8192 | Primary | See Section 3.7.5 for details.
TPOLLING.1 | Polling.1 | 0x7F | 8192 | Primary | See Section 3.9.3.3.1 for details.
TPOLLING.2 | Polling.2 | 0x7F | 8192 | Secondary | See Section 3.9.3.3.2 for details.
TPOLLING.3 | Polling.3 | 0x7F | 8192 | Secondary | See Section 3.9.3.3.3 for details.
a. Unless specified otherwise, the value in each register field corresponds to (count + 1) * 64 UI.

For interoperability, all implementations are required to support the power-on default values shown in Table 3-22. A platform may choose to optimize link initialization time by configuring these values prior to a Soft Reset. It is the platform’s responsibility to ensure that interoperability is maintained on that platform for these non-default values. The only requirement is that TPOLLING.1 must be a multiple of 8.
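The register encodings of Table 3-22 are straightforward to decode. This small sketch applies footnote (a) and the TDETECT.2 special case; the function names are hypothetical.

```c
#include <stdint.h>

/* Decode a link initialization timeout register (Table 3-22).
 * Per footnote (a), most fields count in units of 64 UI. */
static uint32_t timeout_ui(uint8_t count)
{
    return ((uint32_t)count + 1u) * 64u;
}

/* TDETECT.2 counts in 1024 UI ticks: (count + 1) * 1024 UI. */
static uint32_t tdetect2_timeout_ui(uint8_t count)
{
    return ((uint32_t)count + 1u) * 1024u;
}

/* Examples: timeout_ui(0x7F) == 8192 UI (default TPOLLING.1) and
 * timeout_ui(0x01) == 128 UI (default TDEBOUNCE). */
```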
3.9.3 Logical Sub-block Finite State Machine

The logical sub-block state diagram is shown in Figure 3-10; the details are explained in the sections that follow.

3.9.3.1 Disable/Start State

The Disable/Start state is the initial state of the Physical layer state machine. This state is entered at power on or in response to any Physical layer reset event. No link activity occurs during this state. Terminations on all Tx and Rx lanes meet ZTX_HIGH_CM_DC and ZRX_HIGH_CM_DC, respectively, ensuring that a port in this state is not detected by an active remote CSI port. The link detect termination, RTX_LINK_DETECT, should be turned on on all lanes. However, the link detect control logic that checks for remote Rx terminations will not be enabled until the port advances to the Detect state. Disable/Start is also the final state of the Physical layer in the event of a link initialization failure. The Link-up Identifier (Section 3.7.2) is cleared if this state is entered during Cold Reset; the Link-up Identifier maintains its previous value for all other reset types. All non-sticky register fields are restored to their power-on default values. For Cold Reset, none of the register fields have the sticky property, and hence all register fields are restored to their power-on default values.

[Figure 3-10. Logical Sub-block State Diagram: states Disable/Start, Detect, Polling, Configuration, L0 (active), Loopback and Compliance, with transition labels including Physical Layer Reset, CSI agent detected, probe detected, >= 1 good bit lane, link width agreed upon, train PHY link, directed by master, end of test, Detect/Polling/Configuration failure, and Inband Reset; the transitions are described in the sections that follow.]

The Physical layer remains idle until the LinkClockStable (Section 3.7.3.1) signal is observed, after which point the logical sub-block may initiate an internal calibration phase. The default operation is to perform internal calibration for Cold Reset and to bypass the calibration phase for other reset types. However, the Physical layer offers a mechanism for forcing calibration by configuring CSRs. After calibration is completed, the port updates CSRs to bypass this phase during subsequent link re-initialization sequences until the next Cold Reset.

The criteria for exiting the Disable/Start state depend on the reset type and the initialization retry threshold value. The Physical layer supports a multiple initialization retry feature, where a link initialization failure automatically starts another initialization sequence. The number of initialization attempts is configurable using CSRs - two connected ports are allowed to have different initialization retry thresholds, and the state machine uses the lowest of these two values. The port with the larger initialization threshold advances past Disable/Start and waits indefinitely in the Detect state. The initialization retry threshold counter is updated at the point of failure, and the state detecting a link initialization failure initiates the next initialization sequence using an Inband Reset. The threshold counters are cleared once a link is established. Following is a summary of the Disable/Start exit conditions:
1. For Cold Reset, PhyInitBegin is used as the condition to exit Disable/Start and enter the Detect state.
2. For Inband Reset, an internal counter is started as soon as a port enters Disable/Start. If internal calibration is forced, this counter starts after calibration is completed. When this counter reaches the TINBAND_RESET_INIT threshold:
a. If the initialization threshold is not reached, the state machine advances to the Detect state.
b. If the initialization threshold is reached, the state machine stays in Disable/Start until the next Cold Reset.
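The retry-threshold rule above (each port configured independently, with the lower of the two values governing) can be sketched as follows. This is an illustrative restatement under assumed names, not the specification's register model.

```c
#include <stdbool.h>

/* Multiple-initialization-retry rule: both ports use the lower of the two
 * configured thresholds; the attempt counter is updated at the point of
 * failure and cleared once the link is established. */
struct init_retry {
    unsigned threshold_local;
    unsigned threshold_remote;
    unsigned attempts;
};

/* Called on link initialization failure; returns true if another
 * initialization sequence may start (else stay in Disable/Start). */
static bool may_retry(struct init_retry *r)
{
    unsigned threshold = r->threshold_local < r->threshold_remote
                             ? r->threshold_local : r->threshold_remote;
    return ++r->attempts < threshold;
}

static void link_established(struct init_retry *r) { r->attempts = 0; }
```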
Table 3-23. Summary of Disable/Start State

State: Disable/Start
Actions:
• Restore default values in all non-sticky register fields.
• All local Tx and Rx terminations must meet ZTX_HIGH_CM_DC and ZRX_HIGH_CM_DC, respectively.
• All local Tx must turn on the link detect pull-up, RTX_LINK_DETECT. The link detect control logic that detects remote Rx termination should not be turned on.
• Reset the Link-up Identifier during Cold Reset; maintain its previous value in all other cases.
• Stay idle until the LinkClockStable signal is seen; proceed to the next steps once this signal is observed.
• Optionally, perform port internal calibration. Calibration is always performed during Cold Reset; it is bypassed for other reset types, but can be forced using Physical layer CSRs.
• Update Physical layer CSRs to bypass internal calibration for subsequent link re-initialization sequences.
• If this state was entered as a result of Inband Reset, start an internal timeout counter as soon as calibration is done. If calibration is bypassed, start the counter upon entering the Disable/Start state.
Exit Conditions and Next States:
• PhyInitBegin signal observed: next state is Detect.
• Timeout counter reaches the TINBAND_RESET_INIT threshold but the initialization retry threshold has not been reached: next state is Detect.
• Timeout counter reaches the TINBAND_RESET_INIT threshold and the initialization retry threshold has been reached: continue to stay in Disable/Start until the next Cold Reset.

3.9.3.2 Detect State

The Detect state is the synchronization point for two ports to begin link initialization. A port stays in the Detect state until a remote port is detected. Once a pair of connected ports detect each other, they advance to the Polling state to begin interactive training. The CSI Physical layer uses a Tx based detect scheme. Each Tx lane of the local port contains a link detect circuit that is turned on during the Disable/Start state. However, the link detect control logic that detects a remote Rx termination should be turned on only after the port enters the Detect state. The link detect circuit has a weak pull-up resistor, RTX_LINK_DETECT, to bias the lane to logic 1. A remote Rx with termination meeting ZRX_LOW_CM_DC, or a passive 50 ohm test probe, would overdrive the link detect pull-up, resulting in the Tx at the local port detecting a logic 0. A remote Rx with termination meeting ZRX_HIGH_CM_DC cannot overdrive the link detect pull-up, and hence the local Tx sees a logic 1. Thus, a local Tx sensing a logic 1 at its output continues to stay in Detect, and it advances to the next state after sensing a logic 0. The debounce time required to sense a logic 0 is TDEBOUNCE. Each Tx lane containing link detect circuitry has control logic associated with it. When this control logic detects a remote termination (senses logic 0), a lane counter is turned on to track the duration for which the remote Rx termination is detected. Link detect operation is completed when at least one lane counter reaches a value of TDEBOUNCE. Upon completion of link detect, the state machine advances to the next state. If, for any reason, the link detect control logic momentarily loses remote Rx detection (i.e., the sensed value glitches from a logic 0 to logic 1), this lane resets its counter and repeats the sequence when a logic 0 is sensed again.
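The per-lane debounce behavior just described is small enough to capture directly. This is a minimal sketch under assumed names; TDEBOUNCE here is simply a tick count.

```c
#include <stdbool.h>

/* Tx-based remote-Rx detect with debounce (Section 3.9.3.2): the weak
 * pull-up RTX_LINK_DETECT biases the lane to logic 1; a remote termination
 * at ZRX_LOW_CM_DC pulls it to logic 0. The lane counter runs only while
 * logic 0 is sensed and resets on a glitch back to logic 1; link detect
 * completes on a lane when its counter reaches TDEBOUNCE. */
struct detect_lane { unsigned counter; };

static bool detect_step(struct detect_lane *lane, bool sensed_logic_1,
                        unsigned tdebounce_ticks)
{
    if (sensed_logic_1)
        lane->counter = 0;               /* remote termination lost: restart */
    else if (lane->counter < tdebounce_ticks)
        lane->counter++;
    return lane->counter >= tdebounce_ticks;  /* link detect complete here */
}
```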
[Figure 3-11. Detect Sub-States: Disable/Start enters Detect.1; probe detected takes Detect.1 to Compliance; clock termination detected takes Detect.1 to Detect.2; receive clock stable and data termination detected takes Detect.2 to Detect.3; known DC pattern transmitted and received takes Detect.3 to Polling; TDETECT.2 and TDETECT.3 timer expiry exit toward Disable/Start.]

The Detect state has 3 sub-states - Detect.1, Detect.2 and Detect.3 - which are discussed in the following subsections.

3.9.3.2.1 Detect.1 Sub-state

A port checks for the presence of an active CSI port or a passive test probe at the other end of the link, and stays in this state indefinitely until a remote port or a test probe is detected. The link detect circuits on all local Tx are activated. Local clock Rx terminations must meet ZRX_LOW_CM_DC and local data terminations must meet ZRX_HIGH_CM_DC. The local port attempts to advance to Detect.2 when the local clock Tx detects a remote clock Rx for a period of TDEBOUNCE, using the following state transition rules:
1. If no remote data Rx terminations are detected at the end of the debounce period, TDEBOUNCE, the port advances to Detect.2.
2. If at least one remote data Rx termination is detected at the end of the debounce period, even for 1 UI:
a. The port enters compliance mode if the Link-up Identifier is 0.
b. The port continues to stay in Detect.1 if the Link-up Identifier is 1. The debounce counter is reset and the entire Detect.1 sequence is repeated.

Table 3-24. Summary of Detect.1 Sub-State

State: Detect.1
Actions:
• Link detect control logic turned on on all local Tx (both clock and data).
• Local clock Rx terminations must meet ZRX_LOW_CM_DC.
• Local data Rx terminations must meet ZRX_HIGH_CM_DC.
Exit Conditions and Next States: remote clock Rx detected continuously for a period of TDEBOUNCE; the state transition depends on the Link-up Identifier and the remote data Rx terminations, as summarized below.
Legend: X => Don’t Care; ON => remote Rx terminations meet ZRX_LOW_CM_DC; OFF => remote Rx terminations meet ZRX_HIGH_CM_DC
Local Link-up Identifier | Remote Clock Rx Termination | Remote Data Rx Termination | Next State
X | ON | OFF | Detect.2
0 | ON | ON | Compliance
1 | ON | ON | Stay in Detect.1
X | OFF | ON/OFF | Stay in Detect.1

3.9.3.2.2 Extended Detect.1 for Supporting Forwarded Clock Fail-Safe Operation - Small MP and Large MP Profiles

CSI implementations may support a forwarded clock fail-safe feature (Section 3.9.8), where the loss of the primary clock channel would not cause fatal system failure. These implementations are required to have two back-up clocks in addition to the primary clock. Back-up clocks are supported by dual use data lanes that can act either as clock or data. The Detect.1 state follows the basic operation described in the previous section, but treats dual use data lanes as clock lanes. Thus, terminations on both primary and back-up clock Rx should meet ZRX_LOW_CM_DC at the local port. The local port attempts to advance to Detect.2 when the local clock Tx (primary and back-up) detects at least one remote clock Rx continuously for a period of TDEBOUNCE.

3.9.3.2.3 Detect.2 Sub-State

In Detect.2, the local port activates the forwarded clock and simultaneously starts locking to the received clock. Local clock Tx and Rx terminations must meet ZTX_LOW_CM_DC and ZRX_LOW_CM_DC, respectively. Local data Rx terminations continue to meet ZRX_HIGH_CM_DC when a port enters Detect.2. The link detect circuits on the local clock Tx are turned off, and the link detect circuits on the local data Tx continue to be active. When the local port locks to the received clock, all local Rx data terminations meet ZRX_LOW_CM_DC, allowing the remote data Tx to detect the local data Rx.
The local data Tx continue to monitor the remote data Rx terminations, and when at least one remote data Rx is detected for a period of TDEBOUNCE, it is interpreted as an indication that the remote port has locked to its received clock.

Summary of Detect.2 Sub-State

State: Detect.2
Actions:
• Start the Detect.2 timeout counter.
• De-activate the link detect circuit on the local clock Tx.
• Local clock Tx and clock Rx must meet ZTX_LOW_CM_DC and ZRX_LOW_CM_DC, respectively. Local data Rx terminations continue to meet ZRX_HIGH_CM_DC.
• Local clock Tx starts driving the forwarded clock, and the local clock Rx attempts to lock to the received clock.
• When the received clock is stable, all local data Rx terminations must meet ZRX_LOW_CM_DC.
Exit Conditions and Next States:
• Received clock is stable and at least one remote data Rx termination is detected continuously for a period of TDEBOUNCE: next state is Detect.3; advance all Tx to Detect.3.
• TDETECT.2 timer expires: update the initialization retry threshold counter and issue an Inband Reset; next state is Disable/Start.

If the received clock is not seen at the end of TDETECT.2, the local port abandons the current initialization sequence by issuing an Inband Reset after updating the initialization retry threshold counter. Similarly, if the local port has not seen a handshake from the remote port, in the form of data terminations being turned on for a period of TDEBOUNCE, the local port abandons the current initialization sequence by issuing an Inband Reset after updating the initialization retry threshold counter.

3.9.3.2.4 Extended Detect.2 for Supporting Forwarded Clock Fail-Safe Operation - Small MP and Large MP Profiles

Implementations supporting forwarded clock fail-safe operation have primary and back-up clock lanes (Section 3.9.8), some or all of which can enter Detect.2. Dual use data lanes are treated as clock lanes in Detect.2, and hence must meet the termination requirements of clock Tx and Rx specified in Section 3.9.3.2.3. Clock lanes have a pre-determined priority order which is common to all CSI implementations supporting this feature. The local port sends the forwarded clock on the clock lane with the highest current priority. Thus, if the primary clock is not detected in Detect.1, the local port transmits the forwarded clock on the dual use data lane with the next higher priority. Likewise, the local port attempts to lock to the received clock using the clock Rx with the current highest priority. If the TDETECT.2 timer expires before the received clock is locked to, the clock Rx with the current highest priority is disabled and an Inband Reset is issued to the remote port. The initialization retry threshold counter is not updated, as the subsequent initialization sequence will use the clock Rx with the next highest priority. The initialization retry threshold is updated only if the currently used clock has the lowest priority. Clock Rx that are disabled in Detect.2 are re-enabled only when the initialization retry threshold counter is updated prior to issuing an Inband Reset, ensuring that all available clock lanes are cycled through before starting afresh. The local port also issues an Inband Reset if the received clock is stable but the remote data Rx terminations are not seen at the end of TDETECT.2; this indicates that the remote port has not received a stable clock on the remote clock Rx with the highest priority. The local Tx does not have to update its clock priority in this case, as the remote port will disable the remote clock Rx with the current highest priority during a subsequent initialization sequence. The initialization retry threshold counter is not updated prior to issuing this Inband Reset.
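The clock-priority fallback just described can be sketched as a small selection routine. This is an illustrative reading of Section 3.9.3.2.4 under assumed names and an assumed lane count, not the specification's mechanism.

```c
#include <stdbool.h>

#define NUM_CLK 3   /* primary clock plus back-up (dual use data) lanes */

struct clk_select {
    bool disabled[NUM_CLK];   /* index 0 = highest priority */
};

/* Highest-priority clock Rx still enabled; -1 if none remain. */
static int current_clock(const struct clk_select *s)
{
    for (int i = 0; i < NUM_CLK; i++)
        if (!s->disabled[i])
            return i;
    return -1;
}

/* On TDETECT.2 expiry without clock lock: disable the clock just tried and
 * issue an Inband Reset. Returns true when the retry threshold counter must
 * be updated (the lowest-priority clock was in use), at which point all
 * clock Rx are re-enabled so the lanes are cycled through afresh. */
static bool tdetect2_expired(struct clk_select *s)
{
    int cur = current_clock(s);
    if (cur >= 0)
        s->disabled[cur] = true;
    if (current_clock(s) < 0) {
        for (int i = 0; i < NUM_CLK; i++)
            s->disabled[i] = false;
        return true;                     /* update retry threshold counter */
    }
    return false;                        /* retry with next highest priority */
}
```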
An implementation supporting clock fail-safe mode is required to disable this feature when connected to an implementation not supporting it. Failure to disable this feature results in the latter entering compliance mode.

3.9.3.2.5 Detect.3 Sub-State

(Author’s Note: Detect.3 may be modified in the next revision of the spec. Discussions are underway to assess the merits and implications of the proposed changes.)

In Detect.3, all local Tx lanes drive 1/0 on the D+/D- halves of the Tx differential pairs for a period of 2*TDEBOUNCE. The 1/0 value driven by the Tx on each lane is referred to as the known DC pattern. Each local Rx starts looking for the known DC pattern. When at least one local Rx has detected the known DC pattern for a period of TDEBOUNCE, all local Rx lanes that detected the known DC pattern for at least 1 UI are advanced to Polling. Any local Rx lanes that fail to receive the known DC pattern at the end of the debounce time are disabled and will not be available until the following link initialization sequence. If the known DC pattern is not observed for a period of TDETECT.3, the local port abandons the current initialization sequence using an Inband Reset. The initialization retry threshold counter is updated prior to issuing the Inband Reset.

Table 3-26. Summary of Detect.3 Sub-State

State: Detect.3
Actions:
• All local Tx drive 1/0 on the D+/D- halves of the Tx differential pairs for a period of 2*TDEBOUNCE.
• All local Rx look for 1/0 on the D+/D- halves of the Rx differential pairs.
• Start the debounce counter when at least one local Rx receives 1/0 on the D+/D- halves of an Rx differential pair.
• At the end of the debounce time, disable all local Rx that fail to receive 1/0 on the D+/D- halves of the Rx differential pair.
Exit Conditions and Next States:
• At least one local Rx continuously received 1/0 on the D+/D- halves of an Rx differential pair for a period of TDEBOUNCE, and all local Tx transmitted 1/0 on the D+/D- halves of the Tx differential pairs for a period of 2*TDEBOUNCE: next state is Polling.
• TDETECT.3 timer expires: update the initialization retry threshold counter and issue an Inband Reset; next state is Disable/Start.

3.9.3.2.6 Detecting Polarity Inversion in Detect.3 Sub-state - DP, Small MP and Large MP Profiles

Polarity Inversion is a feature where the D+/D- of a differential pair are swapped on the physical interface (package/motherboard/connector, etc.) to reduce platform design complexity. Polarity Inversion is detected by each Rx during Detect.3, and a correction is automatically made by the Rx detecting the Polarity Inversion. The corrected polarity will be in effect in the Polling state. The Physical layer supports Polarity Inversion on an individual lane basis, independent of the other lanes. Local Tx continue to drive the known DC pattern, a 1/0 on the D+/D- halves of the Tx differential pairs, for a period of 2*TDEBOUNCE. Local Rx, however, look for the known DC pattern or its 1’s complement - 0/1 on the D+/D- halves of the Rx differential pairs. When at least one Rx differential pair detects a 1/0 or 0/1 on D+/D- for a period of TDEBOUNCE, all local Rx lanes that detected the known DC pattern or its 1’s complement for at least 1 UI are advanced to Polling. Any local Rx lanes that fail to receive the known DC pattern (or its 1’s complement) at the end of the debounce time are marked bad and will not be available until the subsequent link initialization sequence.
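Per-lane polarity detection reduces to classifying what was seen during the debounce window. This is a minimal sketch under assumed names; the enum and function are hypothetical.

```c
#include <stdbool.h>

/* Polarity inversion detection in Detect.3 (Section 3.9.3.2.6): an Rx
 * accepting the known DC pattern advances normally; an Rx seeing its 1's
 * complement means D+/D- are swapped and the lane's polarity is corrected
 * for the Polling state; a lane seeing neither is marked bad. */
enum dc_result { DC_NONE, DC_NORMAL, DC_INVERTED };

static enum dc_result classify_dc(bool saw_pattern, bool saw_complement)
{
    if (saw_pattern)    return DC_NORMAL;
    if (saw_complement) return DC_INVERTED;  /* invert this lane in Polling */
    return DC_NONE;                          /* mark lane bad */
}
```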
3.9.3.3 Polling State
The Polling state consists of three sub-states, as shown in Figure 3-12.

Figure 3-12. Polling Sub-states: Polling.1 (bit lock), Polling.2 (byte lock and lane deskew), Polling.3 (parameter exchange); exits to Configuration (>= 1 good Rx per quad), Loopback (when configured for loopback), and Disable/Start (on initialization failure or timer-based failure).

3.9.3.3.1 Polling.1
Polling.1 is used for establishing bit lock at the Rx. All local Tx send a clock pattern (...1010...) for a period of TPOLLING.1, starting with a 0. Simultaneously, local Rx start bit-locking to the incoming clock pattern. The Polling.1 sub-state does not generate a handshake; the local port advances to Polling.2 when the TPOLLING.1 timer expires.

Table 3-27. Summary of Polling.1 Sub-State
State Polling.1 (Bit Lock)
Lane FSM Actions:
• All local data Tx drive the clock pattern, starting with a 0
• Each local data Rx aligns its strobe position to the center of the eye, using the incoming clock pattern
Link FSM Actions:
• Initiate TPOLLING.1 timer
Exit Conditions and Next States:
• TPOLLING.1 timer expires: go to Polling.2

3.9.3.3.2 Polling.2
Polling.2 uses training sequence TS2 to establish byte lock, identify Rx lanes that failed to bit lock, and perform lane-to-lane deskew. At the end of this state, faulty lanes are identified and marked bad; lane deskew is not done on bad lanes. At the end of the deskew operation, all good Rx lanes will have identical latency.

The first step in Polling.2 is byte lock, where each local Rx lane uses the incoming TS2 header to identify the training sequence boundary: when a lane receives two consecutive TS2 headers that are 8 bytes apart, the beginning of either TS2 header can be used as a training sequence boundary. The second step is to identify faulty lanes: by the time at least one local Rx has received two consecutive TS2 sequences, any local Rx that has failed to see a TS2 header is deemed faulty and marked bad. The maximum skew allowed between any two lanes is 1 UI less than half the training sequence length (i.e., the theoretical maximum skew between lanes is 31 UI); hence, by the time an Rx receives an entire TS2, all good Rx should have seen at least one TS2 header. The state machine identifies faulty lanes only after receiving 2 TS2s on at least one Rx, to allow for the possibility of other Rx receiving a corrupt TS2 header due to transient errors. Faulty Rx thus identified are marked bad and will not be deskewed. The final step of Polling.2 is to perform lane-to-lane deskew: deskew buffers use the TS2 header as a signature to identify the relative skew between lanes, and adjust the deskew buffer read pointers to offset the relative skew. An ACK is sent on outbound TS2 only after lane deskew is performed. The training algorithm is designed to use any Rx lane as a reference to compute skew between active lanes: the reference Rx uses the incoming TS2 header as a datum, and the skew is defined as the offset between the datum and the closest TS2 header on any non-reference lane, as shown in Figure 3-13.

Figure 3-13. Computing Lane-to-Lane Deskew – An Example (three lanes p, q and r, each receiving TS2 headers TS2_x_1, TS2_x_2, TS2_x_3 at different times; the offsets Tqp_1, Tqp_2 and Tqr_1, Tqr_2 are measured from the datum on the reference lane q).

Figure 3-13 shows 3 lanes, p, q and r, with lane q used as the reference for performing lane-to-lane deskew. In this example, the reference lane q uses the header of incoming training sequence TS2_q_2 as the datum to compute skew across lanes.
Between lanes p and q, the offset between TS2_p_2 and TS2_q_2 (shown as Tqp_1) is smaller than the offset between TS2_p_3 and TS2_q_2 (shown as Tqp_2), and hence the former is used as the skew between lanes p and q. Between lanes q and r, Tqr_1 is larger than Tqr_2, and hence the training sequence on lane r following the datum is used to determine the skew. Deskew ambiguity arises if both training sequences on a non-reference lane are equidistant from the datum; hence the maximum skew allowed between any two lanes is 1 UI less than half a training sequence, or 31 UI. A larger skew likely causes deskew ambiguity and results in undefined operation.

Table 3-28. Description of TS2 Training Sequence
• Byte 0 – TS2 Header: 0100 1110
• Byte 1 – ACK Field: bit[0] – ACK bit, applies to RX on the other side of the link (0 – No ACK (NACK), 1 – ACK set); bit[7:1] – reserved (7b'0)
• Bytes 2-7 – ISI Pattern: Byte 2 – 8b'00000000; Byte 3 – 8b'00010000; Byte 4 – 8b'00000000; Byte 5 – 8b'11111111; Byte 6 – 8b'11101111; Byte 7 – 8b'11111111

Table 3-29. Summary of Polling.2 Sub-State
State Polling.2 (Lane Deskew)
Lane FSM Actions:
• Identify the training sequence boundary by looking for two consecutive TS2 headers that are 8 bytes apart
Link FSM Actions:
• Initiate TPOLLING.2 timer
• Identify Rx that failed to bit lock and disable these Rx until the subsequent initialization sequence
• Perform lane deskew using the TS2 header
• Start sending local ACKs when lane deskew is done
Exit Conditions and Next States:
• Remote ACK set and local ACK sent for >= 4 TS2s: go to Polling.3
• TPOLLING.2 timer expires: if there is at least one good Rx, advance to Polling.3 using the good Rx lanes; if all Rx are bad, increment the initialization retry threshold counter and assert Inband Reset
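The deskew computation of Polling.2 can be pictured as follows: fold each non-reference lane's TS2 header arrival time onto one training-sequence period around the datum and take the closest offset. The C sketch below does this under illustrative assumptions (a 64 UI TS2 period and arrival times already expressed in UI); it is not the hardware algorithm itself.

```c
/* Sketch of lane-to-lane deskew: the reference lane's TS2 header
 * arrival time is the datum; each other lane's skew is the offset to
 * its closest TS2 header (headers repeat every TS2 length). */
#include <stdio.h>

#define TS2_LEN_UI 64   /* assumed TS2 period; max valid skew is 31 UI */

/* Offset from datum to the closest TS2 header on a non-reference lane.
 * 'arrival' is any TS2 header arrival time observed on that lane.
 * An offset of exactly half a TS2 is the ambiguous (undefined) case. */
static int lane_skew(int datum, int arrival)
{
    int off = (arrival - datum) % TS2_LEN_UI;  /* fold onto one TS2    */
    if (off > TS2_LEN_UI / 2)  off -= TS2_LEN_UI;
    if (off < -TS2_LEN_UI / 2) off += TS2_LEN_UI;
    return off;   /* deskew buffer read pointer is adjusted by -off    */
}

int main(void)
{
    int datum = 100;               /* TS2_q_2 header on reference lane q */
    int lanes[] = {97, 100, 113};  /* arrivals on lanes p, q, r          */
    for (int i = 0; i < 3; i++)
        printf("lane %d skew = %d UI\n", i, lane_skew(datum, lanes[i]));
    return 0;
}
```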
3.9.3.3.3 Polling.3
Polling.3 is used to exchange Physical layer parameters using training sequence TS3. Lane reversal is identified, link latency is estimated by looking at the relationship between the reference clock and the beginning of the TS3 header, loopback master and slave are identified if the link was configured to run in loopback, and target link latency information is exchanged to support repeatability and lockstep operation. Table 3-30 shows the TS3 format and the list of parameters exchanged during Polling.3.

Table 3-30. Description of TS3 Training Sequence
• Byte 0 – TS3 Header: 1110 0101
• Byte 1 – ACK Field: bit[0] – ACK bit (0=nack/1=ack); bit[7:1] – reserved (7b'0)
• Byte 2 – FSM Flow Control: bit[0] – L0 or loopback (1b'1 – enter Loopback as master, 1b'0 – enter L0 or loopback as slave); bit[7:1] – reserved (7b'0)
• Byte 3 – Link and Lane Identifier: bit[0] – Link-up Identifier (variable; set in Disable/Start state); bit[5:1] – LaneID using ordered pair representation, where bits [5:4] represent a quadrant (0 through 3) and bits [3:1] represent the offset of this lane within a quadrant (0 through 4); value is lane dependent; bit[7:6] – reserved
• Byte 4 – Target Link Latency: the latency value requested of the remote port for fixing the link latency, copied from CSR; bit[7:0] – target link latency in terms of UI; bit[7:6] – 2b'0
• Byte 5 – Synchronization Count: the value of the synchronization count latched at some point while transmitting TS3; bit[7:0] – synchronization count
• Byte 6 – Virtual Lane Identifier (UP Profile): bit[0] – corresponds to the lower CRC bit (Column 0 in Table 3-5); bit[1] – corresponds to the upper CRC bit (Column 1 in Table 3-5); bit[2] – corresponds to the lower sideband bit (Column 18 in Table 3-5); bit[3] – corresponds to the higher sideband bit (Column 19 in Table 3-5); bits[7:4] – reserved, and should have a value of 0 for the current generation of CSI. For the remaining bits, a value of 1 indicates that the bit has a virtual lane attribute (Section 3.9.1.4), and a value of 0 indicates that lane virtualization is not required for that bit. Of note, the higher and lower CRC bits go in pairs; it is not allowed to virtualize only one of these two bits.
• Byte 7 – Virtual Lane Implicit Value (UP Profile): bit assignments as in byte 6 (bit[0] – lower CRC bit, Column 0 in Table 3-5; bit[1] – upper CRC bit, Column 1 in Table 3-5; bit[2] – lower sideband bit, Column 18 in Table 3-5; bit[3] – higher sideband bit, Column 19 in Table 3-5; bits[7:4] – reserved). A bit in this field is valid only if the corresponding Virtual Lane Identifier bit is set to 1; each bit of this field maps to the corresponding bit of the Virtual Lane Identifier field and stores the implicit value of the virtualized lane.
• Bytes 6-7 (profiles other than UP) – Reserved: 16b'0

FSM Flow Control: the link can be configured to come up either in L0 state or in loopback mode by configuring the Loopback Mode bit in the control register; the default power-up value is to enter L0 state after initialization. If this bit is set to 1 by either component, both components enter loopback mode after link initialization: the port that sets this bit to 1 becomes the loopback master, and the other port becomes the loopback slave. If both ports set this bit to 1, the loopback master definition is ambiguous and results in initialization failure. The loopback slave, after setting the ACK bit in outbound TS3, is required to zero out its payload bytes, from byte 2 through byte 7; this is required for synchronizing loopback entry by the loopback master and slave ports (see Section 3.9.3.6 for details).

Link and Lane Identifier: bit[0] corresponds to the Link-up Identifier described in Section 3.7.2. Bits[5:1] are unique to each lane and represent a lane ID using the ordered pair representation described in Section 3.9.1.1. Using this representation, Lane Reversal can be identified by comparing one bit only: quadrants Q0 and Q3 compare the MSB of the value (bit 5), and quadrants Q1 and Q2 compare the LSB of the value (bit 4). A mismatch between the received bit and the internally stored quadrant ID (value representation) bit indicates Lane Reversal.
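The single-bit comparison described above can be sketched as follows. The field extraction follows the byte-3 layout in Table 3-30, while the function names and the quadrant-ID model are illustrative assumptions.

```c
/* Sketch of Lane Reversal detection from the TS3 byte-3 lane ID. */
#include <stdbool.h>
#include <stdio.h>

/* Quadrant is bits [5:4] of the byte-3 lane ID field. */
static unsigned quadrant(unsigned char ts3_byte3)
{
    return (ts3_byte3 >> 4) & 0x3;
}

/* One quadrant-ID bit is enough: Q0/Q3 compare the MSB (bit 5 of the
 * byte), Q1/Q2 compare the LSB (bit 4). A mismatch with the internally
 * stored quadrant ID indicates Lane Reversal. */
static bool lane_reversed(unsigned my_quadrant, unsigned char rx_byte3)
{
    unsigned rx_q = quadrant(rx_byte3);
    if (my_quadrant == 0 || my_quadrant == 3)
        return ((rx_q >> 1) & 1) != ((my_quadrant >> 1) & 1); /* MSB */
    return (rx_q & 1) != (my_quadrant & 1);                   /* LSB */
}

int main(void)
{
    /* A quadrant-1 receiver seeing a quadrant-2 lane ID => reversed
     * (reversal swaps Q0 with Q3 and Q1 with Q2). */
    unsigned char rx = (unsigned char)(2u << 4);
    printf("reversed=%d\n", lane_reversed(1, rx));
    return 0;
}
```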
Table 3-31. Summary of Polling.3 Sub-State
State Polling.3 (Parameter Exchange)
Lane FSM Actions:
• Determine Lane Reversal
Link FSM Actions:
• Start TPOLLING.3 timer
• If the loopback bit is set in incoming TS3, configure the local port as loopback slave
• Pad additional delay on each lane to meet the target link latency
• Use the clock boundary indicator to estimate absolute link latency
• Identify virtual lanes and capture the implicit values for these virtual lanes; turn off the I/O corresponding to the virtual lanes, but keep the corresponding internal logic operational
Exit Conditions and Next States:
• Both sides have the loopback bit set: assert Inband Reset and abort initialization
• Link-up Identifier mismatch: the port with this flag set configures itself to perform a Cold Reset and sends an Inband Reset to the remote port
• Sent >= 4 local ACKs and received remote ACK, neither side has the loopback bit set: advance all Tx/Rx to Config state
• Sent >= 4 local ACKs and received remote ACK, either port's loopback bit is set: enter loopback; the port with this bit set is loopback master and the other is loopback slave
• TPOLLING.3 timer expires and at least one Rx pair has the RxReady attribute set, neither side has the loopback bit set: mark lanes that failed to gain the RxReady attribute as bad and advance the good lanes to Config state
• TPOLLING.3 timer expires and at least one Rx pair has the RxReady attribute set, one side but not both have the loopback bit set: enter loopback; the side with this bit set to 1 is loopback master; advance all lanes, including bad lanes, to loopback
• TPOLLING.3 timer expires and no Rx pair has the RxReady attribute set: mark all lanes bad and abort initialization by initiating an Inband Reset

Target Link Latency: the remote receiver accomplishes the desired target link latency by internally adding additional cycles of delay, such that the sum of the actual link latency and the additional internal delay equals the target link latency.

Synchronization Count: Tx latches the synchronization counter value at some reference point while transmitting TS3. The Count values in consecutive TS3s differ by the length of TS3; Rx uses this count value to determine the actual link latency. Refer to Section 3.9.6 for further details on determinism requirements.

Virtual Lane Identifier and Virtual Lane Implicit Value (bytes 6 and 7): see Table 3-30 for description. Used only for supporting a link with fewer than 20 lanes in full width mode; also see Section 3.9.1.

3.9.3.4 Config State
The Config state is used to negotiate link width and to synchronize flit boundaries between the local and remote ports, using training sequence TS4. It has two sub-states, Config.1 and Config.2, as shown in Figure 3-14. Config.2 is a simple extension of Config.1, as described in Section 3.9.3.4.3, and does not use a separate training sequence. It should be noted that the handshake sequence ends with Config.1; thus, after the handshake occurs in Config.1, the Tx and Rx portions of a port are not required to advance to the next state near-simultaneously: the Rx portion of a port enters Config.2 as soon as it receives a remote ACK, whereas the Tx portion enters Config.2 after it has sent at least 4 TS4s with ACK and the Rx portion of the port has received a remote ACK.

Table 3-32. Description of TS4 Training Sequence
• Byte 0 – TS4 Header: 1110 0011
• Byte 1 – ACK Field: bit[0] – ACK bit (0=nack/1=ack); bit[3:1] – redundant ACK (3b'000 in Config.1, 3b'111 in Config.2); bit[7:4] – negotiated lane map (a CLM selected from the WCI below)
• Bytes 2-3 – Width Capability Index (WCI): hardware and lane failure dependent; bits [7:3] of byte 3 are don't cares
• Bytes 4-7 – Reserved: 0X 0000 0000
Refer to Section 3.9.1.3 for details on TS4 fields.

Figure 3-14. Config Sub-States (Polling.2 enters Config.1 with >= 1 good Rx per quad; Config.1 advances to Config.2 once the CLM is agreed upon, or falls back to Disable/Start if it is not; Config.2 advances to L0 when CLM and flit boundary are OK, or falls back to Disable/Start on flit boundary detection failure).

3.9.3.4.1 Config.1
Config.1 is the state where both sides exchange information on faulty lanes, using the Width Capability Indicator (WCI). Each port computes a WCI using the available set of good lanes, as deemed by the receive portion of that port. A port compares its internally generated WCI with the WCI received from the remote port and selects a Common Link Map (CLM) that both ports can support. Prior to link initialization, the required link width can be configured using Physical layer CSRs, and a CLM is selected to form a link of the required width specified in the CSRs. If a CLM for the desired link width cannot be found, initialization is aborted by issuing an Inband Reset.
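The width negotiation can be pictured with the following sketch. Note the simplifying assumptions: good lanes are modeled as a plain bitmask and the CLM as a lane count, whereas the real WCI/CLM encodings are defined in Section 3.9.1.3; the candidate width list is also illustrative.

```c
/* Illustrative sketch of Config.1 width negotiation: pick the widest
 * width both ports' good lanes can support, capped by the CSR width. */
#include <stdio.h>

/* Candidate widths in order of precedence (largest to smallest). */
static const int widths[] = {20, 16, 8, 4, 2, 1};

/* Count good lanes common to the local and remote good-lane masks. */
static int common_good_lanes(unsigned local, unsigned remote)
{
    unsigned m = local & remote;
    int n = 0;
    while (m) { n += m & 1; m >>= 1; }
    return n;
}

/* Returns the negotiated width, or -1 => abort with Inband Reset. */
static int select_clm(unsigned local_wci, unsigned remote_wci, int csr_width)
{
    int usable = common_good_lanes(local_wci, remote_wci);
    for (unsigned i = 0; i < sizeof widths / sizeof widths[0]; i++)
        if (widths[i] <= csr_width && widths[i] <= usable)
            return widths[i];
    return -1;
}

int main(void)
{
    /* 20 lanes locally, remote lost lanes 18-19: negotiation yields 16. */
    printf("CLM width = %d\n", select_clm(0xFFFFF, 0x3FFFF, 20));
    return 0;
}
```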
Table 3-33. Summary of "Config.1" State
State Config.1 (Link Width Negotiation)
Lane FSM Actions:
• The local port sends its WCI and receives the remote port's WCI
Link FSM Actions:
• Start TCONFIG.1 timer
• Compute WCI using the available lanes
• Select a CLM using the local and remote WCI; send the CLM thus selected as part of the outbound TS4 ACK
Exit Conditions and Next States:
• The Rx side advances to Config.2 when it receives a remote ACK; lanes that are not part of the CLM, if any, will be disabled
• The Tx side advances to Config.2 when it has sent >= 4 local ACKs and received a remote ACK; lanes that are not part of the CLM, if any, will be disabled
• NOTE: the Rx and Tx portions of a port can advance to Config.2 independently of each other; this is not a violation of the link handshake rules, as the handshake ends in Config.1
• CLM received on all lanes is not identical: local Rx and remote Tx may use different widths due to an unknown transmission error; trigger an Inband Reset
• TCONFIG.1 timer expires: link width not negotiated; mark all lanes bad and enter Disable/Start

Additionally, if the CLM received on all lanes is not identical, a port abandons initialization and triggers an Inband Reset. In Config.1, the local Rx port advances to Config.2 as soon as it receives a remote ACK, which means two consecutive TS4s with the ACK bit set; note that the CLM is also part of the TS4 ACK field and will be set when the ACK bit is set. The local Tx advances to Config.2 after a remote ACK has been received on this port and at least 4 TS4s with ACK have been sent (the normal handshake algorithm).

3.9.3.4.2 Extended Config.1 State for Link Self-healing - Large MP Profile
Link self-healing is a RAS feature whereby the Physical layer can automatically detect faulty lanes and form a link using the available set of lanes, without resetting higher CSI layers. Implementations supporting this feature negotiate a CLM in Config.1 regardless of the link width configured in the Physical layer CSRs. As the order of precedence for negotiating a CLM using the WCI is from the largest width to the smallest, self-healing always selects the widest CLM possible with the available set of lanes.

3.9.3.4.3 Config.2
The Config.2 state is used to set the flit boundary by synchronizing training sequences between the local and remote ports. After exiting Config.1, the transmit port sends exactly one TS4 with the redundant ACK field populated with all 1s; a TS4 with the redundant ACK field so set is referred to as a TS4A. The receiving port enters Config.2 and waits for a TS4A. Since a receiving port is guaranteed to enter Config.2 ahead of the transmitting port at the other end, a synchronized flit boundary is always guaranteed: the transmit port sets its flit boundary after transmitting the TS4A, and the receiving port sets its flit boundary immediately after receiving the TS4A. From this point on, flits are transmitted and received at the flit boundary. Any global counters that need to be synchronized between ports are reset in Config.2, when the flit boundary is set. When a port has both sent and received a TS4 with redundant ACK, link initialization is complete and the port enters L0; the Link layer can take control of the link after this point. Null Ctrl flits are transmitted by the local port during the lag between completion of link initialization and hand-over to the Link layer: when the Link layer is ready to take over, the Physical layer hands control to the Link layer at the flit boundary set in Config.2.
It is possible that a port has sent a TS4A but is still waiting to receive a TS4A from the remote port. In this case, the local port sends Null Ctrl flits until a TS4A is received, which then propels the state machine into L0.

Table 3-34. Summary of "Config.2" State
State Config.2 (Flit Boundary Synchronization)
Lane FSM Actions:
• Each Rx looks for a TS4A, which is sent exactly once. If 2 of the 3 redundant ACK bits are 1, Rx interprets the TS4 as a TS4A; this is a safeguard against a single bit error in the redundant ACK field
• Tx sends exactly one TS4A
Link FSM Actions:
• Start TCONFIG.2 timer
• Send Null Ctrl flits if the local Rx has not yet received a TS4A
Exit Conditions and Next States:
• TS4A transmitted and received: go to L0
• TCONFIG.2 timer expires and no TS4A has been received: abandon initialization and send Inband Reset to the remote port

3.9.3.5 Compliance State
Tx on all active lanes repetitively transmit an eye pattern that can be used by a test probe to measure signal quality on the link. The exact compliance pattern to be transmitted is implementation specific, as this has no bearing on the interoperability of CSI ports. The Tx stay in compliance mode indefinitely, and the only way to exit the Compliance state is to reset the link.

3.9.3.6 Loopback State
Coming out of Polling, the loopback master sends TS5, but the loopback slave continues to transmit TS3. The TS3 transmitted by the loopback slave in this state is identical to the TS3 transmitted before exiting Polling.3: ACK bit set in byte 1, payload zeroed out in bytes 2 through 7. Once the loopback slave receives TS5, it immediately truncates its outbound TS3 pattern and loops back the incoming TS5. The master takes the looped-back TS5 as an indication that the slave has entered loopback mode, and sends one TS5 with the ACK field set; this TS5 training sequence is followed by a test pattern. The loopback master sends exactly one TS5 with the ACK field set. The TS5 sequence uses redundancy to communicate the ACK, by sending four 1s to indicate an ACK; this TS5 sequence with redundant ACK is referred to as TS5A. The redundancy ensures that the slave receives the TS5A pattern even in the case of a single bit error in the ACK field: the slave does a majority poll on the ACK field bits and interprets the field as an ACK if it contains at least three 1s. When the slave receives the TS5A, it varies its RX parameters based on the payload fields in the TS5A and uses these newly configured parameters to loop back anything following the end of the TS5A sequence. It is important that the slave switches to these new parameters only after completely looping back the TS5A sequence, so that the master is guaranteed to receive the looped-back TS5A correctly. It should be noted that the byte lock established in Polling.2 is maintained by the transmit side of the loopback master and the receive side of the loopback slave; however, the transmit portion of the loopback slave and the receive portion of the loopback master override the byte lock established in Polling.2, as the loopback slave truncates its outbound TS3 and immediately echoes TS5 (instead of waiting for the beginning of the next training sequence boundary). The loopback master therefore re-establishes byte lock using the TS5 pattern header echoed by the loopback slave. Zeroing out the slave's TS3 payload (bytes 2-7) guarantees that no portion of the TS3 payload is aliased to the TS5 header, as seen by the loopback master. The transmit portion of the loopback slave need not have a notion of the beginning of a training sequence: once the loopback path is established, the slave simply echoes incoming traffic.
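Both redundant-ACK schemes reduce to small majority votes over the ACK byte. The sketch below shows the two decodes under the byte layouts given in Table 3-32 and the TS5A description above; the function names are illustrative.

```c
/* Sketch of redundant-ACK decoding: TS4A uses 2-of-3 agreement on the
 * redundant copies in bits[3:1]; TS5A uses majority polling over the
 * 4-bit ACK field (at least three 1s count as an ACK). */
#include <stdbool.h>
#include <stdio.h>

static int popcount4(unsigned v)
{
    v &= 0xF;
    return (v & 1) + ((v >> 1) & 1) + ((v >> 2) & 1) + ((v >> 3) & 1);
}

/* TS4 byte 1: bit[0] ACK, bits[3:1] redundant ACK copies. */
static bool ts4_is_ts4a(unsigned char ack_byte)
{
    unsigned redundant = (ack_byte >> 1) & 0x7;
    return popcount4(redundant) >= 2;   /* 2 of 3 redundant bits set */
}

/* TS5 byte 1: bits[3:0] hold 4b'1111 for ACK, 4b'0000 for NACK. */
static bool ts5_ack(unsigned char ack_byte)
{
    return popcount4(ack_byte & 0xF) >= 3;  /* majority of the 4 bits */
}

int main(void)
{
    printf("TS4A with one bit error: %d\n", ts4_is_ts4a(0x0D)); /* 1101 */
    printf("TS5A ACK with one error: %d\n", ts5_ack(0x0B));     /* 1011 */
    return 0;
}
```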
Table 3-35. Description of TS5 Training Sequence
• Byte 0 – TS5 Header: 1110 0010
• Byte 1 – ACK Field: bit[3:0] – ACK bits (4b'0 = nack, 4b'1 = ack); bit[7:4] – reserved (4b'0)
• Byte 2 – Timing trim offset: relative offset w.r.t. the calibrated setting
• Byte 3 – Voltage trim offset: relative offset w.r.t. the calibrated setting
• Byte 4 – RX termination setting: absolute termination strength, specified as an Rcomp setting
• Byte 5 – Current source strength to adjust output at slave Tx: absolute current source strength, specified as an Icomp setting
• Bytes 6-7 – Pattern Length: pattern length in bytes. The slave echoes a pattern for the period specified in this field; if this value is zero, the slave echoes the pattern indefinitely, and an Inband Reset mechanism is used to exit loopback.

Two exit mechanisms from loopback are supported:
1. An HVM option where the master encodes the pattern length as part of TS5A. The slave echoes the patterns following TS5A for the pre-determined amount of time and then enters Polling; the master also enters Polling after receiving the entire test pattern. The slave is required to maintain a clean copy of its calibrated settings, which can be restored prior to entering Polling: calibration is a time-consuming operation, and re-calibrating after each loopback sequence would have severe test throughput implications.
2. A second mechanism, used to test patterns that are extremely long, for debug purposes (e.g., BER test). This mode of loopback operation is terminated by the master sending an Inband Reset to the slave.

The loopback mechanism assumes pre-determined transceiver pairs at both master and slave ports. Loopback on asymmetric links requires muxing/demuxing at either end to match transceiver pairs, and is beyond the scope of this specification; refer to the CSI DFx/Loopback Chapter for a detailed discussion of the CSI loopback scheme.
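A minimal sketch of the slave's echo phase and the two exit mechanisms, assuming a byte-granular echo loop; the I/O helpers and the reset model are illustrative stand-ins, not spec-defined interfaces.

```c
/* Sketch of the loopback slave echo phase driven by the TS5A
 * Pattern Length field (bytes 6-7). */
#include <stdio.h>

static int reset_countdown = 3;  /* pretend a reset arrives eventually */
static int inband_reset_seen(void) { return --reset_countdown <= 0; }
static int rx_byte(void)  { return 0xA5; }   /* incoming pattern byte  */
static void tx_byte(int b) { printf("echo %02X\n", b); }

/* pattern_len is the TS5A Pattern Length field value. */
static void loopback_slave_echo(unsigned pattern_len)
{
    if (pattern_len == 0) {
        /* Long-pattern/BER mode: echo until an Inband Reset terminates
         * loopback (mechanism 2). */
        while (!inband_reset_seen())
            tx_byte(rx_byte());
        puts("Inband Reset: exit loopback");
        return;
    }
    /* HVM mode (mechanism 1): echo the encoded number of bytes, then
     * enter Polling with the clean calibrated settings restored. */
    for (unsigned i = 0; i < pattern_len; i++)
        tx_byte(rx_byte());
    puts("pattern done: restore calibration, enter Polling");
}

int main(void)
{
    loopback_slave_echo(2);   /* HVM exit after 2 echoed bytes */
    loopback_slave_echo(0);   /* exit via Inband Reset         */
    return 0;
}
```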
3.9.3.7 L0 State
The Physical layer operates under the direction of the Link layer in L0 state to transfer data across the link. If periodic retraining is enabled, the Physical layer temporarily halts its beats to send retraining packets to the remote port, as described in Section 3.9.7. The Physical layer also responds to Soft Reset and Inband Reset events in L0 state by entering Disable/Start state to re-initialize the link; Physical layer beats are temporarily halted during link initialization.

Table 3-36. Summary of "L0" State
State L0
Lane FSM Actions: N/A
Link FSM Actions:
• Set Link-up Identifier to 1
• Serialize outbound flits to link width granularity and de-serialize incoming PHY signals to flit granularity
• Continuously update periodic retraining counters
• If the periodic retraining interval is reached, halt Physical layer beats and send periodic retraining patterns; reset the periodic retraining counters at the end of retraining and turn the Physical layer beats back on
Exit Conditions and Next States:
• Inband Reset: stop forwarded clock and enter Disable/Start

3.9.4 Optional Low Power Modes – An Overview
Figure 3-15 shows the logical sub-block state diagram with the optional low power modes. As shown in that figure, the low power modes are entered from the L0 state. The L0 state as modified to support the low power modes, and an overview of the available low power modes, are described in this section; a detailed description of low power mode entry and exit can be found in Section 3.9.5.

Figure 3-15. Logical Sub-block State Diagram with Optional Low Power Modes (Disable/Start advances to Polling with >= 1 good bit lane, Polling to Configuration once the PHY link is trained, and Configuration to L0 (active) once the link width is agreed upon; L0s and L1 wake back to L0; Loopback is directed by the master, and Compliance is entered when a CSI agent detects an end-of-test activity-detect probe; Polling, Configuration and Detect failures, and Inband Reset, return to Disable/Start; transitions marked "under LL control" are directed by the Link Layer).

3.9.4.1 Extended L0 State for Low Power Support
In addition to performing periodic retraining and responding to an Inband Reset, an extended L0 state also responds to low power mode entry requests from the Link layer. Table 3-37 shows a summary of extended L0 operation in support of low power modes. The three low power modes, L0s, link width modulation and L1, are summarized in the sections that follow.

Table 3-37. Summary of Extended L0 State with Low Power Support
State L0
Lane FSM Actions: N/A
Link FSM Actions:
• Link-up Identifier = 1
• Serialize outbound flits to link width granularity and de-serialize incoming PHY signals to flit granularity
Exit Conditions and Next States:
• L0s entry signal: enter L0s in either direction of the link, independent of the other direction. In each direction, a portion of the link can be in L0s with the rest in L0. Physical layer beats are turned off when the L0s signal is received, until the Physical layer re-enters L0
• Link width modulation signal: turn the beats off and adjust the link width as specified by the Link layer, then resume L0 operation. Link width can be modulated in either direction, independent of the other; the beats are turned back on after the mux adjustment is done
• L1 entry signal: both sides of the link enter L1
• Inband Reset: stop forwarded clock and enter Disable/Start

3.9.4.2 L0s State
This low power state can be entered by one direction of the link independent of the other. Portions of the electrical sub-block are turned off based on a pre-determined policy, as described in the CSI Power Management Chapter. In this state, the link is placed in the Electrical Idle (EI) state, where both halves of a differential pair are dropped to ground. The logical sub-block remains powered on, but the Physical layer beat signals are turned off until the link re-enters L0; the flit alignment counters are still operational, and re-entry into L0 always happens on a flit boundary. Exit from L0s is facilitated through activity detect circuitry on the Rx differential pairs that remain turned on while the link is in L0s: the activity detectors interpret a break from Electrical Idle as an indication to exit L0s. The transmit side of the link breaks Electrical Idle by driving all Tx differential pairs to a logic 1. The link is required to wake up from L0s to perform periodic link re-training, and automatically goes back to the L0s state after the completion of retraining. As both ports know when periodic retraining occurs, based on the specified periodic retraining counters, the ports do not rely on the activity detectors to exit L0s for periodic retraining: the transmit side automatically starts waking up its circuitry well in advance so as to drive out the retraining packet, and the receive side likewise starts waking up in anticipation of a retraining packet. (Author's Note: Periodic retraining during L0s is subject to change in a future revision of the spec, for simplicity.
Another proposal is for the transmit side to start exiting L0s in advance of the retraining phase, such that both ends of the link are back in L0 when retraining occurs. Upon completion of retraining, the transmitter may choose to re-initiate L0s or to remain in L0.) Details on L0s entry and exit are given in Section 3.9.5.

Table 3-38. Summary of L0s State
State L0s
Lane FSM Actions:
• Maintain the lane in Electrical Idle by driving both halves of the Tx differential pairs to ground
Link FSM Actions:
• Turn on the activity detectors on all Rx differential pairs and monitor their output to detect a break from Electrical Idle. (Note: an implementation might choose to have activity detectors on select Rx differential pairs only, which is not precluded by this specification.)
• Start waking up the electrical sub-block if the link retraining phase is approaching, such that Tx and Rx are ready to transmit/receive retraining packets when periodic retraining is due
• Go back to L0s after performing periodic retraining
Exit Conditions and Next States:
• The Tx side of the link receives an L0s exit signal from higher layers: each Tx differential pair breaks Electrical Idle on the lanes it is driving
• At least one activity detector senses a break from Electrical Idle: turn the wake-up circuitry on and exit L0s
• Inband Reset: stop forwarded clock and enter Disable/Start

3.9.4.3 Link Width Modulation
Link width can be adjusted on the fly, without going through a link re-initialization process. The new width to be formed is indicated in a Link layer packet used for signaling a width modulation, and this width is chosen by the Link layer using the WCI exchanged in the most recent initialization sequence. The Physical layer temporarily halts its beats, adjusts its internal muxes to support the new width, and re-enables the beats once the muxes are adjusted. Note that the link stays in L0 while the width is being modulated; the beats are halted only during the transient phase when the internal muxes are adjusted to support the new width. When the link width is modulated from a wider to a narrower width, the unused lanes after modulation are placed in L0s; conversely, if modulation results in going from a narrower to a wider width, the new lanes to be phased in are first brought out of L0s before the mux adjustment is done. The timing requirements to re-synchronize the link at the new width are described in Section 3.9.5.5.

3.9.4.4 L1 State
Both directions of the link are required to go into L1 state together. In L1 state, circuits in the electrical sub-block are turned off and the logical sub-block is functionally shut down; however, power is maintained to the logical sub-block to ensure that the Physical layer configuration is not lost during L1. A platform may also choose to turn off the Physical layer internal (PLL) clock. Prior to entering L1, each port configures itself to bypass internal calibration upon exit from L1. It is required that all Rx terminations meet ZRX_HIGH_CM in L1 state. Exit from L1 to L0 uses the detect scheme used by the Physical layer during link initialization: termination detectors on each port's Tx differential pairs are turned on, and a port receiving an implementation-specific L1 exit signal turns on terminations on its clock lane(s); the clock Rx terminations must then meet ZRX_LOW_CM. Termination detectors at the clock Tx on the remote port sense these Rx clock terminations and use this as an indication to exit from L1 back to L0. Refer to Section 3.9.5.7 for details on the L1 entry and exit sequence.
Summary of L1 State
State L1
Actions:
• Circuitry in the electrical sub-block is turned off; the logical sub-block is not functional, but power is retained to ensure that port configuration is not lost during L1
• All clock and data Rx must meet ZRX_HIGH_CM
• Termination detectors on all Tx are turned on
Exit Conditions and Next States:
• L1 exit signal from a higher layer: clock Rx must meet ZRX_LOW_CM for at least a period of TL1_EXIT_DEBOUNCE; the port enters Disable/Start after this time interval
• At least one clock Tx differential pair on a port senses ZRX_LOW_CM for a period of TDEBOUNCE: the port enters Disable/Start at the end of TDEBOUNCE

3.9.5 Link Low Power Modes
The dynamics of entering and exiting low power modes are explained in this section, along with the time constants that the Physical layer requires for synchronization. These time constants have to be loaded, by firmware or a higher layer, into the Power Management registers before a low power mode of operation is initiated; the Physical layer exhibits undefined behavior if these values are not programmed. The following notation is used for timer values in the following sections: a variable starting in upper case "T" (e.g., Tsubscript) represents a time constant programmed by firmware or a layer above the Physical layer, whereas a variable starting in lower case "t" (e.g., tsubscript) represents an internal circuit variable that the Physical layer can be agnostic to. The tsubscript variables can either be derivatives of Tsubscript, computed by hardware as needed, or can be used by the platform to derive the Tsubscript time constants prior to programming them. Additionally, the following discussion may append a _Min or _Max suffix to tsubscript variables in timing equations, to indicate the absolute minimum or maximum value of that variable across all process, voltage and temperature (PVT) variations.

3.9.5.1 L0s Entry Sequence
Figure 3-16 shows the sequence of events leading to L0s entry, where Port A is initiating the L0s entry request. The figure shows the event scale (A# and B#) on the vertical axis and link communication along the tilted horizontal axis, with the time between events also shown on the vertical axis.

Figure 3-16. L0s Entry Sequence (events A1 through A6 at Port A and B1 through B5 at Port B; the PM.LinkEnterL0s packet is followed by a Null Ctrl flit; intervals shown are tL0S_PKT, tFLIT, TL0S_ENTER_Tx_DRV, tL0S_Enter_Tx_Off, tRx_PHY->LL, tRx_LL->PHY, tL0S_Enter_Rx_Off, TL0S_SLEEP_MIN, and the earliest L0s wake signal at A6).

Table 3-40. L0s Entry Events and Timers
Events:
A1: Link layer at Port A starts the L0s sequence by sending a PM.LinkEnterL0s packet.
A2: Link layer at Port A sends a Null Ctrl flit, required for the CRC check on the PM.LinkEnterL0s packet under the 16-bit rolling CRC scheme. After sending the Null Ctrl flit, the Link layer at Port A is decoupled from the Physical layer at Port A; the LinkTxRdy and PhyTxRdy beats are turned off.
A3: Physical layer at Port A drives all active Tx differential pairs to binary 1/0 on D+/D-, required to ensure eye quality at the Port B Rx on the preceding flit.
A4: Port A starts entering L0s. The link is inactive, as all Tx differential pairs are held at binary 0/0 on D+/D-. Port A simultaneously starts powering down portions of the electrical sub-block, as required by the current wake-up time.
B1: Physical layer at Port B receives the first phit of the PM.LinkEnterL0s packet.
B2: Physical layer at Port B receives the first phit of the Null Ctrl flit.
B3: Physical layer at Port B forwards the Null Ctrl flit to the Link layer at B.
B4: Link layer at B signals the Physical layer at B to enter L0s. Between B3 and B4, the Link layer at B receives garbage from the Physical layer at B, which it ignores; the Link layer expects to see this garbage following an L0s entry request from Port A. The Physical layer beat PhyRxRdy is turned off, and turning this beat back on at re-entry to L0 is the indication for the Link layer to accept flits again; any garbage received until that future point in time is ignored. Port B starts turning off portions of the electrical sub-block, as required by the current wake-up time.
A5: The Port A Tx side is in L0s.
A6: Earliest time at which Port A can exit L0s, by transitioning the link from inactive to active state.
B5: The Rx side of Port B is in L0s. Activity detectors are turned on to sense a link state change from inactive to active; this is the earliest time that Port B can respond to a wake-up signal from Port A. (Authors' Note: Discuss circuit implications of turning the activity detectors on at B4, to reduce TL0S_SLEEP_MIN.)

Timers:
tL0S_PKT: Number of UI required to transfer the PM.LinkEnterL0s packet; depends on the link transfer ratio (see Section 3.8).
tFLIT: Number of UI required to transfer a flit; depends on the link transfer ratio (see Section 3.8).
TL0S_ENTER_Tx_DRV: Number of UI for which Tx is required to drive 1/0 on D+/D- after sending the Null Ctrl flit.
tL0S_Enter_Tx_Off: Time required for Port A to turn off portions of the electrical sub-block, as defined by the current L0s wake-up time.
TL0S_SLEEP_MIN: Minimum amount of time Port A is required to stay in L0s before it can wake up Port B.
tRx_PHY->LL: Internal delay between the Physical layer and the Link layer at B, for the flit to reach the Link layer.
tRx_LL->PHY: Time for the Link layer at B to process the L0s entry request and to signal an L0s entry to its Physical layer.
tL0S_Enter_Rx_Off: Time required for Port B to turn off portions of the electrical sub-block, as defined by the current L0s wake-up time.

TL0S_SLEEP_MIN and TL0S_ENTER_Tx_DRV are the two parameters required by the Physical layer to support L0s entry, and are expected to be programmed in the Power Management register prior to entering L0s. It is evident from Figure 3-16 that Port A needs to stay in L0s for a minimum time period, TL0S_SLEEP_MIN, whose value is defined by the following equation, rounded up to the next UI:

TL0S_SLEEP_MIN = tRx_PHY->LL_MAX + tRx_LL->PHY_MAX + tL0S_Enter_Rx_Off_MAX - TL0S_ENTER_Tx_DRV - tL0S_Enter_Tx_Off_MIN

The time required by Port A (Tx) to start entering L0s after the decision has been made is:

tL0S_Enter_Tx = (tL0S_PKT + tFLIT + TL0S_ENTER_Tx_DRV) [UI]

The time required by Port B (Rx) to start entering L0s after the decision has been made is:

tL0S_Enter_Rx = (tL0S_PKT + tFLIT + tRx_PHY->LL + tRx_LL->PHY) [UI]
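The three timing relations above can be exercised numerically. The sketch below plugs illustrative (made-up) UI values into the equations; only the equations themselves come from the text, and the TL0S_SLEEP_MIN expression should be checked against the governing revision of the spec.

```c
/* Worked example of the L0s-entry timing equations, all values in UI.
 * All numeric values are illustrative assumptions. */
#include <stdio.h>

int main(void)
{
    /* Programmed constants (upper-case T) and circuit times (lower-case t). */
    int tL0S_PKT = 8, tFLIT = 4;          /* depend on transfer ratio */
    int TL0S_ENTER_Tx_DRV = 16;
    int tL0S_Enter_Tx_Off_min = 32;
    int tRx_PHY_to_LL_max = 6, tRx_LL_to_PHY_max = 6;
    int tL0S_Enter_Rx_Off_max = 40;

    int tL0S_Enter_Tx = tL0S_PKT + tFLIT + TL0S_ENTER_Tx_DRV;
    int tL0S_Enter_Rx = tL0S_PKT + tFLIT + tRx_PHY_to_LL_max
                      + tRx_LL_to_PHY_max;
    /* Budget: Port B (Rx) finishes entering L0s later than Port A (Tx),
     * so Port A must sleep at least the difference. */
    int TL0S_SLEEP_MIN = tRx_PHY_to_LL_max + tRx_LL_to_PHY_max
                       + tL0S_Enter_Rx_Off_max
                       - TL0S_ENTER_Tx_DRV - tL0S_Enter_Tx_Off_min;

    printf("tL0S_Enter_Tx  = %d UI\n", tL0S_Enter_Tx);
    printf("tL0S_Enter_Rx  = %d UI\n", tL0S_Enter_Rx);
    printf("TL0S_SLEEP_MIN = %d UI (round up to next UI)\n", TL0S_SLEEP_MIN);
    return 0;
}
```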
3.9.5.2 L0s Exit Sequence
Figure 3-17 shows the L0s exit sequence initiated by Port A (Tx). The figure shows the event scale (A# and B#) on the vertical axis and link communication along the tilted horizontal axis, with the time between events also shown on the vertical axis. The L0s exit policy is based on a pre-determined wake-up time, TL0S_WAKE, that is common to both ports; this value should be programmed in the Power Management registers before the link entered L0s. Both ports are required to power down their circuitry upon L0s entry such that they can wake up within TL0S_WAKE from the time a decision to exit L0s has been made. As the exit mechanism uses an analog activity detection scheme, the debounce time required by the activity detectors needs to be factored in, along with the variation in activity detector response time due to PVT. These activity detector parameters, TL0S_EXIT_DEBOUNCE_MAX and TL0S_EXIT_NOP, need to be programmed by a layer above the Physical layer prior to entering L0s. The latter parameter indicates the amount of time, in UI, for which Null Ctrl flits should be sent; it is arrived at by estimating the absolute maximum value of the activity detector debounce time across all process, voltage and temperature (PVT) variations (i.e., TL0S_EXIT_DEBOUNCE_MAX) and the maximum variation of this debounce time across PVT.

Figure 3-17. L0s Exit Sequence (events A1 through A5 at Port A and B1 through B5 at Port B; Port A drives D+/D- to 1/0 (LinkActive), then sends Null Ctrl flits #1..#n followed by control/data flits; intervals shown are TL0S_EXIT_DEBOUNCE_MAX, tL0S_Exit_Tx_On, tL0S_Exit_Tx_Wait, tL0S_Exit_Rx_On, tL0S_EXIT_DEBOUNCE, tL0S_Exit_Rx_Wait <= TL0S_EXIT_NOP, and TL0S_WAKE).

Events:
A1: Link layer at Port A signals the Physical layer at Port A to exit L0s. The Physical layer on Port A signals the Physical layer on Port B to exit L0s by driving the D+/D- of all Tx pairs re-entering L0 to 1/0. Simultaneously, the Physical layer on Port A starts waking up the powered-down portions of the electrical sub-block.
A2: Physical layer continues to drive 1/0 while the electrical sub-block is still being woken up.
B1: Analog detectors of the Physical layer on Port B start sensing an active link.
B2: After the analog detector debounce time, the link is deemed active; the Rx portion of Port B starts waking up.
B3: Physical layer at Port B has turned on its electrical sub-block. Assuming this happened between flit boundaries, Port B waits until the next flit boundary before entering L0.
A3: Physical layer at Port A wakes up completely. Assuming the Physical layer woke up between flit boundaries, it waits until the next flit boundary before sending flits out.
A4: Physical layer is at a flit boundary and is ready to transmit flits from the Link layer; it turns the PhyTxRdy beat on to accept flits. The Link layer initially sends Null Ctrl flits until Port B is guaranteed to have exited L0s; the Physical layer on Port A is in L0 state, but no real flits can be transmitted by the Link layer at A yet.
A5: The Port A Tx side is in L0 state as intended, and the Link layer at A can start transmitting real flits. Note that the Null Ctrl flits sent by Port A start appearing at Port B before the latter has exited L0s; this is acceptable, as the Link layer at Port A does not expect an ACK for Null Ctrl flits.
B4: Port B is finally in L0 and starts receiving flits sent by Port A. Port B missed the first (n-1) Null Ctrl flits. The Physical layer at Port B turns on its PhyRxRdy beat and starts forwarding flits to the Link layer at B. It is required that the first Null Ctrl flit sent by Port A arrive at Port B no later than B4, and that the first control/data flit (non-Null Ctrl flit) arrive no earlier than B4.
B5: The Rx side of Port B is in L0 state as intended, as it is now forwarding control/data flits to the Link layer at Port B.

Table 3-41. L0s Exit Events and Timers (Continued)
Timers:
TL0S_EXIT_DEBOUNCE_MAX [UI]: The absolute maximum debounce time required by the activity detectors on Port B to detect the link active state, expressed in UI; the maximum value should include all possible process, voltage and temperature variations.
tL0S_Exit_Tx_On: Time taken by Port A to turn on its electrical sub-block; adjusted to meet TL0S_WAKE after factoring in TL0S_EXIT_DEBOUNCE_MAX and TL0S_EXIT_NOP.
tL0S_Exit_Tx_Wait: Time Port A waits until the next flit boundary.
tL0S_EXIT_DEBOUNCE: Time taken by the activity detectors on Port B to sense the link active state; this value will be between TL0S_EXIT_DEBOUNCE_MIN and TL0S_EXIT_DEBOUNCE_MAX.
tL0S_Exit_Rx_On: Time taken by Port B to turn on its electrical sub-block; adjusted to meet TL0S_WAKE by subtracting TL0S_EXIT_DEBOUNCE_MIN from TL0S_WAKE, i.e., Port B has to assume that its activity detectors could detect link activity in the minimum possible time.
tL0S_Exit_Rx_Wait: Time Port B waits until the next flit boundary.
TL0S_EXIT_NOP [UI]: Duration for which Null Ctrl flits are sent by the Link layer at Port A after exiting L0s; derived by adjusting TL0S_EXIT_DEBOUNCE_MIN to match the next highest TFLIT at the current link transfer ratio. The duration for which the Physical layer on Port B receives Null Ctrl flits and forwards them to the Link layer at Port B is <= TL0S_EXIT_NOP.
TL0S_WAKE [UI]: The L0s wake-up time that was in effect prior to entering L0s.

The Port A (Tx) and Port B (Rx) circuit turn-on times should always be set as:

tL0S_Exit_Tx_On = TL0S_WAKE - {TL0S_EXIT_DEBOUNCE_MAX + (TL0S_EXIT_DEBOUNCE_MAX - TL0S_EXIT_DEBOUNCE_MIN)}
tL0S_Exit_Rx_On = TL0S_WAKE - TL0S_EXIT_DEBOUNCE_MIN

where the above time constants are the currently defined values in the Power Management registers. If lower values are used for the circuit turn-on times, the delta between the values in the equations above and the values actually used should be compensated for by increasing tL0S_Exit_Tx_Wait and tL0S_Exit_Rx_Wait, respectively.
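A worked example of the turn-on budget, again with made-up UI values; only the two equations come from the text.

```c
/* Worked example of the L0s-exit circuit turn-on equations, in UI.
 * All numeric values are illustrative assumptions. */
#include <stdio.h>

int main(void)
{
    int TL0S_WAKE = 200;
    int TL0S_EXIT_DEBOUNCE_MAX = 48, TL0S_EXIT_DEBOUNCE_MIN = 32;

    int tx_on = TL0S_WAKE - (TL0S_EXIT_DEBOUNCE_MAX
              + (TL0S_EXIT_DEBOUNCE_MAX - TL0S_EXIT_DEBOUNCE_MIN));
    int rx_on = TL0S_WAKE - TL0S_EXIT_DEBOUNCE_MIN;

    /* If hardware uses faster turn-on times, the slack is absorbed in
     * tL0S_Exit_Tx_Wait / tL0S_Exit_Rx_Wait at the next flit boundary. */
    printf("tL0S_Exit_Tx_On = %d UI\n", tx_on);  /* 200 - 64 = 136 */
    printf("tL0S_Exit_Rx_On = %d UI\n", rx_on);  /* 200 - 32 = 168 */
    return 0;
}
```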
3.9.5.3 L0s Corner Cases
Since L0s does not require an ACK from the remote port, the local port initiating L0s enters L0s even if the remote port sees a CRC error in the L0s entry packet. Recovery from the CRC error follows the Link layer retry sequence: the remote port sends a retry request to the local port. If the local port is still in L0s, it wakes up and retransmits the erroneous packet, then enters L0s again; if the local port chose not to go back to L0s, it wakes up and replaces the previous L0s entry packet with Null Ctrl flit(s), resulting in both sides going back to L0. If the retry packet arrives at the local port after it has exited L0s, it likewise replaces the previous L0s entry packet with Null Ctrl flit(s); any subsequent packets sent by the local port after it exited L0s are also retransmitted. The remote port will be agnostic to the most recent L0s request, but no packets are lost.

3.9.5.4 Independent L0s
Some implementations may choose to implement L0s on a quadrant basis for aggressive power savings; this feature is also required to support the Link Width Modulation described in Section 3.9.5.5. The entry and exit sequences are similar to the L0s entry and exit sequences described in Section 3.9.5.1 and Section 3.9.5.2, respectively, but instead of operating on the entire port, they are localized to the quadrants of choice. The L0s entry packet specifies the quadrant that is to enter L0s, and activity detectors sensing an L0s exit wake up only the Rx belonging to that quadrant. A quadrant exiting L0s while a portion of the link is active enters a limbo state instead of going to L0: in this limbo state, all Tx and Rx are powered up as in L0, but the nibble muxes/de-muxes for these quadrants are turned off, ensuring that no Link layer traffic flows on the quadrant. A quadrant in the limbo state stays there indefinitely, until a Link Width Modulation request merges it with other quadrants to form a wider link. Independent L0s supports wake-up times on a quadrant basis for further power savings: for instance, a full width link can be downgraded to a half width link, with the inactive lanes going into L0s with a long wake-up latency, while the active lanes forming the link go in and out of L0s using a much shorter wake-up latency.

3.9.5.5 Link Width Modulation for Power Savings
Link width modulation provides the flexibility of managing link power by trading off link power consumption against bandwidth; the criteria for making this trade-off are implementation specific. Link width can be adjusted in one direction of the link independent of the other direction (it is even allowed for the other direction of the link to be in L0s). The LMs exchanged during the most recent initialization sequence are used to configure the outbound link in the new width: the Link layer queries the Physical layer for a Common Lane Map (CLM) supported by the remote port at the new desired width. It is possible for the remote port not to support an LM at the newly requested link width, as indicated in the remote WCI (stored locally), in which case the local Link layer aborts the link width modulation attempt. If link width modulation is attempting to increase the link width, the new lanes to be added must be powered up by the time the Link layer signals the Physical layer to modulate the link width: for instance, if a link is operating in half width mode with the other half in L0s, then to go to full width, the portion of the link in L0s is powered up before the link width modulation request is presented to the Physical layer. Conversely, if link width modulation is from a wider to a narrower width, the portion chosen for exclusion is powered down while the rest of the link adjusts to the new link width. Once all lanes to be included at the new link width are ready, the Link layer on the local port communicates the link width modulation request by sending a PM.LinkWidthConfig packet. The Physical layer maintains flit boundary alignment between the two connected ports before and after link width modulation, using two timers, TLWM_ENTER_NOP and TLWM_MUX_SWITCH, which need to be programmed in the Power Management register prior to initiating a link width modulation request. TLWM_ENTER_NOP is the time required by the remote port to signal the link width modulation event to its Physical layer; specifically, this corresponds to the time required for the remote Physical layer to forward the PM.LinkWidthConfig packet to the remote Link layer, and for the Link layer to process this packet and signal the remote Physical layer to start adjusting to the new width, which is also communicated by the remote Link layer using the CLM field of the PM.LinkWidthConfig packet. This value should be specified in UI and should be constant across all PVT variations. TLWM_MUX_SWITCH is the amount of time required by each port to switch its muxes to support the new link width, specified in UI; it is the higher of the mux switching times of the two ports.
After sending the PM.LinkWidthConfig packet, the local Link layer sends Null Ctrl flits for a minimum time period of TLWM_ENTER_NOP, rounded up to the next flit boundary; the old link width is used to track the flit boundary during this time. After sending the required number of Null Ctrl flits, the local Link layer signals the local Physical layer to adjust its muxes to the new link width. The Physical layer drives 1/0 on D+/D- on all active Tx differential pairs for a minimum time period of TLWM_MUX_SWITCH, adjusted to the next flit boundary using the new link width; during this time, the PhyTxRdy beat is turned off on the local Physical layer. Note that, during the process of link width modulation, the flit boundary counters on the local port switch to the new link width only after the Null Ctrl flits have been sent, to guarantee flit boundary alignment between the local and remote ports when they start communicating at the new width. At the remote port, the Physical layer forwards all incoming flits until it receives a signal from the remote Link layer indicating a link width change; the remote Link layer also indicates the new CLM to be used, which is sent as part of the PM.LinkWidthConfig packet. The remote Physical layer switches to the new width and starts accepting flits again at the next flit boundary, computed using the new link width; the remote PhyRxRdy beat is turned off while the muxes are being adjusted to support the new link width. Once the muxes are adjusted to support the new widths, the Physical layers at either end transfer control back to the corresponding Link layers (by turning their beats back on). It is possible for the remote Link layer to receive a CRC error in the PM.LinkWidthConfig packet or in the ones preceding it; in this case, the remote Link layer sends a retry request to the local Link layer, along with the link width currently in effect at the remote Link layer. The local Link layer then adjusts its link width to match the remote port's link width and responds to the retry request from the remote port. A link width modulation sequence between two ports, Port A and Port B, is shown in Figure 3-18; in this example, Port A is initiating a link width modulation request.

Figure 3-18. Link Width Modulation Sequence (events A1 through A4 at Port A and B1 through B4 at Port B; the PM.LinkWidthConfig packet is followed by Null Ctrl flits #1..#n for TLWM_ENTER_NOP, a 1/0 drive on D+/D- for TLWM_MUX_SWITCH, and then control/data flits at the new width; tLWM_PKT marks the packet transfer time).

Events:
A1: Link layer at Port A initiates a link width modulation sequence by sending a PM.LinkWidthConfig packet. If link width modulation results in a width reduction, all unused lanes begin to turn off at this point; they do not impact link width modulation timing. If link width modulation results in a width increase, the new lanes to be added need to be powered up ahead of time, and need to be in L0 by A3 (see below).
A2: Link layer at Port A starts sending Null Ctrl flits, as required by the link width modulation sequence; Null Ctrl flits will be sent for a period of TLWM_ENTER_NOP, rounded to the next flit boundary using the current link width.
A3: Physical layer on Port A is informed of the new link width by this time. It turns off the PhyTxRdy beat and puts a differential DC swing on all lanes that are a part of the new link width: D+/D- on these lanes is driven to 1/0 for a period of TLWM_MUX_SWITCH, rounded to the next flit boundary using the new width the link is being configured to. Simultaneously, the Physical layer on Port A adjusts its internal muxes to support the new link width and computes flit boundary alignment using the new link width, starting from A3.
B1: Physical layer at Port B receives the PM.LinkWidthConfig packet sent by Port A and forwards the packet to the Link layer at Port B.
B2: Physical layer at Port B is still not aware of the link width modulation and continues to forward Null Ctrl flits to the Link layer at Port B.
B3: Link layer at Port B signals its Physical layer to change the link width, and sends the CLM corresponding to the new width, which was received as part of the PM.LinkWidthConfig packet sent by Port A. The Physical layer at Port B turns off its PhyRxRdy beat, starts adjusting its muxes to support the new link width, and starts computing the flit boundary using the new link width; incoming link traffic is ignored during this time.
A4: Physical layer on Port A is ready to communicate using the new link width. It re-enables the PhyTxRdy beat and can accept flits from the Link layer; this is the earliest time the Link layer at A can send control/data flits at the new link width.
B4: Physical layer at Port B re-enables its PhyRxRdy beat and starts accepting incoming flits at the new width.

Table 3-42. Link Width Modulation Events and Timers (Continued)
Timers at Port A (initiating port):
• tLWM_PKT [UI]: Length of the PM.LinkWidthConfig packet; does not impact the link width modulation sequence.
• TLWM_ENTER_NOP [UI]: Minimum time for which Null Ctrl flits are sent by Port A before adjusting its muxes to the new link width; Null Ctrl flits are sent by rounding this number up to the next flit boundary.
• TLWM_MUX_SWITCH [UI]: Time required by the Physical layer to switch its muxes to support the new link width; after adjusting the muxes, the Physical layer waits until the next flit boundary, computed using the new width, to re-enable the PhyTxRdy beat.
Timers at Port B (remote port):
• tLWM_PKT [UI]: Length of the PM.LinkWidthConfig packet; does not impact the link width modulation sequence.
• TLWM_ENTER_NOP [UI]: Time required by the Link layer to signal the Physical layer about the link width modulation once the PM.LinkWidthConfig packet is received; the Physical layer waits until the next flit boundary (at the current width) before switching its muxes.
• TLWM_MUX_SWITCH [UI]: Time required by the Physical layer to switch its muxes to support the new link width; after adjusting the muxes, the Physical layer waits until the next flit boundary, computed using the new width, to re-enable the PhyRxRdy beat.

3.9.5.6 Link Width Modulation Corner Cases
A link width modulation packet received by the remote port might have CRC errors. As this low power mode does not require an ACK, the transmit side of the local port may have already adjusted to the new width by the time a retry request arrives from the remote port. This corner case is addressed by the Link layer: the remote port sends the CLM in effect at its receiver as part of the retry request, and the transmit side of the local port adjusts to this CLM and re-transmits the erroneous packets. An extreme example of CRC failure occurs when both directions of a link simultaneously attempt to modulate the link width and both ports see CRC errors simultaneously: neither port can respond to the retry request from the other, as they no longer have a common width. This corner case is addressed using the existing Link layer retry mechanism.
Each port sends a retry request which eventually times out; this results in each port continuously sending retry packets until the retry threshold is reached, which in turn results in the Link layer forcing a Physical layer initialization. Both ports go through link re-initialization and establish a link.

3.9.5.7 L1 Low Power State
Both directions of the link are required to go into L1 state together. In L1 state, circuits in the electrical sub-block are turned off and the logical sub-block is functionally shut down; however, power is maintained to the logical sub-block to ensure that the Physical layer configuration is not lost during L1. A platform might also choose to turn off the Physical layer internal (PLL) clock. Prior to entering L1, each port also configures itself such that calibration is bypassed upon exit from L1, and it is required that all Rx terminations meet ZRX_HIGH_CM in L1 state.

The Link layer on the local port signals its Physical layer that an entry into L1 is impending, and starts sending L1 packets to the remote port. The Link layer on the remote port, after receiving an L1 packet, signals its Physical layer that an entry into L1 is to be expected, and ACKs the local port's L1 entry request. When the Link layer on the local port receives the remote port's ACK, it instructs the local Physical layer to enter L1; the local Physical layer responds to this signal by sending an Inband Reset to the remote Physical layer and enters the L1 state. The remote Physical layer interprets this Inband Reset as an entry into L1, based on the earlier signal from its Link layer, and enters L1 as well. The remote port may instead choose to NACK the L1 entry request from the local port, in which case the remote Physical layer is not informed of the L1 request; the Link layer on the local port, upon receiving the remote NACK, abandons its L1 request and instructs its local Physical layer not to expect an entry into L1 until further notice, and the ports continue to remain in L0 state.

CRC errors detected by either port after an L1 entry sequence has started result in both sides ignoring that L1 sequence. For instance, if the remote Link layer detects a CRC error, either on the L1 entry packet or on flits prior to it, it sends a retry request to the local Link layer; in this case, the remote port is not aware of the L1 request and hence continues to stay in L0. The local Link layer, upon receiving the retry request, abandons the current L1 sequence and continues to stay in L0. Conversely, if the local Link layer sees a CRC error after sending an L1 entry packet, it abandons the current L1 sequence and sends a retry request to the remote port; the remote port, which is expecting an Inband Reset to enter L1, abandons the current L1 sequence upon seeing this retry request. In all cases, when a Link layer abandons its L1 sequence, it instructs the Physical layer accordingly, to ensure that a subsequent Inband Reset is not interpreted as an indication to enter L1.
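The entry handshake above, seen from the local port, reduces to a small loop. The event model and helper functions in this sketch are illustrative stand-ins for the Link/Physical layer signalling, not spec-defined interfaces.

```c
/* Condensed sketch of the L1 entry handshake at the local port. */
#include <stdio.h>

enum l1_event { EV_REMOTE_ACK, EV_REMOTE_NACK, EV_RETRY_REQUEST };

static enum l1_event wait_event(void) { return EV_REMOTE_ACK; } /* stub */
static void send_l1_entry_packet(void) {}
static void send_inband_reset(void)   { puts("Inband Reset -> enter L1"); }

static void l1_entry_sequence(void)
{
    for (;;) {
        send_l1_entry_packet();       /* repeat until remote responds */
        switch (wait_event()) {
        case EV_REMOTE_ACK:           /* remote Physical layer primed */
            send_inband_reset();      /* interpreted as L1 entry, not
                                         re-initialization            */
            return;
        case EV_REMOTE_NACK:          /* remote declined L1           */
        case EV_RETRY_REQUEST:        /* CRC error: abandon sequence  */
            puts("abandon L1 sequence, stay in L0");
            return;
        }
    }
}

int main(void) { l1_entry_sequence(); return 0; }
```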
Remote port senses an exit from L1 when at least one clock Tx pair detects local clock Rx terminations for a period of TDEBOUNCE (TDEBOUNCE <= TL1_EXIT_DEBOUNCE). (Note: As the clock may be turned off in L1, the remote Tx may have to use an alternate timing reference to meet the debounce time requirement. This can be done through the system clock or by using an RC circuit to provide the required time constant. The exact mechanism is implementation dependent. Likewise, depending on implementation style, the local port might not be able to turn on local clock Rx terminations until the Internal Clock Stable signal is seen. This specification does not preclude this additional time required by the local port before it can send an L1 exit signal to the remote port.) The local port enters Disable/Start state after turning on local clock Rx terminations for at least a period of TL1_EXIT_DEBOUNCE. Once in Disable/Start state, local clock Rx terminations must meet ZRX_HIGH_CM. The local port waits for a time period of TINBAND_RESET_INIT before entering Detect.1 state. The remote port, on the other hand, enters Disable/Start state once remote clock Tx sense local clock Rx terminations for a period of TDEBOUNCE. The remote port waits in Disable/Start for a time period of TINBAND_RESET_INIT before entering Detect.1 state. Once both sides are in Detect.1 state, initialization proceeds using the normal initialization flow. The L1 entry and exit sequence is shown in Figure 3-19.
[Figure 3-19. L1 Entry and Exit Sequence: timing diagram of events A1-A8 on Port A and B1-B6 on Port B, showing L1EntryPacket#1/#2, L1ACK#1/#2, Inband Reset, exit from L1 (analog indicator), TL1_EXIT_DEBOUNCE, and TINBAND_RESET_INIT; figure not reproduced.]
Table 3-43. L1 Entry and Exit Events/Timers
A1: Link layer at Port A starts the L1 sequence by sending an L1 Entry packet and simultaneously signals its Physical layer to expect an entry into L1.
A2: Link layer at Port A continues to send L1 Entry packets until an ACK is received from Port B.
A3: Link layer at Port A receives an ACK from Port B, and stops sending L1 Entry packets. Enters L1 by signalling Inband Reset. Physical layer on Port A prepares to enter L1.
B1: Link layer at Port B sees the L1 Entry packet from Port A. Signals an L1 entry to its Physical layer and ACKs the L1 entry packet.
B2: Link layer on Port B receives L1 Entry Packet #2 and ACKs this as well. Link layer at Port B continues to ACK L1 Entry packets until an Inband Reset is seen.
B3: Physical layer at Port B sees the Inband Reset and prepares to enter L1 instead of re-initializing the Physical layer.
Physical layer on Port A turns off circuitry in the electrical sub-block and maintains all Rx terminations at ZRX_HIGH_CM. Tx termination detectors are turned-on. The logical sub-block is no longer functional - the PhyTxRdy beat is turned off and all internal counters are turned off, but power supply is maintained to remember the port configuration prior to entering L1. Flit boundary is lost in L1. Physical layer on Port B turns off circuitry in the electrical sub-block and maintains all Rx terminations at ZRX_HIGH_CM. Tx termination detectors are turned-on.
The logical sub-block is no longer functional - the PhyTxRdy beat is turned off and all internal counters are turned off, but power supply is maintained to remember the port configuration prior to entering L1. Flit boundary is lost in L1.
Events:
A4: A hypothetical event. ACK for L1 Entry packet #2 would have been received at this point, but Port A is already in L1. It does not matter how many L1 entry packets Port A sends to Port B.
A5: Physical layer on Port A receives an L1 exit signal from a higher layer. Waits for the Internal Clock Stable signal.
A6: Internal Clock Stable signal asserted. All clock Rx now meet ZRX_LOW_CM, which can be detected by clock Tx on Port B.
B4: Clock Tx on Port B detect clock Rx terminations on Port A for a period of TDEBOUNCE. Port B enters Disable/Start state and waits for the Internal Clock Stable signal.
B5: Internal Clock Stable signal seen. Waits for TINBAND_RESET_INIT before entering Detect.1.
B6: Port B is in Detect.1 state. Follows the normal Physical layer initialization flow.
A7: Clock Rx terminations reverted to ZRX_HIGH_CM. Port A enters Disable/Start state.
A8: Port A is in Detect.1 state. Follows the normal Physical layer initialization flow.
Timers:
TL1_EXIT_DEBOUNCE: Minimum amount of time clock Rx terminations must meet ZRX_LOW_CM.
TINBAND_RESET_INIT: Time for which Port A stays in Disable/Start before entering Detect.1.
TDEBOUNCE: (not shown in Figure 3-19) Time for which remote clock Rx terminations need to be detected before Port B enters Disable/Start state.
TINBAND_RESET_INIT: Time for which Port B stays in Disable/Start before entering Detect.1.
It is evident from the above discussion that TDEBOUNCE should be less than TL1_EXIT_DEBOUNCE. 3.9.5.8 L2 Low Power State No special support is provided by the Physical layer to support L2 low power state. The Physical layer is placed in L1 state before the link is transitioned to L2 state. See the CSI Power Management chapter for further details on L2 state. 3.9.6 Physical Layer Determinism Requirements CSI operation requires that each CSI port synthesize the internal link clocks from a single reference clock source. The synthesized clocks in two connected CSI ports will have a time varying phase difference due to PVT variation in the reference clock. The received clock at the receiver will have additional phase difference due to PVT variation in the link. The Physical layer determinism mechanism should contain these variations and provide a fixed and repeatable latency between connected CSI ports as seen by their respective link clocks. The Physical layer determinism mechanism will not use any side-band signal between connected CSI ports. It will be based on a synchronizing signal generated by a central agent and distributed to both ports with a PVT variation of less than one reference clock UI. The Physical layer determinism is based on a synchronization counter, which is synchronized using a synchronizing signal provided to both the connected CSI ports. The assertion of the signal as sampled by the reference clock in the end CSI ports should be deterministic with respect to each other. For example, consider a CSI link between port A and port B. Port A samples an assertion in the synchronizing signal at the nth reference clock UI from some fixed point in time. Port B samples the assertion in its synchronizing signal at the mth reference clock UI from the same fixed point. On every system power up and initialization, the assertion of the synchronizing signal should be sampled at the same n and m in these ports.
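A minimal model of this free-running synchronization counter and its sampling; the names, the increment-by-one link-clock-UI tick, and the sampling helper are illustrative sketches, not part of the specification:

```c
#include <stdint.h>

/* Free-running synchronization counter: set once at the synchronization
 * point, then incremented every link clock UI, wrapping at a wrap count
 * that must match on both connected ports. */
struct sync_counter {
    uint32_t value;
    uint32_t wrap_count;
};

static void tick(struct sync_counter *c)
{
    c->value = (c->value + 1) % (c->wrap_count + 1);
}

/* Sample the counter ui_offset reference-clock UIs after the fixed point;
 * determinism requires this offset (n for port A, m for port B) to be the
 * same on every power-up, not that the two ports use the same offset. */
static uint32_t sample_after(struct sync_counter c, uint32_t ui_offset)
{
    while (ui_offset--)
        tick(&c);
    return c.value;
}
```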
One specific example of a synchronizing signal is the use of the de-assertion edge of system reset as sampled by the reference clock. Such a signal will limit the variation between the starting points of the synchronization counters in connected CSI ports to within one reference clock UI. The synchronization counter should be deterministic with respect to events in the Link layer or other higher layers. The implementations are required to ensure this by aligning the link clock phase to other clocks in the system appropriately. The synchronization counter should be clocked by the link clock, which is the same clock running the state machines and training sequences. Once set at the synchronization point, the counter will run freely forever. The wrap points of the counters in connected CSI ports should match. The exact wrap count may be determined by other conditions like hot-plug. (WIP). The synchronization counter will reference all Physical layer determinism states and latency between the connected ports. During initialization, the receiver will fix the latency from the transmitter as seen by these counters to the accuracy of a link clock UI. In implementations employing flit clock, which is one fourth of the link clock, latency will be fixed to the accuracy of a flit clock UI. The counter granularity is required to correspond to link clock UI for compatibility purposes. However, the counter shall increment by four for every flit clock. The flit clock based implementations are insensitive to PVT variations of less than a flit clock UI. After the initialization and latency fixing, the Physical layer will hold on to this latency even during retraining. The retraining period should be small enough to bring the drifted strobe back to the middle of the data eye, without losing any data bit. (Is it possible to detect the data loss at re-training? (WIP)). Depending on the differential jitter spectrum of the data and received clock, an appropriate retraining period should be chosen to avoid any data loss. A drift buffer shall be implemented in each lane to absorb the phase variation between the received clock and the link clock. The initial drift buffer depth set up during initialization can be controlled by CSR. An alarm status flag is set in case the phase between the received clock and the link clock drifts such that the drift buffer depth falls below the threshold. The drift buffer depth refers to the difference between the read and write pointers of the drift buffer. The latency value fixed during initialization can be read through CSR. Note that the resulting latency may change from initialization to initialization. The latency can be fixed to a required target link latency, if one is specified through CSR. If specified, the Physical layer will fix the link latency to the desired target latency value at every initialization. The size of the latency buffer should be big enough to accommodate the PVT variation from initialization to initialization. For systems operating in lock-step, link latency fixing is mandatory. In such systems, an adequate depth of latency buffer should be provided to accommodate link and clock variations under all possible design corners. The mechanism works by introducing the synchronization counter value and the target link latency in training sequence TS3. The transmitter samples the synchronization counter at some implementation specific, deterministic point near TS3 transmission and puts the value in the training sequence.
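The next paragraph states the exact arithmetic for the TS3 synchronization count and the latency-buffer depth; the following hedged C rendering of those two formulas uses illustrative names:

```c
#include <stdint.h>
#include <stdlib.h>

/* Transmitter side: synchronization count carried in the next TS3.
 * Consecutive TS3s differ by the TS3 length, modulo the wrap count. */
static uint32_t ts3_count_next(uint32_t prev, uint32_t ts3_len_ui,
                               uint32_t wrap_count)
{
    return (prev + ts3_len_ui) % (wrap_count + 1);
}

/* Receiver side: latency buffer depth needed to reach the requested
 * target link latency, all quantities in link clock UI. */
static uint32_t latency_buffer_depth(int target_latency,
                                     int local_sync_count,
                                     int received_sync_count,
                                     uint32_t buffer_size)
{
    int perceived = local_sync_count - received_sync_count;
    return (uint32_t)abs(target_latency - perceived) % buffer_size;
}
```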
Naturally, the values in consecutive training sequences will differ by the length of TS3. Specifically, the counter value in TS3(n+1) = (counter value in TS3(n) + length of TS3) mod (Synchronization Counter Wrap Count + 1). The receiver will sample the synchronization counter at some implementation specific, deterministic point near TS3 reception and compare it to the arriving synchronization count. The difference between these count values is the actual perceived latency in the link, in terms of link clock UI. The latency buffer depth is adjusted to fix the total latency to the requested target link latency. Specifically: Latency Buffer Depth = |Received Target Link Latency - (Local Synchronization Count - Received Synchronization Count)| mod (Latency Buffer Size). The modulus operation is performed to take care of the overrun or underrun condition. Note that such cases do not cause data loss; however, determinism may not be guaranteed. A flag is set in CSR under such conditions. The receiver should get the same depth value in two consecutive computations before it actually sets the latency buffer depth. At the same time, the drift buffer depths in each lane are set to the initial drift buffer depth. Further details pertaining to clocking requirements, hot plug support, tester support and repeater support are provided in the DFx chapter. 3.9.7 Periodic Link Retraining The Physical layer does periodic retraining of receivers without Link layer involvement. Periodic retraining is controlled by two parameters - the periodic retraining interval [UI] and the periodic retraining duration [UI], both of which are programmed by firmware and are required to be identical for both ports. Periodic retraining involves sending a clock pattern (1010...) on each data lane. The basic retraining pattern is 16 bits long, starting with a 0 (the bit transmitted first), which is repeated for the periodic retraining duration. The periodic retraining duration, thus, needs to be a multiple of 16. The periodic retraining frequency is also required to be a multiple of 16 to ensure that the beginning of the retraining pattern always aligns to a flit boundary. Periodic retraining counters are synchronized on both ports through the Physical layer determinism scheme (see Section 3.9.6). Periodic retraining counters are updated once a port enters L0. When these counters reach the periodic retraining interval threshold, the Physical layer is temporarily disconnected from the Link layer, and a retraining pattern is sent on each data lane. Physical layer to Link layer communication resumes after completely transmitting/receiving the periodic retraining pattern. Periodic retraining counters are reset during the retraining phase; updating these counters for the next retraining phase will start after the current retraining phase is completed. Note that the periodic retraining interval and duration are common to both ports, and synchronizing the periodic retraining counters across these ports guarantees that a connected transmitter/receiver pair know exactly when periodic retraining starts and ends. The retraining phase is completely localized within the Physical layer - the retraining patterns and retraining mechanism are transparent to the Link layer. 3.9.8 Forwarded Clock Fail-Safe Mode – Small MP and Large MP Profiles Forwarded clock fail-safe mode is supported by having pre-determined dual use data lanes. These lanes would normally act as data lanes, but in the event of a primary clock failure, they would be used as clocks. Link width may be reduced when an alternate clock is used.
The CSI specification requires a full width link to have two alternate clock lanes. Alternate clock channels are required to be physically adjacent to the primary clock channel. Hence, for a 20 pin CSI interface, pins 9 and 10 are required to support dual use data and clock functionality. The 3 available clock channels have a pre-defined priority across all CSI implementations - primary clock lane, pin 10, and pin 9, in decreasing order of priority. 3.9.9 Link Self-Healing – Large MP Profiles The Physical layer detects bad lanes during initialization and can automatically downgrade a link to operate in a narrower width mode, without requiring the rest of the system to be re-initialized. In the event of Link layer CRC errors, the Link layer retries an erroneous packet until a retry threshold is reached, before initiating a Physical layer reset. The Physical layer can be reset by configuring bits in the Control Register, or an implementation might choose to have a dedicated signal between the Physical and Link layers. The Physical layer getting a reset request forces the other end into link initialization through the Inband Reset mechanism. Both sides go through a complete initialization sequence to identify bad lanes and configure the link using a narrower width. See Section 3.9.1.3 and Section 3.9.3.4.2 for further details. 3.9.10 Support for Hot Detect – Small MP and Large MP Profiles The Physical layer supports the Hot Detect feature, where an in-line addition of a component can be detected and the link is reinitialized. During link initialization, the Physical layer waits indefinitely in Detect.1 until a CSI port is detected at the other end. Once a new part is plugged in and powered up, both ends synchronize in Detect.1 and continue with the link initialization process. However, the Physical layer requires higher layer assistance to support Hot Removal. Prior to removing a component, the Physical layer on the hot part needs to be configured such that the next link initialization sequence follows the Cold Reset path. When the component at the other end is powered down/removed, it triggers an Inband Reset which the hot part uses to start the next initialization sequence (Cold Reset). 3.9.11 Lane Reversal Lane Reversal is a feature used for reducing board layout congestion and/or complexity. Figure 3-20 shows a simple link topology between connected ports on two components - Component A and Component B. NL is the number of pins on each port. The two components are mounted adjacent to each other on a motherboard, with the pins on connected ports aligned. In this topology, a link can be formed using a straight connection by connecting pins with the same pin number on both components. Figure 3-21 shows a different topology, with Component B mounted on a daughter card. The side view shows the pin locations on Component B, looking into the daughter card from the right. A straight connection between the two components in this topology may result in a large length mismatch across lanes, potentially adding a length matching requirement on the shorter lanes to minimize lane-to-lane skew.
[Figure 3-20 (front and top views): Components A and B mounted side by side on a mother board, lanes 0..NL-1 and the CLK lanes connected straight across; figure not reproduced.]
[Figure 3-21. Daughter Card Topology - An Example (front, side, and top views): Component B mounted on a daughter card above Component A on the mother board; figure not reproduced.]
Figure 3-22 shows how the pins on both components are aligned with respect to each other. For illustration purposes, the daughter card is rotated 90 degrees clockwise, and hence the top view for Component B is represented by “looking through” the daughter card. A straight connection between the ports, in addition to having a potentially large length mismatch, may also require additional board layers to avoid lane crossing, as shown by the dotted connection between the components. The Lane Reversal feature provides the needed board routing optimization by allowing connection between pins that have different pin numbers.
[Figure 3-22. Lane Reversal – An Example (front and top views): with Component B on its daughter card, pin k of Component A lines up with pin NL-k-1 of Component B; figure not reproduced.]
Lane Reversal allows pins on one port to be mirrored with respect to the pins on the other port. Thus, Lane Reversal is defined by the following pin connection equation between two ports: Pin k (Component A) = Pin (NL-k-1) (Component B). Lane Reversal is automatically detected during link initialization by the receive side of a port, which compensates for Lane Reversal. No additional steps are required as long as either of the following pin connection equations is enforced on the board: Pin k (Component A) = Pin k (Component B) --- for a Straight Connection, or Pin k (Component A) = Pin (NL-k-1) (Component B) --- for Lane Reversal. (A code sketch of these equations appears at the end of this subsection.) 3.9.12 Lane Reversal and Port Bifurcation Lane Reversal can be supported on a bifurcated port (Section 3.9.1.8), as long as the Lane Reversal equation described in Section 3.9.11 is followed. Each half of a bifurcated port supports Lane Reversal independent of the other half.
[Figure 3-23. Routing Guidelines for a Bifurcated Port Using Lane Reversal on Both Halves: Component A (bifurcated) forming half-width links to Component B and Component C (non-bifurcated); connections marked X are not allowed, since pin numbers need to be identical across both ends of a lane; figure not reproduced.]
It should be noted that a bifurcated port has the same pin numbers as an otherwise full width port. Hence, two independent half width lane reversed links can be formed by connecting pins across ports as shown in Figure 3-23. In this example, Component A supports port bifurcation and forms two independent half width links with Component B and Component C, both of which do not support port bifurcation. The pin numbers at either end of a lane follow the Lane Reversal equation described in Section 3.9.11. The cross-marked connections shown in Figure 3-23 are not permissible, and hence will result in a link initialization error. Conversely, two independent half width links connecting a bifurcated port to two non-bifurcated ports use the pin numbers shown in Figure 3-24. A platform may choose to have straight connections on one half of the bifurcated port and Lane Reversal on the other half. In such a case, the half requiring Lane Reversal should follow the routing guidelines in Figure 3-23 and the other half using straight connections should follow the routing guidelines in Figure 3-24.
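The pin-connection equations above reduce to a one-line mapping per lane. The sketch below (illustrative names; NL = pins per port) also shows how a receive side could recognize a reversed board during initialization:

```c
#include <stdbool.h>
#include <stdint.h>

/* Straight Connection: pin k of Component A connects to pin k of B. */
static uint32_t pin_straight(uint32_t k, uint32_t nl) { (void)nl; return k; }

/* Lane Reversal: pin k of Component A connects to pin NL-k-1 of B. */
static uint32_t pin_reversed(uint32_t k, uint32_t nl) { return nl - k - 1; }

/* Since one of the two equations must hold for every k, observing the
 * mapping of a single lane is enough to tell a reversed connection from
 * a straight one. */
static bool lane_reversal_detected(uint32_t k, uint32_t observed_pin_b,
                                   uint32_t nl)
{
    return observed_pin_b == pin_reversed(k, nl);
}
```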
[Figure 3-24. Routing Guidelines for a Bifurcated Port Using Straight Connections on Both Halves: Component A (bifurcated) connected straight to Component B and Component C (non-bifurcated); connections marked X are not allowed for a Straight Connection, since pin numbers need to be identical across both ends of a lane; figure not reproduced.]
3.10 Physical Layer Register Interface
• This section describes a reference register set to support Physical layer functionality and to support Physical layer test and debug. Implementations are not required to have all the registers defined in this section. An implementation may subset or superset this register set. Refer to the implementation design guide for details on a particular implementation.
• The register definitions described here are subject to change in a future revision of the specification.
• The registers are grouped based on functional requirements of the Physical layer. The registers are further classified as follows.
• Mandatory Registers: These registers are required for basic functioning of the Physical layer. All implementations are required to implement these registers.
• Optional Registers: These registers correspond to optional features or optional programmability provided by the Physical layer. An implementation is not required to implement them; however, if implemented, they should comply with the format specified.
• Example Registers: These registers are provided for example only. Such registers suggest certain requirements, which are implementation or platform specific.
• Depending on the profile and visibility policy, certain fields of mandatory and optional registers may not be implemented. Such fields should be marked reserved.
• Physical layer register visibility policy needs to be finalized.
Visibility Legend:
1. Other layer/processor; Intel test/debug through software, JTAG if present, SMBus if present.
2. Firmware/system.
3. OEM test/debug; dependent on system configurations, through software, JTAG if present, SMBus if present.
Note: All registers are visible for Intel test/debug through CSR, unless specified otherwise, and hence are not shown explicitly in the tables below.
Table 3-44. Register Attribute Definitions (Attribute / Abbreviation / Description)
Read/Write (RW): The bit can be read or written by software.
Read Only (RO): The bit is set by hardware only. Software can only read this bit; writes do not have any effect.
Read/Write 1 to Clear (RW1C): The bit can be either read or cleared by software. The software has to write 1 to clear this bit; writing zero to an RW1C bit has no effect.
Read/Write 1 to Set (RW1S): The bit can be either read or set by software. The software has to write 1 to set this bit; writing zero to an RW1S bit has no effect.
Sticky (S): In addition to its other attributes, the bit will be sticky, i.e. unchanged by warm reset, inband reset, or soft reset.
Late action (L): In addition to its other attributes, the bit will take effect at a later time. Unless specified, it will take effect when the link is re-initialized.
Reserved (RV): Reserved for future definitions. Currently don’t-care bits.
Reserved and Preserved (RsvdP): Reserved for future RW implementations. Software needs to preserve the value of this bit by read-modify-write.
Reserved and zero (RsvdZ): Reserved for future RW1C implementations. Software must write zero to this bit.
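The RW1C and RW1S attributes in Table 3-44 can be summarized by the write-side masking below; the register model and helper names are illustrative, not a definition of any CSI register:

```c
#include <stdint.h>

/* RW1C: bits written as 1 are cleared, bits written as 0 are unchanged. */
static uint32_t write_rw1c(uint32_t reg, uint32_t wdata, uint32_t rw1c_mask)
{
    return reg & ~(wdata & rw1c_mask);
}

/* RW1S: bits written as 1 are set, bits written as 0 are unchanged. */
static uint32_t write_rw1s(uint32_t reg, uint32_t wdata, uint32_t rw1s_mask)
{
    return reg | (wdata & rw1s_mask);
}
```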
3.10.1 CSI Physical Layer Mandatory Registers This set of registers is required by the CSI Physical layer. Table 3-45. CSIPHCPR0: Physical Layer Capability Register 0 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:29 3 Reserved RV 0 N/A 28:24 5 Number Of Tx Lanes RO HW Specific Number of Tx lanes with which an implementation can operate for full width. Bit 24 - If set, it can operate with 16 lanes for full width. Bit 25 - If set, 17 lanes. Bit 26 - If set, 18 lanes. Bit 27 - If set, 19 lanes. Bit 28 - If set, 20 lanes. Others Reserved. The bit indicating the maximum lanes will determine the number of control/status bits implemented in the TX/RX Data Lane Control/Status Registers. 23:22 2 Reserved RV 0 N/A 21:20 2 Reserved/RAS capability RO HW Specific N/A. Bit 20: If set, RAS capable with Alternate Clock 1. Bit 21: If set, RAS capable with Alternate Clock 2. Any of these bits set indicates that the corresponding status bits in Table 3-57, “CSIPHPLS: Physical Layer Link Status Register” are implemented. 19:12 8 Reserved RV 0 N/A 11:8 4 Reserved/Physical layer Implementation Profile RO HW Specific 'b1000 / 'b0100 / 'b0010 / 'b0001. Bit 8 - Supports UP profile. Bit 9 - Supports DP profile. Bit 10 - Supports Small MP profile. Bit 11 - Supports Large MP profile. 7:4 4 Reserved RV 0 N/A 3:0 4 CSI Phy Version RO 'b0000 0: Current CSI version 0. Rest are reserved. Table 3-46. CSIPHCPR1: Physical Layer Capability Register 1 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:11 21 Reserved RV 0 N/A 10:8 3 Power management Capability RO HW Specific Bit 8: L0s entry capable. Bit 9: LWM capable. Bit 10: L1 entry capable. 7:3 5 Reserved RV 0 N/A 2:0 3 Link Width Capability RO HW Specific Link widths supported in an implementation. Bit 0: If set, Full width capable. Bit 1: If set, Half width capable. Bit 2: If set, Quarter width capable. 1, 2, 3 Table 3-47. CSIPHCTR: Physical Layer Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:27 5 Reserved RsvdP 0 N/A 26:24 3 Reserved/Alternate Clock Lane Disable RsvdP/RWSL 0 N/A. Applies to the RX side of the link. A bit mask for selectively enabling/disabling clock lanes for validation purposes. A bit value of 1 indicates that the corresponding clock lane is disabled. Rx clock terminations are selected in Detect.1 state, depending on this mask. In Detect.1, enabled clock lanes must meet ZRX_LOW_CM and ZRX_LOW_DIFF, and disabled clock lanes must meet ZRX_HIGH_CM. See Section 3.11 for a description of ZRX_LOW_CM, ZRX_LOW_DIFF and ZRX_HIGH_CM. Bit 24 - Primary clock; bit 25 - Alternate clock 1; bit 26 - Alternate clock 2. Bit values: 1 - Disable clock lane; 0 - Enable clock lane. 23:14 10 Reserved RsvdP 0 N/A 13:12 2 Initialization Retry Threshold RWSL 0 Number of initialization retries in the event of an initialization failure. 00 - No initialization retry after failure. 01 - One initialization retry after failure. 10 - Two initialization retries after failure. 11 - Try indefinitely. To break the indefinite loop one must write 00 to abort the initialization. 1, 2, 3 11:10 2 Reserved RsvdP 0 N/A 9:8 2 Post Initialization State RWSL 0 00 - Proceed to Config after Polling 01 - Enter Loopback mode after Polling All other values reserved.
2, 3 7 1 Force Single stage initialization RWSL 0 1 - Force single stage initialization at full speed. 6 1 ATE mode RWSL 0 1 - Enable altered initialization flow for test/debug environment. Refer to “Automatic Test Equipment (ATE) Initialization Mode” for further details. 108 Ref No xxxxx Intel Restricted Secret Table 3-47. CSIPHCTR: Physical Layer Control Register (Continued) Bit(s) Width Name Attributes Default Value Value/Description Visibility 5:4 2 RxReady Status Latch Point RWSL 0 The bit defines the Latch point for Rx Lane status in Table 3-51, “CSIPHRDS: Rx Data Lane RxReady Status Register”. 11 - Latch the status in Polling.2 State. 10 - Latch the status in Polling.3 State. 01 - Latch the status in Config.1 State. 00 - Latch the status in Config.2 State. 2, 3 3 1 Detect Status Latch Point RWSL 0 The bit defines the Latch point for Detect status in Table 3-49, “CSIPHTDS: Tx Data Lane Termination Detection Status Register”. 1 - Latch the status in Detect.2 State. 0 - Latch the status in Detect.1 State. 2 1 Bypass Calibration RWSL 0 1 - Bypass I/O Calibration. 1, 2, 3 1 1 Reset Safe RWS 0 0 - Override sticky bits during reset and restore the values to cold reset/power on reset values. 1 - Do not override sticky bits during reset 1, 2, 3 0 1 Physical layer Reset RW 0 1 - Reset. Writing 1 will initiate soft reset, which will cause re-initialization of Physical layer. This field will be set to 0 by logical sub- block state machine when initialization starts. 1, 2, 3 Table 3-48. CSIPHTDC: Tx Data Lane Control Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RsvdP 12b’0 N/A 19:0 20 Tx Data Lane Disable RWSL 20b’0 A bit mask used for selectively enabling/disabling data Tx. Used for debug and validation purposes. A bit value of 1 indicates the corresponding lane is disabled. Bit 0: Controls Lane 0. Bit 1: Controls Lane 1. .. and so on. Unless specified, Tx on all disabled lanes must meet ZTX_HIGH_CM. An exception is when hardware chooses to use a data lane as backup clock lane, in which case this lane is indicated as disabled by hardware but terminations on this lane meet ZTX_LOW_DIFF and ZTX_LOW_CM. Ref No xxxxx 109 Intel Restricted Secret Table 3-49. CSIPHTDS: Tx Data Lane Termination Detection Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RV 0 N/A 19:0 20 Tx Data Lane Status RO 20b’0 The Physical layer state machine updates the termination detection status of each Tx data lane. The status will be latched when exiting state specified by Detect status latch point in Table 3-47, “CSIPHCTR: Physical Layer Control Register”. The Status is updated every time initialization is performed. A bit value of 1 indicates the corresponding lane has detected Rx termination. Bit 0: Status of lane 0. Bit 1: Status of lane 1. .. and so on. Table 3-50. CSIPHRDC: Rx Data Lane Control Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RsvdP 0 N/A 19:0 20 Rx Data Lane Disable RWSL 0 A bit mask used for selectively enabling/disabling data Rx. Used for debug and validation purposes. A bit value of 1 indicates the corresponding lane is disabled. Bit 0: Controls Lane 0. Bit 1: Controls Lane 1. .. and so on. Unless specified, Rx on all disabled lanes must meet ZRX_HIGH_CM. 
An exception is when hardware chooses to use a data lane as backup clock lane, in which case this lane is indicated as disabled by hardware but terminations on this lane meet ZRX_LOW_DIFF and ZRX_LOW_CM. 110 Ref No xxxxx Intel Restricted Secret Table 3-51. CSIPHRDS: Rx Data Lane RxReady Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RV 0 N/A 19:0 20 Rx Data Lane Status RO 0 Latched RxReady status of each lane when exiting state specified by RxReady Status Latch point in Table 3-47, “CSIPHCTR: Physical Layer Control Register”. The Status is updated every time initialization is performed. A bit value of 1 indicates the corresponding lane’s RxReady is received. Bit 0: Status of Lane 0 Bit 1: Status of Lane 1. .. and so on. Table 3-52. CSIPHPIS: Physical Layer Initialization Status Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:30 2 Reserved RsvdZ 0 N/A 29:28 2 Initialization Iteration RO 0 Indicates the current iteration of initialization sequence. Iteration 0 corresponds to the first initialization attempt. Maximum iterations is equal to Initialization Retry Threshold field of Table 3-47, “CSIPHCTR: Physical Layer Control Register” This field is incremented at the point of initialization failure, and a comparison of this field against Initialization Retry Threshold field of Table 3-47, “CSIPHCTR: Physical Layer Control Register” is done in Disable/Start State. This field will be reset to 0 after initialization is complete and local port enters L0, Loopback or Compliance. 1, 2, 3 27:26 2 Reserved RsvdZ 0 N/A 25:24 2 ACK Status RO 0 00 - local ACK NOT sent, remote ACK NOT received 01 - local ACK sent, remote ACK NOT received 10 - local ACK NOT sent, remote ACK received 11 - local ACK sent and remote ACK received 23:21 3 Reserved RsvdD 0 N/A Ref No xxxxx 111 Intel Restricted Secret Physical Layer Physical Layer Bit(s) Width Name Attributes Default Value Value/Description Visibility 20:16 5 Rx State Tracker RO 0 Indicates the current state of local Rx. See Section 3.9.3 for details on these states. State tracker encoding is given in Table 3-54, “State Tracker Encoding”. 15:13 3 Reserved RsvdZ 0 N/A 12:8 5 Tx State Tracker RO 0 Indicates the current state of local Tx. See Section 3.9.3 for details on these states. State tracker encoding is given in Table 3-54, “State Tracker Encoding”. 7 1 Reserved RsvdZ 0 N/A 6:5 2 Initialization Failure Type RO 0 Applicable ONLY if Initialization Status field indicates a failure. Applies to Rx side. 00 - Link width negotiation failed 01 - Both ports are configured as Loopback masters. 10 - Timed out and all lanes/Rx bad. In this case, this port sends an Inband Reset to the remote port 11 - Received Inband Reset 4:2 3 Initialization Status RO 0 000 - initialization failure 001 - initialization in progress 011 - initialization in progress, but a previous initialization attempt failed. Applicable only if Initialization Retry Threshold field of Table 3-47, “CSIPHCTR: Physical Layer Control Register”, is non-zero. 110 - initialization complete, Linkup Identifier mismatch 111 - initialization complete Rest are reserved. 1, 2, 3 112 Ref No xxxxx Intel Restricted Secret Table 3-52. CSIPHPIS: Physical Layer Initialization Status Register (Continued) Bit(s) Width Name Attributes Default Value Value/Description Visibility 1 1 Calibration Done RW1C 0 Reset to 0 at Cold Reset Set to 1 once calibration is complete. 
Since calibration is necessary for proper initialization, if this bit is 0, calibration will be performed irrespective of Bypass calibration being set or reset. 0 1 Link-up Identifier RW1C 0 Set to 0 during Cold Reset. Set to 1 when initialization completes and the link enters L0. The port clearing this flag due to a mismatch in the exchanged Link-up Identifier, or writing 1 to this bit, informs its Link layer, which is an indication that any outstanding Link layer transactions awaiting response from the other port will not receive any. 1, 2, 3 Table 3-53. CSIPHPPS: Physical Layer Previous Initialization Status Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:30 2 Reserved RV 0 N/A 29:28 2 Previous Initialization Iteration RO 0 Indicates the previous iteration of the initialization sequence. Iteration 0 corresponds to the first initialization attempt. Maximum iterations is equal to the Initialization Retry Threshold field of Table 3-47, “CSIPHCTR: Physical Layer Control Register”. This field is copied from Table 3-52, “CSIPHPIS: Physical Layer Initialization Status Register” at the point of initialization failure. 2, 3 27:26 2 Reserved RV 0 N/A 25:24 2 Previous ACK Status RO 0 ACK status of the most recent state from the previous initialization attempt. This field is set to 00 if this is the first initialization attempt. Once initialization is complete and a port enters L0, Loopback or Compliance, this field will have the same value as the ACK Status field specified above. 00 - local ACK NOT sent, remote ACK NOT received 01 - local ACK sent, remote ACK NOT received 10 - local ACK NOT sent, remote ACK received 11 - local ACK sent and remote ACK received 23:21 3 Reserved RV 0 N/A 20:16 5 Previous Rx State Tracker RO 0 Most recent Rx state from the previous initialization attempt. This field is set to “Disable/Start” if this is the first initialization attempt. Once initialization is complete and a port enters L0, Loopback or Compliance, this field will have the same value as the Rx State Tracker field specified above. State tracker encoding is given in Table 3-54, “State Tracker Encoding”. 15:13 3 Reserved RV 0 N/A 12:8 5 Previous Tx State Tracker RO 0 Most recent Tx state from the previous initialization attempt. This field is set to “Disable/Start” if this is the first initialization attempt. Once initialization is complete and a port enters L0, Loopback or Compliance, this field will have the same value as the Tx State Tracker field specified above. State tracker encoding is given in Table 3-54, “State Tracker Encoding”. 7 1 Reserved RV 0 N/A 6:5 2 Previous Initialization Failure Type RO 0 00 - Link width negotiation failed 01 - Both ports are configured as Loopback masters. 10 - Timed out and all lanes/Rx bad.
In this case, this port sends an Inband Reset to the remote port 11 - Received Inband Reset 4:2 3 Reserved RO 0 N/A 1 1 Previous Calibration Done RO 0 The Calibration done field is copied from Table 3-52, “CSIPHPIS: Physical Layer Initialization Status Register” at the time of initialization failure, 0 1 Previous Linkup Identifier RO 0 The Linkup Identifier field is copied from Table 3-52, “CSIPHPIS: Physical Layer Initialization Status Register” at the time of initialization failure, Table 3-54. State Tracker Encoding Bits State Name 0 0000 Disable/Start 0 0001 Calibrate 0 0010 Detect.1 0 0011 Detect.2 0 0100 Detect.3 0 0101 Polling.1 0 0110 Polling.2 0 0111 Polling.3 0 1000 Config.1 0 1001 Config.2 0 1100 L0s 0 1101 LWM (In the process of modulating link width for power-savings) 0 1110 L0R (Periodic Retraining in process) 0 1111 L0 1 0000 Loopback Master 1 0001 Loopback Slave 1 1111 Compliance Others Reserved. Ref No xxxxx 115 Intel Restricted Secret Table 3-55. CSIPHWCI: Width Capability Indicator (WCI) Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:17 15 Reserved RsvdP 0 N/A 16 1 Use Programmed LM RWSL 0 0 - Automatically compute WCI from default value during Physical layer initialization 1- Use pre-programmed Local WCI (see below field) 15:11 5 Reserved RsvdP 0 N/A 10:0 11 Local WCI RWSL H/W Generate d List of LMs supported by local PHY layer. 2 Table 3-56. CSIPHLMS: Lane Map Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31:28 4 Reserved RV 0 N/A 27:24 4 Outbound LM RO b’x LM used by Tx portion of the link. 2 23:20 4 Reserved RV 0 N/A 19:16 4 Inbound LM RO b’x LM used by Rx portion of the link. 2 15:11 5 Reserved RV 0 N/A 10:0 11 Remote WCI RO b’x A list of LMs supported by remote PHY layer. 2 Table 3-57. CSIPHPLS: Physical Layer Link Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31 1 Reserved RV 0 N/A 30:28 3 Received Clock Lane in Use RO 0 Applies to RX. 001 - Primary 010 - Alternate clock 1 100 - Alternate clock 2 All other values reserved 27 1 Reserved RV 0 N/A 26:24 3 Forwarded Clock Lane in Use RO 0 Applies to TX. 001 - Primary 010 - Alternate clock 1 100 - Alternate clock 2 All other values reserved 116 Ref No xxxxx Intel Restricted Secret Table 3-57. CSIPHPLS: Physical Layer Link Status Register (Continued) Bits Width Name Attributes Default Value Value/Description Visibility 23:16 8 Local Tx Link State RO b’x bits 17:16 - Quadrant 0 bits 19:18 - Quadrant 1 bits 21:20 - Quadrant 2 bits 23:22 - Quadrant 3 Values of each Quadrant 00 - Disabled 01 - Being Initialized 10 - L0s 11 - L0 2, 3 15:8 8 Local Rx Link State RO b’x bits 9:8 - Quadrant 0 bits 11:10 - Quadrant 1 bits 13:12 - Quadrant 2 bits 15:14 - Quadrant 3 Values for each Quadrant 00 - Disabled 01 - Being Initialized 10 - L0s 11 - L0 2, 3 7:2 6 Reserved RV 0 N/A 1 1 Received Clock Status RO 0 0 - no received clock (on RX) 1 - received clock stable (on RX) NOTE: Received clock is monitored on a continuous basis and this bit updated every UI 2, 3 0 1 Local Link State RO 0 0 - Link in L1 1 - At least a portion of the link is enabled 2, 3 Table 3-58. CSIPHITV0: Initialization Time-Out Value Register 0 Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 TINBAND_RES ET_INIT RWS 0x7F (8192 UI) Time a port waits in Disable/Start state after losing received clock, before entering Detect state. See Section 3.7.5 for details. Time- out value is (count + 1) * 64UI. 
15:2 14 Reserved RsvdP 0 N/A 1:0 2 TDEBOUNCE RWS b’01 Debounce time used by Tx detection circuitry in Detect.1 and Detect.2 states. Time-out value is (count + 1) * 64UI. Ref No xxxxx 117 Intel Restricted Secret Table 3-59. CSIPHITV1: Initialization Time-Out Value Register 1 Bits Width Name Attributes Default Value Value/Description Visibility 31:10 22 Reserved RsvdP 0 N/A 9:4 6 TDETECT.2 RWS 0x2F (32K UI) Timeout for Detect.2. If received clock not stable by the end of this time period, an Inband Reset is initiated by the port that fails to see received clock. Each count in this field corresponds to 1024 UI. Time out value is (count + 1) * 1024 UI 3:0 4 Reserved RV 0 N/A Table 3-60. CSIPHITV2: Initialization Time-out Value Register 2 Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 TPOLLING.1 RWS 0x7F (8192 UI) Timeout for Polling.1 state. This is the amount of time each Tx sends TS0. Upon entering Polling.1, each Rx stays in Polling.1 for this duration before advancing to Polling.2. Time-out value is (count + 1) * 64UI. 15:8 8 Reserved RsvdP 0 N/A 7:0 8 TDETECT.3 RWS 0x7F (8192 UI) Timeout for Detect.3. If the DC pattern is not observed for this time period, current initialization cycle is abandoned. Time-out value is (count + 1) * 64UI. Table 3-61. CSIPHITV3: Initialization Time-Out Value Register 3 Bits Width Name Attribute s Default Value Value/Description Visibilit y 31:24 8 Reserved RsvdP 0 N/A 23:16 8 TPOLLING.3 RWS 0x7F (8192 UI) Timeout for Polling.3 state. State timedout if handshake fails. Time-out value is (count + 1) * 64UI. 15:8 8 Reserved RsvdP 0 N/A 7:0 8 TPOLLING.2 RWS 0x7F (8192 UI) Timeout for Polling.2 state. State timedout if handshake fails. Time-out value is (count + 1) * 64UI. 118 Ref No xxxxx Intel Restricted Secret Table 3-62. CSIPHITV4: Initialization Time-Out Value Register 4 Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 TCONFIG.2 RWS 0x7F (8192 UI) Timeout for Config.2 state. State timedout if handshake fails. Time-out value is (count + 1) * 64UI. 15:8 8 Reserved RsvdP 0 N/A 7:0 8 TCONFIG.1 RWS 0x7F (8192 UI) Timeout for Config.1 state. State timedout if handshake fails. Time-out value is (count + 1) * 64UI. Table 3-63. CSIPHLDC: Link Determinism Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Target Link Latency RWSL 0 TX will introduce these bits in TS3 training sequence as the Target Link Latency field. 2, 3 15:12 4 Reserved RsvdP 0 N/A 11:8 4 Initial Drift Buffer Depth RWSL H/W Specific Drift buffer is the mechanism to absorb clock and channel variations between connected CSI ports during normal operation. Drift Buffer Depth refers to the difference between read and writer pointers in drift buffer. The field indicates the difference of read and write pointers in drift buffer to be set during initialization. x’2: Drift buffer depth is adjusted to 2 during training. x’3: Drift buffer depth is adjusted to 3 during training. x’4 - x’F: Corresponds to different depths. The exact programmable values in this register is implementation specific. 7:4 4 Reserved RsvdP 0 N/A 3:0 4 Drift Buffer Alarm Threshold RWS 2 When the difference between read and write pointers (depth) in drift buffer is less than the value in this field, drift buffer alarm status will be set. Ref No xxxxx 119 Intel Restricted Secret Table 3-64. 
CSIPHLDS: Link Determinism Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Current Latency Buffer Depth RO 0 Current level of latency buffer utilization. The field is used for latency estimations and fixing. The latency buffer depth is adjusted to the difference of the target link latency and the actual link latency: depth adjusted = |Received Target Link Latency - (Local Sync Count - Received Sync Count)| mod (Latency Buffer Size). 23:16 8 Local Synchronization Count RO 0 The last Synchronization Count value latched locally by the receiver while receiving training sequence TS3. The difference of Local Sync Count and Received Sync Count is the actual latency of the link. 15:8 8 Received Synchronization Count RO 0 The last Received Sync Count value received in training sequence TS3. The value indicates the latched Sync Count of the Transmitter. 7:0 8 Received Target Link Latency RO 0 The last Received Target Link Latency value in training sequence TS3. The value indicates the link latency offset requested by the Transmitter. Table 3-65. CSIPHPRT: Periodic Retraining Timer Register Bits Width Name Attributes Default Value Value/Description Visibility 31:30 2 Reserved R 0 N/A 29:24 6 Retraining Packet Count RWSL 0 Number of retraining patterns sent for each retraining sequence. The retraining pattern is repeated (the value in this field + 1) times for each retraining. The retraining pattern is 16 bits of 0xaaaa with the LSB sent first. 23:20 4 Reserved R 0 N/A 19:0 20 Retraining Interval RWSL 0 Periodic Retraining Interval. A value of 0 indicates periodic retraining is disabled. Value to be programmed by firmware. Each count represents 1024 UI. 2, 3 3.10.2 Optional Registers This set of registers corresponds to optional features or programmable provisions of the Physical layer. The presence of these registers is indicated by the capability registers. An implementation can choose not to implement these registers, but if implemented it should follow the format specified. Table 3-66. CSIPHDDS: Link Determinism Drift Buffer Status Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RsvdZ 0 N/A 19:16 4 Drift Buffer Alarm Lane RO 0 The lane ID of the first lane which has reached the drift buffer alarm threshold. The field is valid when Drift Buffer Alarm is set. 15:3 13 Reserved RsvdZ 0 N/A 2 1 Drift Buffer Alarm RW1C 0 1 - Indicates the drift buffer depth (difference between read and write pointers) is less than the drift buffer alarm threshold depth. An implementation may initiate re-initialization to re-center the drift buffers. 1 1 Drift Buffer Overflow RW1C 0 1 - Indicates the drift buffer has overflown during normal operation. Such events occur under extreme variations in connected port clocks or in the channel. The event will result in data loss. An implementation may connect this bit to an appropriate interrupt. 0 1 Latency Buffer Rollover RO 0 1 - Indicates the latency buffer has rolled over during the last Physical layer initialization. The buffer rollover occurs if the requested received target link latency needs a depth beyond the latency buffer size. Table 3-67. CSIPHPMR0: Power Management Register 0 Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 TL0S_SLEEP_MIN RWS 0 Minimum time local Tx on a port initiating L0s entry should stay in L0s.
This corresponds to the time required by remote Rx to respond to L0s entry signal by local port. This field is at 1 UI granularity and the value of this field is (count + 1)*1 UI 2, 3 Ref No xxxxx 121 Intel Restricted Secret Physical Layer Physical Layer Bits Width Name Attributes Default Value Value/Description Visibility 15:4 12 TL0S_WAKE RWS 0 L0s Wake-up time currently in effect. Set by firmware on both link ports prior to entering L0s. This field is at 16 UI granularity and the value of this field is (count + 1)*16 UI 2, 3 3:0 4 Reserved RsvdP 0 N/A Table 3-68. CSIPHPMR1: Power Management Register 1 Bits Width Name Attributes Default Value Value/Description Visibility 31:26 6 Reserved RsvdP 0 N/A 25:18 8 TLWM_ENTER_NOP RWS 0 Used for link width modulation in low power mode, where link width can be adjusted on the fly without re-initializing the Physical layer. This is the minimum amount of time local Tx on a port initiating link width reduction are required to drive Null Ctrl flits. The number of Null Ctrl flits transmitted is (TLWM_ENTER_NOP/New Link Width), rounded to the next highest integer. This is the time required for remote Rx to respond to link width modulation request, and adjust to new link width. This field is at 4 UI granularity and the value of this field is (count + 1)*4 UI 2, 3 17:11 7 Reserved RsvdP 0 N/A 122 Ref No xxxxx Intel Restricted Secret Table 3-68. CSIPHPMR1: Power Management Register 1 (Continued) Bits Width Name Attributes Default Value Value/Description Visibility 10:8 3 TLWM_MUX_SWITCH RWS 0 Time required by a port to adjust its muxes to support a new link width, when a link width modulation request is received from the Link layer. A common value is used for both local and remote ports, and the value programmed by firmware will be the larger of these two values. This field should indicate the maximum time required to adjust the mux across PVT variations. This field is at 1 UI granularity and the value of this field is (count + 1)*1 UI 2, 3 7:3 5 Reserved RsvdP 0 N/A 2:1 2 TL0S_ENTER_Tx_DRV RWS 0 Time a port initiating L0s entry drives each Tx differential pair to 1/0 on D+/D- after sending the last flit prior to entering L0s This field is at 2 UI granularity and the value of this field is (count + 1)*2 UI 2, 3 0 1 Reserved RsvdP 0 N/A Table 3-69. CSIPHPMR2: Power management Register 2 Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 TL0S_WAKE_MAX RWS 0 Lower of the maximum L0s wake-up time supported by either Port. Firmware configured. This field is at 16 UI granularity and the value of this field is (count + 1)*16 UI 2, 3 19:16 4 Reserved RsvdP 0 N/A 15:4 12 TL0S_WAKE_MIN RWS 0 Lower of the minimum L0s wake-up time supported by either Port. Firmware configured. This field is at 16 UI granularity and the value of this field is (count + 1)*16 UI 2, 3 3:0 4 Reserved RsvdP 0 N/A Ref No xxxxx 123 Intel Restricted Secret Table 3-70. CSIPHPMR3: Power Management Register 3 Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 23:16 8 TL0S_EXIT_DEBOUNC E_MIN RWS 0 Minimum time local Tx on a port initiating L0s exit is required to drive Null Ctrl flits. This parameter corresponds to the minimum time required by activity detectors on remote port to detect link activity, across PVT variations. 
This field is at 1 UI granularity and the value of this field is (count + 1)*1 UI. 2, 3 15:8 8 Reserved RsvdP 0 N/A 7:0 8 TL0S_EXIT_DEBOUNCE_MAX RWS 0 Minimum time local Tx on a port initiating L0s exit should indicate Link Activity for the activity detectors on the remote Rx to respond. This parameter corresponds to the maximum time required by activity detectors on the remote port to detect link activity, across PVT variations. This field is at 1 UI granularity and the value of this field is (count + 1)*1 UI. 2, 3 Table 3-71. CSIPHPMR4: Power Management Register 4 Bits Width Name Attributes Default Value Value/Description Visibility 31:8 24 Reserved RsvdP 0 N/A 7:0 8 TL1_EXIT_DEBOUNCE RWS 0 Time for which clock Rx terminations must meet ZRX_LOW_CM when a port receives the L1 exit signal. The remote port detects these terminations and uses this event as an indication to exit L1 and go to Disable/Start state. The local port enters Disable/Start state after this time period expires, at which point clock Rx terminations must meet ZRX_HIGH_CM. See Section 3.9.5.7 for details. This field is at 1 UI granularity and the value of this field is (count + 1)*1 UI. 3.10.3 Electrical Parameter Registers (Examples Only) This set of registers is implementation or platform specific. The registers are illustrated in this document as examples only. The standardization of registers may be taken up in a second phase. Table 3-72. CSITCR: Termination Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:16 16 Tx Termination RW Self-calibrated Value calibrated by H/W. Self-calibrated value can be over-written. 3 15:0 16 Rx Termination RW Self-calibrated Value calibrated by H/W. Self-calibrated value can be over-written. 3 Table 3-73. CSIETE: Equalization Tap Enable Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:8 24 Reserved RV 0 N/A 7:0 8 Equalization Tap Mask RWS 0 A bit mask used to select one of the 8 equalization coefficients above. A bit value of 1 indicates that the corresponding coefficient is selected. Bit 0 corresponds to equalization Coefficient 0. Bit 1 corresponds to equalization Coefficient 1. And so on. The number of coefficients is implementation specific, and they need to be consecutive starting from bit 0. 2, 3 Table 3-74. CSIECR0: Equalization Coefficient Register 0 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Equalization Coefficient 3 RWS H/W Specific 2, 3 23:16 8 Equalization Coefficient 2 RWS H/W Specific 2, 3 15:8 8 Equalization Coefficient 1 RWS H/W Specific 2, 3 7:0 8 Equalization Coefficient 0 RWS H/W Specific The exact bit width of the coefficient value is implementation dependent. The most significant bit will be the sign bit. 2, 3 Table 3-75. CSIECR1: Equalization Coefficient Register 1 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Equalization Coefficient 7 RWS H/W Specific 2, 3 23:16 8 Equalization Coefficient 6 RWS H/W Specific 2, 3 15:8 8 Equalization Coefficient 5 RWS H/W Specific 2, 3 7:0 8 Equalization Coefficient 4 RWS H/W Specific The exact bit width of the coefficient value is implementation dependent. The most significant bit will be the sign bit. 2, 3 Table 3-76.
CSITEPC: TX Electrical Parameter Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RV 0 N/A 23:16 8 Tx CM Bias Control RWS 0 A DFx hook for biasing common mode output of all Tx differential pairs 15:8 8 Reserved RV 0 N/A 7:0 8 Tx Current Drive Strength RW Self- calibrated Value calibrated by H/W. Self-calibrated value can be over-written. 2, 3 Table 3-77. CSIRLR[0-19]: RX Lane Register na Bit(s) Width Name Attribute s Value/Description Visibility 31:24 8 Reserved R N/A 23:16 8 Voltage Offset Cancellation (VOC) Position Self Calibrated Auto position set by hardware during link initialization. Applies to Rx portion of a port 15:8 8 Reserved RV N/A 7:0 8 Strobe Position Self Calibrated Auto position set by hardware during link initialization. Applies to RX portion of a port a. NOTE: One register per Rx differential pair. 3.10.4 Testability Tool-box Registers (Examples Only) These set of registers are implementation or platform specific. The standardization of register may be taken up in second phase. Table 3-78. CSILCR: Loopback Control Register Bits Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Loopback Counter RWSL 0 Loopback Countera 15:8 8 Reserved RsvdP 0 N/A 7:3 5 Lane Of Interest RWSL 0 Lane of Interest vectora 2 1 Continuous Override RWSL 0 Continuous Overridea 1 1 Stop on Error RWSL 0 Stop on error - Flag 0 - Do not stop the test on error 1 - Stop the test on first error 0 1 Start Loop-Back Test RWSL 0 Start Loop Back test - Flag 0 - start the test 1 - stop the test a. See DFx Chapter for a detailed description of this register field. Ref No xxxxx 127 Intel Restricted Secret Table 3-79. CSILLMC: Loop-Back Lane Mask Control Register Bits Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RsvdP 0 N/A 19:0 20 Lane Mask RWSL 0 Lane Maska Table 3-80. CSILMRC: Loop-Back Master Receiver Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Master Receiver Strobe Override RWSL 0 Master Port - Receiver Strobe Override.a 15:8 8 Reserved RsvdP 0 N/A 7:0 8 Master Receiver CM override RWSL 0 Master Port - Receiver Input Common Mode Overridea a. See DFx Chapter for a detailed description of this register field. Table 3-81. CSILMTC: Loop-Back Master Transmitter Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Master Transmitter Jitter injection RWSL 0 Master Port - Transmitter Jitter Injection 15:8 8 Master Transmitter Equalization Override. RWSL 0 Master Port - Transmitter Equalizer Settings Overridea 7:0 8 Master Transmitter Drive override RWSL 0 Master Port - Transmitter Drive current Overridea a. This field is reserved in the current specification. Placeholder for future feature extensions. Table 3-82. CSILSRC: Loop-Back Slave Receiver Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Slave Receiver Strobe Override RWSL 0 Slave Port - Receiver Strobe Override.a 15:8 8 Reserved RsvdP 0 N/A 7:0 8 Slave Receiver CM override RWSL 0 Slave Port - Receiver Input Common Mode Overridea a. See DFx Chapter for a detailed description of this register field. 128 Ref No xxxxx Intel Restricted Secret Table 3-83. 
Table 3-83. CSILSTC: Loop-Back Slave Transmitter Control Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:24 8 Reserved RsvdP 0 N/A 23:16 8 Slave Transmitter Jitter Injection RWSL 0 Slave Port - Transmitter Jitter Injection 15:8 8 Slave Transmitter Equalization Override RWSL 0 Slave Port - Transmitter Equalizer Settings Overridea 7:0 8 Slave Transmitter Drive Override RWSL 0 Slave Port - Transmitter Drive Current Overridea a. This field is reserved in the current specification. Placeholder for future feature extensions. Table 3-84. CSILPR0: Loop-Back Pattern Register 0 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:0 32 Pattern RWSL 0 Pattern bits [31:0] - Least significant bit is sent out first in the line. Table 3-85. CSILPR1: Loop-Back Pattern Register 1 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:8 24 Reserved RsvdP 0 N/A 7:0 8 Pattern RWSL 0 Pattern bits [39:32] - Rest of the total 40 bit pattern sent out in loop-back lanes. Table 3-86. CSILPI: Loop-Back Pattern Invert Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RsvdP 0 N/A 19:0 20 Pattern Invert RWSL 0 One bit per lane. Bit 0 controls Lane 0, Bit 1 controls Lane 1, and so on. 1 - Invert the pattern in a lane. Table 3-87. CSILSR: Loop-Back Status Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:13 19 Reserved RV 0 N/A 12:8 5 Failure Index RO 0 Failure Index 7:1 7 Reserved RV 0 N/A 0 1 Failure Flag RO 0 Failure Flag 0 - No Failure 1 - Failure on any Lane Table 3-88. CSILSP0: Loop-Back Status Pattern Register 0 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:0 32 Pattern Vector RO 0 Received Pattern vector bits [31:0] Table 3-89. CSILSP1: Loop-Back Status Pattern Register 1 Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:8 24 Reserved RV 0 N/A 7:0 8 Pattern Vector RO 0 Received Pattern vector bits [39:32] Table 3-90. CSILSLF: Loop-Back Status Lane Failure Register Bit(s) Width Name Attributes Default Value Value/Description Visibility 31:20 12 Reserved RV 0 N/A 19:0 20 Lane Failure Status RO 0 Lane Failure Status, one bit per lane. Bit 0 - Status of Lane 0, Bit 1 - Status of Lane 1, and so on. 1 - Lane has received an error pattern. 3.11 Electrical Sub-Block Specifications and Budgets This section is currently TBD, pending alignment among the various CSI teams on the approach proposed by the PHY team. 3.12 Definition of Terms Table 3-91. Physical Layer Glossary Term Definition Active Lane A Lane that is an active part of a link. Tx and Rx on this lane are used for transferring phits between ports and are required to meet termination strength of ZTX_LOW_CM and ZRX_LOW_CM, respectively. The differential pair representing a Tx/Rx shall have a minimum differential swing that meets the CSI electrical specification. Activity The term Activity is used to indicate that there is a differential signal on a differential pair and that the signal levels meet those specified in the electrical interface specification portion of this document. The complement of Activity is Electrical Idle - see the definition of Electrical Idle below. Break From Electrical Idle See the definition of "Electrical Idle, Break From".
Data Packet A Protocol Layer message that contains either 8 or 16 data flits. Determinism Determinism is defined to mean that the same pattern can be run repeatedly, with resets between each run, and will yield clock-for-clock repeatability. The input stimulus must be completely defined by clock cycle, and not by any other event, so that the pattern can be applied and the exact response achieved without ever monitoring the response. Determinism is essential for stimulus/response testing, record & replay debugging, and lockstep operations. With determinism, tester output results can be consistently checked by a traditional stored-response automatic tester. This does not imply repeatability. For example, the CSI training algorithm advances from state to state based on internal events such as detect and DLL lock, but a stored-response tester can make this sequence deterministic by simply waiting sufficient time in each state to guarantee that a functional device will have already acknowledged the state. The tester then checks for the state change and issues the acknowledge to advance the state. The CSI lane under test could have acknowledged at any time during the interval, but the check is made by the tester only at the end of the interval. Differential Pair or Diff Pair Two conductors used to transfer control, data and/or clocks from a Tx to an Rx in one direction. Each Differential Pair is uni-directional and one bit wide. Example: a CSI link which is 20 bits wide would require 20 Differential Pairs and one clock Differential Pair in one direction, as well as 20 Differential Pairs and one clock Differential Pair in the opposite direction. Disabled Lane A Disabled Lane is not an active part of a link, and the Tx/Rx connected to this lane does not take part in transmitting phits between ports. Tx and Rx connected to this lane are required to meet termination strength of ZTX_HIGH_CM_DC and ZRX_HIGH_CM_DC, respectively. Electrical Idle The condition when both conductors of a differential pair are at a 0 volt (grounded) level. Electrical Idle, Break From The opposite of Electrical Idle. A lane is said to "break from Electrical Idle" if one of the two differential pairs drives a non-zero volt signal, resulting in a differential swing between a differential pair. Note that all differential pairs do have a differential swing during normal link operation. The phrase "break from Electrical Idle" refers only to the case when a lane is made to exit the Electrical Idle condition. FLIT or flit Acronym for FLow control unIt. A Flit is the unit of exchange between the Link Layer and the Physical Layer. A Flit is 80 bits wide and is sent over the link in multiple Phits (see the definition of PHIT). Inactive The condition when both conductors of a differential pair are at a 0 volt (grounded) level. The terms Electrical Idle and Squelch may also be used in this specification. In this context those terms have the same meaning as Inactive. Inactive Lane Inactive Lane represents a condition where the Tx/Rx differential pair representing this lane have no differential voltage (more precisely, the differential swing is below the required threshold specified in the electrical specification). Tx and Rx of an Inactive Lane are required to meet termination strength of ZTX_LOW_CM_DC and ZRX_LOW_CM_DC, respectively.
An Inactive Lane is also characterized by the fact that it is temporarily not an active portion of the link, and thus will not be used to transfer phits between connected ports. Lane A uni-directional, single-bit-wide (serial) conduit of control/data or clock information. A lane carries one logic bit of information, and thus consists of a differential pair. Note that a Lane in the CSI context is different from the definition of Lane used in the PCI Express spec 1.0a, where a Lane is defined as bi-directional. Also see the definitions of Active, Inactive and Disabled Lanes. Lane Reversal Lane Reversal is a feature used for reducing board layout congestion and/or complexity. It is a feature that provides the needed board routing optimization by allowing connection between pins that have different pin numbers. Latency The delay from the transmitter of the driving CSI port, across the interconnect which includes (but is not limited to) the package, motherboard traces and connectors, and then through the receiver and drift buffers of the receiving port. The actual latency of a link can change from one execution to the next, based on temperature, voltage and testability boundaries. The overall latency can be fixed to a value by the Physical Layer latency fixing mechanisms. Link A set of Lanes configured such that they are operating in a parallel fashion. A link is uni-directional and represents all the lanes used to connect transmitters on one CSI port to receivers on a different CSI port. Thus, a connection between two ports is made through a pair of uni-directional links. Link Transfer Ratio The number of phits used to completely transmit or receive a flit across the link. This is a function of link width. The link transfer ratio is 4/8/16 for a full-/half-/quarter width link. Local Local is a prefix to resources that a Port controls directly. Example: a Port on Chip A would form a link when connected to a Port on Chip B. All resources such as termination resistors, current sources, DLLs and interpolators that are present in Chip A would be Local to Port A; all resources such as termination resistors, current sources, DLLs and interpolators that are present in Chip B would be Local to Port B. Lockstep Operation Two nodes are running in lockstep when they generate the same responses, cycle for cycle, given the same stimuli. Repeatability is an essential part of lockstep operation. Lockstep operation is essential for highly redundant systems such as hot-standby processor systems. PHIT or phit Acronym for PHysical transfer unIT. A Phit is the number of bits transmitted by the Physical Layer in one Unit Interval (UI, see the definition of Unit Interval). Thus, for CSI, a Phit is equivalent in width to the number of lanes within the link (Link Width). A Flit is transmitted by the Physical Layer using multiple Phits. For instance, a full width link (20 lanes) transmits a Flit in 4 phits, and a half width link requires 8 phits to transmit a Flit. Polarity Inversion Polarity Inversion is a feature where D+/D- of a differential pair are swapped on the Physical Interface (package/motherboard/connector etc.) to reduce platform design complexity. Port A Port is an end-point of a link. Two ports communicate with each other using a pair of unidirectional links. A link is termed outbound from the perspective of the transmit side of a port, and inbound from the perspective of the receive side of a port. Thus, each uni-directional link connects the transmit side of one port to the receive side of another port. Received Clock A received differential signal which transitions once every Unit Interval. Example: at 6.4 GT/s, the Received Clock will be running at 3.2 GHz.
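The Link Transfer Ratio and PHIT entries above can be checked with a small worked example; this C sketch assumes nothing beyond the 80-bit flit and the 20/10/5-lane widths given in this glossary.

    /* Sketch only: phits needed to carry one 80-bit flit at a given
     * link width, matching the 4/8/16 link transfer ratios above. */
    #include <stdio.h>

    static unsigned phits_per_flit(unsigned link_width_bits)
    {
        const unsigned flit_bits = 80; /* a flit is 80 bits wide */
        return flit_bits / link_width_bits;
    }

    int main(void)
    {
        printf("full width (20 lanes):   %u phits\n", phits_per_flit(20)); /* 4 */
        printf("half width (10 lanes):   %u phits\n", phits_per_flit(10)); /* 8 */
        printf("quarter width (5 lanes): %u phits\n", phits_per_flit(5));  /* 16 */
        return 0;
    }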
Receiver or Rx The circuits which create the electrical signals used to receive data, control or clock. The CSI specification does not specify the implementation of those circuits. In most cases Rx is used to describe that function of receiving differential signals. An Rx connects to the pins of a packaged part and the pads of a component. The term Receiver may also be used in this specification; in this context that term has the same meaning as Rx. Remote Remote is a prefix to resources that a Port does not control directly but that are visible to that Port via information (often handshakes) from the Port at the other end of the link (or the link being initialized/formed). Example: a Port on Chip A would form a link when connected to a Port on Chip B. All resources such as termination resistors, current sources, DLLs and interpolators that are present in Chip B would be Remote to Port A; all resources such as termination resistors, current sources, DLLs and interpolators that are present in Chip A would be Remote to Port B. Training Sequence (TS) A stream of data, currently defined as 64 bits long, which is sent out serially, starting with the LSB, by each Tx on one port and received by the corresponding Rx on the other port. Training Sequence patterns (TSx, where “x” is a number) are exchanged on lanes by ports during the link initialization process and may contain a unique header, acknowledge fields and payload (configuration) information. Transceiver A Tx-Rx pair. Each Lane is constructed out of a Local Transceiver connected to a Remote Transceiver. Transmitter or Tx The circuits which create the electrical signals used to transmit data, control or clock. The CSI specification does not specify the implementation of those circuits. In most cases Tx is used to describe that function of transmitting differential signals. A Tx connects to the pads of a component and the pins of a packaged part. The term Transmitter may also be used in this specification; in this context that term has the same meaning as Tx. Unit Interval The time it takes to transfer one unit of information on a lane. In this revision of this spec, 2 level (binary) signaling is used, therefore one bit time is one unit interval. Example: at 6.4 GBits per sec (Gb/s), or alternately 6.4 GTransfers per sec (GT/s), a unit interval is 156.25 psec. In contrast to 2 level signaling, 4 level signaling transfers 2 bits of information in one unit interval; in that case, to achieve the same transferred bit rate, the unit interval would be 312.5 psec. Please note that this is an example only; this revision of the CSI specification describes, supports and specifies 2 level (binary) signaling only.
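The Unit Interval arithmetic above can be reproduced with the following illustrative C sketch (not part of this specification; the helper name is hypothetical).

    /* Sketch only: Unit Interval duration from the transfer rate, per the
     * glossary example above (6.4 GT/s -> 156.25 ps at 2-level signaling). */
    #include <stdio.h>

    static double unit_interval_ps(double gigatransfers_per_s)
    {
        return 1.0e12 / (gigatransfers_per_s * 1.0e9);
    }

    int main(void)
    {
        /* 2-level signaling: one bit per UI. */
        printf("UI at 6.4 GT/s: %.2f ps\n", unit_interval_ps(6.4)); /* 156.25 */
        /* 4-level signaling carries 2 bits per UI, so the same bit rate
         * allows a UI twice as long: 312.5 ps. */
        printf("UI at 3.2 GT/s: %.2f ps\n", unit_interval_ps(3.2)); /* 312.50 */
        return 0;
    }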
The Link Layer guarantees reliable data transfer between two CSI protocol or routing agents. It abstracts the Physical Layer from the Protocol Layer, is responsible for flow control between two protocol agents, and provides virtual channel services to the Protocol Layer (Message Classes) and the Routing Layer (Virtual Networks). The smallest transfer unit at the Link Layer is referred to as a flit. A packet consists of one or more flits that form a message. The Link Layer relies on the Physical Layer to frame the Physical Layer unit of transfer (phit) into the Link Layer unit of transfer (flit). In addition, the Link Layer is logically broken into two parts, a sender and a receiver. A sender/receiver pair on one agent will be connected to a receiver/sender pair on another agent. Flow control is performed on both a flit and a packet basis. Error detection and correction is performed on a flit-level basis. The interface between the Protocol Layer and the Link Layer is at the packet level. A packet is comprised of one or more flits. 4.1 Message Class The Link Layer supports up to 14 Protocol Layer message classes, of which 8 (UP/DP) or 6 (SMP/LMP) are currently defined. The remaining 6 (UP/DP) or 8 (SMP/LMP) message classes are reserved for future use. The message classes provide independent transmission channels (virtual channels) to the Protocol Layer, allowing sharing of the physical channel. It is required that the Link Layer create no dependency between any two packets in different message classes. The Link Layer must not block the flow in one message class because of blockage in another. The message classes are Snoop (SNP, Command Packets), Home (HOM, Command Packets), Non-Data Response (NDR, Command Packets), Data Response (DRS, Data Packets), Non-Coherent Standard (NCS, Command Packets), Non-Coherent Bypass (NCB, Data Packets), Isoch Command Stream (ICS, Command Packets), and Isoch Data Stream (IDS, Data Packets). The messages with the SNP, NDR, DRS, NCS, and NCB message encodings are un-ordered. This is not the case for those with the HOM message encoding, which is required to have point-to-point ordering per address. In an unordered channel there is no relation between the order in which messages are sent on that channel and the order in which they are received. The packet transmission in a message class is expected to be contiguous on the link with respect to other packets. The exceptions are flit-level interleaving of:
1. Link Layer Special Packets (SP) and a command packet (a Protocol Layer message excluding any packet with a data payload) after the header FLIT(s) or between data FLITs of a data packet - Command Insert Interleave.
2. Interleaving of SP into any multi-flit header of any packet (both data and command, even if themselves already interleaved into another packet) - SP Interleaving.
3. Interleaving of two data packet streams (the second packet header immediately follows the first, with possible SP(s) interleaved, followed by alternating FLITs of the two data packets, with possible SP(s) interleaved) - Scheduled Data Interleave.
Table 4-1. Message Classes, Abbreviations and Ordering Requirements
    Name                        Abbreviation   Order                                   Data Payload
    Snoop                       SNP            none                                    No
    Home                        HOM            none / Point-2-Point only per address   No
    Non Data Response           NDR            none                                    No
    Data Response               DRS            none                                    Yes
    Non-Coherent Standard       NCS            none                                    No
    Non-Coherent Bypass         NCB            none                                    Yes
    Isochronous Command Stream  ICS            Point-2-Point                           No
    Isochronous Data Stream     IDS            Point-2-Point                           Yes
The HOM channel (independent communication path) is required to implement per-address point-to-point ordering. The ICS and IDS channels are required to implement strict point-to-point ordering across addresses (which allows ICS/IDS to support quality-of-service requirements). The SNP message class is used by the Protocol Layer to send snoops to caching agents. This message class does not support a data payload. The HOM message class supports point-to-point ordering per address between a caching agent and a Home Agent (see Chapter 8, “CSI Cache Coherence Protocol” or Appendix A, “Glossary”). This message class is used to send request and snoop response messages to Home Agents. The HOM message class does not support a data payload. The NDR class is used by the Protocol Layer to send short response messages. This class does not support a data payload. The DRS class is used by the Protocol Layer to send response messages with data. All DRS class messages contain a cache line data payload. This class also supports a byte enable bitfield for transfers of less than a cache line. DRS class messages can target both caching agents and Home Agents. The NCS class is used by the Protocol Layer to send non-coherent reads and special writes. This channel does not support a data payload. Some messages in NCS support up to an 8 Byte payload. The NCB class is used by the Protocol Layer for non-coherent data writes, peer-to-peer writes, and several Protocol Layer special messages. The NCB channel has a cache-line-size payload with a byte enable field. There is an additional message class that isn’t visible to the Protocol Layer. The Special class (SPC) is used by the Link Layer for communication between two connected Link Layer agents. When there are no packets for transmission, the Link Layer transmits a Link Layer Idle or Ctrl flit, which are SPC class messages. If the Link Retry Queue is full, the Link Layer will send a Ctrl Flit, which is another SPC class message. Additionally, all the link level retry messages are SPC class messages. The CSI Link Layer provides two dedicated Message Classes for ISOC traffic: Command (ICS) and Data (IDS). The ICS and IDS message classes provide independent CSI channels for ISOC subsystems, where quality-of-service (QoS) applications’ transactions must cross the CSI fabric. From the CSI Link Layer perspective, both (ICS and IDS) channels are strictly ordered across all addresses from endpoint to endpoint. Requests in these channels must be considered as high-priority requests at the various arbitration points of CSI traffic flow to meet latency requirements. The exact mechanism and arbitration policies are product specific and beyond the scope of this specification.
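As an illustration only, Table 4-1 above can be rendered as a lookup table in C; the struct, enum, and field names below are hypothetical, not from this specification.

    /* Sketch only: Table 4-1 as a lookup table. */
    #include <stdbool.h>
    #include <stdio.h>

    enum order { ORD_NONE, ORD_PER_ADDRESS, ORD_POINT_2_POINT };

    struct msg_class {
        const char *name;
        const char *abbrev;
        enum order  order;        /* ordering requirement */
        bool        data_payload; /* carries a data payload? */
    };

    static const struct msg_class classes[] = {
        { "Snoop",                      "SNP", ORD_NONE,          false },
        { "Home",                       "HOM", ORD_PER_ADDRESS,   false },
        { "Non Data Response",          "NDR", ORD_NONE,          false },
        { "Data Response",              "DRS", ORD_NONE,          true  },
        { "Non-Coherent Standard",      "NCS", ORD_NONE,          false },
        { "Non-Coherent Bypass",        "NCB", ORD_NONE,          true  },
        { "Isochronous Command Stream", "ICS", ORD_POINT_2_POINT, false },
        { "Isochronous Data Stream",    "IDS", ORD_POINT_2_POINT, true  },
    };

    int main(void)
    {
        for (unsigned i = 0; i < sizeof classes / sizeof classes[0]; i++)
            printf("%-27s %s payload=%d\n", classes[i].name,
                   classes[i].abbrev, classes[i].data_payload);
        return 0;
    }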
4.1.1 Required Base Message Classes All CSI Link Layer agents are required to implement the SNP, HOM, DRS, NDR, NCS, and NCB message classes. The only exception is if a given endpoint does not have the functionality of sending/receiving a Message Class; then the agent can omit that Message Class. This omission only applies in one direction and only for endpoint agents. An example of such an exception is the case of a protocol agent that is a Home Agent only (e.g. a directory controller). The Home Agent need only support an outbound SNP channel and does not need to support an inbound SNP channel, since it will never be the target of SNP messages. All Link Layer agents are required to support the SPC message class for correct link functionality. 4.2 Virtual Networks Virtual networks can provide a variety of features such as reliable routing, support for complex network topologies, or a reduction in required buffering through adaptively buffered virtual networks. Virtual networks provide an additional method at the Link Layer to replicate each message class into independent virtual channels (independent communication paths). The Link Layer supports up to 3 virtual networks. Each message class is subdivided among the 3 virtual networks. There are 2 independently buffered deadlock-free virtual networks (VN0 and VN1) and 1 shared adaptive buffered virtual network (VNA).
The total number of virtual channels supported is the product of the virtual networks supported and the message classes supported. For the CSI Link Layer this is a maximum of 24 virtual channels (3 SNP, 3 HOM, 3 NDR, 3 DRS, 3 NCS, 3 NCB, 3 ICS, and 3 IDS). The 1 (UP) or 2 (DP, SMP, LMP) independently buffered virtual networks (VN0 and VN1) act like classical virtual channels. They each have independent buffering and flow control on a per-message-class basis. The 3rd virtual network (VNA) is more complex. VNA presents a shared buffer pool across a subset of the message classes. The flow control is also shared among all the message classes for VNA. If VNA were the only virtual network supported, then the system would eventually deadlock, as the different message classes would be interdependent, sharing the same buffers and flow control. VNA relies on the existence of either VN0 or VN1 to provide an escape path that is deadlock free. If a message becomes blocked in VNA (no credit available in VNA for the next destination), it will transition to using VN0 or VN1 in an implementation dependent manner. It can transition back into VNA on any subsequent link if there is buffer space available. In order to support this transition to VN0 or VN1, each packet that is travelling in VNA is marked to either drain to VN0 or drain to VN1. VNA provides the mechanism whereby the amount of buffering needed to support a large number of message classes and virtual networks is significantly reduced. To remain deadlock free, VN0 and VN1 require only one buffer per message class, allowing the majority of the buffer resources to be put into the shared pool of VNA. It is therefore recommended that VN0 and VN1 have minimal buffering and that VNA be sized to cover the round-trip credit/debit latency if implemented. Messages traveling in the SNP, HOM, NDR, DRS, ICS, IDS, NCS, and NCB message classes are allowed to transfer into and out of VNA. 4.2.1 Base Virtual Network Requirements All agents are required to support VN0. Support for VNA and/or VN1 is optional. If an agent supports VNA, then the agent must guarantee that messages in VNA can always drain to VN0 and/or optionally VN1, depending on topology.
1. VNA must be able to process packets from different message classes out of order. (It must not be a FIFO.)
2. Between every pair of sender and receiver virtual channel buffers, sender VNA should be able to drain to VN0 or VN1 on the receiver side.
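The VNA escape behavior described above can be sketched as follows; this is illustrative only, all names are hypothetical, and the drain network corresponds to the packet's VN encoding (see "Virtual Network (VN) - 2b - LL" later in this chapter).

    /* Sketch only: falling back from VNA to a packet's designated
     * deadlock-free escape network. */
    #include <stdio.h>

    enum vnet { VN0, VN1, VNA };

    struct packet {
        enum vnet net;      /* network the packet is travelling in */
        enum vnet drain_to; /* VN0 or VN1: escape path if VNA blocks */
    };

    /* Pick the network for the next hop: a packet in VNA with no VNA
     * buffer available must drain to its escape network. */
    static enum vnet next_network(const struct packet *p, int vna_space)
    {
        if (p->net == VNA && !vna_space)
            return p->drain_to;
        return p->net; /* may rejoin VNA on a later link if space exists */
    }

    int main(void)
    {
        struct packet p = { VNA, VN0 };
        printf("blocked VNA packet drains to VN%d\n",
               next_network(&p, 0) == VN0 ? 0 : 1);
        return 0;
    }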
4.2.1.1 Advanced Routing Virtual Network Requirements In order to support advanced routing features such as online update of router tables, hot addition/deletion of components, and advanced network topologies, both VN0 and VN1 are required in all intermediate agents. An intermediate agent is defined as an agent that will receive a packet and forward it on to another agent (see Section 5, “Routing Layer” on page 5-209). 4.3 Credit/Debit Flow Control In a credit/debit based flow control system, a sender will be given a set number of credits to send packets or flits to a receiver during initialization. Whenever a packet or flit is sent to the receiver, the sender will decrement its credit counter by the size of the packet sent, or by one flit. Whenever a buffer is freed at the receiver, a credit is returned back to the sender for that buffer. When the sender’s credits are exhausted, it stops sending. Each packet contains an embedded flow control stream. This flow control stream returns credits from a receiving Link Layer agent to a sending Link Layer agent. The Link Layer is required to keep track of up to 17 independent credit pools: up to 2 pools per message class for VN0 and VN1, and 1 Adaptive Virtual Network pool (2 SNP, 2 HOM, 2 NDR, 2 DRS, 2 NCS, 2 NCB, 2 ICS, 2 IDS, and 1 VNA). If a Link Layer agent doesn't have the functionality to send on a particular message class channel, then it is not required to keep track of credits for that channel. Credits for buffers in VN0 and VN1 are returned on a per-packet basis for each message class. Hence, each buffer for each credit in VN0/VN1 must be sized to cover the buffer requirements for the largest packet size that can use the credit (e.g. for the NCS VN0 channel, the buffer size for each credit is 3 flits, since this is the largest packet that can use NCS). This provides the most efficient method of credit return for these channels. Because of the shared resource and the variety of message sizes that will be allocated/deallocated, it would not be efficient to use packet credit/debit for VNA. Instead a flit credit/debit scheme is used for VNA. Each flit credit represents 1 flit of receiver buffer space, with the credits shared by all message classes that can transmit on VNA. The encodings for the credit return are described in Section 4.6.2.6, “VC Credit (VCCrd) - 3b - LL” on page 4-163. 4.4 Link Layer Buffer/Credit Management The CSI Link Layer does not exchange explicit credit sizes at init time. Instead it is the responsibility of the receiver to transmit credits to the sender using the standard CSI credit return mechanism after reset. Each agent should know how many credits it can receive, and set its credit return counters to these values. Then during normal operation the standard credit return logic will return these credits to the sender. It is possible that the receiver will make available more credits than the sender can track for a given message class. For correct operation, it is therefore required that the credit counters at the sender be saturating. This method of credit initialization has the advantage that it uses the standard credit/debit mechanism and doesn’t require the full credit size per message class to be sent as a whole. One issue is that, while this method provides for seamless interoperability, if the counters are sized too small there is the possibility of wasted/un-utilized buffering. It is therefore suggested that designs take into account the expected size of buffering in other designs when setting the size of their counters.
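The saturating credit counters recommended above can be sketched as follows; this is illustrative only, and the names and 8-bit counter width are assumptions, not from this specification.

    /* Sketch only: a saturating credit counter for one message class. */
    #include <stdint.h>
    #include <stdio.h>

    struct credit_pool { uint8_t credits; };

    static int try_send(struct credit_pool *p)
    {
        if (p->credits == 0)
            return 0;     /* out of credits: stop sending */
        p->credits--;     /* debit one credit (a packet, or a flit for VNA) */
        return 1;
    }

    static void credit_returned(struct credit_pool *p)
    {
        if (p->credits != UINT8_MAX)
            p->credits++; /* saturate instead of wrapping if the receiver
                           * advertises more than the sender can track */
    }

    int main(void)
    {
        struct credit_pool ncs_vn0 = { 3 }; /* e.g. 3 flit-sized buffers */
        while (try_send(&ncs_vn0))
            ;                      /* drain all credits */
        credit_returned(&ncs_vn0); /* receiver freed one buffer */
        printf("can send again: %d\n", try_send(&ncs_vn0));
        return 0;
    }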
4.5 Support For Link Layer Reliable Transmission The Link Layer is responsible for reliable transmission of packets between protocol agents by providing support for transmission error detection and correction. The Protocol Layer expects the packets transferred to it from the Link Layer to be free of transmission errors. Transmission error detection over a flit is done using an 8b CRC scheme. For systems requiring higher RAS capabilities an optional 16b rolling CRC scheme, termed 'rolling CRC', is also defined. Rolling CRC has error detection capabilities very similar to the traditional 16b CRC but incurs lower overall transmission latency. The details of the CRC error detection are defined in Section 4.9.1, “Error Detection” on page 4-189. The recovery from transmission errors is done at up to two levels. The first level, which is required, is for the link to generate a link level retry sequence. The retry scheme is based on the classical go-back-n sliding window protocol. The sender buffers all outgoing FLITs that are classified as retry-enabled in a retry circular buffer, with specialized read and write pointer controls such that FLITs can be resent on indication of an error from the receiver, and buffer space can be freed as error-free reception is acknowledged by the opposite agent. Each outgoing flit is uniquely identified by a sequence number that is also its index in the retry buffer. The receiver keeps track of the expected sequence number of flits and returns this sequence number back to the sender if an error is detected. The sender should then retransmit the entire sequence of flits from its retry buffer. The receiver keeps a count of the number of link level retries attempted for a flit; upon exceeding a threshold, it will either trigger a link reset or indicate a link failure to the system. For high RAS requirements, a self-healing mechanism is also defined. Self healing is achieved by dynamically reducing the width of the link upon detection of an error condition (or for power management reasons). The details of the retry scheme are defined in Section 4.9.2, “Error Recovery” on page 4-193. Details of the self-healing mechanism are defined in Section 4.6.4, “Width Reduction” on page 4-172. 4.6 Packet Definition 4.6.1 Packet Format Data Packets are formed by combining a data header with data flits. For 64 and 128 Byte cache line systems, this would be 8 and 16 data flits, respectively. Command Packets are formed by simply using one of the available header formats and do not include an 8 or 16 flit data payload. The packets are designed around a 24b (C1:C0 for CRC and L21:L0 for payload) wide logical format, of which 20b are currently defined. By defining the logical width at 24b, the CSI Link Layer has room for expansion. At the same time, because lanes L18-L21 are currently Zero Reserved (set as zero and read as zero), they don’t need to be sent, and consequently pins do not need to be used for them. For half width and quarter width header formats see Section 4.9.1.2, “CRC Computation” on page 4-191. In the rest of this chapter, shading of boxes is used to indicate the fields that are common across many header formats. The upper row of each header table below is the first phit transmitted on the port. The definition of the packet fields follows the packet format field assignment. Reserved fields are classified in three categories: (a) reserved fields that can be ignored; (b) reserved fields that need to be decoded; and (c) reserved fields that need to be carried over to related packets. This distinction is not used in this revision, but it will be made clear in a future revision of this chapter. 4.6.1.1 Standard Address Header Format, SA Table 4-2. Standard Address, SA UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual VC Crd (2:0) CR CR Network Request Transaction ID (5:0) Ack C 4 C 0 PH (1:0) RHNID (2:0) Address (11:6) C 5 CR C 1 CR 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 (42:41) Addr Addr (5:3) Address (40:28) C 7 CR C 3 CR Table 4-3.
Standard Address, SA SMP 4.6.1.2 Standard Coherence Address, SCA L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 Table 4-4. Standard Coherence Address, SCA UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 RSVD RSNID (2:0) Address (40:28) CR C 7 CR C 3 Table 4-5. Standard Coherence Address, SCA SMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 RSNID (4:0) Address (40:28) CR C 7 CR C 3 Ref No xxxxx 141 Intel Restricted Secret CSI Link Layer CSI Link Layer 4.6.1.3 Standard Coherence No Address, SCC Table 4-6. Standard Coherence, SCC UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral RSVD FCTID (5:0) Rsp Status (1:0) CR C 6 CR C 2 RSVD RSNID (4:0) RSVD CR C 7 CR C 3 Table 4-7. Standard Coherence, SCC SMP 4.6.1.4 Standard Complete With Data, SCD L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral RSVD FCTID (5:0) Rsp Status (1:0) CR C 6 CR C 2 RSNID (4:0) RSVD CR C 7 CR C 3 Table 4-8. Standard Complete With Data, SCD UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) RSVD Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Parameter Byte 0 RSVD RspStatus CR C 6 CR C 2 RSVD Parameter Byte 2 Parameter Byte 1 CR C 7 CR C 3 142 Ref No xxxxx Intel Restricted Secret Table 4-9. Standard Complete With Data, SCD SMP 4.6.1.5 Extended Address Header Format, EA L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) RSVD Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Parameter Byte 0 RSVD Rsp Status CR C 6 CR C 2 RSVD Parameter Byte 2 Parameter Byte 1 CR C 7 CR C 3 Table 4-10. 
Extended Address, EA UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSVD RSV D RSVD CR C 4 CR C 0 RSVD RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Address (2:0) RSV D Length (5:0) CR C 7 CR C 3 Ref No xxxxx 143 Intel Restricted Secret CSI Link Layer CSI Link Layer L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSV D RSVD CR C 4 CR C 0 RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Address (2:0) RSV D Length (5:0) CR C 7 CR C 3 Table 4-12. Extended Address, EA LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Address (50:43) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Address (2:0) RSV D Length (5:0) CR C 7 CR C 3 144 Ref No xxxxx Intel Restricted Secret 4.6.1.6 Extended Coherence Address, ECA Table 4-13. Extended Coherence Address, ECA LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 RSNID (4:0) Address (40:28) CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Address (50:43) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSNID (9:5) RSVD CR C 7 CR C 3 4.6.1.7 Extended Coherence No Address, ECC Table 4-14. Extended Coherence No Address, ECC LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) VC Crd (1:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack VCC rd 2 CR C 5 CR C 1 0b1 IIB Viral RSVD FCTID (5:0) Rsp Status (1:0) CR C 6 CR C 2 RSNID (4:0) RSVD CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD RSVD CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSNID (9:5) RSVD CR C 7 CR C 3 Ref No xxxxx 145 Intel Restricted Secret 4.6.1.8 Extended Complete with Data, ECD Table 4-15. 
Extended Complete with Data LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) RSVD Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Parameter Byte 0 RSVD Rsp Status CR C 6 CR C 2 RSVD Parameter Byte 2 Parameter Byte 1 CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD RSVD CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD CR C 7 CR C 3 146 Ref No xxxxx Intel Restricted Secret 4.6.1.9 Non-Coherent Message, NCM Table 4-16. Non-Coherent Message, NCM UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Message Type (5:0) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral RSVDa Parameter Byte Ab CR C 6 CR C 2 RSVDc RSVD RSVDd CR C 7 CR C 3 RSVD RSVD RSV D RSVD RSVD CR C 4 CR C 0 RSVD RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable[7:0] CR C 7 CR C 3 RSVD Parameter Byte 1 Parameter Byte 0 CR C 4 CR C 0 RSVD Parameter Byte 3 Parameter Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Parameter Byte 5 Parameter Byte 4 CR C 6 CR C 2 RSVD Parameter Byte 7 Parameter Byte 6 CR C 7 CR C 3 a. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 27:20 of the atomic data. b. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 19:12 of the atomic data. c. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 42:41 of the atomic data. d. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 40:28 of the atomic data. Ref No xxxxx 147 Intel Restricted Secret CSI Link Layer CSI Link Layer L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Msg Type Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral RSVDa Parameter Byte Ab CR C 6 CR C 2 RSVDc RSVD RSVDd CR C 7 CR C 3 RSVD RSV D RSVD RSVD CR C 4 CR C 0 RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable[7:0] CR C 7 CR C 3 RSVD Parameter Byte 1 Parameter Byte 0 CR C 4 CR C 0 RSVD Parameter Byte 3 Parameter Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Parameter Byte 5 Parameter Byte 4 CR C 6 CR C 2 RSVD Parameter Byte 7 Parameter Byte 6 CR C 7 CR C 3 a. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 27:20 of the atomic data. b. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 19:12 of the atomic data. c. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 42:41 of the atomic data. d. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 40:28 of the atomic data. 148 Ref No xxxxx Intel Restricted Secret Table 4-18. 
Non-Coherent Message, NCM LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Msg Type Request Transaction ID (5:0) Ack CR C 5 CR C 1 IIB Vira l RSVDa Parameter Byte Ab CR C 6 CR C 2 RSVDc RSVD RSVDd CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD RSVDe CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable[7:0] CR C 7 CR C 3 RSVD Parameter Byte 1 Parameter Byte 0 CR C 4 CR C 0 RSVD Parameter Byte 3 Parameter Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Parameter Byte 5 Parameter Byte 4 CR C 6 CR C 2 RSVD Parameter Byte 7 Parameter Byte 6 CR C 7 CR C 3 a. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 27:20 of the atomic data. b. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 19:12 of the atomic data. c. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 42:41 of the atomic data. d. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 40:28 of the atomic data. e. For the ProcLock, ProcSplitLock, LTHold, and DebugLock messages, this field is used to hold the address bits 50:43 of the atomic data Ref No xxxxx 149 Intel Restricted Secret 4.6.1.10 Extended I/O Command, EIC Table 4-19. 3 Flit EIC format UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DestNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSVD D RSV RSVD RSVD C 4 CR C 0 CR RSVD RSVD RSVD RSVD C 5 CR C 1 CR 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable (7:0) CR C 7 CR C 3 RSVD Date Byte 1 Date Byte 0 CR C 4 CR C 0 RSVD Date Byte 3 Date Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Date Byte 5 Date Byte 4 CR C 6 CR C 2 RSVD Date Byte 7 Date Byte 6 CR C 7 CR C 3 150 Ref No xxxxx Intel Restricted Secret Table 4-20. 
3 Flit EIC format SMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSVD D RSV RSVD RSVD C 4 CR C 0 CR RSVD RSVD RSVD RSVD C 5 CR C 1 CR 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable (7:0) CR C 7 CR C 3 RSVD Date Byte 1 Date Byte 0 CR C 4 CR C 0 RSVD Date Byte 3 Date Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Date Byte 5 Date Byte 4 CR C 6 CR C 2 RSVD Date Byte 7 Date Byte 6 CR C 7 CR C 3 Ref No xxxxx 151 Intel Restricted Secret CSI Link Layer CSI Link Layer L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Addr (5:3) Address (40:28) CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Address (50:43) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD Byte Enable (7:0) CR C 7 CR C 3 RSVD Date Byte 1 Date Byte 0 CR C 4 CR C 0 RSVD Date Byte 3 Date Byte 2 CR C 5 CR C 1 0b0 IIB RSV D Date Byte 5 Date Byte 4 CR C 6 CR C 2 RSVD Date Byte 7 Date Byte 6 CR C 7 CR C 3 4.6.1.11 Standard Data Response Header Format, SDR Table 4-22. Standard Data Response, SDR UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Response Data State (3:0) RSVD Rsp Status (1:0) CR C 6 CR C 2 RSVD Address (5:3) Sch Data Inter l RSVD CR C 7 CR C 3 152 Ref No xxxxx Intel Restricted Secret Table 4-23. Standard Data Response, SDR SMP 4.6.1.12 Standard Data Write Header Format, SDW L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Response Data State (3:0) RSVD Rsp Status (1:0) CR C 6 CR C 2 RSVD Address (5:3) Sch Data Inter l RSVD CR C 7 CR C 3 Table 4-24. Standard Data Write, SDW UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 Table 4-25. Standard Data Write, SDW SMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 Ref No xxxxx 153 Intel Restricted Secret 4.6.1.13 Extended Data Response Header Format, EDR Table 4-26. 
Extended Data Response, EDR LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Response Data State (3:0) RSVD Rsp Status (1:0) CR C 6 CR C 2 RSVD Address (5:3) RSV D RSVD CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD RSVD CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD CR C 7 CR C 3 4.6.1.14 Extended Data Write Header Format, EDW Table 4-27. Extended Data Write, EDW LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Address (50:43) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD CR C 7 CR C 3 154 Ref No xxxxx Intel Restricted Secret 4.6.1.15 Extended Byte Enable Data Write Header Format, EBDW Table 4-28. Extended Byte Enable Data Write, EBDW UP/DP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PE (1:0) DNID (2:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 PH (1:0) RHNID (2:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSV D RSVD RSVD CR C 4 CR C 0 RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD CR C 7 CR C 3 RSVD Byte Enable (15:0) CR C 4 CR C 0 RSVD Byte Enable (31:16) CR C 5 CR C 1 0b0 IIB RSV D Byte Enable (47:32) CR C 6 CR C 2 RSVD Byte Enable (63:48) CR C 7 CR C 3 Ref No xxxxx 155 Intel Restricted Secret CSI Link Layer CSI Link Layer L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 RSVD RSV D RSVD RSVD CR C 4 CR C 0 RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD CR C 7 CR C 3 RSVD Byte Enable (15:0) CR C 4 CR C 0 RSVD Byte Enable (31:16) CR C 5 CR C 1 0b0 IIB RSV D Byte Enable (47:32) CR C 6 CR C 2 RSVD Byte Enable (63:48) CR C 7 CR C 3 156 Ref No xxxxx Intel Restricted Secret Table 4-30. Extended Byte Enable Data Write, EBDW LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 0b1 IIB Viral Address (27:12) CR C 6 CR C 2 Addr (42:41) Address (5:3) Address (40:28) CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Address (50:43) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D RSVD CR C 6 CR C 2 RSVD RSVD CR C 7 CR C 3 RSVD Byte Enable (15:0) CR C 4 CR C 0 RSVD Byte Enable (31:16) CR C 5 CR C 1 0b0 IIB RSV D Byte Enable (47:32) CR C 6 CR C 2 RSVD Byte Enable (63:48) CR C 7 CR C 3 4.6.1.16 Data Flit Format Data in data packets is arranged from least significant quad word to most significant quad word by default. 
In systems using critical chunk first functionality, the data is ordered R = A xor T, where A is the original address, T is the scaled timeslot number (starting at timeslot 0).
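The critical-chunk-first rule R = A xor T above can be illustrated with a small C sketch (illustrative only; the 8-quad-word line and the helper structure are assumptions, not part of this specification).

    /* Sketch only: critical-chunk-first ordering R = A xor T. For a cache
     * line of 8 quad words, the quad word carried in timeslot t is the
     * requested quad-word index XORed with t. */
    #include <stdio.h>

    int main(void)
    {
        unsigned a = 5; /* quad-word index of the critical (requested) word */
        for (unsigned t = 0; t < 8; t++)
            printf("timeslot %u carries quad word %u\n", t, a ^ t);
        /* timeslot 0 carries quad word 5: the critical chunk goes first */
        return 0;
    }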
Table 4-31. Data Flit Format, DF L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 PDFA Data Word 0 (15:0) CR C 4 CR C 0 PDFB Data Word 1 (31:16) CR C 5 CR C 1 0b0 IIB Poison Data Word 2 (47:32) CR C 6 CR C 2 RSVD Data Word 3 (63:48) CR C 7 CR C 3 4.6.1.17 Peer-to-Peer Tunnel Header Table 4-32. Peer-to-Peer Tunnel SMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 IIB Viral / Psn Tunnel Byte 1 Tunnel Byte 0 CR C 6 CR C 2 RSVD Tunnel Byte 3 Tunnel Byte 2 CR C 7 CR C 3 RSVD RSVD Tunnel Type (3:0) CR C 4 CR C 0 RSVD RSVD RSVD CR C 5 CR C 1 0b0 IIB RSV D Tunnel Byte 5 Tunnel Byte 4 CR C 6 CR C 2 RSVD Tunnel Byte 7 Tunnel Byte 6 CR C 7 CR C 3 RSVD Tunnel Byte 9 Tunnel Byte 8 CR C 4 CR C 0 RSVD Tunnel Byte 11 Tunnel Byte 10 CR C 5 CR C 1 0b0 IIB RSV D Tunnel Byte 13 Tunnel Byte 12 CR C 6 CR C 2 RSVD Tunnel Byte 15 Tunnel Byte 14 CR C 7 CR C 3 Table 4-33. Peer-to-Peer Tunnel LMP L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 DNID (4:0) Message Class (3:0) Opcode (3:0) Virtual Network VC Crd (2:0) CR C 4 CR C 0 RHNID (4:0) Address (11:6) Request Transaction ID (5:0) Ack CR C 5 CR C 1 IIB Viral / Psn Tunnel Byte 1 Tunnel Byte 0 CR C 6 CR C 2 RSVD Tunnel Byte 3 Tunnel Byte 2 CR C 7 CR C 3 DNID (9:5) OEM Defined (2:0) RSVD Tunnel Type (3:0) CR C 4 CR C 0 RHNID (9:5) RSVD RSVD/Transport (8:0) CR C 5 CR C 1 0b0 IIB RSV D Tunnel Byte 5 Tunnel Byte 4 CR C 6 CR C 2 RSVD Tunnel Byte 7 Tunnel Byte 6 CR C 7 CR C 3 RSVD Tunnel Byte 9 Tunnel Byte 8 CR C 4 CR C 0 RSVD Tunnel Byte 11 Tunnel Byte 10 CR C 5 CR C 1 0b0 IIB RSV D Tunnel Byte 13 Tunnel Byte 12 CR C 6 CR C 2 RSVD Tunnel Byte 15 Tunnel Byte 14 CR C 7 CR C 3 4.6.2 Packet Fields Packet fields come in three types: Protocol Layer Fields, Link Layer Fields, and Protocol/Link Layer Fields. Any field used by the Link Layer will be marked with “LL” in the heading. Any field used by the Protocol Layer will be marked with “PL” in the heading. 4.6.2.1 Profile Dependent Fields The CSI Link Layer implements profile dependent fields. Profile dependent fields can be configured for a variety of functionality depending on the requirements of the platform they will be used in, while still providing default compatibility. Profile dependent fields allow compatibility between a wide range of designs while allowing systems to be optimized in certain cases for specific needs, e.g. additional error containment in large server systems and hints to the memory controller in desktop systems. By default, at initialization, Profile Dependent Fields are read-as-zero/set-as-zero. As part of the initialization process, two link agents will exchange information on what profiles they can support. If both agents can support a given profile, then that profile will be enabled for use. Further details on the profile configuration and initialization are given in Section 4.10, “Link Layer Initialization” on page 4-200. 4.6.2.2 Implicit Packet Fields There are 3 Implicit Packet Fields in the CSI Packet Format. These fields are created by combining information from other fields within a packet. The three fields are: Opcode, Packet Length, and Globally Unique Transaction ID. The Opcode field is created by combining the Message Class field with the Minor Opcode field (Section 4.6.2.4, “Opcode - 4b - PL & LL” on page 4-162) as well as the Sub Opcode field (Section 4.6.2.16, “Destination Node ID (DNID) - 3b/5b/10b - PL & LL” on page 4-166) in some packets. Packet Length is derived by combining the MSB of the opcode with the Message Class field as depicted in Tables 4-34 and 4-35: Table 4-34. Packet Length Encoding UP/DP/SMP Message Class Message Class Encoding Opcode MSb Packet Size SSS 0 - HOM 0b0000 X 1 1 - HOM 0b0001 2 - NDR 0b0010 3 - SNP 0b0011 5 - ICS 0b0101 4 - NCS 0b0100 00 01 1 2 1X 3 14 - DRS 0b1110 0 1 + Data 12 - NCB 0b1100 1 3 + Data 13 - IDS 0b1101 Table 4-35. Packet Length Encoding LMP Message Class Message Class Encoding Opcode MSb Packet Size LSS 0 - HOM 0b0000 X 2 1 - HOM 0b0001 2 - NDR 0b0010 3 - SNP 0b0011 4 - NCS 0b0100 0 2 1 3 14 - DRS 0b1110 0 2 + Data 12 - NCB 0b1100 1 3 + Data Data Size is either 8 for systems with 64B Cache Line size or 16 for systems with 128B Cache Line size. The Globally Unique Transaction ID is formed by the 3-tuple {Requester Node ID, Home Node ID, Request Transaction ID}. In cases where the Home Node ID isn’t explicit in the message being sent, it can be generated by decoding the physical address (see Chapter 7, “Address Decode”).
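As an illustration only, the Globally Unique Transaction ID can be modeled as the 3-tuple named above; the struct, function names, and key packing below are assumptions (the 5b node IDs and 6b transaction ID follow the SMP header widths), not a definition from this specification.

    /* Sketch only: the GUTID as a 3-tuple. */
    #include <stdint.h>
    #include <stdio.h>

    struct gutid {
        uint8_t requester_nid; /* Requester Node ID */
        uint8_t home_nid;      /* Home Node ID (may come from address decode) */
        uint8_t rtid;          /* Request Transaction ID, 6 bits */
    };

    /* Pack into one comparable key: unique system-wide because a caching
     * agent's outstanding requests each carry a distinct RTID. */
    static uint32_t gutid_key(struct gutid g)
    {
        return ((uint32_t)g.requester_nid << 16) |
               ((uint32_t)g.home_nid << 8) | g.rtid;
    }

    int main(void)
    {
        struct gutid g = { 3, 12, 0x2a };
        printf("key = 0x%06x\n", gutid_key(g));
        return 0;
    }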
4.6.2.3 Message Class (MC) - 4b - PL & LL The Protocol Layer uses the message class to define the Protocol Class, which also acts as the Major Opcode field. The Link Layer uses the Message Class field as part of the VC definition. Some Protocol Classes/VCs use multiple Message Class encodings due to the number of messages that need to be encoded; this is reflected in Table 4-36. Table 4-36. Message Class Encoding UP/DP Message 0b1111 Special Cntrl Special Cntrl Class Message Type Message Class Encoding 0b0000 Home - Request Home 0b0001 Home - Response & Writes (commands only) 0b0010 Response - Non Data Non Data Response 0b0011 Snoop Snoop 0b0100 Non-coherent - Commands Non-Coherent 0b0101 Isoch Commands Isoch Cmd 0b0110 RSVD RSVD 0b0111 RSVD RSVD 0b1000 RSVD RSVD 0b1001 RSVD RSVD 0b1010 RSVD RSVD 0b1011 RSVD RSVD 0b1100 Non-coherent Bypass - Commands Non-coherent Bypass 0b1101 Isoch Data Isoch Data 0b1110 Response - Data Data Response Table 4-37. Message Class Encoding SMP/LMP Message 0b1111 Special Cntrl Special Cntrl Class Message Type Message Class Encoding 0b0000 Home - Request Home 0b0001 Home - Response & Writes (commands only) 0b0010 Response - Non Data Non Data Response 0b0011 Snoop Snoop 0b0100 Non-coherent - Commands Non-Coherent 0b0101 RSVD RSVD 0b0110 RSVD RSVD 0b0111 RSVD RSVD 0b1000 RSVD RSVD 0b1001 RSVD RSVD 0b1010 RSVD RSVD 0b1011 RSVD RSVD 0b1100 Non-coherent Bypass - Commands Non-coherent Bypass 0b1101 RSVD RSVD 0b1110 Response - Data Data Response 4.6.2.4 Opcode - 4b - PL & LL The Protocol Layer uses the opcode in conjunction with the Message Class to form the complete opcode. The Link Layer uses the opcode to distinguish between a Home Agent target and a caching agent target for messages when a Home Agent and a caching agent share the same NID. Additionally the Link Layer also uses the opcode to determine packet size. 4.6.2.5 Virtual Network (VN) - 2b - LL Virtual Network defines which virtual network a message is traveling in. In addition, for messages traveling in VNA, Virtual Network defines which virtual network (VN0 or VN1) the message should drain into in the event that VNA becomes blocked. Table 4-38. Virtual Network Encoding VN Encoding Virtual Network Drains To 0b00 VN0 N/A 0b01 VN1 N/A 0b10 VNA VN0 0b11 VNA VN1 4.6.2.6 VC Credit (VCCrd) - 3b - LL VC Credit returns virtual channel credits back to a sender agent via Huffman encoding. The VCCrd field, derived from the concatenation of VCCrd(2) and VCCrd(1:0), is considered a logical stream embedded in the packet format but independent from the protocol message stream. The Huffman encoding stretches across multiple packets for VN0 and VN1. Credits to be returned to the sender are encoded in the VC Credit field. Assume that the state machine starts at the idle state. If the first nibble is a VNA credit or a NOP, then the next nibble is decoded from the first/idle state. If it is a Continue X nibble, then the next nibble is decoded from the Continue X state. After a decode from the Continue X state, the next state is first/idle. For example, if the first nibble = 0b110 then the next state is Continue C. If the next nibble is 0b000 then the credit received will be a packet credit for the Home channel in VN1, and the next state will be first/idle. In the tables below, PriorNibble refers to the prior VC Credit nibble, and PriorPriorNibble refers to the nibble before the prior one. VN0/VN1 credit return nibbles could span packet boundaries. Table 4-39. VC Credit Field Encoding UP/DP Encoding VCCrd 0b000 0b001 0b010 0b011 0b100 0b101 0b110 0b111 PriorNibble = 0b0XX / PriorPriorNibble = 0b1XX First Nibble/ Idle NOP/Null VNA - 2 Flit Credit VNA - 8 Flit Credit VNA - 16 Flit Credit Continue A Continue B Continue C Continue D PriorNibble = Continue A VN0 - Home RSVD VN0 - NDR VN0 - DRS VN0 - SNP VN0 - NCB VN0 - NCS RSVD PriorNibble = Continue B Second Nibble VN0 - ICS VN1 - Home VN0 - IDS RSVD RSVD VN1 - NDR RSVD VN1 - DRS RSVD VN1 - SNP RSVD VN1 - NCB RSVD VN1 - NCS RSVD RSVD PriorNibble = Continue C PriorNibble = Continue D VN1 - ICS VN1 - IDS RSVD RSVD RSVD RSVD RSVD RSVD Table 4-40. VC Credit Field Encoding SMP/LMP First Nibble/ Idle Second Nibble VCCrd PriorNibble = Encoding PriorNibble = PriorNibble = PriorNibble = PriorNibble = 0b0XX / PriorPrior Continue A Continue B Continue C Continue D Nibble = 0b1XX 0b000 NOP/Null VN0 - Home RSVD VN1 - Home RSVD 0b001 VNA - 2 Flit Credit RSVD RSVD RSVD RSVD 0b010 VNA - 8 Flit Credit VN0 - NDR RSVD VN1 - NDR RSVD 0b011 VNA - 16 Flit Credit VN0 - DRS RSVD VN1 - DRS RSVD 0b100 Continue A VN0 - SNP RSVD VN1 - SNP RSVD 0b101 Continue B VN0 - NCB RSVD VN1 - NCB RSVD 0b110 Continue C VN0 - NCS RSVD VN1 - NCS RSVD 0b111 Continue D RSVD RSVD RSVD RSVD In the case of an Idle Flit, which has two VC Credit fields for faster credit flow, only one of the fields is allowed to send a VNA credit. More information on the Idle Flit format can be found in Section 4.7.1, “Special Packet Format” on page 4-173. The order for decode in the Idle Flit is VC Cred 0 and then VC Cred 1. Only flits that enter the retry buffer are allowed to have ack and credit fields. These fields should be ignored for flits that do not enter the retry buffer.
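The two-nibble decode described above can be sketched as a small state machine; this skeleton is illustrative only, the names are hypothetical, and the per-state credit tables are abbreviated (see Tables 4-39 and 4-40 for the full encodings).

    /* Sketch only: skeleton of the VCCrd nibble decoder. The first/idle
     * nibble either carries a NOP/VNA credit directly or selects a
     * Continue A-D state; the following nibble is then decoded in that
     * state and the machine returns to first/idle. */
    enum vcc_state { FIRST_IDLE, CONT_A, CONT_B, CONT_C, CONT_D };

    static enum vcc_state next_state(enum vcc_state s, unsigned nibble)
    {
        if (s != FIRST_IDLE)
            return FIRST_IDLE;       /* second nibble consumed */
        switch (nibble & 0x7) {
        case 0x4: return CONT_A;     /* 0b100 */
        case 0x5: return CONT_B;     /* 0b101 */
        case 0x6: return CONT_C;     /* 0b110 */
        case 0x7: return CONT_D;     /* 0b111 */
        default:  return FIRST_IDLE; /* NOP or a 2/8/16-flit VNA credit */
        }
    }

    int main(void)
    {
        enum vcc_state s = FIRST_IDLE;
        s = next_state(s, 0x6); /* 0b110: Continue C */
        s = next_state(s, 0x0); /* 0b000 in Continue C: VN1 Home credit */
        return s == FIRST_IDLE ? 0 : 1;
    }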
4.6.2.7 Address - 43b or 51b - PL
The Address Field contains the global system address. The Standard Header contains 43b of physical address. The Extended Header contains 51b of physical address. Address bits 5:3 are used to signify critical chunk order and, in certain I/O transactions, extend the addressing down to the 8B level. All coherent transactions must be 64 byte aligned and will return either 64B of data or 128B of data depending on the system line size. In the SpcIA-32 message, address bits 11:6 are used to encode the special cycle type. Please refer to Section 4.6.2.24, “Special Cycle Encoding - 6b - PL” on page 4-167.

4.6.2.8 Priority Encode (PE) - 2b - PL & LL
A 2b priority encode, with 0b00 being high and 0b11 being low. It indicates the priority for Isochronous traffic and requests.

4.6.2.9 Performance Hints (PH) - 2b - PL
In the UP/DP, bit 0 of PH is currently defined as a DRAM Page Policy hint. If the Page Policy hint bit is 0 then the page should be closed; if it is 1 then the page should be left open. Bit 1 of PH in the UP/DP is used as a CHAIN indication bit. Chain enables arbiters in the destination node to service the original request (of multiple 64B fragments) atomically. This minimizes latency impact to cacheline-size isochronous requests that traverse on CSI separately but are actually part of a larger multi-cacheline isochronous request.
• ‘1’ indicates the CSI request is not the last fragment of the original ISOC request.
• ‘0’ indicates the request in ICS is the last fragment of the original ISOC request.

4.6.2.10 Viral/Poison/Chain - 1b - LL
The Viral Alert is used to spread an error condition throughout the system in a contagious manner. Any node receiving a packet with the viral bit set will set the viral bit in the first flit of any outgoing header. The node may optionally take an interrupt to an error handler when it receives a packet with the viral bit set. The Poison bit indicates that the corresponding flit’s payload has experienced an uncorrectable error at some point along its path. The poison bit is only used in data payloads. In the second and third flits of a multi-flit header this bit should be set to 0b0.

4.6.2.11 Request Transaction ID (RTID) - 6b - PL
The RTID is used to uniquely identify the different requests from a single caching agent. Combined with the Requester Node ID and Home Node ID, it forms the Globally Unique Transaction ID (GUTID) for a packet.

4.6.2.12 Ack - 1b - LL
The Ack field is used by the Link Layer to communicate error-free receipt of flits from a receiver to a sender. When a sender receives an Ack it can deallocate the corresponding flits from the Link Level Retry Buffer. For more information on Link Level Retry refer to Section 4.9.2.1, “Link Level Retry” on page 4-193. Acks are sent in flit send/receive order.

Table 4-41. Ack Field Encoding
Ack Encoding  Meaning
0b0           No Ack Sent
0b1           8 Flits received without error

4.6.2.13 Requester/Home Node ID (RHNID) - 3b/5b/10b - PL
The RHNID identifies the original requester/initiator of a transaction in messages, except for messages targeting the original requester itself, where it identifies the Home Agent. The RHNID is supplied by the Protocol Layer.

4.6.2.14 Forward / Conflict Transaction ID (FCTID) - 6b - PL
The FCTID field is used in both Snoop Response messages and in Forward messages. In Snoop Response messages, the FCTID identifies the Requester Transaction ID of the request with which the sender conflicts. In Forward messages, the FCTID denotes the outstanding Request Transaction ID of the target for the forwarded message.
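Returning to the Viral Alert behavior of Section 4.6.2.10, a minimal sketch of the propagation rule follows; all type and function names are hypothetical:

    #include <stdbool.h>

    typedef struct { bool viral_set; } Header;
    typedef struct { bool contaminated; } Node;

    void on_packet_rx(Node *n, const Header *h) {
        if (h->viral_set) {
            n->contaminated = true;   /* node is now viral; it may optionally
                                         interrupt an error handler here     */
        }
    }

    void on_header_tx(const Node *n, Header *h) {
        if (n->contaminated)
            h->viral_set = true;      /* set in the first flit of any
                                         outgoing header                     */
    }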
4.6.2.15 Requester/Sender Node ID (RSNID) - 3b/5b/10b - PL
The RSNID is used both in Snoop Response messages, where it identifies the sender of the snoop response, and in Forward messages, where it identifies where the forwarded data should be sent.

4.6.2.16 Destination Node ID (DNID) - 3b/5b/10b - PL & LL
The DNID identifies the destination of a packet. The DNID is supplied by the Protocol Layer.

4.6.2.17 Scheduled Data Interleave - 1b - LL
The Scheduled Data Interleave field is used to indicate that the data packet will be sent in a scheduled manner, interleaved with another data packet. If Scheduled Data Interleave is not enabled, this field should always be read-as-zero/set-as-zero.

Table 4-42. Scheduled Data Interleave Encoding
SDI Encoding  Meaning
0             Data packet is not interleaved in a scheduled manner
1             Data packet is interleaved in a scheduled manner

4.6.2.18 Transfer Size - 2b - PL
Transfer Size is used to indicate the size of a read request in messages that use an extended header. In the standard header the size of the read is assumed to be a cacheline.

Table 4-43. Transfer Size Encoding
Extended Header?  Transfer Size Encoding  Size of Read Request
No                N/A                     Cacheline
Yes               11                      0-8 Bytes
Yes               10                      16 Bytes
Yes               01                      32 Bytes
Yes               00                      Cacheline

4.6.2.19 Interleave/Head Indication Bit (IIB) - 1b - LL
The IIB is used for two purposes. It indicates that the flit is the start of a new packet. The IIB bit is also used to indicate the start of an interleaved Link Layer Special flit or, when Command Insert Interleave is enabled, the interleaving of a command packet into the data portion of a data packet. The IIB bit should be set for the first flit of all packets.
• The IIB is set in the first flit of any header packet.
• The IIB is not set in the second or third flit of any header.
• The IIB is not set in any data flit.
For the rules related to interleave, refer to Section 4.8, “Flit Interleave” on page 4-187. In the case of a system that doesn’t contain lane 17, the Link Layer is responsible for keeping track of the start of a new packet, and command interleave should not be enabled.

4.6.2.20 Traffic Class - 4b - LL
QoS extensions are modeled on the PCI-Express “Traffic-Class” concept. They enable usage of dedicated (PCI-Express) “Virtual-Channels” for differentiated traffic-classes, across the CSI fabric, in a manner compatible with PCI-Express. Virtual channels provide dedicated buffering and arbitration to differentiated traffic-classes, under “system-software” control. QoS extensions are applied to devices' cycles to system memory, as well as to peer-to-peer device transactions. A 4b “Traffic-Class” request attribute is added in all request channels: Home, SNP, NCS, NCB and ICS, in the “Extended-Address” header format.

4.6.2.21 Tunnel Type - 4b - PL
TBD. Denotes the type of tunneled information.

4.6.2.22 Virtual Wire (VW) Type - 4b - PL
The encoding for VW Type is TBD. VW Type will be used to indicate the type of virtual wire signals being sent. The intended purpose is to allow the virtual wire messages to be used both for the elimination of external legacy pins and for indications/information about current transactions.

4.6.2.23 Response Required (Rsp Req) - 1b - PL
Denotes whether or not a response is required. Used in conjunction with Spc*VLW messages to denote whether a response is required. The intended usage is to allow transaction-behavior messages to be sent that may or may not require a response. For legacy virtual wire messages this bit should be set.
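For illustration, a small decode of the Transfer Size encoding of Table 4-43 (Section 4.6.2.18); the function name and return convention are assumptions made for this sketch:

    #include <stdbool.h>

    /* Returns the read-request size in bytes; 0 denotes a full cacheline
       (64B or 128B, per the system line size), -1 a variable 0-8B request. */
    int transfer_size_bytes(bool extended_header, unsigned ts2) {
        if (!extended_header) return 0;   /* standard header: cacheline */
        switch (ts2) {
        case 0x3: return -1;              /* 11: 0-8 bytes              */
        case 0x2: return 16;              /* 10: 16 bytes               */
        case 0x1: return 32;              /* 01: 32 bytes               */
        default:  return 0;               /* 00: cacheline              */
        }
    }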
4.6.2.24 Special Cycle Encoding - 6b - PL
The special cycle encoding is done using address bits 11:6 in the SpcIA-32 message. The encoding is outlined in Table 4-44.

Table 4-44. Special Cycle Encoding - 6b - PL
Address (11:6)     Meaning
00 0000            NOP
00 0001            Shutdown
00 0010            INVD_ack
00 0011            HALT
00 0100            WBINVD_Ack
00 0101            STPCLK_Ack
00 0110            SMI_Ack
00 0111 - 00 1111  RSVD

4.6.2.25 Response Status - 2b - PL
The Response Status field in Non-Data Response messages indicates the status of the response.

Table 4-45. Response Status - 2b - PL
Encoding  Value
00        Normal
01        Abort timeout
10        Reserved
11        Failed

4.6.2.26 Response Data State - 4b - PL
Response Data State indicates what state the returned data is in. If none of the bits are set then the response state is the Invalid state.

Table 4-46. Response Data State - 4b - PL
Bit Position  State
3             Modified
2             Exclusive
1             Shared
0             Forwarding

For the currently defined protocol flows the 4 states are mutually exclusive, but in the future multiple additional codings can be defined. If no state is inserted then the assumed state is the Invalid state.

Table 4-47. Response Data State Encoding
Bit Vector  State
0b1000      Modified
0b0100      Exclusive
0b0010      Shared
0b0001      Forwarding
0b0000      Invalid (Non-coherent)

4.6.2.27 Virtual Legacy Wire Value - 11b - PL
For more information on Virtual Legacy Wire, see Section 9.10.4, “Virtual Legacy Wire (VLW) Transactions” on page 9-323.

4.6.2.28 OEM Defined Bits - 3b - LL/PL
There are 3 bits in the Extended headers that are defined for use by OEMs. These three bits should be set as zero and read as zero by all Intel devices.

4.6.3 Mapping of the Protocol Layer to the Link Layer
Any opcodes not explicitly defined are RSVD for future use.
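As a non-normative sketch, the Response Data State encodings of Table 4-46 and Table 4-47 decode as follows; the enum and function names are illustrative only:

    typedef enum { ST_INVALID, ST_MODIFIED, ST_EXCLUSIVE,
                   ST_SHARED, ST_FORWARDING } RspDataState;

    RspDataState decode_rsp_data_state(unsigned bits4) {
        switch (bits4) {
        case 0x8: return ST_MODIFIED;    /* 0b1000 */
        case 0x4: return ST_EXCLUSIVE;   /* 0b0100 */
        case 0x2: return ST_SHARED;      /* 0b0010 */
        case 0x1: return ST_FORWARDING;  /* 0b0001 */
        default:  return ST_INVALID;     /* 0b0000; other combinations are
                                            not currently defined           */
        }
    }

Table 4-48.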
Mapping of the Protocol Layer to the Link Layer UP/DP/SMP/LMP Message Class Message Type Name Message Class Encodin g Packet Format Flits Dat a Size Allowe d VN(s) Opcod e Snoop Channel (Snp) Snoop SnpCur 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0000 SnpCode 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0001 SnpData 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0010 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0011 SnpInvOwn 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0100 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0101 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0110 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 0111 SnpInvItoE 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1000 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1001 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1010 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1011 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1100 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1101 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1110 RSVD 3 - SNP SA or EA 1 or 2 N/A 0, 1, A 1111 Home Channel (Home) Request RdCur 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0000 RdCode 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0001 RdData 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0010 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0011 RdInvOwn 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0100 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0101 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0110 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 0111 InvItoE 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1000 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1001 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1010 RSVD 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1011 WbMtoI 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1100 WbMtoE 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1101 WbMtoS 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1110 Ref No xxxxx 169 Intel Restricted Secret CSI Link Layer CSI Link Layer Message Class Message Type Name Message Class Encodin g Packet Format Flits Dat a Size Allowe d VN(s) Opcod e AckCnflt 0 - HOM SA or EA 1 or 2 N/A 0, 1, A 1111 Snoop Response RspI 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0000 RspS 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0001 RSVD 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0010 RSVD 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0011 RspCnflt 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0100 RSVD 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0101 RspCnfltOwn 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0110 RSVD 1 - HOM SCC or ECC 1 or 2 N/A 0, 1, A 0111 RspFwd 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1000 RspFwdI 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1001 RspFwdS 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1010 RspFwdIWb 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1011 RspFwdSWb 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1100 RspIWb 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1101 RspSWb 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1110 RSVD 1 - HOM SCA or ECA 1 or 2 N/A 0, 1, A 1111 Response Channel - Data (DRS) Data Response DataC_(F or S) 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0000 DataC_E 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0000 DataC_M 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0000 DataC_I 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0000 DataNc 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0011 DataC_(F or S)_FrcAckCnflt 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0001 DataC_E_FrcAckCnflt 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0001 DataC_(F or S)_Cmp 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0010 DataC_E_Cmp 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0010 DataC_I_Cmp 14 - DRS SDR or EDR 9 or 10 64 0, 1, A 0010 WbIData 14 - DRS SDW or EDW 9 or 10 64 0, 1, A 0100 WbSData 14 - DRS SDW or EDW 9 or 10 64 0, 1, A 0101 WbEData 14 - DRS SDW or EDW 9 or 10 64 0, 1, A 0110 RSVD 14 - DRS SDW or EDW 9 or 10 64 0, 1, 
A 0111 WbIDataPtl 14 - DRS EBDW 11 0-64 0, 1, A 1000 170 Ref No xxxxx Intel Restricted Secret Table 4-48. Mapping of the Protocol Layer to the Link Layer UP/DP/SMP/LMP Message Class Message Type Name Message Class Encodin g Packet Format Flits Dat a Size Allowe d VN(s) Opcod e RSVD 14 - DRS EBDW 11 0-64 0, 1, A 1001 Response Channel Non Data (NDR) Grants GntE_Cmp 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 0000 GntE_FrcAckCnflt 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 0001 Completions and Forces Cmp 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1000 FrcAckCnflt 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1001 Cmp_FwdCode 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1010 Cmp_FwdInvOwn 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1011 Cmp_FwdInvItoE 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1100 RSVD 2 - NDR SCC or ECC 1 or 2 N/A 0, 1, A 1101 Misc CmpD 2 - NDR SCD or ECD 1 or 2 N/A 0, 1, A 0100 Non Coherent Bypass (NCB) NcWr 12 - NCB SDW or EDW 9 or 10 64 0, 1, A 0000 WcWr 12 - NCB SDW or EDW 9 or 10 64 0, 1, A 0001 RSVD 12 - NCB SDW or EDW 9 or 10 64 0, 1, A 0010 RSVD 12 - NCB SDW or EDW 9 or 10 64 0, 1, A 0011 NcMsgB 12 - NCB NCM 11 64 0, 1, A 1000 IntLogical 12 - NCB EBDW 11 64 0, 1, A 1001 IntPhysical 12 - NCB EBDW 11 64 0, 1, A 1010 RSVD 12 - NCB EBDW 11 64 0, 1, A 1011 NcWrPtl 12 - NCB EBDW 11 64 0, 1, A 1100 WcWrPtl 12 - NCB EBDW 11 64 0, 1, A 1101 NcP2PB(LMP/SMP) else RSVD 12 - NCB P2P Tunnel 11 64 0, 1, A 1110 RSVD 12 - NCB EBDW 11 64 0, 1, A 1111 Non NcRd 4 - NCS SA or EA 1 or 2 N/A 0, 1, A 0000 Coherent Standard (NCS) IntAck 4 - NCS SA or EA 1 or 2 N/A 0, 1, A 0001 RSVD 4 - NCS SA or EA 1 or 2 N/A 0, 1, A 0010 RSVD 4 - NCS SA or EA 1 or 2 N/A 0, 1, A 0011 NcRdPtl 4 - NCS EA 2 N/A 0, 1, A 0100 NcCfgRd 4 - NCS EA 2 N/A 0, 1, A 0101 NcLTRd 4 - NCS EA 2 N/A 0, 1, A 0110 Ref No xxxxx 171 Intel Restricted Secret CSI Link Layer CSI Link Layer Message Class Message Type Name Message Class Encodin g Packet Format Flits Dat a Size Allowe d VN(s) Opcod e NcIORd 4 - NCS EA 2 N/A 0, 1, A 0111 RSVD 4 - NCS EIC 3 8 0, 1, A 1000 NcCfgWr 4 - NCS EIC 3 8 0, 1, A 1001 NcLTWr 4 - NCS EIC 3 8 0, 1, A 1010 NcIOWr 4 - NCS EIC 3 8 0, 1, A 1011 NcMsgS 4 - NCS NCM 3 8 0, 1, A 1100 NcP2PS 4 - NCS P2P Tunnel 3 8 0, 1, A 1101 RSVD 4 - NCS RSVD 3 8 0, 1, A 1110 Isoch Data Stream (IDS) Isoch Command Stream (ICS) TL_ACK/NACK IsochDataRsp IsochDataWr IsochDataWrPtl IsochCmdRd IsochCmdRdCoh IsochCmdRdConsis IsochCmdRdCohConsis IsochCmdWr IsochCmdWrCoh IsochCmdWrConsis IsochCmdWrCohConsis 4 - NCS 13 - IDS 13 - IDS 13 - IDS 5 - ICS 5 - ICS 5 - ICS 5 - ICS 5 - ICS 5 - ICS 5 - ICS 5 - ICS EIC SDR SDW EBDW SA SA SA SA SA SA SA SA 3 9 9 11 1 1 1 1 1 1 1 1 8 64 64 0-64 N/A N/A N/A N/A N/A N/A N/A N/A 0, 1, A 0 0 0 0 0 0 0 0 0 0 0 1111 0000 0100 1000 0000 0001 0010 0011 0100 0101 0110 0111 4.6.4 Width Reduction This feature is to enable a link to work in a degraded mode when the physical channel has excessively failing signals. When an unrecoverable or intermittent error occurs, the link initiates a discovery phase to find the failed lanes and goes through a retraining sequence to configure itself into a reduced width mode. The exact final configuration is negotiated between the connected Link Layer agents. This process is explained in more detail in the Physical Layer portion of the specification. For the purpose of a re-configuration after a lane failure, the link is divided into segments (either halves or quarters). The half width reduction will try to select the working segments and combine them to get a half width link. 
Details of the segments, and the priority for finding a working set of segments, are left to the specific part and platform specifications, but all components that support width reduction must at least support half width mode. Possible segment configurations must be negotiated at link initialization time. Width reduction is also supported for power savings, and details will be provided in a future revision of the specification.

4.6.5 Organization of Packets on the Physical Layer
Please see Table 4-66, Table 4-67, and Table 4-68.

4.7 Link Layer Control Messages
The Link Layer uses an additional virtual channel for link-to-link control messages. These link-to-link control messages are used for error correction (link level retry), power management, system configuration, initialization, debug, and idle flits during periods when the link is idle. The following sections describe the format of the Link Layer control and the messages that use it.

4.7.1 Special Packet Format
The Special Packet Format is used on the Link Layer Control channel for link-agent to link-agent communication such as Link Level Retry messages. Special Packets are denoted by a Virtual Network encoding of 0b1X and a Message Class of 0b1111. Special Packets with the MSB of the opcode equal to 0 do not enter the Retry Buffer, but all others do.

Table 4-49. Generic form for Special Packet, ISP
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 Payload 0b1111 Opcode (3:0) 0b1x Payload CR C 4 CR C 0 Payload CR C 5 CR C 1 0b1 IIB RSV D Payload CR C 6 CR C 2 Payload CR C 7 CR C 3

Table 4-50. Opcode Encoding for Special Packet
Opcode Encoding  Flit Type  Link Layer Control Type     Enters Retry Buffer / Contains Ack/Credit
0b0000           CTRL Flit  Null Ctrl Flit              No
0b0001           CTRL Flit  Link Level Retry Ctrl Flit  No
0b0010           CTRL Flit  RSVD                        No
0b0011           CTRL Flit  System Management           No
0b0100           CTRL Flit  Parameter Exchange          No
0b0101           CTRL Flit  Sync Flit                   No
0b0110           CTRL Flit  Error Indication            No
0b0111           CTRL Flit  Debug                       No
0b1000           IDLE Flit  Idle Credit Flit            Yes
0b1001           IDLE Flit  RSVD/LT Link Layer Message  Yes
0b1010           IDLE Flit  RSVD/Power Management       Yes
0b1011 - 0b1111  IDLE Flit  RSVD                        Yes

4.7.2 Null Ctrl Flit
The Null Ctrl flit is a special flit that does not enter the Retry Buffer. The Null Ctrl flit has the special property that all 4 phits in the flit are exactly the same, thereby allowing the Physical Layer to pre-load the link between any two agents with the null data in a simple manner. Additionally, the Null Ctrl Flit is used in place of Idle flits during link initialization and during link level retry, because it doesn’t enter the retry buffer.

Table 4-51. Null Ctrl Flit
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1

4.7.3 Link Level Retry Ctrl Flit
The link level retry ctrl flit is used to send link level retry messages. More detail on these messages and their use can be found in Section 4.9.2.1, “Link Level Retry” on page 4-193.

Table 4-52. Link Level Retry Messages
Type Encoding  Message             Message Description
0b00000        LLR.Idle/Null Flit  True NOP packet; no value transferred.
0b00001        LLR.Req             Data Field (7:0) contains the retry sequence number; Data Field (11:8) contains the requester's link width.
0b00010        LLR.Ack             Data Field (7:0) contains the Wr.Ptr value of the retry buffer, for debug purposes; Data Field (11:8) contains the sender's link width.
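As a small illustration of the Special Packet rules above (Tables 4-49 and 4-50), a predicate for whether a Special Packet enters the Retry Buffer; the function name is hypothetical:

    #include <stdbool.h>
    #include <stdint.h>

    /* Per Section 4.7.1 and Table 4-50: only Special Packets whose opcode
       MSB is 1 (the IDLE Flit group, 0b1000 - 0b1111) enter the Retry
       Buffer and carry ack/credit fields. */
    static inline bool special_enters_retry_buffer(uint8_t opcode4) {
        return (opcode4 & 0x8) != 0;
    }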
4.7.4 Power Management Ctrl Flit
These messages are used by the power management logic for link level power management. For detailed descriptions and their use, please reference Section 15.1, “Link Power Management” on page 15-435.

Table 4-53. Power Management Ctrl Flit
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 RSVD VC Crd1 (1:0) 0b1111 0b1010 0b1x VC Crd0 (1:0) CR C 4 CR C 0 RSVD Ack 1 Type (4:0) RSVD Ack 0 CR C 5 CR C 1 0b1 IIB 0b0 Data Field (15:0) CR C 6 CR C 2 RSVD Data Field (31:16) CR C 7 CR C 3

Table 4-54. Power Management Link Messages
Type Encoding  Message             Message Description
0b00000        PM.LinkL0sConfig    Data Field (16:0) contains the 16b floating point wake time
0b00001        PM.LinkEnterL1
0b00010        PM.LinkReqAck
0b00011        PM.LinkReqNack
0b00100        PM.LinkEnterL0s     Data Field (11:0) contains the L0s exit time as a multiple of 16 UI; Data Field (16) states whether active lanes or inactive lanes are being configured
0b00101        PM.LinkWidthConfig  Lane Map in Data Field (3:0)

4.7.5 System Management Ctrl Flit
TBD, do we even have a use for it???

4.7.6 Parameter Exchange Ctrl Flit
The parameter exchange ctrl flit is used during link initialization to transfer configuration information.

Table 4-55. Parameter Exchange Messages
Type Encoding  Message                     Message Description
0b00000        PE.ReadyForInit             Interlock message 1
0b01000        PE.Parameter0               Table 4-56, “PE.Parameter0”
0b01001        PE.Parameter1               Table 4-57, “PE.Parameter1”
0b01010        PE.Parameter2               Table 4-58, “PE.Parameter2”
0b01011        PE.Parameter3               Table 4-59, “PE.Parameter3”
0b01100        PE.Parameter4               Table 4-60, “PE.Parameter4”
0b11110        PE.ReadyForNormalOperation  Interlock message 2
0b11111        PE.BeginNormalOperation     Beginning of normal operation with exchanged parameters

Table 4-56. PE.Parameter0
Data Field  Size  Name                            Meaning
31          1b    RSVD
30          1b    Command Insert Interleave       This agent can receive Command Insert Interleave
29          1b    RSVD/Scheduled Data Interleave  RSVD / This agent can receive a Scheduled Data Interleave
28:27       2b    CRC Mode                        00 - RSVD; 01 - 8b CRC; 10 - 16b rolling CRC; 11 - RSVD
26:19       8b    LLR Wrap Value
18:17       2b    Cache Line Size
16:10       7b    Node ID Mask                    Master/CSR Node ID
9:7         3b    # Node IDs                      1-8 node IDs
4:0         5b    Port #

The # of Node IDs field does not require that the NIDs be contiguous, just that they are within an 8-NID range; from 1 to 8 NIDs are allowed to be allocated, in any order, and with gaps between the NIDs that are used.
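A sketch of unpacking the PE.Parameter0 payload of Table 4-56; the helper macro and variable names are illustrative assumptions:

    #include <stdint.h>

    #define FIELD(v, hi, lo) (((v) >> (lo)) & ((1u << ((hi) - (lo) + 1)) - 1u))

    void unpack_pe_parameter0(uint32_t p0) {
        unsigned cmd_insert = FIELD(p0, 30, 30); /* Command Insert Interleave   */
        unsigned sdi        = FIELD(p0, 29, 29); /* Scheduled Data Interleave   */
        unsigned crc_mode   = FIELD(p0, 28, 27); /* 01: 8b CRC, 10: 16b rolling */
        unsigned llr_wrap   = FIELD(p0, 26, 19); /* LLR Wrap Value              */
        unsigned cl_size    = FIELD(p0, 18, 17); /* Cache Line Size             */
        unsigned nid_mask   = FIELD(p0, 16, 10); /* Node ID Mask                */
        unsigned num_nids   = FIELD(p0,  9,  7); /* # Node IDs (1-8)            */
        unsigned port_num   = FIELD(p0,  4,  0); /* Port #                      */
        (void)cmd_insert; (void)sdi; (void)crc_mode; (void)llr_wrap;
        (void)cl_size; (void)nid_mask; (void)num_nids; (void)port_num;
    }

Table 4-57.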
PE.Parameter1 Data Field Size Meaning 31 1b RSVD 30 1b RSVD 29 1b RSVD 28 1b RSVD 27 1b RSVD 26 1b RSVD 25 1b RSVD 24 1b RSVD 22:19 4b PDFA Supported 18:17 2b PDFA Requested 16:13 4b PDFB Supported 12:11 2b PDFB Requested 10:7 4b PDFC Supported 7:6 2b PDFC Requested 5:4 2b Critical Chunk Size 3:0 4b RSVD Ref No xxxxx 177 Intel Restricted Secret CSI Link Layer CSI Link Layer Data Field Size Meaning 31 1b Agent 000 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 001 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 010 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 011 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 178 Ref No xxxxx Intel Restricted Secret Table 4-59. PE.Parameter3 Data Field Size Meaning 31 1b Agent 100 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 101 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 110 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD 31 1b Agent 111 Types Peer Agent 30 1b Home Agent 29 1b I/O Agent 28 1b RSVD/LT Agent 27 1b Hierarchical Agent 26 1b Switch Agent 25 1b Firmware 28 1b RSVD Ref No xxxxx 179 Intel Restricted Secret CSI Link Layer CSI Link Layer Data Field Size Meaning 31 1b IOQ1 30 1b MCERR# Disable 29 1b BINIT Obs Off 28:27 2b APIC CLUSTER ID (1:0) 26 1b Bus Park Off 25:18 8b CLK Ratio (7:0) 17 2b Agent ID (1:0) 16 1b Lt Enable 15 1b MP Init Disable 14 1b Cache Init Off 13 1b Config Restart 12 1b Burn In Init 11 1b MT Disable 10 1b Swap Primary Thread 9 1b SCT Disable 8 1b Dynamic Bus Inv Dis 7:0 8b RSVD 4.7.7 Sync Flit TBD, we probably need this but don’t yet have a good definition 4.7.8 Error Indication TBD, don’t know if we need it. Anyone want to write a definition? 4.7.9 Debug The Link Layer defines 4 debug message types. The other 28 Standard Debug Message type are reserved for future CSI general debug packet type extensions. Please refer to a product’s specification for the correct usage of these other debug types. NOTE: Future CSI Specifications may define debug packets that are exposed to other layers of CSI. Debug Type [4:0]: Encoding for 32 Debug packet types. Encodings for initial CSI standard functions are provided in Table 4-61 “Standard Debug Messages” on page 4-181. Debug Packets are essential to expose internal states of CSI agents that are otherwise inaccessible. The contents of debug packets is implementation specific. Contents could include things like branch info (source and target IPs), time stamps, indication of an internal event trigger, internal 180 Ref No xxxxx Intel Restricted Secret node values on the occurrence of an internal event, information useful to create Long-Instruction Traces (LIT) etc. The exposed data is typically captured by observability agents like Logic analyzers for post-processing and failure analysis. Table 4-61. 
Standard Debug Messages
Debug Type Encoding (4:0)  Message                               Message Description
0b00000                    Generic Debug Packet                  Carries debug information exposed in an opportunistic manner
0b00001                    Inband Debug Event                    Packet mainly used to expose the occurrence of internal debug events, but optionally could carry information related to the event being exposed
0b00010                    Timing Correlation Packet (Optional)  Carries timing information to assist with tracing and correlation at the Physical layer to Link layer boundary
0b00011                    RSVD                                  RSVD
0b00100 - 0b11111          RSVD                                  RSVD

The format of the generic Debug Ctrl Flit is given below:

Table 4-62. Generic Debug Ctrl Flit
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 0b00000 0b1111 0b0111 0b1x RSVD CR C 4 CR C 0 Debug Field 1(17:0) CR C 5 CR C 1 0b1 0b0 Debug Field 2(15:0) CR C 6 CR C 2 Debug Field 3(17:0) CR C 7 CR C 3

4.7.9.1 Requester Rules
• Debug packets are sent by the Link Layer on an opportunistic basis (exceptions to this are noted below). The Link Layer should replace only Null flits with these, so as to not disturb or add additional traffic on the CSI fabric.
• The Debug packets are a one-sided communication mechanism without any confirmation of receipt.
• In general the mechanisms used to populate the fields of the Debug packets will be implementation specific (the exceptions are the trigger field of the Inband Debug Event Packet and the Relative Timing Packet). For example, implementations can choose to create a buffering scheme in the Link Layer that matches a debug packet format, and have some packet formatter logic fill in the fields as and when debug info is sent from various parts of the chip, and then a signal to send the packet.
• If more debug packets are sent than can be buffered at the receiver, the method of handling the additional packets is implementation dependent. An implementation can either discard the additional Debug packets or overwrite earlier Debug packets.
• Priority Debug (Inband Debug Event) packets require the Link Layer to guarantee delivery within a fixed latency or number of clocks from the time an internal event occurs. The latency is implementation dependent, but for a given implementation is required to be a fixed value. Even though implementations have a bit of latitude here, it is very important to keep the latency as low as possible.
• Priority packets remain pending across low power states and get sent once the CSI fabric is out of the low power state.
• Priority Packets can be preempted during Link Training and the initialization parameter exchange period and sent soon afterwards - again, a fixed latency is required.
• Priority packets can be preempted if there are other synchronization packets scheduled.
• Priority Packets can be preempted if the last packet sent was a priority packet and other packets are ready for dispatch. This is to prevent Debug packets from blocking other packet transmission and to keep disturbance to a minimum.
• On CSI agents with multiple links, implementations should provide Debug packet support on all the links. What information gets sent on which link is implementation specific.

4.7.9.2 Inband Debug Event Ctrl Flit

Table 4-63.
Inband Debug Event Ctrl Flit
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 Debug Type = 0b00001 0b1111 0b0111 0b1x RSVD CR C 4 CR C 0 Debug Value (17:0) CR C 5 CR C 1 0b1 0b0 Debug Event (15:0) CR C 6 CR C 2 Debug Value (35:18) CR C 7 CR C 3

The Inband Debug Events sub-type of Debug Special Packets provides a low latency transport for inband debug events and values from CSI link agents across the link to monitoring trace tools and opposite link agents. These events and values feature a simple generic semantic to maximize inter-device and tool flexibility and compatibility. Usage of debug events and values shall vary based on the particular debug/validation scenario, the capabilities/limitations of individual devices to source, use, and transport these packets, and the capability of the external tool chain to program these distributed features in system devices and external tools.

4.7.9.2.1 Immediate Transmission
These packets are used to expose internal events and values used for debug of devices and systems, and therefore must be transmitted as unblocked and with as low a latency as possible. CSI Link Layer agents transmit an Inband Debug Event Packet at the next opportunity when one or more local events are accumulated. This includes interleaving of Inband Debug Event Packets into packets already in transmission, as defined in Section 4.8, “Flit Interleave” on page 4-187. The only exceptions to this rule for Inband Debug Event packets are:
• While the link is not able to carry traffic, such as while powered down, in reduced power mode, etc. During these times asserted events continue to accumulate until they can be transmitted as soon as the link is able to carry traffic.
• During link initialization training states and before any non-preemptable initialization parameter exchange has completed.
• When a Physical Layer retraining burst is being transmitted.
• When the last packet transmitted was an inband event packet and any other type packet is immediately ready for transmission. This prevents inband debug event packets from completely blocking other traffic.

4.7.9.2.2 DebugEvent(15:0): Inband Debug Events
Inband Debug Events transport sixteen independent, generic event pulses between link agents and to trace tools, to allow interactions between debug infrastructure distributed in multiple devices.

4.7.9.2.3 Sourcing Inband Debug Events
Devices incorporating CSI Link Agents can include debug control registers to specify independent selection of device-local internal debug events as sources for up to 16 event positions in the debug packet. CSI Link Agents accumulate assertions (leading edge) of selected local events until an inband event packet can be transmitted carrying all events accumulated to that point. Assertion of Event Bits is also set in packets to indicate that the Debug Value field(s) are valid in the packet. It is essential that events transmitted in debug event packets can be passed in any of the event bit positions, i.e. not “hard wired” to specific local event source functions, as hard wiring would inevitably lead to inter-device incompatibilities. CSI Link Agents are required to support transmission of debug packets with a minimum of the first eight debug events (7:0), setting any non-supported event positions to zero, although this will result in limiting debug infrastructure flexibility.
4.7.9.2.4 Semantics for Inband Debug Events in Packets
Inband debug events in the resulting packets have the semantic of a one-time event pulse for each event bit set to one. Bits set to zero indicate that the selected local event at the source was not asserted up to the point of the packet transmission. Each Inband Debug Event Packet carries any combination of 1 to 16 different events, and an event packet with no event bits set is invalid. Multiple assertions of an individual local event in the transmitting agent, before an event packet can be transmitted, shall result in the redundant event assertions (all following the first one) being dropped without error or notification. The usage of inband events by trace tools and receiving agents must therefore tolerate loss of redundant events in the sourcing agent.

4.7.9.2.5 Decode and Application of Inband Debug Events
Receiving link agents and trace tools monitoring links are required to decode these packets to recover the Inband Debug Events for local use. When an Inband Debug Event Packet is decoded, any asserted Debug Events are translated into unique event pulses which may selectively (as defined by debug control registers) be applied as stimuli for device-specific response mechanisms. Generally the received events shall be routed into the device-local “event pool” for possible selection, processing, and then used to drive event debug mechanisms. Reception of asserted Debug Events associated with Debug Values may be used to control capture of the associated value for use in the receiving device. As in event packet sources, it is essential that events transported by debug event packets be received as generic, i.e. not “hard wired” to specific functions in receiving link agents, as hard wiring would inevitably lead to inter-device incompatibilities. CSI agents are required to support reception of debug packets with a minimum of the first eight debug events (7:0), and may ignore asserted events in higher positions, although this will result in limiting debug infrastructure flexibility.
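A minimal sketch of the sourcing-side accumulation described in Sections 4.7.9.2.3 - 4.7.9.2.5; the transmit hook and function names are hypothetical:

    #include <stdint.h>

    void send_inband_debug_event(uint16_t events);   /* hypothetical hook */

    static uint16_t pending;        /* DebugEvent(15:0) accumulator */

    void on_local_event(unsigned pos /* 0..15 */) {
        pending |= (uint16_t)(1u << pos);   /* latch leading edge; redundant
                                               re-assertions before the packet
                                               is sent are dropped            */
    }

    void on_tx_opportunity(void) {
        if (pending) {                      /* an all-zero event field would
                                               make the packet invalid        */
            send_inband_debug_event(pending);
            pending = 0;                    /* later assertions start a new
                                               one-time pulse                 */
        }
    }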
4.7.9.2.6 DebugValue(35:0): Inband Debug Values
Inband Debug Values transport one or more independent, generic debug values between link agents and to trace tools, to immediately expose specific internal sets of bits. This mechanism is used to reveal values that can only be derived or observed internal to a device, for capture in link traces. The values can also provide an input parameter for debug mechanisms which require information from another device. Program-selected event bits are designated for each value field (or set of fields) in the source device, with the corresponding event bit set in packets when the value is valid.

4.7.9.2.7 Sourcing Inband Debug Values
One or more debug values are control-register selected for transmission in Inband Debug Event packets. Each packet provides a total of 36 bits for debug values, with values from different internal sources permitted to occupy sub-fields of the value payload. Devices are permitted to implement variable numbers and locations of values exposed in the payload, depending on debug needs and device capabilities. For example, a device may have a selectable mode for exposing two 16 bit values in DebugValue[15:0] and DebugValue[33:18] and a 4 bit field in DebugValue[35:34,17:16]. In this example the device might expose all 3 values in each Inband Debug Event Packet, or it could expose each independently. An alternate mode for the same device might expose a single 36 bit value. Note that since the external debug infrastructure (tools and users) explicitly selects the debug values and modes for these mechanisms, the debug packets are not required to include information identifying the particular values carried. On the other hand, devices and tools are permitted to use part of the value field as a field source ID (supplied by the sourcing agent) in cases where multiple values, too large to expose simultaneously, are required. For this usage, external tools and the sourcing device must share a mapping from the exposed IDs to the specific internal sources. Debug values selected for transport by debug event packets should not be “hard wired”, as there shall usually be several value sources which are useful to expose at different times, and hard wiring values to specific functions in receiving link agents would inevitably lead to inter-device incompatibilities.

4.7.9.2.8 Standardized Debug Base Value
Alignment of fields within the DebugValue(35:0) payload is free format, with one exception. Inband Debug Event Packets shall support standardized passing of a generic value normalized to minimize overhead in a receiving link agent directly using the value in local debug mechanisms. This requires a “base value” orientation, defined to be LS aligned (i.e. the LSB of the “base value” located in the LSB of the DebugValue) and occupying sequentially higher order bits for the full width of the value. As a result of this constraint, CSI agent designs must pre-determine which values they might be required to expose with the “base value” constraint.

4.7.9.2.9 Value Field Validity Signaling Using Inband Debug Events
For each independently exposed value (field or set of fields), a sourcing device must have a control-register-selected Debug Event bit to be set in the packet indicating when the value(s) are valid in the packet. Event Bit to value field correspondence must be program selectable, as a “hard wired” association would lead to inter-device incompatibility.

4.7.9.2.10 Decode and Application of Inband Debug Values
Receiving link agents and trace tools monitoring links are required to decode these packets to recover the Inband Debug Values when the corresponding Debug Events are set. Generally the received debug value shall be captured, then routed directly to the debug mechanism requiring the value. For example, a packet pattern matching mechanism in device A may receive a value (for example the Request Transaction ID for a particular transaction) sent from device B, to be used as part of the pattern match function, allowing differentiation of a particular packet from others identical except for the value passed. Debug values transported by debug event packets shall be generic on the link, but device specific by content and in use at the receiving agent. Since there shall often be several value sources which are useful to expose at different times, debug value application should not be “hard wired” to specific functions in receiving link agents, as that would inevitably lead to inter-device incompatibilities.

4.7.9.3 Debug Relative Timing Exposure Ctrl Flit

Table 4-64.
Debug Relative Timing Exposure Ctrl Flit
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 Debug Type = 0b00010 0b1111 0b0111 0b1x RSVD CR C 4 CR C 0 RcvdPhase (7:0) RcvdTime (9:0) CR C 5 CR C 1 0b1 0b0 RcvdID (15:0) CR C 6 CR C 2 XmitPhase (7:0) XmitTime (9:0) CR C 7 CR C 3

The Debug Relative Timing Exposure sub-type of Debug Special Packets provides exposure on the CSI device-relative time stamps and transfer phase at the boundary between the Link and Physical Layers, so that externally captured traces can be correlated to that same boundary by post-processing SW. This feature requires that each device implement:
• A link-side time stamp common across all CSI links in the same device
• A transfer phase encoder to precisely identify the packet transfer phase/pattern for receive and transmit directions, for cases where the Link and Physical Layers run at different frequencies
• Registers to capture the precise time stamp, phase, and received ID of the latest Relative Timing Exposure packet at each link
• A counter to determine the length of time from the last transmission of a Relative Timing Exposure packet
• A mechanism to schedule, compose and transmit Debug Relative Timing Exposure packets as substitutes for some Null Control packets, or to transmit them preemptively in rare cases when the maximum period counter expires

4.7.9.3.1 Relative Timing Exposure Packet Transmission
Two scheduling mechanisms are required for transmission of Relative Timing Exposure packets, in order to minimize system disturbance ordinarily and yet ensure that a minimum number of packets is transmitted in worst case traffic situations.

4.7.9.3.2 Opportunistic Transmission
A minimum number of Relative Timing Exposure packets must be captured in simultaneous traces of all links, so that traces can be correlated to each device's internal clock domain. Normally links shall not saturate with traffic, so there shall be copious opportunities to substitute many of these packets for Null Control packets in each trace. Since both the Relative Timing Exposure and Opportunistic Exposure mechanisms substitute their own packets for Null Control packets, they must be prioritized when both are enabled. To this end, if both are enabled then they shall be required to alternate in using the available Null Control FLIT slots.

4.7.9.3.3 Maximum Period Scheduled Transmission
For the rare case where link traffic does not allow for adequate Relative Timing Exposure packet transmission, a preemptive scheduling mechanism is also required. Each agent shall implement a counter of FLITs transmitted since the most recent Relative Timing Exposure packet transmission. If this counter is enabled and it reaches a selected threshold, then a Relative Timing Exposure packet is scheduled for transmission, prioritized ahead of all other link traffic. The maximum period mechanism shall support scheduling thresholds of 128 and 4096 FLITs. These thresholds provide adequate packet density for both small on-chip trace and external deep trace tools.

4.7.9.3.4 Time Stamp and Phase
Each device is required to implement timestamp counters and phase pattern identities. The timestamp function requires a common 10 bit clock counter for all CSI ports (or equivalent) to provide relative time values for the arrival and transmission of Time Exposure packets at the interface between the Link and Physical Layers.
Likewise, each port must provide a transfer phase encoder which precisely identifies the packet transfer phase/pattern across the boundary, for receive and transmit directions, for cases where the Link and Physical Layers run at different frequencies. This phase value must identify the phase/pattern for transfer across the clock boundary precisely enough so that the exposed timing and phase of any single FLIT crossing the boundary can be used to determine the precise timing of all preceding and following FLITs.

4.7.9.3.5 Received Packet Information
Each time a Time Exposure packet is received, the precise time and phase of the packet's arrival at the Link Layer, and an ID for the packet, are recorded in registers for later transmission on the reverse direction of the link. Note that as each of these packets arrives, its information replaces that of the previous packet, such that at all times the registers contain an intact set of information about the latest received Time Exposure packet.

4.7.9.3.6 RcvdTime(9:0): Precise Packet Receive Time
The exact arrival time (value of the timestamp) of each received Time Exposure packet is recorded in a register for possible later transmission in a packet on the other direction of the same link.

4.7.9.3.7 RcvdPhase(7:0): Precise Packet Receive Phase
The exact arrival phase (value of the transfer phase encoder) of each received Time Exposure packet is recorded in a register for possible later transmission in a packet on the other direction of the same link.

4.7.9.3.8 RcvdID(15:0): Identity of Received Packet
The entire XmitTime(9:0) and the LS 6 bits of the XmitPhase(7:0) of each received Time Exposure packet are recorded in a register as a packet ID for the received packet, for possible later transmission in a packet on the other direction of the same link. This value is used by post-processing SW to correlate the information for the received packet with the correct packet seen in the external trace of the incoming direction of the link. This is necessary since these packets are only opportunistically transmitted, and the time between a received and the next transmitted packet can be many FLIT times.

4.7.9.3.9 Transmit Packet Information
Each transmitted Time Exposure packet carries information captured about or from the most recent received Time Exposure packet, as well as the timing and phase of its own transmission.

4.7.9.3.10 XmitTime(9:0): Precise Packet Transmit Time
The exact transmission time (value of the timestamp) of each transmitted Time Exposure packet is carried in the packet.

4.7.9.3.11 XmitPhase(7:0): Precise Packet Transmit Phase
The exact transmission phase (value of the transfer phase encoder) of each transmitted Time Exposure packet is carried in the packet.

4.7.10 Idle Flit
The Idle flit is sent when there is nothing else to send. The Idle flit contains 2 VC Cred fields used to return credits to the sender, in addition to 2 ack fields. All other fields are RSVD/TBD.

Table 4-65. Idle Special Packet, ISP
L17 L16 L15 L14 L13 L12 L11 L10 L9 L8 L7 L6 L5 L4 L3 L2 L1 L0 C1 C0 RSVD VC Crd1 (1:0) 0b1111 Opcode (3:0) 0b1x VC Crd0 (2:0) CR C 4 CR C 0 RSVD Ack 1 Type (4:0) RSVD Ack 0 CR C 5 CR C 1 IIB RSV RSVD CR CR D C 6 C 2 RSVD RSVD CR CR C 7 C 3

4.8 Flit Interleave
The CSI Link Layer supports two optional methods of flit interleave. The first option is the Command Insert option, which allows the insertion of Command Packets into a data packet that is currently being sent. The command packet can be a protocol message or a Link Layer message.
The second option is the Scheduled Data Interleave option, which allows the interleaving of two data streams in a scheduled manner. In addition, Link Level Special flits (Ctrl and Idle Flits) can be inserted at any time. Figure 4-1 shows all the allowed interleaves without any optional interleave active.

Figure 4-1. Allowed interleaves without any optional interleave active (header flits, data flits, and Special Packet (SP) flits of two packets).

Rules for Flit Level Interleave:
• At no time shall more than 2 protocol level packets be interleaved if Command Insert Interleave is enabled.
• At no time shall 2 or more protocol level packets be interleaved if Command Insert Interleave is not enabled.
• At no time shall a protocol level packet be interleaved into the header portion of another protocol level packet.
• Command Insert Interleave can only interleave into a flit position that would be used for a data flit.
• Link Layer Special packets can be interleaved at any time.
• At most 3 packets can be interleaved, but only if one of the packets is a Link Layer Special Packet.
• It is required that the sender guarantee that any flit level interleave not prevent the eventual completion of the packet interleaved into.

4.8.1 Command Insert
The Command Insert uses the Interleave Indication Bit (IIB) to signal that a command packet will be inserted. To insert a command, the first flit of the command must have the IIB set. The Command packet must be sent to completion. At the end of the command packet, the link will by default assume that the data packet is resuming, but it allows another command packet to be inserted by setting the IIB. It is not allowed to have more than 2 different command packets in progress at any time. A command packet is not allowed to be inserted into the header of a data packet; it may be inserted only before or after a data flit. A command packet cannot be inserted into another command packet. It is permitted that multiple command packets be interleaved within a data packet and that the interleaved command packets can be interleaved contiguously. Figure 4-2 shows all the allowed interleaves with Command Insert Interleave active.

Figure 4-2. Command Insert Interleave Example (a packet with data and header-only packets interleaved with Special Packet flits; nested packets: 1,2,3,4,5,8,9,10; double-nested packets: 6,7).

4.8.2 Scheduled Data Interleave (SDI)
The Scheduled Data Interleave provides a method whereby minimum latency can be achieved when a sender can ready two streams at the same time, but at a lower data rate than the link is capable of. An example of this is an MCH with 2 independent memory controllers. Without the Scheduled Data Interleave, the MCH would have to wait until at least one of the read transactions from memory had progressed to the point where it could be sent, or use the Command Insert function to insert Idle Flits to bubble the data at the Link Layer. With Scheduled Data Interleave, the memory controller can interleave two independent data streams. Scheduled Data Interleave and Command Insert Interleave are mutually exclusive. If a link direction is using SDI then it cannot use CII.
4.9 Transmission Error Handling

4.9.1 Error Detection
As mentioned earlier, the Link Layer uses an 8b CRC for transmission error detection. CRCs (Cyclic Redundancy Checks) are a widely used type of error detecting code. The Link Layer computes a checksum using an 8b CRC on an 88b payload. Any payload bits from unused lanes should be read as zero. For example, if the interface only supports 18 data lanes (72b payload), the unused 16b would be zero. These bits are still needed in order to compute the correct CRC. For the mapping of bits into CRC order, refer to Table 4-66, “CRC Computation - Full Width” on page 4-192. An 8b CRC has the following desirable properties:
1. All 1b, 2b, and 3b errors are detected.
2. Any odd number of errors is detected.
3. All errors of burst length 8 or less are detected.
4. (1/2^7) of errors of burst length 9 are not detected.
5. (1/2^8) of errors of burst length greater than 9 are not detected.
The CRC polynomial to be used is 0x185, i.e., x^8 + x^7 + x^2 + 1. An example C fragment to implement the CRC generation assuming a 20 lane interface (18 data + 2 CRC) with this polynomial is included below; the 72b masks are represented as bit strings, and EVEN_PARITY is spelled out as an explicit function:

    /* CRC_Out[i] is the even parity of the 72b payload masked by DataMask[i].
       Mask rows are listed DataMask[7] first, down to DataMask[0]. */
    static const char *DataMask[8] = {
        "010100100110110111100110001000101101111100111110111111011010100011001011",
        "110010101110011010011010110110010010100110011101110100010000110000010000",
        "011010110110010010100110011101110100010000110000010000111001010111001101",
        "100110011101110100010000110000010000111001010111001101101101011011001001",
        "010000110000010000111001010111001101101101011011001001010011001110111010",
        "111001010111001101101101011011001001010011001110111010101000011000001000",
        "111001111101111110110101000110010111010100100110110111100110001000101101",
        "110101000110010111010100100110110111100110001000101101111100111110111111"
    };

    static int even_parity(const char *mask, const char *data_in) {
        int p = 0;
        for (int i = 0; i < 72; i++)
            p ^= (mask[i] == '1' && data_in[i] == '1');
        return p;
    }

    /* data_in: 72 characters of '0'/'1'; crc_out receives CRC_Out[7:0] */
    void crc_gen(const char *data_in, int crc_out[8]) {
        for (int i = 7; i >= 0; i--)
            crc_out[i] = even_parity(DataMask[7 - i], data_in);
    }

4.9.1.1 Rolling CRC
For improved error detection capability the Link Layer provides an optional 16b CRC scheme. One simple way to increase the error detection capability is to use a larger CRC polynomial, but this results in a larger overhead. So in systems like CSI-based ones, in which link errors are very infrequent, a scheme termed rolling CRC is a good technique for increasing the capability of CRC to detect high burst-length errors without increasing the overhead per flit. To use an 8b rolling CRC scheme, we choose two different generator polynomials of degree 8, G1 and G2. For each flit i, two different CRC checksums CS1_i and CS2_i are computed using the two generator polynomials and the conventional CRC algorithm. The rolling CRC code CS_i that is actually sent on the i-th flit is the XOR of CS1_i and CS2_(i-1), where CS2_0 is defined as 0 and hence the first transmitted code C_1 equals CS1_1. This is illustrated in Figure 4-3, where + denotes XOR. Rolling CRC is used instead of a 16b CRC code as it avoids waiting for 2 flits to be encoded before a flit can be transmitted. Also, when an error is detected in flit i, the sender resends starting from flit i-1. The polynomial for G1 is 0x185 (x^8 + x^7 + x^2 + 1), which is the same as the default CRC. The polynomial for G2 is 0x18D (x^8 + x^7 + x^3 + x^2 + 1).
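A compact, non-normative C sketch of the rolling-CRC construction (CS_i = CS1_i XOR CS2_(i-1), with CS2_0 = 0). Here crc8() computes the remainder of the payload extended with 8 zero bits, as in Section 4.9.1.2; the wire bit ordering of Table 4-66 is not modeled, and the function names are illustrative:

    #include <stddef.h>
    #include <stdint.h>

    /* Remainder of (data || 8 zero bits) divided by G(x); poly holds the
       low 8 coefficients of the degree-8 generator (G1: 0x85, G2: 0x8D). */
    static uint8_t crc8(const uint8_t *data, size_t nbits, uint8_t poly) {
        uint8_t crc = 0;
        for (size_t i = 0; i < nbits; i++) {
            unsigned bit = (data[i >> 3] >> (7 - (i & 7))) & 1; /* MSB first */
            unsigned fb  = ((crc >> 7) & 1) ^ bit;
            crc = (uint8_t)(crc << 1);
            if (fb) crc ^= poly;
        }
        return crc;
    }

    static uint8_t cs2_prev = 0;                    /* CS2 of flit i-1; init 0 */

    uint8_t rolling_crc(const uint8_t payload[11])  /* 88b payload */
    {
        uint8_t cs1 = crc8(payload, 88, 0x85);      /* G1 = x^8+x^7+x^2+1     */
        uint8_t cs2 = crc8(payload, 88, 0x8D);      /* G2 = x^8+x^7+x^3+x^2+1 */
        uint8_t out = (uint8_t)(cs1 ^ cs2_prev);    /* CS_i sent on flit i    */
        cs2_prev = cs2;
        return out;
    }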
Figure 4-3. Rolling CRC Scheme. (P_i = payload, L_i = LLC field, G1 and G2 = 8 bit CRC polynomials, CS = checksum; Flit_i = P_i + L_i + CS_i (transmitted), with the CS values of successive flits combined as described above.)

Figure 4-4. Error Detection on the Received Flit Using Rolling CRC. (With CS2 = 0 for the first flit, each received flit is accepted when its check value is 0; a nonzero check value on flit i indicates an error in flit i.)

4.9.1.2 CRC Computation
For a simple CRC computation, the 88b of data are appended with 8 zeros; the remainder of the division of the combined 96b of data by the CRC polynomial is appended to the original 88b of data before transmitting the complete flit of 96b.

Caution: Half and quarter width bit positions are out of date. They will be updated in .7+ to match the current design of the Physical Layer.

Table 4-66. CRC Computation - Full Width
       23  22  21  20  19  18  17  16  15  14  13  12  11  10  9   8   7   6   5   4   3   2   1   0
Phit 0 I84 I80 I76 I72 I68 I64 I60 I56 I52 I48 I44 I40 I36 I32 I28 I24 I20 I16 I12 I8  I4  I0  C4  C0
Phit 1 I85 I81 I77 I73 I69 I65 I61 I57 I53 I49 I45 I41 I37 I33 I29 I25 I21 I17 I13 I9  I5  I1  C5  C1
Phit 2 I86 I82 I78 I74 I70 I66 I62 I58 I54 I50 I46 I42 I38 I34 I30 I26 I22 I18 I14 I10 I6  I2  C6  C2
Phit 3 I87 I83 I79 I75 I71 I67 I63 I59 I55 I51 I47 I43 I39 I35 I31 I27 I23 I19 I15 I11 I7  I3  C7  C3

Table 4-67. CRC Computation - Half Width
       11  10  9   8   7   6   5   4   3   2   1   0
Phit 0 I83 I75 I67 I59 I51 I43 I35 I27 I19 I11 I3  C3
Phit 1 I82 I74 I66 I58 I50 I42 I34 I26 I18 I10 I2  C2
Phit 2 I87 I79 I71 I63 I55 I47 I39 I31 I23 I15 I7  C7
Phit 3 I86 I78 I70 I62 I54 I46 I38 I30 I22 I14 I6  C6
Phit 4 I81 I73 I65 I57 I49 I41 I33 I25 I17 I9  I1  C1
Phit 5 I80 I72 I64 I56 I48 I40 I32 I24 I16 I8  I0  C0
Phit 6 I85 I77 I69 I61 I53 I45 I37 I29 I21 I13 I5  C5
Phit 7 I84 I76 I68 I60 I52 I44 I36 I28 I20 I12 I4  C4

Table 4-68. CRC Computation - Quarter Width
5   4   3   2   1   0
I75 I59 I43 I27 I11 C3
I74 I58 I42 I26 I10 C2
I83 I67 I51 I35 I19 I3
I82 I66 I50 I34 I18 I2
I79 I63 I47 I31 I15 C7
I78 I62 I46 I30 I14 C6
I87 I71 I55 I39 I23 I7
I86 I70 I54 I38 I22 I6
I73 I57 I41 I25 I9  C1
I72 I56 I40 I24 I8  C0
I81 I65 I49 I33 I17 I1
I80 I64 I48 I32 I16 I0
I77 I61 I45 I29 I13 C5
I76 I60 I44 I28 I12 C4
I85 I69 I53 I37 I21 I5
I84 I68 I52 I36 I20 I4

Let I be the 88b of data to be transmitted. Then the checksum C[7:0] is the remainder of the division {I[87:0], 0b00000000} / G(x), where G(x) is a CRC-8 generator polynomial. The transmitted data of 96b is formed by combining the 88b of data and the 8b of remainder. At the receiver, the 96b of received data is divided by G(x). The flit is error free only if the remainder is zero. G1(x) = 0b1 1000 0101 and G2(x) = 0b1 1000 1101. The default CRC is the simple 8b CRC; for the link to use the rolling 16b CRC it must first complete a negotiation phase during initialization, and both sides must agree on using the rolling CRC. On the completion of the parameter exchange, the appropriate CRC (8b or rolling) comes into effect.

4.9.2 Error Recovery

4.9.2.1 Link Level Retry
As mentioned earlier, the Link Layer provides recovery from transmission errors using retransmission.
The retry scheme relies on sequence numbers but the sequence numbers are maintained within each agent and not communicated between them with each flit. The exchange of sequence number occurs only through LLR SPC messages during a link level retry sequence. The sequence numbers are set to a predetermined value (zero) during reset and they are implemented using a wrap around counter that wraps around to zero after reaching the same value as the depth of the retry buffer. This scheme makes the following assumptions. • The round-trip delay between agents is more than 1 link clock. • All packets are stored in the retry buffer except Special packets that are defined as not retry enabled. Note that for efficient operation, the size of the retry buffer must be more than the round-trip delay to send a packet from the sender, flight time of the packet from sender to receiver, process time to detect an error in a packet, time to send an error indication from receiver back to the sender, flight time of the error indication from the receiver to the sender, and processing of the error indication at the original sender. 4.9.2.2 Link Level Retry State Variables The state variables used by the retry scheme is described as follows. The description is in terms of one sender and one receiver. Both the sender and receiver side of the retry state machines and the corresponding state variables are present at each agent to take into account bidirectional nature of the link. The receiving agent uses the following state variables to keep track of the sequence number of the next flit to arrive. • ESeq: This indicates the expected sequence number of the next valid flit at the receiving agent. ESeq is incremented by one (modulo the size of retry buffer) on error-free reception of an idle flit or an info flit. ESeq stops incrementing after an error is detected on a received flit till an LLRAck is received. ESeq is initialized to 0 at reset. The sending agent maintains two indices to its retry buffer as indicated below. Ref No xxxxx 193 Intel Restricted Secret CSI Link Layer CSI Link Layer • WrPtr: This indicates the index in the retry buffer to record the next new flit. When a flit is sent from an agent, it is copied into the retry buffer entry indicated by the WrPtr and then the WrPtr is incremented by one (modulo the size of retry buffer). This is implemented using a wrap around counter that wraps around to 0 after reaching the count same as the depth of the retry buffer. Certain Special Packet flits do not affect WrPtr. WrPtr stops incrementing after receiving an error indication at the remote agent (LLRReq message) till the normal operation resumes again (all the flits from the retry buffer have to be retransmitted and RdPtr has the same value as WrPtr). WrPtr is initialized to 0 and it is incremented only when a flit is put into the retry buffer. • RdPtr: This is used to read the contents out of the retry buffer during a retry sequence. The value of this pointer is set by the sequence number sent with the LLRReq message as described later. The RdPtr is incremented by one (modulo the size of retry buffer) whenever a flit is sent, either from the retry buffer in response to a retry request or a new flit coming from the Protocol Layer and irrespective of the states of the local or remote retry state machines. If a flit is being sent when the RdPtr and WrPtr are same, then it indicates that a new flit is being sent, otherwise it must be a flit from the retry buffer. 
The link level retry scheme uses an explicit acknowledgment, sent from the receiver to the sender, to remove packets from the retry buffer at the sender. The acknowledgment is indicated using an ACK bit in packets flowing in the reverse direction. Each agent keeps track of the number of available retry buffers and the number of received flits that need to be acknowledged through the following variables. The link level retry protocol requires that the number of retry buffer entries at each agent be larger than the size of the acknowledgment that will be sent (8) plus an additional 2 buffers to prevent deadlock, for a total of 10 flits.

• NumFreeBuf: This indicates the number of free retry buffer entries at the agent. NumFreeBuf is decremented by 1 whenever a retry buffer entry is used to store a transmitted flit. NumFreeBuf is incremented by 8 when an Ack is received. NumFreeBuf is initialized at reset time to the size of the retry buffer. The maximum number of retry buffers at any agent is limited to 255 (8-bit counter).

• NumAck: This indicates the number of acknowledgments accumulated at the receiver. NumAck increments by 1 when a flit is received. NumAck is decremented by 8 when an acknowledgment is sent using the Ack bit in the header of an outgoing packet. If the outgoing flit is coming from the retry buffer and its Ack bit is set, NumAck does not decrement. At initialization NumAck is set to 0. NumAck at each agent must be able to keep track of at least 255 acknowledgments.

Figure 4-5. Retry Queue and Related Pointers. [Figure: the retry queue with free and used entries; on the sender, WrPtr is incremented after storing a sent flit and RdPtr points to the next flit to be sent (RdPtr = WrPtr if not in retry mode); on the receiver, NumAcks increment after receiving a flit and decrement after returning acks.]

4.9.2.3 Link Level Retry Control Messages

The link level retry scheme uses several Link Layer control messages, sent through Special Packets, to communicate the state information and the implicit sequence numbers between the agents. These messages are described as follows.

• LLRReq: This message is sent from the agent that received a flit in error to the sending agent. The message contains the expected sequence number (ESeq) at the receiving agent, indicating the index of the flit in the retry buffer at the remote agent that must be retransmitted.

• LLRAck: This message is sent from the agent that is responding to an error detected at the remote agent. The message contains the WrPtr value at the sending agent, for debug purposes only; this value should not be used by the retry state machines in any way.

• LLRIdle: This message is sent during the retry sequence when there are no retry control messages to be sent or a retry flit from the retry buffer is not ready to be sent. Note that these messages are sent as Special Packets, and they do not update the retry buffer content or the internal sequence numbers. This is one of the primary reasons for introducing LLRIdle instead of just sending Idle flits. Also, these flits do not follow the flow control rule: they can be sent from an agent at any time, without any credit. Such flits must be processed and consumed by the receiver within the time it takes to transmit a flit on the channel, since there is no storage or flow control mechanism for them.

Table 4-69 describes the types of control messages and their effect on the sender and receiver states.
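A minimal sketch of the NumFreeBuf/NumAck bookkeeping rules above follows; the counter names match the text, while the function names and the initial buffer count are assumptions.

    #include <stdint.h>
    #include <stdbool.h>

    #define RETRY_BUF_ENTRIES 16u /* assumed; the spec requires at least 10 */

    static uint8_t num_free_buf = RETRY_BUF_ENTRIES; /* free retry buffer entries    */
    static uint8_t num_ack      = 0;                 /* acks accumulated at receiver */

    /* Sender side: a transmitted flit consumes one retry buffer entry. */
    static void on_flit_stored(void)  { num_free_buf -= 1; }

    /* Sender side: a received Ack returns 8 entries at once. */
    static void on_ack_received(void) { num_free_buf += 8; }

    /* Receiver side: each received flit accumulates one acknowledgment... */
    static void on_flit_received(void) { num_ack += 1; }

    /* ...and setting the Ack bit in an outgoing flit returns 8 of them.
     * Flits replayed from the retry buffer do not decrement NumAck. */
    static void on_ack_bit_sent(bool from_retry_buffer)
    {
        if (!from_retry_buffer)
            num_ack -= 8;
    }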
The Link Layer state machines and state variables are described in Section 4.9.2.4.

Table 4-69. Control Messages and Their Effect on Sender and Receiver States

CTRL Message | Other contents | Sender State | Receiver State
LLRIdle | None | Unchanged | Unchanged
LLRReq | ESeq is sent, which sets RdPtr at the receiver | LRSM is updated, NUM_RETRY is incremented | RRSM is updated, RdPtr is set to the ESeq sent with the message
LLRAck | WrPtr is sent, for debug purposes only | RRSM is updated | LRSM is updated

4.9.2.4 Link Level Retry State Machines

The link level retry scheme is implemented with two state machines: the Remote Retry State Machine (RRSM) and the Local Retry State Machine (LRSM). These state machines are implemented on every agent and together determine the overall retry state of the transmitter and the receiver at the agent. The states of the retry state machines are used by the send and receive controllers to determine the type of flit to send from the sender and the actions needed to process a received flit.

Remote Retry State Machine (RRSM)

The remote retry state machine is activated at an agent if a flit sent from this agent is received in error at the receiver, resulting in a link level retry request (LLRReq) from the remote agent. The possible states for this state machine are:

• RETRY_REMOTE_NORMAL: This is the initial or default state, indicating normal operation.
• RETRY_LLRACK: This state indicates that a link level retry request (LLRReq) has been received from the remote agent and that an LLRAck message followed by the flits from the retry buffer must be (re)sent.

The remote retry state machine transitions are described in Table 4-70.

Table 4-70. Remote Retry State Transitions

Current Remote Retry State | Condition | Next Remote Retry State
RETRY_REMOTE_NORMAL | Non Special Packet flit received | RETRY_REMOTE_NORMAL
RETRY_REMOTE_NORMAL | Special Packet flit, other than LLRReq, received | RETRY_REMOTE_NORMAL
RETRY_REMOTE_NORMAL | Special Packet with [LLRReq, RdPtr] received | RETRY_LLRACK
RETRY_LLRACK | Special Packet with LLRAck not yet sent | RETRY_LLRACK
RETRY_LLRACK | Special Packet with LLRAck sent | RETRY_REMOTE_NORMAL
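Table 4-70 is small enough to restate directly in C; the enum and event names below are invented for illustration and are not spec mnemonics.

    /* Sketch of the Remote Retry State Machine (Table 4-70). */
    enum rrsm_state { RETRY_REMOTE_NORMAL, RETRY_LLRACK };

    enum rrsm_event {
        EV_NON_SP_FLIT,      /* non Special Packet flit received          */
        EV_SP_OTHER,         /* Special Packet other than LLRReq received */
        EV_LLRREQ_RECEIVED,  /* Special Packet with [LLRReq, RdPtr]       */
        EV_LLRACK_SENT       /* Special Packet with LLRAck has been sent  */
    };

    static enum rrsm_state rrsm_step(enum rrsm_state s, enum rrsm_event e)
    {
        switch (s) {
        case RETRY_REMOTE_NORMAL:
            /* Only a received LLRReq moves the machine out of normal operation. */
            return (e == EV_LLRREQ_RECEIVED) ? RETRY_LLRACK : RETRY_REMOTE_NORMAL;
        case RETRY_LLRACK:
            /* Stay until the LLRAck (plus retry flits) has been sent. */
            return (e == EV_LLRACK_SENT) ? RETRY_REMOTE_NORMAL : RETRY_LLRACK;
        }
        return s;
    }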
Local Retry State Machine (LRSM)

This state machine is activated at the agent that detects an error on a received flit. The possible states for this state machine are:

• RETRY_LOCAL_NORMAL: This is the initial or default state, indicating normal operation.
• RETRY_LLRREQ: This state indicates that an error has been detected on a received flit and an LLRReq needs to be sent to the remote agent.
• RETRY_LOCAL_IDLE: This state indicates that the receiver is waiting for an LLRAck flit from the remote agent in response to its LLRReq.
• RETRY_ABORT: This state indicates that the retry attempt has failed and the link cannot recover.

The local retry state machine also has two counters, described below.

• TIMEOUT: This counter is enabled whenever an LLRReq request is sent from an agent and the LRSM state becomes RETRY_LOCAL_IDLE. The TIMEOUT counter is disabled and counting stops when the LRSM state changes to some state other than RETRY_LOCAL_IDLE. The TIMEOUT counter is reset to 0 at initialization and whenever the LRSM state changes from RETRY_LOCAL_IDLE to RETRY_LOCAL_NORMAL. In the RETRY_LOCAL_IDLE state, the counter increments on every link clock until it either reaches a threshold or the LRSM transitions to some other state. If the counter reaches its threshold without an LLRAck having been received, the LLRReq request is sent again to retry the same flit. The threshold for the TIMEOUT counter must be set higher than the round-trip delay between the agents; therefore, if the flight time on the link is N flit durations, the threshold for the TIMEOUT counter must be set higher than (2N+1).

• NUM_RETRY: This counter counts the number of LLRReq requests sent to retry the same flit. The counter remains enabled during the whole retry sequence (that is, while the LRSM state is not RETRY_LOCAL_NORMAL). It is reset to 0 at initialization and whenever the LRSM state changes from RETRY_LOCAL_IDLE to RETRY_LOCAL_NORMAL, and it is incremented whenever the LRSM state changes from RETRY_LOCAL_IDLE to RETRY_LLRREQ. If the counter reaches a threshold (which must be larger than 0), the local retry state machine transitions to the RETRY_ABORT state, indicating a link failure.

The local retry state machine transitions are described in Table 4-71. Note that the condition of TIMEOUT reaching its threshold is not mutually exclusive with other conditions that cause LRSM state transitions. If an LLRAck is received at the same time that TIMEOUT reaches its threshold, the time-out is ignored and the LLRReq is not repeated at that time. If an error is detected at the same time that TIMEOUT reaches its threshold, the error on the received flit is ignored, the time-out is taken, and a repeat LLRReq is sent to the remote agent.

Table 4-71. Local Retry State Transitions

Current Local Retry State | Condition | Next Local Retry State | Actions
RETRY_LOCAL_NORMAL | A non Special Packet flit is received | RETRY_LOCAL_NORMAL | ESeq is incremented; received flit is accepted.
RETRY_LOCAL_NORMAL | Special Packet Idle flit is received | RETRY_LOCAL_NORMAL | ESeq is incremented; received flit is processed.
RETRY_LOCAL_NORMAL | Special Packet Ctrl flit (other than LLRReq) is received | RETRY_LOCAL_NORMAL | Received flit is processed.
RETRY_LOCAL_NORMAL | LLRReq Special Packet is received | RETRY_LOCAL_NORMAL | RRSM is updated.
RETRY_LOCAL_NORMAL | Error is detected on a received flit | RETRY_LLRREQ | Received flit is discarded.
RETRY_LLRREQ | NUM_RETRY has reached its threshold | RETRY_ABORT | Indicate link failure.
RETRY_LLRREQ | NUM_RETRY has not reached its threshold and an [LLRReq, ESeq] has not been sent | RETRY_LLRREQ | Any received flit is discarded.
RETRY_LLRREQ | NUM_RETRY has not reached its threshold and an [LLRReq, ESeq] has been sent | RETRY_LOCAL_IDLE | Any received flit, other than a Special Packet LLRAck, is discarded.
RETRY_LOCAL_IDLE | LLRAck Special Packet is received | RETRY_LOCAL_NORMAL | Reset TIMEOUT and NUM_RETRY to 0.
RETRY_LOCAL_IDLE | TIMEOUT has reached its threshold | RETRY_LLRREQ | Increment NUM_RETRY.
RETRY_LOCAL_IDLE | Error is detected on a received flit | RETRY_LOCAL_IDLE | Received flit is discarded; TIMEOUT is incremented.
RETRY_LOCAL_IDLE | A flit other than LLRAck is received | RETRY_LOCAL_IDLE | Received flit is discarded; TIMEOUT is incremented.
RETRY_ABORT | A flit is received | RETRY_ABORT | Discard any received flit.
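One possible software model of the LRSM with its two counters is sketched below. The event names, threshold values, and the single step-function structure are assumptions made for illustration; the states and transitions follow Table 4-71.

    #include <stdint.h>

    enum lrsm_state { RETRY_LOCAL_NORMAL, RETRY_LLRREQ, RETRY_LOCAL_IDLE, RETRY_ABORT };
    enum lrsm_event { EV_ERROR, EV_LLRREQ_SENT, EV_LLRACK, EV_CLOCK };

    #define NUM_RETRY_THRESHOLD 4u   /* assumed; must be > 0              */
    #define TIMEOUT_THRESHOLD   64u  /* assumed; must exceed 2N+1 clocks  */

    static uint32_t timeout_cnt, num_retry;

    static enum lrsm_state lrsm_step(enum lrsm_state s, enum lrsm_event e)
    {
        switch (s) {
        case RETRY_LOCAL_NORMAL:
            return (e == EV_ERROR) ? RETRY_LLRREQ : RETRY_LOCAL_NORMAL;
        case RETRY_LLRREQ:
            if (num_retry >= NUM_RETRY_THRESHOLD)
                return RETRY_ABORT;            /* indicate link failure    */
            if (e == EV_LLRREQ_SENT) {
                timeout_cnt = 0;               /* TIMEOUT counting starts  */
                return RETRY_LOCAL_IDLE;
            }
            return RETRY_LLRREQ;               /* received flits discarded */
        case RETRY_LOCAL_IDLE:
            if (e == EV_LLRACK) {
                timeout_cnt = num_retry = 0;   /* recovery complete        */
                return RETRY_LOCAL_NORMAL;
            }
            if (e == EV_CLOCK && ++timeout_cnt >= TIMEOUT_THRESHOLD) {
                num_retry++;
                return RETRY_LLRREQ;           /* repeat the LLRReq        */
            }
            return RETRY_LOCAL_IDLE;           /* other flits discarded    */
        case RETRY_ABORT:
            return RETRY_ABORT;                /* discard everything       */
        }
        return s;
    }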
4.9.2.5 Send and Receive Controllers

The send controller determines the type of flit sent from an agent. The states of the local and remote retry state machines are used as inputs to this controller. The actions of the send controller are described in Table 4-72. The rows in this table are prioritized such that the conditions satisfied in earlier rows override the conditions satisfied by later rows.

Table 4-72. Description of Send Controller

Local Retry State | Remote Retry State | Actions
RETRY_ABORT | Any | Send an LLRIdle Special Packet.
Any, except RETRY_ABORT | RETRY_LLRACK | Send a Special Packet with [LLRAck, WrPtr].
RETRY_LLRREQ | RETRY_REMOTE_NORMAL | Send a Special Packet with [LLRReq, ESeq].
RETRY_LOCAL_NORMAL or RETRY_LOCAL_IDLE | RETRY_REMOTE_NORMAL | If RdPtr is not the same as WrPtr, send a flit from the retry buffer at RdPtr or an LLRIdle Special Packet; else if (NumFreeBuf > 2 OR (NumFreeBuf = 2 AND NumAck >= 8)), send a normal or idle flit and decrement NumFreeBuf by 1; else send a Ctrl flit with LLRIdle.

Table 4-72 captures two important rules of the link level retry scheme:

1. Whenever the RRSM state becomes RETRY_LLRACK, the agent must give priority to sending the Special Packet with [LLRAck, WrPtr].
2. Except for an RRSM state of RETRY_LLRACK, the priority goes to an LRSM state of RETRY_LLRREQ, and in that case the agent must send a Special Packet with [LLRReq, ESeq] ahead of all other flits.

Note that when an agent's LRSM is in the RETRY_LOCAL_IDLE state and its RRSM is in the RETRY_REMOTE_NORMAL state, it may send new flits, but doing so may result in the other end accumulating a large number of Acks, since this agent cannot return any Acks until an LLRAck Special Packet is sent. An agent must return Acks whenever possible. An agent can return 0 or 8 Acks with a packet. Also, note that the retry buffer at any agent is never filled to its capacity; therefore NumFreeBuf is never 0. If there are only 2 retry buffer entries left (NumFreeBuf = 2), then the sender can send an Idle or header flit only if NumAck is greater than or equal to 8, and it must set the Ack bit in the outgoing flit; otherwise an LLRIdle Special Packet or other Ctrl flit is sent. This is required to avoid deadlock at the Link Layer due to the retry buffer becoming full at both agents on a link and their consequent inability to send Ack bits through packet headers or Idle flits. If there is only 1 retry buffer entry available, then the sender cannot send an Idle or Info flit. This restriction is required to avoid ambiguity between a full and an empty retry buffer during a retry sequence, which could otherwise result in incorrect operation. These restrictions imply that the number of retry buffer entries at any agent cannot be less than 10.
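The prioritized rules in Table 4-72 lend themselves to a straight-line decision function. The sketch below restates them in C; the enum values and action names are invented for illustration, and the state enums repeat those of the earlier sketches.

    #include <stdint.h>

    enum lrsm_state { RETRY_LOCAL_NORMAL, RETRY_LLRREQ, RETRY_LOCAL_IDLE, RETRY_ABORT };
    enum rrsm_state { RETRY_REMOTE_NORMAL, RETRY_LLRACK };

    /* Flit choices named for illustration; they are not spec mnemonics. */
    enum send_action { SEND_LLRIDLE, SEND_LLRACK, SEND_LLRREQ,
                       SEND_RETRY_FLIT, SEND_NEW_FLIT, SEND_CTRL_LLRIDLE };

    /* Prioritized send decision per Table 4-72: earlier rules win. */
    static enum send_action send_decide(enum lrsm_state lrsm, enum rrsm_state rrsm,
                                        uint8_t rd_ptr, uint8_t wr_ptr,
                                        uint8_t num_free_buf, uint8_t num_ack)
    {
        if (lrsm == RETRY_ABORT)
            return SEND_LLRIDLE;
        if (rrsm == RETRY_LLRACK)
            return SEND_LLRACK;           /* [LLRAck, WrPtr] has priority   */
        if (lrsm == RETRY_LLRREQ)
            return SEND_LLRREQ;           /* [LLRReq, ESeq] over all else   */
        if (rd_ptr != wr_ptr)
            return SEND_RETRY_FLIT;       /* replay from the retry buffer   */
        if (num_free_buf > 2 || (num_free_buf == 2 && num_ack >= 8))
            return SEND_NEW_FLIT;         /* new/idle flit, NumFreeBuf--    */
        return SEND_CTRL_LLRIDLE;         /* buffer nearly full: LLRIdle    */
    }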
Processing of a received flit at the receiver depends on the state of the local retry state machine and on the type of the flit received. The effect of Ctrl flits or of an error on the local retry state at the receiver is shown in Table 4-71. Table 4-73 shows the processing of the received flit and its effect on the other Link Layer states at the receiver.

Table 4-73. Processing of Received Flit

Local Retry State | Type of received flit | Actions
RETRY_LOCAL_NORMAL | A normal flit is received | If Ack is set, increment NumFreeBuf by 8. Increment NumAck by 1 and the sender credit at the appropriate VC by the supplied amount. ESeq is incremented by 1 (modulo the retry buffer size). A packet flit is stored in the incoming virtual channel buffers to be forwarded to the Protocol Layer.
RETRY_LOCAL_NORMAL | A Special Packet flit is received | If it is an LLRReq message, the RRSM state is affected; otherwise the flit is processed.
RETRY_LOCAL_NORMAL | Error is detected on a received flit | LRSM state is affected; received flit is discarded.
RETRY_LLRREQ | A Special Packet flit is received | If it is an LLRReq message, the RRSM state is affected; otherwise the flit is discarded.
RETRY_LLRREQ | A non Special Packet flit is received | Received flit is discarded.
RETRY_LLRREQ | Error is detected on a received flit | Received flit is discarded.
RETRY_LOCAL_IDLE | A Special Packet flit is received | If it is an LLRReq message, the RRSM state is affected; if it is an LLRAck message, the LRSM state is affected; otherwise the received flit is discarded.
RETRY_LOCAL_IDLE | A non Special Packet flit is received | Received flit is discarded.
RETRY_LOCAL_IDLE | Error is detected on a received flit | Received flit is discarded.
RETRY_ABORT | A flit is received | Received flit is discarded.

4.10 Link Layer Initialization

The sequence for Link Layer initialization is given below in pseudo code. After reset, the Link Layer waits for the Physical Layer to complete its initialization. The Link Layer then sends Null.Nop Link Layer messages until any product-specific reset sequences that are needed before Link Layer initialization are complete (e.g., waiting for a service processor to set the local node ids). This is enabled by the first interlock, which uses the ready_for_init parameter exchange messages. A Link Layer agent must both be sending and receiving these messages for the interlock to pass. Once the interlock is complete, the Link Layer begins sending parameter exchange messages. The Link Layer is required to send the parameter exchange messages in order from 0 to N, but is not required to send them contiguously. During the parameter exchange, if the Link Layer is not sending a parameter exchange message, it must send Null.Nops. If an error occurs during the parameter exchange, the Link Layer agent detecting the error reverts to sending ready_for_init messages, which causes both agents to re-sync at the first interlock and retry the parameter exchange operation. Once the parameter exchange has completed in an error-free manner, the Link Layer agent starts the second interlock by sending the ready_for_normal_operation message. Once the agent is both sending and receiving the ready_for_normal_operation message, normal operation begins with that agent sending the begin_normal_operation message. When an agent receives the begin_normal_operation message, it commits the parameters that were exchanged to the active state. For example, if both agents choose to enable the rolling CRC, the rolling CRC activates for the first flit after the begin_normal_operation message.
Table 4-74. Link Init and Parameter Exchange State Machine

Current State | Received Flit / Local Event | Next State | Send Action
Not_Ready_For_Init | Any Flit / Ready_For_Init not asserted | Not_Ready_For_Init | send->Null.Nop
Not_Ready_For_Init | Any Flit / Ready_For_Init asserted | Ready_For_Init | send->Null.Nop
Ready_For_Init | Null.Nop | Ready_For_Init | send->Ready_For_Init
Ready_For_Init | recv->Ready_For_Init | Parameter_Exchange, Send_PE = 0, Recv_PE = 0, PE.Error = 0 | send->Ready_For_Init
Parameter_Exchange, Recv_PE != 8, Send_PE != 8 | recv->PE[Recv_PE] or Null.Nop | Parameter_Exchange, Recv_PE++ | send->PE.[Send_PE], Send_PE++
Parameter_Exchange | error in received flit | Parameter_Exchange, PE.Error = 1, Recv_PE++ | send->PE.[Send_PE], Send_PE++
Parameter_Exchange, Recv_PE = 8, Send_PE != 8 | any | Parameter_Exchange | send->PE.[Send_PE], Send_PE++
Parameter_Exchange, Recv_PE != 8, Send_PE = 8 | recv->PE[Recv_PE] or Null.Nop | Parameter_Exchange, Recv_PE++ | send->Null.Nop
Parameter_Exchange, Recv_PE = 8 and Send_PE = 8 | any | Parameter_Exchange_Done | send->Null.Nop
Parameter_Exchange_Done | PE.Error = 1 | Ready_For_Init | send->Null.Nop
Parameter_Exchange_Done | PE.Error = 0 and !Normal_Op_Enable | Parameter_Exchange_Done | send->Null.Nop
Parameter_Exchange_Done | PE.Error = 0 and Normal_Op_Enable | Ready_For_Normal_Op | send->Null.Nop
Ready_For_Normal_Op | !recv->Ready_For_Normal_Op | Ready_For_Normal_Op | send->Ready_For_Normal_Op
Ready_For_Normal_Op | recv->Ready_For_Normal_Op | Normal_Operation | send->Ready_For_Normal_Op
Normal_Operation | !recv->Ready_For_Init | Normal_Operation | any
Normal_Operation | recv->Ready_For_Init | Remote_Link_Reset, assert local soft reset | any
Normal_Operation | local link reset | Local_Link_Reset | any
Local_Link_Reset | local reset asserted | Local_Link_Reset | send->Null.Nop
Local_Link_Reset | local reset de-asserted | Not_Ready_For_Init | send->Null.Nop
Remote_Link_Reset | local reset asserted | Remote_Link_Reset | send->Null.Nop
Remote_Link_Reset | local reset de-asserted | Not_Ready_For_Init | send->Null.Nop

While (!ready_for_init) {
    send->Null.Nop
}
While (ready_for_init) {
    send->Ready_For_Init.Nop;
    If receive->Ready_For_Init.Nop {break;}
}
Send_Parameter_Exchange = 1;
Recv_Parameter_Exchange = 0;
Recv_PE_Error = 0;
Ready_For_Normal_Operation = 0;
While (!Normal_Operation) {
    If Send_Parameter_Exchange {
        send->Parameter_Exchange_1.Nop
        send->Parameter_Exchange_2.Nop
        send->Parameter_Exchange_3.Nop
        send->Parameter_Exchange_4.Nop
        send->Parameter_Exchange_5.Nop
        send->Parameter_Exchange_6.Nop
        Send_Parameter_Exchange = 0;
    }
    If !Recv_Parameter_Exchange {
        receive->Parameter_Exchange_1.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_1.Nop)
        receive->Parameter_Exchange_2.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_2.Nop)
        receive->Parameter_Exchange_3.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_3.Nop)
        receive->Parameter_Exchange_4.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_4.Nop)
        receive->Parameter_Exchange_5.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_5.Nop)
        receive->Parameter_Exchange_6.Nop; Recv_PE_Error |= Error_Check(receive->Parameter_Exchange_6.Nop)
        Recv_Parameter_Exchange = 1;
    }
    If Recv_PE_Error {
        Recv_Parameter_Exchange = 0;
        Send->Ready_For_Init.Nop;
        Recv_PE_Error = 0;
        If receive->Ready_For_Init.Nop {Send_Parameter_Exchange = 1;}
    }
    If (!Recv_PE_Error & Recv_Parameter_Exchange & !Send_Parameter_Exchange) {
        Send->Ready_Normal_Operation;
        Ready_For_Normal_Operation = 1;
    }
    If (Ready_For_Normal_Operation & Receive->Ready_Normal_Operation) {
        Send->Begin_Normal_Operation;
        Break;
    }
}
4.11 Link Layer Required Registers

4.11.1 CSILCP - CSI Link Capability Register

Table 4-75. CSILCP Format

Bit | Attr | Def. | Description
31:30 | RV | 0h | Reserved
29:28 | RO | 0h | VN1 Credits Per Data MC: 00 - 0 credits; 01 - 1; 10 - 2 to 8; 11 - 9+
27:26 | RO | 0h | VN0 Credits Per Data MC: 00 - 0 credits; 01 - 1; 10 - 2 to 8; 11 - 9+
25:24 | RO | 0h | VN1 Credits Per Non-Data MC: 00 - 0 credits; 01 - 1; 10 - 2 to 8; 11 - 9+
23:22 | RO | 0h | VN0 Credits Per Non-Data MC: 00 - 0 credits; 01 - 1; 10 - 2 to 8; 11 - 9+
21:16 | RO | 0h | VNA Credits / 8
15:12 | RV | 0h | Reserved
11 | RO | 0h | CRC Mode Support: 0 - 8b CRC; 1 - 8b CRC & 16b Rolling CRC
10 | RO | 0h | Scheduled Data Interleave: 0 - Not Supported; 1 - Supported
9:8 | RO | 0h | Flit Interleave: 00 - Idle/Null flit only (CSI default); 01 - Command Insert Interleave; 10 - RSVD; 11 - RSVD
7:0 | RO | 0h | CSI Version Number: 0h - Rev 1.0; !0h - RSVD

4.11.2 CSILCL - CSI Link Control Register

Table 4-76. CSILCL

Bit | Attr | Def. | Description
31:17 | RsvdP | 0h | Reserved
16 | RWSL | 0h | Link Layer Initialization stall (on next initialization): 0 - Disable; 1 - Enable, stall initialization till this bit is cleared
15:14 | RWSL | 0h | CRC mode (on next initialization): 00 - 8b CRC; 01 - 16b rolling CRC, enabled if the peer agent also indicates support in Parameter0; 10 - Reserved; 11 - disable
13:12 | RWSL (RsvdP for UP/DP) | 0h | Advertised VN1 credits per supported VC (on next initialization): 00 - Max; 01 - 2 if < Max; 10 - 1 if < Max; 11 - 0 (Disable VN1: can cause deadlock)
11:10 | RWSL | 0h | Advertised VN0 credits per supported VC (on next initialization): 00 - Max; 01 - 2 if < Max; 10 - 1 if < Max; 11 - 0 (Disable VN0: can cause deadlock)
9:8 | RWSL | 0h | Advertised VNA credits (on next initialization): 00 - Max; 01 - 64 if < Max; 10 - 32 if < Max; 11 - 0 (Disable VNA)
7:6 | RWSL | 00h | Link Level Retry (LLR) timeout value in cycles: 00 - 4095; 01 - 1023; 10 - 255; 11 - 63
5:4 | RWSL | 0h | Consecutive LLRs to Link Reset: 00 - 16; 01 - 8; 10 - 4; 11 - 0, disable LLR (on CRC error, immediate error condition)
3:2 | RWSL | 0h | Consecutive Link Redet from LLR till error condition (only applies if LLR enabled): 00 - up to 2; 01 - up to 1; 10 - up to 0; 11 - Reserved
1 | RW | 0h | Link Hard Reset: re-initialize, resetting values in sticky registers. Write 1 to reset the link; this is a destructive reset; when reset asserts, the register clears to 0h.
0 | RW | 0h | Link Soft Reset: re-initialize without resetting sticky registers. Write 1 to reset the link; this is a destructive reset; when reset asserts, the register clears to 0h.
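For illustration only, a few of these fields might be decoded as below. The helper names are hypothetical; only the bit positions and encodings are taken from Tables 4-75 and 4-76.

    #include <stdint.h>
    #include <stdbool.h>

    /* CSILCP (Table 4-75), bit 11: CRC Mode Support. */
    static bool csilcp_rolling_crc_supported(uint32_t csilcp)
    {
        return (csilcp >> 11) & 0x1;
    }

    /* CSILCP bits 21:16 hold VNA credits divided by 8. */
    static uint32_t csilcp_vna_credits(uint32_t csilcp)
    {
        return ((csilcp >> 16) & 0x3F) * 8;
    }

    /* CSILCP bits 7:0: CSI Version Number (0h = Rev 1.0). */
    static uint8_t csilcp_csi_version(uint32_t csilcp)
    {
        return (uint8_t)(csilcp & 0xFF);
    }

    /* CSILCL (Table 4-76): writing 1 to bit 0 requests a Link Soft Reset. */
    static uint32_t csilcl_request_soft_reset(uint32_t csilcl)
    {
        return csilcl | 0x1;
    }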
4.11.3 CSILS - CSI Link Status Register

Table 4-77. CSILS

Bit | Attr | Def. | Description
31:28 | RsvdZ | 0h | Reserved
27:24 | RO | N/A | Link Initialization Status: 0000 - Waiting for Physical Layer Ready; 0001 - Internal Link Initialization Stall; 0010 - Sending ReadyForInit; 0011 - Parameter Exchange; 0100 - Sending ReadyForNormalOperation; 0101 - Initial Credit Return (initializing credits); 0110 - Normal Operation; 0111 - Link Level Retry; 1000 - Link Error; 11XX, 1001, 101X - Reserved
23:22 | R, W1C | N/A | Link initialization failure count (saturates at 0b11): 00 - 0; 01 - 1; 10 - 2-15; 11 - >15
21:19 | R, W1C | N/A | Last Link Level Retry Count: 000 - 0 retries (no LLR has occurred since the last hard component reset); 001 - 1 retry; 010 - 2-15 retries; 011 - >15 retries
18:16 | RO | N/A | VNA credits at receiver: 000 - 0 credits; 001 - 1-7 credits; 010 - 8-10 credits; 011 - 11-16 credits; 100 - 16-32 credits; 101 - 32-63 credits; 110 - 64-127 credits; 111 - 128+ credits
15 | RO | N/A | VN0 SNP credits: 0 (0 credits); 1 (1+ credits)
14 | RO | N/A | VN0 HOM credits: 0 (0 credits); 1 (1+ credits)
13 | RO | N/A | VN0 NDR credits: 0 (0 credits); 1 (1+ credits)
12 | RO | N/A | VN0 DRS credits: 0 (0 credits); 1 (1+ credits)
11 | RO | N/A | VN0 NCS credits: 0 (0 credits); 1 (1+ credits)
10 | RO | N/A | VN0 NCB credits: 0 (0 credits); 1 (1+ credits)
9 | RO | N/A | VN0 ICS credits: 0 (0 credits); 1 (1+ credits)
8 | RO | N/A | VN0 ICB credits: 0 (0 credits); 1 (1+ credits)
7 | RO RsvdZ | N/A | VN1 SNP credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
6 | RO RsvdZ | N/A | VN1 HOM credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
5 | RO RsvdZ | N/A | VN1 NDR credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
4 | RO RsvdZ | N/A | VN1 DRS credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
3 | RO RsvdZ | N/A | VN1 NCS credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
2 | RO RsvdZ | N/A | VN1 NCB credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
1 | RO RsvdZ | N/A | VN1 ICS credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP
0 | RO RsvdZ | N/A | VN1 ICB credits: 0 (0 credits); 1 (1+ credits). Reserved for UP/DP

4.11.4 CSILP0 - CSI Link Parameter 0 Register

Parameter exchanged as part of link initialization.

Table 4-78. CSILP0
Bit | Attr | Def | Description
31:0 | RO | 0h | Parameter 0 from peer agent

4.11.5 CSILP1 - CSI Link Parameter 1 Register

Parameter exchanged as part of link initialization.

Table 4-79. CSILP1
Bit | Attr | Def | Description
31:0 | RO | 0h | Parameter 1 from peer agent

4.11.6 CSILP2 - CSI Link Parameter 2 Register

Parameter exchanged as part of link initialization.

Table 4-80. CSILP2
Bit | Attr | Def | Description
31:0 | RO | 0h | Parameter 2 from peer agent

4.11.7 CSILP3 - CSI Link Parameter 3 Register

Parameter exchanged as part of link initialization.

Table 4-81. CSILP3
Bit | Attr | Def | Description
31:0 | RO | 0h | Parameter 3 from peer agent

4.11.8 CSILP4 - CSI Link Parameter 4 Register

Parameter exchanged as part of link initialization.

Table 4-82. CSILP4
Bit | Attr | Def | Description
31:0 | RO | 0h | Parameter 4 from peer agent

4.12 Link Layer Rules and Requirements

4.13 Open Issues

1. Link Level Retry mechanism
2. Home channel, flow control and shared adaptive buffering
3. Scheduled Data Interleave should be made profile dependent - currently, it is not.
4. Sync flit definition
5. Error indication description

5.1 Introduction

The Routing layer provides a flexible and distributed method to route CSI transactions from a source to a destination. The scheme is flexible, since routing algorithms for multiple topologies can be specified through programmable routing tables at each router (the programming is typically done by firmware). The routing functionality is distributed, since the route is not set up centrally; instead, the routing is done through a series of routing steps, with each routing step being defined through a lookup of a table at either the source, intermediate, or destination routers. The lookup at a source is used to inject a CSI packet into the CSI fabric. The lookup at an intermediate router is used to route a CSI packet from an input port to an output port.
The lookup at a destination port is used for consumption at the destination CSI protocol agent. Note that the Routing layer is thin, since the routing tables, and hence the routing algorithms, are not defined by the specification. This allows a variety of usage models, including flexible platform architectural topologies, to be defined by the system implementor who uses CSI as the system coherent interface. The Routing layer relies on the Link layer providing the use of up to 3 virtual networks (VNs) - two deadlock-free VNs, VN0 and VN1, with several message classes defined in each virtual network, and a shared adaptive virtual network, VNA (Section 4.2, "Virtual Networks" on page 4-137).

5.2 Routing Rules

Rule 1. (Message class invariance): An incoming packet belonging to a particular message class is always routed on an outgoing CSI port/virtual network in the same message class.

Rule 2. An incoming packet on VN0 (VN encoding 00) can always be routed on the VNA (VN encoding 10 but not VN encoding 11) of an outgoing CSI port, subject to the availability of resources in the shared buffer pool. An incoming packet on VN1 (VN encoding 01) can always be routed on the VNA (VN encoding 11 but not VN encoding 10) of an outgoing CSI port, subject to the availability of resources in the shared buffer pool.

Rule 3. An incoming packet on VNA (VN encoding 10) can always be routed on the corresponding VNA (VN encoding 10) of an outgoing CSI port, subject to the availability of resources in the shared buffer pool. A corresponding rule applies for VNA with VN encoding 11.

Rule 4. An incoming packet on VNA with encoding 10 should not be routed on an outgoing port with VNA encoding 11. A corresponding rule applies for VNA with VN encoding 11. For an exception see Rule 9.

Rule 5. An incoming packet on VNA (VN encoding 10) should not be routed on VN1 (VN encoding 01) of an outgoing CSI port. A corresponding rule applies to incoming packets on VNA with encoding 11. For an exception see Rule 9.

Rule 6. (Draining Rule): An incoming packet on VNA (VN encoding 10) can always be routed on VN0 (VN encoding 00) of an outgoing CSI port. If resources (buffer space) are not immediately available at VN0, they are guaranteed to become available, to ensure the forward progress of the packet. A similar rule applies to incoming packets on VNA with encoding 11, which drain to VN1.

Rule 7. (SAF and VCT switching): CSI platforms support the "store-and-forward" and "virtual cut through" types of switching. They do not support "wormhole" or "circuit" switching.

Rule 8. (Interconnect deadlock freedom): CSI platforms should not rely on VNA for deadlock-free routing. With CSI platforms which use both VN0 and VN1, the two VNs together could be used for deadlock-free routing - fully adaptive, partially adaptive, or deterministic.

Rule 9. (VN0 only for "leaf" routers): In CSI platforms which use both VN0 and VN1, it is permissible to use only VN0 (or only VN0 and VNA (with encoding 10)) for those components whose routers are not used for route-through, i.e., all of whose incoming ports have CSI destinations which terminate at this component. In such a case, packets from different VNs can be routed to VN0 (and VNA with encoding 10). Other rules (for example, movement of packets between VN0 and VN1) are governed by the platform-dependent routing algorithm (see Section 5.7 for typical usage models).
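Rules 1-6 amount to a small compatibility relation between incoming and outgoing virtual networks. A hypothetical C encoding is shown below; the enum values mirror the VN encodings in the text (VN0 = 00, VN1 = 01, VNA = 10 and 11), while resource-availability checks and the Rule 9 leaf-router exception are deliberately omitted.

    #include <stdbool.h>

    enum vn { VN0 = 0, VN1 = 1, VNA10 = 2, VNA11 = 3 };

    /* May a packet arriving on VN 'in' be routed on outgoing VN 'out'? */
    static bool vn_route_allowed(enum vn in, enum vn out)
    {
        switch (in) {
        case VN0:   return out == VN0   || out == VNA10; /* Rules 1, 2       */
        case VN1:   return out == VN1   || out == VNA11; /* Rules 1, 2       */
        case VNA10: return out == VNA10 || out == VN0;   /* Rules 3, 4, 5, 6 */
        case VNA11: return out == VNA11 || out == VN1;   /* Rules 3, 4, 5, 6 */
        }
        return false;
    }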
5.3 Routing Step

A routing step is defined by a routing function RF and a selection function SF. The routing function takes as inputs the CSI port at which a packet arrives and the destination node id, and yields as output a 2-tuple - the CSI port number and the virtual network - which the packet should follow on its path to the destination. It is permitted for the routing function to be additionally dependent on the incoming virtual network. Further, it is permitted for the routing step to yield multiple <port#, virtual network> pairs. The resulting routing algorithms are called adaptive. In such a case, a selection function SF chooses a single 2-tuple based on additional state information which the router has (e.g., with adaptive routing algorithms, the choice of a particular port or virtual network may depend on the local congestion conditions). A routing step consists of applying the routing function and then the selection function to yield the 2-tuple. More formally, the routing and the selection functions, applicable at each router, are of the form:

• RF1: P x N -> {<port#, VN>}
• RF2: P x C x N -> {<port#, VN>}
• SF: {<port#, VN>} x S -> <port#, VN>

where the input port belonging to the set of input ports P and the destination node belonging to the set of nodes N define {<port#, VN>}, a set of 2-tuples; each 2-tuple is a port# and a deadlock-free virtual network VN, as in RF1. It is also permitted to make the input additionally a function of the virtual network set, C, on which an incoming CSI packet is routed, as in RF2. In SF, the set S refers to a set of states (which reflect implementation-dependent state information) based on which one tuple is selected. In some instances, the output virtual network may not be explicit.

The realization of this routing step is shown in Figure 5-1, Figure 5-2, and Figure 5-3. Figure 5-1 shows an example CSI router with several input and output ports - each input port is associated with a routing table. The "CSI_I*" and "CSI_O*" ports are the route-through ports. "CSI_S*" are source input ports, i.e., each of these ports is connected to an internal agent which generates CSI Protocol layer transactions (see the description on routing from a source port later in this section). "CSI_D*" are destination output ports, i.e., each of these ports is connected to an internal agent which sinks CSI Protocol layer transactions (see the description on routing to a destination port later in this section). The Route Table and the virtual network selection and arbitration logic together realize the routing step, as explained below.

Figure 5-1. Routing Layer Functionality. [Figure: an example router; each input port (route-through ports CSI_I1-CSI_I6 and source ports CSI_S1-CSI_S2) has its own Route Table (RT) feeding shared VN select and arbitration logic toward the output ports (route-through ports CSI_O1-CSI_O6 and destination ports CSI_D1-CSI_D2). Key: VN = Virtual Network; Arb = Arbitration; RT = Route Table; CSI_S* = source input ports; CSI_D* = destination output ports; CSI_I*, CSI_O* = route-through ports.]

Figure 5-3 shows an abstract structure of the routing table, which is associated with each input port of the router. This table will be used to describe the capabilities of the CSI Routing layer; possible simplifications for implementation will be discussed later. The port number refers to a particular port on the local router.
• The table is looked up using two fields: a) the destination node id, which is contained in each CSI packet, and b) the virtual network on which the packet arrives at this port, which is also encoded in the packet.
• If the incoming packet has traveled on VN0 or VN1, then the table yields a set of 2-tuples - the CSI port# and the deadlock-free VN (VN0 or VN1) on which the outgoing packet can travel.
• If the incoming packet has traveled on VNA, then the table yields the set of output ports on which the outgoing packet can travel in VNA (subject to Rule 3, Rule 4, and Rule 5).
• In addition, an incoming packet on VN0 or VN1 can switch to the outgoing VNA ports allowed by the table entry for the destination node id (subject to Rule 2).
• Routing from a source port: The routing step at a source port is based only on the destination node id, since there is no explicit notion of virtual networks at this input port. It is up to the implementation to either exploit this aspect and simplify the routing table design, or to use the generalized form of the routing table (with duplicate entries) to keep the design uniform.
• Routing to a destination port: There is no explicit notion of virtual networks at a destination output port (typically, a CSI Protocol layer agent), so there is no special simplification of the routing table. Internal agents may want to keep the incoming transactions on separate virtual networks (VN0, VN1) if they so desire for design simplification, though such a separation is not necessary.

Figure 5-3. Abstract Structure of the Routing Table. [Figure: a Routing Table plus selection logic per input port, looked up by <input port#, {input VN}> (the input VN is implicit or optional, depending on the usage model); rows are destination NodeIds 0 through n-1, columns are the input virtual networks VN0, VN1, and VNA, and each entry is a set of <port#, VN> tuples.]

5.3.1 Router Table Simplifications

Number of virtual networks: CSI platforms may implement legal subsets of the virtual networks (Section 4.2, "Virtual Networks" on page 4-137): {VN0}, {VN0, VNA}, {VN0, VN1}. Such subsets simplify the size of the routing table (they reduce the number of columns in Figure 5-3), the associated virtual channel buffering, and the arbitration at the router switch. These simplifications come at the cost of platform flexibility and features (Section 2.5, "Profiles" on page 2-33). The designer also has to make sure that CSI components implementing different virtual network subsets are inter-operable (Section 4.6, "Packet Definition" on page 4-140).

Router table entries: VN0 and VN1 are deadlock-free networks which provide deadlock freedom either together or singly, depending on the usage model, usually with minimal virtual channel resources assigned to them. Routing adaptivity is usually provided with VNA. Hence the entries in the VN0 and VN1 columns could be significantly simplified to include only the port# (with an implied VNi, for some regular topologies such as meshes) or a single entry (for some topologies such as tori). Even when adaptivity is provided using VN0/VN1, the size of the list in each entry is small and fixed (typically 2). With the VNA column, alternative representations for the port#s could be used (e.g., a bit vector indexed by the port#), especially when the list in each entry is more than a few ports long.

Router table size and organization: A flat organization of the routing table requires a size corresponding to the maximum number of node ids permitted by each profile (8, 32, 1024).
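The lookup just described might be modeled as below; the structure layout, sizes, and the congestion-based selection function are assumptions for illustration, while the indexing by destination node id and input virtual network follows the text.

    #include <stdint.h>

    enum vn { VN0, VN1, VNA };

    struct tuple { uint8_t out_port; enum vn out_vn; };

    /* RF output: a set of candidate <port#, VN> tuples (typically small). */
    struct rt_entry { struct tuple choices[4]; uint8_t n; };

    #define MAX_NODES 32 /* e.g., the Small MP profile */

    /* One Route Table per input port, indexed by dest node id and input VN. */
    static struct rt_entry route_table[MAX_NODES][3];

    /* SF: pick one tuple from the set, here by least output-port congestion. */
    static struct tuple routing_step(uint8_t dest_node, enum vn in_vn,
                                     const uint8_t congestion[/* per port */])
    {
        const struct rt_entry *e = &route_table[dest_node][in_vn];
        struct tuple best = e->choices[0];
        for (uint8_t i = 1; i < e->n; i++)
            if (congestion[e->choices[i].out_port] < congestion[best.out_port])
                best = e->choices[i];
        return best;
    }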
With such an organization, the routing table is indexed by the destination node id field and possibly by the virtual network id field. The table organization can also be made hierarchical, with the destination node id field being sub-divided into multiple sub-fields, which is implementation dependent. For example, with a division into "local" and "non-local" parts, the "non-local" part of the routing is completed before the routing of the "local" part. The advantage of reducing the table size at every input port comes at the cost of being forced to assign node ids to CSI components in a hierarchical manner. The choice of a selection function is left to the implementation. Particular care needs to be taken to avoid livelocks with non-minimal routing algorithms.

5.4 Routing Algorithm

A routing algorithm defines the set of permissible paths from a source module to a destination module. A particular path from the source to the destination is a subset of the permissible paths and is obtained as a series of routing steps, as defined above, starting with the router at the source, passing through zero or more intermediate routers, and ending with the router at the destination. Note that even though the CSI fabric may have multiple physical paths from a source to a destination, the only paths permitted are those defined by the routing algorithm.

5.5 Routing at Source and Destination Agents

The Routing layer identifies each CSI agent solely by the node id field in the CSI packet for routing purposes. A CSI agent, identified by a unique node id, may have multiple sub-agents - for example, an agent may have the processor, the memory controller, and the home controller as sub-agents. At a destination node, it is up to the implementation to route internally to each of the sub-agents. For example, an implementation may route to a sub-agent based on the opcode of the transaction, the message class on which a message arrives, the address range, etc. Correspondingly, the internal routing of packets from the sub-agents to the Routing layer's virtual networks at the source is also implementation dependent. Further, the internal buffering of the packet, before it is placed in the virtual network at the source agent or after it is consumed from a virtual network at the destination agent, is left to the particular implementation. Essentially, the sub-agents are outside the purview of the Routing layer, though internal routing may use some of the information contained in the CSI packet.

5.6 Routing Broadcast of Snoops

It is possible for the Routing layer to perform the broadcast of snoop requests, although not for all topologies and router implementations. This is an optimization that, in some topologies, can save snoop message bandwidth (when there is more than one target caching agent and some targets share a link in their routes from the requestor). Broadcast is recognized by the Routing layer when broadcast is enabled and it receives a snoop request addressed to a reserved set of nodeIDs configured to be broadcast snoop targets. Snoops are sent to every node with a caching agent (see "caching agent address subdivision"), except for the requestor and possibly the home node. When the home node is not a target of a broadcast snoop, the home node generates the snoop to the caching agent(s) at the home node. In the requestor-alias broadcast case, snoops must be sent to the home agent; it is optional in the home-alias broadcast case.
Home-alias broadcast can reduce snoop traffic slightly by eliminating a separate snoop message to the home node (the home node would use the request message to generate a snoop message from the home), at the cost of possibly increasing snoop latency (since there is now an extra hop through the Routing layer for the snoop to the home). In cases where the home node socket is an intermediate node on a route to some caching agent, this saves no bandwidth. The decision as to which configuration to use depends on the system topology, as deadlock-free routing can result in highly imbalanced link bandwidth requirements in one configuration versus the other.

In broadcast configurations, it must also be possible to send targeted snoops to caching agents (e.g., when the home forwards a snoop request to a local caching agent, or when a directory indicates that a caching agent owns the line). Nodes that must receive targeted snoops cannot be in the reserved set of nodeIDs configured as broadcast targets. When a broadcast snoop target must also be a directed snoop target, it is necessary to use a nodeID which is not a home or caching agent to indicate the directed snoop. For example, this could be indicated by sending a snoop to another on-chip agent which cannot normally be a target of a snoop, like a configuration agent. The Routing layer must be capable of diverting snoop requests for such non-snoopable targets to the correct on-chip snoopable agents in this case. Agents that are under directory control should always receive directed snoops, so broadcast routes must be configured to avoid them. (Implementation Note: Tukwila uses the Ubox nodeID to indicate directed snoops to a socket, and IOH nodeIDs for directed snoops under directory control.)

Not all topologies are amenable to fabric-based broadcast. It is sufficient that either the topology is routable deadlock-free with a single virtual network, or that the fabric is capable of forwarding snoop messages on a different virtual network for each possible output. Any superset of a fully connected cube is amenable to snoop broadcast, but it may be the case that hot removal of a node, or partially populated systems, will require disabling of snoop broadcast. The allowed combinations of protocol options are illustrated in Table 5-1.

Key

• Config Parameters
  - Router B'cast: a single snoop message from the snooping agent is broadcast to all required caching agents by the routing agents
  - IOH Dir: requestors don't snoop I/O agents; a directory or snoop filter at the home directs instead
  - Local Snoop: requestors don't snoop caching agents in home nodes; home nodes spawn snoops to local caching agents when a request is received
• Config Requirements
  - CPU Snoop Targets: which nodeID(s) must be generated for coherent processor requests
  - IOH Snoop Targets: which nodeID(s) must be generated for coherent IOH requests
  - Home Snoop Targets: which nodeID(s) must be generated by the home agent for coherent requests (different targets will be used in different flows)
  - Broadcast Targets: which nodeID(s) must be configured in the router as the destination of snoops
• Snoop Target Definitions
  - Bcast(hm): target = home nodeID; the snoop is broadcast to all broadcast targets
  - Bcast(hm/req): target = home nodeID or requestorID; the snoop will be broadcast to all broadcast targets
  - lcl_cache: target = local caching agent (shares same nodeID)
  - none: no target, i.e., no snoop messages are sent
  - allCaches: a list or fanout tree consisting of all caching agents
  - nonDirCaches: a list or fanout tree consisting of all caching agents that are not under directory control
  - tgt: the destination nodeID is removed from the snoop list or fanout tree
  - req: the requestor nodeID is removed from the snoop list or fanout tree
  - hm: the home nodeID is removed from the snoop list
  - owner: the nodeID of the owner (or possible owner) as identified by a directory or snoop filter

Table 5-1. Combinations of Protocol Options

Router B'cast | IOH Dir | Local Snoop | CPU Snoop Target | IOH Snoop Target | Home Snoop Target | Broadcast Target
Y | Y | Y | Bcast(hm) | none | {lcl_cache - req}, owner, Bcast(hm) | nonDirCaches
Y | Y | N | Bcast(hm/req) | none | owner, Bcast(hm) | {nonDirCaches - tgt}
Y | N | Y | Bcast(hm) | Bcast(hm) | {lcl_cache - req} | {allCaches - tgt}
Y | N | N | Bcast(hm/req) | Bcast(hm) | - | allCaches
N | N | Y | {allCaches - hm} | {allCaches - hm} | {lcl_cache - req} | -
N | N | N | allCaches | allCaches | - | -

5.7 Usage Models

A variety of usage models are permitted within the scope defined by the Routing layer. The usage models can be classified into two main categories: a) flexible interconnect topologies, and b) flexible partition management. Example usage models and their needs from the Routing Layer perspective are shown in Table 5-2.

5.7.1 Flexible Interconnect Topologies

For a variety of direct connect topologies such as meshes, hypercubes, trees, and most indirect networks, only VN0 is needed for non-adaptive, minimal routing. For example, with meshes and hypercubes, this can be achieved with dimension-ordered routing. If VN0 has minimal buffer resources, then it is highly recommended that the platform also use VNA. In such a case, VNA can be used for adaptive routing while VN0 can be used for deadlock-free routing. The Route Table entries for such topologies are simple, since the routing function is of the form RF1 (see Section 5.3). Further adaptivity with such networks is permitted if VN1 is added. The Route Tables are then more complex (see Table 5-2), since the routing function is of the form RF2 (see Section 5.3).

Ring-based topologies such as tori require both VN0 and VN1 for deadlock-free routing. Further, it is highly recommended that the platform also use VNA, since VN0 and VN1 are expected to have minimal buffer resources. With VNA, adaptive routing is permissible in such topologies, but only along VNA (see Table 5-2).

It is possible that regular topologies such as the above become "fractured" because the platform is not fully populated to begin with or becomes depopulated later. Such partial population is done for system manageability and flexibility, usually at the FRU (Field Replaceable Unit) granularity. The resulting restricted topologies do not impose the need for additional virtual networks or any additional resources to achieve deadlock-free routing. With partially populated FRUs, care has to be taken to make the underlying topology connected and built in such a manner that there are no performance bottlenecks that arise with the deadlock-free routing algorithm used. With FRU depopulation, it is assumed that the performance degradation that could arise with the restricted topology is tolerable.
Table 5-2. Routing Layer Needs for Different Usage Models

Usage Model | Needs | Comments
Minimal DF, non-adaptive routing for mesh-based (meshes and cubes) topologies | VN0 | Simple route table (just port# specified in the Routing Table entry). With VNA, adaptive routing only along VNA permitted.
Minimal DF, non-adaptive routing for ring-based networks | VN0, VN1 | Routing Table entry needs to specify port# and VNi. With VNA, adaptive routing only along VNA permitted.
Adaptive, minimal DF routing for mesh-based networks | VN0, VN1 for mesh-based | Routing Table entry needs to specify multiple port#s (and VNs). With VNA, adaptive routing along VN0, VN1, and VNA permitted.
Adaptive, minimal DF routing for ring-based networks | VN0, VN1, VNA for ring-based | Routing Table entry for VNi has a single port and VN. With VNA, adaptive routing only along VNA permitted.
Multi-partition management without quiescing | VN0, VN1 | Primary routing tables use VN0 and alternate routing tables use VN1. CSR to specify which VN is being used at each source port. With VNA, adaptive routing permitted. Details of the scheme in the Dynamic Reconfig chapter.
FRU / Socket depopulation | No special needs | Resulting fractured topologies could result in non-minimal routing and poor performance.

5.7.2 Flexible Partition Management

This usage model is permitted since CSI has two virtual networks (for deadlock-free routing). If only one such network is used, then the other can be used to keep a partition running even when another partition, with which it shares the routing interconnect, needs to be quiesced. The details are explained in Section 14.7.3, "Flexible Option" on page 14-422.

5.8 CSI Components' Compatibility

As explained in Section 5.7, a variety of usage models are permitted within the scope defined by the Routing Layer. Care has to be taken, however, when two components both implementing CSI interface with each other. It is possible that one component implements a particular usage model allowed by the Routing Layer while the other component implements another usage model with the same resources (see Table 5-2). If two components interface through CSI and have different routing and Link layer resources, their capability to interface is defined by the component having the least resources. This is illustrated in Table 5-3 ("X" means that the combination is permitted). It is the responsibility of each CSI component to make sure that appropriate usage models and modes are defined for compatibility. A set of CSRs needed to enable such interfacing has been specified in the table in Section 5.9 - this may not be a complete set, however. It is expected that the component specification will contain the usage model enabling details.

Table 5-3. Interfacing CSI Components with Different VNs

Component A's VNs \ Component B's VNs | VN0 | VN0, VNA | VN0, VN1 | VN0, VN1, VNA
VN0 | X | X | B does not use VN1 (*) | B does not use VN1 (*)
VN0, VNA | X | X | B does not use VN1 (*) | B does not use VN1 (*)
VN0, VN1 | A does not use VN1 (*) | A does not use VN1 (*) | X | X
VN0, VN1, VNA | A does not use VN1 (*) | A does not use VN1 (*) | X | X

(*) VNA can be used - refer to the Link Layer section for interfacing such components. An exception applies to "Leaf" components - see Rule 9.

5.9 Configuration Space and Associated Registers

The routing tables and the associated control and status registers (CSRs), which reside in the protected configuration space, are accessed through CSI's NcRd and NcWrPtl (Non-coherent Read and Non-coherent Partial Write) transactions. This section provides a list of all the configuration space registers for the Routing layer. Since the exact Route Table organization is component specific, and since the CSR assignments are either component or platform specific, this section will only list the CSRs related to the Routing layer from a functional perspective. The Component Specifications will provide additional details.
Table 5-4. CSI Control and Status Registers Needed by the Routing Layer

Configuration Space CSR Name(s) | Function
Routing Table Entries for each crossbar port | Encodes the routing algorithm
VNs Capabilities | Presence of VN0, VN1, VNA; buffer sizes
VNs Usage | VN1 usage for flexible routing or adaptive routing
CSR to specify which VN (0 or 1) is being used at each source agent | Needed for flexible partition management, to specify whether the primary RT or the secondary RT is in use
Route Tables Programmed? | Bit indicating whether the Route Tables in the component have been programmed
Link Initialization Complete? | Used by the SBSP/PBSP before the RT can be set up on that component
Accesses to Firmware Hub Complete? | Used by the SBSP/PBSP before the RT can be set up on that component

5.10 Routing Packets Before Routing Table Setup

After a system hard reset, as the system comes out of reset and goes about initializing CSI links and components, initial routing needs to be accomplished without the aid of the routing tables (e.g., a path needs to be established to the firmware agent). The routing of such CSI packets is described in Section 12.5.1, "Routing of Firmware Accesses" on page 12-375.

5.11 Routing Table Setup after System Reset/Bootup

At system boot, it is the responsibility of the system service processor (SSP) or the firmware to program the routing tables at each component in the platform. It is up to the platform to decide which option to choose.

With the SSP option, the routing tables are programmed using the SSP's network. The SSP accesses the CSI-visible configuration agent (usually on a CSI processor component) to perform the actual CSI configuration accesses for updating the routing tables. This option is not described any further, since it is under the purview of the SSP.

With the firmware option, several sub-options are possible. For simple topologies, the firmware can discover the topology and program the routing tables. With topologies which have a firmware agent attached to each processor component, the routing tables could be stored at each hub, and the firmware can load the tables for the processor and its associated components. For other, complex topologies which do not have firmware attached to each processor component, the programming of the routing tables is more involved. The rest of this section describes the programming of the routing tables for this option. The following assumptions are made:

• A unique system boot strap processor (SBSP) must be identified in a simple manner (see Section 12.7, "System BSP Determination" on page 12-378) - otherwise, except in simple platform topologies, a race to choose the SBSP among all processor components in the system could result in interconnect deadlocks.
• At least one firmware agent is available in the system which is no more than "one CSI hop" away from the SBSP.
• When link initialization is complete, all the CSI agents in the platform can be uniquely identified through node ids. The completion of the link initialization is indicated at each component through the setting of a CSR.

An algorithm for programming the routing tables by the SBSP is described below; alternative algorithms are feasible. The algorithm gets simpler if it is assumed that the firmware also knows the breadth-first order (see the explanation below) in which the components' Route Tables are to be loaded. (Note: Something of this nature needs to be described in the platform specification for each platform. It is being described here to identify all the needed CSRs for each component. This section may eventually get moved out, leaving just the skeleton or a pointer to the platform spec here.)
Figure 5-4. Illustrating Firmware Hub Connectivity Options. [Figure: CPU Node1, Node2, and Node3 together with IOH1 and IOH2 on a CSI interconnection network; Node1 and Node3 each have a CSI link to a FWH, Node2 does not. FWH: Firmware Hub; IOH: IO Hub.]

1. On components with a firmware hub (FWH), once link initialization is complete, the code within SAL/BIOS firmware sets up the Route Table entry to the FWH and executes some initialization code. Any processor which is not the SBSP then halts waiting for a signal from the SBSP to resume. In the example system shown in Figure 5-4, this step would be done by Node1 and by Node3 but not by Node2 since Node2, being not connected to the FWH, is still held in a halt state. Requirement: A CSR state bit to indicate whether a Node has completed its link initialization.

2. The SBSP next proceeds with the final Route Table setup for the platform. The other nodes in the system may or may not have performed their link initialization by this time frame. In order to minimize premature probes into neighbors' CSRs, the SBSP could implement a wait that is platform dependent. This wait time is safely bounded by the term ((Maximum links per node * Maximum link initialization time) + Maximum skew in time arising from various sources since Reset).

3. The Route Tables for the platform topology are assumed to be present in a platform dependent resource such as the firmware, NVM, etc. (It is also assumed that if the integrity of the table contents in the platform resource is suspect, then the SBSP is capable of performing a firmware recovery operation.) In addition, for each topology supported, the firmware could specify the order in which the Route Tables have to be loaded among the CSI components in the platform and it could also specify the first Route Table entry that needs to be programmed at each link in each component so that the response transaction can route itself back to the source (SBSP). Alternatively, the firmware could, in a topology independent manner, determine the breadth-first order in which to program the CSI components' Route Table entries. The breadth-first nature of the programming is important to ensure that a transaction can route itself back to the source (SBSP). This is illustrated in Figure 5-5. The SBSP first programs its own Route Tables to be able to reach potentially all components (CSI node ids) in the system.

4. The SBSP then programs the Route Tables for each CSI component (including IOHs, external memory controllers, etc.) which is "1 CSI link" away from it. It then programs CSI components which are "2 CSI links" away from it. The procedure is repeated till all the CSI components' Route Tables are programmed (see Figure 5-5). Consider the case where the SBSP has completed programming all components at distance i (the iteration starts with i=0, where it programs its own Route Tables in Step 3 above). Assume it has to reach a component Ci+1 from Ci using link l. The SBSP uses the Routing layer which is now functional at all components till distance i. The first Route Table entry it programs in the Route Tables in Ci+1 is the one at link l corresponding to the entry for Ci and VN0, so that a response is guaranteed to return to Ci, and, consequently, to the SBSP.
Having established a path to Ci+1, the SBSP first ascertains a) that the link initialization for Ci+1 is complete, by testing the appropriate CSR, and b) that Ci+1's AP (in case it is connected to the FWH) has completed its initialization, by testing the appropriate CSR. Once it has ascertained that both these flags are set, the SBSP proceeds with completing the Route Tables at Ci+1. Requirement: The CSI hardware must support the atomic write of an individual VN0 Route Table entry and shall not require the acquisition of any semaphores. In a similar fashion, a remote node's Route Table may be programmed by both the SBSP and the local node (the same or different Route Table entries). These must succeed so long as the granularity of the write - one Route Table entry for VN0 - does not exceed the granularity defined by the NcWrPtl transaction (4 bytes). Requirement: The Route Table entry written by NcWrPtl shall be used in routing the response to the NcWrPtl transaction.

5. After completing the Route Table setup for each node id in the system, the SBSP can now set a flag in each CSI component to indicate that the Route Tables are set up. From the Routing Layer's perspective, the system is then ready for normal operation. (Please see the Reset/Init chapter to see the follow-on actions for components that may still need to execute reset firmware, etc.) Requirement: A CSR in the system interface logic for the above flag (Route Tables Programmed). A pseudocode sketch of this breadth-first procedure follows Figure 5-5.

Figure 5-5. Route Table Setup Using Breadth-First Order. [Figure: the SBSP, with FWH and IOH attached, at the left; successive columns of CSI components at distance 1, distance 2, and so on out to distance d, the diameter of the network, connected by CSI links. FWH: Firmware Hub; IOH: IO Hub.]
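The sketch below renders the breadth-first programming of Section 5.11 in C. It is a minimal model, not the spec's implementation: the component list, per-node CSR helpers (link_init_complete, write_rt_entry, and the rest), and field names are all assumptions made for illustration; the firmware-supplied array comp[] is assumed to be in breadth-first order with the SBSP at index 0.

    #include <stdbool.h>

    /* One entry per component, in firmware-supplied breadth-first order. */
    struct component {
        int link_to_upstream;   /* link l leading back toward the SBSP  */
        int upstream_nid;       /* Ci: the already-programmed neighbor  */
    };

    extern bool link_init_complete(int nid);   /* polls the per-node CSR */
    extern void write_rt_entry(int nid, int link, int dest_nid, int vn);
    extern void program_remaining_entries(int nid);
    extern void set_route_tables_programmed(int nid); /* Step 5 flag    */

    static void program_route_tables(const struct component comp[], int n)
    {
        program_remaining_entries(0);          /* Step 3: SBSP's own RT  */

        for (int i = 1; i < n; i++) {          /* breadth-first order    */
            while (!link_init_complete(i))     /* platform-bounded wait  */
                ;
            /* First entry written in Ci+1: route back toward Ci on VN0
             * so responses to later CSR writes can reach the SBSP.     */
            write_rt_entry(i, comp[i].link_to_upstream,
                           comp[i].upstream_nid, /* VN0 */ 0);
            program_remaining_entries(i);
        }

        for (int i = 0; i < n; i++)            /* Step 5: per-node flags */
            set_route_tables_programmed(i);
    }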
5.12 Route Table Setup after Partition Reset

5.12.1 Single Partition The procedure for setting up the Route Tables in the components belonging to a partition is similar to the Route Table setup at system reset. They could be set up by the SSP using its network to access the configuration agents at each CSI processor agent of the partition. Alternatively, if the Route Tables are set up by protected firmware running on a processor core, it is recommended that a partition BSP (PBSP) do this function using the procedures outlined in Section 5.11.

5.12.2 Partition with Route Through Components Since the Routing layer is shared, it is possible that a partition reset will affect other partitions (for the definition of affected partitions, see Section 14.7, "Multi-Partition Management with Shared Interconnect" on page 14-420). It is assumed that this determination of the affected partitions can be made a priori. In such a case, all the Route Tables for all components belonging to the affected partitions will have to be programmed by the PBSP or the SSP. While the PBSP is probing the status of a CSI component, using the procedure outlined in Section 5.11, it needs to test the status of a CSR to check if the component's Route Table is to be programmed or not. Requirement: A state variable to indicate if the Route Tables in the component are to be programmed or not.

5.13 Implementation Notes
• The specification assumes that the routing table is always correct - i.e., the table is looked up with a correct set of inputs (node id, etc.) and the resulting output entry in the routing table (port#, VN) is valid. This may not be true because of errors, for example. It is assumed that the implementation has mechanisms to deal with invalid lookups and invalid outputs through appropriate error detection and/or error correction - for example, if errors are uncorrectable, the packet is bit bucketed; if the input port is a source port, then the agent is informed that there was a correctable or uncorrectable error. If the input port is connected to an external CSI link, then the Link layer credits should still be returned - this is a placeholder until we determine where such information belongs, perhaps in the Error Handling chapter.

5.14 Open Issues
• The router broadcast section needs to be cleaned up further - currently, it is implementation dependent, with references to the IOH and the Tukwila processor.

The CSI Protocol layer governs the behaviors of protocol agents and the messaging interface between the various protocol agents. A protocol agent is a proxy for some entity which injects, generates, or services CSI transactions, such as memory requests, interrupts, etc. There are several types of protocol agents, each dealing with a chunk of protocol flows. Any CSI component may be described as a composite of protocol agent types. This chapter introduces some of the fundamental concepts & terminology within the CSI Protocol layer. This chapter also covers global protocol information - protocol constraints which span the individual protocol agents. The individual behaviors of each protocol agent are described in the various protocol chapters. These chapters describe the low-level messaging interface, as well as the high-level usage models (in some cases, the chapters directly reference the CSI Link Layer encodings of specific messages). The CSI Protocol is subdivided into two classes, the Coherent Protocol and the NonCoherent Protocol. The Coherent Protocol (Chapter 8, "CSI Cache Coherence Protocol") describes the behavior of caching agents (proxies for processors or I/O devices which read & write cache coherent memory) and home agents (protocol engines which order reads & writes to a piece of coherent memory). All other protocol operations are considered part of the NonCoherent Protocol, which covers a wide range of topics, including configuration, I/O, interrupts, non-coherent memory, security, etc. Chapter 9, "Non-Coherent Protocol," is the starting point for all non-coherent operations, with separate chapters referenced for the major individual topics. The Address Mapping chapter (Chapter 7, "Address Decode") is a special topic crossing all protocol agent types. It provides rules & guidance for how accesses from processors or I/O devices may be mapped to CSI transactions targeting CSI protocol agents, depending on the address.

6.1 Protocol Messages Protocol agents communicate with one another via messages. At the Protocol layer, messages are a collection of fields, and fields contain values or symbols. Protocol level messages do not carry packetizing or encoding information (this is added by the Link Layer). Therefore, CSI protocol agents may communicate with each other over any medium which correctly delivers the content of the protocol messages between the agents. Each message is given a label (for example, RdCode). When used in the text, this label represents the message in its entirety (including the other populated fields).

6.2 Protocol Agents Agent types are a way of classifying protocol flows.
An agent is referenced by its NID (Node ID) as per these rules:
• A NID may represent multiple agent types, but in this case:
  — There may be only one agent of each type behind the NID.
  — The component must be able to distinguish between the agents based on the data within the incoming message.
• An agent must be contained within a single NID.
• A single component may be multiple NIDs.
Table 6-1 lists the protocol agent types. For most agent types, there is a separate source & target.

Table 6-1. Protocol Agent Types

  Agent Type                         Description                                                          Reference
  Home                               Orders read & write requests to a piece of coherent memory          Coherent Protocol
  Caching                            Makes read & write requests to coherent memory, services snoops     Coherent Protocol
  Non-Coherent Source & Target       Sources and sinks transactions to NonCoherent memory or MMIO        NonCoherent Protocol
  Config Source & Target             Sources and sinks configuration messages                            NonCoherent Protocol; System Management; Dynamic Reconfiguration
  Power Management Source & Target   Sources and sinks power management information transmitted over CSI NonCoherent Protocol; Power Management
  Synchronization Source & Target    Agents which initiate a quiescence and the agents which are quiesced NonCoherent Protocol; Dynamic Reconfiguration
  Non-Coherent Msg Source & Target   Sources & destinations of NcMsg class of messages                   NonCoherent Protocol
  Lock Source & Target               Initiator and target of IA-32 bus lock flows                        NonCoherent Protocol
  Interrupt Source & Target          Interrupts                                                          NonCoherent Protocol; Interrupt and Related Transactions
  Legacy I/O Source & Target         Transactions to IA-32 legacy I/O space                              NonCoherent Protocol
  Isoch Source & Target              Covers Isochronous and QOS                                          NonCoherent Protocol; Quality of Service and Isochronous
  Security Source & Target           Covers LT & Security flows                                          NonCoherent Protocol; Security

Apart from highlighting the messages that go in and out, these split types are provided for cases in which the NID that is the source agent is not the NID that is the target agent.

6.3 Transaction IDs Transaction IDs (TIDs) are labels on a particular transaction leaving an agent. Each message in CSI has a UTID (global unique transaction ID), which is constructed as the concatenation of home NID, requestor NID, and requestor TID (home here refers to the NID which guards the slice of the memory space being requested, whether DRAM or MMIO). There are special rules which govern TID assignment when the target NID of a read or write request maps to multiple agent types:
• A single NID may represent a home agent, noncoherent target agent, and isoch target agent.
• When this is the case, the TID pool is shared amongst all transactions from a given requestor NID.
• At configuration time a requester will be given the maximum number of requests it is allowed to issue to a home node (the parameter is MaxRequest).
• At configuration time a requester / home node pair will (or will not) enable the ICS / IDS message classes.
  — If enabled, the parameter ICSRequest is set to the number of transactions reserved for the ICS message class.
  — Default value of ICSRequest is 0x00.
• Requests destined to any home node will assign from the available RTIDs. The valid RTID values are 0x00 through (MaxRequest - 1), inclusive.
• Per requester / home node pair, the sum of all currently active transactions initiated via the HOM / NCS / NCB / ICS message classes must be less than or equal to MaxRequest.
• Maximum number of Coherent and NonCoherent requests between the requester / home node pair is as follows:
  — Maximum = (MaxRequest - ICSRequest)
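The sketch below models the UTID construction and the TID budgeting rules of Section 6.3. The field widths and all struct and function names are assumptions made for illustration; the spec fixes only the concatenation order (home NID, requestor NID, requestor TID) and the MaxRequest / ICSRequest accounting above.

    #include <stdint.h>

    #define NID_BITS 10           /* assumed widths, not spec-mandated   */
    #define TID_BITS 8

    /* UTID = home NID : requestor NID : requestor TID */
    static uint32_t make_utid(uint32_t home_nid, uint32_t req_nid,
                              uint32_t tid)
    {
        return (home_nid << (NID_BITS + TID_BITS)) |
               (req_nid << TID_BITS) | tid;
    }

    struct req_home_pair {
        uint32_t max_request;     /* set at configuration time           */
        uint32_t ics_request;     /* reserved for ICS; default 0x00      */
        uint32_t active_ics;      /* outstanding ICS transactions        */
        uint32_t active_other;    /* outstanding HOM / NCS / NCB         */
    };

    static int may_issue(const struct req_home_pair *p, int is_ics)
    {
        /* Sum over HOM/NCS/NCB/ICS must stay <= MaxRequest ...         */
        if (p->active_ics + p->active_other >= p->max_request)
            return 0;
        /* ... and coherent + noncoherent requests are further capped
         * at (MaxRequest - ICSRequest); valid RTIDs: 0..MaxRequest-1.  */
        if (!is_ics && p->active_other >= p->max_request - p->ics_request)
            return 0;
        return 1;
    }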
6.4 Open Issues This chapter will be expanded in subsequent revisions to better tie together the various Protocol chapters. In particular, this chapter will be the repository for:
• Agent types with cross-references to the relevant chapters
• Rules governing transaction ID assignment
• Rules governing node ID assignment
• Dependency rules across protocol channels

7.1 CSI Addressing Model The CSI addressing model describes the mechanism of mapping accesses generated at any CSI agent to CSI transactions. This involves classification of accesses into various categories based on the properties of the address location being accessed and the attributes of the access. The addressing model is flexible enough to accommodate the existing firmware and operating system view of the system address space, which may expect address regions with certain properties in a system partition. Apart from supporting existing operating systems and application software, the CSI addressing model also enables advanced features to enable new usage models. This includes support for partitioned systems with various partitioning models and shared memory between partitions.

7.1.1 Types of Addresses
Virtual Address: This is the address used by the applications, device drivers, and devices (if the I/O agents support paging).
Physical Address: This is the operating system's view of the address space in a partition. It is obtained by translating a virtual address through the operating system page translation mechanism. This is also the address used by the cache coherency mechanism, which puts certain requirements on the mapping of coherent shared address space within and across partitions.
System Address: The system address is represented by the physical address and the target (home) node identifier, which points to a unique device address in a system. The addressing model allows the same physical address from different source agents to map to different system addresses (e.g., private firmware space per processor agent) or to the same system address (e.g., shared memory space in a partition or across partitions), irrespective of partition boundaries. The system address also includes the scope of hardware cache coherency. For example, a system may have identical physical memory addresses in different partitions, but with different home nodes and different scopes of coherency, and therefore distinct system addresses. Also note that in the source broadcast based cache coherency scheme, the home node identifier does not play a role in specifying the scope of coherency.
Device Address: This is the address generated by the target node of a CSI transaction to access the physical memory or device location. This address is used on the I/O buses or on the memory interface. It may be the same as the physical address part of the system address, or it may be translated through some (optional) mapping mechanism at the target.
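The following is a minimal sketch of how the address types of Section 7.1.1 relate; all type and helper names are assumptions for illustration, and only the virtual-to-physical step belongs to the operating system rather than to CSI.

    #include <stdint.h>

    typedef uint64_t virt_addr_t;    /* application view (paged)            */
    typedef uint64_t phys_addr_t;    /* OS view within a partition          */
    typedef struct {
        uint16_t    home_nid;        /* target (home) node identifier       */
        phys_addr_t addr;            /* address carried in the transaction  */
    } sys_addr_t;                    /* system address = NID + address      */
    typedef uint64_t dev_addr_t;     /* DRAM/MMIO-local address at target   */

    /* Virtual -> physical: the OS page-table walk (outside CSI).          */
    extern phys_addr_t page_translate(virt_addr_t va);

    /* Physical -> system: the source decoder picks the home node.         */
    extern uint16_t source_decode_nid(phys_addr_t pa);

    static sys_addr_t to_system(phys_addr_t pa)
    {
        sys_addr_t sa = { .home_nid = source_decode_nid(pa), .addr = pa };
        return sa;
    }

    /* System -> device: the (optional) target decoder at the home node.   */
    extern dev_addr_t target_decode(phys_addr_t pa);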
7.1.2 Addressing Mechanism Figure 7-1 shows a generic view of the system and the interfaces that use the types of addresses described above.

Figure 7-1. View of Types of Addresses in the System. [Figure: the processor agent translates virtual to physical addresses; source decoders at the processor and I/O agents translate physical to system addresses across the CSI network fabric; target decoders at the memory and I/O agents translate physical to device addresses.]

• Source Decoder: The source decoder takes the physical address and the request type as input and determines the target (home) agent for the CSI transaction. It also determines the transaction type and attributes of an access, which may override page attribute or I/O interface hints in some cases. The source decoder supports interleaving of an address region across multiple CSI target agents. Optionally, for coherent memory regions, the source decoder can specify the scope of coherency on a per region basis. The source decoder does not map destination nodes to CSI ports for routing a transaction.
• Target Decoder: The target decoder maps the system address to a device address. The target decoder works in conjunction with the source decoder to set the interleaving policy. For coherent memory regions, the target decoder can also specify the scope of coherency on a per region basis. The target decoder may not have any memory or I/O technology dependent parts such as channel, device, row or column addresses, I/O bus, etc., but an implementation may combine such functions within the target decoder for simplicity or performance reasons.
Processor agents are typically the source of a transaction. The source decoder is used to determine the CSI request type and the target node for the transaction. Processor agents may also be the target for some transactions, such as interrupts. If there are multiple interrupt targets within a processor agent, then some type of target decoder functionality may be needed to direct interrupts to specific interrupt targets, or interrupts could be broadcast to all the interrupt targets within the processor agent. The interrupt delivery mechanism for interrupt targets within a processor agent is implementation specific.
Memory agents are the target for memory transactions. Depending on the size of the physical address space supported by the system and the interleaving scheme used to map devices at each memory agent into the physical address space, a target decoder may be provided at the memory agent to map physical addresses to device addresses. This mapping may not be necessary if the memory interface is capable of handling the entire physical address range. Memory agents that support some memory reliability features, such as mirroring, may also act as the source of some transactions. In such cases the memory agent may have additional support, provided by its target decoder, to determine the companion memory agent node identifiers, but it does not require the functionality of the source decoder as described in this specification. I/O agents are typically sources as well as targets of transactions. For the transactions generated by an I/O agent, it uses the source decoder to determine the CSI request type and the target node identifier. For transactions generated by requests from devices that do not support the complete physical address range, the I/O agent may provide additional mapping functions to enable these devices to access the complete physical address space (platform and implementation specific). For the transactions targeted to the I/O agent, a target decoder may be used to map physical addresses to device addresses.
This mapping may be used either to interface with devices that support a smaller address space than the physical address space, and also for the purpose of interleaving an address region between multiple I/O interfaces handled by a single I/O agent. Note that CSI agents may implement only a subset of the functionality of the source and target decoder. The appropriate subset is platform dependent. For example, a platform with a single memory agent and a single I/O agent need not implement source decoder functionality to interleave a region among multiple targets. Also, memory agents may not need target decoder functionality if the platform does not support a physical address size beyond the address size supported on the memory interface and the platform has only one memory agent, therefore no interleaving.

7.1.3 Classification of Address Regions The attributes of address regions determine their properties and the types of CSI transactions used to perform the operation.
• Non-coherent Memory: This indicates a memory region that is not kept coherent by the CSI hardware cache coherency mechanism. Accesses to these regions use non-coherent CSI transactions, such as NcRd or NcWr. CSI agents should avoid putting accesses to these regions into caches. Cacheable accesses to these regions may cause a fault, depending on the platform behavior. If locations from this memory region are put into caches, then software should take responsibility for maintaining (single or multi agent) cache coherency. In a typical system, all agents in a partition should map a physical address in these regions to the same target node; however, a system may map a given physical address to different targets from different sources and create private non-coherent memory regions (e.g., for a private firmware region in local memory). CSI memory agents are targets of accesses to regions with this attribute. These address regions are side-effect free, and CSI agents can make speculative accesses to any address in these regions.
• LT Configuration: This indicates a region that is used to access LaGrande Technology (LT) specific configuration registers in a system. This region is not kept coherent by the CSI hardware cache coherency mechanism. Accesses to these regions use non-coherent CSI transactions, such as NcLTRd, NcLTWr, NcRd, or NcWr. CSI agents must not put accesses to these regions into caches. Cacheable accesses to these regions may cause a fault, depending on the platform behavior. All agents in a system partition must map a physical address in these regions to the same target node. CSI configuration agents are targets of accesses to regions with this attribute. Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any location in these regions unless the location is known to be side-effect free.
• Coherent Shared Memory: This indicates a memory region that is kept coherent by the CSI hardware cache coherency mechanism with respect to all caches in a coherency domain. Accesses to these regions use coherent CSI transactions. Addresses from these regions can be put into caches. A CSI agent may access these regions with non-coherent transactions based on some access attributes, e.g., based on attributes of PCI Express transactions on an I/O agent; however, in such cases the agents rely on the software to maintain coherency between the different agents accessing the memory location.
All source agents in a coherency domain accessing the same location in a coherent shared memory region must map it to the same target. CSI directory (home) agents are targets of accesses to regions with this attribute. These address regions are side-effect free, and CSI agents can make speculative accesses to any address in these regions.
• Memory Mapped I/O: This indicates regions that map to locations on I/O devices and are accessible through the same address space as main memory. Accesses to these regions use non-coherent CSI transactions, such as NcRd, NcWr, NcP2PS, and NcP2PB. Cacheable accesses to these regions should be avoided, and such accesses may cause a fault, depending on the platform settings (there may be exceptions to this rule, e.g., cacheable accesses to firmware regions from flash devices). All source agents in a system partition may not be consistent in terms of the target of a given address in this region; e.g., firmware regions may be pointing to different targets at different sources. CSI I/O or firmware agents are targets of accesses to regions with this attribute. Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any location in these regions unless the location is known to be side-effect free (such as firmware accesses to the flash device).
• I/O Port: This indicates regions that are accessible through the I/O port address space. A system may also use part of its memory space to embed the I/O port address space. Accesses to these regions use NcIORd or NcIOWr CSI transactions. Cacheable accesses to these regions are not allowed and may cause a fault, depending on the platform settings. All source agents in a partition are expected to have an identical mapping of regions with this attribute in terms of the address range and the target. An agent that may never generate these accesses may not map these regions. CSI I/O agents are usually the targets of accesses to regions with this attribute (there may be exceptions, such as I/O port accesses to 0x0CF8 and 0x0CFC). Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any address in these regions.
• I/O Configuration: This indicates regions that are accessible through the configuration address space. A system may also use part of its memory space to embed the configuration address space. Accesses to these regions use NcCfgRd or NcCfgWr CSI transactions. Cacheable accesses to these regions are not allowed and may cause a fault, depending on the platform settings. All source agents in a partition are expected to have an identical mapping of regions with this attribute in terms of the address range and the target. An agent that may never generate these accesses may not map these regions. CSI I/O or memory agents are targets of accesses to regions with this attribute. Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any address in these regions.
• CSI Configuration: This indicates a memory mapped region that is used to access CSI specific configuration and status registers. Accesses to these regions use NcRd, NcWrPtl, or NcWr CSI transactions. Cacheable accesses to these regions are not allowed and may cause a fault, depending on the platform settings. All source agents in the system (all partitions) are expected to have an identical mapping of regions with this attribute in terms of the address range and the target.
An agent that may never generate these accesses may not map these regions. CSI configuration agents are targets of accesses to regions with this attribute. Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any address in these regions. Also, a Cmp response to an NcWrPtl or NcWr transaction in this region indicates completion of a CSI CSR write, which is different than a Cmp response to NcWrPtl or NcWr transactions in memory mapped I/O regions, where it indicates global observation and may not indicate completion of the access.
• Interrupt and Special Operations: This indicates address regions that are used to perform miscellaneous system functions, such as interrupt delivery and other special operations. All source agents in a partition are expected to have an identical mapping of regions with this attribute in terms of the address range and the target. An agent that may never generate these accesses may not map these regions. CSI processor, I/O, or configuration agents are targets, depending on the type of operation. Accesses to these address regions may have side-effects, and CSI agents must not make speculative accesses to any address in these regions.
CSI does not provide or require a hardware mechanism to ensure that all source agents in a system partition accessing the same memory location have a consistent classification of their attributes and target; it is the responsibility of system software (firmware or system management software) to maintain consistency on the attributes of a region at different sources, based on the usage model and the types of accesses the CSI agents can generate.
The characteristics of the CSI region types are summarized in Table 7-1. It indicates the possible source and target CSI agents that can initiate accesses to each region. Note that an implementation of a CSI agent may not generate any accesses to a particular region type, even though it is allowed to do so. It also indicates the allowed request length, whether speculative accesses are allowed, and what a Cmp response means for writes to address locations of a particular region type. Global observation means that the effect of the write is observable by any subsequent read, but the write may not have yet reached its final target and updated the indicated address location with the new value. Completion means that the write has reached its final target and updated the intended address location with the new value.

Table 7-1. Characteristics of CSI Address Regions

  Region Type            Source Agent                Target Agent      Data Length            Speculation   Cmp Response on Writes
  Non-Coherent Memory    Processor, I/O,             Memory            1 byte to cache line   Allowed       Global Observation
                         Configuration
  Coherent Shared        Processor, I/O              Memory            cache line             Allowed       Home agent has cache
  Memory                                                                                                    line ownership
  LT Configuration       Processor                   Configuration     1 to 4 bytes           Not Allowed   Completion
  Memory Mapped I/O      Processor, I/O,             I/O, Firmware     1 byte to cache line   Not Allowed   Global Observation
                         Configuration
  I/O Port               Processor, I/O,             I/O               1 to 4 bytes           Not Allowed   Completion
                         Configuration
  Interrupt and          Processor, I/O,             Processor, I/O    Operation dependent    Not Allowed   Operation dependent
  Special Operations     Configuration
  CSI Configuration      Processor, Configuration    Configuration     1 to 4 bytes           Not Allowed   Completion
  I/O Configuration      Processor, I/O,             I/O, Memory       1 to 4 bytes           Not Allowed   Completion
                         Configuration

Support for particular CSI region attributes is platform dependent.
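Table 7-1 lends itself to a data-driven representation; the sketch below renders its speculation and Cmp-semantics columns as a lookup structure. The enum and field names are illustrative only; the row values come from the table above.

    enum region_type { NONCOH_MEM, COH_SHARED, LT_CFG, MMIO, IO_PORT,
                       INT_SPECIAL, CSI_CFG, IO_CFG };
    enum cmp_semantics { CMP_GLOBAL_OBS, CMP_COMPLETION,
                         CMP_HOME_OWNERSHIP, CMP_OP_DEPENDENT };

    struct region_chars {
        enum region_type   type;
        int                speculation_allowed;  /* side-effect free?   */
        enum cmp_semantics cmp_on_write;
    };

    static const struct region_chars region_table[] = {
        { NONCOH_MEM,  1, CMP_GLOBAL_OBS },      /* 1 byte to cache line */
        { COH_SHARED,  1, CMP_HOME_OWNERSHIP },  /* cache line           */
        { LT_CFG,      0, CMP_COMPLETION },      /* 1 to 4 bytes         */
        { MMIO,        0, CMP_GLOBAL_OBS },      /* 1 byte to cache line */
        { IO_PORT,     0, CMP_COMPLETION },      /* 1 to 4 bytes         */
        { INT_SPECIAL, 0, CMP_OP_DEPENDENT },    /* operation dependent  */
        { CSI_CFG,     0, CMP_COMPLETION },      /* 1 to 4 bytes         */
        { IO_CFG,      0, CMP_COMPLETION },      /* 1 to 4 bytes         */
    };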
All platforms must support coherent shared memory, memory mapped I/O, I/O port, configuration, and interrupt and special operations regions. Support for non-coherent memory regions is optional. Platforms that do not use memory mapped address spaces to support I/O port, configuration, and interrupt and special operations regions may not explicitly implement these region attributes.

7.1.4 Relationship Between Memory Attribute, Region Attribute and CSI Transactions System firmware and operating system software have different views of, and mechanisms to control, the properties of address regions in the system. System firmware uses the CSI address decoding mechanism for this purpose, but the address decoding mechanism is not directly visible to the operating system. The operating system uses the page table to specify the properties of address regions, but the page table mechanism does not directly affect the address decoder. Systems based on the CSI interface make certain assumptions about the consistency of address region properties specified by the page table, MTRR, and address decoder, and assume that an interface exists between the firmware and the operating system (such as an ACPI Firmware Interface Table or EFI memory descriptor) to facilitate this. Table 7-2 shows the combinations of Region Attributes specified at the address decoder and page table or MTRR attributes that are allowed, and the corresponding CSI transactions generated on accesses to these address regions. Note that the page table and MTRR mechanisms are available only to processor agents; other source agents, e.g. I/O agents, may not contain page tables and need not worry about consistency between region attributes and page table attributes. However, I/O agents must also observe the requirements on the I/O initiated accesses that are allowed on certain regions and indicate if there is a violation (platform specific - either by sending an interrupt or machine check to a processor, or a target abort to the requesting device). Some types of accesses to certain region attributes are not allowed, and such access violations may generate a fault, depending on the platform specific behavior, such as a local machine check on processors or target aborts on I/O initiated accesses. The region attribute and page table attribute combinations that are not allowed are indicated in Table 7-2. Other such cases are listed here:
• Non-coherent memory or memory-mapped I/O regions: Read invalidate with ownership, cache cleanse, or cache line writeback operations may generate a fault. Clean line replacements and cache line flushes must not generate a fault and must complete without generating any CSI transaction. Code reads, data reads, read invalidates, and I/O initiated reads generate an NcRd or NcP2PS transaction.
• Interrupt region: Read accesses to the interrupt delivery region (0xFEE0_0000 to 0xFEEF_FFFF by default) may generate a fault, except for interrupt acknowledge or other special operations if the source address decoder is used to generate these operations. Cache line writeback or cache cleanse operations may generate a fault. Clean line replacements and cache line flushes must not generate a fault and must complete without generating any CSI transaction.
• Configuration or I/O Port region: Cache line writeback or cache cleanse operations may generate a fault. Clean line replacements and cache line flushes must not generate a fault and must complete without generating any CSI transaction.
Read or write accesses larger than 4 bytes, or writes crossing a 4 byte naturally aligned boundary, may generate a fault.

Table 7-2. Allowed Attribute Combinations for Decode Register Entries

  Region Attribute        Page Table, MTRR or I/O Request Attribute (a)       CSI Transaction
  Non-Coherent Memory     WB (b)                                              NcRd; writes may cause a fault
                          WC, UC, WT (c), WP, all I/O initiated accesses      NcRd, NcWr, NcWrPtl
  Coherent Shared         WB, WC, UC, WT, WP, and I/O initiated accesses      RdCode, RdData, RdInvOwn, InvItoE,
  Memory                  that require snoop                                  RdCur, and WbMtoI
                          I/O initiated accesses that do not require snoop    NcRd and NcWr may be used under
                                                                              certain conditions (d)
  LT Configuration        WB, WC, WT, WP                                      Not allowed; accesses may cause a fault
                          UC                                                  NcLTRd, NcLTWr, NcRdPtl, NcWrPtl
  Memory Mapped I/O       WB (b)                                              NcRd; writes may cause a fault
                          WC                                                  WcWr, NcRdPtl
                          UC, WT (c), WP, all I/O initiated accesses          NcRd, NcRdPtl, NcWr, NcWrPtl, NcP2PS, NcP2PB
  I/O Port                UC, processor or I/O initiated I/O port accesses    NcIORd, NcIOWr
                          WB, WC, WT, WP                                      Not allowed; may cause a fault
  Interrupt and           UC, processor or I/O initiated interrupt or         IntLogical, IntPhysical, IntAck,
  Special Operations      special operations                                  IntPrioUpd, NcMsgB, NcMsgS
                          WB, WC, WT, WP                                      Not allowed; may cause a fault
  CSI Configuration       UC                                                  NcRdPtl, NcWrPtl
                          WB, WC, WT, WP, I/O initiated accesses              Not allowed; may cause a fault
  I/O Configuration       UC, processor or I/O initiated configuration        NcCfgRd, NcCfgWr
                          accesses
                          WB, WC, WT, WP                                      Not allowed; may cause a fault

a. Page table attributes and their semantics are defined by the processor architecture. Abbreviations used in this table are as follows - WB: Writeback, WC: Write-coalescing or Write-combining, UC: Uncacheable, WT: Write-through, WP: Write-protected.
b. Read accesses with WB attribute to address regions with non-coherent memory or memory mapped I/O region attributes may not cause an error; however, such accesses will not be kept coherent by the platform. Any modification to an address in these regions may not be visible at a caching agent until all the cache lines at that agent are invalidated (since flush cache line or InvItoE requests cannot be issued, there may not be another way to flush a cache line but to invalidate all the cache lines at the caching agent). Cache line writebacks to non-coherent memory or memory-mapped I/O regions are not allowed and may cause a local machine check.
c. Writes to locations with WT attribute in the processor that are mapped to non-coherent memory or memory mapped I/O regions will not be kept coherent by the platform with respect to caches at the source or other caching agents. A source processor agent must invalidate or update its own cache lines at that address to see the effect of writes.
d. Non-coherent transactions may be used when software takes the responsibility to keep processor caches consistent with I/O initiated accesses (e.g., by classifying corresponding regions as WC or UC) and either there are no other caching agents besides the processor and the initiating agent, or the non-processor caching agents do not prefetch beyond the request horizon and evict updated lines from their caches in the order in which they are updated (to preserve the ordering model for I/O initiated accesses).
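A minimal sketch of a Table 7-2 validity check follows; the enums mirror the table's rows and columns, all names are illustrative, and the rows whose entry reads "Not allowed; may cause a fault" are the ones that return 0.

    enum region { R_NONCOH_MEM, R_COH_SHARED, R_LT_CFG, R_MMIO,
                  R_IO_PORT, R_INT_SPECIAL, R_CSI_CFG, R_IO_CFG };
    enum attr { A_WB, A_WC, A_UC, A_WT, A_WP, A_IO_INITIATED };

    static int combo_allowed(enum region r, enum attr a)
    {
        switch (r) {
        case R_COH_SHARED:          /* all attributes allowed; non-snooped
                                     * I/O accesses fall under note (d)   */
        case R_NONCOH_MEM:
        case R_MMIO:                /* WB allows reads only, note (b)     */
            return 1;
        case R_LT_CFG:
        case R_CSI_CFG:
            return a == A_UC;       /* UC only                            */
        case R_IO_PORT:
        case R_INT_SPECIAL:
        case R_IO_CFG:
            return a == A_UC || a == A_IO_INITIATED;
        }
        return 0;
    }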
7.1.5 Assumptions and Requirements on System Address Map The following assumptions are made and requirements are placed on the mapping of the address space in a system.
• All caching agents in a coherency domain that expect the system to maintain coherency through the hardware based coherency mechanism must set their source address decoders to map a coherent shared memory block at the same physical address to the same target node. This is required since a caching agent responding to a snoop request to the home node of a cache line relies on the source address decoder to determine the home node identifier.
• Target CSI agents must not use the CSI source node identifier to map the same physical address from different sources to different device addresses, since task migration between different processor threads in a partition is allowed.
• If CSI target agents are shared across partitions, then the same physical address from different partitions is mapped to the same target if and only if the partitions intend to have shared access to the address; otherwise they must point to different target nodes.

7.1.6 CSI Addressing Model The addressing models used in Itanium and IA-32 processor family based systems are slightly different from each other. Itanium processor family based systems use only memory mapped address spaces, whereas IA-32 processor based systems use separate memory mapped, configuration (due to the indirect access mechanism through the 0x0CF8 and 0x0CFC I/O port addresses), and I/O port address spaces. This difference is illustrated in Figure 7-2.

Figure 7-2. Itanium® Processor and IA-32 Addressing Models. [Figure: Unified Model (Itanium) - main memory, MMIO, IO config, CSI config, interrupt, IO port, and other regions all within a single memory address space; Explicit Model (IA-32) - separate memory (main memory, MMIO, LT, IO config, CSI config, interrupt), configuration, and IO port address spaces.]

CSI uses the concept of region attributes to distinguish different address regions, and accesses to different regions use distinct request types. However, distinct region types and CSI request types do not necessarily signify different address spaces. In general, all addresses in CSI based systems are memory mapped, except for the I/O port space, which is treated as a distinct address space. Main memory, memory mapped I/O, CSI configuration, interrupt, and any other memory mapped regions are accessed through the memory mapped address space in CSI based systems. Depending on the region attributes for the location being accessed and other attributes of an access, the appropriate CSI request type is used to perform these accesses. The entire address field is valid and needs to be taken into account in processing accesses to any address region, except for the interrupt and special operations region where, depending on the type of operation and the platform implementation, only part of the address field may be relevant and the rest of the address field can be ignored. Either part or all of the I/O configuration region in a system may be in memory mapped space. This region can be located anywhere in the memory mapped space, which is platform implementation dependent. Platforms may also support the I/O configuration region as a separate address space from the memory mapped space, e.g., through an indirect access mechanism such as accesses through the 0xCF8/0xCFC I/O port locations. All accesses to the I/O configuration region, either through memory mapped space or the separate address space, use NcCfgRd and NcCfgWr requests.
On NcCfgRd or NcCfgWr requests generated through the indirect access mechanism, such as the 0x0CF8/0x0CFC I/O port locations, the address field for these requests indicates an address in the memory mapped space. Conversion of such indirect accesses to memory mapped accesses may require an address translation in certain platforms, as described in Section 7.2.1.4. Target agents for NcCfgRd and NcCfgWr requests need to take into account all the bits in the address field when handling these requests.
The I/O Port address space is 64KB (+3B) and is specific to IA-32 systems only. These accesses result in NcIORd and NcIOWr requests on CSI. Only the A[15:0] part of the address field is valid for these requests. Agents that use memory mapped operations to access the I/O port address space must translate the memory mapped address to an I/O Port address before initiating NcIORd and NcIOWr requests (necessary only for Itanium processors). Upper address bits (A[16] and above) must be ignored by the target; the requestor may set these address bits to any arbitrary value.

7.1.7 Addressing Model in a Partitioned System The CSI interface allows various partitioning options in a system. From the perspective of the addressing model, the relevant partitioning types are partitions that do not share any agents (sharing the network fabric does not have any impact on the addressing model) and partitions that share agents between them. CSI has a notion of participant registers that specify the scope of an operation in the system. There may be multiple participant registers to specify the scope of various operations in the system (subset of a partition, full partition, full system). In systems where partitions do not share any agents, address regions (except for the protected configuration region that is used to perform system management through the CSI interface) in different partitions will not interact with each other. In such systems, the address map can be set up to have either non-overlapping or overlapping physical addresses between different partitions. Overlapping physical address spaces between partitions do not create any issue, since different partitions will map the same physical address to different targets (based on the assumption that no agent is shared between partitions) and rely on the participant node information at the target to limit the scope of the operation (such as forwarding snoop probes, etc. within a partition). In systems where some system components are shared between partitions (such as memory and I/O agents), an unintended interaction between partitions can be avoided by using different CSI nodes as target agents for overlapping physical addresses and for other operations (such as IntPhysical, etc.) from different partitions, and by using the participant information at the target to limit the scope of the operation (such as forwarding snoop probes, etc. within a partition). This can also be achieved by allowing multiple logical CSI agents within the shared system component, one for each partition. However, this is not a requirement in such a system. For example, if there is no overlap between the physical address regions of the partitions, or if none of the overlapping regions is mapped to the system components shared between partitions, then there is no ambiguity.

7.1.7.1 Sharing Memory Between Partitions Some platforms using the CSI interface may intend to share a region of memory across multiple partitions.
This can be achieved through appropriate use of the address decoding functionality provided by CSI agents. There are multiple ways to achieve sharing between different partitions. Two possibilities with different trade-offs are described below. The following description assumes that the memory region being shared across partitions can be cached at the caching agents. Note that coherency across partitions is not necessary if the shared address region is not mapped as a coherent region in any partition. Further discussion on shared memory between partitions can be found in the RAS section of this specification.

7.1.7.1.1 Hardware Coherency Across Partitions In this option, all partitions that share the memory region map it to the same physical address and to the same target node. The target node with the shared memory region is aware of the nodes in all partitions that share the region, such that coherency can be enforced across all caching agents. In this scheme, all memory regions (shared or non-shared) mapped to the target node that supports sharing must not be mapped to any other target node in these partitions (since the coherency domain can be specified only on a per target node basis, not on a per address region basis). The advantage of this approach is that coherency on a shared memory region allows for a simpler programming model, and it does not require additional hardware support. The disadvantage is that it compromises on error containment across partitions.

7.1.7.1.2 Software Coherency Across Partitions In this approach, multiple logical CSI nodes are supported by a single component interfacing to memory. Each partition uses distinct logical node identifiers with associated target decoders to access the shared address region (the regions need not be at the same physical address in different partitions). The target nodes for each partition use their target decoders to map these accesses from different partitions to the same device address, thus sharing it at the device level. The coherency domains in this model do not cross partition boundaries; software needs to enforce coherency across multiple partitions. The advantage of this approach is that it provides better error containment across partitions than the option described in the previous section. The disadvantage is a more complex programming model and the hardware complexity associated with supporting multiple logical CSI nodes within a component. This option may also be impractical to implement in systems that support a large number of partitions and expect sharing to occur between several or all partitions.

7.1.7.1.3 Other Operations Across Partitions If the CSI configuration and interrupt delivery regions overlap between multiple partitions, and the source address decoders within each partition are set up to determine the target node identifiers in other partitions, then it is possible to send interrupts and CSI configuration accesses from one partition to another. This may be desirable in systems that primarily rely on the CSI interface to perform system management and configuration functions. Sending interrupts across partitions using IA-32 logical interrupt mode is not supported. This mechanism is also not supported in IA-32 physical interrupt mode with ID=0xFF.
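The following is a small sketch of the software-coherency mapping of Section 7.1.7.1.2 above: one component exposes one logical CSI node per partition, and that node's target decoder maps the partition's (possibly different) physical address for the shared region to the same device address. All node IDs, addresses, and names here are illustrative assumptions.

    #include <stdint.h>

    struct shared_region_map {
        uint16_t logical_nid;     /* logical node the partition targets  */
        uint64_t phys_base;       /* region base address in that partition */
        uint64_t dev_base;        /* common device address on the memory */
    };

    /* Two partitions, two logical nodes, one underlying DRAM region. */
    static const struct shared_region_map maps[] = {
        { .logical_nid = 0x10, .phys_base = 0x8000000000ull,
          .dev_base = 0x100000000ull },
        { .logical_nid = 0x11, .phys_base = 0x4000000000ull,
          .dev_base = 0x100000000ull },
    };

    static uint64_t target_decode(uint16_t nid, uint64_t pa)
    {
        for (unsigned i = 0; i < sizeof(maps) / sizeof(maps[0]); i++)
            if (maps[i].logical_nid == nid)
                return pa - maps[i].phys_base + maps[i].dev_base;
        return ~0ull;             /* no match: access is not performed   */
    }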
7.2 Address Decoder The exact address decoding mechanism is platform and implementation dependent. This section describes a generic CSI address decoding mechanism that may be adapted to a specific platform and implementation. Please refer to the platform and component specifications for the details of the decoder implementation and its programming interface. If the address of an access does not match any entry in the source decoder, then the access is not performed and the transaction is terminated. There may be exceptions to this during initialization, when the address decoders are not yet configured and not used; however, accesses to firmware and local configuration space are still enabled. Handling of exceptions to source and target address decoders is not covered here and will be discussed under the fault handling section.
It is expected that all agents in a partition have consistent entries in their source and target address decoders, depending on the usage model in the system. As mentioned earlier, the consistency of the decoders in the system is the responsibility of the system software; no consistency checking is required or performed by the hardware, except for the detection of certain access violations (if enabled).

7.2.1 Generic Source Address Decoder

7.2.1.1 Source Decoder at a CSI Agent Figure 7-3 "Source Address Decoder at Requesting Agent" on page 7-237 shows the conceptual view of the CSI source address decoder for an agent supporting N bits of physical address, where the value of N is implementation dependent. The address decoder takes the physical address, the type of access (read or write), and the attributes of the access (e.g., page table or MTRR attributes, SMM indication, coherency, etc.) as input, and determines the CSI transaction type and the target node identifier to perform the access. In most cases, the physical address is used as-is in the CSI transaction, but in some cases the input physical address is translated to another address (e.g., during I/O port accesses). The address decoder consists of entries that map address ranges (with optional attributes) to node IDs and an attribute. Each entry is compared in parallel against the request address, and the entry that matches supplies the parameters needed to determine the attribute of that address range and the node ID. In addition to the incoming address, some implementations may pass predetermined attributes (e.g. SMM, code/data, I/O, and other special cycle indicators) which may either predetermine the region attribute, cause specific address ranges to match or not match, and alter the target.

Figure 7-3. Source Address Decoder at Requesting Agent. [Figure: the physical address (bits 0..N-1) is matched in parallel against the address ranges of all entries; the matching entry yields the region number, hit/miss indication, and attribute, selects the interleave identifier and Target List index, and the combined result supplies the destination node ID.]

7.2.1.2 Source Decoder Details for Multiprocessors Source address decoder entries must contain the following fields (a matching sketch follows the list below):
• Valid: Valid or invalid entry. If invalid, it forces an address mismatch.
• Type: Address attribute (illegal, non-coherent, coherent, memory mapped I/O, configuration, I/O port, interrupt, etc.). An illegal type will cause an exception if selected.
• Address Region: The encoding is implementation specific (e.g. base/mask, base/limit, start/end). Entries may have different granularity limits above the minimum. Decoders that use start+end encodings can share the end of one entry with the start of another.
• Interleave (optional): This field indicates which bits of the address are used to indicate interleaving.
Only cache line granularity and maximum granularity are required to be supported (maximum granularity splits the region into power-of-two contiguous sub-regions, using the highest order bits of the region address that can change).
• IDBase: Used to set a base node ID. The width is platform specific.
• Target List: List of node IDs of the target clump. Clumps are a fixed power-of-two number of sockets, with a minimum size of a single processor. The maximum size of a clump is platform specific. The selected target subfield is inserted into the base node ID. The list has the length of the largest possible clump, which is platform specific. The mapping of clump number to target list index is platform specific, but depends only on bits of the address determined by Interleave above. A 1:1 mapping of selected address bits to node ID bits must be possible.
• Offset (optional): This field selects an address subfield which specifies a node within a clump. The selected offset address bits are directly inserted into the base node ID. The subfield position and width can vary by entry, and can range from 0 to max_nodeID_width bits. This node interleave is limited to cache line interleave (if not already used as a Target List index) or maximum granularity within a clump.
• Enable()/Select() (optional): This vector (up to one per Target List) either selects an alternate target based on incoming attribute information (e.g. SMM, or read/write) rather than the one determined through normal address decode, or enables individual sub-ranges, like the Valid bit.
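The sketch below models one source-decoder entry and its match path, following the field list above. The base/limit region encoding is one of the implementation-specific options named in 7.2.1.2; the field widths, array sizes, and helper names are illustrative assumptions, and the parallel comparison of all entries in hardware is collapsed into a single-entry lookup here.

    #include <stdint.h>
    #include <stdbool.h>

    struct sad_entry {
        bool     valid;           /* invalid forces an address mismatch  */
        uint8_t  type;            /* region attribute (coherent, MMIO..) */
        uint64_t base, limit;     /* one possible region encoding        */
        uint8_t  ilv_shift;       /* interleave: which address bits      */
        uint8_t  ilv_mask;        /*   select the Target List index      */
        uint16_t id_base;         /* base node ID (IDBase)               */
        uint16_t target_list[8];  /* node IDs of the target clump        */
    };

    /* Returns the destination node ID, or -1 on an address mismatch. */
    static int sad_lookup(const struct sad_entry *e, uint64_t pa)
    {
        if (!e->valid || pa < e->base || pa > e->limit)
            return -1;                                   /* no match    */

        /* Interleave bits pick the Target List index (mask <= 7 here). */
        unsigned idx = (pa >> e->ilv_shift) & e->ilv_mask;

        /* Selected target subfield is inserted into the base node ID.  */
        return e->id_base | e->target_list[idx];
    }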
7.2.1.3 Decoding of I/O Port Accesses Mapping of I/O port accesses using the memory mapped physical address provided by the processor agents (this is the case only for the Itanium processor family) may require special handling after the address decode is done to determine the region attributes and the target node identifier. The 64MB memory mapped I/O port region in Itanium processor family based systems represents a 64KB I/O port space. The lower 26 bits of the address in the memory mapped region (A[25:0]) are compressed into a 16 bit I/O port address using IOPortAddr[15:0] = A[25:12,1:0]. IOPortAddr[15:0] is carried in the address field of the NcIORd and NcIOWr transactions on the CSI interface. A source agent may not zero out the upper address bits above A[15] in the NcIORd and NcIOWr transactions; the target agents either ignore address bits above A[15] or appropriately translate them in these transactions.

7.2.1.4 I/O Configuration Accesses using 0x0CF8/0x0CFC I/O Port This is done using indirect I/O port accesses at locations 0x0CF8 and 0x0CFC. After it is determined that the access is to an I/O port region and the I/O port address has been determined (for platforms that support memory mapped I/O port accesses, see Section 7.2.1.3), the I/O port address is compared with 0x0CF8 or 0x0CFC to determine whether the access needs to be routed to the local CFG_ADDR or CFG_DATA register on the processor agent. I/O configuration accesses generated through this mechanism can use the same decode mechanism as memory mapped configuration accesses, by mapping the address provided by the 0x0CF8 write to a memory mapped configuration access and by providing the address bits above bit 31 from the CFG_BASE register (needed only if the platform allows the configuration region to be located above 4GB; otherwise the upper address bits can be assumed to be zeros). The CFG_BASE register is needed at each processor agent to map the 32 bit configuration space into the memory map, for proper decoding by the address decoder, such that this space does not overlap with other address regions.
Implementation Note: The address indicated by the data portion of the 0x0CF8 access may need to be adjusted by shifting address bits A[8] and above up by 4 and inserting b0000 at A[11:8]. This is based on the assumption that the indirect access mechanism through 0x0CF8/0x0CFC is used by systems supporting a PCI based I/O subsystem. This address conversion is done to make the resulting NcCfgRd and NcCfgWr accesses conform to the configuration mechanism used by PCI Express based I/O subsystems. The content of CFG_ADDR in this section reflects this modified address, not the original data content of the write to 0x0CF8. Other implementation specific details are:
• Accesses to the 0x0CFC location must be 4 bytes long. Accesses of any other length to this location must not result in NcCfgRd or NcCfgWr.
• Writes to the 0x0CF8 location must update CFG_ADDR only if the access is 4 bytes long and the most significant bit of the data is 1; otherwise the write must result in generation of NcIOWr. Also, note that the configuration space is traditionally a 32b addressing space, due to the size of the indirect access mechanism (a 4 byte long access to 0x0CF8 to specify the address). However, with the memory-mapped addressing model supported by CSI there is no such restriction. Therefore, it is the responsibility of the target I/O agent to ignore (or translate) address bits above A[27] before forwarding a configuration request to a PCI Express I/O device.
The example steps for configuration accesses using the 0x0CF8/0x0CFC I/O port mechanism are as follows (a code sketch follows below):
• Steps for Configuration Write:
  — 4 byte I/O Port write to 0x0CF8 writes to the CFG_ADDR register.
  — 1 to 4 byte I/O Port write to 0x0CFC, which triggers an NcCfgWr transaction using the address from [CFG_BASE:CFG_ADDR] and data from CFG_DATA. Note that an implementation of the CFG_DATA register is not strictly required for this operation. The I/O port write to 0x0CFC does not complete until the NcCfgWr completes.
• Steps for Configuration Read:
  — 4 byte I/O Port write to 0x0CF8 writes to the CFG_ADDR register.
  — 1 to 4 byte I/O Port read from 0x0CFC, which triggers an NcCfgRd transaction using the address from [CFG_BASE:CFG_ADDR]. The I/O port read completes by returning data from the CFG_DATA register as a result of the NcCfgRd data return. Note that an implementation of the CFG_DATA register is not strictly required for this operation.
If the address resulting from [CFG_BASE:CFG_ADDR] is not to an I/O configuration region, then the source agent may either indicate an error, complete writes without any updates and respond with 0xFF to reads to indicate master abort, or generate an NcCfgRd or NcCfgWr transaction targeting one of the I/O agents (which may complete writes without any updates and respond with 0xFF to reads to indicate a master abort). System firmware must set up the CFG_BASE and address decoder entries for the configuration region properly to avoid this condition.
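A rough model of this 0x0CF8/0x0CFC flow at a processor agent follows. CFG_BASE and CFG_ADDR are the registers named in the text; the csi_ncfgrd/csi_ncfgwr and io_port_* helpers are assumed for illustration, and whether the A[8] shift applies is the platform choice described in the Implementation Note above.

    #include <stdint.h>

    extern uint64_t CFG_BASE;       /* supplies address bits above bit 31 */
    static uint64_t CFG_ADDR;       /* latched by writes to 0x0CF8        */

    extern uint32_t csi_ncfgrd(uint64_t addr);
    extern void     csi_ncfgwr(uint64_t addr, uint32_t data, int len);

    static void io_port_write(uint16_t port, uint32_t data, int len)
    {
        if (port == 0x0CF8) {
            /* Must be a 4-byte access with data bit 31 set; anything
             * else results in NcIOWr rather than a CFG_ADDR update.    */
            if (len != 4 || !(data & 0x80000000u))
                return;
            uint32_t a = data & 0x7FFFFFFFu;
            /* Shift A[8] and above up by 4, insert b0000 at A[11:8]
             * (PCI based subsystems, per the Implementation Note).     */
            CFG_ADDR = ((uint64_t)(a & ~0xFFu) << 4) | (a & 0xFFu);
        } else if (port == 0x0CFC) {
            /* Triggers NcCfgWr using [CFG_BASE:CFG_ADDR]; the I/O port
             * write does not complete until the NcCfgWr completes.     */
            csi_ncfgwr(CFG_BASE | CFG_ADDR, data, len);
        }
    }

    static uint32_t io_port_read(uint16_t port)
    {
        if (port == 0x0CFC)         /* triggers NcCfgRd                  */
            return csi_ncfgrd(CFG_BASE | CFG_ADDR);
        return 0xFFFFFFFFu;         /* e.g. master abort                 */
    }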
These transactions may not rely on the address decoding mechanism to determine the target node; they might instead use configuration registers at the processor agents to determine the target. The configuration registers used for different special requests might be different, to allow different targets to be specified. Note: Details of the various registers used for determination of the target for these operations will be provided in subsequent revisions of this specification. This aspect may be implementation dependent.

7.2.2 Target Address Decoder at the Memory Agent
This section describes the target address decoder at a typical memory agent, used to map a physical address to the device address on the memory accessed through the memory agent. As mentioned earlier, the target address decoder is not required at all CSI agents that are targets of CSI transactions. The need for the target decoder depends on the capability of the target agent and the types of CSI transactions serviced by the agent. For example, if the memory controller at a memory agent is capable of handling the complete physical address space, no further mapping of physical to device address may be needed. Also, configuration transactions targeted to a memory agent may not need any remapping through the target decoder.

7.2.2.1 Target Decoder at a CSI Memory Agent
Figure 7-4 shows the conceptual view of the target address decoder at a memory agent.

Figure 7-4. Target Address Decoder at a Memory Agent (stages: match the physical address against the address ranges and select the interleave; look up the interleave identifier and region base address; signal region hit/miss and attribute; remove the interleave and unmasked bits to get the local offset; combine with the base address to form the device address)

7.3 NodeID Assignment and Address Subdivision
Certain implementations may allow distinct (from a routing perspective) CSI protocol agents within a component to share the same CSI node identifiers and use some function of message class encoding and message opcodes to route CSI messages to the appropriate agents. Also, in certain implementations, a single caching domain may be represented by multiple CSI node identifiers with explicit division of address responsibility between them, and it is expected that for a given address only one of the CSI agents can ever initiate any request. This property can be exploited by a platform in subdivision of addresses among memory agents and distribution of memory agent resources to different caching agents. These features are optional for a platform, but use of these features in a system requires that CSI agents meet certain expectations. This section outlines some of these expectations.

7.3.1 NodeID Assignment
Destination NodeIDs can be specialized by attribute, so that some destination receives and handles only limited request types, but the overriding rules are:
• A single physical address as seen by the CSI source address decoder can never (simultaneously) have multiple attributes.
• A single physical address as seen by the CSI source address decoder can never (simultaneously) have more than one home NodeID, so no more than one destination agent will ever receive requests for the same address. Note that it is possible for two different requests (e.g., a read and a write request, or due to changes in the source address decoder) to the same address to be targeted to two different home agent destinations over time. The determination of the destination node of a request is normally a function of the request address only.
There are exceptions to this rule, all of which are for non-coherent requests:
1. Broadcast requests can target multiple nodeIDs simultaneously, all with identical attributes.
2. Address regions that select between two targets depending on whether the request is a read or a write.
3. Address regions that select between two targets depending on the processor mode (SMM), request type (code/data) access, and configuration bits.
Note that internal to a component, a single node ID may be shared among several protocol agents, each specializing in a subset of request types. It is the responsibility of the component to route requests to the correct protocol agent, based on either the message type or the address, or a combination of the two. However, even if a request is routed on the basis of message type, no more than one agent in the component should ever receive messages for the same request, except for broadcast requests. Also note that in systems that support I/O configuration accesses through the 0x0CF8 and 0x0CFC I/O port addresses, the resulting CSI requests may target either an I/O port address or an I/O configuration address. This may seem like two different destinations for the same address (0x0CFC), but in fact these are different destinations for different addresses in different address spaces.

7.3.2 Caching Agent Address Subdivision
It is possible for a single caching domain to be represented by multiple CSI caching agents with distinct CSI NodeIDs. In such implementations, each caching agent can support either the full range of addresses, or the coherent address space can be statically divided between a set of them. When a set of caching agents divide up the address space, there are two ways snoops are handled, as indicated below. The particular method used depends on the platform settings.
• The first method is to send snoops to each caching agent representing a caching domain. Agents that are not responsible for the snooped address must return a RspI response, and the home agent knows to expect responses from every caching agent representing a caching domain.
• The second method sends only a single snoop targeting any one of the caching agents representing a caching domain. Because the CSI Routing layers are agnostic of addresses, this case can occur only when the set of caching agents coexist on the same component. It is then the responsibility of that component to correctly route the snoop to the appropriate caching agent, based on the address. If the component determines that no caching agent under its control can contain the snooped address, it must return a RspI response. This routing of snoop requests to the appropriate caching agents is invisible to the requesting agent. The home agent must know to expect a response from only a single caching agent representing a caching domain.

7.3.3 Home Agent Address Subdivision
A single component can support multiple home agents for coherent memory with distinct NodeIDs. In this case, addresses must be statically subdivided between them in order to maintain coherency. Caching agents must be capable of implementing the home address subdivision functions (because they must route their snoop responses to a subdivided home agent based only on the request address in the snoop message).
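As an illustration, a sketch of a home address subdivision function of the kind required here, using the parity-based two-way mode listed just below in this section. The helper name is ours, not the specification's; both requestors and caching agents would have to evaluate the same function:

```c
#include <stdint.h>

/* Two-way division: NodeID[1] = parity(PhysicalAddress[19,13,10,6]). */
static unsigned nodeid_bit1(uint64_t pa)
{
    return (unsigned)((pa >> 19) ^ (pa >> 13) ^ (pa >> 10) ^ (pa >> 6)) & 1u;
}
```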
If components have both home and caching agents with static address division, then there are performance advantages if the home and caching agent address divisions match, i.e., each home agent receives requests from only the matching subset of caching agents. The home agent can then allocate the same resources to the smaller set of requestors, resulting in more outstanding requests for each caching / home pair. In order to take advantage of this:
• All sets of home and cache agents must have the same address division function, otherwise some home / cache agent pairs will not match.
• All requestors must be capable of implementing the address subdivision function in order to correctly allocate home agent resources, regardless of whether their caching domains support address subdivision or not.
The following address subdivision modes must be supported by CSI requesting agents to take advantage of this:
• Two-way division based on parity(PhysicalAddress[19,13,10,6]), which determines NodeID[1].

7.4 Address Decode Configurations
The address decoder may provide the following configurable options:
• During initial configuration, processor agent generated accesses to a specific address region (for Itanium processors this is 0xFFF0 0000 to 0xFFFF FFFF) are sent to the firmware agent, and accesses to any other region are sent to the local configuration agent. This is done to facilitate access to the firmware and local configuration registers without relying on the address decoder.
• It is assumed that a path to firmware is determined by the hardware during initialization, without relying on the address decoder and routing table functionality, to perform the initial firmware accesses.

7.5 Support for Advanced RAS Features
Please refer to the dynamic reconfiguration section of the specification for address decoder issues related to on-line reconfiguration or updates to address decoders, memory migration and replication, memory mirroring, memory sparing, transparent processor and I/O agent migration, etc.

8 CSI Cache Coherence Protocol
CSI provides a flexible set of interfaces for implementing a diverse range of cache coherent systems on a CSI link-based fabric. This chapter defines the roles of two types of protocol agents, the caching agent and the home agent. The CSI caching agent definition supports write-invalidate protocols with the M-E-S-I states. In addition, the CSI caching agent supports the F state, which is a read-only forwarding state. Different home agent algorithms may create different constraints on the fabric and the home agent microarchitecture. A given implementation of a CSI coherence agent may subset the functionality in a way that affects performance or compatibility with other agents. This document describes both the superset protocol and the permitted subsets. This chapter begins with an overview of the agent types defined by the protocol, and the logical structures within, in Section 8.1. In Section 8.2, we discuss the assumptions made on the CSI Link layer, the messages that are passed between agents, and the information they contain. The caching agent interface is described in Section 8.3. The 2-hop source broadcast coherence algorithms are detailed in Section 8.4. The out-of-order network, home broadcast (directory) algorithms are detailed in Section 8.5.
Figure 8-1. Protocol Architecture (figure: multiple caching agents and home agents interconnected through the CSI fabric)

8.1 Protocol Architecture
The coherence protocol defines the operation of two agent types, the caching agent and the home agent. A caching agent can (a) make read and write requests into coherent memory space, (b) hold cached copies of pieces of the coherent memory space, and (c) supply those cached copies to other caching agents. A home agent guards a piece of the coherent memory space, performing these duties: (a) tracking cache state transitions from caching agents, (b) managing conflicts amongst caching agents, (c) interfacing to the DRAM, and (d) providing data and/or ownership in response to a request when a caching agent does not. Each piece of coherent memory is guarded by exactly one home agent, with potentially several caching agents caching that home agent’s memory. Caching agents can cache memory from multiple different home agents.
The philosophy of CSI is to keep the caching agent simple, instead placing the onus of conflict resolution on the home agent. One reason for this is that the home agent (as the convergence point) has the most natural access to information across caching agents. Another reason is that placing most of the work in the home agent allows us to migrate relatively simple caching agent devices (processors, caching I/O hubs, etc.) into larger system topologies by coupling them to an enhanced home agent.
The data structures within the caching agent and the home agent are abstractly described here to highlight the algorithmic state that must be preserved (in one form or another) in order to perform the protocol duties of each agent. Valid CSI implementations may differ dramatically from these structures.

8.1.1 Caching agent
As shown in Figure 8-2, we group the architected state within the caching agent into two categories: Cache and System Miss Address File (see note 1 below).

Figure 8-2. Caching Agent Architected State (figure: a Cache indexed by Address with State and Data columns, example entry "a2, M, foo"; and a System Miss Address File (SMAF) with entries 0 to M-1 holding Address, Cmd and Conflict State, example entry "a1, RdCode, FALSE, SentReq")

The cache is indexed by address, and records the state of the cache line (M-E-S-I-F) as well as the actual data. The SMAF records state about requests and writebacks outstanding in the CSI fabric. On a cache miss, the caching agent allocates an entry in the SMAF before launching the request onto the fabric. For each outstanding request, there needs to be a unique transaction ID (UTID) of the form [reqNID: homeNID: reqTID], where the reqNID (note 2) is a unique identifier of the requesting caching agent (0 to MaxAgents-1; note 3), the homeNID (note 4) is a unique identifier of the target home NID (from 0 to MaxAgents-1), and the TID (0 to MaxRequests-1; note 5) is a unique identifier of the request from reqNID to a given homeNID. Each caching agent must provide an indexing function from the unique transaction ID into the appropriate SMAF entry, as the UTID is the handle returned in Response messages. Each caching agent must also provide a way to form the full transaction ID from the SMAF entry. Each caching agent must also provide an indexing function from an arbitrary system address into the SMAF entry for an outstanding request to the same address (for conflict detection). There can be at most one outstanding transaction (request or writeback) for each address per caching agent within the coherence domain.
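The UTID and SMAF entry just described, sketched as abstract C structures. Only the field names come from the text; the widths and encodings are invented for illustration:

```c
#include <stdint.h>

/* Unique transaction ID [reqNID : homeNID : reqTID]. */
struct utid {
    uint16_t reqNID;   /* requesting caching agent: 0 .. MaxAgents-1   */
    uint16_t homeNID;  /* target home agent:        0 .. MaxAgents-1   */
    uint16_t reqTID;   /* request identifier:       0 .. MaxRequests-1 */
};

/* One System Miss Address File entry per outstanding transaction. */
struct smaf_entry {
    uint64_t    addr;   /* request address, used for conflict detection */
    uint8_t     cmd;    /* outstanding request or writeback type        */
    uint8_t     cnflt;  /* conflict state observed so far               */
    struct utid id;     /* handle returned in Response messages         */
};
```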
Notes:
1. SMAF is a placeholder name until the CSI group finds a more appropriate label for this collection of state.
2. reqNID is the Requestor NID, a unique identifier of the requesting CSI caching agent.
3. MaxAgents is a profile specific parameter which indicates the maximum number of supported agents.
4. homeNID is the Home Agent NID, a unique identifier of the CSI home agent for a given address.
5. MaxRequests is a profile and configuration specific parameter which indicates the maximum number of outstanding requests that can target a home agent.

For the state contained in the Cache and SMAF, implementations will likely choose representations other than what is implied above. The abstract model above is provided as a conceptual tool to help explain the valid CSI protocol message interleaving.

8.1.2 Home Agent
The protocol architecture of the CSI home agent will vary dramatically depending on the type of home agent algorithm used. There are some common themes, however. For example, each home agent presents an architectural view of the memory which it guards as a flat structure which may be read and written atomically. In addition, the Protocol layer flow control scheme for CSI requires that the home agent be able to sink all control messages without a dependency on the forward progress of any other message. This creates an architectural requirement for state necessary to record arrival of these messages, often referred to as ‘preallocation’ of the resources to record the control messages. A more precise view of the architected state can be found in the home agent algorithm sections (Section 8.4 and Section 8.5).

8.2 Protocol Semantics
The CSI coherence Protocol layer is aware of three protocol classes of messages, oblivious as to how these map to low-level virtual channels. All message dependencies are calculated solely within and across these protocol channels. The three protocol channels are called Snoop, Home, and Response. Snoops may have a dependency on Home messages and Responses. Home messages may have a dependency on Responses. Therefore, the protocol hierarchy is Snoop -> Home -> Response. A more precise description of the protocol dependencies is provided in Section 8.2.2. Therefore, a caching agent may block forward progress on the snoop channel while waiting for credits to send a home or response channel message, without deadlock. Under the source broadcast coherence protocol, the Home channel is required to be kept in-order for control messages to a given address, from a given source caching agent to the destination home agent. A particular design may order across addresses, but other agents must not rely on this ordering. The per-address ordering is expected to be maintained from protocol endpoint to protocol endpoint, which may include routing structures within the protocol engines, as well. The protocol assumes the Link layer will guarantee fairness amongst messages traveling within each channel. The coherent protocol assumes that each caching agent has a mapping from an arbitrary coherent address to the homeNID (home NID). The mapping must be the same for any two caching agents which will share a range of coherent memory.

8.2.1 Coherent Protocol Messages
This section defines the message types used by the coherent protocol, as well as the necessary fields for each message. The primer in Table 8-1 may be useful in interpreting the message names. Each message carries some number of additional fields.
Table 8-1. Message Name Abbreviations
  Rd     Read
  Data   Data
  Inv    Invalidate
  Wr     Write
  Flush  Flush
  C      Coherent
  Fwd    Forward
  Cmp    Completion
  M      Modified state
  to     To
  Gnt    Grant
  E      Exclusive state
  Frc    Force
  Wb     WriteBack
  S      Shared state
  Own    Owner
  Code   Code
  I      Invalid state
  Snp    Snoop
  Cur    Current
  F      Forwarding state
  Rsp    Response
  Cnflt  Conflict
  Ack    Acknowledge
  Ptl    Partial

Table 8-2. Message Field Explanations
  Message Field | Allowed Values       | Explanation
  cmd           | *                    | Command, equivalent to message name
  addr          | CohAddrs (a)         | Coherent cache line addresses
  destNID       | CacheNIDs (b)        | Destination NID, in every message
  destTID       | 0 to (MaxRequests-1) | Destination TID
  reqNID        | CacheNIDs            | Requestor NID
  reqTID        | 0 to (MaxRequests-1) | Requestor TID number
  fromNID       | CacheNIDs            | From NID
  fromTID       | 0 to (MaxRequests-1) | From TID
  homeNID       | HomeNIDs (c)         | Home NID
  data          | DataValues (d)       | A cache line’s worth of data
  mask          | ByteMasks (e)        | A byte mask to qualify a data field

  a. CohAddrs is a profile and configuration specific set of allowable cache line addresses within a system or partition.
  b. CacheNIDs is a profile and configuration specific set of caching agent NIDs within a system.
  c. HomeNIDs is a profile and configuration specific set of home NIDs within a system.
  d. DataValues is the set of allowed data values in the system (generally all possible values of a cache line).
  e. ByteMasks is the set of allowed byte mask values in the system (generally all possible values of the mask vector). The polarity of the byte mask is active high; that is, a logic 1 indicates a valid byte.

8.2.1.1 Snoop Channel Messages
Snoop messages are always directed towards caching agents, though they may be generated by caching agents or home agents. The homeNID is not included in these messages, but is regenerated through the address mapping process for snoop responses as well as other responses.

Table 8-3. Snoop Channel Messages
  Message Name | Function                                                    | Fields
  SnpCode      | Snoop to get data in F/S state                              | cmd, addr, destNID, reqNID, reqTID
  SnpData      | Snoop to get data in EF/S states                            |
  SnpCur       | Snoop to get data in I state                                |
  SnpInvOwn    | Snoop to get data in EM states                              |
  SnpInvItoE   | Snoop to invalidate peer agent, flushing any M state data to home |

8.2.1.2 Home Channel Request Messages
Request messages travel on the per-address ordered home channel and are always generated from a caching agent towards a home agent. The destNID is always the home NID for these messages. When a request is made, the requestor sends snoops to all the peer agents (within its broadcast domain), and sends a request message to the home agent. The request message sent to the home agent implies a snoop of the home agent's cache hierarchy (if it has one). Therefore, a separate snoop message to the home agent’s local caching agent must not be sent. The third column indicates which cache states the request may be issued from. Requests from FEM states are only valid under the IA-32 profiles which include special support for buried accesses.
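The snoop messages of Table 8-3 and the requests of Table 8-4 just below carry the same five fields, so a single C struct can model both. A sketch only; the enum values mirror the message names, and the widths are illustrative:

```c
#include <stdint.h>

enum msg_cmd {
    SnpCode, SnpData, SnpCur, SnpInvOwn, SnpInvItoE,  /* snoop channel */
    RdCode, RdData, RdCur, RdInvOwn, InvItoE          /* home channel  */
};

struct coh_msg {
    enum msg_cmd cmd;   /* command, equivalent to the message name */
    uint64_t addr;      /* coherent cache line address             */
    uint16_t destNID;   /* destination NID, in every message       */
    uint16_t reqNID;    /* requestor NID                           */
    uint16_t reqTID;    /* requestor TID                           */
};
```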
Table 8-4. Home Channel Request Messages
  Message Name | Function                     | May be issued from | Fields
  RdCode       | Request data in F/S states   | MESIF              | cmd, addr, destNID, reqNID, reqTID
  RdData       | Request data in EF/S states  | MESIF              |
  RdCur        | Request data in I state      | I                  |
  RdInvOwn     | Request data in EM states    | MESIF              |
  InvItoE      | Request E state without data | MESIF              |

8.2.1.3 Home Channel Writeback Marker Messages
Writeback marker messages are always generated from a caching agent towards a home agent. The destNID is always the home NID for these messages. Writebacks are initiated with a WbMto* message in the home channel, and the data is sent (asynchronously) via a Wb*Data* message in the response channel.

Table 8-5. Home Channel Writeback Messages
  Message Name | Function                                                 | Fields
  WbMtoI       | Downgrade from M->I, signal an in-flight WbIData message | cmd, addr, destNID, reqNID, reqTID
  WbMtoS       | Downgrade from M->S, signal an in-flight WbSData message |
  WbMtoE       | Downgrade from M->E, signal an in-flight WbEData message | cmd, addr, destNID, reqNID, reqTID

8.2.1.4 Home Channel Snoop Responses
Snoop responses are generated from caching agents towards home agents. The destNID is therefore always equal to the home agent’s ID. RspFwd, RspFwdI, and RspFwdS are all equivalent from the caching agent's perspective. The RspFwdI* and RspFwdS* variants provide additional information to the home agent about the final state of the line at the responder, which may be needed to record in a directory maintained at the home. RspFwd is used for a SnpCur snoop type, which does not change the cache state at the requestor. Every snoop or Fwd* message will cause a snoop response to be generated. A Fwd* message will never cause a RspCnflt, RspCnfltOwn, RspFwdS, or RspFwdSWb. RspCnfltOwn is used when the snoop responder has a conflicting outstanding request and an M-state copy of the line. RspIWb is used in situations where the owner cannot or should not respond directly to the requestor, and instead does a writeback to the home. It is then the home’s responsibility to respond to the requestor on receiving the WbIData. This is used for incoming SnpInvItoE’s hitting an M state line, or any snoop hitting a partially written M state line, and other cases where it is desired to respond to the home first. RspSWb may be used arbitrarily by peer caching agents in response to non-RFO snoops. RspSWb also is used when a SnpCode hits an M state line. Since desktop memory hub-type devices will use SnpCode for I/O DMA reads, we’d like to make sure that a SnpCode will never cause two data messages to be sent to the memory hub. This ensures this property at the modest cost of additional latency on code fetches and I/O DMA reads for hit-M cases. RspSWb can also be used in other cases where it is desired to respond to the home first.
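Returning to the writeback split of Section 8.2.1.3, a minimal sketch of the marker/data pairing. The send primitives and command encodings are assumed, not defined by the specification:

```c
#include <stdint.h>

enum { WbMtoI_cmd = 1, WbIData_cmd = 2 };   /* illustrative encodings */

extern void send_home_channel(int cmd, uint64_t addr);
extern void send_resp_channel(int cmd, uint64_t addr, const void *line);

/* WbMtoI marker on the per-address ordered home channel; WbIData
 * payload asynchronously on the response channel. */
static void writeback_m_to_i(uint64_t addr, const void *line)
{
    send_home_channel(WbMtoI_cmd, addr);         /* marker: ordered     */
    send_resp_channel(WbIData_cmd, addr, line);  /* data: may reorder   */
}
```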
Table 8-6. Home Channel Snoop Responses
  Message Name | Function                                                                              | Fields
  RspI         | Peer is left with line in I state                                                     | cmd, destNID, reqNID, reqTID, fromNID
  RspS         | Peer is left with line in S state                                                     |
  RspCnflt     | Peer is left with line in I state, and the peer has a conflicting request or Wb*     | cmd, destNID, reqNID, reqTID, fromNID, fromTID
  RspCnfltOwn  | Peer has a buried M copy for this line with an outstanding conflicting request or Wb* | cmd, destNID, reqNID, reqTID, fromNID, fromTID
  RspFwd       | Peer has sent the data to the requestor with no change in cache state                | cmd, destNID, reqNID, reqTID, fromNID
  RspFwdI      | Peer has sent the data to the requestor, and is left with line in I state            |
  RspFwdS      | Peer has sent the data to the requestor, and is left with line in S state            |
  RspFwdIWb    | Peer has sent the data to the requestor and a WbIData to the home, and is left with the line in I state |
  RspFwdSWb    | Peer has sent the data to the requestor and a WbSData to the home, and is left with the line in S state |
  RspIWb       | Peer has evicted the data with an in-flight Wb*Data[Ptl] message to the home, and has not sent any message to the requestor |
  RspSWb       | Peer has sent a WbSData message to the home, has not sent any message to the requestor, and is left with the line in S state |

8.2.1.5 Home Channel AckCnflt Message
The AckCnflt message travels on the home channel. The destNID is always equal to the home agent’s ID for this message. The address is required to be placed in this packet. The AckCnflt must be generated when the requestor has received its (DataC_* or GntE) response and (Cmp or FrcAckCnflt) from the home agent, and (1) the requestor has been hit with a conflicting snoop, or (2) the requestor received a FrcAckCnflt from the home agent instead of a Cmp. All writebacks (WbMto*) must send an AckCnflt under similar rules, specifically when the writeback requestor receives its (Cmp or FrcAckCnflt) from the home agent, and (1) the writeback requestor has been hit with a conflicting snoop, or (2) the writeback requestor received a FrcAckCnflt from the home agent instead of a Cmp.

Table 8-7. Home Channel AckCnflt Message
  Message Name | Function                                                                                  | Fields
  AckCnflt     | Acknowledge receipt of DataC_*/GntE and Cmp/FrcAckCnflt, signal a possible conflict scenario | cmd, addr, destNID, reqNID, reqTID

8.2.1.6 Response Channel Data Responses
These messages all carry a cache line of data, aligned on a cache line boundary. The DataC_* messages are always sent to the requesting caching agent, so the requestor NID is the destNID. The Wb*Data[Ptl] messages must also carry the address, so that they can be written out to memory independent of the arrival of the accompanying WbMto* or Rsp*Wb message. The Wb*Data* message is always sent to the home agent, so the destNID is equal to the home agent’s NID. The ‘Ptl’ qualifier on the WbIDataPtl message indicates that this writeback data includes a per-byte mask. The memory controller must ensure that the final state of the line in memory contains the original background data with only the bytes indicated in the Mask field modified with new data in the WbIDataPtl message. DataC_S and DataC_F share an identical opcode. We refer to it as DataC_S when the caching agents do not support the F-state, or the F-state is disabled at the caching agent. The DataC_*_Cmp variants are semantically equivalent to separate DataC_* and Cmp messages, but are combined for performance reasons. The protocol algorithms described in this chapter make no distinction between the separate DataC_* and Cmp messages and the DataC_*_Cmp messages.
There is no DataC_M_Cmp, because the combining of DataC_* and Cmp is only done at the home agent, and data sent from the home agent is always clean with respect to memory. Similarly, the DataC_*_FrcAckCnflt messages are semantically equivalent to the separate DataC_* and FrcAckCnflt messages. There is no DataC_M_FrcAckCnflt, because this message is only sent from the home agent, and data sent from the home agent is always clean with respect to memory. The state information included with the Wb*Data* messages must always match the snoop response on an implicit writeback (i.e., Rsp*IWb <=> WbIData, Rsp*SWb <=> WbSData), and must match the WbMto* message on an explicit writeback (i.e., WbMtoI <=> WbIData, WbMtoS <=> WbSData, WbMtoE <=> WbEData). The state qualifiers on Wb*Data* messages are provided to ease manipulation of directory information at the home agent in directory-based controllers. The WbIDataPtl implies that the final state at the requestor is I, in both the implicit and explicit writeback cases (RspIWb and WbMtoI, respectively).

Table 8-8. Response Channel Data Messages
  Message Name          | Function                             | Fields
  DataC_F/S             | Data in F/S state                    | cmd, destNID, reqTID, homeNID, data
  DataC_I               | Data in I state                      |
  DataC_E               | Data in E state                      |
  DataC_M               | Data in M state                      |
  DataC_F/S_Cmp         | Data in F/S state with a Completion  |
  DataC_I_Cmp           | Data in I state with a Completion    |
  DataC_E_Cmp           | Data in E state with a Completion    |
  DataC_F/S_FrcAckCnflt | Data in F/S state with a FrcAckCnflt |
  DataC_I_FrcAckCnflt   | Data in I state with a FrcAckCnflt   |
  DataC_E_FrcAckCnflt   | Data in E state with a FrcAckCnflt   |
  WbIData               | Writeback data, downgrade to I state | cmd, addr, destNID, reqNID, reqTID, data
  WbSData               | Writeback data, downgrade to S state |
  WbEData               | Writeback data, downgrade to E state | cmd, addr, destNID, reqNID, reqTID, data
  WbIDataPtl            | Partial (byte-masked) writeback data | cmd, addr, destNID, reqNID, reqTID, data, mask

8.2.1.7 Response Channel Grant Messages
The Grant messages are used to grant ownership for a line without sending the data. These messages are always sent to the requesting caching agent. The destNID is therefore always equal to the requesting NID. GntE is always combined with either a Cmp or a FrcAckCnflt.

Table 8-9. Response Channel Grant Messages
  Message Name     | Function                                                     | Fields
  GntE_Cmp         | Grant E state ownership without data, but with a Completion  | cmd, destNID, reqTID, homeNID
  GntE_FrcAckCnflt | Grant E state ownership without data, but with a FrcAckCnflt |

8.2.1.8 Response Channel Completions and Forces
These messages are always sent to the caching agent who is the current owner of the line. On each request, a requestor will receive a response (GntE or DataC_*) and then either a Cmp or a FrcAckCnflt when the home agent gathers all the snoop responses. GntE is always combined with the Cmp/FrcAckCnflt. For both Cmp or FrcAckCnflt, the destNID is the requestor’s NID. The home agent must generate a FrcAckCnflt when it has detected potential conflicts with respect to the current owner, but it may arbitrarily send a FrcAckCnflt, as well. The Cmp_Fwd* messages are tools used by the home agent to extract data and/or ownership from the current owner under conflict cases. There is symmetry between the behavior of Cmp_Fwd* messages and their counterpart snoop messages. For these messages, the destNID, destTID, and homeNID uniquely identify the owner’s request entry, and the reqNID, reqTID, and homeNID uniquely identify the requestor’s request entry, which will be the target of the forward.
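The dual addressing of a Cmp_Fwd* just described, captured as an illustrative struct: destNID/destTID name the owner's request entry (where the message is delivered), while reqNID/reqTID name the requestor's entry (the target of the forward). Field widths are invented:

```c
#include <stdint.h>

struct cmp_fwd {
    int      cmd;      /* Cmp_FwdCode, Cmp_FwdInvOwn or Cmp_FwdInvItoE */
    uint16_t destNID;  /* current owner's NID                          */
    uint16_t destTID;  /* owner's outstanding request TID              */
    uint16_t reqNID;   /* requestor that will receive the forward      */
    uint16_t reqTID;   /* requestor's TID                              */
    uint16_t homeNID;  /* issuing home agent                           */
};
```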
Table 8-10. Response Channel Completions and Forces
  Message Name   | Function                                                                                   | Fields
  Cmp            | All snoop responses gathered, no conflicts                                                 | cmd, destNID, reqTID, homeNID
  FrcAckCnflt    | All snoop responses gathered, force an AckCnflt                                            |
  Cmp_FwdCode    | Request complete, forward the line in F/S state to the requestor specified, invalidate local copy | cmd, destNID, destTID, reqNID, reqTID, homeNID
  Cmp_FwdInvOwn  | Request complete, forward the line in E or M state to the requestor specified             |
  Cmp_FwdInvItoE | Request complete, invalidate local copy                                                    |

8.2.2 Protocol Dependencies
A dependency is any potential component of a deadlock. In a link-based architecture, protocol dependencies must be carefully tracked to avoid deadlock. Often the rules seem strict, but they are necessarily so, as deadlock can be created in subtle ways. All dependencies are equal from an anti-deadlock perspective. However, it is useful analytically to distinguish the different types of dependencies which are allowed and disallowed by CSI. We classify dependencies in three ways: dependencies within a channel (Section 8.2.2.1), dependencies between channels (Section 8.2.2.2), and dependencies on a particular message arrival (Section 8.2.2.3).

8.2.2.1 Protocol Dependencies Within a Protocol Channel
A message X on any protocol channel (Snoop, Response, Home) can have a dependency on any other message Y on the same protocol channel, provided that Y does not also have a dependency on message X. This means that an arbitrary order may be placed on the messages traveling within each channel (provided the point-to-point per-address ordering of the Home channel is not violated). A home or caching agent must be able to process any interleaving of messages on each channel. For example, for a home agent which has two input ports both carrying response messages, the home agent cannot refuse to sink a response message on port 0 because it is waiting for some response message on port 1. Similarly, an agent cannot refuse to sink a message within any protocol channel because it is waiting for Link layer credits to send a message on the same channel. As another example, a caching agent could not use the same buffer for incoming DataC_* messages and outgoing Wb*Data* messages, as that would create a situation where the response channel may not drain. Figure 8-3 visualizes the permitted dependencies within a protocol channel, using the snoop channel as an example. Multiple snoop messages can be arbitrarily reordered, and then placed in an arbitrary but fixed order by the fabric. The protocol endpoint is therefore required to be able to process the incoming snoops in any order. The protocol endpoint (a caching agent in this case) can back pressure the snoop message at the head of the channel for flow control reasons, or while waiting for another message to arrive (or for finite time). This has a ripple effect of pushing this dependence all the way back through the snoop channel, potentially blocking snoops to other protocol endpoints, as well. Care must be taken to prevent a circular dependence from arising due to the blocking conditions. Flow control blocking conditions are discussed in Section 8.2.2, and message dependencies are discussed in Section 8.2.2.3.

Figure 8-3. Dependencies within the snoop channel (figure: outgoing snoops from a protocol endpoint are arbitrarily reordered into a fixed order by the fabric; the receiving protocol endpoint can back pressure incoming snoops for flow control reasons, or while waiting for one or more messages)
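The cross-channel rules that Section 8.2.2.2 below spells out can be summarized as a simple legality check on a proposed "waiter depends on waited-on" edge. A sketch with illustrative encodings:

```c
/* Channel hierarchy from Section 8.2: Snoop -> Home -> Response. */
enum channel { CH_RESPONSE = 0, CH_HOME = 1, CH_SNOOP = 2 };

/* Response may depend on nothing; Home may depend on Response (but not
 * on Snoop); Snoop may depend on both Home and Response. */
static int dependency_allowed(enum channel waiter, enum channel waited_on)
{
    return waited_on < waiter;
}
```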
8.2.2.2 Protocol Dependencies Across Channels
In addition to the rule governing messages within a channel, there is a hierarchy of dependencies established such that there is no circular dependence created across the protocol channels. The response channel has the highest priority. It must be drained independent of forward progress on the snoop or home channels. The home channel has the next highest priority, in that it may have a dependency on the response channel, but not on snoops. Put another way, it would technically be possible to inhibit forward progress on the home channel pending progress on the response channel without a deadlock, but the converse is not true. In practice, it is generally not necessary (or recommended) to create a dependency at all between the home channel and the response channel. The snoop channel has the lowest priority. It may have dependencies on both the home and the response channels (but neither the home nor the response channel may have a dependency on the snoop channel). The canonical example of this is that a caching agent may refuse to process an incoming snoop until it has Link layer credits to send the resulting snoop response on the home channel and [potentially] the data response on the response channel. However, it is not legal (for example) for a caching agent to refuse to accept a data response because it has not finished sending all the snoops for a request. In situations where the protocol indicates that multiple messages are sent in a single logical step (for example, WbIData and WbMtoI), the messages are bound by the rules above. On sending the WbMtoI, the caching agent is committing to sending the WbIData relying only on forward progress on the response channel. The same rule applies for snoop responses (RspIWb and WbIData): the caching agent commits to sending both the RspIWb and WbIData when either is sent, with the only dependence being on the forward progress of the Home and Response channels, respectively.

8.2.2.3 Message Dependencies
Message dependencies cover situations in which a resource is held while waiting for one or more particular messages to arrive, as well as the permitted dependencies at an endpoint in responding to a message with its response. In some situations, the resource that is held implies a blocking condition on one or more of the protocol channels, in such a way that the message dependence interacts with the dependencies described in Section 8.2.2.1 and Section 8.2.2.2. Message dependencies are specific to a phase of a transaction. Here we describe some of the most important permitted message dependencies; an exhaustive reference will be provided in a subsequent revision of this chapter. Please refer to Section 8.3.1 for precise descriptions of the Request phase, Writeback phase, and AckCnflt phase.

Table 8-11. Permitted Message Dependencies in CSI
(Columns: Description; Resources That Can Potentially Be Blocked; Responses That Can Potentially Be Withheld.)
• A caching agent that is in Request phase on a Rd* or InvItoE request, waiting for (DataC_* or GntE) AND (Cmp or FrcAckCnflt).
    Blocked: Transaction ID. Withheld: AckCnflt.
• A caching agent that is in Writeback phase for a WbMto*, waiting for a Cmp or FrcAckCnflt.
    Blocked: Transaction ID. Withheld: AckCnflt.
• A caching agent that is in AckCnflt phase for a Rd*, InvItoE, or WbMto* (i.e., waiting for a Cmp/Cmp_Fwd*).
    Blocked: Transaction ID; Snoop channel (due to receiving conflicting snoops). Withheld: Rsp* (snoop response to a Cmp_Fwd*), DataC_* and/or Wb*Data[Ptl] (caused by the Cmp_Fwd*).
• A caching agent will not return snoop responses until it receives the snoop for the message (this should be obvious).
    Blocked: –. Withheld: Rsp* (snoop response), DataC_* and/or Wb*Data[Ptl] (caused by the snoop).
• A caching agent that is *not* in AckCnflt phase on a request which conflicts with the incoming snoop, and has received a snoop, must reply with a snoop response without any further message dependencies.
    Blocked: –. Withheld: –.
• A home agent will not return the Cmp or FrcAckCnflt until it receives all snoop responses for a request (Rd* or InvItoE), as well as the request message. The response is also potentially dependent on receiving a Wb*Data[Ptl] message in an implicit writeback case. The response may also be arbitrarily dependent on receipt of any messages traveling on the home, response, or snoop channels.
    Blocked: –. Withheld: Cmp or FrcAckCnflt.
• A home agent will not return the Cmp or FrcAckCnflt until it receives the WbMto* message, as well as the Wb*Data[Ptl] message, for a WbMto*. The response may also be arbitrarily dependent on receipt of any messages traveling on the home, response, or snoop channels.
    Blocked: –. Withheld: Cmp or FrcAckCnflt.
• A home agent will not return the DataC_* or GntE for a request until it receives all the snoop responses for the request (Rd* or InvItoE), as well as the request message. The DataC_* or GntE may also be dependent on receiving a particular Wb*Data[Ptl] message in an implicit writeback case. The response may also be arbitrarily dependent on receipt of any messages traveling on the home, response, or snoop channels.
    Blocked: –. Withheld: DataC_* or GntE.
• A home agent will not return a Cmp_Fwd* or Cmp until the AckCnflt arrives from the owner. The response may also be dependent on receiving the home channel request message for the target of the Cmp_Fwd*. The response may also be arbitrarily dependent on receipt of any messages traveling on the home or response channel. There must not be a dependence on the snoop channel, or on snoop responses (since they are dependent on the snoop channel).
    Blocked: –. Withheld: Cmp_Fwd* or Cmp.

8.2.2.4 Protocol Requirements on Fairness
We use the term fairness in this document in the forward progress (anti-starvation) sense, which does not imply uniform frequency of selection. It is the responsibility of the protocol endpoints (home and caching agents) to provide fairness across protocol channels, though in practice this is not difficult, as there is a natural back pressure through the dependence hierarchy. For example, if a home agent always favors responses over home channel messages, then eventually the caching agents will run out of transaction IDs to send new requests and writebacks, and there will only be home channel messages left to drain. Such a strategy may be sufficient for protocol correctness, but would likely be limited by other requirements (timeouts) motivated by error isolation constraints.

8.2.2.5 Link Layer Requirements on Dependencies and Fairness
The Link layer is functionally agnostic of the protocol channel dependence hierarchy, though it may be aware of the dependence hierarchy for performance reasons. The Link layer must not create any dependencies across protocol channels, and it must not create circular dependencies within a protocol channel.
The Link layer must provide fairness (in the forward progress sense) amongst messages within a protocol channel (for example, in a switch network, the switch is expected to eventually route from all inputs).

8.3 Caching Agent Interface
The CSI coherence protocol is designed to place the bulk of the algorithmic decisions within the home agent, creating a fairly simple set of rules for the caching agent. The algorithmic behavior of the caching agent is consistent from small to large systems. The home agent may or may not implement partial or full directories in order to improve scalability in large systems. In general, the existence of a directory at the home agent is invisible to the caching agents in the system.

8.3.1 Transaction Phases
There are three phases that a request (Rd*, InvItoE) may be in for a given address:
• Null Phase: No outstanding request.
• Request Phase: Starts when the Rd* or InvItoE is sent; terminates on receipt of DataC_*/GntE AND Cmp/FrcAckCnflt.
• AckCnflt Phase: Must occur at the termination of the Request Phase if and only if a RspCnflt* has been sent during the Request phase (in response to a conflicting snoop) OR a FrcAckCnflt was received (instead of a Cmp). Starts with the sending of an AckCnflt message; terminates on receipt of a Cmp/Cmp_Fwd* message.
Similarly for writebacks (WbMto*), there are three phases for a given address:
• Null Phase: No outstanding request.
• Writeback Phase: Starts when the WbMto* is sent; terminates on receipt of the Cmp/FrcAckCnflt.
• AckCnflt Phase: Must occur at the termination of the Writeback Phase if and only if a RspCnflt* has been sent during the Writeback phase (in response to a conflicting snoop) OR a FrcAckCnflt was received (instead of a Cmp). Starts with the sending of an AckCnflt message; terminates on receipt of a Cmp/Cmp_Fwd* message.
The intention is that AckCnflt Phases are only required in the presence of address conflicts.

8.3.2 Coherence Domain
CSI permits 2-hop source broadcast protocols to be constructed by coupling CSI caching agents together with one or more home agents which implement the source broadcast coherence algorithm (as described in Section 8.4). These flows rely on an ordered home channel (Section 8.2). The source broadcast protocol also creates a requirement that each caching agent must be able to fan out snoops to all caching agents within its coherence domain. Each caching agent must be configured with a list of peer agents that it is responsible for snooping (PeerAgents). Every caching agent’s PeerAgents value is different, as each caching agent does not consider itself a ‘Peer’. The convention that we adopt is that PeerAgents[X] indicates the value of the PeerAgents list at caching agent X. Each home agent must have a count of the number of agents within a caching agent’s PeerAgents (the count of agents within every caching agent’s PeerAgents list must be configured consistently within a hard partition), so that the home agent knows how many snoop responses to wait for. When a Rd* or InvItoE request is generated at a caching agent, the requestor is committing to sending the appropriate Snp* to each agent listed in that caching agent’s PeerAgents list, except that the caching agent must not send a snoop to a caching agent that shares a CSI NID with the home agent for this address. Therefore, the home NID must be subtracted from the PeerAgents list before the snoop fanout.
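A minimal sketch of this snoop fanout rule, folding in the home-NID subtraction just described (the local caching agent at the home is snooped implicitly, as explained next). The send primitive and types are assumed, not part of the specification:

```c
#include <stddef.h>
#include <stdint.h>

extern void send_snoop(uint16_t destNID, uint64_t addr);

/* Snoop every agent in PeerAgents[X], skipping any caching agent that
 * shares a CSI NID with the home agent for this address. */
static void snoop_fanout(const uint16_t *peer_agents, size_t n_peers,
                         uint16_t home_nid, uint64_t addr)
{
    for (size_t i = 0; i < n_peers; i++)
        if (peer_agents[i] != home_nid)   /* subtract the home NID */
            send_snoop(peer_agents[i], addr);
}
```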
If the home agent has a local caching agent, then a snoop of that caching agent is implied by the Rd* or InvItoE request as it arrives. Under a pure directory protocol, the caching agent need not track which caching agents are in the coherence domain; this becomes the responsibility of the home agent. Hybrid protocols are possible under CSI by configuring the caching and home agents such that they are responsible for snooping mutually exclusive sets of caching agents (presumably using directories for the caching agents which are snooped under home agent control).

8.3.3 Cache States
CSI implements the M-E-S-I-F states, which are defined in Table 8-12. Clean/Dirty indicates whether the value of the data in the cache matches the data stored in memory. ‘May Forward?’ indicates whether this cache state can give the response to at least some snoop types. M and E state can forward DataC_[ME] to an ownership snoop (SnpInvOwn) or DataC_[F/SI] to non-ownership snoops (SnpData, SnpCode, SnpCur).

Table 8-12. Cache States
  State          | Clean/Dirty | May Write? | May Forward? | May Silently Transition to | Explanation
  M - Modified   | Dirty       | Yes        | Yes          | –    | Must writeback on a replacement or when forwarding DataC_E, DataC_F/S, or GntE
  E - Exclusive  | Clean       | Yes        | Yes          | MSIF | Must transition to M state on a write
  S - Shared     | Clean       | No         | No           | I    | Must invalidate and send RdInvOwn or InvItoE on a write
  I - Invalid    | –           | No         | No           | –    |
  F - Forwarding | Clean       | No         | Yes          | SI   | Must invalidate and send RdInvOwn or InvItoE on a write

The minimum required cache states an implementation must support depend on the requests that the caching agent is able to generate, assuming that the device actually wants to cache the line (and not just use a use-once policy). Table 8-13 gives background on the relation between the request types and the required cache states.

Table 8-13. Required Cache States for Request Types
  Request Types                            | Required States | Explanation
  Only RdCur                               | I               |
  RdCur, RdCode                            | IS or IF        |
  RdCur, InvItoE                           | IE or IM        | Must support M state to actually write the data
  RdCur, InvItoE, RdInvOwn                 | IM              |
  RdCur, InvItoE, RdInvOwn, RdData, RdCode | IMS or IMF      |

8.3.4 Peer Caching Agent Responses to an Incoming Snoop During the Null Phase
A caching agent will respond to a snoop in different ways depending on whether the agent has an outstanding conflicting request or writeback in Request or AckCnflt Phase, what state the cache line is in, and what the snoop type is. Table 8-14 shows the cache state transitions in response to incoming snoops when the peer agent does not have a conflicting outstanding request (i.e., Null Phase). Additional permutations are possible considering the silent cache state transitions which are permitted (Table 8-12). For example, an E or F state line may silently transition to I state on every incoming snoop, which has the effect of only sending data when it hits in M state. The ‘Partial Data’ column indicates whether the data held by the cache hierarchy is incomplete. If an incoming snoop hits a partially written M-state line, then the owner must reply with a RspIWb + WbIDataPtl for any snoop type. For each snoop type, the peer caching agent can respond with RspIWb + WbIData on a hit in M state. For SnpCode, SnpData, and SnpCur, the peer may also reply with RspSWb + WbSData. This flexibility is to enable simpler caching agent microarchitectures. Whether or not this happens is nondeterministic from the home agent’s perspective.
If the incoming snoop hits a partially stored E or F line, then a silent transition to a non-forwarding state should occur, with a RspI or RspS snoop response.

Table 8-14. A Peer Caching Agent’s Response to an Incoming Snoop
  Snoop Type | Peer Cache State | Partial Data | New Peer Cache State | Response to Requestor | Response to Home
  SnpData    | M     | No  | I | DataC_E   | RspFwdIWb + WbIData
  SnpData    | M     | No  | S | DataC_F/S | RspFwdSWb + WbSData
  SnpData    | M     | No  | I | –         | RspIWb + WbIData
  SnpData    | M     | No  | S | –         | RspSWb + WbSData
  SnpData    | M     | Yes | I | –         | RspIWb + WbIDataPtl
  SnpData    | E     | No  | S | DataC_F/S | RspFwdS
  SnpData    | S     | X   | S | –         | RspS
  SnpData    | I (a) | X   | I | –         | RspI
  SnpData    | F     | No  | S | DataC_F   | RspFwdS
  SnpInvOwn  | M     | No  | I | DataC_M   | RspFwdI
  SnpInvOwn  | M     | No  | I | –         | RspIWb + WbIData
  SnpInvOwn  | M     | Yes | I | –         | RspIWb + WbIDataPtl
  SnpInvOwn  | E     | No  | I | DataC_E   | RspFwdI
  SnpInvOwn  | S     | X   | I | –         | RspI
  SnpInvOwn  | I (a) | X   | I | –         | RspI
  SnpInvOwn  | F     | X   | I | –         | RspI
  SnpCode    | M     | No  | S | DataC_F/S | RspFwdSWb + WbSData
  SnpCode    | M     | No  | S | –         | RspSWb + WbSData
  SnpCode    | M     | No  | I | –         | RspIWb + WbIData
  SnpCode    | M     | Yes | I | –         | RspIWb + WbIDataPtl
  SnpCode    | E     | No  | S | DataC_F/S | RspFwdS
  SnpCode    | S     | X   | S | –         | RspS
  SnpCode    | I (a) | X   | I | –         | RspI
  SnpCode    | F     | No  | S | DataC_F   | RspFwdS
  SnpInvItoE | M     | No  | I | –         | RspIWb + WbIData
  SnpInvItoE | M     | Yes | I | –         | RspIWb + WbIDataPtl
  SnpInvItoE | E     | X   | I | –         | RspI
  SnpInvItoE | S     | X   | I | –         | RspI
  SnpInvItoE | I (a) | X   | I | –         | RspI
  SnpInvItoE | F     | X   | I | –         | RspI
  SnpCur     | M     | No  | M | DataC_I   | RspFwd
  SnpCur     | M     | No  | I | –         | RspIWb + WbIData
  SnpCur     | M     | No  | S | –         | RspSWb + WbSData
  SnpCur     | M     | Yes | I | –         | RspIWb + WbIDataPtl
  SnpCur     | E     | No  | E | DataC_I   | RspFwd
  SnpCur     | S     | X   | S | –         | RspS
  SnpCur     | I (a) | X   | I | –         | RspI
  SnpCur     | F     | No  | F | DataC_I   | RspFwd

  a. If the peer’s cache line is in state E, but has no data (which can result from an InvItoE request), then it is treated in the same way as when the cache state is I.

8.3.5 Peer Caching Agent’s Response to a Conflicting Snoop During the Request and Writeback Phases
Table 8-15 indicates the peer agent’s response to an incoming snoop when the peer has a conflicting outstanding transaction in Request phase or a writeback in Writeback phase. For requests (Rd*), these transitions are only valid until the DataC_* arrives. Once the DataC_* arrives, and until the Cmp/FrcAckCnflt arrives, the only valid response to home is RspCnflt. During this interval, the cache state is unaffected by incoming conflicting snoops.

Table 8-15. Peer Caching Agent’s Response to a Conflicting Incoming Snoop During Request Phase, Before the DataC_*/GntE Response
  Snoop Type | Peer Cache State | New Peer Cache State | Response to Home
  Snp*       | M        | M     | RspCnfltOwn
  SnpData    | EFSI (a) | S (b) | RspCnflt
  SnpInvOwn  | EFSI     | I     | RspCnflt
  SnpCode    | EFSI     | S     | RspCnflt
  SnpInvItoE | EFSI     | I     | RspCnflt
  SnpCur     | EFSI     | S     | RspCnflt

  a. If the peer’s cache line is in state E but has no data (which can result from an InvItoE request), then it is treated in the same way as when the cache state is I.
  b. S state is the minimal required transition; a transition to I state is also permitted.

8.3.6 Peer Caching Agent’s Response to a Conflicting Incoming Snoop During the AckCnflt Phase
An incoming snoop which finds that the peer agent has a conflicting outstanding request or writeback which is in AckCnflt phase must be blocked or buffered by the peer caching agent. The logical view is that the snoop is not processed until the peer agent has received and processed the Cmp or Cmp_Fwd* that will terminate the AckCnflt phase for the conflicting transaction.
The difference between buffering and blocking is a matter of the dependencies created on the snoop channel. Blocking implies that during the AckCnflt window, the peer caching agent may stall snoops to unrelated (non-conflicting) addresses, in addition to any conflicting snoops. Buffering is a superset of blocking, in that it allows some or all unrelated snoops to continue to make forward progress. Blocking is the minimum required by CSI; the degree of additional buffering provided is a performance or quality-of-service enhancement. When the AckCnflt phase ends, any buffered or blocked snoops are replayed, generating normal snoop responses, including implicit forwards.

8.3.7 Responding to Cmp_Fwd* or Cmp to End the AckCnflt Phase
The home agent may respond to an AckCnflt with a Cmp, at which point the transaction is complete. The Cmp_Fwd* is provided as a mechanism to allow the home agent to extract data and ownership from the owner (presumably to provide to a new requestor) without relying on forward progress on the snoop channel (which may be blocked during the AckCnflt phase) under conflict cases. In general, the type of Cmp_Fwd* corresponds to the request type of the conflicting requestor, though this is not a rule, and all CSI caching agents must be able to accept all Cmp_Fwd* types. Table 8-16 shows the owner state transitions and message responses on receipt of a Cmp_Fwd* variant. Cmp_Fwd* messages are processed like snoops, including the generation of snoop responses. If an allowed silent state transition has occurred at the owner, it may reply with RspI to an incoming Cmp_Fwd* message, which will cause the home agent to supply the line from memory. Just as with snoops, for each Cmp_Fwd* type the owner must reply with a RspIWb + WbIDataPtl if there is a hit M on a line for which the owner does not have all the bytes of the line (i.e., as the result of a partial write). The Cmp_Fwd* message technically belongs to the caching agent that is receiving the forwarded data, in that the snoop response will list as its transaction ID the reqNID and reqTID for the requestor that is the target of the forwarded data. Unlike snoops, however, all Cmp_Fwd* types must invalidate the cache line at the owner.

Table 8-16. Cmp_Fwd* State Transitions
  Cmp_Fwd* Type  | Owner’s Cache State | Partial Data | Owner’s Next Cache State | Sent to Requestor | Sent to Home Agent
  Cmp_FwdCode    | M              | No  | I | –         | RspIWb + WbIData
  Cmp_FwdCode    | M              | Yes | I | –         | RspIWb + WbIDataPtl
  Cmp_FwdCode    | E, F           | No  | I | DataC_F/S | RspFwdI
  Cmp_FwdCode    | S (a), I (b)   | X   | I | –         | RspI
  Cmp_FwdInvOwn  | M              | No  | I | DataC_M   | RspFwdI
  Cmp_FwdInvOwn  | M              | No  | I | –         | RspIWb + WbIData
  Cmp_FwdInvOwn  | M              | Yes | I | –         | RspIWb + WbIDataPtl
  Cmp_FwdInvOwn  | E              | No  | I | DataC_E   | RspFwdI
  Cmp_FwdInvOwn  | F, S (a), I (b)| X   | I | –         | RspI
  Cmp_FwdInvItoE | M              | No  | I | –         | RspIWb + WbIData
  Cmp_FwdInvItoE | M              | Yes | I | –         | RspIWb + WbIDataPtl
  Cmp_FwdInvItoE | E, F, S, I     | X   | I | –         | RspI

  a. The owner’s cache state can be in S because of a silent downgrade from E or F (see Table 8-12). The owner’s cache state can be in F due to a silent downgrade from E.
  b. If the owner’s cache line is in state E but has no data (which can result from an InvItoE request), then it is treated in the same way as when the cache state is I.

After processing a Cmp_Fwd*, the AckCnflt phase for the target’s outstanding transaction is completed, and the caching agent may deallocate the transaction ID for this request. Note: The above options never permit the owner to keep an S copy.
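A sketch of the owner's Table 8-16 decision for the Cmp_FwdCode case, under assumed state and response encodings; it is illustrative only, but it follows the rows above, including the rule that every Cmp_Fwd* variant invalidates the line at the owner:

```c
#include <stddef.h>

enum cstate { ST_M, ST_E, ST_F, ST_S, ST_I };

struct fwd_rsp {
    const char *to_requestor;  /* NULL when nothing is forwarded */
    const char *to_home;
};

static struct fwd_rsp cmp_fwd_code(enum cstate s, int partial_data,
                                   enum cstate *next)
{
    *next = ST_I;  /* all Cmp_Fwd* types invalidate the owner's copy */
    if (s == ST_M && partial_data)
        return (struct fwd_rsp){ NULL, "RspIWb + WbIDataPtl" };
    if (s == ST_M)
        return (struct fwd_rsp){ NULL, "RspIWb + WbIData" };
    if (s == ST_E || s == ST_F)
        return (struct fwd_rsp){ "DataC_F/S", "RspFwdI" };
    return (struct fwd_rsp){ NULL, "RspI" };  /* S or I (silent downgrade) */
}
```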
8.4 Source Broadcast Home Agent Algorithm
This section describes an option for a CSI home agent algorithm which implements 2-hop source broadcast coherence. The additional constraints that this algorithm places on the base caching agent behavior are:
• Each caching agent must implement snoop fanout as described in Section 8.3.2 (PeerAgents[X] lists).
• Each caching agent must keep per-address ordering on the home channel.
The additional constraint that this algorithm places on the fabric is:
• The interconnection fabric must maintain per-address ordering on the home channel.
The focus of this section is to describe the concepts that the algorithm relies on, and the algorithm itself, qualitatively. A concise description of one instance of this algorithm is provided in Appendix F, “An Implementation Agnostic Model of CSI 2-Hop Source Broadcast Coherence”.

8.4.1 Home agent architected state
The Protocol layer flow control scheme for CSI requires that the home agent be able to sink all control messages without a dependency on the forward progress of any other message. This creates an architectural requirement for state necessary to record arrival of these messages. The manifestation of this architectural requirement is a collection of state (a structure) referred to as the Tracker. The Tracker (pictured in Figure 8-4) exists within each home agent, and contains one entry for each possible simultaneous outstanding request (across all caching agents) to that home agent. Therefore there is one Tracker entry (in some home agent) for each valid UTID in the system. Each entry must hold the address of the request, the Cmd of the request, as well as some degree of dynamic state related to the request. The state required to track conflicts (Conflict Info) is proportional to the number of snoops which may conflict with each request, and therefore may vary dramatically under various system configurations.

Figure 8-4. Home Agent Architected State (figure: a Tracker with entries 0 to MxN-1, each recording Address, Cmd, State, and Conflict Info; example entry: a2, RdCode)

8.4.2 Interpreting Protocol Flow Diagrams
Please refer to the following legend to aid in interpreting the protocol flow diagrams in the following sections.

Figure 8-5. Protocol Flow Legend (symbols for: allocate requestor entry or home agent tracking entry; deallocate requestor entry or home agent tracking entry; ordered Home channel message; unordered Probe or Resp channel message; A, B, C = requestor (caching) agents; H = home agent; MC = memory controller)

8.4.3 Protocol Flows Illuminated
The examples in this section assume that PeerAgents[A] = {B,C}, PeerAgents[B] = {A,C}, and PeerAgents[C] = {A,B}. The value of ParticipantAgents is not visible to the caching agent, though we’ll assume that it is either null or that the caching agents it references never participate in the sharing here. Figure 8-6 illustrates the normal flow for a request to a line that is uncached in any of the peer agents. Agent C generates broadcast SnpData snoops to both of its PeerAgents, which are A and B, and the RdData request to the home agent. Both A and B are in I state in this case, so they respond with RspI snoop responses. The home agent gathers the snoop responses and delivers the data from memory using a combined DataC_E_Cmp response. The dashed lines call attention to those messages which travel on the point-to-point per-address ordered Home channel.
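The Tracker of Figure 8-4, sketched as an abstract C structure before continuing with the flow examples. The widths, the conflict-info representation, and the bound on conflicts are invented for illustration; the real sizing is configuration dependent:

```c
#include <stdint.h>

#define MAX_CONFLICTS 8  /* assumed bound; varies with configuration */

/* One entry per possible outstanding request: MaxAgents x MaxRequests
 * entries per home agent, indexed by UTID as tracker[reqNID][reqTID]. */
struct tracker_entry {
    uint64_t addr;                  /* address of the request            */
    uint8_t  cmd;                   /* Cmd of the request                */
    uint8_t  state;                 /* dynamic state for the request     */
    uint16_t cnflt[MAX_CONFLICTS];  /* Conflict Info: conflicting snoops */
    uint8_t  n_cnflt;               /* number of conflicts recorded      */
};
```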
In later examples, we’ll show how the ordered properties of the home channel are used to resolve conflicts. The forward-striped rectangle highlights what we refer to as the Request Phase from agent C’s perspective. This is the time from when it allocates a transaction ID and sends out the snoops and the request message until it receives both the response to its request (DataC_* or GntE) and the completion (which in this case are sent simultaneously).

Figure 8-6. Uncached RdData Request [protocol flow diagram]

The next example (Figure 8-7) illustrates a cache-to-cache transfer. Here, caching agent A issues a request and snoops agents B and C. Caching agent C has the line in M state, so it forwards the data immediately to agent A. At the point where caching agent A receives the DataC_M, it knows that there are no other cached copies in the system (since E and M states are exclusive). Therefore, A achieves global observation at the time it receives the DataC_M. In the CSI protocol, global observation is always achieved at the time the DataC_* (or GntE) is received at the requestor. In this example, the DataC_M arrives before the Cmp from the home agent. The Cmp is sent when the home agent has gathered all the snoop responses. The Request Phase does not end until the requestor has received both the DataC_M and the Cmp. If there are no conflicts, then the requestor may deallocate the transaction ID at the end of the Request Phase.

Figure 8-7. Cached RdInvOwn Request [protocol flow diagram]

Writebacks are generated with a WbMto* message in the home channel, and a Wb*Data* message in the response channel, as shown in Figure 8-8. Similar to requests, writebacks must allocate a transaction ID.

Figure 8-8. Standard Writeback Flow [protocol flow diagram]

8.4.3.1 Caching Agent Algorithm, Conflict Flows

Conflicts are situations in which caching agents have overlapping requests to the same address. More precisely, true conflicts (the ones we care about) are situations in which the owner(1) of the line has already processed snoops to the same address before completing its outstanding transaction. This creates situations where it becomes the home agent’s responsibility to extract the line from the owner and deliver it to the conflictors. The bulk of the protocol algorithms are motivated by conflict cases. Conflict resolution is mostly home-agent centric in CSI, and thus will be described in detail in Section 8.4.5, “Capturing Ordering”. However, there are primitives provided in the caching agent to handle conflict cases, which are described below.

(1) The owner is defined to be the caching agent which has forwarding privileges for a given line. During a conflict chain, the current owner is the agent that has most recently sent an AckCnflt.

8.4.3.1.1 Responding to Conflicting Snoops During the Request Phase

During the request phase, an incoming snoop whose address conflicts with an outstanding request should cause a RspCnflt or RspCnfltOwn snoop response. A RspCnflt must be generated if the snoop arrives during the Request phase of any request, or during the Writeback phase of a writeback.
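This rule reduces to a small predicate. The sketch below uses invented names, and it also anticipates the RspCnfltOwn case that the paragraph after the sketch defines (an outstanding conflicting request plus buried M-state data).

    /* Sketch (assumed names) of the conflict-related snoop responses in
     * Section 8.4.3.1.1. */
    #include <stdbool.h>

    typedef enum { RSP_NORMAL, RSP_CNFLT, RSP_CNFLT_OWN } snoop_rsp_t;

    typedef struct {
        bool in_request_phase;    /* outstanding Rd* or InvItoE in Request phase */
        bool in_writeback_phase;  /* outstanding WbMto* in Writeback phase */
        bool buried_m;            /* M-state data buried in the local hierarchy */
        bool conflict_seen;       /* sticky: a RspCnflt* has been generated */
    } xact_state_t;

    snoop_rsp_t on_address_conflicting_snoop(xact_state_t *x)
    {
        if (!x->in_request_phase && !x->in_writeback_phase)
            return RSP_NORMAL;        /* no conflict: normal snoop processing */
        x->conflict_seen = true;      /* remembered; drives a later AckCnflt */
        if (x->in_request_phase && x->buried_m)
            return RSP_CNFLT_OWN;     /* conflicting request AND buried M data */
        return RSP_CNFLT;
    }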
A RspCnfltOwn is used in cases where the agent being snooped has an outstanding conflicting request AND also contains the M-state data ‘buried’ within its memory hierarchy (this case arises in microarchitectures in which prefetches do not first check the local hierarchy). A RspCnflt* cannot be generated unless there was actually a conflict with an outstanding request. The incomplete conflict case shown in Figure 8-9 illustrates when a RspCnflt* is sent. As can be seen, caching agent B sends a RspCnflt in response to C’s incoming snoop because the snoop arrives during B’s Request phase. However, C does not send a RspCnflt in response to B’s incoming snoop because that snoop arrives before C’s Request phase begins.

Figure 8-9. Generating a RspCnflt on a Conflicting Incoming Snoop [protocol flow diagram]

On sending a RspCnflt*, the caching agent must record state indicating that it has observed a conflict. It is not necessary to record how many conflicting snoops have been seen, or from which caching agents. This state will be used to generate an AckCnflt.

8.4.3.1.2 Sending an AckCnflt

AckCnflt’s are used to give the home agent the opportunity to extract ownership from a requestor at the end of any request. Since AckCnflt’s are on the home channel, they have the effect of pushing in any outstanding RspCnflt* snoop responses, which guarantees that the home agent will always have a complete view of conflicts with respect to the owner at AckCnflt time. At this time, the home agent can make an authoritative decision about whether it needs to extract the line.

For requests (Rd*, InvItoE), an AckCnflt must be sent at the end of the Request phase from the requesting caching agent when (1) a conflicting incoming snoop has been seen (and a RspCnflt* generated) during the Request phase, as described in Section 8.4.3.1.1, or (2) when the home agent sends a FrcAckCnflt instead of a Cmp to the requesting caching agent. The Request phase terminates only when the response to the request (DataC_* or Gnt*) AND the completion (Cmp or FrcAckCnflt) have been received; therefore the AckCnflt is never sent until both have arrived (of course, the two can be combined when they both come from the home agent, for example as DataC_F_Cmp or GntE_FrcAckCnflt).

For writebacks (WbMto*), an AckCnflt must be sent at the end of the Writeback phase from the writeback requestor when (1) a conflicting incoming snoop has been seen (and a RspCnflt* generated) during the Writeback phase, or (2) when the home agent sends a FrcAckCnflt instead of a Cmp to the writeback requestor. The Writeback phase does not terminate until the Cmp or FrcAckCnflt has been received, which implies that the AckCnflt message is never sent until the Cmp or FrcAckCnflt has been sunk.

In Figure 8-10, we continue the conflict case started in Figure 8-9, showing B sending an AckCnflt once it receives the DataC_E_Cmp. This case is still incomplete: the home agent has the responsibility, on receiving an AckCnflt, to send either a Cmp_Fwd* or Cmp to B, which will end the AckCnflt phase (this case is continued below).
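A sketch of these AckCnflt rules for the request case, with assumed names; the writeback case is identical except that only a Cmp or FrcAckCnflt ends the Writeback phase.

    /* Sketch of Section 8.4.3.1.2 for Rd* and InvItoE requests (names assumed). */
    #include <stdbool.h>

    typedef struct {
        bool data_or_gnt_rcvd;    /* DataC_* or Gnt* received */
        bool cmp_rcvd;            /* Cmp received */
        bool frc_ack_cnflt_rcvd;  /* FrcAckCnflt received instead of a Cmp */
        bool conflict_seen;       /* a RspCnflt* was generated this phase */
    } req_phase_t;

    bool request_phase_done(const req_phase_t *r)
    {
        /* Both the response and the completion must have arrived. */
        return r->data_or_gnt_rcvd && (r->cmp_rcvd || r->frc_ack_cnflt_rcvd);
    }

    bool must_send_ack_cnflt(const req_phase_t *r)
    {
        return request_phase_done(r) &&
               (r->conflict_seen || r->frc_ack_cnflt_rcvd);
    }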
Figure 8-10. Sending an AckCnflt Due to a Conflicting Snoop [protocol flow diagram]

The other situation in which an AckCnflt must be sent is in response to a FrcAckCnflt from the home agent, which can be sent instead of the Cmp message. The FrcAckCnflt flow can be used to resolve conflicts that were not visible to the caching agent. For example, in Figure 8-11, agent C is not hit with a conflicting snoop during its Request phase. However, this is a conflict situation, as agent C has already processed the conflicting snoop from B, yet B has not yet received the line. Therefore, the home agent uses the FrcAckCnflt to require an AckCnflt be sent, which gives the home agent the intervention it needs to later solve this race case.

Figure 8-11. Conflict Case Requiring FrcAckCnflt Flow [protocol flow diagram]

The AckCnflt phase is the time from when the AckCnflt is sent until the home agent replies with a Cmp or Cmp_Fwd*. The AckCnflt uses the transaction ID of the original request; for example, in Figure 8-11, the AckCnflt continues to use the transaction ID of C’s RdData request. This prevents C from issuing another request using the same reqTID value, which allows the same Tracker entry to be used in the home agent.

8.4.3.1.3 Buffering or Blocking Incoming Snoops During the AckCnflt Phase

AckCnflt’s are used to serialize conflict cases to simplify resolution. In order to guarantee that the home agent has complete information about conflicts at AckCnflt time, we must guarantee that no new conflicts are generated from the time the AckCnflt is sent until the home agent responds with a Cmp or Cmp_Fwd*. We accomplish this by requiring that the caching agent buffer or block incoming conflicting snoops during the AckCnflt phase.

8.4.3.1.4 Responding to Cmp_Fwd* or Cmp to End the AckCnflt Phase

On receiving an AckCnflt, the home agent will look for true conflictors in the Tracker. If there are no remaining true conflictors, then the home agent will send the current owner a Cmp. Receiving the Cmp ends the AckCnflt phase and allows the owner to deallocate the transaction ID associated with this request. In the case where the home agent detects a queued true conflictor (or it otherwise needs to extract the line), it will send a Cmp_Fwd* to the owner. The Cmp_Fwd* type sent depends on the conflictor’s request type. The mapping of request type to Cmp_Fwd* type is intuitive, with the exceptions that we map RdData requests into the Cmp_FwdCode forward type, and we map RdCur requests into the Cmp_FwdInvItoE forward type (with the home forwarding the DataC_I on receipt of the implicit writeback from the owner). Refer to Section 8.3.7 for a complete description of how the caching agent must respond to Cmp_Fwd* messages. After processing a Cmp_Fwd*, the AckCnflt phase for the target’s outstanding transaction is completed, and the caching agent may deallocate the transaction ID for this request. The above options do not allow the owner to keep an S copy. This is an idiosyncrasy of the current home agent algorithm.
Qualitatively, the difficulty lies in identifying whether a downstream conflictor is an ownership request. To finish the conflict case started in Figure 8-9 and Figure 8-10, Figure 8-12 shows how the home agent will generate a Cmp_Fwd* response on receiving an AckCnflt. Caching agent B sends the DataC_F directly to the requestor and a RspFwdI to the home agent, similar to normal snoop processing.

Figure 8-12. Conflict Case Continued from Figure 8-9 and Figure 8-10 [protocol flow diagram]

8.4.3.1.5 Conflicts During the Writeback Phase

The WbMto* transaction is very similar to a normal request. A caching agent which has an outstanding WbMto* will generate a RspCnflt if a conflicting snoop arrives during the Writeback phase. The caching agent will send an AckCnflt based on nearly identical rules as for normal requests: an AckCnflt must be sent on arrival of the Cmp or FrcAckCnflt if (1) a conflicting snoop has been seen during the Writeback phase (and therefore a RspCnflt was sent) or (2) a FrcAckCnflt was received from the home instead of a Cmp. Like normal requests, during the AckCnflt phase of a WbMto*, incoming conflicting snoops are stalled. The home will send either a Cmp or a Cmp_Fwd* to extract data and ownership from the WbMto* requestor. A standard WbMto* conflict case is shown in Figure 8-13.

Figure 8-13. WbMtoE Conflict [protocol flow diagram]

A WbMtoI and a WbMtoS are handled in the same way as a WbMtoE (as shown in Figure 8-13). The difference is that a WbMtoI or WbMtoS will always respond with RspI on any Cmp_Fwd*. A home agent may choose to retain state about the type of writeback and only reply with a Cmp to a WbMtoI or WbMtoS, as shown in Figure 8-14.

Figure 8-14. WbMtoI Conflict [protocol flow diagram]

8.4.3.1.6 Buried Hit M State Flows

Some caching agent designs naturally permit an outstanding request to a cache line for which the requestor has M-state data buried within their memory hierarchy. CSI supports these caching agents through the following contract (a sketch follows the list):

• A caching agent may issue a request for a line which it contains (in M, E, S, or F state). A RdCur must only be issued from I state.
• An incoming conflicting snoop must invalidate an E, S, or F state copy and reply with a RspCnflt snoop response.
• An incoming conflicting snoop must detect a buried M copy and signal the existence of the M-state data through a RspCnfltOwn snoop response.
• The home will reply to the buried M’s (Rd*) request with stale or undefined data. It is the requesting caching agent’s responsibility to disregard the stale data.
• The home will reply to the buried M’s (InvItoE) request with a GntE (as normal).
• The home will guarantee (through detection of the RspCnfltOwn snoop response(s)) that the buried-M-state agent will be the first agent in the subsequent conflict chain, such that coherence is not violated.
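As promised above, a minimal sketch of the caching-agent half of this contract on an incoming conflicting snoop; all identifiers are invented for illustration.

    /* Sketch of the buried-hit-M snoop rules in Section 8.4.3.1.6. */
    #include <stdbool.h>

    typedef enum { C_M, C_E, C_S, C_F, C_I } cstate_t;
    typedef enum { R_CNFLT, R_CNFLT_OWN } conflict_rsp_t;

    /* Incoming conflicting snoop against a line this requestor also caches. */
    conflict_rsp_t on_buried_conflicting_snoop(cstate_t *line)
    {
        if (*line == C_M) {
            /* Keep the buried M data in place and signal its existence; the
             * home will make this agent first in the conflict chain. */
            return R_CNFLT_OWN;
        }
        /* E, S, or F copies must be invalidated before replying RspCnflt.
         * The home may later supply stale data for this agent's own Rd*
         * request, which the agent must disregard. */
        *line = C_I;
        return R_CNFLT;
    }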
Figure 8-15 shows an example of this flow, where agent B generates a request for a line which it currently has in M state.

Figure 8-15. Buried HITM Flow [protocol flow diagram]

8.4.4 Protocol Invariants

All messages from a given source caching agent to a given home agent on the Home channel are ordered per-address. As a result of this ordering, we can construct true statements (invariants) which give us powerful primitives with which to identify and solve protocol races. Here we will go over several of the most important invariants CSI uses, all derived directly from the Home channel ordering. Some useful definitions:

Table 8-17. Useful Definitions

Implicit Forward: When a snoop hits on a cached copy and the owner cedes forwarding privileges to the requestor or the home, typified by a Rsp*Wb or RspFwd* snoop response.

Explicit Forward: When it becomes the home agent’s responsibility to send a Cmp_Fwd* to the current owner to extract the line (and/or ownership) and deliver it to the requesting caching agent or to the home agent.

True Conflictor: A label applied to a requestor relative to the current owner in the system. The peer agent is a true conflictor if the current owner processed the peer agent's snoop before the current owner became the owner (i.e., while its request was outstanding). A peer agent may be a true conflictor with respect to one owner in the system but not a true conflictor with respect to another agent in the system.

False Conflictor: A requestor whose probe has not yet been processed by the current owner, which generally makes it the opposite of a True Conflictor.

Owner: The agent in the system that currently has forwarding privileges for a given line. During a conflict chain, the current owner is the agent that has most recently sent an AckCnflt.

8.4.4.1 Implicit Forward Invariant

“If there is an implicit forward in the system, then the RspFwd*, RspIWb, or RspSWb snoop response is guaranteed to arrive at the home agent before any other request to the same address receives its request and snoop responses at the home agent.”

This is a powerful invariant which allows us to easily spot implicit forwards at the home agent, and order them ahead of other conflicting requests.

8.4.4.2 The Explicit Writeback Invariant

“If there is a line eviction at an agent, the WbMto* for this line is guaranteed to arrive at the home agent before any subsequent request to the same address receives its request and snoop responses at the home agent.”

Similar to the Implicit Forward invariant (Section 8.4.4.1), this invariant is used to make sure that a request will always return the latest copy of the line, specifically during eviction scenarios.

8.4.4.3 Conflict Invariant #1

“If two requests are true conflictors with each other, then at least one of the conflictors will receive a RspCnflt* at the home agent.
The other requestor is guaranteed to have seen a conflicting probe while it had an outstanding request (as it is the one that generated the RspCnflt* snoop response).”

This property ensures that we can track conflicts at the home agent using RspCnflt*’s and have absolute knowledge about conflicts.

8.4.4.4 Conflict Invariant #2

“If a requestor is a true conflictor with respect to the current owner, then the home agent will receive the RspCnflt* for the conflicting request before receiving the AckCnflt from the new owner.”

This ordering property guarantees that the home agent can always observe true conflicts at AckCnflt time.

8.4.4.5 Request Time Invariant

“Receipt of a RspCnflt* at the home agent indicates that the conflicting request has already arrived.”

This simple property allows the home agent to determine the ‘age’ of a request. When combined with the rules for generating AckCnflt’s discussed in Section 8.4.3.1.2, this invariant is extended to say:

“Receipt of a RspCnflt* at the home agent indicates that the conflicting request has already arrived, and the Tracker entry is still active (i.e., there is an AckCnflt in-flight).”

This helps to define the rules around when we can consider the state associated with a Tracker entry to be valid, and when this state is no longer needed for a given transaction.

8.4.4.6 RspCnfltOwn Invariant

“If there is a buried M-state copy of data within the network, and the owner currently has an outstanding request for that same address, then it is guaranteed that either (a) the owner will receive all of his snoop responses and request message before any conflictors, or (b) a conflicting requestor will receive a RspCnfltOwn snoop response from the owner.”

This property is used to resolve buried hit M cases, specifically in order to guarantee that the owner will be the agent which becomes the first agent in the conflict chain, such that we do not break coherence.

8.4.5 Capturing Ordering

Section 8.4.4 described the critical ordering properties on the home channel. How this ordering is captured is, of course, implementation dependent. Here we describe how the ordering is captured in the context of the abstract microarchitecture described in Section 8.4.1. The home agent architecturally maintains a 'Tracker' structure, which is indexed by the unique transaction ID (reqNID:homeNID:reqTID)(1). As messages arrive, they are logged in this structure, potentially triggering other actions. A request cannot be handled (i.e., a response or Cmp message sent) until it has received all of its snoop responses and its request message. It is also necessary to record a subset of the Home channel ordering for things like RspFwd* and WbMto*. We call a request 'Ordered' at the point that we determine that it should be ordered in front of all later requests to the same address. These tests are described below. Based on the Implicit Forward and Explicit Writeback invariants, we apply the following rules:

• As soon as a request receives a RspFwd* or Rsp*Wb, that request should be ordered in front of all subsequent requests to the same address.
  — If the RspFwd* or Rsp*Wb message does not carry the address, the home agent must guarantee that it is ordered in front of requests to all addresses (the request message is not guaranteed to arrive before the RspFwd*).
  — A Rsp*Wb blocks progress on subsequent conflicting requests until the accompanying Wb*Data* has arrived and been committed to memory.

(1) The homeNID falls out of this equation, as it is the same for every transaction which arrives at a given home agent.
  — The RspFwd* or Rsp*Wb blocks progress on subsequent requests until the request has received all of its other snoop responses and its request message, and a Cmp has been sent out to the requestor.

Figure 8-16. RspFwd Ordering Required [protocol flow diagram]

In the case in Figure 8-16, caching agent B receives its request and all its snoop responses at the home agent before caching agent C does. However, C’s request should be ordered in front of B’s request, since C has received the latest data on a cache-to-cache transfer from A. Though we are guaranteed that B and C will eventually observe conflicts with one another (Section 8.4.4.3), we have no guarantee that the conflict will arrive before B receives all of its snoop responses (and in this case it does not). The only guarantee we have here is that the RspFwd* for C’s request will arrive before B’s request to the same address can receive all of its snoop responses (Section 8.4.4.1). Therefore, it is necessary to record some global state (across requests) for a given address such that subsequent requests to that address are impeded until the RspFwd* request is completed. Furthermore, the RspFwd* may arrive before the request that generated it, which means the full address may not be available to do a strict per-address ordering. However, the global state can be recorded at an arbitrarily coarse granularity with respect to address, which will impede progress on requests to different addresses (this is not a common case).

• When a WbMto* is received, the WbMto* should become ordered in front of all subsequent requests to the same address.
  — The WbMto* blocks progress on subsequent requests until it has received an indication from the memory controller that the Wb*Data* message has been received and written out, and a Cmp has been sent to the victimizer.

Figure 8-17. [protocol flow diagram: WbMtoI writeback concurrent with a conflicting RdData]

In Figure 8-17, we see a case in which an eviction happens at the same time that another caching agent requests the writeback data. Section 8.4.4.2 provides the guarantee that the WbMtoI for B’s writeback will arrive before C’s request can receive its RdData and all of its snoop responses at the home. Like the implicit forward case, the WbMtoI requires recording some global state (across requests) to impede progress on other requests to the same address until the Wb*Data* has been received and written out to memory.

• All other requests become ordered in some arbitrary (but fair) order.
  — Making other requests ordered in the order of arrival of their final snoop response is trivially fair, but any fair order will do, provided that the ordering is not set in a way that violates the ordering for implicit forwards and explicit writebacks.
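To summarize Section 8.4.5, the ordering rules above might be captured with a per-address barrier record along the following lines. All names are invented, and a real implementation may track this state at a much coarser granularity, as the text notes.

    /* Sketch of the Section 8.4.5 ordering capture (all names assumed). */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        bool     active;         /* an ordered request owns this record */
        bool     all_addresses;  /* RspFwd* arrived without an address */
        uint64_t addr;           /* may be a coarse bucket, not an exact match */
        bool     wb_committed;   /* Wb*Data* received and written to memory */
        bool     req_complete;   /* ordered request finished (Cmp sent) */
    } order_rec_t;

    /* A RspFwd*, Rsp*Wb, or WbMto* orders its request in front. */
    void on_ordering_event(order_rec_t *o, bool have_addr, uint64_t addr,
                           bool needs_wb)
    {
        o->active        = true;
        o->all_addresses = !have_addr;
        o->addr          = addr;
        o->wb_committed  = !needs_wb;   /* Rsp*Wb and WbMto* await Wb*Data* */
        o->req_complete  = false;
    }

    /* May a later request to 'addr' proceed past this record? */
    bool may_proceed(const order_rec_t *o, uint64_t addr)
    {
        if (!o->active || (!o->all_addresses && o->addr != addr))
            return true;                /* no barrier, or unrelated address */
        return o->req_complete && o->wb_committed;
    }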
8.4.6 Managing Conflict Lists

The home agent algorithm tracks conflictors explicitly using RspCnflt* snoop responses and AckCnflt’s. Qualitatively, the home agent algorithm works by matching up AckCnflt’s to RspCnflt*’s: a RspCnflt* signals a conflict, and an AckCnflt signals the resolution of a conflict.

8.4.6.1 Looking Closer at True Conflicts

A true conflict occurs when a snoop is processed early by an agent that eventually becomes the owner. The number of possible conflictors in the system with respect to a given caching agent is limited by the number of snoops that can be in-flight at a given time to that address. In a source broadcast system, the number of possible conflictors is equal to the PeerAgents parameter (generally all caching agents within the partition minus the requestor itself). In a UP system, there may only be one conflictor with respect to the processor (the inbound PCI stream filtered through the I/O Hub). Similarly, in a scalable system, there may be only one true conflictor, as the home agent will likely serialize requests from many caching agents for a given address (and the PeerAgents parameter will be null at the caching agents). In hybrid systems, PeerAgents may be set to encompass the caching agents within a local clump, while the home agent ParticipantAgents parameter may be set on a per-caching-agent basis such that each clump is managed via directories from the other clump’s perspective.

8.4.6.2 Matching AckCnflt’s to RspCnflt*’s

There are probably many ways to do this matching, but at least conceptually we can describe the matching process by introducing the architectural notion of a conflict list. The conflict list is maintained on a per-request basis, meaning that there is a conflict list per Tracker entry. The conflict list is a list of Transaction IDs (UTIDs) that are active conflictors with respect to a request. Transaction IDs are added to the list by RspCnflt*’s, which are mirrored to both participants in the conflict. For example, if a request from caching agent A receives a RspCnflt from caching agent B, then we add agent B's conflicting UTID to agent A's request's conflict list, and we add agent A's UTID to agent B's request's conflict list. An implementation may choose to record less information in the conflict list, such as just recording the conflicting agent IDs, which will save size in the Tracker but introduce the need to perform associative searches on the Tracker under [presumably rare] conflict cases.

An AckCnflt from the owner subtracts the owner's UTID from every true conflictor's conflict list. Remember that the notion of a 'true' conflictor is only with respect to the current owner. The incoming AckCnflt can determine the set of true conflictors by looking at the conflict list in the owner's Tracker entry: for each true conflictor referenced in the owner's conflict list, the owner's transaction ID should be removed from that conflictor's conflict list. Out of all the true conflictors identified when the AckCnflt initiates the purge of conflicts with the owner, the home agent must (fairly) choose one of them to become the next owner. One simple and trivially fair way to choose is to make the first one which receives all of its snoop responses and request message the next owner. Another method is to select the true conflictor whose NID is the next highest above the current owner’s agent NID. The size of the conflict list is directly proportional to the maximum number of true conflictors under the home agent in question.
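A conceptual sketch of this bookkeeping, using the simple list-of-UTIDs representation (the text permits more compact encodings); all names and the lookup callback are invented.

    /* Sketch of conflict-list upkeep per Section 8.4.6.2. */
    #include <stdint.h>

    #define MAX_CONFLICTORS 8               /* assumed bound */

    typedef struct {
        uint32_t utid[MAX_CONFLICTORS];
        int      count;
    } conflict_list_t;

    static void list_add(conflict_list_t *l, uint32_t utid)
    {
        if (l->count < MAX_CONFLICTORS)
            l->utid[l->count++] = utid;
    }

    /* RspCnflt from B against A's request: mirror the conflict to both. */
    void on_rsp_cnflt(conflict_list_t *a, uint32_t a_utid,
                      conflict_list_t *b, uint32_t b_utid)
    {
        list_add(a, b_utid);
        list_add(b, a_utid);
    }

    /* AckCnflt from the owner: subtract the owner's UTID from every true
     * conflictor named in the owner's own conflict list. */
    void on_owner_ack_cnflt(const conflict_list_t *owner, uint32_t owner_utid,
                            conflict_list_t *(*lookup)(uint32_t utid))
    {
        for (int i = 0; i < owner->count; i++) {
            conflict_list_t *c = lookup(owner->utid[i]);
            for (int j = 0; j < c->count; j++) {
                if (c->utid[j] == owner_utid) {
                    c->utid[j] = c->utid[--c->count];   /* swap-remove */
                    break;
                }
            }
        }
    }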
In a scalable configuration with snoops generated from the home agent, there may be only one outstanding snoop to a given address, in which case the conflict list only needs to be of size one, and selecting the next owner is trivial. In the general case, caching agents such as processors and I/O Hubs will place some upper bound on the PeerAgents parameter, though it is required that this value be configurable down to null (in which case no agent will be snooped on a request). A given home agent may optionally implement the conflict lists (and associated logic) to enable source snooping on various system sizes within this range, or restrict it completely.

8.4.6.3 Sending FrcAckCnflt’s

FrcAckCnflt’s are provided as a tool for the home agent to signal a potential conflict situation to the caching agent owner. FrcAckCnflt’s may be sent pessimistically even when there is no conflict situation. FrcAckCnflt’s must be sent whenever a request has a non-empty conflict list (conflicting UTID’s that have not been cleared by matching AckCnflt’s). This is required in cases such as in Figure 8-18, where the conflicting snoop is processed at C before C’s request is generated. Here, the home agent knows there is a conflict (because C’s request has received a RspCnflt snoop response from B), but the caching agent C has not been hit by a snoop during its request phase. This case is resolved by the home agent issuing the DataC_E_FrcAckCnflt (instead of a DataC_E_Cmp) to C, at which point C sends an AckCnflt. From this point on, it is the normal conflict resolution flow, in that the AckCnflt will cause the home agent to choose a conflictor (in this case B) to be the next owner, and to send a Cmp_FwdInvOwn to C on B’s behalf. It is also interesting to note that the home agent does not send a FrcAckCnflt to B in this case, because by this time the AckCnflt from C has removed B’s only conflictor from its conflict list. B still generates an AckCnflt, however, because it was hit by a conflicting snoop during its request phase.

Figure 8-18. Case Requiring a FrcAckCnflt to Resolve [protocol flow diagram]

8.4.6.4 Request Ready Decision Point, Null Conflict List

If a request has received all of its snoop responses and its request message, and it has a null conflict list, then the home agent follows these rules:

• If the request has received an implicit forward:
  — If the implicit forward included an implicit writeback, then wait for the Wb*Data* message.
  — If the implicit forward did not include an implicit writeback, or the Wb*Data* has already arrived, then send a Cmp to the requestor.
• If the request did not receive an implicit forward:
  — Use Table 8-18 to determine the response to send to the requestor:
Table 8-18. Home Agent Responses, No Implicit Forward, Null Conflict List

Request type   Has received RspS?   Send to Requestor
RdData         Yes                  DataC_F_Cmp
RdData         No                   DataC_E_Cmp
RdInvOwn       X                    DataC_E_Cmp
RdCode         X                    DataC_F_Cmp
InvItoE        X                    GntE_Cmp
RdCur          X                    DataC_I_Cmp

8.4.6.5 Request Ready Decision Point, Non-Null Conflict List

If a request has received all of its snoop responses and its request message, and it has a non-null conflict list, then the home agent follows these rules:

• If the request has received an implicit forward:
  — If the implicit forward included an implicit writeback, then wait for the Wb*Data* message.
  — If the implicit forward did not include an implicit writeback, or the Wb*Data* has already arrived, then send a FrcAckCnflt to the requestor.
• If the request did not receive an implicit forward:
  — Look in the Tracker entry referenced by each transaction ID in the requestor’s conflict list: if any entry has been completed (the Cmp or FrcAckCnflt has been sent), then we must block on this request until the AckCnflt arrives. We are guaranteed that the AckCnflt is in-flight (Section 8.4.6.2).
  — If this requestor has received a RspCnfltOwn, then wait for the RspCnfltOwn to be cleared (by the AckCnflt from the buried-hit-M owner) before making forward progress on this line. Furthermore, this request should not introduce any blocking or ordering condition which will prevent forward progress on the owner’s request.
  — If there is no Tracker entry referenced in the requestor’s conflict list which has been completed, then there is no in-flight response or AckCnflt, and therefore we can supply the line to this requestor. However, we must still send a FrcAckCnflt due to the non-null conflict list. The following table indicates the permitted responses:

Table 8-19. Home Agent Responses, No Implicit Forward, Non-Null Conflict List

Request type   Has received RspCnflt for this request?(a)   Send to Requestor
RdData         Yes                                          DataC_F_FrcAckCnflt(b)
RdData         No                                           DataC_E_FrcAckCnflt
RdData         No                                           DataC_F_FrcAckCnflt
RdInvOwn       X                                            DataC_E_FrcAckCnflt
RdCode         X                                            DataC_F_FrcAckCnflt
InvItoE        X                                            GntE_FrcAckCnflt
RdCur          X                                            DataC_I_FrcAckCnflt

a. A RspCnflt which is an expected snoop response for the request, i.e., one in which reqNID:reqTID matches the reqNID:reqTID of the request.
b. A RspCnflt does not imply that all S copies have been invalidated for a RdData request (Section 8.3.5). Therefore, when a RspCnflt has been received for this request, a DataC_F_FrcAckCnflt must be sent (instead of a DataC_E_FrcAckCnflt). Note also that a DataC_F_FrcAckCnflt may always be sent (under a non-null conflict list).
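Tables 8-18 and 8-19 differ only in the completion (Cmp vs. FrcAckCnflt) and in the RdData row. A combined sketch, with invented names; the completion choice is handled by the caller based on the conflict list.

    /* Sketch combining Tables 8-18 and 8-19: pick the data/grant portion of
     * the response for a request that has all snoop responses and its
     * request message. The completion is Cmp for a null conflict list and
     * FrcAckCnflt otherwise (Sections 8.4.6.4 and 8.4.6.5). */
    #include <stdbool.h>

    typedef enum { RD_DATA, RD_INVOWN, RD_CODE, INV_ITOE, RD_CUR } req_t;

    const char *response_data(req_t req, bool rsp_s_seen, bool rsp_cnflt_seen)
    {
        switch (req) {
        case RD_DATA:
            /* RspS forces F; a RspCnflt also forbids E, since it does not
             * imply all S copies were invalidated (Table 8-19, note b). */
            return (rsp_s_seen || rsp_cnflt_seen) ? "DataC_F" : "DataC_E";
        case RD_INVOWN: return "DataC_E";
        case RD_CODE:   return "DataC_F";
        case INV_ITOE:  return "GntE";
        case RD_CUR:    return "DataC_I";
        }
        return "?";
    }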
8.4.6.6 AckCnflt Arrives Decision Point

On arrival of an AckCnflt, we examine each transaction ID in the conflict list associated with the AckCnflt (referenced by the reqNID:reqTID in the AckCnflt message). For each transaction ID referenced in the conflict list, we look in that request’s Tracker entry. If that requestor has received its snoop response from the current owner, we clear the reference to the owner in the requestor's conflict list. By checking whether each requestor has received its snoop responses from the current owner, we are determining which are true conflictors. The home agent must pick one of the true conflictors to be the next owner. The processing of an AckCnflt (and the sending of the Cmp/Cmp_Fwd*) may not depend on forward progress of the snoop channel or on arrival of one or more snoop responses. See Section 8.2.2 for a more detailed description of the permitted dependencies. The processing of the AckCnflt may wait for the request message for the chosen owner (though it is not required to). On picking the next owner, the home agent sends a Cmp_Fwd* message to the current owner, which will cause the owner to send the DataC_* to the selected conflictor. The precise Cmp_Fwd* message that is sent depends on the request type (see Table 8-20). Note that it is possible for the home to send a Cmp_FwdInvItoE for any request type, or if the request type is unknown.

It is also possible that the search of the Tracker yields no true conflictors, in which case the home agent sends a Cmp to the owner. If the home agent knows that the AckCnflt owner does not have a copy of the line (i.e., because it recorded state from the Request phase of the owner’s request and therefore knows that the previous request was a RdCur, WbMtoI, or WbMtoS), then it may send a normal Cmp to the owner. If the home agent does not have this knowledge, then it must send the appropriate Cmp_Fwd* to the owner.

If the home agent sends a Cmp_Fwd* message to the owner, then it continues to block on that address while waiting for the snoop response. If the snoop response received is a RspI, then the home agent must fetch the data from memory and deliver it (in the correct state, see Table 8-19) to the requestor. If the snoop response is a Rsp*Wb, then the home agent must wait for the writeback data (Wb*Data) and logically write it to memory (such that it is visible to a later read) before sending the Cmp to the new owner, just as in the standard implicit forward flow.

Within a conflict chain, if the next owner chosen is a RdCur conflictor, then the home must send a Cmp_FwdInvItoE to the current owner. This will cause the current owner to invalidate (and potentially write back) his copy of the line, and it is then the home’s responsibility to send the DataC_I to the RdCur requestor. In cases where the Cmp_FwdInvItoE is sent before the RdCur request has all of its snoop responses, it is possible that the RdCur will still receive a RspFwd, in which case the home must not send it another DataC_I.

Table 8-20. Cmp_Fwd* Types Sent to the Owner

Request type   Send to Owner on behalf of Requestor
RdData         Cmp_FwdCode
RdInvOwn       Cmp_FwdInvOwn
RdCode         Cmp_FwdCode
InvItoE        Cmp_FwdInvItoE
RdCur          Cmp_FwdInvItoE
* (unknown)    Cmp_FwdInvItoE
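Table 8-20 is a straight lookup; a sketch with invented enum names follows.

    /* Table 8-20 as a lookup (sketch). Cmp_FwdInvItoE is the safe default
     * for RdCur, InvItoE, and an unknown request type. */
    typedef enum { Q_RDDATA, Q_RDINVOWN, Q_RDCODE, Q_INVITOE, Q_RDCUR,
                   Q_UNKNOWN } owner_req_t;
    typedef enum { FWD_CODE, FWD_INV_OWN, FWD_INV_ITOE } fwd_t;

    fwd_t cmp_fwd_type(owner_req_t req)
    {
        switch (req) {
        case Q_RDDATA:   return FWD_CODE;
        case Q_RDINVOWN: return FWD_INV_OWN;
        case Q_RDCODE:   return FWD_CODE;
        default:         return FWD_INV_ITOE;  /* InvItoE, RdCur, unknown */
        }
    }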
8.4.7 Summary of the Home Agent Algorithm

This section provides a summary of a possible home agent algorithm in which each Tracker entry retains one bit of information, NotOwn, beyond the sending of *Cmp/*FrcAckCnflt, in order to avoid sending a Cmp_Fwd* to any requestor which cannot have a forwardable copy of the line, as suggested in Section 8.4.6.6. When a Tracker entry receives a new request, the NotOwn bit is set to TRUE if the request type is RdCur, WbMtoI, or WbMtoS, and to FALSE otherwise. This is the only place where NotOwn is written.

The following is how the conflict lists are maintained. We stress that this is a purely conceptual design; in particular, an implementation may choose to represent the conflict lists more compactly, as remarked in Section 8.4.6.2. Below, A, B, C, etc. denote transactions or their ids (UTIDs):

• Conceptually, conflicts are between transactions and are always symmetric (i.e., if A is in B’s conflict list, then B is in A’s conflict list) and irreflexive (i.e., A is never in its own conflict list).
• When the home receives a RspCnflt or RspCnfltOwn for a request A from a peer B, A and B are made in conflict with each other (i.e., A is added to B’s conflict list and B to A’s conflict list).
• When the home receives a RspCnfltOwn from B for A’s request, it records this fact by setting one bit of information, CnfltOwn, in A’s Tracker entry. This information is used to ensure that B, which has a buried M copy, will be completed before A. This bit is cleared on receipt of an AckCnflt from B.
• A transaction A is removed from the conflict relation when the home receives an AckCnflt from A. To remove A from the conflict relation, the conflict lists are modified as follows:
  — For any B and C in A’s conflict list, make B and C in conflict with each other.
  — For any B in A’s conflict list, A is removed from B’s conflict list.
  — A’s conflict list is emptied.

In the following, the expressions “A is in B’s conflict list”, “A is a conflictor of B’s”, and “A and B are in conflict” are used interchangeably. When the home receives an AckCnflt from A, one of three things happens:

• If A’s NotOwn bit is TRUE, then the home simply sends a Cmp to A, because A’s request type must be one of RdCur, WbMtoI, and WbMtoS and hence A cannot have a forwardable copy of the line.
• If A’s NotOwn bit is FALSE and no conflictor of A has received a response from A, then none of A’s conflictors is a true conflictor, in which case the home also sends a Cmp to A. (Note that this includes the case where A has no conflictor at all.) This is to avoid deadlock, since the snoops of A’s conflictors would have been buffered or blocked when they reached A.
• If A’s NotOwn bit is FALSE and at least one conflictor of A, say B, has received a response from A, then a Cmp_Fwd* is sent to A on behalf of B according to Table 8-20. The Cmp_Fwd* must not wait for all of B’s snoop responses, as this can introduce deadlock under snoop blocking (see Section 8.2.2.3). It may wait for B’s request, but this is not necessary: a Cmp_FwdInvItoE may be sent before B’s request type is known. The selection of B, on the one hand, must not violate any ordering constraints captured so far (see Section 8.4.5), as B may be the next requestor to obtain data in the conflict chain. On the other hand, the selection of B must not commit the home to completing B next in terms of transaction ordering either, as the early sending of the Cmp_Fwd* (i.e., B may not have received all its snoop responses) means that there may be ordering constraints that the home is not yet aware of.

The reception of an AckCnflt message is the only place where the NotOwn bit is read.
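A sketch of this three-way decision, with invented types; the lookup of conflictor state stands in for the Tracker access described above, and the fairness policy is elided.

    /* Sketch of the AckCnflt-handling summary in Section 8.4.7 (names assumed). */
    #include <stdbool.h>

    typedef struct xact {
        bool         not_own;              /* TRUE for RdCur, WbMtoI, WbMtoS */
        int          n_conflictors;        /* size of the conflict list */
        struct xact *conflictor[8];        /* assumed bound (Section 8.4.6.2) */
        bool         saw_owner_rsp[8];     /* conflictor received A's RspCnflt */
    } xact_t;

    typedef enum { SEND_CMP, SEND_CMP_FWD } ack_action_t;

    /* On AckCnflt from A: decide what to send back to A (the current owner). */
    ack_action_t handle_ack_cnflt(const xact_t *a, const xact_t **chosen)
    {
        *chosen = 0;
        if (a->not_own)
            return SEND_CMP;               /* A cannot have a forwardable copy */
        for (int i = 0; i < a->n_conflictors; i++) {
            if (a->saw_owner_rsp[i]) {     /* a true conflictor exists */
                *chosen = a->conflictor[i];
                return SEND_CMP_FWD;       /* type per Table 8-20 */
            }
        }
        return SEND_CMP;                   /* no true conflictors (or no list) */
    }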
The home can generate a completion or data+completion response for a transaction A only when all of the following conditions are met:

1. The home has received A’s request and, if A is not a WbMto*, all its peers’ responses.
2. A is not ordered behind any other transaction to the same address, according to the ordering rules given in Section 8.4.5.
3. If A has received a RspCnfltOwn before, its CnfltOwn bit has been cleared by receiving an AckCnflt from the conflictor that sent the RspCnfltOwn.
4. None of A’s conflictors is waiting for an AckCnflt. This condition ensures that no Cmp_Fwd* need be sent on behalf of A and that, if a data response is needed, the data can be obtained from memory.
5. If there has been an explicit (WbMto*) or implicit (Rsp*Wb) writeback, the writeback data has been committed to memory.

If all of the above conditions are met, then the home sends to A a completion response (Cmp or FrcAckCnflt) if A is a WbMto* or has received an implicit forward, or a data+completion response according to Table 8-18 or Table 8-19 otherwise. The *Cmp response is used when A’s conflict list is empty, and the *FrcAckCnflt response is used when it is not. An exception to the last rule is that a Cmp response may always be sent to a WbMto*, because if a WbMto* is in conflict with any other request, it must have sent a RspCnflt* and hence will respond to the Cmp with an AckCnflt.
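The five conditions collapse into a single predicate; a hedged sketch with invented field names follows.

    /* Sketch of the completion conditions in Section 8.4.7 (names assumed). */
    #include <stdbool.h>

    typedef struct {
        bool is_wb_mto;             /* transaction is a WbMto* */
        bool req_msg_rcvd;          /* 1: request message received */
        bool all_snoop_rsps_rcvd;   /* 1: all peers' responses (non-WbMto*) */
        bool ordered_first;         /* 2: not behind any same-address xact */
        bool cnflt_own_set;         /* 3: RspCnfltOwn not yet cleared */
        bool conflictor_awaits_ack; /* 4: a conflictor still owes an AckCnflt */
        bool wb_pending;            /* 5: writeback data not yet committed */
    } home_xact_t;

    bool may_complete(const home_xact_t *a)
    {
        return a->req_msg_rcvd &&
               (a->is_wb_mto || a->all_snoop_rsps_rcvd) &&
               a->ordered_first &&
               !a->cnflt_own_set &&
               !a->conflictor_awaits_ack &&
               !a->wb_pending;
    }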
8.5 Scaling CSI With an Out-of-Order Network

This section describes a home agent algorithm which allows for 3-hop home broadcast (directory) coherence on an unordered network. An out-of-order protocol is desirable in systems with an elaborate communication fabric, or in systems that may require advanced RAS features which would otherwise be difficult to implement with an ordered protocol. The caching agent interface is as described in Section 8.3. The following high-level constraints exist for this out-of-order protocol:

• Snoops may only be sent by the home agent.
• There can be snoops outstanding for at most one request per address at a time.
• The home agent must maintain an inclusive directory, which means that there must be a directory entry for every cached line within the coherence domain. Further constraints on the directory information are explained in Section 8.5.1.
• The home channel remains preallocated.

These constraints imply:

• The PeerAgents configuration parameter (as described in Section 8.5) must be null at each caching agent.
• The protocol becomes 3-hop, as each request must take an initial hop to the home agent, which will [optionally] send snoops to its peer caching agents, and any hit data may be returned directly to the requestor.

8.5.1 Directory Structure Requirements

This protocol relies on a conservative-inclusive directory at the home agent. As mentioned earlier, inclusive implies that there must be a directory entry for each unique cached line in the system. The directory may be conservative, however, meaning that it is not required to be kept up to date on silent state transitions(1) (E->S, S->I, etc.). The protocol places these other minimal requirements on the directory structure:

• The directory entry must be able to encode a pointer to an explicit owner NID.
• The directory entry must be able to encode the I, E, E’, and S states.
• Explicit pointers (NIDs) for sharers are optional; however, for each explicit sharer that is included, the directory must provide separate S vs. S’ (S-prime) states. The primed state is explained in Section 8.5.5.
• The directory entry may use a coarse representation for sharers (i.e., represent multiple caching agents by a single sharing pointer). However, there is reduced performance for lines which have a coarse representation of sharers. This performance reduction is described later in Section 8.5.5. Therefore, it is recommended that directories provide at least one or two explicit sharing pointers, and only transition into a coarse representation of the sharing list when the number of sharers exceeds this number.

Table 8-21 shows an example 20-bit-wide directory format which works for up to 256 caching agents, with up to 2 explicit sharers and a 16-bit coarse sharing mask. Please note the required (prime) state encodings for each explicit owner and sharer pointer. The examples in this section will assume a directory encoding similar to Table 8-21.

Table 8-21. Example Directory Format (20 bits, numbered 0-19)

State             Encoding
Invalid, I        (no owner or sharers)
Exclusive, E      Owner NID [7:0] in E state
Exclusive, E’     Owner NID [7:0] in E’ state
Shared1, S        Sharer #1 [7:0] in S state
Shared1, S1’      Sharer #1 [7:0] in S’ state
Shared2, S1’      Sharer #2 [7:0] in S state; Sharer #1 [7:0] in S’ state
Shared2, S2’      Sharer #2 [7:0] in S’ state; Sharer #1 [7:0] in S state
Shared2, S1’S2’   Sharer #2 [7:0] in S’ state; Sharer #1 [7:0] in S’ state
Shared2, S1S2     Sharer #2 [7:0] in S state; Sharer #1 [7:0] in S state
Coarse Sharing    16-bit coarse sharing mask; all sharers in S’ state

(1) CSI does not provide any ‘clean cast-out’ or ‘replacement hint’ transactions for alerting a directory on a silent cache state transition.
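As an illustration of Table 8-21 only: the table fixes the 20-bit width and the information content but not the bit assignment, so the state-tag packing below is entirely an assumption.

    /* Illustrative packing of a 20-bit directory entry per Table 8-21. */
    #include <stdint.h>

    enum dir_state {             /* assumed 4-bit state tag */
        DIR_I, DIR_E, DIR_E_PRIME,
        DIR_S1, DIR_S1P,         /* one sharer: S or S' */
        DIR_S2_S1P, DIR_S2P_S1, DIR_S1P_S2P, DIR_S1_S2,
        DIR_COARSE               /* 16-bit mask, all sharers in S' */
    };

    static inline uint32_t dir_pack(enum dir_state st, uint8_t p1, uint8_t p2)
    {
        /* [19:16] state tag, [15:8] pointer #2, [7:0] pointer #1 (assumed) */
        return ((uint32_t)st << 16) | ((uint32_t)p2 << 8) | p1;
    }

    static inline uint32_t dir_pack_coarse(uint16_t mask)
    {
        return ((uint32_t)DIR_COARSE << 16) | mask;  /* [15:0] coarse mask */
    }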
8.5.2 Home Agent Microarchitectural Constraints

The home agent meters access to four resources: (1) reading and writing memory, (2) reading and writing the directory info, (3) sending snoops into the network, and (4) sending responses into the network. Access to these resources, along with the permitted dependencies, forms the bulk of the constraints on the home agent microarchitecture.

All Rd* & InvItoE requests must read the directory (and potentially modify the directory) on arrival at the home agent. The home agent must provide forward progress for this directory manipulation without relying on forward progress of the SPT (snoop pending table) or Request Spill Queue (described later). For Rd* & InvItoE requests which (on the directory read on arrival at the home agent) find the directory in E state, with the directory owner equal to the requestor, these requests must be able to read and write the memory and directory, and must be able to send a response to the requestor, without a dependence on the forward progress of the SPT or Request Spill Queue. These requests may queue on the network (blocking the home channel) while awaiting response-class resources with which to send the response to the requestor. Requests of this type must never send snoops.

The home agent is required to track requests which have outstanding snoops in the network, as well as to provide a mapping from the address to the state associated with an outstanding request. There must also be a mapping from reqNID:reqTID to the appropriate SPT entry, so that snoop responses can be matched up with their SPT entry. We will refer to this collection of state as the SPT, and assume that it can be indexed (CAMed) by address and by reqNID:reqTID. This structure may be arbitrarily sized. Any request (Rd*/InvItoE) which reads the directory and finds it in a non-NULL state potentially needs an SPT entry. Deallocation of SPT entries relies on forward progress of the home channel (for snoop responses, etc.) and the response channel. Therefore, it is not legal to stall the home channel on the network while awaiting an SPT entry to become available, as this would create a circular dependence which would deadlock. This creates a requirement that Rd* & InvItoE messages which are arbitrating for access to the SPT be accepted from the network without a dependence on SPT entries becoming available (i.e., without a dependence on snoops, snoop responses, etc.). Therefore, the home must provide preallocated resources for the Rd* & InvItoE messages which are arbitrating for access into a finite SPT.

We will refer to this preallocated structure as the Request Spill Queue. It must be deep enough to hold all of the possible requests that can target this home agent. The only manipulation of this Request Spill Queue required is to add transactions to it and to remove transactions from it. One interesting implementation is a strict FIFO, though others are possible.

The SPT also maintains in-flight directory information (as it is initiating a change in global cache state). For this reason, all incoming requests, snoop responses, writebacks, and AckCnflt’s must index into the SPT (CAM) in order to factor into transitional protocol state calculations, independent of forward progress on the SPT or Request Spill Queue. In general, this operation should happen (for Rd*, InvItoE, Wb*Data[Ptl]) on arrival, before they are placed in the Request Spill Queue or otherwise processed. Snoop responses are always folded into an existing SPT entry (they must hit on the outstanding SPT entry that spawned the snoop). Incoming AckCnflt’s must index the SPT on arrival, and must be able to reply with a Cmp/Cmp_Fwd* to the owner, all without relying on forward progress on the SPT or Request Spill Queue. AckCnflt’s may queue on the network (blocking the home channel) while awaiting response-class resources to send a Cmp* to the owner. In general, AckCnflt’s will either terminate in the SPT entry (on a hit), or will miss in the SPT and proceed on to sending the Cmp message.

All incoming Wb*Data[Ptl]’s must (1) write memory (or do a read-modify-write in the case of a WbIDataPtl), (2) index (CAM) the SPT to see if this writeback matches an outstanding request, (3) modify the directory to indicate the new state of the line, and (4) send a Cmp to the writeback agent (in the case of an explicit writeback). They must do (1), (2), and (3) without relying on forward progress of the SPT or Request Spill Queue. All Wb*Data[Ptl]’s must be processed even if there are no response-class resources to send the Cmp (4), just as in the normal protocol. This creates a requirement for preallocation of the information needed to send a Cmp for a Wb*Data[Ptl]. However, for (4), the Wb*Data[Ptl] may rely on forward progress of the Request Spill Queue and the SPT. Since the pool of transaction IDs is shared between Rd*, InvItoE, and Wb*Data[Ptl], it would make sense to use a single preallocated queue to house both the Rd*/InvItoE requests and the Wb*Data[Ptl] header info necessary to send a Cmp for the writeback. The Wb*Data[Ptl] header messages can therefore occupy the Request Spill Queue while waiting for response-channel credits with which to send the Cmp message to the writeback agent. WbMto* messages may be discarded, as they provide no additional information which is not captured in the Wb*Data[Ptl] messages for this protocol.
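A minimal sketch of the two structures and the arrival rule, with invented names and sizes; the CAM here is a linear scan purely for illustration.

    /* Sketch of the SPT and Request Spill Queue split (Section 8.5.2). */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        bool     valid;
        uint64_t addr;          /* CAM key #1 */
        uint32_t req_nid_tid;   /* CAM key #2: reqNID:reqTID */
        uint8_t  dir_state;     /* in-flight directory information */
        bool     snoops_outstanding;
    } spt_entry_t;

    #define SPT_SIZE   16       /* finite, implementation-chosen */
    #define SPILL_SIZE 256      /* preallocated: every request that can
                                   target this home agent must fit */

    typedef struct { uint32_t req[SPILL_SIZE]; int head, tail; } spill_q_t;

    /* Arrival of a Rd*, InvItoE, or Wb*Data[Ptl]: CAM the SPT first (to fold
     * into transitional state), then spill if no SPT entry is free. Neither
     * step may depend on snoop or snoop-response forward progress. */
    bool cam_spt(spt_entry_t spt[], uint64_t addr, uint32_t nid_tid, int *idx)
    {
        for (int i = 0; i < SPT_SIZE; i++) {
            if (spt[i].valid &&
                (spt[i].addr == addr || spt[i].req_nid_tid == nid_tid)) {
                *idx = i;
                return true;
            }
        }
        return false;
    }

    void spill_push(spill_q_t *q, uint32_t req)   /* strict-FIFO variant */
    {
        q->req[q->tail] = req;
        q->tail = (q->tail + 1) % SPILL_SIZE;
    }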
8.5.3 Simple Protocol Flows

Here we will show several simple protocol flows which illustrate the common performance cases for this protocol.

Figure 8-19. RdData Request Fetching an E-State Line and Setting Dir State [protocol flow diagram]

Figure 8-20. [protocol flow diagram: RdInvOwn to a line shared by A and B]

Figure 8-21. RdInvOwn Request HITM [protocol flow diagram]

Figure 8-22. WbIData Arriving: We Discard Any WbMto* Message [protocol flow diagram]

8.5.4 Home Agent Algorithm Overview

The primary challenge for this protocol is to distinguish between an early conflict and a late conflict at the home agent. An early conflict is a conflict with a requestor for whom the home has not yet sent the Cmp or FrcAckCnflt (i.e., the request is still outstanding, as in Figure 8-23). A late conflict is one in which the conflictor has already been sent his DataC_*/GntE and Cmp or FrcAckCnflt, but a subsequent snoop arrives before the owner’s Request phase completes (as in Figure 8-24). There is a guarantee that lets us distinguish these two cases, however: whenever a snoop returns a RspCnflt, then either the conflictor’s request is in-flight to the home (early conflict), or the AckCnflt will eventually arrive at the home (late conflict). By waiting at the home for one of these two events when we receive a RspCnflt, we can disambiguate the two cases. However, the request that is in-flight may arrive before the snoops were even generated, so we cannot rely on a CAM hit of the SPT alone. Therefore, we use special directory states to encode the fact that someone on the sharing list has received a new request. These are the ‘primed’ states.

For example, in Figure 8-23, agent B was on the sharers list and then re-requested the line. On the arrival of agent B’s request, we change the directory to show that B is now in S’ state. S’ is a state attached to an explicit sharer which indicates that we are guaranteed that this sharer has received its most recent S copy. In this case, it allows us to know (when we receive the RspCnflt) that any remaining S copies have been invalidated, and it is therefore safe to grant the line to agent C. A late conflict is shown in Figure 8-24. Here, the home receives a RspCnflt from C for B’s request. The home then waits, and eventually an AckCnflt arrives, which indicates that this was a late conflict and therefore that the most recent data is at C.

Figure 8-23. Early Conflict Resolved by Detecting Request from Agent on Sharing List [protocol flow diagram]

Figure 8-24. Late Conflict Resolved by Waiting for an AckCnflt [protocol flow diagram; annotation: when we get a RspCnflt, wait for an AckCnflt or a Request]

We use the S vs. S’ state encodings to solve cases where we have a conflict from an agent on the sharing list. We could just as easily use E vs. E’ to solve cases where we have a conflict with the agent which is the owner. However, there is an easier solution for the E-state case, and one that more elegantly handles buried HITM flows. This solution is to simply always reply immediately to a request from an agent which is listed in E state in the directory. This can be done because such a transaction will never need to send out snoops, which changes the dependencies such that it is safe to queue such transactions across the network (they will eventually drain with only a dependence on the response channel).
This trivially guarantees that if there is buried M-state data, the owner’s request will always be the first one satisfied.

Figure 8-25. Buried HITM Case [protocol flow diagram]

8.5.5 Using Coarse Sharing Lists

Clearly, this protocol naturally relies on having explicit pointers and independent state info for the sharers, as it is necessary to make the S vs. S’ distinction at caching-agent granularity. However, it is desirable to use coarse sharing masks in large systems for cases when there is widely shared data. CSI OoO supports this coarse sharing when the number of sharers exceeds the amount that can be explicitly recorded, albeit under a lower-performance transaction flow. The basic idea is straightforward: since we do not have the S vs. S’ mechanism to capture when data has truly arrived at a caching agent (and therefore to distinguish between the early and late conflict cases), we need a more drastic method. The method we have chosen is to use the FrcAckCnflt/AckCnflt handshake on RdData and RdCode requests whenever the number of sharers exceeds the number that can be encoded explicitly. Figure 8-26 shows how a RdCode that finds the directory in Coarse Sharing mode uses the FrcAckCnflt/AckCnflt handshake to serialize the arrival of the Data with the sending out of subsequent snoops for any other request. This is a ~20% increase in network traffic and a ~70% increase in occupancy for the non-RFO requests which hit a line in coarse sharing mode, neither of which seems problematic given the likely usage models (stream data, barrier synchronization). A subsequent RFO request can launch snoops using the Coarse Sharing mask, and any RspI or RspCnflt should be interpreted as a RspI (the late conflict race is not possible due to the FrcAckCnflt/AckCnflt serialization).

Figure 8-26. Using the FrcAckCnflt/AckCnflt Handshake for a RdCode in Coarse Sharing [protocol flow diagram; annotation: when in S_Coarse, we are guaranteed that all prior S copies have arrived]

To transition to coarse sharing mode, it is necessary to launch snoops to any non-prime (S-state) explicit sharer to determine if the previous request’s data has arrived yet. A RspCnflt from one of the sharers means we have to wait for either the new request to arrive (and therefore transition the S-state sharer to an S’-state sharer), or the AckCnflt. This flow is shown in Figure 8-27. This mechanism has the nice property of actually determining whether any of the sharers have since invalidated their copy of the line, which helps prevent transiting into coarse sharing mode unless absolutely necessary. To transition from coarse sharing mode back to normal sharing mode, it is necessary to write the line or do a global flush cache (InvItoE).
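A sketch of the sharer-recording decision at the Table 8-21 limits. The two-explicit-sharer threshold matches that example format; everything else here is invented, and the sketch deliberately omits the SnpCode/AckCnflt dance (Figure 8-27) required to confirm in-flight S copies before the conversion.

    /* Sketch: record a new RdData/RdCode sharer (Section 8.5.5), falling back
     * to the 16-bit coarse mask beyond 2 explicit sharers. */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        bool     coarse;      /* coarse-sharing mode (all sharers S') */
        uint16_t mask;        /* coarse mask: one bit per clump (assumed) */
        uint8_t  sharer[2];   /* explicit sharer NIDs */
        int      n_sharers;
    } dir_sharers_t;

    /* Returns true if the requestor must use the FrcAckCnflt/AckCnflt
     * handshake (coarse mode cannot track S vs. S' arrival). */
    bool add_sharer(dir_sharers_t *d, uint8_t nid)
    {
        if (!d->coarse && d->n_sharers < 2) {
            d->sharer[d->n_sharers++] = nid;
            return false;                  /* normal DataC_*_Cmp path */
        }
        if (!d->coarse) {                  /* capacity exceeded: convert */
            d->coarse = true;
            d->mask = 0;
            for (int i = 0; i < d->n_sharers; i++)
                d->mask |= (uint16_t)(1u << (d->sharer[i] % 16));
        }
        d->mask |= (uint16_t)(1u << (nid % 16));
        return true;                       /* DataC_*_FrcAckCnflt handshake */
    }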
Figure 8-27. Transiting from Explicit Sharers to Coarse Sharing (flow diagram)

When transitioning to coarse directory mode, it is necessary to:
(A) Make sure that all in-flight S copies have arrived at their requestors. Do this by sending SnpCodes and using the AckCnflt flow.
(B) For all subsequent sharers, use the FrcAckCnflt->AckCnflt flow to guarantee data arrival before sending snoops for a subsequent request.

8.5.6 Protocol English Flows
This section attempts to capture the spirit of how this algorithm works based on the incoming message.

8.5.6.1 On an Incoming Rd* or InvItoE Request
• Read directory entry and CAM SPT by address.
• If requestor (reqNID) is in E-state in the directory and SPT miss:
  — Fetch data from memory (Rd*).
  — Queue home channel waiting for response credits to send the DataC_*_Cmp or GntE_Cmp to the requestor.
  — Send DataC_*_Cmp or GntE_Cmp to the requestor.
• If requestor (reqNID) is in E-state in the directory and SPT hit:
  — Fetch data from memory (Rd*).
  — Queue home channel waiting for response credits to send the DataC_*_FrcAckCnflt or GntE_FrcAckCnflt to the requestor.
  — Send DataC_*_FrcAckCnflt or GntE_FrcAckCnflt to the requestor.
• If requestor (reqNID) is in S-state in the directory and SPT hit or miss:
  — Mark directory as S’-state for this reqNID.
  — Wait in Request Spill Queue for an SPT entry to become available.
• ELSE
  — Wait in Request Spill Queue for an SPT entry to become available.
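A C sketch of the 8.5.6.1 decision may help. All types and helpers are hypothetical stand-ins (the specification defines the protocol messages, not this code); the key point it shows is that an SPT hit for an E-state requestor selects the FrcAckCnflt variant of the completion.

```c
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical directory/SPT model; the spec does not define these types. */
typedef enum { DIR_I, DIR_E, DIR_S, DIR_S_COARSE } dir_state_t;
typedef struct { dir_state_t state; uint16_t owner; } dir_entry_t;

bool spt_cam_hit_by_addr(uint64_t addr);
void fetch_from_memory(uint64_t addr);
void send_datac_cmp(uint16_t reqnid);            /* DataC_*_Cmp / GntE_Cmp */
void send_datac_frcackcnflt(uint16_t reqnid);    /* DataC_*_FrcAckCnflt    */
void mark_sharer_primed(dir_entry_t *d, uint16_t reqnid);   /* S -> S'    */
void spill_queue_push(uint64_t addr, uint16_t reqnid);

/* On an incoming Rd* or InvItoE (8.5.6.1): read the directory, CAM the
 * SPT by address, and choose Cmp vs. FrcAckCnflt from the SPT result. */
void on_incoming_request(dir_entry_t *d, uint64_t addr, uint16_t reqnid)
{
    bool spt_hit = spt_cam_hit_by_addr(addr);

    if (d->state == DIR_E && d->owner == reqnid) {
        fetch_from_memory(addr);
        if (!spt_hit)
            send_datac_cmp(reqnid);          /* no outstanding entry       */
        else
            send_datac_frcackcnflt(reqnid);  /* force AckCnflt handshake   */
    } else if (d->state == DIR_S) {
        mark_sharer_primed(d, reqnid);       /* sharer has its newest copy */
        spill_queue_push(addr, reqnid);      /* wait for a free SPT entry  */
    } else {
        spill_queue_push(addr, reqnid);
    }
}
```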
8.5.6.2 On Popping a Rd*/InvItoE From the Request Spill Queue Into the SPT with Directory in Invalid, Shared, or Coarse Shared States
• Allocate an SPT entry (popping implies an entry is available and there’s no address conflict).
• If dir state is I:
  — Fetch data from memory (Rd*).
  — Send DataC_*_Cmp or GntE_Cmp to the requestor.
  — Mark the new owner in the directory in E state.
• If dir state is Shared (explicit sharers), and it’s a RdInvOwn or InvItoE request:
  — Send SnpInvItoE to all agents on the sharing list.
  — For each RspI received, subtract that agent from the sharing list.
  — For each RspCnflt received from an S agent, wait for either the Request or the AckCnflt from that agent to show up.
    a. If the Request arrives, mark its S state as S’ and subtract that agent from the sharing list now, as we know it does not have the line.
    b. If the AckCnflt arrives, send a Cmp_FwdInvItoE or Cmp_FwdInvOwn to the owner, collect the RspI, then subtract the agent from the sharing list.
  — For each RspCnflt received from an S’ agent, subtract that agent from the sharing list (as we know it does not have the line anymore).
  — When the sharing list is empty, send the DataC_E_Cmp or GntE_Cmp to the requestor.
  — Mark the new owner in the directory in E state.
• If dir state is Coarse Shared, and it’s a RdInvOwn or InvItoE request:
  — Send SnpInvItoE (or SnpInvOwn) to all the agents represented by the coarse sharing list, and set a coherence count to the number of agents.
  — For each RspI or RspCnflt received, decrement the coherence count.
  — When the coherence count is zero, send the DataC_E_Cmp or GntE_Cmp to the requestor.
  — Mark the new owner in the directory in E state.
• If dir state is Shared or Coarse Shared, and it’s a RdCur request:
  — Fetch the line from memory.
  — Send the DataC_I_Cmp to the requestor.
• If dir state is Coarse Shared, and it’s a RdCode or RdData request:
  — Fetch the line from memory.
  — Send the DataC_S_FrcAckCnflt to the requestor.
  — Wait on the AckCnflt, then send a Cmp to the requestor.
  — Mark the new sharer in the directory (coarsely).
• If dir state is Shared, and it’s a RdCode or RdData request, and we are exceeding the number of explicit sharers the directory can record:
  — For each sharer that is in S state (as opposed to S’), send a SnpCode:
    a. If you receive back at least one RspI, remove that agent from the sharing list, swap in the new requestor, and stay in explicit sharer mode.
    b. If you receive back a RspS, mark that agent in the sharing list as ‘S’.
    c. If you receive back a RspCnflt, wait for the matching AckCnflt or Request. On an AckCnflt, send a Cmp and mark that agent in the sharing list as S’. On a Request, mark the requestor in S’ state.
    d. Once the sharing list is all S’s or NULL’s, and no RspI was received (per (a) above), change state to Coarse Shared and change the directory encoding to match a coarse representation of the existing sharers.
  — Fetch the line from memory.
  — If we changed to Coarse mode, send the DataC_S_FrcAckCnflt to the requestor:
    a. Wait for the matching AckCnflt.
    b. Mark the new sharer in the directory (coarsely).
  — If we stayed in explicit mode, send DataC_S_Cmp to the requestor:
    a. Mark the new sharer in the directory in S-state.
• If dir state is Shared, and it’s a RdCode or RdData request, and we are not exceeding the number of explicit sharers the directory can encode:
  — Fetch the line from memory.
  — Send DataC_S_Cmp to the requestor.
  — Mark the new sharer in the directory in S-state.

8.5.6.3 On Popping a Rd*/InvItoE From the Request Spill Queue Into the SPT with Directory in Exclusive
• If the dir state is Exclusive:
  — Send the snoop:
    a. Send a SnpCur to the owner (for a RdCur request).
    b. Send a SnpCode to the owner (for a RdCode request).
    c. Send a SnpData to the owner (for a RdData request).
    d. Send a SnpInvOwn to the owner (for a RdInvOwn request).
    e. Send a SnpInvItoE to the owner (for an InvItoE request).
  — If we receive a RspI response:
    a. Fetch data from memory (Rd* request).
    b. Send DataC_*_Cmp or GntE_Cmp to the requestor.
    c. Mark the new requestor as Exclusive owner (DataC_E/GntE) or sole Sharer (DataC_S) in S-state, or not at all (DataC_I).
  — If we receive a RspS response:
    a. Fetch data from memory.
    b. Send DataC_S_Cmp or DataC_I_Cmp to the requestor.
    c. Mark directory as S’-state for the current owner and S-state for the new sharer (or not at all in the DataC_I case).
  — If we receive a RspIWb response:
    a. Wait for WbIData, write it out to memory.
    b. Send DataC_*_Cmp or GntE_Cmp to the requestor.
    c. Mark the new requestor as Exclusive owner (DataC_E/GntE) or sole Sharer (DataC_S) in S-state, or not at all (DataC_I).
  — If we receive a RspSWb response:
    a. Wait for WbSData, write it out to memory.
    b. Send DataC_S_Cmp or DataC_I_Cmp to the requestor.
    c. Mark directory as S’-state for the current owner and S-state for the new sharer (or not at all in the DataC_I case).
  — If we receive a RspFwdIWb response:
    a. Wait for WbIData, write it out to memory.
    b. Mark the new requestor as Exclusive owner (RdData or RdInvOwn) or sole Sharer (RdCode) in S-state.
    c. Send Cmp to the requestor.
  — If we receive a RspFwdSWb response:
    a. Wait for WbSData, write it out to memory.
    b. Mark directory as S’-state for the current owner and S-state for the new sharer.
    c. Send Cmp to the requestor.
  — If we receive a RspFwdI response:
    a. Send Cmp to the requestor.
    b. Mark the new requestor as Exclusive owner (RdData or RdInvOwn) or sole Sharer (RdCode) in S-state.
  — If we receive a RspFwdS response:
    a. Send Cmp to the requestor.
    b. Mark directory as S’-state for the current owner and S-state for the new sharer.
  — If we receive a RspFwd response:
    a. Send Cmp to the requestor.
    b. Directory is unchanged (RdCur).
  — If we receive a RspCnflt or RspCnfltOwn response:
    a. Wait for either the Request from the E-state owner, or the AckCnflt. [Note: It is not necessary to mark the owner as S’ as opposed to S state. However, the RspS does guarantee that the owner’s previous request had completed at the source, which is what S’ implies. Aggressively marking S’ in these cases will limit the number of later SnpCodes that would be needed to transition to a Coarse sharing state.]
    b. If the Request arrives, the owner will send Data or GntE to itself, along with a FrcAckCnflt; wait on the AckCnflt response.
    c. On the AckCnflt from (a) or (b), send the appropriate Cmp_Fwd* to the owner, to grant the line to the requestor. Wait for the snoop response for the Cmp_Fwd*.
    d. If a Rsp*Wb, wait for the Wb*Data.
    e. If a RspI, fetch the line from memory and deliver DataC_*_Cmp or GntE_Cmp.
    f. If a RspFwd*, just send a Cmp to the requestor.
    g. Mark the directory as either Exclusive (RdInvOwn, InvItoE), Shared with a single S-state sharer (RdCode, RdData), or I-state (RdCur).

8.5.6.4 On Popping a Wb*Data* Header From the Top of the Request Spill Queue
• Send the Cmp to the Wb*Data* requestor.

8.5.6.5 On Receipt of an AckCnflt
• CAM SPT by address.
• If it hits an open SPT entry, the rules in Section 8.5.6.1, Section 8.5.6.2, and Section 8.5.6.3 apply, depending on the context provided in those sections.
• If it misses in the SPT, queue home channel waiting for response credits to send the Cmp to the AckCnflt-owner.
• Send the Cmp to the AckCnflt-owner.

8.5.6.6 On Receipt of a Wb[IS]Data*
• CAM SPT by reqNID:reqTID and by address.
• If a reqNID:reqTID hit, mark the data as arrived and write out the data to memory.
• If an address hit but not a reqNID:reqTID hit, write out the data to memory, change the directory state in the hit entry, and place the Wb*Data* header in the Request Spill Queue while it waits for a response credit to send the Cmp back to the requestor.
• If it misses in the SPT, write out the data to memory and place the Wb*Data* header in the Request Spill Queue while it waits for a response credit to send the Cmp back to the requestor.

8.5.6.7 On Receipt of a WbEData
• CAM SPT by reqNID:reqTID and by address.
• If a reqNID:reqTID hit, mark the data as arrived and write out the data to memory.
• Regardless of address hit, the WbEData must be sent a Cmp without relying on forward progress of the Request Spill Queue.

8.5.6.8 On Receipt of a Rsp*
• The rules in Section 8.5.6.1, Section 8.5.6.2, and Section 8.5.6.3 cover snoop responses.
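The snoop-response handling of Section 8.5.6.3 reduces to a dispatch over the response type. The following C sketch summarizes it under assumed helper names (fetch_and_send_data and friends are not defined by this specification); directory updates are omitted for brevity.

```c
/* Hypothetical response codes and helpers. */
typedef enum {
    RSP_I, RSP_S, RSP_IWB, RSP_SWB,
    RSP_FWD_IWB, RSP_FWD_SWB, RSP_FWD_I, RSP_FWD_S, RSP_FWD, RSP_CNFLT
} snp_rsp_t;

void fetch_and_send_data(void);        /* memory read + DataC_*_Cmp/GntE_Cmp */
void wait_for_wbdata_and_write(void);  /* collect Wb[IS]Data, write memory   */
void send_cmp(void);                   /* bare completion (data forwarded)   */
void park_for_request_or_ackcnflt(void);

/* Directory-Exclusive case (8.5.6.3): one snoop went to the owner and a
 * single response comes back.  Directory marking is omitted here. */
void on_snoop_response(snp_rsp_t rsp)
{
    switch (rsp) {
    case RSP_I:            /* owner invalidated, no data forwarded        */
    case RSP_S:            /* owner kept an S copy, no data forwarded     */
        fetch_and_send_data();
        break;
    case RSP_IWB:          /* owner writes back, home supplies the data   */
    case RSP_SWB:
        wait_for_wbdata_and_write();
        fetch_and_send_data();   /* deliver from now-up-to-date memory     */
        break;
    case RSP_FWD_IWB:      /* data forwarded to requestor, plus writeback */
    case RSP_FWD_SWB:
        wait_for_wbdata_and_write();
        send_cmp();
        break;
    case RSP_FWD_I:        /* data forwarded, owner went to I             */
    case RSP_FWD_S:        /* data forwarded, owner kept S                */
    case RSP_FWD:          /* data forwarded, owner unchanged (RdCur)     */
        send_cmp();
        break;
    case RSP_CNFLT:        /* conflict: disambiguate per Section 8.5.4    */
        park_for_request_or_ackcnflt();
        break;
    }
}
```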
8.6 Application Notes
The previous sections have focused on describing the primitives provided by this protocol for implementing higher order actions, without much information as to how these primitives are used. This section is intended to provide additional information and hints as to how best to use these primitives to implement certain high level operations.

8.6.1 Global Observation
Designers of caching agents and the memory subsystems hidden behind them are often concerned with the global observation point for requests-for-ownership (RFOs). Global observation is, quite simply, the point at which the global protocol can guarantee that any subsequent processor load or I/O read will return the data written by the RFO operation. In CSI, global observation for an RFO transaction can always be inferred on receipt of the completion message (Cmp or FrcAckCnflt), or the response message (DataC_E, DataC_M). This rule holds regardless of whether the data comes from the home or from a caching agent. An InvItoE will always signal GO with a combined response (GntE_FrcAckCnflt/GntE_Cmp). For cached data, the DataC_E or DataC_M message will usually reach the requesting caching agent first, and it can signal GO. For uncached data, the home agent may choose to combine the DataC_E with the completion (Cmp or FrcAckCnflt) into a single message. However, it may also be desirable to send the completion as soon as all the snoop responses have been received, and this is permitted by CSI. In this case, GO can be signaled from the receipt of the completion, with the DataC_E trailing behind. The converse, of course, is not true: the home agent is not permitted to deliver DataC_* to the requestor until all of the snoop responses have been received.

8.6.2 Flush Cache Operation
One higher order primitive that some memory models require is a global flush cache primitive. The semantics of this operation are that it must ensure that the data at the given address is globally observable by a load initiated with all possible page table attributes. In CSI, a flush cache can be implemented with an InvItoE operation. InvItoE ensures that all caches are in I state and that any M-state data is written to memory before a GntE_Cmp is returned. Home agents should be sure not to compromise the overloaded intention of using InvItoE as a flush cache primitive by returning GntE_Cmp at a point before a subsequent UC load (for example) is able to read the latest data.

8.6.3 Partial Write to Coherent Space
There are a number of memory model quirks that are implemented through the use of a coherent ‘push’ operation, otherwise known as a write-through or partial write. The front side bus (FSB) equivalent of this operation was BWIL. In CSI, this operation is synthesized by joining two existing primitives, the InvItoE and the coherent writeback, as shown in Figure 8-28. The InvItoE forces a writeback of any M-state data and acquires exclusive ownership of the line. This is followed by an immediate coherent writeback operation (WbMtoI, for example). For a partial write, we use a WbIDataPtl packet, which contains a per-byte mask indicating which bytes to write. Home agents must ensure that only the bytes indicated by the byte mask are modified in memory.

Figure 8-28. Partial write to coherent space, hit M (flow diagram)

The other place where this flow diverges somewhat from normal operation is under a conflict with the InvItoE operation, as shown in Figure 8-29. The InvItoE from agent C acquires ownership of the line, but a conflict with B causes an AckCnflt phase, at which point the home attempts to forward the line from C to agent B. However, agent C does not have the background data for the line, so it cannot give the line directly to B. In this case, caching agent C must use its option to issue a RspIWb with a WbIDataPtl to the home. The home will do the merge with the data in memory and deliver the line to the requestor.
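The per-byte merge performed at the home for a WbIDataPtl can be pictured in a few lines of C. This is illustrative only: the wbidataptl_t layout and the 64-byte line size are assumptions for the sketch, and a real home agent operates on flit payloads rather than flat buffers.

```c
#include <stdint.h>

#define CACHE_LINE_BYTES 64  /* assumed line size */

/* Hypothetical WbIDataPtl view: payload plus a per-byte mask. */
typedef struct {
    uint8_t  data[CACHE_LINE_BYTES];
    uint64_t byte_mask;               /* bit i set => byte i is valid */
} wbidataptl_t;

/* Home-agent merge for a partial writeback: only bytes named in the
 * mask may modify memory; all other bytes keep their old contents. */
void merge_partial_writeback(uint8_t *mem_line, const wbidataptl_t *wb)
{
    for (unsigned i = 0; i < CACHE_LINE_BYTES; i++) {
        if (wb->byte_mask & (1ULL << i))
            mem_line[i] = wb->data[i];
    }
}
```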
Figure 8-29. Partial write under conflict with the InvItoE operation (flow diagram)

8.7 Coherence Protocol Open Issues

8.7.1 Arbitrary AckCnflts
The next revision of this chapter will relax the rules governing when a caching agent may generate an AckCnflt. This flexibility is being enabled to allow the AckCnflt flow to be leveraged to resolve internal, implementation-specific cases. The following guidance will be given for this flow:
• An AckCnflt may only be generated immediately following the request phase of a transaction. If a transaction will have an AckCnflt, then the transaction is not considered complete until that AckCnflt is completed.
• AckCnflts are not intended to be used for performance-critical flows, due to the extra message bandwidth demand and the increased TID occupancy.
• AckCnflts which are initiated by purely internal events can lead to debug confusion, as the ‘arbitrary’ AckCnflts may alias a real error case. Therefore, caching agent implementations should make every attempt to expose the reason for AckCnflt initiation through an implementation-specific encoding in debug bits provided in the AckCnflt message. On a CSI-based Link layer, these debug bits will be in the place of the critical chunk (Addr[5:3]) bits, with 000 indicating that the AckCnflt was initiated because of a CSI-visible event; implementation-specific encodings will expose the various internal cases which may initiate the AckCnflt.
• There may be a constraint imposed that the AckCnflt may only be initiated if the line is held in E or M state at the caching agent.

This chapter describes the different non-coherent transactions supported with the CSI protocol and the rules for their usage. Non-coherent transactions are defined as those transactions which do not participate in the CSI coherency protocol.

9.1 Transaction List
Non-coherent transactions comprise requests and their corresponding completions. For some special transactions, a broadcast mechanism is required of the initiator; this is described in Section 9.8, “Broadcast Non-Coherent Transactions” on page 9-314. The non-coherent transactions are listed in Table 9-2, and the abbreviations used to label the transactions are listed in Table 9-1.

Table 9-1. Non-Coherent Message Name Abbreviations

  Abbreviation  Full Name
  Nc            Non-coherent
  P2P           Peer-to-peer
  Wr            Write
  LT            LaGrande Technology
  Rd            Read
  Msg           Message
  Ptl           Partial
  B             Bypass
  I/O           I/O
  S             Standard
  Cfg           Configuration
  Int           Interrupt
  Cmp           Completion
  CmpD          Completion with Data
  DataNC        Non-coherent Data
  Prio          Priority
  Upd           Update
  Ack           Acknowledge

Table 9-2. Non-Coherent Requests
(columns: Request Name / Data Flit Payload? / Expected Response / Brief Description)

Non-Coherent Memory Transactions
  NcWr      Y   Cmp      Write to non-coherent memory space.
  NcWrPtl   Y   Cmp      Partial write to non-coherent memory space.
  WcWr      Y   Cmp      Write combinable write to non-coherent memory space.
  WcWrPtl   Y   Cmp      Partial write combinable write to non-coherent memory space.
  NcRd      N   DataNC   Read from non-coherent memory space.
  NcRdPtl   N   DataNC   Partial read from non-coherent memory space.
Legacy I/O Transactions
  NcIOWr    N   Cmp      Write to legacy I/O space.
  NcIORd    N   DataNC   Read from legacy I/O space.

Configuration Transactions
  NcCfgWr   N   Cmp      Configuration write to configuration space.
  NcCfgRd   N   DataNC   Configuration read from configuration space.

Peer-to-Peer Transactions
  NcP2PS    N   Cmp      Peer-to-peer transaction between I/O agents (non-coherent Standard channel).
  NcP2PB    Y   Cmp      Peer-to-peer transaction between I/O agents (non-coherent Bypass channel).

Secure Transactions
  NcLTWr    N   Cmp      Secure write request.
  NcLTRd    N   DataNC   Secure read request.

Messages
  NcMsgB    Y   CmpD     Non-coherent Message (non-coherent Bypass channel).
  NcMsgS    N            Non-coherent Message (non-coherent Standard channel).

Interrupt Transactions
  IntPhysical  Y   Cmp     Physical mode interrupt to processor.
  IntPrioUpd   N   Cmp     Interrupt priority update message to interrupt source agents.
  IntAck       N   DataNC  Interrupt Acknowledge to the legacy 8259 interrupt controller.

IA-32 Specific Interrupt Transactions
  IntLogical   Y   Cmp     Logical mode interrupt to processor.

The possible responses for non-coherent requests are:
• For transactions which request data, a DataNC or CmpD response is supplied by the target. The DataNC completion carries data as separate data flits. CmpD carries the (small) data in the header and has no accompanying data flits.
• The Cmp response is returned by a target for transactions which do not require data in the completions.

For a listing of how non-coherent transactions and responses are encoded on CSI (and a mapping of their virtual channels), refer to Section 4.6.3, “Mapping of the Protocol Layer to the Link Layer” on page 4-169.

9.2 Protocol Layer Dependencies
This section enumerates the dependency rules expected of requesters and targets of non-coherent transactions. A requester is defined as the source agent initiating the request, and a target agent is the agent who is the target of the request.

9.2.1 Requester Rules
1. The requester is not required to perform address conflict checking on non-coherent transactions. For example, a requester could potentially have multiple non-coherent requests outstanding to the same address, as long as that device’s ordering rules are maintained through other implementation-specific mechanisms.
2. The non-coherent bypass channel (NCB) must be guaranteed to make forward progress independent of all other channels flowing in the same direction. The NCB channel must make forward progress regardless of the state of other channels (e.g. the non-coherent standard and the coherent channels). For example, the non-coherent standard channel could be backed up and the bypass channel must still make forward progress.
3. The completion channel (Cmp) must be guaranteed to make forward progress independent of all channels flowing in the same direction except for the non-coherent bypass channel (NCB). The completion channel must make forward progress regardless of the state of other channels (e.g. the non-coherent standard and the coherent channels). Only the non-coherent bypass channel can place a dependency on the completion channel (e.g. PCI ordering rules), although it is not required to do so.
4. In general, the requester must be able to accept responses (and associated data) unconditionally for all of its outstanding requests. The only exception to this rule is that the requester may not have sufficient buffering (and consequently response channel credits) to accept all of its outstanding request completions (e.g. tunneled peer-to-peer read transactions), but this condition must be guaranteed to be temporary. When a CSI requester issues a request, it must accept responses for that request without any dependencies upon any other pending coherent or non-coherent requests.
5. A transaction is deallocated from the requester after receiving either a Cmp, CmpD, or DataNC. Each requester is assumed to include an implementation-dependent structure responsible for tracking all outstanding requests. Once a requester receives a completion response, that entry is deallocated from its pending-request structure.
6. In the general case, a transaction is considered globally ordered only after the requester receives a Cmp response (i.e. a requester cannot “post” transactions on CSI). In the general case, the CSI “fabric” is assumed to be unordered due to multipath topologies and independent virtual channels. Because of this property, a transaction such as a NcWr is not assumed to be globally ordered until after it receives a completion. This implies that ordering responsibilities fall upon the requester (and its memory ordering model). No ordering assumptions about the CSI interface can be made.
7. The requester must ensure fairness between virtual channels (e.g. NcStd and NcByp channels). The bypass channel can always make forward progress when the standard channel is blocked (which can occur due to PCI requests). However, the implementation must ensure that the bypass channel cannot starve the standard channel under non-blocking conditions. The same guarantee must be made for transactions between the coherent and non-coherent channels. Anti-starvation is addressed in an implementation-specific manner.
8. The requester must assign Requester Transaction ID values (RTIDs) such that they are unique for each requester/target node pair. This rule applies across both coherent and non-coherent requests, and is detailed further in Chapter 8, “CSI Cache Coherence Protocol”.
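Rule 8 can be illustrated with a simple per-target bitmap allocator. The sketch below is a minimal model under stated assumptions: MAX_TARGETS, RTIDS_PER_PAIR, and the function names are illustrative, not values or interfaces defined by this specification.

```c
#include <stdbool.h>
#include <stdint.h>

#define MAX_TARGETS    64
#define RTIDS_PER_PAIR 64   /* assumed RTID space per requester/target pair */

/* One free-bitmap per target node: bit i set => RTID i is in use.
 * Uniqueness is only required per requester/target pair (rule 8), so a
 * requester can manage its RTIDs independently for every target. */
static uint64_t rtid_in_use[MAX_TARGETS];

bool rtid_alloc(uint16_t target, uint8_t *rtid_out)
{
    uint64_t used = rtid_in_use[target];
    for (uint8_t i = 0; i < RTIDS_PER_PAIR; i++) {
        if (!(used & (1ULL << i))) {
            rtid_in_use[target] |= 1ULL << i;
            *rtid_out = i;
            return true;
        }
    }
    return false;  /* out of RTIDs: the new request must wait */
}

/* Free on receipt of the Cmp, CmpD, or DataNC that deallocates the
 * transaction (rule 5). */
void rtid_free(uint16_t target, uint8_t rtid)
{
    rtid_in_use[target] &= ~(1ULL << rtid);
}
```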
9.2.2 Target Rules
1. The target does not perform address conflict checking on non-coherent transactions. For example, a target is not required to check incoming requests against outgoing requests to the same address.
2. The non-coherent bypass channel (NCB) must be guaranteed to make forward progress independent of all other channels flowing in the same direction. As for requesters, the NCB channel must make forward progress regardless of the state of other channels (e.g. the non-coherent standard and the coherent channels). For example, the non-coherent standard channel could be backed up and the bypass channel must still make forward progress.
3. The completion channel (Cmp) must be guaranteed to make forward progress independent of all channels flowing in the same direction except for the non-coherent bypass channel (NCB). As for requesters, the completion channel must make forward progress regardless of the state of other channels (e.g. the non-coherent standard and the coherent channels). Only the non-coherent bypass channel can place a dependency on the completion channel (e.g. PCI ordering rules), although it is not required to do so.
4. If a non-coherent target agent receives a non-coherent request, that agent is required to respond without any dependencies on other coherent or non-coherent requests or responses. Non-coherent responses must be unconditionally returned without dependencies on coherent transactions.
5. A target cannot respond to a request until the request is guaranteed to be globally ordered (defined by the device’s memory ordering model). With CSI, the source is responsible for ensuring correct memory ordering, and a transaction is considered globally ordered when it receives a completion for its request. Therefore, a CSI target cannot complete a request until after the ordering rules of that platform can be guaranteed. For example, a non-coherent read completion from PCI cannot be returned by a non-coherent target agent until after prior coherent write requests initiated by the same agent are completed.
6. The target must ensure fairness between virtual channels (e.g. NcStd and NcByp channels). The non-coherent Bypass virtual channel was developed in order to guarantee that the bypass channel can always make forward progress when the standard channel is blocked. However, the implementation must ensure that the bypass channel cannot starve the standard channel under non-blocking conditions. The same guarantee must be made for transactions between the coherent and non-coherent channels. Anti-starvation is addressed in an implementation-specific manner.

9.3 Non-Coherent Memory Transactions
This section explains non-coherent memory read and write transactions. These transactions can target memory-mapped I/O or main memory. They do not snoop any caching agents. If the data is potentially stored in a CSI agent’s cache, the coherent protocol must be followed (explained in Chapter 8, “CSI Cache Coherence Protocol”).

Note: The following figures illustrate a chipset device as the I/O agent which bridges between CSI and an I/O interface. The I/O interface is depicted as PCI. These terms should be taken only as examples. In other words, PCI can be replaced with any load/store I/O interface or a non-coherent region of main memory. Similarly, the I/O Hub could be replaced with a more traditional memory controller hub. These choices are not relevant to the CSI traffic flows. For a legend of the following diagrams, refer to Chapter 8, “CSI Cache Coherence Protocol.”

9.3.1 Non-coherent Write Transaction Flow
Figure 9-1 illustrates the data flow for a non-coherent write transaction initiated by a processor to memory-mapped I/O space.

Figure 9-1. Non-Coherent Write Transaction Flow (flow diagram)

A non-coherent write transaction is initiated with a NcWr or NcWrPtl request. This request is routed through the CSI fabric to the target non-coherent agent which interfaces I/O devices. The non-coherent target agent responds to the NcWr with a Cmp response once the write request is guaranteed to preserve the memory ordering model of the platform (e.g. the Producer-Consumer model described in the PCI specification). When the Cmp returns to the requester, it deallocates the request and the requester is permitted to issue the next order-dependent request. The non-coherent target agent forwards the write to the I/O domain using the appropriate protocol of that interface.
If the non-coherent write was targeting memory attached (or integrated) to a CSI agent, the target issues the Cmp after the data is written to a point of global observation. On CSI, the flow above is identical for either case.

9.3.1.1 Write Combinable Writes
CSI differentiates non-coherent writes from write-combinable non-coherent writes. The latter is initiated through a WcWr or WcWrPtl request, permitting the CSI target (typically an I/O agent) to optionally combine these writes into a longer write which is further optimized for the I/O interface (e.g. PCI Express).

Implementation Note: Write Combining Space Assignment
In general, a write combinable write is issued if a write targets an address marked as write combinable. Current implementations rely on page table programming to indicate these spaces. Chipset components are currently unaware of where these spaces are, and therefore cannot initiate WcWr and WcWrPtl without a new software environment.

Implementation Note: Chipset Write Combining Buffers
There are a variety of chipset write combining implementations possible. In general, it’s desirable to implement at least one buffer per PCI Express port. The size of this buffer would ideally be sized to support the maximum payload supported on that interface. This implementation allows independent processors to stream data to different ports without interfering with each other.

In general, all rules and flows for NcWr and NcWrPtl apply to WcWr and WcWrPtl. In addition, the following rules are required (see the sketch after this list):
1. The target of a WcWr or WcWrPtl is not permitted to combine across a 4KB boundary. CSI write combining requires that an I/O device’s address space not cross a 4KB boundary. This CSI restriction removes the risk of improperly combining independent writes to independent I/O devices or functions.
2. Like NcWr and NcWrPtl, WcWr and WcWrPtl cannot be completed on CSI until after the write is globally observable (e.g. until it reaches the PCI ordering domain). Since software fences are not visible on the CSI fabric, processor write combining buffer flushing events are expected to force the buffer contents to be globally observable. Therefore, the non-coherent target agent cannot complete the write on CSI until it can guarantee global observation (e.g. the write is posted in a chipset posted write queue).

Implementation Note: Chipset Write Eviction
Explicit software fences are not visible on the CSI fabric, and therefore the chipset cannot differentiate between a WcWr triggered through software flushing and one triggered due to processor resource limitations. Therefore, the chipset must assume that all WcWr and WcWrPtl issued by the processor are due to explicit flushing. Given this assumption, the chipset must not indiscriminately hold up completing WcWr or WcWrPtl requests while waiting for subsequent WcWr or WcWrPtl requests to combine; otherwise, the processor could starve due to a limited number of outstanding non-coherent write requests. One reliable policy would be one where the chipset combines only if the destination I/O port is busy. This self-regulating policy would ensure low latency to complete the writes while still combining when higher efficiency is needed most.

3. Once the WcWr or WcWrPtl target completes a write request, the target must not combine any subsequent WcWr or WcWrPtl requests with prior write requests if they are to the same write combining buffer. The chipset must assume that all WcWr and WcWrPtl are due to explicit software flushing. Therefore, the target may not combine a WcWr or WcWrPtl with previous write combinable writes which have already been completed by the target. Otherwise, it would be possible to combine writes which have an intervening fence in between, thereby violating the software’s intention to separate the writes to the I/O device.
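A minimal sketch of the combining check implied by rules 1 and 3, assuming a hypothetical wc_buffer_t bookkeeping structure (the specification does not define one):

```c
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical chipset write-combining buffer state; names are illustrative. */
typedef struct {
    bool     open;        /* buffer currently collecting writes              */
    bool     completed;   /* Cmp already returned for the buffered writes    */
    uint64_t base;        /* address of the first write placed in the buffer */
} wc_buffer_t;

/* Rule 1: never combine across a 4KB boundary.
 * Rule 3: never combine into a buffer whose writes were already completed. */
bool wc_may_combine(const wc_buffer_t *buf, uint64_t addr)
{
    if (!buf->open || buf->completed)
        return false;
    /* same 4KB page <=> identical address bits above bit 11 */
    return (buf->base >> 12) == (addr >> 12);
}
```

Comparing the addresses shifted right by 12 bits is one way to express the 4KB rule: two addresses fall in the same 4KB page exactly when all address bits above bit 11 match.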
Implementation Note: Chipset Write Combining and Multiple Threads
Write combining in the chipset has a risk with current software. It is theoretically possible that two independent threads (or cores) could be writing to the same device, and these writes would be combined in the chipset. If the software intent is that they NOT be combined, there could be a problem if the I/O device doesn’t have enough buffering to absorb the combined write. This buffering limitation is the reason PCI Express precludes write combining in the general case. This risk could be mitigated somewhat by storing the requester NodeID with the chipset write combining buffer and matching it against subsequent writes which hit the buffer. However, since the requester NodeID doesn’t track initiators down to core or thread granularity, this is not a perfect solution. That said, with indications that the write is to write-combinable space, chipset combining differs very little from current processor combining, and the risks involved with chipset combining are felt to be very low as long as the rules described in this specification are followed. Since there is a risk, however, a chipset which employs combining should implement a software mechanism to enable or disable combining (e.g. through BIOS). When disabled, the target treats WcWr and WcWrPtl identically to NcWr and NcWrPtl.

9.3.1.1.1 Non-coherent Write Combinable Write Transaction Flow
Figure 9-2 illustrates the data flow for a non-coherent write combinable write transaction initiated by a processor to memory-mapped I/O space.

Figure 9-2. Non-Coherent Write Combinable Write Transaction Flow (flow diagram)

A non-coherent write combinable write transaction is initiated with one or more WcWr or WcWrPtl requests. In the above example, the writes all fall within an available write combining buffer in the target node. Because these writes typically have relaxed ordering rules, it’s likely that more than one is pipelined without waiting for completions. The non-coherent target agent implementing write combining will buffer the writes and respond to all the buffered WcWr requests with Cmp responses once the write requests are guaranteed to preserve the memory ordering model. When each Cmp returns to the requester, the requester deallocates each request. The eviction policy of the target node is implementation specific. When all the buffered writes are completed, the requester is permitted to issue the next order-dependent request. The non-coherent target agent forwards the combined write to the I/O domain using the appropriate protocol of that interface.

9.3.2 Non-Coherent Read Transaction Flow
Figure 9-3 illustrates the data flow for a non-coherent read transaction initiated by a processor to memory-mapped I/O space.
Figure 9-3. Non-Coherent Read Transaction Flow (flow diagram)

A non-coherent read transaction is initiated with a NcRd request. This request is routed through the CSI fabric to the non-coherent target agent which interfaces I/O devices. Since the read request returns data from the I/O device, the non-coherent target agent forwards the read to the I/O domain using the appropriate protocol of that interface. The I/O device eventually returns data to the non-coherent target agent, which forwards this data to the requester using a DataNC response; the requester then deallocates the NcRd. If the non-coherent read was targeting memory attached (or integrated) to a CSI agent, the target returns the DataNC response after fetching the data from the appropriate memory location (or internal buffer, depending on the microarchitecture).

9.3.3 “Don’t Snoop” Transaction Flow
The “Don’t Snoop” attribute is a hint on PCI Express (and other I/O interfaces) which allows the platform to assume that the request is targeting a location assumed to be in main memory and not cached; therefore, snooping is not required. However, caution should be exercised with this hint. For correctness, software must ensure that the data is not cached in any agent. While this might be possible to ensure for processors (in a standard manner), it might not be a practical expectation for caching chipset components. For example, a caching agent within an I/O agent could have a line in a modified state. Today’s standard software is unaware of caching I/O agents and therefore would be unaware of the requirement to flush I/O agent caches. Here is a list of some implementations which could safely implement non-coherent writes to coherent memory space:
• Proprietary or integrated solutions (e.g. integrated graphics). Such implementations may safely exploit this snoop-reduction optimization by having a thorough understanding of what data gets cached and where.
• Non-caching I/O agents. If there is not a caching I/O hub in the platform, then non-coherent writes might more practically be used with the assumption that the processor caches are flushed appropriately by software.

9.3.3.1 “Don’t Snoop” Writes
A write received by an I/O agent where the “Don’t Snoop” attribute is set by the I/O device is forwarded to the appropriate home agent as a NcWr request. Once global ordering can be ensured by the home agent, a Cmp response is returned to the requester and the request entry is deallocated. Eventually, the home agent writes the data to the memory subsystem (beyond the scope of this specification).

9.3.3.2 “Don’t Snoop” Reads
A read received by the I/O agent where the “Don’t Snoop” attribute is set by the I/O device is forwarded to the appropriate home agent as a NcRd or NcRdPtl request. Data must be retrieved from memory (or some memory buffer) and a DataNC response is returned to the requester. When the DataNC returns to the CSI requester, the requester deallocates the request.

9.3.3.3 “Don’t Snoop” Peer-to-Peer
It is also possible for the “Don’t Snoop” attribute to be set for peer-to-peer transactions (refer to Section 9.4, “Peer-to-Peer Transactions” on page 9-309). However, since peer-to-peer transactions never snoop CSI caching agents, this attribute is ignored (and/or forwarded) when the address indicates a peer I/O interface.
9.3.4 Length and Alignment Rules
NcWr, WcWr, and NcRd represent non-coherent memory writes and reads of one cache line. These requests must have cache-line-aligned addresses only (e.g. Addr[5:3] = 000 for 64-byte cache line platforms). Partial writes are issued with NcWrPtl or WcWrPtl requests. These partial write requests supply eight data flits (for a maximum 64-byte cacheline write) and a byte enable for each byte in the cache line. The address is always cache-line aligned and any byte in the line can be enabled. It is the responsibility of bridging components (such as an I/O agent) to fragment a non-contiguous NcWrPtl or WcWrPtl transaction into multiple transactions which are supported by the supported interface protocol.

Partial read requests are initiated with an NcRdPtl request. This request specifies a byte address offset and a length, resulting in a read with any byte alignment and any length from 0 to 63 bytes. An NcRdPtl which crosses a cache line boundary is considered illegal. Data returned from an NcRdPtl has the data “naturally aligned” in the eight-flit payload of the DataNC completion; all unused data fields are ignored. For examples, refer to Table 9-3.

Table 9-3. Example Read Completion Formatting

  Read Length (bytes)   Address[5:0]   Data Flit   Byte Number a,b
  1                     000000         0           0
  2                     000000         0           1:0
  4                     000010         0           5:2
  4                     000100         1           11:8
  8                     111100         -           Error Condition

  a. Unused bytes of the data payload are reserved and ignored.
  b. Data is in little endian format.

Refer to Section 4.6.5, “Organization of Packets on the Physical Layer” on page 4-173 for details.

Implementation Note: Length and Alignment Requirements for Target Non-coherent Agents
Some targets could choose to restrict the size and alignments they support. This is an implementation simplification, but it is the responsibility of that component’s design team to guarantee that no other CSI agent is capable of sending them an unsupported length/alignment.

9.3.4.1 Zero Length Transactions
Zero-length NcRdPtl requests are implemented with a 0 byte length. If the target is memory-mapped I/O, the non-coherent target agent propagates the transaction to the I/O device. When the I/O device completes the transaction, the target non-coherent agent forwards the completion (the data is undefined and ignored) back to the CSI initiator. If the target is main memory, the home agent replies with undefined data. Zero-length NcWr requests are implemented as an NcWrPtl with no byte enables asserted. Aside from following the write ordering rules, the initiator cannot rely on any side effect occurring with this transaction. The write is completed as any other NcWrPtl.
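The NcRdPtl legality and natural-alignment rules above can be expressed compactly in C. The helper names are hypothetical; the payload math restates the "naturally aligned" placement described for Table 9-3, assuming the byte-number column gives payload byte indices equal to the line offset.

```c
#include <stdbool.h>
#include <stdint.h>

#define LINE_BYTES      64
#define FLIT_DATA_BYTES  8   /* eight data flits carry one 64-byte payload */

/* An NcRdPtl may use any offset/length, including length 0, as long as
 * the read does not cross a cache line boundary (Section 9.3.4). */
bool ncrdptl_is_legal(uint8_t offset /* Address[5:0] */, uint8_t length)
{
    return offset < LINE_BYTES && offset + length <= LINE_BYTES;
}

/* Natural alignment in the DataNC payload: the byte at line offset k
 * travels as byte (k % 8) of data flit (k / 8); unused payload bytes
 * are reserved and ignored. */
void ncrdptl_payload_position(uint8_t line_offset,
                              uint8_t *flit, uint8_t *byte_in_flit)
{
    *flit         = line_offset / FLIT_DATA_BYTES;
    *byte_in_flit = line_offset % FLIT_DATA_BYTES;
}
```

For instance, ncrdptl_is_legal(60, 8) returns false, matching the Error Condition row of Table 9-3 (a length-8 read at offset 60 would cross the line boundary).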
9.4 Peer-to-Peer Transactions
Peer-to-peer transactions are defined as those which originate and terminate on an I/O interface such as PCI (any generation). These transactions are only relevant for topologies where a transaction is required to traverse CSI in order to reach its destination. An obvious example topology is one comprising more than one I/O agent.

Table 9-4. Peer-to-Peer Transactions
(columns: Transaction Name / Virtual Channel / Data Flits? / Description)

  NcP2PB   NcBypass     Yes   Used to carry packets with data from one I/O agent to another. Typically used for peer writes and peer read completions. Since it uses the NCB channel, these transactions must be non-blocking and guaranteed to make forward progress.
  NcP2PS   NcStandard   No    Used to carry packets without data from one I/O agent to another. Typically used for peer read requests.

Peer-to-peer transactions allow CSI to preserve all protocol information (e.g. PCI Express) without requiring route-through agents to be aware of that protocol. The format of the CSI peer-to-peer packet (specified in Section 4.6.1.17, “Peer-to-Peer Tunnel Header” on page 4-158) is generic in that many of the fields are labelled as tunneling fields. The protocol being tunneled is specified using the Tunnel Type field. This allows the source peer-to-peer agent to specify the fields in a proprietary manner while insulating intervening CSI components (e.g. routers) and this specification from changes in the protocols being tunneled. Peer-to-peer transactions are non-coherent transactions and never snoop CSI caching agents. Peer-to-peer requests all require Cmp completion packets to deallocate the transaction from the initiator.

This specification defines a generic mechanism for tunnelling non-CSI protocol information over the CSI fabric. Details about how these peer-to-peer transactions are used to create upper-level protocols between I/O agents are left to component specifications. Note: This peer-to-peer mechanism is prescribed for situations where preserving a foreign protocol’s information is required, or where packets larger than a cacheline are desired on that foreign interface (e.g. PCI Express). If this is not a requirement, it is possible to implement peer-to-peer using other non-coherent requests (e.g. NcRd, NcWr, etc.).

9.5 Legacy I/O Transactions
Legacy I/O transactions are those traditionally initiated through processor IN or OUT instructions. The Itanium architecture provides a mechanism to address this space through a memory-mapped region, but this is outside the scope of CSI. In the CSI domain, legacy I/O space is an address space which is separate from the memory and configuration spaces.

9.5.1 Legacy I/O Write Transaction Flow
Figure 9-4 illustrates the data flow for an I/O write transaction initiated by a processor to I/O space.

Figure 9-4. Legacy I/O Write Transaction Flow (flow diagram)

An I/O write transaction is initiated with a NcIOWr request. This request is routed through the CSI fabric to the target I/O agent which interfaces I/O devices. The I/O agent forwards the request to the appropriate I/O interface and does not respond with a Cmp response until after the I/O device completes it on its interface. When the Cmp returns to the requester, the requester deallocates the request.

9.5.2 Legacy I/O Read Transaction Flow
Figure 9-5 illustrates the data flow for an I/O read transaction initiated by a processor to I/O space.

Figure 9-5. Legacy I/O Read Transaction Flow (flow diagram)

An I/O read transaction is initiated with a NcIORd request. This request is routed through the CSI fabric to the target I/O agent which interfaces I/O devices. The I/O agent forwards the request to the appropriate I/O interface and does not respond with a DataNC response until after the I/O device completes it on its interface. When the DataNC returns to the requester, it deallocates the request.

9.5.3 Addressing, Length and Alignment Rules
I/O reads and writes are always issued as 1-4 byte transactions. Only contiguous byte enables are allowed to be asserted. Legacy I/O writes which cross a 4-byte boundary are required to be split by the CSI initiator into two NcIOWr requests, and legacy I/O reads which cross an 8-byte boundary are required to be split by the CSI initiator into two NcIORd requests.
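As an illustration of the 4-byte split rule for legacy I/O writes, consider this C sketch; issue_nciowr is a hypothetical stand-in for handing one request to the CSI fabric, and the analogous rule for reads would use an 8-byte boundary.

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical hook: hand one NcIOWr to the fabric. */
static void issue_nciowr(uint32_t port, uint8_t len)
{
    printf("NcIOWr port=0x%04x len=%u\n", port, len);
}

/* Split a 1-4 byte legacy I/O write so that no NcIOWr crosses a
 * 4-byte boundary (Section 9.5.3). */
void legacy_io_write(uint32_t port, uint8_t len /* 1..4 */)
{
    uint32_t next_boundary = (port & ~3u) + 4;   /* next 4-byte boundary */
    uint8_t  first = (uint8_t)(next_boundary - port);

    if (len <= first) {
        issue_nciowr(port, len);                  /* fits in one request  */
    } else {
        issue_nciowr(port, first);                /* up to the boundary   */
        issue_nciowr(next_boundary, len - first); /* remainder            */
    }
}
```

For example, legacy_io_write(0x3FE, 4) issues one NcIOWr of 2 bytes at 0x3FE and a second of 2 bytes at 0x400.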
Due to legacy reasons, legacy I/O space is 64K + 3 bytes. The “extra” three bytes are shadows of the first three bytes starting at I/O address 0, and are accessible by issuing an I/O transaction which straddles the 64K limit (e.g. a 4-byte access starting at address FFFFh). Note: It is possible that the CSI NcIORd/Wr request has address bits above bit 15 set. When bridging to PCI Express, it is the responsibility of the target I/O agent to ignore (or translate) address bits above bit 15 when it receives and forwards the I/O request to the I/O device.

9.6 Configuration Transactions
Configuration transactions to PCI configuration space are initiated using one of two mechanisms:
• The legacy CF8/CFC mechanism
• A memory-mapped mechanism

Both mechanisms fall outside the scope of this specification. The CF8/CFC mechanism is documented in the various PCI specifications starting with revision 2.1. This legacy approach is undesirable for multi-processor systems due to its non-atomic property. For details on how CF8/CFC accesses translate into CSI configuration cycles, refer to Section 7.2.1.4, “I/O Configuration Accesses using 0x0CF8/0x0CFC I/O Port” on page 7-238.

A newer memory-mapped mechanism was formally standardized with the PCI Express 1.0 specification. This mechanism allows firmware to establish a region of memory space such that, when it is written to or read from using processor load and store commands (4 bytes or less), a configuration transaction is issued to the partition. In addition to the PCI Express standard, non-PCI configuration space (e.g. processor configuration registers) is accessible through memory-mapped configuration space. This mapping, and the translation from a memory transaction to a configuration transaction, is the responsibility of the initiator. Through either mechanism, the processor issues a NcCfgWr or NcCfgRd request on the CSI interface. Any NcIOWr or NcIORd transaction with a CF8 or CFC address targets legacy I/O space, not configuration space.

Note: The different PCI generations differentiate between Type 0 and Type 1 configuration transactions. CSI relies on platform-aware firmware code to access and configure the CSI agents, so there is no CSI requirement for differentiation (or translation) between Type 0 and Type 1 configuration transactions. Translation from a Type 1 to a Type 0 configuration transaction is the responsibility of the I/O agent.

Implementation Note: Configuration Register Mapping
This section describes the mechanism used to access configuration registers which are mapped into a space visible by the operating system (PCI configuration space). This space is separate and distinct from legacy I/O and memory spaces. While PCI Express added the capability to memory-map the legacy configuration space, it should be noted that these registers are still accessed through specific configuration transactions. In addition to the NcCfgRd and NcCfgWr requests, a CSI component might implement memory-mapped configuration registers which fall outside of the standard PCI Express configuration space. These registers may be mapped anywhere in the platform’s memory space (product specific) and are accessed through NcRdPtl and NcWrPtl requests. Length and alignment rules for memory-mapped CSI configuration registers are product specific
(e.g. a particular CSI component might not support byte granularity). It should be noted that these registers are written with NcWrPtl requests and therefore must make forward progress without any dependencies on other transactions mapped to the NcStd channel. In addition, if writing a register has a side effect (e.g. assertion of a side-band signal), the Cmp must be issued only after the side effect has occurred.

Note: The figures below illustrate examples where the configuration transaction targets an I/O device beyond the CSI domain. There are also cases where the configuration request targets a configuration agent within a CSI device.

9.6.1 Configuration Write Transaction Flow
Figure 9-6 illustrates the data flow for a configuration write transaction initiated by a processor to an I/O device.

Figure 9-6. Configuration Write Transaction Flow (flow diagram)

A configuration write transaction is initiated with a NcCfgWr request. This request is routed through the CSI fabric to the target I/O agent which interfaces I/O devices. The I/O agent forwards the request to the appropriate I/O interface and does not respond with a Cmp response until after the I/O device completes it on its interface. If the NcCfgWr was targeting a CSI configuration agent, the agent completes the transaction only after the configuration register has been updated with the new data. When the Cmp returns to the requester, it deallocates the request.

9.6.2 Configuration Read Transaction Flow
Figure 9-7 illustrates the data flow for a configuration read transaction initiated by a processor to an I/O device.

Figure 9-7. Configuration Read Transaction Flow (flow diagram)

A configuration read transaction is initiated with a NcCfgRd request. This request is routed through the CSI fabric to the target I/O agent which interfaces I/O devices. The I/O agent forwards the request to the appropriate I/O interface and does not respond with a DataNC response until after the I/O device has returned the data to the I/O agent. If the NcCfgRd was targeting a CSI configuration agent, the agent returns the data from the configuration register specified in the NcCfgRd address. When the DataNC response returns to the requester, the requester deallocates the request.

9.6.3 Addressing, Length and Alignment Rules
Configuration reads and writes are always issued as 1-4 byte transactions within a 4-byte aligned window. Only contiguous byte enables are allowed to be asserted. Configuration requests can begin at any byte address.

9.7 Secure Non-Coherent Transactions
CSI defines two requests used for controlling access permissions in a secure manner: NcLTRd and NcLTWr. These transactions are typically issued by a processor running in a secure environment, and always reflect partial (less than one cache line) reads and writes. The rules and semantics of these transactions follow those described in Section 9.3, “Non-Coherent Memory Transactions” on page 9-303.

9.8 Broadcast Non-Coherent Transactions
Some non-coherent requests are required to be broadcast to multiple target agents. In some cases the targets are processor agents, and in some cases the targets are I/O agents. Refer to Table 9-5 for a list of the transactions requiring broadcast semantics. The Target Agent Lists are defined in Table 9-6.
Table 9-5. Broadcast Non-Coherent Transactions
(columns: Request / Request Subtype / Required Target Agent List)

  NcMsgBVLW          INTR SMI INIT NMI IGNNE A20M   Refer to Table 9-7.
  NcMsgSShutdown a
  NcMsgSInvd_Ack
  NcMsgSWBInvd_Ack
  NcMsgBEOI
  NcMsgBFERR a
  IntLogical, IntPhysical                           Interrupt Targets
  NcMsgSStopReq1                                    Refer to Table 9-7.
  NcMsgSStopReq2
  NcMsgSStartReq1
  NcMsgBStartReq2
  NcMsgSPMReq
  IntAck a                                          I/O
  IntPrioUpd                                        Interrupt Sources

  a. While IntAck, NcMsgBFERR, and NcMsgSShutdown can be broadcast to all I/O agents, a more optimal implementation would be to issue the transaction only to the I/O agent which proxies for the active legacy bridge component.

Interrupts are typically directed to a specific processor; however, some situations require delivery to all Local APICs. See Chapter 10, “Interrupt and Related Operations” for details.

9.8.1 Broadcast Dependency Lists
Table 9-5 lists all the broadcast non-coherent transactions and specifies a Target Agent List for each transaction. These lists indicate which destination NodeIDs the transaction must be sent to. The actual implementation of these lists is beyond the scope of this specification; a description of the expected content is provided in Table 9-6.

Table 9-6. Target Agent Lists for Broadcast Transactions

  Processors: A processor agent list is a list of NodeIDs for each processor agent in the local domain. In some cases a processor might consist of multiple NodeIDs, so the implementation specification must be consulted to determine the expectation of that processor and which agent (or agents) the transaction must be sent to. Note that this list could be identical to the snoop list required for coherent transactions.
  I/O: This is a pointer to the I/O agent. For platforms with multiple I/O agents, this lists the NodeIDs for each I/O agent in the local domain. For many of the broadcast transactions, the cycle is only relevant to the legacy bridge. Since there is typically only one legacy bridge in the partition, the initiator of the transaction could choose an optimization and send those transactions directed only to the NodeID which proxies for the legacy bridge.
  Power Management: The Power Management chapter describes the protocol for coordinating platform power states. Section 15.2.3, “S-State Coordination” on page 15-451 describes an implementation-specific dependency list pointing to the agents which have power management dependencies on each other.
  All: This list is the superset of all NodeIDs in the partition which can be the target of non-coherent broadcast transactions (processor + I/O).
  Quiesce: This list covers either all NodeIDs in the system (across partitions) or only the NodeIDs of the agents belonging to that partition; the decision to quiesce the entire system domain or only agents within the partition is an implementation choice. This list enumerates which NodeIDs are the target of synchronizing transactions (StopReq, StartReq).
  Interrupt Sources: A list of NodeIDs for agents which are potentially the sources of any interrupt. This typically includes both processors and I/O agents.
  Interrupt Targets: A list of NodeIDs for agents which are potentially the target of any interrupt. This typically includes processors.
Note: The Target Agent Lists in Table 9-6 specify the minimum set of agents which are required to see the broadcast transactions listed in Table 9-5. While it would waste CSI bandwidth, the broadcast transactions listed in Table 9-5 could safely be broadcast to all CSI agents.

9.8.2 Broadcast Mechanism
Broadcasting on CSI is actually implemented as a multi-unicast. That is, a broadcast request is issued as multiple point-to-point requests (sub-requests). On the CSI fabric, each sub-request is independent, with one exception: the Transaction ID assignment for each sub-request may be the same or different depending on the implementation. All receivers of a broadcast request listed in Table 9-5 must respond with a completion (Cmp or CmpD) without reporting an error condition, even if they are never a proper target for that transaction. For example, if a target interrupt agent receives an IntPhysical, it must reply with a Cmp.

Figure 9-8. Non-coherent Broadcast Example (IntPhysical) (flow diagram)

9.8.3 Broadcast Ordering
Only after all completions return for each sub-request is the broadcast transaction considered complete. Any order-dependent operation after the broadcast transaction must wait until all sub-completions return to the initiator. If required, each sub-request can be serialized, but this is not required; as of the writing of this specification, there are no known usage models which require serialization of sub-requests. Some broadcast transactions have synchronization requirements: VLW messages (typically targeting the legacy bridge) require that completions do not pass the VLW request directed toward the processors. This is described in Section 9.10.4.1, “VLW Ordering Rules” on page 9-324.

9.8.4 Scaling to Large Systems
Broadcast transactions are inherently difficult to scale up to large systems with many agents. CSI expects components to implement their Target Agent Lists large enough to accommodate their market requirements. Scaling beyond those capabilities requires a proxy agent to handle the broadcasting beyond the local cluster of agents: a component broadcasts within its local cluster, and the proxy agent is responsible for broadcasting to remote clusters (if required to do so).
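The multi-unicast mechanism and the ordering rule of Section 9.8.3 amount to a completion counter, sketched below in C. The API is hypothetical; a real implementation would track this per broadcast transaction in the initiator.

```c
#include <stdint.h>

/* Hypothetical fabric hook: issue one point-to-point sub-request. */
void send_subrequest(uint16_t nodeid);

typedef struct {
    uint32_t outstanding;   /* sub-requests still missing a Cmp/CmpD */
} bcast_txn_t;

/* Broadcast = multi-unicast: one sub-request per NodeID in the
 * Target Agent List, then count completions. */
void broadcast_issue(bcast_txn_t *txn, const uint16_t *targets, uint32_t n)
{
    txn->outstanding = n;
    for (uint32_t i = 0; i < n; i++)
        send_subrequest(targets[i]);
}

/* Every receiver must complete (Cmp/CmpD), even if it is not a proper
 * target for the transaction.  The broadcast is considered complete,
 * and order-dependent operations may proceed, only when this returns 1. */
int broadcast_on_completion(bcast_txn_t *txn)
{
    return --txn->outstanding == 0;
}
```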
9.9 Interrupts and Related Transactions

Interrupts and supporting transactions are considered non-coherent transactions. These requests include IntPhysical, IntLogical, and IntPrioUpd, which are listed in Table 9-2. For details and rules governing these transactions, refer to Chapter 10, “Interrupt and Related Operations”. In addition, CSI provides additional legacy interrupt transactions (e.g. INTR) which are implemented with the Virtual Legacy Wire mechanism supported in CSI (NcMsgBVLW); for these details, refer to Section 9.10.4, “Virtual Legacy Wire (VLW) Transactions” on page 9-323. PCI Express also has legacy interrupt support, required in platforms which support multiple I/O agents. For example, the legacy INTA:D signals (implemented as PCI Express messages) must be sent to the target interrupt agent interfacing the legacy bridge; this messaging is addressed through the peer-to-peer mechanism described in Section 9.4, “Peer-to-Peer Transactions” on page 9-309.

9.10 Non-coherent Messages

Non-coherent messages comprise transactions which are used to indicate events or states and are not necessarily tied to load and store operations. There are two messages which encapsulate the different CSI non-coherent messages: NcMsgB and NcMsgS. NcMsgB utilizes the Non-coherent Bypass channel (NCB) while NcMsgS utilizes the Non-coherent Standard channel (NCS).

Table 9-7. Non-coherent Message Encodings (all use Message Header Type)

  NcMsgB messages (see note d):
    Message Name   Source Agents      Target Agents      Msg Type Encoding (a)
    StartReq2      Quiesce Master     All Requesters     0b000000   (no request or response params; response: CmpD)
    Reserved                                             0b000001 - 0b011111   (Reserved - Ignored)
    EOI            Interrupt Targets  All I/O            0b100000   (request params: Yes (b); response params: No)
    VLW            Legacy I/O         Processors         0b100001
    GPE            Any                Legacy I/O         0b100010
    CPEI           Any                Legacy I/O         0b100011
    Reserved                                             0b100100 - 0b111111   (Reserved - Ignored)

  NcMsgS messages:
    Message Name   Source Agents      Target Agents      Msg Type Encoding (a)
    Shutdown       Processor          Legacy I/O         0b000000   (no request or response params; response: CmpD)
    Invd_Ack       Processor          Any                0b000001
    WbInvd_Ack     Processor          Any                0b000010
    Unlock         Processor          Quiesce Master     0b000011
    ProcLock       Processor          Quiesce Master     0b000100
    ProcSplitLock  Processor          Quiesce Master     0b000101
    LTHold         Processor          Quiesce Master     0b000110
    FERR           Legacy I/O         Processor          0b000111
    Quiesce        Any                Quiesce Master     0b001000
    StartReq1      Quiesce Master     All Requesters     0b001001
    Reserved                                             0b001010 - 0b011111   (Reserved - Ignored)
    IntPrioUpd     Interrupt Targets  Interrupt Sources  0b100000   (request params: Yes (b); response params: No)
    StopReq1       Quiesce Master     All Requesters     0b100001
    StopReq2       Quiesce Master     All Requesters     0b100010
    PMReq          Any                All                0b100011   (params: Yes; see notes b and c)
    Reserved                                             0b100100 - 0b111111   (Reserved - Ignored)

  a. Refer to Table 4-17, “Non-Coherent Message, NCM SMP” on page 4-148 for the location of the MsgType field in the Message Header.
  b. If yes, refer to Table 9-8, “NcMsg Parameter Encoding” on page 9-319 for details on parameter encoding; the byte enables reflect the enabled bytes. If no, the parameter fields are reserved and the Byte Enable field is set to all zeroes.
  c. Refer to Table 9-9, “CmpD Parameter Encoding (uses SCC Header)” on page 9-320 for details on parameter encoding.
  d. Since these messages traverse the NCB channel, there are data flits which follow the Message Header. These data flits are unused and reserved.

Table 9-7 indicates which messages include parameters in the request. Parameters can be carried in different portions of the packet, including the ParameterA field and the data fields in the Message Header (refer to Chapter 4, “CSI Link Layer” for details). The encodings for these parameters are listed in Table 9-8. Table 9-7 also indicates which messages include parameters in the response. CmpD is the standard response even when there are no parameters required (all parameter fields are treated as Reserved and ignored by the requester).
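The Table 9-7 encodings can be captured as constants, as in the sketch below. The enum and function names are hypothetical, and the placement of the MsgType field in the header (Table 4-17) is not modeled.

```c
#include <stdint.h>

enum ncmsgs_type {                 /* NcMsgS (NCS channel), per Table 9-7 */
    NCMSGS_SHUTDOWN      = 0x00,   /* 0b000000 */
    NCMSGS_INVD_ACK      = 0x01,
    NCMSGS_WBINVD_ACK    = 0x02,
    NCMSGS_UNLOCK        = 0x03,
    NCMSGS_PROCLOCK      = 0x04,
    NCMSGS_PROCSPLITLOCK = 0x05,
    NCMSGS_LTHOLD        = 0x06,
    NCMSGS_FERR          = 0x07,
    NCMSGS_QUIESCE       = 0x08,
    NCMSGS_STARTREQ1     = 0x09,
    NCMSGS_INTPRIOUPD    = 0x20,   /* 0b100000 */
    NCMSGS_STOPREQ1      = 0x21,
    NCMSGS_STOPREQ2      = 0x22,
    NCMSGS_PMREQ         = 0x23,
};

enum ncmsgb_type {                 /* NcMsgB (NCB channel), per Table 9-7 */
    NCMSGB_STARTREQ2 = 0x00,
    NCMSGB_EOI       = 0x20,
    NCMSGB_VLW       = 0x21,
    NCMSGB_GPE       = 0x22,
    NCMSGB_CPEI      = 0x23,
};

/* Per note b: messages without parameters carry reserved fields and
 * all-zero byte enables; Table 9-8 enumerates the parameterized messages. */
static inline int ncmsg_has_request_params(int is_ncmsgb, uint8_t t)
{
    if (is_ncmsgb)
        return t == NCMSGB_EOI || t == NCMSGB_VLW ||
               t == NCMSGB_GPE || t == NCMSGB_CPEI;
    return t == NCMSGS_INTPRIOUPD || t == NCMSGS_PMREQ ||
           t == NCMSGS_STOPREQ1   || t == NCMSGS_STOPREQ2;
}
```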
Table 9-8. NcMsg Parameter Encoding

  Message Type          Byte Enables   Parameter fields in ParamA (remaining bits RSVD)
  EOI                   00000000       Vector
  VLW                   00001111       VLW Value; VLW Change Indicator
  GPE                   00000000       GPE Number
  CPEI                  00000000       CPEI Number
  IntPrioUpd            00000000       Disabled; Priority
  PMReq                 00000011       Initial; State Type; State Level
  StopReq1 and StopReq2 00000000       LckQual

Some of the fields listed in Table 9-8 are labelled as reserved (RSVD). These fields are expected to be set to zero by the requester and ignored by the receiver.

For cases where CmpD carries parameter information, the encoding of these parameters is listed in Table 9-9.

Table 9-9. CmpD Parameter Encoding (uses SCC Header)

  NcMsgType   Parameter fields
  PMReq       Bytes 5, 4: RSVD / State_Type; bytes 7, 6: State_Level
  All other CmpD responses have all parameters set to Reserved - ignored.

9.10.1 Legacy Platform Interrupt Support

CSI supports legacy platform interrupts such as General Purpose Event (GPE) and Corrected Platform Error Interrupt (CPEI). These interrupts are currently operating-system visible and therefore must follow certain rules.

Implementation Note: Platform Interrupt Assertion and Deassertion. CSI defines the assertion of GPE and CPEI as an edge-triggered event; that is, there is no deassertion message on CSI. However, in some platforms these events are implemented as level-triggered. In order to emulate level-triggered functionality (if required), the I/O agent which interfaces the legacy bridge implements a software- or firmware-controlled bit (one for each event) to signal the deassertion. This bit is set by the I/O agent whenever it sends a platform interrupt event to the legacy bridge. When software clears this bit, the I/O agent deasserts the legacy event, either through a physical wire deassertion or by issuing a deassert message to the legacy bridge (implementation specific). Any further platform interrupt event assertions from that point (until cleared again) result in a new interrupt assertion to the legacy bridge. (A sketch of this emulation follows the next subsection.)

9.10.1.1 General Purpose Event (GPE) Messages

General Purpose Events (GPE) are used to invoke platform-specific ACPI code while system software is running in the operating system context. The GPE is traditionally a specific interrupt signal into the legacy bridge which triggers an SCI (System Control Interrupt), and current operating systems expect this event to be triggered from the legacy bridge. The GPE message on CSI is the mechanism used to trigger an SCI from any CSI device. The GPE message is forwarded to the I/O agent which is proxy for the legacy bridge, and that I/O agent forwards the GPE to the legacy bridge, which triggers an SCI as an interrupt message. The GPE message carries four bits of parameter information, allowing up to 16 different GPEs in the partition. While CSI enables 16 different GPE encodings, the legacy bridge specification should be consulted for the actual GPEs it supports. The CmpD completion for the GPE message carries no useful parameter information, and these parameter fields are reserved.
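A sketch of the level-trigger emulation from the Implementation Note above, as it might look in I/O-agent firmware. The sticky-bit structure, the helper names, and the deassert mechanism are all hypothetical; only the set/clear/re-arm behavior follows the text.

```c
#include <stdbool.h>

/* One software-visible sticky bit per emulated level event (GPE, CPEI). */
struct legacy_event {
    volatile bool asserted;   /* set by I/O agent, cleared by software */
};

/* Hypothetical hooks into the legacy bridge interface. */
extern void send_event_to_legacy_bridge(int event);
extern void deassert_legacy_event(int event); /* wire or deassert message */

/* I/O agent side: while the bit is set, the level is considered held and
 * no new assertion is sent until software re-arms by clearing the bit. */
void io_agent_platform_event(struct legacy_event *e, int event)
{
    if (!e->asserted) {
        e->asserted = true;
        send_event_to_legacy_bridge(event);
    }
}

/* Software side (e.g. ACPI handler): clearing the bit deasserts the
 * legacy event and re-arms assertion for the next platform event. */
void software_clear_event(struct legacy_event *e, int event)
{
    e->asserted = false;
    deassert_legacy_event(event);
}
```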
9.10.1.2 Corrected Platform Error Interrupt (CPEI) Messages

Corrected Platform Error Interrupt (CPEI) messages are used to invoke platform-specific code for the handling of corrected errors. The CPEI is traditionally a specific interrupt signal into the legacy bridge which triggers a CPEI to the processors, and current systems expect this event to be triggered from the legacy bridge, since the CPEI vector is programmed by the operating system. The CPEI message on CSI is the mechanism used to trigger a legacy correctable platform error. The CPEI message is forwarded to the I/O agent which is proxy for the legacy bridge, and that I/O agent forwards the CPEI to the legacy bridge (via message or physical signal), which triggers an interrupt message. The CPEI message carries four bits of parameter information, allowing up to 16 different correctable errors in the partition. While CSI enables 16 different CPEI encodings, the legacy bridge specification should be consulted for the actual CPEIs it supports. The CmpD completion for the CPEI message carries no useful parameter information, and these parameter fields are reserved.

9.10.2 Power Management Support

CSI includes one Protocol layer message which controls how a platform transitions between power management states. This is considered a non-coherent transaction and is broadcast to all target power management agents involved in that state transition. This transaction is initiated with NcMsgSPMReq, which is listed in Table 9-7. For details and rules governing these transactions, refer to Chapter 15, “Power Management”.

9.10.3 Synchronization Messages

CSI provides messages enabling partition (or entire domain) synchronization. These messages include StopReq1, StopReq2, StartReq1 and StartReq2. StopReq1, StopReq2, and StartReq1 are implemented as NcMsgS messages, while StartReq2 is implemented as an NcMsgB message. These messages can be issued through hardware state machines (like the Lock flow) or through writes to implementation-specific control registers. The agent responsible for issuing these messages is referred to as the Quiesce Master.

9.10.3.1 StopReq Messages

There are two StopReq messages: StopReq1 and StopReq2. The two-step StopReq process is required by platforms with multiple I/O agents; the agent responsible for issuing these messages is the Quiesce Master. Processor agents would simply trigger off StopReq1 to effect the halting of new requests, making StopReq2 redundant for them; for platforms with a single I/O agent, a similar simplification could apply to the broadcast and processing of StopReq2. The StopReq messages carry a qualifier called LckQual (see Table 9-8); when these messages are used for synchronization, this field must be set to 0xFF. The StopReq messages are broadcast to all synchronization agents indicated in the System Quiesce Scope List (refer to Table 9-6). During these StopReq phases, all caching agents must continue to respond to snoops. Upon receiving a StopReq message, the behavior depends on the agent type.

9.10.3.1.1 Processor Agents

1. On receiving StopReq1:
   • Stop all new requests from queuing into the CSI outstanding transaction table (see note 1 at the end of this subsection).
   • Wait for all outstanding non-posted CSI transactions to complete.
   • After the above, send a completion for StopReq1.
     — Exception for Locks: the StopReq1 completion is sent if the only outstanding request is the Lock initiating this flow.
2. On receiving StopReq2:
   • Send the completion.

9.10.3.1.2 I/O Agents

1. On receiving StopReq1:
   • Stop all new non-posted requests (see note 2 at the end of this subsection) from queuing into the CSI outstanding transaction table.
   • Wait for all outstanding non-posted CSI transactions to complete.
   • After the above, send a completion for StopReq1.
2. On receiving StopReq2:
   • Optionally flush all queued transactions targeting the I/O interfaces (this helps with the PAM lock issue on single-I/O-agent platforms).
   • Completely block all inbound queues.
   • Wait for all outstanding non-posted CSI transactions to complete.
   • After the above, send a completion for StopReq2.

9.10.3.2 StartReq Messages

There are two StartReq messages: StartReq1 and StartReq2. The two-step StartReq process is required by platforms supporting locks with multiple I/O agents. Like StopReq, the StartReq messages are broadcast to all target synchronization agents indicated in the System Quiesce Scope List (refer to Table 9-6). During these StartReq phases, all caching agents must continue to respond to snoops. Upon receiving a StartReq message, the behavior depends on the agent type.

9.10.3.2.1 Processor Agents

• StartReq1 is ignored and completed normally.
• Once it receives StartReq2, send the completion and start accepting new requests from the core.

9.10.3.2.2 I/O Agents

1. On receiving StartReq1:
   • For the quiesce flow, ignore it and return a normal completion for StartReq1.
   • For the Lock flow, target agents which are not the target of a Lock access ignore it and return a normal completion for StartReq1.
   • For the Lock flow, the target agent which is the target of the Lock access unlocks the target I/O port and returns a completion for StartReq1.
2. On receiving StartReq2:
   • Start accepting new requests from all I/O ports.
   • Send a completion for StartReq2.

An I/O agent can use the address value to differentiate the target (DRAM or memory-mapped I/O) of the lock sequence, and certain performance optimizations can then be performed.

Notes:
1. The CSI outstanding transaction table is an implementation-specific structure which simply tracks state for any transactions (coherent and non-coherent) which are outstanding in the CSI fabric.
2. Non-posted per the PCI Express definition.
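The I/O-agent behavior above can be summarized as a small handler. This is a sketch only: the message enum and helper names (block_inbound_queues() and so on) are hypothetical, while the ordering of steps follows Sections 9.10.3.1 and 9.10.3.2.

```c
enum sync_msg { STOPREQ1, STOPREQ2, STARTREQ1, STARTREQ2 };

/* Hypothetical platform hooks. */
extern void stop_new_nonposted_requests(void);
extern void wait_outstanding_nonposted(void);
extern void block_inbound_queues(void);
extern void unblock_inbound_queues(void);
extern void send_completion(enum sync_msg m);

/* I/O agent handling of the quiesce flow (Lock-specific steps omitted). */
void io_agent_sync(enum sync_msg m)
{
    switch (m) {
    case STOPREQ1:              /* Section 9.10.3.1.2, step 1 */
        stop_new_nonposted_requests();
        wait_outstanding_nonposted();
        break;
    case STOPREQ2:              /* step 2; optional flush omitted */
        block_inbound_queues();
        wait_outstanding_nonposted();
        break;
    case STARTREQ1:             /* quiesce flow: ignore, complete */
        break;
    case STARTREQ2:
        unblock_inbound_queues();
        break;
    }
    send_completion(m);         /* caching agents keep snooping throughout */
}
```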
9.10.4 Virtual Legacy Wire (VLW) Transactions

This section covers legacy signal support on CSI. Legacy signals refer to the signals that exist on current processors, which are primarily sideband signals from the legacy bridge (ICH). Table 9-10 lists the legacy signals considered and defines how they translate on CSI.

Table 9-10. Legacy Pins Descriptions and CSI Handling

  INTR (VLW; Source: I/O Agent): Indicates to the processor that an 8259 interrupt is active/inactive.
  SMI (VLW; Target: Processor): Interrupt to the processor to enter System Management Mode (SMM).
  INIT (VLW; Target: Processor): Indicates that the processor must initialize architectural state to the reset values and start code fetch from the reset vector.
  A20M (VLW; Target: Processor): Indicates to the processor to mask address bit 20 (MSDOS mode).
  NMI (VLW; Target: Processor): Indicates to the processor that a Non-Maskable Interrupt occurred.
  IGNNE (VLW; Target: Processor): Indicates to the processor to ignore numeric (floating point) exceptions.
  FERR (Dedicated Message; Source: Processor, Target: I/O Agent): Indicates to the chipset that the processor has detected a Floating Point Error. Open Issue: FERR was a level-triggered signal wire-OR’d across all processors, while the new message is edge-triggered. It needs to be resolved whether edge semantics are sufficient, or whether payload information must be added to the message for assert/deassert semantics, which would require a counter. If the usage model restricts use to one processor, then edge could be acceptable.
  PROCHOT / FORCEPR (Implementation-specific pins; Source: Processor or I/O Agent): As a processor output (PROCHOT), indicates the processor has exceeded the thermal limit; as a processor input (FORCEPR), this pin is used to force processor throttling.
  THERMTRP (System): Indicates a catastrophic thermal trip has occurred (the processor is too hot) and requires that power be dropped immediately.
  RESET: Reset the processor and chipset.
  TRST, TDI, TDO, TCLK, TMS (System; Target: I/O Agent or Processor): Test Access Port (TAP) - the IEEE 1149.1-compatible portion of the TAP, used by Intel and other OEMs for board test. The TAP is used in High-Volume Manufacturing (HVM) and other post-Si validation/debug activities.
  MCERR, IERR, BINIT (Source: System, Target: Processor): The legacy three catastrophic error indicators were used to indicate varying degrees of error; CSI will support one or more pins indicating a catastrophic error.

In order to phase out these legacy functions in the future, the originator of VLWs (e.g. a processor or legacy bridge) must provide a software-controllable (implementation-specific) mechanism for disabling each legacy function independently. Multiple ‘pins’ can be delivered by a single VLW message; the message format is implemented as a bit per pin. The message format handles both edge-triggered and level-triggered semantics: only an ‘assert’ message is needed for edge-triggered pins, while a ‘deassert’ message is additionally required for level-triggered pins. As shown in the message format, each virtualized pin has two bits defined: one shows the current state of the pin, asserted (1) or deasserted (0), and the other indicates whether the state of the pin has changed. The initial state of all bits is assumed to be inactive before the first VLW message is issued. VLW messages are broadcast to all target non-coherent message agents; it is the responsibility of each target agent to route the message to the appropriate core/logical processor.

Table 9-11. Legacy Pin Signalling

  Signal   Edge Triggered?
  IGNNE    No
  INTR     No
  A20M     No
  SMI      Yes
  INIT     Yes
  NMI      Yes

9.10.4.1 VLW Ordering Rules

Some of the VLWs are generated in response to a processor I/O cycle (NcIORd/NcIOWr). Examples include:
  • an I/O write that causes an A20M change;
  • an I/O write that causes an INIT active edge;
  • an I/O write that causes IGNNE to go active.
The I/O agent is required to forward the VLW message and receive its completion prior to completing any NcIORd or NcIOWr back to the requester.

9.10.4.2 Behavioral Rules

Rule 1. A VLW message is initiated by a source non-coherent message agent.
Rule 2. One VLW message may indicate more than one “pin” state change.
Rule 3. VLW messages can be sent at any time, except as constrained by the rules of the Lock protocol defined in Section 9.10.6.1, “Lock Types” on page 9-327 and Section 9.10.6.2, “Lock Transaction Flow” on page 9-328.
Rule 4. All VLW messages are broadcast to all target non-coherent message agents unless otherwise specified.
Rule 5. If a VLW is directed, only the target non-coherent message agent must respond; all others must ignore it and complete normally.
Rule 6. Source non-coherent message agents issuing a VLW as a result of a triggering I/O request must receive the completion message of the VLW before sending the completion message for the I/O request.
Rule 7. Only one outstanding VLW message is allowed on CSI; the initiator must wait for a completion before sending the next VLW message.
Rule 8. The handling of VLWs received by (or routed through) a sleeping agent is discussed in Chapter 15, “Power Management”.

9.10.4.3 Message Format

VLW uses the parameters described in Table 9-8. The fields are defined in Table 9-12 and Table 9-13.

Table 9-12. VLW Value Field, Bits (10:0)

  Bit(s)  Field Name  Description
  0       IGNNE       1 = Active, 0 = Inactive
  1       A20M        (same encoding as bit 0)
  2       INTR        (same encoding as bit 0)
  3       Reserved    Ignored
  4       SMI         1 = Active Edge, 0 = No Active Edge
  5       INIT        (same encoding as bit 4)
  6       NMI         (same encoding as bit 4)
  7       Reserved    Ignored

Table 9-13. VLW Change Bits (10:0)

  Bit(s)  Field Name    Description
  0       IGNNE Change  1 = There was a change on the signal level, 0 = There was no change on the signal level
  1       A20M Change   (same encoding as bit 0)
  2       INTR Change   (same encoding as bit 0)
  3       Reserved      Ignored
  4       SMI Change    Always zero, since SMI is edge-triggered
  5       INIT Change   (same as bit 4)
  6       NMI Change    (same as bit 4)
  7       Reserved      Ignored

Reserved fields in the VLW message are set to all zeroes by the source non-coherent message agent and ignored by the target non-coherent message agents.
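The Table 9-12/9-13 layouts lend themselves to simple bit manipulation, as the sketch below shows. The bit positions come from the tables; everything else (the names and the vlw struct) is illustrative.

```c
#include <stdbool.h>
#include <stdint.h>

/* Bit positions shared by the VLW Value (Table 9-12) and VLW Change
 * (Table 9-13) fields, bits 10:0. */
enum vlw_bit {
    VLW_IGNNE = 0,   /* level */
    VLW_A20M  = 1,   /* level */
    VLW_INTR  = 2,   /* level */
    VLW_SMI   = 4,   /* edge  */
    VLW_INIT  = 5,   /* edge  */
    VLW_NMI   = 6,   /* edge  */
};

struct vlw {
    uint16_t value;    /* current pin state: 1 = active / active edge */
    uint16_t change;   /* 1 = state changed (always 0 for edge pins)  */
};

/* Update a level pin, recording both the new state and the change bit. */
static void vlw_set_level(struct vlw *v, enum vlw_bit b, bool active)
{
    uint16_t old = v->value & (1u << b);
    v->value = active ? (v->value | (1u << b)) : (v->value & ~(1u << b));
    if ((v->value & (1u << b)) != old)
        v->change |= 1u << b;
}

/* Signal an edge pin: value bit 1 = active edge, change bit stays zero. */
static void vlw_pulse_edge(struct vlw *v, enum vlw_bit b)
{
    v->value |= 1u << b;
}
```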
9.10.5 Special Cycle Transactions

The IA-32 architecture supports a set of special cycle transactions that are used to communicate processor state to the platform. Table 9-14 lists the special cycle transactions supported, with a brief description of each. Security special cycles are covered in Section 17.4, “Interprocessor Communication: LT Link Layer Messages” on page 17-469.

Table 9-14. IA-32 Special Cycles

  Shutdown (NcMsgSShutdown): This special cycle is issued by an IA-32 processor when it encounters two more events while it is in the process of handling an event. The third event causes the processor to give up and issue this special cycle to indicate that the processor is non-functional. The typical response to this special cycle is a reset.
  INVD_Ack (NcMsgSInvd_Ack): This special cycle is issued after the processor completes an INVD instruction (invalidates caches without writing back modified lines) as an indication to higher-level caches that they should also invalidate their caches. It does not cause other processors in the partition to invalidate their caches.
  WBINVD_Ack (NcMsgSWbInvd_Ack): This special cycle is issued after the processor completes a WBINVD instruction (invalidates caches AFTER writing back modified lines) as an indication to higher-level caches that they should also write back and invalidate their caches. It does not cause other processors in the partition to invalidate their caches.
  Branch Trace Message (moved to Debug packets): Branch Trace messages are used primarily for debug and contain the source and target of a branch that the processor executed. This particular format is also used to send out the MWAIT special cycle, which is new to IA-32: the MWAIT instruction issues a special cycle to indicate to the platform that the processor has entered the MWAIT state.

9.10.5.1 Behavioral Rules

Rule 1. IA-32 special cycles are initiated by processor agents only.
Rule 2. All special cycles may be broadcast (multi-unicast), and therefore the broadcast rules specified in Section 9.8, “Broadcast Non-Coherent Transactions” on page 9-314 are required.
Rule 3. As with all CSI requests, ordering of special cycles cannot be assumed. If a processor requires certain ordering of special cycles, it is its responsibility to serialize at the source with previous transaction completions.
Rule 4. Handling of special cycles to (or through) sleeping agents is discussed in Chapter 15, “Power Management”.

9.10.6 Atomic Access (Lock)

Lock operations in CSI are primarily used to support legacy functionality in IA-32 processors. For simplicity, the system lock mechanism in CSI supports different types of lock operations using the same transaction flow. Note: this section describes the lock flow within a specific operating system partition. CSI supports platforms comprising multiple partitions, and these partitions can all co-exist within a system domain. The decision to expose locks to the entire domain or to keep them within a partition is an implementation decision to be made by both the hardware and firmware of that system.

9.10.6.1 Lock Types

The purpose of locks ranges from locking one or more addresses for atomic operations to locking the entire CSI network, so that no other operations can progress while certain read and write operations issued by the locking agent are in progress. Table 9-15 lists the lock operations that are supported (see note 1 below).

Table 9-15. Lock Types

  Lock Type             Message Type   LckQual Encoding (a)  Traffic Not Locked
  Processor Lock        ProcLock       0x00                  Non-Snoop Isoch, non-snooped to DRAM (b)
  Processor Split Lock  ProcSplitLock  0x01                  Non-Snoop Isoch, non-snooped to DRAM (b)
  LTHOLD                LTHold         0x02                  All I/O-initiated traffic
  Reserved              Reserved       0x03 - 0xFE
  System Quiesce        Quiesce        0xFF                  All traffic locked; used to quiesce the system

  a. Used for the corresponding StopReq messages. Refer to Section 9.10.6.2.3, “StopReq Messages” on page 9-329 for details and to Table 9-8 for the position in the Message Header.
  b. Blocking of non-snooped accesses to DRAM is an optional performance-optimization option.

A ‘ProcLock’ is equivalent to the Bus Lock operation on the P4 bus: the semantics of this lock are that all traffic to a given address (main memory or memory-mapped I/O space) must be stalled while an atomic read-modify-write operation is processed by the lock requesting agent. For simplicity, a Processor Lock operation in CSI has stricter semantics, in that it locks all traffic from being initiated during the lock, with the exception that Non-Snoopable Isochronous traffic and non-snoopable traffic to DRAM can continue to be issued and completed. Examples of non-snoopable traffic to DRAM include non-snooped AGP accesses and non-snooped PCI Express accesses. There should be no latency impact to the types of traffic allowed to proceed during the lock operation. (The lock operation corresponds to the lock phase described in Figure 9-9.)

  1. PHOLD for ISA devices is not supported in CSI, as it can be implemented in the I/O agent by holding off processor-initiated traffic to memory-mapped I/O while a PHOLD from an ISA device is in progress.

A ‘ProcSplitLock’ has similar semantics to a Processor Lock, except that it guarantees atomicity for two read-modify-write operations. All reads and writes of a split lock must target the same NodeID. Again, only Non-Snoop Isoch and non-snooped traffic to DRAM are allowed to proceed while the lock is in progress.
‘LTHold’ is a LaGrande Technology processor hold (LTHOLD) operation in which all processor-initiated traffic is stopped while the LTHOLD-requesting processor performs its LT operations in a processor-quiesced partition. While an LTHOLD is in progress, all other I/O-initiated traffic is allowed to proceed.

A ‘DebugLock’ is a global CSI lock, in that all possible traffic that is pending in the CSI network MUST be drained and all new traffic must be held off while the DEBUG lock is in progress. In certain configurations where isochronous and/or I/O traffic is held off for a lengthy duration, this lock may be destructive, i.e., the system is not restartable after the lock sequence.

9.10.6.2 Lock Transaction Flow

Figure 9-9 illustrates an example Lock flow initiated by processor 2 targeting memory-mapped I/O space behind I/O hub 2. The Quiesce Master is I/O hub 1. Details are explained below.

[Figure 9-9. Example Lock Flow: Proc 2 (the lock requester) sends ProcLock to IOH 1 (the Quiesce Master). IOH 1 broadcasts StopReq1 to the peers (IOH 2 and Proc 1) and to the requester, collecting a Cmp from each; it then broadcasts StopReq2 and again collects Cmp from all. IOH 1 returns Cmp for the ProcLock, and Proc 2 performs the lock phase (NcRdLock and NcWr, each completed with Cmp), then sends UnLock. IOH 1 broadcasts StartReq1 (collecting Cmp), then StartReq2 (collecting Cmp), and finally completes the UnLock with a Cmp.]

9.10.6.2.1 Lock Requests

To implement all the above forms of locks with a single transaction flow in CSI, there are four different lock types defined (refer to Table 9-15): ProcLock, ProcSplitLock, LTHold, and DebugLock. The lock flow is initiated with one of these four lock requests and terminated with an Unlock message after the atomic update is completed. See “Agent Operations” on page 9-330 for more detail on how an agent reacts during a lock sequence. The address value in the message specifies the address of the first read operation for a Processor Lock or Processor Split Lock; for an LTHold or DebugLock the address value is undefined. Note: the remainder of this section describes the lock flows starting with a ProcLock message; however, the lock flows may begin with any of the four Lock messages.

9.10.6.2.2 Quiesce Master

In a multiprocessor partition, multiple lock requesting agents can simultaneously issue a lock request. To regulate the multiple requestors, a Quiesce Master is identified in the partition. In systems supporting more than one partition with shared resources, the platform could require that the Quiesce Master be the same agent for all partitions. Each lock requesting agent can have at most one lock request outstanding, and it is the responsibility of the Quiesce Master to grant permission to one lock requestor at a time. The platform must elect a CSI agent as the Quiesce Master; only certain components are capable of acting as a Quiesce Master, and election of the Quiesce Master is the responsibility of platform-specific software or firmware. Although any agent in the partition could perform the Quiesce Master duties, the CSI specification requires that an I/O agent assume the Quiesce Master responsibility. The NodeID of the Quiesce Master is programmed into all Lock initiators in an implementation-specific manner (e.g. by firmware programming); all Lock initiators within a domain must be programmed with the same Quiesce Master. It is expected that for partitionable systems, multiple Quiesce Masters are identified, one (and only one) for each partition.
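A sketch of the lock-requester side of this flow: one outstanding lock per requester (the LockInProgress bit), a ProcLock sent to the programmed Quiesce Master NodeID, the atomic update, then Unlock. All helper names and the message constants are illustrative.

```c
#include <stdbool.h>
#include <stdint.h>

extern uint16_t quiesce_master_node;   /* programmed at init (fw-specific) */
extern void csi_send(uint16_t node, int msg, uint64_t addr);
extern void csi_wait_cmp(uint16_t node);

enum { MSG_PROCLOCK, MSG_UNLOCK };

static bool lock_in_progress;          /* the LockInProgress bit */

/* Perform one atomic read-modify-write under a ProcLock. */
bool proclock_rmw(uint64_t addr, void (*atomic_update)(uint64_t))
{
    if (lock_in_progress)              /* at most one Lock outstanding */
        return false;
    lock_in_progress = true;

    csi_send(quiesce_master_node, MSG_PROCLOCK, addr); /* addr = first read */
    csi_wait_cmp(quiesce_master_node); /* granted: partition is quiesced */

    atomic_update(addr);               /* lock phase: e.g. read then write;
                                        * keep responding to snoops here  */

    csi_send(quiesce_master_node, MSG_UNLOCK, 0);
    csi_wait_cmp(quiesce_master_node);
    lock_in_progress = false;
    return true;
}
```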
9.10.6.2.3 StopReq Messages

Once the Quiesce Master has accepted a Lock request, it quiesces the partition (or the entire domain, depending on the implementation). The specific traffic which is stalled depends on which lock type was issued (refer to Table 9-15). To effect this quiescing operation, stop request messages (StopReq1) are broadcast to the requester’s peers and to the requester itself; all targets then begin restricting certain types of traffic as specified by the lock qualifier. Once all the completions for the StopReq1 messages are received, StopReq2 messages are broadcast to initiate the second phase of quiescence. When all completion messages for the StopReq2s are received and the Quiesce Master meets the requirements for StopReq2, a completion for the initiating Lock message is returned to the lock requestor. At that point, the lock requesting agent can perform its atomic operations. More details of agent responsibilities are described in Section 9.10.6.2.7, “Agent Operations” on page 9-330.

9.10.6.2.4 The Lock Phase

The lock requester can perform its atomic operation during the lock phase. Typically this phase consists of a read followed by a write to the same address; in the case of a split lock, this phase consists of two reads followed by two writes.

9.10.6.2.5 Unlock Messages

When the atomic operations are completed, the lock requestor sends an Unlock message to the Quiesce Master so that all traffic may resume. The Quiesce Master initiates the two-step start request phase to all lock target agents in the partition (or entire domain). When all targets respond with the appropriate completion messages, the Quiesce Master sends a completion message to the lock requester. If there are other pending ProcLocks in the Quiesce Master, the whole process starts again with the StopReq phases.

9.10.6.2.6 StartReq Messages

The StartReq process is initiated after the atomic update is complete. This process is used to “thaw” the CSI agents from their quiesce state. To avoid deadlock, the CSI target of the atomic update must be thawed first (refer to the implementation note below for details). The StartReq requests use the Non-coherent Bypass channel; the data with these requests is undefined.

Implementation Note: Locks and Multiple I/O Agents. The StartReq process requires two steps in systems with multiple I/O agents: StartReq1 and StartReq2. This is a requirement so that the locked target unlocks the target I/O port BEFORE the other CSI agents continue to issue requests. Without this two-step process it would be possible for a lock target agent to receive requests from other agents (e.g. another processor) before getting unlocked; if this happened, the locked agent would continue to issue locked transactions to the I/O interface even when the intention is not to. More details of agent responsibilities are described in Section 9.10.6.2.7, “Agent Operations” on page 9-330. In addition, StartReq2 must use the Non-coherent Bypass channel to avoid deadlock in the presence of peer-to-peer transactions: if StartReq2 did not use the Bypass channel (which is guaranteed to make forward progress), then when the lock target agent is unlocked it could issue peer-to-peer requests which back up the non-coherent standard channel, and a blocked standard channel would block StopReq2 from proceeding. Requiring StartReq2 to use the Bypass channel avoids this problem (see note a below).

  a. This deadlock condition can also occur in single-I/O-agent systems where the processor is the lock arbiter. For example, if a processor first receives StartReq2 and begins to issue NcRd requests to the locked I/O agent (which hasn’t yet received the StartReq2), the standard channel could fill up.
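The Quiesce Master sequencing from Sections 9.10.6.2.3 through 9.10.6.2.6 reduces to a fixed two-phase-stop / two-phase-start order, sketched below with hypothetical broadcast/wait helpers. Queuing of additional Lock requests is elided.

```c
extern void broadcast_and_wait(int msg); /* send to the System Quiesce
                                          * Scope List; wait for all Cmp */
extern void send_cmp_to(int node);
extern int  wait_for_msg(int node);      /* returns next message type */

enum { MSG_STOPREQ1, MSG_STOPREQ2, MSG_STARTREQ1, MSG_STARTREQ2,
       MSG_UNLOCK };

/* Quiesce Master handling of one accepted Lock request. */
void quiesce_master_run_lock(int requester)
{
    /* Two-phase stop: StopReq2 only after ALL StopReq1 completions. */
    broadcast_and_wait(MSG_STOPREQ1);
    broadcast_and_wait(MSG_STOPREQ2);
    send_cmp_to(requester);              /* grant: lock phase may begin */

    while (wait_for_msg(requester) != MSG_UNLOCK)
        ;                                /* lock-phase reads/writes proceed */

    /* Two-phase start: the locked target thaws before other agents resume. */
    broadcast_and_wait(MSG_STARTREQ1);
    broadcast_and_wait(MSG_STARTREQ2);
    send_cmp_to(requester);              /* completion for the Unlock */
}
```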
9.10.6.2.7 Agent Operations

Lock Requester Operations
1. The Lock request arrives in the CSI outstanding transaction tracker:
   • Set the LockInProgress bit (only 1 ProcLock accepted).
   • Send the Lock request to the Quiesce Master and continue responding to snoops.
2. When the Lock completion is received, the processor core performs the atomic operation during this lock phase.
   • Continue to respond to snoops, interrupts and VLWs during this lock phase. Note that other I/O-initiated cycles can occur as a side effect of the atomic operations from the core.
   • Send an Unlock request to the Quiesce Master when the atomic update is completed.
3. When the completion for Unlock is received, clear the LockInProgress bit.

Note: during the StartReq and StopReq phases, the Lock Requester maintains the role of a target synchronization agent (refer to Section 9.10.3, “Synchronization Messages” on page 9-321). It is also possible that the Lock Requester is the same component as the Quiesce Master; in this case, the above concepts apply but are not visible on the CSI fabric.

Quiesce Master Operations
1. When the Quiesce Master accepts the Lock request, set the LockInProgress bit:
   a. Subsequent Lock requests will queue in the Quiesce Master.
2. Broadcast StopReq1 to all target synchronization agents. In addition, the Quiesce Master must perform the responsibilities outlined as a target synchronization agent receiving a StopReq1. Refer to Section 9.10.3.1, “StopReq Messages” on page 9-321 for details.
3. Once it receives all StopReq1 completions (and completes its own StopReq1 responsibilities), broadcast StopReq2 to all peers. In addition, the Quiesce Master must perform the responsibilities outlined as a target synchronization agent receiving a StopReq2. Refer to Section 9.10.3.1, “StopReq Messages” on page 9-321 for details.
4. Once it receives all StopReq2 completions (and completes its own StopReq2 responsibilities), return a completion for the Lock request to the Lock requester.
5. Upon receiving an Unlock request, broadcast StartReq1 to all peers. In addition, the Quiesce Master must perform the responsibilities outlined as a target synchronization agent receiving a StartReq1. Refer to Section 9.10.3.2, “StartReq Messages” on page 9-322 for details.
6. Upon receiving all the StartReq1 completions (and completing its own StartReq1 responsibilities), broadcast StartReq2 to all peers. In addition, the Quiesce Master must perform the responsibilities outlined as a target synchronization agent receiving a StartReq2. Refer to Section 9.10.3.2, “StartReq Messages” on page 9-322 for details.
7. Upon receiving all the StartReq2 completions (and completing its own StartReq2 responsibilities), return a completion for the Unlock request. Check for other queued Lock requests; if another Lock is pending, restart at step 1 to begin another Lock flow.

Target Synchronization Agent Operations
For a description of the target synchronization agent requirements, refer to Section 9.10.3, “Synchronization Messages” on page 9-321. The lock flow has some additional requirements:
• After StartReq1 is completed, all peer I/O agents should NOT assume that reads are new locked read requests.
• An I/O agent can optionally use the address value included with the Lock and StopReq requests to differentiate the target of the lock sequence (DRAM or memory-mapped I/O).

When an I/O agent is in the Lock phase (StopReq2 was completed), it reacts to a non-coherent read (NcRd or NcRdPtl) with the following operations:
1. Thaw posted requests in the inbound ordering queue of the targeted I/O port (if the read targets an I/O port). Continue to block non-posted requests from that port and all requests from any other I/O ports or integrated devices in the I/O agent.
2. Forward the non-coherent read to the target I/O port with lock semantics:
   a. The completion for the read will push all posted requests in the inbound ordering queue (normal PCI ordering).

When an I/O agent is in the Lock phase (StopReq2 was completed), it treats CSI writes (NcWr or NcWrPtl) with the following operations:
1. Forward the non-coherent write to the target I/O port.
2. The write completion is returned on CSI after the posting of the write.

9.10.6.3 Assumptions for the Lock Mechanism

For the above Lock mechanism to function correctly, the following assumptions are made. The first set of assumptions is required of CSI hardware:
• A lock requesting agent can have at most one Lock outstanding.
• The Quiesce Master must be able to absorb all Lock requests in the network.
• Each read or write request of a lock sequence to non-coherent memory space is 8-byte address aligned and less than or equal to 8 bytes in size.
• If the configuration supports peer-to-peer writes through CSI (multiple I/O agents), then:
  — The I/O agent must guarantee that it will eventually accept all completion packets for outstanding inbound transactions even if its inbound traffic is blocked.
  — The I/O agent must absorb all non-coherent writes targeting it even if its inbound traffic is blocked.
The following assumptions are guaranteed by software:
• All requests within a LOCK sequence target the same destination (DRAM or memory-mapped I/O), with the following exceptions:
  — For locked cycles to the read-only PAM region, read requests target DRAM and write requests target memory-mapped I/O.
  — For locked cycles to the write-only PAM region, read requests target memory-mapped I/O and write requests target DRAM.
• Both reads and both writes of a split lock must target the same I/O device.

9.11 Non-Coherent Registers List

This section enumerates the registers expected to support the non-coherent protocol. It is not intended to be an exhaustive or complete list; implementations might find alternative ways of designing the protocol to either expand or reduce these registers. For example, the All Agents list below could be identical to the Snoop Lists required for the Coherent Protocol. This list is intended as a guide for implementers designing to the CSI specification, and optimization is expected. The list uses a few parameters, defined as follows:
• NodeID - a bit vector which is wide enough to point to a CSI agent’s NodeID.
• N - the number of processor sockets implemented in the platform partition.
• M - the number of I/O hubs implemented in the platform partition.
• Y - the number of CSI agents implemented across all platform partitions in the system.

Table 9-16. Non-Coherent Logical Register List (for approximate sizes, see note a)

  Processor Agent List (NodeID * N): Used for broadcasting requests to processors, such as interrupts. Refer to Table 9-6 for more details.
  I/O Agent List (NodeID * M): Used for broadcasting requests for all I/O agents, such as IntAck. Refer to Table 9-6 for more details.
  All Agents (none; sum of the above): Used for broadcasting requests targeting all CSI agents in the platform (e.g. IntPrioUpd). Refer to Table 9-6 for more details.
  Interrupt Sources (NodeID * N): Used for broadcasting IntPrioUpd messages. Note that in many systems this list is identical to the “All Agents” list above.
  Interrupt Targets (NodeID * M): Used for broadcasting interrupt messages. Note that in many systems this list is identical to the “Processor Agent List” above.
  Quiesce Scope List (NodeID * Y): Used for broadcasting requests like StopReq and StartReq, due to platform quiescence. This list may span just the partition or it might span across partitions (implementation decision). Refer to Table 9-6 for more details.
  Power Management Dependency List (NodeID * (M+N)): Refer to Table 9-6 for more details.
  Legacy IOH (NodeID): Pointer to the IOH which holds the legacy functionality (e.g. the 8259 interrupt controller).
  Quiesce Master (NodeID): Pointer to the agent which acts as the Quiesce Master. Refer to Section 9.10.6.2.2, “Quiesce Master” on page 9-329.
  CF8 (32 bits): Required for legacy IA-32 implementations to emulate legacy configuration accesses to PCI configuration space. Note that the CFC access does not really require a register.
  LockInProgress (1 bit): Indicates that a lock requester has an outstanding lock. Only one is allowed to be outstanding at a time.

  a. Refer to the Implementation Note below.

Implementation Note: NodeID Broadcast Lists. The non-coherent register lists could be implemented as bit vectors where each bit represents a NodeID. For example, a 32-bit vector could represent a list of 32 NodeIDs, assuming they are enumerated from 0 to 31. (A sketch of this representation follows.)
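Following the Implementation Note, the Table 9-16 lists map naturally onto bit vectors. The sketch below uses the 32-entry size from the note's example; the type and function names are illustrative.

```c
#include <stdint.h>

/* One bit per NodeID, NodeIDs enumerated 0..31 as in the Implementation
 * Note; larger systems would widen the vector. */
typedef uint32_t node_vec_t;

static inline node_vec_t nv_add(node_vec_t v, unsigned node)
{
    return v | (1u << node);
}

static inline int nv_has(node_vec_t v, unsigned node)
{
    return (v >> node) & 1u;
}

/* Example: derive the All Agents list as the sum of the processor and
 * I/O agent lists (see the "All Agents" entry of Table 9-16). */
static inline node_vec_t nv_all_agents(node_vec_t procs, node_vec_t io)
{
    return procs | io;
}
```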
10 Interrupt and Related Operations

10.1 Overview

The interrupt architecture for CSI systems supports the XAPIC and SAPIC interrupt architectures, used by the IA-32 and Itanium processor families respectively. This architecture assumes that there is at least one I/O xAPIC with each I/O subsystem (including subsystems that also support I/O devices without message-signaled interrupts) connected to the CSI network, and that each processor has an integrated local APIC to receive and process interrupts and to send inter-processor interrupts (IPIs). In addition, I/O devices may be capable of generating interrupt messages directly through PCI message-signaled interrupts (MSI) or an equivalent mechanism. Interrupts, interrupt acknowledgment and end of interrupt (EOI) are delivered over the CSI interface to the target processor or I/O xAPIC. The architecture also assumes that there is only one active 8259A-equivalent interrupt controller in a system partition; there can be other redundant 8259A interrupt controllers in a system for high availability, but only one of them is active at any time. Processors that support multiple thread contexts support one logical local APIC for each thread context. This is shown in Figure 10-1.

[Figure 10-1. Interrupt Architecture Overview: processors (each with a local APIC) and IOH/ICHx components with I/O xAPICs (a primary 8259 and I/O xAPIC and a redundant 8259 and I/O xAPIC) on PCI, plus MSI-enabled I/O devices, all connected through the CSI network; both I/O interrupts and inter-processor interrupts are delivered over CSI.]
10.1.1 Interrupt Model for Itanium®-Based Systems

All interrupts in Itanium processor-based systems use physical destination mode to identify the target processor context for an interrupt. The target processor context can be specified using the A[19:4] field in the address field of the interrupt message. During initialization, each processor context is assigned a unique ID (physical local APIC ID) to distinguish it from other processor contexts, and this unique ID is compared against the A[19:4] field in the interrupt message to determine if that processor context is the target of the interrupt. The assignment of physical APIC IDs is done by the system firmware. Only one processor context can be specified as the target in one interrupt message; the SAPIC interrupt architecture does not support multiple target specification through multicast or broadcast interrupts.

The SAPIC interrupt architecture allows an interrupt to be redirected from its specified target to another enabled target if the interrupt is a redirectable interrupt. An interrupt is indicated as redirectable if the A[3] bit is set to b1 and the delivery mode field in the data field of the interrupt message is set to b001. In the case of redirectable interrupts, the interrupt can be delivered to any one of the enabled targets.

The interrupt delivery mechanism described in this specification is based on the following assumptions. Note that some of these assumptions are still under investigation for inclusion in the interrupt architecture specification, and there is a potential for change in these and in the resulting CSI support for interrupts:
• The assignment of physical APIC IDs is done either by the hardware or system firmware and is not changed by the operating system.
• Processors never generate redirectable interrupts.
• The target APIC ID specified for a redirectable interrupt exists in the system and is enabled.

Detailed information about the interrupt architecture for Itanium processor-based systems can be found in the Intel® Itanium® Architecture Software Developer's Manual, Volume 2, and in the Intel® Itanium® Processor Family Interrupt Architecture Guide.

10.1.2 Interrupt Model for IA-32 Processor Family-Based Systems

IA-32 processor-based systems allow use of either physical or logical destination mode to identify the target processor context. The specification of targets in these two destination modes is quite different, and they are described separately in the following subsections. During initialization, each processor context is assigned a unique physical APIC ID and a unique logical APIC ID. Note that systems supporting more than 60 processor contexts may not have unique logical APIC IDs for all processor contexts; such systems cannot rely solely on logical destination mode for interrupt delivery. Interrupt messages specify target processor contexts using the A[19:12] field in the address field of the interrupt message. A different request type opcode is used for physical-mode and logical-mode interrupts, and this distinction is used by the processors to match A[19:12] with either the physical APIC ID or the logical APIC ID in order to register an interrupt with the correct processor context.
The IA-32 interrupt architecture allows an interrupt to be redirected from its specified target to another enabled target, or to any one target among a set of specified targets, if the interrupt is a redirectable interrupt (also referred to as an interrupt with lowest-priority delivery mode). An interrupt is indicated as redirectable if the A[3] bit is set to b1 and the delivery mode field in the data part of the interrupt message is set to b001. In the case of redirectable interrupts, the delivery of the interrupt depends on the enabled targets and the addressing mode used.

The interrupt delivery mechanism described in this specification is based on the following assumptions. Note that some of these assumptions are still under investigation for inclusion in the interrupt architecture specification, and there is a potential for change in these and in the resulting CSI support for interrupts:
• The assignment of physical APIC IDs is done either by the hardware or system firmware and is not changed by the operating system. The assignment of logical APIC IDs is done by the operating system, and this assignment may not have a fixed relationship with the physical location of the processor context or with the physical APIC ID.
• Processors never generate redirectable interrupts.
• Redirectable interrupts with the broadcast setting are not used in physical or logical-cluster addressing mode. A redirectable interrupt with the broadcast setting can be used in logical-flat addressing mode only when there are 8 logical processors in the system and all of them are enabled.
• For redirectable interrupts without a broadcast setting, all potential target APIC ID(s) specified for the interrupt exist in the system and are enabled.
• All redirectable interrupts without a broadcast setting using logical-flat or logical-cluster addressing mode must indicate either an enabled processor or a group of enabled processors as the target.
• All redirectable interrupts without a broadcast setting using physical addressing mode must indicate an enabled processor as the target.

Detailed information about the interrupt architecture for IA-32 processor-based systems can be found in the IA-32 Intel Architecture Software Developer's Manual, Volume 3, and the Intel XAPIC Architecture Specification.

10.1.2.1 IA-32 Physical Destination Mode

In physical destination mode with directed delivery, the interrupt message can specify a unique processor context as the target by setting A[19:12] to its physical APIC ID, as long as A[19:12] is not set to 0xFF. If A[19:12] is set to 0xFF, it indicates that all processor contexts are targets of the interrupt. In the case of redirectable (lowest-priority) delivery mode, the interrupt must be registered at exactly one of the processor contexts among all enabled processor contexts. An implementation can assume that the target physical APIC ID specified for a redirectable interrupt exists and is enabled, that the broadcast setting is never used, and can therefore ignore the redirection hint.

10.1.2.2 IA-32 Logical Destination Mode

The logical destination mode supports two types of addressing modes: flat addressing mode and cluster addressing mode. The addressing mode used in a system is decided by the system firmware or BIOS, and the processor local APICs and I/O agents are made aware of the addressing mode during initialization.
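A sketch of the physical-mode target check from Section 10.1.2.1, as a local APIC might apply it to A[19:12]. The function name is illustrative, and the redirectable case (which must register at exactly one enabled context system-wide) is reduced to the simplifying assumption stated in the text.

```c
#include <stdbool.h>
#include <stdint.h>

/* IA-32 physical destination mode (Section 10.1.2.1): A[19:12] holds the
 * physical APIC ID, and 0xFF addresses every processor context. */
bool ia32_physical_mode_match(uint8_t a19_12, uint8_t my_apic_id)
{
    if (a19_12 == 0xFF)          /* broadcast to all processor contexts */
        return true;
    return a19_12 == my_apic_id; /* directed delivery */
}
```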
10.1.2.2.1 Flat Addressing Mode

In flat addressing mode, A[19:12] is interpreted as a bit vector where each bit indicates a processor context. This mode can be used only in systems that support eight or fewer processor contexts. In this mode with directed interrupt delivery, a single processor context or multiple processor contexts can be specified as targets by setting the corresponding bits in the A[19:12] field of the interrupt message. In the case of a redirectable interrupt, the interrupt must be registered at exactly one processor context, selected from those identified in the A[19:12] field that are enabled in the system. An implementation can assume that all target logical APICs specified for a redirectable interrupt exist and are enabled, and may choose any one of the specified APICs as the target. Note that compatibility of this assumption with existing software is still under investigation, and changes could be made in future revisions based on the outcome of this investigation.

10.1.2.2.2 Cluster Addressing Mode

In cluster addressing mode, A[19:16] indicates up to 15 cluster identifiers and each bit in A[15:12] indicates one of the possible four members of a cluster. This mode allows up to 60 processor contexts to be identified as interrupt targets. In this mode with directed interrupt delivery, if A[19:16] is set to 0x0 through 0xE then only the corresponding cluster is targeted and A[15:12] indicates 1 to 4 targets within that cluster; if A[19:16] is set to 0xF then all clusters are targets and A[15:12] indicates 1 to 4 targets within each cluster. In the case of redirectable delivery mode, the interrupt must be registered at exactly one of the processor contexts: if A[19:16] is set to 0x0 through 0xE, the interrupt target must be a member of the corresponding cluster and must be one of the targets indicated in A[15:12] that are enabled. An implementation can assume that all target logical APICs specified for a redirectable interrupt exist and are enabled, and may choose any one of the specified APICs within the specified cluster as the target. An implementation can also assume that a redirectable interrupt with A[19:16] set to 0xF is never generated. Note that compatibility of both these assumptions with existing software is still under investigation, and changes could be made in future revisions based on the outcome of this investigation.
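A sketch of the logical-mode checks from the two subsections above. The encodings (a flat bit vector; a cluster ID in A[19:16] with member bits in A[15:12]) come from the text; the function names and my_* parameters are illustrative.

```c
#include <stdbool.h>
#include <stdint.h>

/* Flat mode (10.1.2.2.1): A[19:12] is a bit vector, one bit per APIC. */
bool ia32_logical_flat_match(uint8_t a19_12, uint8_t my_flat_bit /* 0..7 */)
{
    return (a19_12 >> my_flat_bit) & 1u;
}

/* Cluster mode (10.1.2.2.2): A[19:16] selects a cluster (0xF = all),
 * A[15:12] selects up to four members within the cluster. */
bool ia32_logical_cluster_match(uint8_t a19_12,
                                uint8_t my_cluster /* 0..14 */,
                                uint8_t my_member  /* 0..3  */)
{
    uint8_t cluster = a19_12 >> 4;
    uint8_t members = a19_12 & 0xF;

    if (cluster != 0xF && cluster != my_cluster)
        return false;
    return (members >> my_member) & 1u;
}
```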
10.1.2.3 IA-32 Destination Shorthands

IA-32 allows the use of destination shorthands for efficient generation of inter-processor interrupts. Interrupts generated with destination shorthands use physical destination mode to specify interrupt targets. The modes used to generate interrupts with destination shorthands are described here.

10.1.2.3.1 Self

This shorthand is used to generate an inter-processor interrupt to the same processor context. It may cause generation of an interrupt message from the processor core to the CSI interface block with A[19:12] set to the APIC ID of the initiating processor context; however, no interrupt message is generated on the CSI links. Interrupt redirection is not allowed on interrupts generated through this shorthand.

10.1.2.3.2 All Including Self

This shorthand is used to generate inter-processor interrupts to all the processor contexts in a system partition, including the initiating processor context. This causes generation of an interrupt with A[19:12] set to 0xFF. Interrupt redirection is not allowed on interrupts generated through this shorthand.

10.1.2.3.3 All Excluding Self

This shorthand allows only directed delivery mode and is used to generate inter-processor interrupts to all processor contexts in a system partition excluding the initiating processor context. An interrupt is generated with A[19:12] set to 0xFF, and the interrupt could be sent to the initiating processor context, which is then responsible for ignoring it. Note that removal of the redirectable (lowest-priority) delivery mode with this shorthand is under investigation; depending on the outcome of this investigation, there may be some changes to this section of the specification.

10.2 Interrupt Delivery

Interrupts from I/O devices and inter-processor interrupts (IPIs) are delivered on CSI using the IntPhysical or IntLogical request, with an address in the interrupt delivery region of the system address map. Part of the address field contains the local APIC ID of the target processor context for the interrupt. The interrupt delivery mechanism also supports the lowest-priority interrupt delivery mode using interrupt redirection; redirection can be used with IPIs and with I/O-initiated interrupt messages. The interrupt redirection mechanism is discussed later in this section. Delivery of interrupts under certain addressing modes and platform configurations relies on the capability to broadcast IntPhysical or IntLogical to all processor agents in the system; this capability is described in Section 9.8, “Broadcast Non-Coherent Transactions” on page 9-314.

The address field for the IntPhysical and IntLogical transactions is shown in Figure 10-2. The usage of the address fields in IntPhysical requests for Itanium processor-based systems is shown in Table 10-1, and the usage of the address fields in IntPhysical and IntLogical requests for IA-32 processor-based systems is shown in Table 10-2. The RH bit at A[3] is the redirection hint: if RH is set to 1, the interrupt can be redirected to one of the processor contexts; otherwise it must be delivered to the indicated target (which could be more than one context in IA-32 processor-based systems). The ID field mapped to A[19:12] identifies the local APIC ID of the target processor context for the interrupt. Itanium processors allow an EID field, mapped to A[11:4], to extend the number of processor contexts that can be supported in a system; exact use of the ID and EID fields is implementation dependent, and implementation-specific documents should be consulted for this usage. The upper address field, A[51:20], in the interrupt request depends on the interrupt delivery area in the system address map. This is a 1 MB area that can be relocated in the system address map, with the default location starting at 0x0 0000 FEE0 0000. Note that the size of the address field is implementation dependent; implementations that support only a subset of the addressing capability should set the unsupported address bits to b0.

[Figure 10-2. Address encoding in IntPhysical and IntLogical requests: A[51:32] = 0x0 0000, A[31:20] = 0xFEE (default), A[19:12] = ID, A[11:4] = EID or Reserved, A[3] = RH.]
Table 10-1. Setting of A[51:2] in IntPhysical Requests for Itanium® Processors

  Address Field  Itanium®-Based System
  A[3]           Redirection hint: 0 = Directed, 1 = Redirectable
  A[11:4]        Extended local APIC ID
  A[19:12]       Local APIC ID
  A[51:20]       A[51:20] of the interrupt delivery area in the system address map; the default is located at 0x0000 0FEE

Table 10-2. Setting of A[51:2] in IntPhysical and IntLogical Requests for IA-32 Processors

  Address Field  IA-32 Processor-Based System
  A[3]           Redirection hint: 0 = Directed, 1 = Redirectable
  A[11:4]        Reserved, set to 0x00
  A[19:12]       Physical or logical local APIC ID
  A[51:20]       0x0000 0FEE

Note that the IA-32 interrupt architecture supports both physical and logical destination modes, which result in IntPhysical and IntLogical requests respectively; the SAPIC interrupt architecture supports only physical destination mode, resulting in IntPhysical requests in Itanium processor-based systems.

The encoding for the data field of IntPhysical and IntLogical requests is shown in Figure 10-3. Usage of the data fields in Itanium processor-based systems is shown in Table 10-3, and usage of the data fields in IA-32 processor-based systems is shown in Table 10-4. Only a part of the 8 bytes of the data field may be valid, and within the valid bytes only some of the bits contain useful information while the rest are reserved. Valid data bytes for an interrupt request must always start from byte 0, and either 2, 4 or 8 consecutive low-order bytes of the packet may be valid, with their corresponding byte enables set to 1. Please refer to the Intel® Itanium® Processor Family Interrupt Architecture Guide and the IA-32 interrupt architecture reference documents for further information on the data fields.

[Figure 10-3. Data field of IntPhysical and IntLogical requests: Data[7:0] = Vector, Data[11:8] = Delivery Mode, Data[13:12] = Reserved, Data[14] = Level, Data[15] = Trigger Mode, Data[63:16] = Reserved.]

Table 10-3. Setting of Data[31:0] in IntPhysical Requests for Itanium® Processors

  Data Field   Itanium®-Based System
  Data[7:0]    Vector
  Data[11:8]   Delivery Mode: 0000 = Directed or Fixed, 0001 = Redirectable or Lowest Priority, 0010 = PMI, 0011 = Reserved, 0100 = NMI, 0101 = INIT, 0110 = Reserved, 0111 = ExtINT, 1000 = Machine Check, 1001 to 1111 = Reserved
  Data[13:12]  Reserved, set to b00
  Data[14]     Reserved, set to b0
  Data[15]     Reserved, set to b0
  Data[31:16]  Reserved, set to 0x0000

Table 10-4. Setting of Data[31:0] in IntPhysical and IntLogical Requests for IA-32 Processors

  Data Field   IA-32 Processor-Based System
  Data[7:0]    Vector
  Data[11:8]   Delivery Mode: 0000 = Directed or Fixed, 0001 = Redirectable or Lowest Priority, 0010 = SMI, 0011 = Reserved, 0100 = NMI, 0101 = INIT, 0110 = SIPI, 0111 = ExtINT, 1000 = Machine Check, 1001 to 1111 = Reserved
  Data[13:12]  Reserved, set to b00
  Data[14]     Level: applies only to level-triggered interrupts and must be ignored for edge-triggered interrupts; 0 = Deassert, 1 = Assert
  Data[15]     Trigger Mode: 0 = Edge Triggered, 1 = Level Triggered
  Data[31:16]  Reserved, set to 0x0000

Table 10-5 captures the various interrupt modes and their effect on the settings in the IntPhysical or IntLogical request. In IA-32 processor-based systems, the distinction between logical-flat and logical-cluster addressing mode is made through configuration settings in the processor local APICs and interrupt source agents; there is no indication in the IntLogical request itself to distinguish between these addressing modes.

Table 10-5. CSI Interrupt Modes

  Destination Mode  Sub-Mode                 Request Type  A[3]  Delivery Mode (a)
  Physical          Redirectable             IntPhysical   1     b0001
                    Other than redirectable  IntPhysical   0     Other than b0001
  Logical Flat      Redirectable             IntLogical    1     b0001
                    Other than redirectable  IntLogical    0     Other than b0001
  Logical Cluster   Redirectable             IntLogical    1     b0001
                    Other than redirectable  IntLogical    0     Other than b0001

  a. Delivery Mode is specified in bits 11:8 of the data field of the IntPhysical or IntLogical request.
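The Table 10-2/10-4 encodings compose mechanically, as in the sketch below for the IA-32 case. The function names are illustrative; only the field placement is taken from the tables.

```c
#include <stdint.h>

#define INT_DELIVERY_BASE 0xFEE00000ull  /* default A[51:20] = 0x0000 0FEE */

/* Build the IA-32 IntPhysical/IntLogical address per Table 10-2. */
static uint64_t int_request_addr(uint8_t apic_id, int redirectable)
{
    return INT_DELIVERY_BASE
         | ((uint64_t)apic_id << 12)             /* A[19:12]: APIC ID      */
         | ((uint64_t)(redirectable & 1) << 3);  /* A[3]: redirection hint */
}

/* Build Data[31:0] per Table 10-4. */
static uint32_t int_request_data(uint8_t vector, uint8_t delivery_mode,
                                 int level_assert, int trigger_level)
{
    return (uint32_t)vector                         /* Data[7:0]           */
         | ((uint32_t)(delivery_mode & 0xF) << 8)   /* 0b0001 = lowest-pri */
         | ((uint32_t)(level_assert & 1) << 14)     /* 1 = Assert          */
         | ((uint32_t)(trigger_level & 1) << 15);   /* 1 = Level Triggered */
}
```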
Table 10-5. CSI Interrupt Modes
  Destination Mode  Sub-Mode                 Request Type  A[3]  Delivery Mode (a)
  Physical          Redirectable             IntPhysical   1     b0001
  Physical          Other than redirectable  IntPhysical   0     Other than b0001
  Logical Flat      Redirectable             IntLogical    1     b0001
  Logical Flat      Other than redirectable  IntLogical    0     Other than b0001
  Logical Cluster   Redirectable             IntLogical    1     b0001
  Logical Cluster   Other than redirectable  IntLogical    0     Other than b0001
  a. The Delivery Mode is specified in bits 11-8 of the data field of the IntPhysical or IntLogical request.

The target node of an IntPhysical or IntLogical request is responsible for delivering the interrupt request to the corresponding local APIC identified in the address field. Alternatively, it can send the interrupt request to all the local APICs. The target node must also send a Cmp response back to the source of the request. The Cmp response should be generated only after the interrupt has been delivered to the local APIC. This is required to make sure that all interrupts are processed correctly during dynamic reconfiguration of the system. The target node of an IntPhysical or IntLogical request is not allowed to forward it to another CSI agent. For example, if an IntPhysical or IntLogical request with the redirection hint set to b1 is received by a CSI agent, it cannot forward this request to another CSI agent based on the priority of local or remote processor contexts; forwarding these requests may lead to deadlock in the CSI network under certain system configurations. The generation and routing of interrupt requests in the system depend on the destination mode (physical or logical), the redirection hint (directed or redirectable) and the ID field value (local target, remote target or broadcast). This is described in Section 10.2.3 and Section 10.2.4 for Itanium processor-based and IA-32 systems, respectively.

10.2.1 Interrupt Delivery Assumptions

• For interprocessor interrupts, the CSI interface block is capable of processing and transmitting interrupt messages to the local processor contexts.
• For each interrupt event, only one interrupt message is sent to each targeted CSI processor agent, which is responsible for transmitting the interrupt message to one or all of its local processor contexts. There is no restriction on the number of interrupt messages being sent to a processor agent for different interrupt events at any time; that is, multiple interrupt requests from a source to the same or different processors can be pipelined.
• In IA-32 processor-based systems, processor and I/O agents know the node identifier of all CSI processor agents in the system or in the same partition, and the addressing mode (flat or cluster model in logical mode) to use to determine the destinations of IntPhysical and IntLogical requests.
• In Itanium processor-based systems, system firmware is relied upon to assign each processor context a local APIC ID that is derived from the CSI node ID of its CSI agent, to facilitate determination of the destination node for IntPhysical requests.
• If interrupt source agents do not send IntPhysical requests to all processor agents for interrupts with physical destination mode, then they need to know whether they are operating in an Itanium or IA-32 processor-based system, in order to properly determine the destination of IntPhysical requests when A[19:12] is set to 0xFF.
• For processor implementations where multiple CSI NodeIDs represent a processor, measures must be taken to avoid redundant interrupt delivery.
Please refer to Section 9.8.1, “Broadcast Dependency Lists” on page 9-315 for guidance on setting the target list appropriately to avoid this condition.

10.2.2 Interrupt Redirection

CSI supports interrupt redirection to enable lowest priority interrupt delivery, improving performance by distributing interrupts according to the task priority level among processor contexts and other factors. CSI provides an IntPrioUpd transaction to facilitate interrupt redirection. This transaction provides indications of the task priority level at a processor context and of whether its local APIC is disabled. The transaction can be sent from processor agents to all the I/O agents that can receive a redirectable interrupt from I/O devices. It may also be sent to CSI agents other than I/O agents; such agents may ignore the contents of the transaction, but must still respond with a Cmp response. Details of this transaction, and of how its indication is used for delivering redirectable interrupts, are provided in subsequent sections. Note that based on the interrupt architecture assumptions stated in Section 10.1.1 and Section 10.1.2, use of the IntPrioUpd transaction is optional in a system, and there is no requirement to send IntPrioUpd requests to processor agents. For any redirectable interrupt, it is required that the interrupt be registered at exactly one of the local APICs in the system. Moreover, in IA-32 processor-based systems, the processors participating in the selection can be restricted to a subset of the processors in the system by using the logical destination mode with either the flat or the cluster addressing model. The details of the algorithm applied to deliver redirectable interrupts in the various cases are described in later sections.

10.2.2.1 Redirection Algorithm

The exact redirection algorithm used in a system is implementation dependent. Care must be taken such that the interrupt is registered at exactly one of the local APICs, by selecting among the APICs indicated in the interrupt request and avoiding any APICs that are disabled. If all the APICs have their corresponding disable bit set, the interrupt should still be sent to one of the local APICs indicated in the ID field. Optimizations of interrupt performance, such as balanced distribution of interrupts among all processor contexts and localization of interrupts with specific vectors to specific processor contexts to avoid cache thrashing, may play a role in the selection of a target for redirectable interrupts. An exact algorithm for interrupt redirection is not described in this specification and is left as an implementation choice for a given system.

10.2.2.1.1 Implementation Note

In systems that are designed to operate in an environment with a high frequency of redirectable interrupts, care should be taken to avoid hot spots and cache thrashing due to interrupt redirection. For example, I/O agents should not select the same target for all redirectable interrupts when multiple processor contexts are at the same priority level, and should instead distribute such interrupts among all eligible targets. Also, redirection of interrupts from the same event to different targets with separate cache hierarchies should be avoided, to eliminate unnecessary thrashing of cache lines between different cache hierarchies.
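Since the redirection algorithm is left as an implementation choice, the following sketch shows one plausible policy under the constraints above: pick the lowest-priority enabled APIC from the candidate set, break ties round-robin for balanced distribution, and fall back to an indicated APIC when all candidates are disabled. All structure and function names are hypothetical.

```c
#include <stdint.h>

#define MAX_APICS 256

struct apic_state {
    uint8_t enabled;   /* tracked, e.g., via the IntPrioUpd Disabled bit */
    uint8_t priority;  /* 4-bit task priority from IntPrioUpd            */
};

static uint32_t rr_cursor; /* per-source round-robin position */

/* Returns the APIC ID selected as the redirectable-interrupt target from
 * the candidate list indicated in the interrupt request. */
int redirect_target(const struct apic_state apics[MAX_APICS],
                    const uint8_t candidates[], int ncand)
{
    int best = -1;
    for (int i = 0; i < ncand; i++) {
        int id = candidates[(rr_cursor + i) % ncand];
        if (!apics[id].enabled)
            continue;  /* avoid disabled APICs */
        if (best < 0 || apics[id].priority < apics[best].priority)
            best = id; /* lowest task priority wins, ties round-robin */
    }
    rr_cursor++;
    /* If every indicated APIC is disabled, the interrupt must still be
     * sent to one of the APICs indicated in the ID field. */
    return (best >= 0) ? best : candidates[0];
}
```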
10.2.2.2 External Task Priority Update (IntPrioUpd) Transaction

The IntPrioUpd transaction is used by the processor contexts to update any external task priority registers in the system. Based on the assumptions on the interrupt architecture stated in Section 10.1.1 and Section 10.1.2, use of the IntPrioUpd transaction is optional in a system. In systems that use the IntPrioUpd transaction, the number of task priority registers at each agent that generates IntPhysical or IntLogical transactions due to redirectable interrupts must be equal to or larger than the total number of processor contexts in the system, in order to record the complete information provided by IntPrioUpd transactions. However, some agents (such as I/O agents) may record only information related to local APIC enable/disable, and other agents (such as processor agents) may not record any information. The relevant fields of an IntPrioUpd request on CSI are shown in Figure 10-4, Table 10-6, and Table 10-7. The address field of the request contains the physical and logical APIC IDs and, for IA-32 processor-based systems, the decode type for flat or cluster addressing mode. If the Decode Type is set to 0, it indicates the flat addressing mode; otherwise it indicates the cluster addressing mode. Note that the size of the address field is implementation dependent. Implementations that support only a subset of the addressing capability should set the unsupported address bits to b0.

Figure 10-4. Address Field of IntPrioUpd Request (fields, high to low: A[51:24] Reserved; A[23:20] Logical Processor; A[19:12] Physical APIC ID; A[11:4] Logical APIC ID or EID; A[3] DT).

Table 10-6. Setting of A[51:2] in IntPrioUpd Request for Itanium® Processors
  A[3]      Decode Type: 0
  A[11:4]   Extended Physical APIC ID
  A[19:12]  Physical APIC ID
  A[23:20]  ID of the logical processor context within the CSI processor node
  A[51:24]  Reserved, set to 0x000 0000

Table 10-7. Setting of A[51:2] in IntPrioUpd Request for IA-32 Processors
  A[3]      Decode Type: 0: Flat Addressing Mode; 1: Cluster Addressing Mode
  A[11:4]   Logical APIC ID
  A[19:12]  Physical APIC ID
  A[23:20]  ID of the logical processor context within the CSI processor node
  A[51:24]  Reserved, set to 0x000 0000

The data field of the IntPrioUpd request contains one bit to indicate whether a processor context is disabled, and a 4-bit task priority field. If the Disabled bit is set to 1, it indicates that the processor context corresponding to the APIC ID indicated in the address field is disabled. Only 1 byte of data at byte location 0 is valid, with ByteEnable[7:0] set to b00000001; all other data bytes in the packet are reserved and must be ignored.

Figure 10-5. Data Field of IntPrioUpd Request (Data[7] Disabled; Data[6:4] Reserved; Data[3:0] Priority).

Based on the assumptions on the interrupt architecture stated in Section 10.1.1 and Section 10.1.2, some of which are still under investigation, the use of the IntPrioUpd transaction is optional in a system. Depending on the outcome of this investigation, there may be changes in this section of the specification in future revisions. Also, generation of the IntPrioUpd transaction is implementation specific: some processor implementations may initiate an IntPrioUpd transaction only on changes to the Disabled field or the task priority register, whereas others may generate it on any update to a task priority register.
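A minimal sketch of the IntPrioUpd encodings above, assuming the IA-32 address layout of Table 10-7 and the data layout of Figure 10-5; the helper names are not from the specification.

```c
#include <stdint.h>

/* Pack the IA-32 IntPrioUpd address fields (Table 10-7); A[51:24] reserved. */
static inline uint64_t intprioupd_addr_ia32(int cluster_mode,    /* A[3] Decode Type */
                                            uint8_t logical_apic, /* A[11:4]  */
                                            uint8_t phys_apic,    /* A[19:12] */
                                            uint8_t lp_id)        /* A[23:20] */
{
    return ((uint64_t)(cluster_mode ? 1 : 0) << 3) |
           ((uint64_t)logical_apic << 4) |
           ((uint64_t)phys_apic << 12) |
           ((uint64_t)(lp_id & 0xF) << 20);
}

/* Data byte 0 (ByteEnable[7:0] = b00000001): Disabled bit in Data[7],
 * 4-bit task priority in Data[3:0], Data[6:4] reserved. */
static inline uint8_t intprioupd_data(int disabled, uint8_t task_priority)
{
    return ((uint8_t)(disabled ? 1 : 0) << 7) | (task_priority & 0xF);
}
```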
Processor agents initiating IntPrioUpd transactions can send IntPrioUpd requests to all I/O agents in the system or in the same system partition. A processor agent may also send an IntPrioUpd request to agents other than the I/O agents. The receiving agents respond to IntPrioUpd requests with a Cmp response, which is sent back to the initiating processor agent. IntPrioUpd transactions from a processor context are required to be kept in order with respect to other IntPrioUpd transactions from the same processor context. The responsibility for maintaining order between IntPrioUpd transactions on the CSI interface rests with the initiating processor agent, which must not initiate a subsequent IntPrioUpd transaction that changes the value of the Disabled bit in the data portion until the previous IntPrioUpd transaction corresponding to the same processor context has completed. If an IntPrioUpd transaction does not change the value of the Disabled bit (e.g., only the priority value is changing for a processor context), then maintaining order between multiple IntPrioUpd transactions for the same processor context is optional, and implementations that generate such transactions are advised not to serialize them, to avoid a performance impact. I/O agents receiving IntPrioUpd transactions may keep track of the processor contexts that are enabled in the system or in the system partition. I/O agents may also keep track of the priority level of individual processor contexts and of the mapping of enabled logical and physical APIC IDs to the CSI NodeIDs of the corresponding processor agents. This information can be used by I/O agents to redirect interrupts based on priority level, and to send interrupt messages to only one CSI processor agent rather than to all processor agents in the system or in a system partition. I/O agents that take actions based on IntPrioUpd transactions should not order other inbound or outbound operations with respect to IntPrioUpd, to avoid a performance impact.

10.2.3 Interrupt Delivery for Itanium® Processor-Based Systems

Since the SAPIC interrupt architecture does not allow broadcast or multicast interrupts, the target of an interrupt can always be reliably derived from the APIC ID field in the address field of the IntPhysical transaction. If the APIC ID of the processor contexts within a CSI processor agent is derived from its CSI NodeID, then the destination NodeID for an IntPhysical request can be determined through the source address decoder or a similar mechanism. If a mechanism to determine the destination node for an IntPhysical request is provided, then the request needs to be sent only to that processor agent; otherwise the IntPhysical request can be sent to all or a set of processor agents in the system partition (even though it will be registered at only one processor context). Details of the interrupt delivery for directed and redirectable interrupts are described in the following subsections.

10.2.3.1 Directed Delivery

The CSI agent initiating the interrupt can decode the address field to determine the target CSI NodeID and sends an IntPhysical request to the target node. If an interrupt address decoder is not programmed or enabled, then the IntPhysical request may be sent to all the processor agents (excluding the source processor agent, for inter-processor interrupts) in the system partition.
The target node of an IntPhysical request is responsible for delivering the interrupt request to the corresponding local APIC identified in the address field. Alternatively, it can send an interrupt request to all the local APICs (assuming that multiple partitions sharing a processor agent have distinct physical APIC IDs for their processor contexts). The target node must also send a Cmp response back to the source of the IntPhysical request. The Cmp response should be generated only after the interrupt has been delivered to the local APIC.

10.2.3.2 Redirectable Delivery

The CSI agent initiating the interrupt can decode the address field, assuming A[3] is set to 0, to determine the target CSI NodeID and sends an IntPhysical request to the target node. If an interrupt address decoder is not programmed or enabled, then the IntPhysical request is sent to all the processor agents in the system partition. If the IntPhysical request is sent to only one CSI node, then the source agent can either set A[3] to 0 or leave it unchanged. If the IntPhysical request is sent to multiple CSI processor agents, then A[3] must be set to 0 by the source agent. If the IntPrioUpd transaction is enabled in the system partition and the source agent of an IntPhysical request keeps track of the enabled processor contexts, then the address field can be changed to redirect the interrupt to any of the enabled processor contexts in the system partition before address decode is performed to determine the target of the IntPhysical request. The target agent of an IntPhysical request is responsible for delivering the interrupt request to the corresponding local APIC identified in the address field. Alternatively, it can send an interrupt request to all the local APICs. If the IntPhysical request is received by a processor agent with A[3] set to 1, then it must set A[3] to 0 before sending the request to the local APICs. Also, if A[3] is set to 1, then the target agent has the option of redirecting the interrupt among the processor contexts it represents, by changing the APIC ID field in the address. The target node must also send a Cmp response back to the source of the IntPhysical request. The Cmp response should be generated only after the interrupt has been delivered to the local APIC.

10.2.4 Interrupt Delivery for IA-32-Based Systems

In IA-32 processor-based systems, since targets for redirectable interrupts can be specified as any one among all or a subset of processor contexts, reliable delivery of an interrupt either requires accurate knowledge of the enabled processor contexts in the system partition, or depends on software to identify a subset of targets such that all of them are enabled. Also, since the logical APIC ID of a processor context is assigned by the operating system, which may not assign it based on any relationship with CSI NodeIDs (the OS is not aware of CSI NodeIDs), mapping a logical APIC ID to a CSI NodeID may not be possible through the source address decoder, and may require an explicit mapping table to avoid sending an IntLogical request to every processor agent. Processor agents may issue an IntPrioUpd transaction whenever the local APIC associated with a processor context is enabled or disabled. In such cases IntPrioUpd requests can be sent to all I/O agents in a system partition. I/O agents may keep track of the processor contexts with enabled APICs using the information provided in IntPrioUpd requests, such that redirectable interrupts are always sent to a valid target.
Details of the interrupt delivery for directed and redirectable interrupts in IA-32 processor-based systems are described in the following subsections, with respect to the responsibilities of the source and target agents.

10.2.4.1 Directed Delivery

For interrupts with physical destination mode and ID not set to 0xFF, the CSI agent initiating the interrupt can decode the address field to determine the target CSI NodeID and sends an IntPhysical request to the target node. If the ID is set to 0xFF, then the IntPhysical request is sent to all the processor agents (excluding the source processor agent, for inter-processor interrupts) in the system partition. For interrupts with logical destination mode and ID not set to 0xFF, if a mapping table to determine the CSI NodeID from the logical APIC ID is available, then the IntLogical request can be sent to only the corresponding CSI processor agent. If an interrupt address decoder or mapping table is not enabled, or if the ID is set to 0xFF, then the IntLogical request is sent to all the processor agents (excluding the source processor agent, for inter-processor interrupts) in the system partition. The target node of an IntPhysical or IntLogical request is responsible for delivering the interrupt request to the corresponding local APIC identified in the address field. Alternatively, it can send an interrupt request to all the local APICs (for multiple partitions sharing a processor agent, this must be limited to the local APICs within a partition, since logical APIC IDs may not be unique across partitions). The target node must also send a Cmp response back to the source of the IntPhysical or IntLogical request. The Cmp response should be generated only after the interrupt has been delivered to the local APIC.

10.2.4.2 Redirectable Delivery

The target node of an IntPhysical or IntLogical request is responsible for delivering the interrupt request to the corresponding local APIC identified in the address field. Alternatively, it can send an interrupt request to all the local APICs (for multiple partitions sharing a processor agent, this must be limited to the local APICs within a partition, since logical APIC IDs may not be unique across partitions). If an IntPhysical request is received by a processor agent with A[3] set to 1, then it must set A[3] to 0 before sending the request to the local APICs. Also, if A[3] is set to 1 on IntPhysical requests, then the target agent has the option of redirecting the interrupt among the enabled processor contexts it represents, by changing the APIC ID field in the address. In case of IntPhysical, any of the enabled local processor contexts can be selected as the target. In case of IntLogical, the A[3] bit must never be set to 1. The target node must also send a Cmp response back to the source of the IntPhysical or IntLogical request. The Cmp response should be generated only after the interrupt has been delivered to the local APIC. The responsibility of the CSI agent initiating the redirectable interrupt varies depending on the addressing mode being used. The behavior for each of the interrupt addressing modes is described in the following subsections.

10.2.4.2.1 Physical Destination Mode

If A[19:12] is not set to 0xFF, the CSI agent initiating the interrupt can decode the address field, as if A[3] were set to 0, to determine the target CSI node ID and sends an IntPhysical request to the target node.
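The initiator-side rules for physical destination mode can be sketched as follows. The decoder and transmit helpers are hypothetical stand-ins for the implementation-specific source address decoder and CSI transmit path; only the 0xFF broadcast rule and the clearing of A[3] on multi-agent delivery come from this section.

```c
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins, not specification interfaces. */
static int apic_to_node(uint8_t apic_id) { return apic_id < 8 ? (int)apic_id : -1; }
static void send_intphysical(int node, uint64_t addr)
{ printf("IntPhysical -> node %d, addr 0x%llx\n", node, (unsigned long long)addr); }
static void broadcast_intphysical(uint64_t addr)
{ printf("IntPhysical -> all processor agents, addr 0x%llx\n", (unsigned long long)addr); }

static void initiate_physical_interrupt(uint64_t addr)
{
    uint8_t id = (addr >> 12) & 0xFF;   /* A[19:12] target APIC ID          */

    if (id != 0xFF) {                   /* 0xFF addresses all enabled APICs */
        int node = apic_to_node(id);
        if (node >= 0) {
            send_intphysical(node, addr);
            return;
        }
    }
    /* No decoder hit, or broadcast ID: send to all processor agents. When
     * more than one agent is targeted, A[3] must be cleared so no further
     * redirection is performed. */
    broadcast_intphysical(addr & ~(1ULL << 3));
}

int main(void) { initiate_physical_interrupt(0xFEE02000ULL); return 0; }
```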
If the source agent of an IntPhysical request keeps track of the enabled processor contexts, then the address field of the IntPhysical request can be changed to redirect the interrupt to any of the enabled processor contexts in the system partition before address decode is performed. If the IntPhysical request is sent to only one CSI target agent, then the source agent also has the option of leaving the A[3] bit unchanged, such that further redirection can be performed by the target agent. As per the interrupt architecture assumptions stated in Section 10.1.2, A[19:12] must never be set to 0xFF with redirectable delivery mode. If an interrupt address decoder is not implemented or not enabled, then the IntPhysical request may be sent to all the processor agents (note that processor agents never initiate redirectable interrupts) in the system partition. When the IntPhysical request is sent to multiple processor agents, the source CSI agent must set A[3] to 0 in the IntPhysical request.

10.2.4.2.2 Logical Destination Mode with Flat Addressing

In this case, the CSI agent initiating the interrupt leaves only one bit set to 1 in A[19:12], choosing among the bits that are already set to 1. If the CSI agent initiating the interrupt does not keep a mapping of logical APIC ID to CSI NodeID, then the IntLogical request is sent to all the processor agents in the system partition; otherwise the IntLogical request is sent to only the processor agent representing the target APIC. In all cases, A[3] must be set to 0 by the source agent in the IntLogical request.

10.2.4.2.3 Logical Destination Mode with Cluster Addressing

If A[19:16] is not set to 0xF, the CSI agent initiating the interrupt leaves only one bit set to 1 in A[15:12], choosing among the bits that are already set to 1. As per the interrupt architecture assumptions stated in Section 10.1.2, A[19:16] must never be set to 0xF with redirectable delivery mode. If the CSI agent initiating the interrupt does not keep a mapping of logical APIC ID to CSI NodeID, then the IntLogical request is sent to all the processor agents in the system partition; otherwise the IntLogical request is sent to only the processor agent representing the target APIC. In all cases, A[3] must be set to 0 by the source agent in the IntLogical request. Table 10-8 summarizes the interrupt delivery requirements for IA-32 processor-based systems.
Table 10-8. Interrupt Delivery in IA-32 Processor-Based Systems
  Mode      Sub-Mode      Int* Addr Qualifier  Interrupt Target                                                       Initiator Responsibility
  Physical  Redirectable  A[19:12] = 0xFF (a)  Chosen from list of all enabled APICs                                  Send IntPhysical to NodeID responsible for chosen APIC ID (b)
  Physical  Redirectable  A[19:12] != 0xFF     Chosen from list of all enabled APICs                                  Send IntPhysical to NodeID responsible for chosen APIC ID (b)
  Physical  Directed      A[19:12] = 0xFF      All enabled APICs                                                      IntPhysical to all processor NodeIDs
  Physical  Directed      A[19:12] != 0xFF     APIC specified with A[19:12]                                           Send IntPhysical to NodeID responsible for specified APIC ID (c)
  Logical Flat     Redirectable  N/A           Single enabled APIC chosen from the list specified in the A[19:12] bit vector   Send IntLogical to NodeID responsible for chosen APIC ID (b)
  Logical Flat     Directed      N/A           All enabled APICs specified in the A[19:12] bit vector                 Send IntLogical to NodeID responsible for all specified APIC IDs (c)
  Logical Cluster  Redirectable  A[19:16] = 0xF (d)   Chosen from indicated targets in any valid cluster              Send IntLogical to NodeID responsible for chosen APIC ID (b)
  Logical Cluster  Redirectable  A[19:16] != 0xF      Chosen from indicated targets within specified cluster          Send IntLogical to NodeID responsible for chosen APIC ID (b)
  Logical Cluster  Directed      A[19:16] = 0xF       Specified APICs in all clusters                                 Send IntLogical to NodeIDs responsible for all specified APIC IDs (c)
  Logical Cluster  Directed      A[19:16] != 0xF      Specified APICs in the specified cluster                        Send IntLogical to NodeIDs responsible for all specified APIC IDs (c)
  a. In physical destination mode, A[19:12] specifies the target APIC ID.
  b. IntPhysical requests can be sent to more than one processor agent, but in that case the A[3] bit must be set to 0. IntLogical requests can also be sent to more than one processor agent; the A[3] bit must be set to 0 for all IntLogical requests.
  c. IntPhysical or IntLogical requests can be sent to more than one processor agent.
  d. In logical cluster mode, A[19:16] specifies the target cluster or clusters.

10.3 Level Sensitive Interrupt and End Of Interrupt

A level-sensitive interrupt is used to indicate a condition rather than an event. Both edge-triggered and level-sensitive interrupts are processed in the same manner at the target processor; however, servicing a level-sensitive interrupt requires that the servicing of the interrupt be indicated to the device that generated the level-triggered interrupt. This is done using an end-of-interrupt indication. In Itanium®-based systems, a memory-mapped write to an EOI register at the corresponding I/O xAPIC provides this indication; on CSI, this causes an NcWr transaction targeting the corresponding I/O agent. In IA-32 processor-based systems, an EOI message provides this indication; on CSI, this is done using an NcMsgBEOI transaction. The NcMsgBEOI request contains the interrupt vector in its data field. The data field for this transaction is 4 bytes long and is placed in the lower 4 parameter bytes, with ByteEnable[7:0] set to b00001111. NcMsgBEOI is sent to all I/O agents in the system partition, and each I/O agent forwards it to all its I/O xAPICs. Once the EOI reaches the I/O xAPICs, the correct I/O xAPIC recognizes the interrupt vector and checks whether the interrupt condition is still active. Each I/O agent returns a Cmp response and, once all the expected Cmp responses are received by the source processor agent, the NcMsgBEOI transaction completes. The processor agent initiating an NcMsgBEOI transaction and the agents receiving this request are not required to order any other operation with respect to NcMsgBEOI.

EOI Request data field layout: Data[31:8] Reserved; Data[7:0] Vector.
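A minimal sketch of building the NcMsgBEOI payload described above; the packet structure is hypothetical, and only the byte-enable value and the vector placement come from this section.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical container for the NcMsgBEOI data portion. */
struct ncmsgb_eoi {
    uint8_t byte_enable;  /* which of the 8 data bytes are valid */
    uint8_t data[8];
};

struct ncmsgb_eoi make_eoi(uint8_t vector)
{
    struct ncmsgb_eoi pkt;
    memset(&pkt, 0, sizeof pkt);
    pkt.byte_enable = 0x0F;   /* b00001111: lower 4 parameter bytes valid */
    pkt.data[0] = vector;     /* Data[7:0] vector; Data[31:8] reserved    */
    return pkt;
}
```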
10.4 Miscellaneous Interrupts and Events

10.4.1 8259A Support

8259A interrupt controller support for systems with legacy software is enabled in CSI-based systems. An 8259A interrupt request is delivered either using the virtual legacy wire messages or through an I/O xAPIC using the message-based interrupt delivery mechanism with the delivery mode set to ExtINT (b0111). There can be only one active 8259A-equivalent interrupt controller in a system partition. The processor receiving the interrupt initiates an interrupt acknowledge operation to obtain the interrupt vector from the interrupt controller; this is done using an IntAck transaction on CSI. The system must provide the routing mechanism to direct the IntAck transaction from the source processor to the I/O agent with the active 8259A-equivalent interrupt controller. The DataNC response to this transaction from the I/O agent returns the interrupt vector number. The processor agent initiating an IntAck transaction and the agents receiving this request are not required to order any other operation with respect to IntAck.

10.4.2 INIT

This signal is part of the virtual legacy wire (NcMsgBVLW) message-based delivery mechanism on CSI; please refer to the virtual legacy wire part of the specification for further discussion. This interrupt can also be delivered using an IntPhysical or IntLogical request with the delivery mode set to b0101.

10.4.3 NMI

This signal is part of the virtual legacy wire (NcMsgBVLW) message-based delivery mechanism on CSI; please refer to the virtual legacy wire part of the specification for further discussion. This interrupt can also be delivered using an IntPhysical or IntLogical request with the delivery mode set to b0100.

10.4.4 SMI

The SMI interrupt is applicable only to IA-32 processor family based systems. This signal is part of the virtual legacy wire (NcMsgBVLW) message-based delivery mechanism on CSI; please refer to the virtual legacy wire part of the specification for further discussion. This interrupt can also be delivered using an IntPhysical or IntLogical request with the delivery mode set to b0010.

10.4.5 PMI

The PMI interrupt is applicable only to Itanium processor family based systems. This interrupt can be delivered using an IntPhysical request with the delivery mode set to b0010.

10.4.6 PCI INTA - INTD and PME

These events are handled through peer-to-peer operation on CSI. Please refer to Section 9.4, “Peer-to-Peer Transactions” on page 9-309 for further details.

10.5 Interrupt Related Configuration

• Processor (IA-32 only) and I/O agents need to identify the CSI processor agents in the system or in the same partition, and the addressing mode (flat or cluster model in logical mode) to use to determine the destinations of IntPhysical and IntLogical requests. This needs to be configured before IPIs or I/O interrupts are enabled; it also implies that broadcast interrupts or logical mode interrupts should not be used during system initialization until the interrupt-related configuration is completed.
• Processor agents (IA-32 only) need to identify the CSI I/O agents in the system or in the same partition, to determine the destinations of NcMsgBEOI requests. This needs to be configured before I/O interrupts are enabled.
• Processor agents need to identify the target I/O agent (the one with 8259A-equivalent interrupt controller support enabled) to which IntAck requests are to be sent. This needs to be configured before I/O interrupts are enabled.
• Processor agents need to identify the targets for IntPrioUpd messages, and whether these messages are enabled.
• I/O agents that do not send IntPhysical requests to all processor agents on an interrupt with physical destination mode need to know whether they are operating in a system supporting IA-32 or Itanium processors, in order to determine whether broadcast interrupts in physical destination mode are supported. This needs to be configured before I/O interrupts are enabled.
• Processor and I/O agents need to know whether the interrupt address decode mechanism is enabled, in order to use the address field in interrupt requests with physical destination mode to determine a specific CSI target node for IntPhysical requests, or whether such requests need to be sent to all processor agents in the system partition.
• Processor (IA-32 only) and I/O agents need to know whether a mapping table is available to determine the target CSI processor agent for interrupts with logical destination mode, or whether such interrupts will be sent to all processor agents in a system partition.

10.6 Reference Documents

• Intel® Itanium® Architecture Software Developer's Manual, Volume 2
• Intel® Itanium® Processor Family Interrupt Architecture Guide
• xAPIC Architecture Specification
• Intel® Pentium® Processor Software Developer's Manual
• PCI Local Bus Specification, Rev 2.3

11 Fault Handling

This chapter describes the fault handling features provided by the CSI interface to enable systems with varying degrees of reliability, availability and serviceability features. The CSI fault handling strategy differs from previous Intel-architecture-based platforms in its use of a message-based interface for error reporting. This mechanism provides a more scalable solution than a traditional bus-based error reporting mechanism in a system that supports advanced RAS features such as partitioning and dynamic reconfiguration. CSI fault handling features can be classified into the following areas: (1) error reporting, (2) fault diagnosis, and (3) fault containment. This chapter provides a description of each of these fault handling features; most of them are optional features for a platform implementation.

11.1 Definitions

Fault: An erroneous state resulting from the observed behavior deviating from the specified behavior.
Error: An error is the manifestation of a fault within a system. Not all faults result in an error.
Fault Containment: Fault containment is the process of preventing a faulty unit from causing incorrect behavior in a non-faulty unit.
Error Detection: The process that determines the deviation between observed and specified behavior.
Fault Diagnosis: The procedure by which a faulty unit is identified. In our context, this process occurs in fault handling software for uncorrectable errors. Hardware is expected to provide sufficient and unambiguous information for a successful diagnosis. The intent is to identify the faulty field replaceable unit (FRU) for servicing; this process is also referred to as FRU isolation.
Error Recovery: The process by which the effect of a fault is eliminated.

11.2 Error Classification

Errors can be classified into four classes: (1) hardware correctable errors, (2) software correctable errors, (3) recoverable errors, and (4) fatal errors. The discussion here assumes that, for all these error classes, hardware provides the capability to detect the error; errors that cannot be detected by hardware cause silent data corruption, which is an undesirable event.
• Hardware correctable errors are corrected by hardware, and software is completely oblivious to the event. Examples of such errors include single-bit ECC errors and successful link level retry. Such events may be logged and reported by the system for a post-mortem by the firmware or operating system software.
• Software correctable errors involve firmware or other software layers to eliminate the fault.
Examples of errors in this category include errors in hardware structures (e.g., a route table) that provide detection and correction capability but do not have the logic to update the structures with corrected data. Firmware or other software layers could be used to scrub such structures with corrected data.
• Recoverable errors are not corrected by the hardware or software. Such errors are contained in nature, the system state is intact, and the process and the system are restartable. The OS or other software layers may be able to recover from such errors by restarting the affected process. Examples of errors in this category include multi-bit data errors.
• Fatal errors may compromise system integrity, and continued operation may not be possible. Examples of errors in this category include protocol errors and certain types of transaction timeouts. Fatal errors can be further subdivided into errors on the CSI interface and errors within the agents using the CSI interface.

11.3 Error Reporting

This section describes various error reporting mechanisms that can be used in a system with the CSI interface. All systems and components with a CSI interface may not support all the error reporting mechanisms described here; support for and use of these reporting mechanisms are platform dependent.

11.3.1 Error Reporting Mechanisms

11.3.1.1 Interrupt

The full capability of the interrupt-based mechanisms that exist in current platforms is supported in CSI-based platforms and can be used to indicate various types of error conditions in the system. Interrupts can be used to report hardware corrected errors, software correctable errors, or uncorrectable errors that are contained in nature. Various interrupt mechanisms are available that can be used to report error events. These interrupts differ in the software entity involved in handling the interrupt and in the masking or disabling mechanism; therefore, some of these interrupts may be suitable for indicating platform-specific errors but not others. Also, depending on the severity of the error, some of these interrupts may be more suitable than others. An interrupt-based error reporting mechanism is asynchronous with respect to the system operation that resulted in the error detection. Due to the asynchronous nature of interrupts in indicating platform error events, good error logging at the point of error detection is necessary to diagnose a fault and recover from it. Types of interrupts that can be used for error reporting in Itanium and IA-32 processor-based systems are described below.

11.3.1.1.1 Itanium® Processor Family-Based Systems

Interrupts that can be used for error reporting include the corrected platform error interrupt (CPEI), the system control interrupt (SCI), and the platform management interrupt (PMI). All of these interrupts are delivered using the IntPhysical transaction on CSI. Some components in a platform may also provide the capability to initiate these interrupts through the system service processor or management controller, using interfaces other than CSI (e.g., SMBus, JTAG, etc.).

11.3.1.1.2 IA-32 Processor Family-Based Systems

Interrupts that can be used for error reporting include the system control interrupt (SCI), the non-maskable interrupt (NMI), and the system management interrupt (SMI). All of these interrupts can be delivered using the IntPhysical or IntLogical transactions on CSI. NMI and SMI can also be delivered using the NcMsgBVLW transaction on CSI.
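The four error classes of Section 11.2 and the layer expected to handle each can be summarized in a small sketch; the enum and the mapping are illustrative only, not specification names.

```c
/* Illustrative mapping from Section 11.2 error classes to handling layers. */
enum error_class {
    ERR_HW_CORRECTABLE,  /* corrected by hardware; software oblivious       */
    ERR_SW_CORRECTABLE,  /* firmware/software scrubs the faulty structure   */
    ERR_RECOVERABLE,     /* contained; OS may restart the affected process  */
    ERR_FATAL,           /* system integrity may be compromised             */
};

const char *handled_by(enum error_class c)
{
    switch (c) {
    case ERR_HW_CORRECTABLE: return "hardware (optionally logged for post-mortem)";
    case ERR_SW_CORRECTABLE: return "firmware or other software layers";
    case ERR_RECOVERABLE:    return "OS restart of the affected process";
    case ERR_FATAL:          return "error handling software / partition shutdown";
    }
    return "unknown";
}
```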
Some components in a platform may also provide the capability to initiate these interrupts through the system service processor or management controller, using interfaces other than CSI (e.g., SMBus, JTAG, etc.).

11.3.1.2 Response Status

A response status field in CSI response packets is used to indicate either a normal response to a transaction, or an exception or fault in completing the transaction. The response status field is associated with all packets under the DRS and NDR message types, and also with snoop responses under the Home message type. The response status indication is synchronous with respect to the CSI transaction affected by a fault. The response status types defined in CSI are: Normal, Abort Timeout, and Failed. A Normal response status indicates that the corresponding request completed as expected, without any exception or fault. Abort Timeout indicates that the corresponding request has not encountered a fault, but may take longer than the normal completion time; any timeout associated with the request must therefore be extended to allow additional completion time. Note that support for the Abort Timeout response status is optional. A Failed response status indicates that the corresponding request has failed to complete as expected. If a transaction results in forwarded requests from an intermediate agent, and one of the forwarded requests results in an abnormal response, then the intermediate agent is expected to reflect the abnormal response back to the source of the transaction. In case of multiple forwarded requests resulting in abnormal response statuses, the intermediate agent collects all the response statuses and creates a combined response status based on the priority of the response statuses; the Failed response status has the highest priority and overrides all other response statuses. If a forwarded request results in a response with Abort Timeout status, that status must be reflected immediately to the source agent of the transaction, with an appropriate response message, even if other responses for the transaction have not yet been received. A Failed response status is not indicated to the source of a transaction by an intermediate agent until responses for all the forwarded requests have been received, or a timeout has occurred for the forwarded requests. In systems that implement CSI transaction timeouts for fault diagnosis, it is expected that I/O agents interfacing to PCI Express that receive a Configuration Retry Status response on configuration accesses respond with Abort Timeout status, to extend the transaction timeouts at the source and intermediate agents. In a system using response messages with Abort Timeout status, and with a network that does not preserve message order between a pair of source and destination, it is possible that an Abort Timeout indication arrives after the requester has already received a normal response for the corresponding request. In such cases, the requester must ignore the Abort Timeout indication and must not indicate an error. If another transaction has by then occupied the resource with the same source node identifier and transaction identifier, the Abort Timeout indication may extend the timeout of an unrelated request. An Abort Timeout response for a transaction may also result in Abort Timeout responses for other dependent transactions initiated by the same or other agents in the system.
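The priority-combine rule for forwarded response statuses can be captured in a few lines. The enum ordering below is an assumption chosen so that a numeric comparison implements the stated priority (Failed over Abort Timeout over Normal); note that an Abort Timeout must still be reflected immediately, while Failed is reported only after all forwarded responses arrive or time out.

```c
/* Response status values ordered by priority (Section 11.3.1.2); the
 * numeric encoding is hypothetical. */
enum resp_status { RESP_NORMAL = 0, RESP_ABORT_TIMEOUT = 1, RESP_FAILED = 2 };

/* Fold one more forwarded response into the combined status: the
 * highest-priority status seen so far wins. */
enum resp_status combine_status(enum resp_status acc, enum resp_status next)
{
    return (next > acc) ? next : acc;
}
```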
An agent receiving an Abort Timeout response for a transaction should send the Abort Timeout response onward to remote requests that are at a higher level in the timeout hierarchy, and should abort timeouts for local requests that are at a higher level in the timeout hierarchy. This may cause the Abort Timeout response to cascade from one agent to others in the system; the cascading of Abort Timeout responses eventually stops once it reaches the highest level in the dependency hierarchy of transactions outstanding in the system. In case of a Failed response status on a processor-agent-initiated transaction, the result is a local machine check abort at the processor. If the transaction is initiated from a non-processor agent, then the non-processor agent may either generate an MCA message or a PMI/SMI to one of the processors in the same system partition, go viral (see Section 11.3.1.6), or assert an error signal.

11.3.1.3 Data Poisoning

Data poisoning is a mechanism to indicate uncorrected data errors corresponding to a CSI access. Each CSI data flit contains a data poisoning bit to indicate uncorrected data errors at 64-bit granularity. CSI routers from the source to the destination of the packet are expected to preserve the poison indication; the actions taken at the source and destination of poisoned data are platform dependent. A data poisoning indication on a processor-initiated read transaction typically results in a local machine check abort at the initiating processor.

11.3.1.4 Transaction Timeout

A transaction timeout is used to indicate that a transaction has not received all its expected responses within the time period allocated for the transaction. If a forwarded request for a transaction fails to complete in its allocated time, then the appropriate information is recorded in the error logs at the forwarding agent, and a response with Failed status is sent to the source agent of the transaction. The timeout of a request at the source of a transaction removes it from the list of outstanding transactions and lets other dependent operations proceed. In case of a timeout on a processor-agent-initiated transaction, the result is a local machine check abort at the processor. If the transaction is initiated from a non-processor agent, then the non-processor agent may either generate an MCA message or a PMI/SMI to one of the processors, go viral, or assert an error signal.

11.3.1.5 Machine Check Abort Message

The machine check abort message is used to indicate error conditions in the system that cannot be corrected by hardware and need immediate attention. It is typically used by non-processor agents, on detection of an uncorrected but contained error, to alert one of the processors such that error handling software can take appropriate action to either recover or shut down the system. Some systems may use interrupt messages to achieve this; however, depending on the priority of the tasks currently executing on the processor or the state of the processor, interrupts may not get processed for some time. The Itanium system architecture provides a machine check abort mechanism that cannot be masked or disabled by other tasks, and thus provides a more robust mechanism for dealing with errors. The machine check abort message on CSI enables an Itanium processor-based system to utilize this feature. The machine check abort message is delivered on CSI using the IntPhysical or IntLogical transaction with a machine check delivery mode.
This delivery mode is always used with physical destination mode, directed to a single processor context, and edge triggered. It is supported by extending the delivery mode field in the IntPhysical and IntLogical message, where D[11:8] = b1000 indicates the machine check delivery mode. The Vector field is not used with this delivery mode and must always be set to 0x00. Delivery of the machine check abort message is dependent on the firmware setting appropriate delivery modes and a valid target APIC ID field to direct this message to the appropriate processor context. Components may be designed to default all uncorrected error types to the error-signal-based reporting mechanism until the firmware has configured the system appropriately to enable the machine check message. As an optional feature, processor agents receiving the machine check abort message may record the source node identifier of the message to facilitate efficient error handling, by avoiding polling of error log structures throughout the system. An overflow indication can also be provided if more machine check abort messages are received while the source node identifier from a previous abort message has not yet been read by the error handling software. Also, the processor agent should avoid indicating an overflow if the subsequent abort messages are from the same agent as the one previously recorded.

11.3.1.6 Viral Alert

Viral alert is a mechanism to indicate a fatal error where it is difficult to avoid error propagation without immediately shutting down the system. Viral alert addresses the error propagation issue related to fatal errors and allows the system to be shut down gracefully, cleaning up the system interface and other shared resources across system partitions in the process. The viral alert capability of the CSI interface is an optional feature that may not be supported by all components and platforms. This reporting mechanism assumes that the CSI interface is operational and can be used to deliver the error indication. Each CSI packet header contains a viral alert bit to indicate whether a fatal error has compromised the system state. Each Protocol layer agent that detects a fatal error, or receives a packet that has its viral alert indication set, turns viral and starts setting the viral alert indication on all packets initiated from itself, until the agent is reset. Once an agent becomes viral, it must be assumed that its protocol state has been compromised. I/O agents may stop committing any data to permanent storage or I/O devices after they have become viral. Agents that are in a viral state may still generate new requests, to allow error handling software to gracefully shut down the system partition. The mechanisms used by a system for graceful shutdown are platform-implementation specific and outside the scope of this specification. The viral alert mechanism is transparent to the Routing and Link layers.
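A minimal sketch of viral-alert propagation under the rules above; the packet and agent structures are hypothetical.

```c
#include <stdbool.h>

/* Hypothetical packet and agent state; only the viral bit matters here. */
struct csi_packet { bool viral; /* plus header and payload fields */ };

struct protocol_agent {
    bool viral;  /* cleared only when the agent is reset */
};

void on_receive(struct protocol_agent *a, const struct csi_packet *p)
{
    if (p->viral)
        a->viral = true;  /* protocol state may be compromised */
}

void on_transmit(struct protocol_agent *a, struct csi_packet *p)
{
    if (a->viral)
        p->viral = true;  /* set the alert on every initiated packet */
    /* A viral I/O agent would also stop committing data to permanent
     * storage or I/O devices at this point. */
}
```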
11.3.1.7 Error Signal

Components are expected to provide an error output signal to report error events. The name and other details associated with the error signal are outside the scope of this specification and can be found in platform- and component-specific documents. The error signal can be used in certain classes of platforms to indicate various error conditions, and can also be used when no other reporting mechanism is appropriate. For example, the error signal can be used to indicate error conditions (even hardware correctable error conditions) that may cause a system with lock-step processors to go out of lock step, CSI interface error conditions that make the interface useless for error reporting, or all uncorrectable errors during system initialization before the firmware classifies each error type and selects the appropriate reporting mechanism.

11.3.2 Error Reporting Priority

Multiple errors can be detected at or within a single error reporting window for a protection domain. In that situation, errors are reported in the following priority (unless a higher priority is being masked) within the protection domain:
1. Fatal error (highest)
2. Recoverable or software correctable error
3. Hardware corrected error
Error reporting priority is applicable only within a partition; different partitions may be reporting errors with different priorities at the same time.

11.4 Fault Diagnosis

Fault diagnosis is the capability to identify the faulty unit after an error has been detected in the system. This capability is useful for systems with improved availability and serviceability: by providing the capability to accurately isolate faults, it allows reconfiguration of a system so that it can quickly recover from failures. The granularity of identification is dependent on the goal of diagnosis, which could be at the granularity of a field replaceable unit (FRU) in the system, a component, or a logical unit within a component. In this section, we consider fault diagnosis aspects related to the CSI interface irrespective of where it is used (i.e., between FRUs, components or units within a component) and leave the decision on which mechanisms to use to component and platform implementations, based on their respective requirements. All the mechanisms described under this category are optional features of the interface. Several factors in the design and implementation of a component and platform play a role in accurate diagnosis of faults. These include placement of error detection logic at appropriate interfaces, logging of meaningful and detailed information associated with an error, elimination of multiple indications for a single fault, identification of the first unit to detect a fault, etc. This specification does not require or provide a guideline about all of the factors that affect the fault diagnosis capability, but limits its scope to the CSI protocol and interface behavior that has a direct impact on the diagnosis capability. For example, placement of error detection logic and logging of appropriate information associated with an error are not addressed here. Elimination of multiple error indications for a single fault facilitates error handling software in successfully isolating faulty units. For example, uncorrected data flowing through multiple links may cause the error to be detected and reported from each link and logged in multiple error logs, which makes it harder for the error handling software to identify the source of the error correctly. Another condition in which multiple errors could be reported for a single fault is a dependency between multiple operations; this issue, and a mechanism to address it, are described in the following section.

11.4.1 Hierarchical Transaction Timeout

In the context of the CSI interface, a CSI request may be dependent on another CSI request.
If a CSI request fails to complete due to some fault, then all other requests dependent on it may also fail, causing a cascading effect in which multiple errors are reported to the error handler, some of them possibly out of sequence, which makes it harder to diagnose the source of the fault. One mechanism to avoid this is to organize CSI requests into a hierarchy based on their dependencies, and then assign higher timeout values to requests that depend on other requests with a lower timeout value. This allows the request that is directly affected by a fault to time out or indicate an error first, and allows other dependent transactions to proceed before their timeouts expire. CSI requests depend on other CSI requests for the following reasons:
• Functional dependency between two operations in the system. These dependencies are created when one request is functionally dependent on another request. Examples of such dependencies include coherent read requests being dependent on corresponding snoop requests, and I/O read requests being dependent on I/O-initiated writes due to the ordering requirements of PCI or PCI Express interfaces, etc.
• Network dependency due to message class or virtual channel assignment. If requests at different levels in the dependency chain share the same virtual channel in the network, and a request is not assigned pre-allocated resources at the destination, then all other requests sharing the same virtual channel must be at the same level in the timeout hierarchy or at a higher level.
• Other implementation artifacts may also cause additional dependencies between different operations, which must be taken into account in setting a proper timeout value. For example, data path dependencies between snoop probes and victim writebacks, fairness in arbitration policies at a router, etc. come under this category.
Based on the functional and network dependency characteristics of CSI transactions, the levels of the timeout hierarchy for CSI requests are shown in Table 11-1. Note that the timeout hierarchy shown here does not apply to requests generated from a primary memory agent to a secondary memory agent in a system supporting the memory mirroring operation described in Section 14.9.2.

Table 11-1. Timeout Levels for CSI Requests with Source Broadcast
  Transaction Timeout Level  CSI Request                                                                   Message Type
  1                          WbMto*, WbData, *FrcAckCnflt, Cmp_Fwd*, Frc_FwdCur                            HOM, DRS, NDR
  2                          Rd* and InvItoE at home                                                       HOM
  3                          Rd* and InvItoE at requestor; NcWr*, NcP2PWr, NcMsgB, IntPhysical, IntLogical  HOM, NCB
  4                          NcRd, NcP2PRd, NcCfg*, NcIO*, NcLT*, IntAck, IntPrioUpd, NcMsgS (a)            NCS
  5                          NcMsgSStopReq1, NcMsgSStopReq2                                                 NCS
  6                          NcMsgSLock                                                                     NCS
  a. This includes all NcMsgS messages except NcMsgSLock, NcMsgSStopReq1, and NcMsgSStopReq2.

11.4.1.1 Example Timeout Calculation

This section illustrates the computation of transaction timeout values for each level in the timeout hierarchy and points out the factors that affect the determination of these values. The values indicated in this example may not be correct for a given platform; each platform should calculate these values considering all the factors that may affect them before setting and enabling transaction-level timeouts.

11.4.1.1.1 Level 1 Timeout

Factors affecting the level 1 timeout include worst-case network delay (with minimal link width), time to process the requests at the target, and time to recover from link failure (including time taken in dynamic link width reduction and link level retry, etc.).
The worst-case network delay can be derived using the latency through a single crossbar and the maximum number of crossbars between a pair of source and destination of a packet. Latency through a fully loaded crossbar is a function of the number of ports, the flit buffers per port, and the time to transmit each flit per port, assuming that crossbar arbitration is completely fair to packets from all ports. Assuming a certain system configuration, this latency could be 16 ports x 64 flit buffers per port x 4ns per flit at quarter width = 4µs. Additional delay may have to be taken into account due to fairness in arbitration and credit flow for different message classes and non-uniform packet lengths on each port. Assuming up to 4 crossbar stages and negligible Physical layer delays per hop, the worst-case network delay may be about 4 x 4µs = 16µs. So the level 1 timeout value, taking into account processing time for a request at the target and link retry and re-initialization time, could be about 2 x 16µs + 6µs + 10µs = 48µs.

11.4.1.1.2 Level 2 Timeout

Func(Level 1 timeout, conflict resolution delay for a coherent request, network delay, snoop processing time). Implementation-specific issues and assumptions:
• At most 16 conflicting requests to the same cache line are processed serially.
• Snoop processing throughput and delay (e.g., Tanglewood internal ring arbitration policy, CSI snoop blocking conditions, etc.).
Level 2 timeout = 15 x 32µs + 2 x 16µs + negligible link recovery and snoop processing delay > 512µs.

11.4.1.1.3 Level 3 Timeout

Func(Level 2 timeout, network delay, memory access time).
Level 3 timeout = 512µs + 16µs + 16µs (for link recovery and memory access) > 544µs.

11.4.1.1.4 Level 4 Timeout

Func(depth of the inbound posted write queue at the I/O agent x Level 3 timeout, I/O bus max service time (say ~10µs) x max number of outbound delayed requests in the system). System behaviors that affect the timeout at this level:
• I/O-initiated writes to coherent memory should not encounter conflicts.
• I/O caches let multiple inbound writes to coherent memory proceed concurrently.
• MMIO, I/O port and configuration spaces are not fine-grain interleaved, so serialization due to write forks on an inbound write stream should not happen (unless multiple streams are mixed).
Level 4 timeout > 1ms.

11.4.1.1.5 Level 5 Timeout

Time to drain the ordering queue in the processor + time to drain the system request queue (reads, including all I/O reads; writes; flushing of WC buffers). The assumption is that back-to-back I/O accesses are typically very small.
Level 5 timeout = 2 x 1ms = 2ms.

11.4.1.1.6 Level 6 Timeout

Func(Level 5 timeout, network delay).
Level 6 timeout = 2ms + 2 x 16µs + link recovery time = 2ms + 64µs.
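The arithmetic above can be collected into a small program. Every constant is platform specific; these are only the example values from this section, and the few unnamed contributions (target processing, link recovery) are filled in with the values the section implies.

```c
#include <stdio.h>

int main(void)
{
    double ns = 1e-9, us = 1e-6, ms = 1e-3;

    /* Level 1: crossbar latency = ports x flit buffers x time per flit. */
    double xbar = 16 * 64 * (4 * ns);            /* ~4 us per crossbar        */
    double net  = 4 * xbar;                      /* up to 4 crossbar stages   */
    double lvl1 = 2 * net + 6 * us + 10 * us;    /* ~48 us, incl. target/retry */

    /* Level 2: 15 additional serialized conflictors + network delay. */
    double lvl2 = 15 * 32 * us + 2 * net;        /* > 512 us                  */

    double lvl3 = lvl2 + 16 * us + 16 * us;      /* + recovery + memory, >544us */
    double lvl4 = 1 * ms;                        /* posted write drain, > 1 ms */
    double lvl5 = 2 * lvl4;                      /* queue drain, = 2 ms        */
    double lvl6 = lvl5 + 2 * net + 32 * us;      /* ~= 2 ms + 64 us            */

    printf("L1=%.0fus L2=%.0fus L3=%.0fus L6=%.3fms\n",
           lvl1 / us, lvl2 / us, lvl3 / us, lvl6 / ms);
    return 0;
}
```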
This expectation presents unique challenges in systems where partitions share some of the resources, such as the network fabric. This section of the specification deals with mechanisms for providing error containment in partitioned systems where the CSI network fabric is shared across partitions.

11.5.1 Error Propagation in Partitioned Systems
The extent of error propagation between multiple partitions within a system depends on the partitioning models supported by the system. The different partitioning models are described in Section 14.2, “Partitioning Models” on page 14-399. In a system that supports only hard physical partitioning and does not allow any sharing of resources (e.g., components, protocol agents, routers or links), error propagation is eliminated by design, since there is no interaction between partitions and faults in one partition cannot manifest as errors in another partition. However, in systems with hard physical partitioning that allows sharing of resources across partitions, or in systems supporting any other partitioning model beyond hard physical partitions, the sharing of resources creates a potential for propagation of faults from one partition to another.

In partitioned systems with shared resources, the error propagation characteristics depend on the type of fault. If a fault manifests in affecting the state of a Protocol layer agent but does not affect the Routing and Link layers interacting with it, then all partitions sharing the Protocol layer agent are affected by the fault and no other partition is affected. However, if a faulty Protocol layer agent manifests in blocking the Routing and Link layers from making progress, then partitions not sharing the Protocol layer agent but sharing the Routing and Link layers are also affected by the fault.

Figure 11-1 illustrates an example of a partitioned system with shared routing and link resources: a system with two partitions, Partition A and Partition B, that share the network fabric. Partition A consists of Node 0 and Node 3, and Partition B consists of Node 1 and Node 2. Paths between nodes in their respective partitions use a common set of links and routers in the shared network fabric. In such a system, if one of the nodes in one of the partitions, say Node 2 in Partition B, has a fault which blocks packets going to Node 2 from being consumed from the shared network fabric, it may block the packet exchange between Node 0 and Node 3 in Partition A from making forward progress. As a result, a fault in Partition B results in errors in both Partition A and Partition B.

Figure 11-1. Illustration of Error Propagation Across Partitions (diagram: Node 0 and Node 3 in Partition A and Node 1 and Node 2 in Partition B, all attached to a shared fabric, the shared resource between the partitions)

11.5.2 Error Containment Through Packet Elimination
This section describes a mechanism to avoid error propagation across multiple partitions due to sharing of routing and link resources in the network fabric. The mechanism described here applies to faults in the Protocol layer agent that manifest in the routing and link resources interacting with the faulty protocol agent not making forward progress. If the fault is in the Link or the Routing layer, then the mechanism covered here is not applicable, and such faults can be handled through an end-to-end error recovery mechanism provided by the Transport layer.
The mechanism described here is applicable even in the presence of a Transport layer in the system. The error containment for the fault scenario described above requires that resources consumed in the shared fabric by the faulty partition be eventually released, and released soon enough not to cause a transaction failure in any other partition. To facilitate this, the shared fabric must have a mechanism to detect such a fault so that it can release the relevant fabric resources, and this mechanism must not rely on the protocol agents for the determination of faults in all cases. Timeouts at the links going from the shared fabric to the nodes, and from the nodes to the shared fabric, are used as the fault detection mechanism.

Fault detection through timeouts is enabled only on links interfacing to a protocol agent in the system (both on links going from the router to the protocol agent and from the protocol agent to the router); the links connecting two routers do not use the timeout mechanism to detect faults. On the links from a protocol agent to a router, the timeout is enabled only when the router is ready to accept subsequent flits of a packet but the protocol agent is unable to deliver the flits. Once a fault is detected through the timeout mechanism on a link, all packets intended to use that link are discarded and the corresponding resources are released in the link and the router. On links connecting a router to a protocol agent, packets are eliminated until the faulty protocol agent and its corresponding link interface are reset and re-initialized. On links connecting a protocol agent to a router, the packet is terminated with a poison indication (if part of the packet has already been sent) and no new packet is accepted from the faulty protocol agent until the protocol agent and its associated link interface are reset and re-initialized.

The fault detection mechanism must not cause false triggers at links connected to non-faulty protocol agents. This is facilitated through a three level timeout hierarchy, which takes into account the dependencies between packets from different message classes. A lower timeout value is used for packets in message classes lower in the dependency hierarchy, and a higher timeout value is used for packets in message classes higher in the dependency hierarchy.

Figure 11-2 illustrates the message class hierarchy for the CSI interface. Each of the nodes in this graph represents a CSI message class. The arcs with arrows between the nodes represent a dependency between two message classes. For example, the arc from the NCB message class to the NDR message class indicates that the NCB message class is dependent on the NDR message class: for packets in the NCB message class to make forward progress, it is required that packets belonging to the NDR message class make forward progress. The CSI protocol requires that pre-allocated resources be provided at the destination for packets in the HOM message class. Therefore, even though packets in the HOM message class generate packets in the NDR or DAT message class, the forward progress of packets in the HOM message class is not dependent on the NDR or DAT message class. The dependency graph of CSI message classes thus represents a three level hierarchy, where the HOM, DAT, and NDR message classes are at the first level, the SNP and NCB message classes are at the second level, and the NCS message class is at the third level.
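The level grouping just described, together with the direction rule given below (links from protocol agents to routers use the next, higher level's value so that the drain side times out first), could be encoded as in the following sketch. This is illustrative C only: the enum names follow the spec's message class mnemonics, but the function names and the example microsecond values (taken from the link timeout example of Section 11.5.2.1) are assumptions, not spec-defined interfaces.

```c
/* Three-level link-timeout assignment per Figure 11-2. */
enum msg_class { MC_HOM, MC_DAT, MC_NDR, MC_SNP, MC_NCB, MC_NCS };

static int timeout_level(enum msg_class mc)
{
    switch (mc) {
    case MC_HOM: case MC_DAT: case MC_NDR: return 1; /* pre-allocated at destination */
    case MC_SNP: case MC_NCB:              return 2;
    default:                               return 3; /* MC_NCS */
    }
}

/* Example values from Section 11.5.2.1: level 1 > 16us, level 2 ~64us,
 * level 3 > 544us. A link from a protocol agent to a router uses the
 * next level's value so that the router-to-agent (drain) side of a
 * dependency chain always times out first. */
static unsigned link_timeout_us(enum msg_class mc, int agent_to_router)
{
    static const unsigned level_us[] = { 0, 16, 64, 544 };
    int lvl = timeout_level(mc) + (agent_to_router ? 1 : 0);
    return level_us[lvl > 3 ? 3 : lvl];
}
```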
Each of the levels in the hierarchy is assigned a timeout value depending on the system configuration and the characteristics of the Routing and Protocol layer agents. On links going from a router to a protocol agent with the timeout enabled, whenever a message belonging to a message class arrives, its corresponding timeout value is used to detect whether the packet is delivered entirely within the allocated time; otherwise a fault is indicated and the packet elimination mechanism is activated. On links going from a protocol agent to a router with the timeout enabled, whenever the first flit of a message belonging to a message class is routed, its corresponding timeout value is used to detect whether the entire packet is routed through within the allocated time; otherwise a fault is indicated, the packet is terminated with a poisoned flit, and the routing connection between the input and output port is relinquished.

Figure 11-2. CSI Message Class Hierarchy (diagram: the pre-allocated message classes Data Response (DAT), Non-Data Response (NDR), and Home Messages (HOM); the mixed class Snoop Probes (SNP); and the queued classes Non-Coherent Bypass (NCB) and Non-Coherent Standard (NCS); arcs distinguish true dependencies from dependencies eliminated by preallocation)

Note that the sources of packets in the system have a dependency on the drains of packets in the system; i.e., a packet cannot be consumed by the routers from the source protocol agent unless the packets are being drained at the destination protocol agents. Because of this dependency, the timeout value used at the link from the protocol agents to the routers is set at a higher value (typically to the timeout value used by the next level in the message class hierarchy) than the timeout value used at the link from the routers to the protocol agents for the same level in the message class hierarchy. This is required to avoid false detection of faults and the resulting error propagation, and it also facilitates improved failure diagnosis.

11.5.2.1 Message Class Timeout Example
This subsection illustrates an example calculation of the link timeout values and points out factors that affect the determination of these values. The values indicated in this example may not be the correct values for a given platform; each platform should calculate these values, considering all the factors that may affect them, before setting and enabling link level timeouts. The timeout calculations in this section assume the same system configuration as assumed in Section 11.4.1.1. The link timeouts indicated here are the timeout values for links connecting a router to a protocol agent. For links connecting a protocol agent to a router, higher timeout values need to be applied, as indicated earlier.

11.5.2.1.1 Level 1 (NDR, DAT, and HOM Message Class) Link Timeout
This is a function of the worst case time to process any packet in these message classes without a pre-allocated resource by the Protocol layer agent. If this link also involves the Physical layer and Link layer, then the timeout also needs to take into account link transmission and error recovery time. In Section 11.4.1.1.1, this value was estimated to be about 16µs, so the level 1 link timeout needs to be > 16µs.

11.5.2.1.2 Level 2 (NCB and SNP Message Class) Link Timeout
This timeout is a function of the level 1 link timeout, the worst case time to propagate credits for any level 1 message classes, and the worst case time to process a packet in the NCB and SNP message classes without pre-allocated resources.
The worst case time to propagate credits for any message class depends on the total number of message classes, the number of routing stages between the farthest protocol agents, fairness in the distribution of credits and arbitration through the routers, and the distribution of resources among the different message classes for the shared adaptive buffers. This timeout is also dependent on the snoop blocking conditions and the time to process requests on I/O interfaces. The credit propagation time through the network is of the same order as the worst case network propagation delay, which was estimated to be about 16µs in Section 11.4.1.1.1. Depending on the snoop blocking time and the time to process I/O requests in the NCB message class, the level 2 link timeout could be of the order of 64µs or larger.

11.5.2.1.3 Level 3 (NCS Message Class) Link Timeout
The level 3 link timeout is a function of the level 2 timeout, the worst case time to propagate credits for level 2 message classes, the worst case time to process a packet in the NCS message class without a pre-allocated resource, and the timeout for transactions using the NCB message class that do not use pre-allocated resources. The transaction level timeout for transactions without pre-allocated resources in the NCB message class was estimated to be about 544µs in Section 11.4.1.1.3; therefore, the level 3 link timeout must be set larger than 544µs.

11.5.2.2 Effect of Adaptive Virtual Network on Link Timeouts
Since packets belonging to any message class share resources in an adaptive virtual network, care must be taken in the design of a system using adaptive virtual networks to make sure that packets in different message classes in a dependency chain do not block each other at any link or router in the system. For example, if a packet in the NCS message class is blocked due to unavailable resources at the destination, an unavailable credit from the next link, or a filled buffer on the output of a router, it must not block packets from other message classes in VN0, VN1 or VNA from reaching their destinations. This property is required to avoid a protocol level deadlock due to dependencies between packets in different message classes, and it is also relied upon to eliminate error propagation between multiple partitions sharing routing and link resources.

12.1 Introduction
This chapter addresses the reset, initialization and boot flow for a CSI-based component, and is applicable to processors that are compliant with the CSI system interface. Such processors may be parts of different system configurations and topologies - desktop/mobile/workstation/server, UP/DP/MP, partitioned/non-partitioned, etc. The reset and initialization description discusses sequences with and without an external system management controller (alternatively known as the System Service Processor or SSP). An external SSP with a JTAG or an SMBus interface is very likely to be present in a large scale system - to aid in reset and initialization and to limit the need for strapping pins; however, such an SSP is not a requirement for CSI initialization. In this context, Reset is defined to be a set of hardware based events that results in a deterministic initial hardware state. Initialization is defined to be the set of firmware or micro-code sequences that follow Reset and which prepare the hardware for execution of boot firmware.
The boot firmware then prepares the system for loading and transferring control to the operating system. The description assumes that the CSI component may be required to participate in a system with multiple OS partitions. The various partitioning models supported by the CSI interface are described in Chapter 14, “Dynamic Reconfiguration” in this document.

12.2 CSI Reset Domains
Based on the scopes of resets, the CSI interface contains four reset domains, as shown in Figure 12-1. In addition to these reset domains, the platform may implement additional system/partition wide reset domains; such reset domains are beyond the scope of CSI. The CSI component may also implement multiple PWRGOOD domains to control the supply of power to different power planes within the component. The CSI specific reset domains are as follows:
1. Physical layer and lower Link layer
2. Upper Link layer
3. Routing layer
4. Individual CSI agents on the component

An implementation may decide to either separate these reset domains or combine several reset domains into a single reset domain. Some of these domains may have a common PWRGOOD domain yet have separate reset domains; thus, the Protocol, Routing and Link layers may have a common PWRGOOD domain but separate reset domains. Separation of the reset domains is based on the usage models. Implementations that do not support some usage models may collapse the corresponding reset domains into one. For example, if an implementation does not support link width reduction, then the upper Link layer and the lower Link layer reset domains can be combined. Similarly, the division of reset domains is platform dependent, and the domain separation indicated here is not a requirement for all CSI components. Table 12-1 lists the functionality enabled by the separation of reset domains in CSI components.

Table 12-1. Justification for Reset Domain Separation

  Reset Domain                  | Justification
  Physical and Lower Link Layer | Physical layer control and initialization of an individual link. Enables self-healing of links through techniques such as link reconfiguration, link width reduction, etc., on an intermittent error or partial link failure.
  Upper Link Layer              | Initialization of the virtual channel (VC) queues of individual links. Enables on-line addition.
  Routing Layer                 | Allows sharing of the interconnect fabric across partitions. Reset to the crossbar provides the ability to reset the physical and Link layer logic of all the links.
  CSI Protocol Agent            | Allows dynamic system reconfiguration. Provides the ability to reset an individual CSI agent in a package.

Figure 12-1. Reset Domains in CSI Components (diagram: a processor component showing the lower Link and Physical layer domain with error detection, retry logic, and flit assembly; the upper Link layer domain with virtual channels for each link; the crossbar or Routing layer domain; the CSI protocol agents; and the memory, I/O, and configuration interfaces)

12.2.1 CSI Physical Layer and Lower Link Layer Reset Domain
This reset domain covers the Physical layer and the lower Link layer of the CSI interface. It includes the Physical layer control and initialization, flit assembly, error detection and some part of the Link layer retry mechanism. The lower Link layer domain covers a portion of the Link layer retry mechanism, including the Local Retry State Machine and the timers (see Section 4.9.2.4, “Link Level Retry State Machines” on page 4-196).
The Remote Retry State Machine, retry buffers, expected sequence number and any other associated control logic (see Section 4.9.2.4, “Link Level Retry State Machines” on page 4-196) are not affected by a reset to this domain.

Table 12-2. Features of CSI Physical and Lower Link Layer Reset Domain

  Coverage: Physical layer control logic and registers in the CSI Configuration Region; de-skew buffers (a); flit-assembly buffers and part of the Link layer retry logic.
  Triggers: Assertion of the RESET signal or de-assertion of the PWRGOOD signal; software controlled reset; on-line addition reset based on detection of activity on the link (this is also a reset to the upper Link layer reset domain); power management reset, such as wake up from Off or Deep Sleep state; error induced reset, such as Link layer retry failure (b); upper Link layer reset; lower Link and Physical layer reset by the other end of the link.
  Reset Actions: Reset physical and lower Link layer control logic and most registers in the CSI Configuration Region to default values. Link frequency is set to a default value only on PWRGOOD assertion; otherwise the link is initialized with the frequency setting provided in the appropriate configuration register. Reset flit assembly logic.
  Initialization Actions: Physical layer initialization; Link layer framing and initialization; Protocol layer parameter exchange.

  a. This can be optional for the sake of determinism.
  b. These errors generate an error event to the firmware layer. The firmware layer then takes steps to cause the “Reset Actions” listed above and may use the services of the SSP. Refer to the Error Handling chapter for details.

Reset to the Physical and the lower Link layer is not fatal to the system partition even if the reset is asynchronous. If other reset domains are not reset, then there is no loss of information and the system is expected to continue to operate or recover completely. Any CSI transaction time-out values must take into account the reset and initialization duration due to a physical and lower Link layer reset event caused by a Link layer retry failure. In this context, the term CSI transaction applies to a CSI protocol message, such as a read, and its associated response. It is expected that the Physical and lower Link layers on both sides of a CSI link will be reset by the reset mechanism (these may not happen at the same time). A soft reset sequence involves configuring the physical link parameters at both ends and initializing the Physical layer at one end of the link. Both sides of the physical link then re-initialize the link using the newly configured parameters.

12.2.2 CSI Upper Link Layer Reset Domain
This reset domain covers the upper Link layer of the CSI interface, which includes the virtual channel queues, retry buffers, registers in the CSI Configuration Region, status registers, etc. This domain is responsible for most of the Link layer operations, such as virtual channel flow control and Link layer retry. The Link layer error status and log registers are reset on assertion of the RESET signal or de-assertion of the PWRGOOD signal; other reset triggers do not affect the values in the error status and log registers.
Table 12-3. Features of CSI Upper Link Layer Reset Domain

  Coverage: Virtual channel queues and flow control logic; Link layer retry logic, retry buffers and pointers; Link layer configuration and status registers in the CSI Configuration Region; Link layer error log and status registers.
  Triggers: Assertion of the RESET signal or de-assertion of the PWRGOOD signal; software controlled reset; on-line addition reset; reset to the Routing layer.
  Reset Actions: Reset VC queue read/write pointers, set credit to 0. Reset retry buffer pointers. Reset link control registers, except the link frequency setting. Clear error logs and status registers (only on RESET signal assertion or PWRGOOD signal de-assertion).
  Initialization Actions: Exchange of Link layer and Protocol layer configuration parameters (see Section 4.7.6, “Parameter Exchange Ctrl Flit” on page 4-176).

A reset event to the upper Link layer domain will clear any control logic associated with storage structures, such as read and write pointers and valid bits, and reset them to their default values. The contents of the storage structures themselves, such as the VC queues and retry buffers, may not be initialized/cleared after a reset to this domain. It is expected that the Physical and lower Link layers on both sides of a CSI link will be reset by the reset mechanism, though these may not happen simultaneously. Any upper Link layer reset also causes a physical and lower Link layer reset event, and the link goes through the complete initialization sequence. The flow control and retry state machines remain in their default state until the link initialization has successfully completed.

Since a reset to the upper Link layer domain causes loss of information in the VC queues, it may be destructive to a running system that was actively using that link. To avoid a fatal system event due to an upper Link layer reset, the system must establish new routes for the source-destination pairs that were using that link. The system must also ensure that packets using the old routes are drained from the system. Such a synchronization operation is typically used before severing a link (e.g., due to an on-line deletion event) such that a subsequent activation of that link (e.g., due to an on-line addition event) can be done reliably. Reset to the upper Link layer can also be initiated by software or through system management channels to enable error logging and fault diagnosis after a fatal system event.

12.2.3 Routing Layer or Crossbar Reset Domain
This reset domain covers the Routing layer or crossbar logic on a component. The reset domain for the crossbar needs to be different from the link reset domains, since individual links need separate reset domains to support functionality such as on-line addition and on-line deletion. Further, since the crossbar may be a resource shared across multiple partitions and by multiple agents on a CSI component, its reset domain cannot be the same as the reset domains of the processor, memory or I/O agents, which typically belong to just a single partition. Similarly, the configuration agent may be a resource shared by multiple partitions. Note that the Route tables associated with the link controllers do not belong to this reset domain but are part of the configuration agent reset domain; reset of the Routing layer or the crossbar does not affect the Route tables.
Table 12-4. Features of CSI Routing Layer or Crossbar Reset Domain

  Coverage: Routing layer or crossbar.
  Triggers: Assertion of the RESET signal or de-assertion of PWRGOOD; software controlled reset; reset of the configuration agent that owns the Route tables.
  Reset Actions: Reset crossbar arbitration and control logic.
  Initialization Actions: None.

Reset to the Routing layer or the crossbar affects the internal links from the crossbar to the CSI agents on the package, as well as the external links. Reset to the Routing layer or the crossbar may be destructive in a running system that is actively using the crossbar: it causes loss of information being routed through the crossbar, which may result in the loss of a full or part of a packet. To avoid a fatal system event due to a crossbar reset, the system must take care to establish new routes for the source-destination pairs that are using the crossbar and make sure that no packets using the old routes are outstanding in the system. Refer to the Non-Coherent Protocol chapter for details of the required synchronization actions. Such a synchronization capability may not be implemented on all CSI system configurations. This reset event is typically used to enable error logging and fault diagnosis after a fatal system event.

12.2.3.1 CSI Protocol Agent Reset Domain
The CSI protocol agent reset domain covers the Protocol layer and other logic (such as buffers, control logic, etc.) associated with a CSI agent. In the most general case, all CSI agents on a component have their own independent reset domains, to facilitate partitioning and system resource management at the CSI agent granularity. For example, the different CSI agents shown in Figure 12-1 can have unique reset domains. In a more restricted case, all the CSI agents on a component may share a common reset domain, which would restrict partitioning and system resource management to the component granularity. If the component contains a CSI configuration agent, then all CSI configuration and status registers belong to the configuration agent reset domain, even if those registers are associated with other CSI agents, the Routing layer, the Link layer or the Physical layer. Reset to the configuration agent may therefore impact the functionality of the other CSI agents, the Routing layer, the Link layer and the Physical layer on the component. A component may have multiple configuration agents, as long as the scope of each configuration agent is restricted to a subset of the component and there is no overlap between their scopes.

Table 12-5. Features of CSI Protocol Agent Reset Domain

  Coverage: CSI agent Protocol layer structures and logic, data buffers, associated control logic, etc.; error log and status registers.
  Triggers: Assertion of the RESET signal or de-assertion of the PWRGOOD signal; software controlled reset.
  Reset Actions: Reset CSI protocol structures and control logic. Clear error logs and status registers (only on de-assertion of PWRGOOD).
  Initialization Actions: None.

Depending on the reset trigger, the reset to a CSI protocol agent may cause other associated units to be reset. For example, a software controlled reset to a processor agent may cause all the processing units associated with that agent to be reset. Similarly, a reset trigger to a configuration agent may cause the reset trigger to be propagated to all the agents and units within the scope of the configuration agent.
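As a compact summary of Tables 12-2 through 12-5, the trigger-dependent behavior might be modeled as below. This is an illustrative sketch in C, not spec-defined logic: all identifiers are ours, and the only spec-derived rule it encodes is that error logs and status registers are cleared only on RESET assertion or PWRGOOD de-assertion.

```c
/* Per-domain reset triggers, per Tables 12-2 through 12-5. */
enum reset_domain {
    RD_PHYS_LOWER_LINK,   /* Physical + lower Link layer         */
    RD_UPPER_LINK,        /* VC queues, retry buffers, Link CSRs */
    RD_ROUTING_XBAR,      /* Routing layer / crossbar            */
    RD_PROTOCOL_AGENT     /* per-agent Protocol layer logic      */
};

struct reset_trigger {
    unsigned reset_asserted     : 1;  /* RESET signal asserted     */
    unsigned pwrgood_deasserted : 1;  /* PWRGOOD de-asserted       */
    unsigned sw_reset           : 1;  /* software controlled reset */
    unsigned retry_failure      : 1;  /* Link layer retry failure  */
    unsigned online_add         : 1;  /* on-line addition reset    */
};

/* Error logs survive every trigger except RESET assertion or
 * PWRGOOD de-assertion, preserving them for fault diagnosis. */
static int clear_error_logs(const struct reset_trigger *t)
{
    return t->reset_asserted || t->pwrgood_deasserted;
}
```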
The software controlled reset trigger may not clear storage structures such as the Address Decode entries and the Route Table entries associated with the CSI agent.

12.3 Signals Involved in Reset
The system signals that are involved in resetting a CSI component are summarized in the sections that follow. The exact timing of these system signals is beyond the scope of this specification and will be specified in the Electrical, Mechanical and Thermal Specifications (EMTS) document for a particular component that implements CSI.

12.3.1 PWRGOOD Signal
PWRGOOD signals are used to indicate the condition of power being supplied to the various power planes on a component. A component may contain one or more PWRGOOD signals, depending on the number of different power planes in the system. PWRGOOD causes the component to come up at a default frequency, which may be controlled by platform specific strap signals. Once the clocks are stable, hardware built-in self-test engines are run. At the end of this sequence, all undefined state is cleared. Power sequencing and ramp requirements, and the relationship between PWRGOOD and RESET signals for specific components, are not part of this specification and will be specified in the EMTS document for the component.

12.3.2 RESET Signal
RESET signal(s) is the hardware reset input to the CSI component. It is driven either by the system reset control logic or an SSP, and brings all the CSI links connected to the CSI component to a known state. There are certain registers in the CSI Configuration Region that are preserved across the assertion of reset. The RESET signal may be qualified by the state of the PWRGOOD signals to clear different blocks of logic within the component. In a similar fashion, the RESET signal may be qualified by register values within the component to clear state on a CSI reset domain basis, or to clear or preserve different blocks of logic within the component. The relationship of the RESET signal to other reset signals in a system, and to other signals such as PWRGOOD, is outside the scope of this specification and will be specified in the EMTS document for the component/system.

12.3.3 CLOCK Signals
CLOCK signals are input to the component. The number and characteristics of the clock sources are beyond the scope of this specification and will be specified in the EMTS document for the component.

12.3.4 Other Configuration Signals
A component may use other signals to obtain some configuration parameters during RESET de-assertion. As discussed below, these signals are not required by the CSI specification, and the equivalent indications may be derived using other mechanisms such as the SSP or Link layer parameter exchanges. The exchanged configuration information is stored by the configuration agent within the CSI component in its CSRs and later provided to the various CSI agents in an implementation specific manner. Separate signals to set configuration parameters are not required if the default values for these parameters are appropriate for component initialization.

12.3.4.1 Presence of a System Service Processor
In systems where initialization can be performed either by a firmware/configuration agent internal to the CSI component or by an external SSP, strap pin(s) are required to indicate whether the internal firmware/configuration agent should initiate the initialization sequence or wait for the SSP to do so.
If an SSP is present, the firmware/configuration agent lets the SSP perform the initialization steps and co-operates with the SSP in an implementation dependent manner.

12.3.4.2 Node Identifiers
Unique Node Ids are required for communication between CSI agents in a system. A physical CSI component may be assigned multiple Node Id values. The configuration agent is a required agent within each CSI component and is part of the component’s system interface. The term system interface refers to the set of logic within a CSI component that provides the interface to other system components. The configuration agent may have its own Node Id or may share its Node Id with other agents in that node. A crossbar with neither processor nor memory agents will still require a Node Id for its configuration agent. Depending on the exact system configuration, there are three options for the assignment of node identifiers to CSI components that minimize the number of strapping pins. These are described in Table 12-6.

Table 12-6. Node Identifier Options

  System: UP / DP
  Option: Default Node Ids for the processor and the external system control chipset, assigned either as
    1. Default Node Id values for the external system control chipset. The system control chipset communicates its crossbar port number, through which the processors are connected; this is used as the processor agent’s Node Id (a). OR
    2. Default Node Id values for the external system control chipset and a strap pin value for the two processors.

  System: Small MP / Large MP
  Option:
    1. Sufficient strapping pins for the Node Id requirements of processors, I/O and memory/directory agents, or
    2. Set by the external SSP.

  a. For this scheme to work, the agent responsible for link initialization (e.g., firmware execution on a processor or configuration agent hardware) waits for the assignment of a Node Id to the processor agent (i.e., Remote Port#) by the system control chipset and only thereafter attempts link initialization on its other links. The system control chipset must perform its link initializations on the various links within a short time interval, so that subsequent initializations of processor-to-processor links can complete successfully.

If a component requires multiple Node Ids, it may derive these values in an implementation specific manner, e.g., by suffixing bits to the input signals. For example, a CSI component with two CSI agents may derive the high order bits of its Node Id from strapping pins and append one low order bit to derive two Node Ids.

12.3.4.3 Implementation Specific Information
All components with a CSI interface are expected to initialize to some default parameter values after PWRGOOD signal assertion and RESET signal de-assertion. Some components may require additional default configuration parameters over and above the CSI requirements. These parameter values may be set by signals or exchanged as part of the Link/Protocol layer parameter exchange process. The details of such parameters are component/implementation-specific and will be described fully in other documents.

12.4 Initialization Timeline
This section gives a broad overview of the CSI Component Initialization Timeline.
Physical Layer Initialization: includes impedance matching, current source calibration, Receiver offset cancellation, channel equalization, Receiver training, and link frequency and link width determination.

Link/Protocol Initialization: includes flit framing, error detection and recovery policy, interleaving policy, virtual channel and flow control capability, etc.

Component Initialization:
• Assignment of unique CSI node identifiers to each local CSI agent
• Exchange of Node Id/AgentType/Port#, power on configuration parameters, test and debug parameters, etc.
• Selection of configurable link, protocol and system/socket level parameters
• Routing of firmware and configuration accesses before system configuration is complete, discovery of firmware storage, etc.

System Initialization:
• Selection of the System/Partition boot strap processor (SBSP/PBSP)
• Discovery of system topology
• Establishing routes between CSI agents
• Mapping of the system address space
• Setting up domains for cache coherency, interrupt broadcasts, etc.
• Initializing platform components
• Booting the OS, etc.

12.5 Firmware Classification
A CSI system may have one or more firmware constituents to perform the different reset initialization steps. The overall firmware functionality can also be aggregated into a single CSI Firmware agent. The division of functionality among the various firmware constituents is platform dependent.

1. Embedded (or on-board) Firmware: This is part of the CSI processor, incorporating code to perform functionality such as firmware authentication, link initialization, etc. Some processor implementations may incorporate these functions in microcode. If this firmware constituent is present, the hardware must also set up the address decoders and routing structures covering the Embedded firmware address range, to enable execution of the Embedded firmware by the cores. If present, the Embedded firmware will be the first firmware layer to execute, and it will authenticate the other firmware constituents and then use only such authenticated firmware. This behavior is applicable even in the presence of an SSP. On processor agents with multiple cores, the authentication of firmware can be limited to one core, and the component’s system interface may provide an implementation dependent mechanism to elect one of the cores for performing the authentication.

2. Direct Connect Firmware: This firmware constituent is directly connected to a CSI processor agent using a non-CSI interface that is implicitly initialized by hardware prior to core Reset de-assertion. Such firmware is typically used to perform the link initialization, processor testing, and setting up access to other firmware constituents needed for platform initialization.

3. Platform Initialization Firmware off a CSI Firmware agent: This firmware constituent contains code to initialize the platform. The firmware implementation may also have logic to set up CSI structures such as the Route tables and address decoders, load other firmware layers such as the Extensible Firmware Interface (EFI), and boot the partition’s OS, etc. If this functionality is not contained in the Embedded or Direct Connect Firmware, then the Embedded or Direct Connect Firmware or hardware must locate a Platform Initialization CSI Firmware agent in order to boot the OS; otherwise such a processor agent must go to a wait state.
A Platform Initialization Firmware Agent shall not respond as a CSI Firmware Agent in the AgentType field during the Link layer parameter exchange unless it contains the code for the Reset vector. Refer to Section 4.6.2.26, “Response Data State - 4b - PL” on page 4-168 for additional details of agent types. If Embedded or Direct Connect firmware is present, such firmware can have logic to probe the platform and locate the platform initialization firmware. Requirement: In the absence of Embedded or Direct Connect Firmware, at a minimum one processor agent in the system must have a one hop CSI connection to Platform Initialization Firmware, and such a component shall respond as a CSI Firmware agent. There may be multiple copies of this firmware constituent to enable faster booting on a multi-node system. If multiple CSI links indicate the presence of firmware agents, then one of the links is selected for routing of firmware accesses; such selection is implementation specific.

12.5.1 Routing of Firmware Accesses
During the initialization phases, the processor agents use a fixed address range to access firmware. The address decoders for this address range must be set up to generate non-coherent RdCode CSI transactions to the firmware space. The address of the firmware within a RdCode CSI transaction is used to select between Embedded, Direct Connect and external firmware agents. A CSI firmware agent has the property that a processor agent is not required to initialize the Route tables or any other configuration register at the CSI firmware agent in order to access the firmware. Requirement: The CSI firmware agent shall return responses to the firmware accesses on the same port on which the request was received, even though its Route table has not been set up.

12.6 Link Initialization
Once the SSP, the local configuration agent, or a local processor agent (if using the Firmware option) writes the Node Id, AgentType and other parameter information into the link controllers, link initialization can commence. On some implementations, the two ends of the link synchronize on interlock messages and then exchange the parameter information. Hence, the setup of the Protocol layer parameters must precede the transmission of the interlock messages. Refer to Chapter 4, “CSI Link Layer” for further details. If link initialization is successful, the CSI Node Ids, agent types and crossbar port number of the neighbor associated with the link are latched by the link controller from the control flit messages exchanged during the Link layer initialization.

12.6.1 Link Initialization Options
There are multiple options for link initialization, described in Section 12.6.1.1 and Section 12.6.1.2:
• Firmware
• System Service Processor that uses interfaces such as the SMBus or JTAG to initialize the CSI components on the platform

The link initialization options using firmware and SSP are not mutually exclusive. On platforms which have an SSP, the firmware and the SSP may co-operate in configuring the various CSI components on the platform. For example, in a platform where the SSP does not have access to an external memory controller, it may rely on the firmware to perform the necessary configuration steps. Similarly, since the link controller resources are owned by the local configuration agent, the SSP may send its commands to the local configuration agent and expect the latter to program the link controller registers.
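As a concrete picture of the information latched per link controller during the parameter exchange described above, consider the sketch below. It is illustrative C only; the field names and widths are assumptions, not spec-defined layouts.

```c
#include <stdint.h>

/* Triplet latched by a link controller after a successful Link layer
 * parameter exchange: the neighbor's Node Id, its agent types
 * (exchanged as a bit vector), and its crossbar port# for this link. */
struct neighbor_info {
    uint16_t node_id;      /* neighbor's CSI Node Id                  */
    uint16_t agent_types;  /* bit vector of CSI agent types           */
    uint8_t  remote_port;  /* neighbor's crossbar port# for this link */
};
```

For the example used later in Section 12.8.2: if Node1's port#5 connects to Node2's port#3, then Node1's controller for port 5 latches remote_port = 3, and Node2's controller for port 3 latches remote_port = 5.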
12.6.1.1 Link Initialization Using Firmware
Link initialization using firmware relies on Embedded firmware or Direct Connect firmware to perform link initialization. The execution of the firmware occurs at reset de-assertion. The following are the high level steps for link initialization; these may be performed by firmware or hardware (such as the local configuration agent) or any combination of both:

1. The hardware initializes the address decoder and routing structures needed to access the firmware for performing link initialization, if such firmware is external to the core. This must be done prior to de-assertion of core RESET.
2. Create the address decoders and other structures for internal CSR accesses by the various CSI agents on the component. Since access to the local CSRs’ address space itself requires an address decoder, this step is best implemented in hardware. The other option is to create a shadow area that provides access to the CSR space and implement special hardware support such that the shadow area does not require an address decoder entry.
3. Create routing table entries or internal structures to address the local CSI agents, so that the crossbar may route incoming packets targeted to the local CSI agents.
4. Determine the Node Ids of the local CSI agents from platform specific signals, and program the local Node Ids and Agent types into the local Link Controller CSRs.
5. Provide an indication to the link controllers to begin link initialization. At the end of successful Physical layer initialization, both ends of the link will transmit the Link layer interlock message. Refer to Chapter 4, “CSI Link Layer” for details.
6. On successful link establishment, capture the exchanged parameters, including the neighbors’ Node Ids, AgentTypes and Remote port #s.
7. Identify the Platform Initialization firmware component to be used for the rest of the initialization (if present and necessary) and the link port# to be used to access such firmware.
8. Set up the address decoders and Route table entries to access the Platform Initialization firmware.

Once this step is performed, the rest of the platform initialization may rely on the Platform Initialization firmware.

12.6.1.2 Link Initialization Using the System Service Processor
The system management channels between the SSP and the processor agent are enabled prior to core reset de-assertion to allow the SSP to access the CSRs on the CSI component. A full or a partial initialization of the CSI structures can be done through the SSP before activating the processors and other components in a system partition. Thus, the Node Id, Address Decoder, Route tables, Participant Lists, and other CSRs related to routing of coherent traffic can be set up by the SSP prior to the de-assertion of the RESET signal. Note that the SSP option still requires firmware to perform the later stages of system reset initialization. The definition of the system management and firmware interfaces, as well as the algorithms used for system configuration, are platform specific and beyond the scope of this specification.

12.6.2 Exchange of System/Socket Level Parameters
The Link layer parameter exchange protocol permits CSI agents to pass parameters that can be used for initialization of the recipient CSI agent. Refer to the description of Section 4.7.1, “Special Packet Format” on page 4-173 in Chapter 4, “CSI Link Layer” for details.
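The eight firmware-driven steps of Section 12.6.1.1 above can be sketched as a single flow. This is an illustrative outline in C; every function name is a hypothetical placeholder for platform-specific logic, not a spec-defined interface.

```c
/* Outline of the Section 12.6.1.1 firmware link-initialization flow.
 * All functions are hypothetical placeholders. */
extern void setup_firmware_decoders(void);      /* step 1: reach firmware pre-RESET */
extern void setup_local_csr_access(void);       /* step 2: CSR decoders/shadow area */
extern void setup_local_routes(void);           /* step 3: route inbound to agents  */
extern void program_node_ids_from_straps(void); /* step 4: Node Ids + AgentTypes    */
extern void start_link_init(void);              /* step 5: triggers interlock msgs  */
extern void capture_neighbor_params(void);      /* step 6: Node Ids, types, port #s */
extern void locate_platform_init_fw(void);      /* step 7: pick PI firmware + port# */
extern void setup_pi_fw_routes(void);           /* step 8: decoders + Route entries */

static void fw_link_init(void)
{
    setup_firmware_decoders();
    setup_local_csr_access();
    setup_local_routes();
    program_node_ids_from_straps();
    start_link_init();
    capture_neighbor_params();
    locate_platform_init_fw();
    setup_pi_fw_routes();
}
```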
This packet permits 32 bits of information to be passed, sub-divided into two fields as follows:
• Bits 0-7: System Type
• Bits 8-31: Power On Configuration (POC) values described in product specific documents

Table 12-7 defines the System Type usage. Unused values are reserved.

Table 12-7. System Type Values

  System Type Value | Usage
  0                 | No information
  1                 | POC values for IA-32 cores in UP configuration
  2                 | POC values for IA-32 cores in DP configuration
  3                 | POC values for IA-32 cores in Small MP configuration
  4                 | POC values for IA-32 cores in Large MP configuration
  8                 | POC values for IA-32 Mobile cores
  12                | POC values for Itanium processor cores in UP configuration
  13                | POC values for Itanium processor cores in DP configuration
  14                | POC values for Itanium processor cores in Small MP configuration
  15                | POC values for Itanium processor cores in Large MP configuration
  16                | POC values for Memory Agents
  20                | POC values for I/O Agents

This scheme allows transfer of the system/socket layer power on configuration values, which aid the platform firmware during reset initialization. This capability allows processors and other CSI agents to be initialized with the right POC values, thereby avoiding multiple resets during the booting process. Some examples of such POC values are listed below:
• Platform Input Clock to Core Clock Ratio
• Enable/disable LT
• Configurable Restart
• Burn-In Initialization Mode
• Disable Hyper-Threading
• System BSP Socket Indication
• Platform Topology Index

A component’s system interface logic is expected to latch the parameters and then pass them to the other agents within the CSI component. A particular implementation may discard the system/socket layer parameters without causing any loss of functionality. Such a discard imposes a higher burden on the platform firmware, which should be capable of retrieving the relevant information from the platform. Some of the parameters may be applicable system wide, while others may be applicable at the granularity of a socket, a CSI agent, or a context within a CSI agent. The exact usage will be defined in product specific documents. The expected usage model is to provide a set of parameter values to a socket on one of the links to that socket. If differing values are provided for the same system/socket parameter through multiple links, the results may be unpredictable. The system interface on the recipient CSI component may have limited space to retain the exchanged values.

12.7 System BSP Determination
Setup and revision of the Route tables needs to be done consistently across all the nodes in the system, and there is a danger of creating routing cycles if multiple entities program different subsets of the Route tables simultaneously. Since the CSRs in multiple nodes must be revised atomically and consistently, this function can only be performed by one processor in the system. Such a processor is designated as the System BSP, and it may perform other functions, such as booting the OS. The SBSP must be directly connected to a Platform Initialization firmware agent by a CSI link, and this is typically an I/O+Firmware (IO+FW) agent. Determination of the SBSP is platform and firmware implementation specific. Typically, one of the cores within a node is elected as a Node BSP (NBSP), and the NBSPs in the system vie for the SBSP status.
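Referring back to the parameter packet of Section 12.6.2, its 32-bit payload might be modeled as below. This is an illustrative C rendering: the bit positions and the encodings follow Table 12-7 above, while the type and enumerator names are ours.

```c
#include <stdint.h>

/* 32-bit system/socket layer parameter field (Section 12.6.2):
 * bits 0-7 carry the System Type, bits 8-31 carry POC values. */
struct sys_socket_params {
    uint32_t system_type : 8;   /* Table 12-7 encoding           */
    uint32_t poc_values  : 24;  /* product specific POC settings */
};

/* System Type encodings from Table 12-7 (unused values reserved). */
enum system_type {
    SYS_NO_INFO       = 0,
    SYS_IA32_UP       = 1,
    SYS_IA32_DP       = 2,
    SYS_IA32_SMALL_MP = 3,
    SYS_IA32_LARGE_MP = 4,
    SYS_IA32_MOBILE   = 8,
    SYS_IPF_UP        = 12,   /* Itanium processor cores */
    SYS_IPF_DP        = 13,
    SYS_IPF_SMALL_MP  = 14,
    SYS_IPF_LARGE_MP  = 15,
    SYS_MEMORY_AGENT  = 16,
    SYS_IO_AGENT      = 20
};
```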
The NBSP designation may be done by the system interface, or the cores within a node may elect one using a semaphore in the system interface. The following are examples of SBSP determination:

1. The processor socket implements a strap designating the socket as the SBSP socket, and one of the cores on that socket is elected as the SBSP. The SBSP socket must have a direct connection to a Platform Initialization firmware agent; thus Node1, directly connected to IOH1 in Figure 12-2, can serve as the SBSP socket.
2. The platform implements a semaphore that is accessed by all the processors. For example, one of the IO+FW agents in the system implements a strap for a LegacyIOH designation and a semaphore. All processors in nodes that have a CSI connection to an IO+FW agent first verify whether they are connected to the agent designated as the legacy IOH and, if so, try to acquire the semaphore for the SBSP designation.
3. One IO+FW agent in the platform contains a strap designating it as the Legacy IOH in the system, as in option 2 above. During the Protocol layer parameter exchange, the LegacyIOH can use the system/socket layer parameter packet to designate one processor agent as the SBSP socket. The processors within this socket then elect one of the cores as the SBSP. Even the core# to serve as the SBSP can be part of the Protocol layer parameter exchange.

12.8 CSI Component Initialization Requirements
CSI-based components need some initialization steps before they can interact with each other or with other platform components. Some of the high level features and initialization requirements are indicated below:
• CSI Node Ids need to be set up prior to successful communication with other CSI agents, and these need to be unique system-wide.
• CSI links need to be initialized prior to their use. The link initialization sequences, negotiation of link parameters, and the related Link and Protocol layer parameter exchange are described in Chapter 4, “CSI Link Layer”. The links can have non-uniform link speeds and features.
• The system topology is not fixed. It is discovered by probing the CSI links, identifying the neighbors, and then establishing the routes between CSI agents in the Route tables. Even the firmware agent must first be discovered and then used to initialize the system.
• The reset signals of the cores and chipsets may not be synchronized.
• CSRs for the various Participant Lists must be set up before sending coherent transactions such as snoops or broadcast interrupts to other CSI agents.
• Routing of core accesses to memory and the platform uses interconnect fabric hardware structures, such as address decoders and Route tables, which need to be initialized.

12.8.1 Support for Fabric Initialization
Initializing the Route tables of a CSI component is essential to establish communication with the rest of the platform. In a system environment, there may be CSI agents, such as memory agents or I/O agents, that cannot initialize their Route tables by themselves, and a remote processor agent may have to perform this function on their behalf. This requires that such an agent be able to route remote configuration accesses even though its own Route tables are not fully initialized. The CSI interface relies on the following mechanisms to support this functionality:
• Components communicate their Node Ids, their corresponding agent types and port identifiers during link initialization to the remote components.
The received port identifier indicates the crossbar port number on the remote component; if there is no crossbar on the remote component, then the port identifier is specified as 0. The remote Configuration agent’s Node Id is used to derive the addresses of the CSRs representing the Route tables, Address decoders, etc., at the remote component. Using such a derived address, the CSI structures of the I/O, Memory and Directory agents can be programmed by a processor agent.
• Components with processor agents that do not have an Embedded/Direct Connect firmware agent may use the exchanged information to detect the presence of a firmware agent on a remote component in the system. If found, the local processor agent may route its firmware accesses to the remote firmware agent through the CSI interface before its own Route tables are initialized. Some CSI implementations may not support this type of routing; alternatively, such processor agents may wait for another agent to set up the path to the firmware, as described in Section 12.8.2.

12.8.2 Programming of CSI Structures
This section describes the early steps in the booting sequence of CSI platforms and identifies the hardware requirements (listed in the text below) on the CSI platform. There are many possible firmware implementations for programming the CSI structures, and the sequences described below may not be optimal for all CSI platforms. A simple system topology, shown in Figure 12-2, is used to illustrate the various issues with system initialization sequences. Also refer to Chapter 5, “Routing Layer” for issues pertaining to Route table setup. This system has 3 processor nodes, Node1, Node2 and Node3, connected in a serial fashion. It also has two I/O agents, IOH1 and IOH2, both of which are also firmware agents. IOH1 is connected to Node1 and IOH2 is connected to Node3. The platform follows the convention of designating one of the IOHs as a LegacyIOH, and a processor that acquires a semaphore in the LegacyIOH becomes the SBSP, which is responsible for the platform initialization and booting the OS.

Figure 12-2. Example System Topology Diagram (diagram: Node1, Node2 and Node3 processors connected by CSI links into an interconnection network; IOH1 with its firmware hub attached to Node1, and IOH2 with its firmware hub attached to Node3. FWH: Firmware Hub; IOH: IO Hub)

The following are the high level programming steps for the initialization of CSI structures within the various CSI components. If these steps are implemented using firmware, also refer to Section 12.6.1.1, “Link Initialization Using Firmware” for the requirements on firmware execution.

1. NBSP selection: The cores within the CSI component elect a Node BSP using an implementation dependent mechanism. The NBSP proceeds with the component initialization while the other cores (designated as Attached Processors or APs) go to sleep. On IA-32 processors, such APs go to a “wait-for-SIPI” state in the microcode; on Itanium processors, such APs execute a halt. Requirement: The system interface must provide a mechanism to elect one core as the Node BSP, and the ability for a core to go to sleep and then be woken up by another core or the SSP.
2. Boot Mode Determination: The CSI component may implement different booting modes based on some platform signals. These signals are read by the NBSP. If the signals indicate the presence of an SSP, the NBSP may signal to the SSP in an implementation specific manner and then go to sleep.
The SSP can then proceed with the initialization and will eventually wake up the SBSP. The SSP can obviate a number of the steps below by co-operating with the configuration agent on the CSI component to achieve link initialization, and then initializing CSI specific structures in the platform such as the Route tables, Source and Target Address Decoders, and Participant Lists for Snoop, electing a SBSP, etc. The SSP will then wake up the cores to perform the traditional system initialization. The rest of the description in this section does not assume the presence of an SSP.

3. Node Id Determination: The NBSP reads the platform signals that provide the Node Id information for the local CSI agents, and programs its internal registers as well as the link controller CSRs prior to triggering Link layer initialization.
4. Link initialization: Link initialization occurs as described in Section 12.6.1. Each CSI link controller latches one or more information triplets of its neighbor’s CSI agents, i.e., Node Id, AgentType and the neighbor’s crossbar port# through which the neighbor is connected. For example, if Node1’s port#5 is connected to Node2’s port#3, then following the Link layer parameter exchanges, Node1 will latch 3 as the neighbor’s port# on its port #5 and Node2 will latch 5 as the neighbor’s port# on its port #3. The AgentType is exchanged as a bit vector that identifies the types of CSI agents connected to the port. Thus, if both IOH1 and IOH2 have firmware for the reset vector, they will both respond as IO+FW agents. If none of the links of a component have Platform Initialization firmware directly connected (such as on Node2), such a component shall perform link initialization using Embedded firmware or equivalent hardware and then go to sleep. Such a component’s CSRs will then be capable of being programmed by its neighbors. Thus, Node2 in the system topology of Figure 12-2 does not have a direct connection to firmware, and this can be determined by the firmware on Node1 by reading the latched values in Node2’s link controller CSRs. Some other processor in the system (e.g., Node1) that has a single hop connection to the Platform Initialization firmware will later set up the address decoder and Route table of Node2 such that firmware accesses by Node2 go through Node1, and then wake up Node2. Requirement: A CSI agent that contains the code for the Reset vector shall respond as a Firmware Agent during the Link layer parameter exchange. Note: The Embedded/Direct Connect firmware may be capable of probing the platform and locating the rest of the firmware needed for system initialization; in such cases, it need not rely on the firmware agent indication provided during the Link layer parameter exchange and may ignore such an indication.
5. Setting POC Values: Once all the links are initialized, any system/socket level power on configuration values latched during the Link layer parameter exchanges (see Section 12.6.2) are provided by the system interface to the cores and other CSI agents.
6. Setting the path to the Platform Initialization Firmware: If the entire firmware is not contained in the Embedded or Direct Connect firmware, the path to the rest of the firmware is set up using one of the CSI ports on which such a firmware agent was detected. Refer to steps 7 and 8 of Section 12.6.1.1, “Link Initialization Using Firmware” for details.
7. Firmware Authentication: The Embedded firmware shall authenticate the other firmware constituents in an implementation specific manner prior to their use.
8. Processor Initialization at Reset: The processors begin execution at the Reset vector. For IA-32 systems, processor initialization is performed by microcode (if not already done), and then the BIOS code responsible for platform initialization gains control. For Itanium processor systems, the initial code would be PAL_A firmware, which initializes the processor; the PAL_A firmware then gives control to the SAL_A firmware, which performs the platform initialization steps.

9. SBSP Selection: The NBSPs must elect an SBSP as described in Section 12.7. If a semaphore needs to be acquired to obtain SBSP status and such a semaphore is located in a platform resource, firmware must set up the address decoders and Route table entries needed to access it (see the sketch following step 11 below). The SBSP performs most of the system initialization steps while the other processors wait for a signal from the SBSP. The other NBSPs effectively transform their status to Attached Processors, switch to a Halt state, and do not contend for firmware or memory accesses. Halt state support on the processor is a requirement, as routing errors could occur if an Application Processor (AP) were executing a wait loop in firmware/memory while the SBSP revised the Route tables of the AP. Requirement: The processors must support a Halt state in which instruction execution is stopped and accesses to internal/external resources do not occur.

10. Route Table Setup: Refer to Section 5.11, “Routing Table Setup after System Reset/Bootup” on page 5-218 for a detailed discussion of system topology discovery and the setup of route tables for the entire system. In short, once all processor nodes have completed their local Route table initialization, the SBSP proceeds with Route table initialization for the platform while the others wait for a wake-up from the SBSP. The Route table initialization values required for the programming may be part of the firmware space, thus obviating the need for deriving the Route table information dynamically. Dynamic derivation of Route tables would be required only when the platform configuration has changed and the current tables in the firmware are not appropriate. Even these cases can be minimized if the firmware can boot in UP mode and load a new set of tables into the firmware address space from an EFI System Partition. In a multi-partition system, it is possible to elect a unique PBSP for each partition and perform the Route table setup in a parallel fashion. This scheme presents no firmware issues on a system with hard partitions. If partitions use a common interconnect fabric, the firmware executions on these partitions must coordinate the programming of common resources such as the crossbar Route tables.

11. Lock_Master: Execution of some LOCK instructions on an IA-32 system can cause a request to go to a Lock Master which, in turn, broadcasts special CSI transactions to other nodes in the partition. On IA-32 DP/MP systems, each partition must set up the necessary CSI structures for supporting the IA-32 LOCK instruction. Processor agents will have a CSR containing the Node Id of the Lock Master Target (typically an I/O Hub), and the Lock Master Target Node will have a participant list that contains the scope of the CSI StartReq/StopReq transactions issued during the LOCK sequence. Refer to the Non-Coherent Protocol chapter for details.
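The semaphore acquisition in step 9 might look like the following sketch. The CSR address and the read-to-acquire semantics are assumptions: here the semaphore CSR is presumed to return 0 to the first reader and to latch busy atomically, which is one common hardware idiom; an actual platform may define different semantics.

    #include <stdint.h>

    /* Hypothetical semaphore CSR in the LegacyIOH (address is illustrative). */
    #define SEMAPHORE_CSR  0xFED40000ULL

    static inline uint32_t csr_read32(uint64_t addr)
    {
        return *(volatile uint32_t *)(uintptr_t)addr;
    }

    /* Returns 1 if this NBSP acquired the semaphore and is now the SBSP.
     * Assumption: a read returns 0 when the semaphore is free and atomically
     * sets the busy bit, so exactly one reader wins the election. */
    static int try_become_sbsp(void)
    {
        return csr_read32(SEMAPHORE_CSR) == 0;
    }

Losing NBSPs would then switch to the Halt state and wait for the SBSP's wake-up signal, as described in step 9.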
Firmware shall not use the LOCK instruction until the Lock Master Target for the partition is established and the required CSRs for the Lock Master Target and the Lock Master Scope List are programmed.

12. Activating Other Nodes: The SBSP can program the Route tables of processors without an immediate firmware connection (such as Node2) and then wake them up in order to perform the necessary processor initialization. The SBSP can indicate to Node2 (through memory or a platform resource) that its Route tables have been programmed and that Node2 shall not perform system topology discovery.

13. Setting up Address Maps: The SBSP discovers the platform hardware and programs the address decoders to access the system address map. The SBSP can invoke other NBSPs for this step, as necessary. All processors set up their source address decoders to match the system address space. The processors also set up the source and destination address decoders of the IOHs in the partition. If an IOH is used by multiple OS partitions, the IOH's Participant List registers must be set up to describe the resources belonging to each partition - processors, PCIE busses, downstream south bridge chipsets, etc. Such a participant list may be used by an I/O bridge to partition busses among various OS domains and to support overlapping system-wide MMIO addresses between partitions.

14. Integrating Processors belonging to the OS Partition: The SBSP determines the other processor nodes for the partition from a platform implementation-dependent location (e.g., firmware or non-volatile memory space) and then rounds up its processors. It may then relinquish the semaphore on the LegacyIOH so that another NBSP can boot its OS partition. There are multiple possible implementations of this functionality.

15. Enabling Coherence Traffic: Processors belonging to the same OS partition set up their CSI structures consistently – address decoders, Participant Lists for Snoop, etc. Coherence traffic is enabled only after this stage. The APs are sent to a wait loop: for Itanium processor systems this is a wait loop within the SAL firmware, and for IA-32 systems this is a “wait-for-SIPI” loop in the microcode. Requirement: If a CSI agent has a non-CSI link with another CSI agent in the system (e.g., an IOH-IOH link where both IOHs are CSI agents), the non-CSI link must be severed when coherence traffic is enabled; otherwise there is a potential for cache coherence and/or ordering violations.

16. Booting the OS: The PBSP initializes the other platform devices. It may wake up other NBSPs and APs to perform some of the platform discovery and initialization steps. The PBSP then boots a shrink-wrap OS, which is oblivious of the system being CSI-based.

12.9 Support for On-Line Addition

In case of an on-line addition, the added node will be provided with the HotAdd indicator, and this will be used by firmware to limit the scope of its platform discovery and initialization. Refer to Chapter 14, “Dynamic Reconfiguration” for details. Such an indicator can be in a platform-dependent resource such as memory, CSR space, MMIO space, etc., that may be set by the Running System or the SSP. Alternatively, this indicator may be provided as part of the socket/system-level parameter exchanges.
(See Section 4.7.1, “Special Packet Format” on page 4-173.) Platforms supporting on-line addition of CSI components must implement a state indication for the “HotAdd” node whereby the added node is instructed to wait for further commands from the Running System even if the added node has local firmware. This indication may be conveyed by the SSP, by platform signals such as Boot Mode, or through the socket/system-level parameter exchanges, and it dictates the behavior of the Reset initialization firmware on the added node. By using this indication, the firmware execution on the Running System can:

• Set up all the CSI structures on the added node and prevent the firmware on the added node from revising any local or remote CSRs, e.g., by granting no write privileges in the address decoder entry for the CSR space.
• Set up memory areas with code and data to test the processor and other resources on the added node.
• Instruct the added node to conduct tests to ensure proper operation of the added components.
• Report the testing results to the Running System using implementation-dependent mechanisms.

The firmware on the Running System can thus ensure proper operation of the added node, and only thereafter permit the added node to revise the CSI structures for the shared interconnect fabric.

12.10 Support for Partition Reset

Reset of an individual OS Partition is required to support a system with multiple partitions and to ensure that an error condition in one partition does not bring down the entire system. An example error condition would be a machine check in one partition due to poisoned-data consumption by privilege level 0 code. The platform firmware and the SSP can coordinate with each other and reset only the affected partition’s components. For example, the firmware can cause the assertion of an error signal to the SSP; the SSP can then write to CSRs in the partition component(s) to be reset, indicating the reset domain granularity, and then assert the RESET signal to cause a partition reset.

12.11 Hardware Requirements

1. In systems where initialization can be performed either by a firmware/configuration agent internal to the CSI component or by an external SSP, strap pin(s) are required to indicate whether the internal firmware/configuration agent should initiate the initialization sequence or wait for the SSP to do so. These pins or encoded values may serve as the Boot Mode indications and may encompass other indications, such as a HotAdd indication to limit the scope of platform discovery by an OL_A node on platforms supporting OL_* operations.

2. SSP Presence indication: The configuration agent will await an indication from the SSP for link initialization if SSP presence is indicated. The presence indicator may be conveyed in the form of Boot Mode platform signals, with other encoded values for:
   — Instructing an OL_A node to stop after completing the link initialization, in order that the running system may program the CSRs within the OL_A.
   — Notifying the OL_A node that it is being hot added, to limit the scope of platform discovery by the OL_A.

3. In the absence of Embedded or Direct Connect Firmware, at a minimum one processor agent in the system must have a one-hop connection to Platform Initialization Firmware, and such a component must respond as a CSI Firmware agent.
4. A CSI agent that contains the code for the Reset vector shall respond as a Firmware Agent during the Link layer parameter exchange.

5. A firmware agent shall return responses to firmware accesses on the same port on which the request was received, even if its Route table has not been set up.

6. If link initialization will be performed by firmware outside the core, the address decoder and routing structures needed to access such firmware must be initialized by the hardware prior to de-assertion of core RESET. A similar requirement applies to the local CSR space that will be accessed by the link initialization code.

7. The system interface must provide a mechanism to elect one core as the Node BSP. In a system with multiple nodes, the platform must provide a mechanism for System BSP election.

8. Processors and the system interface must provide the ability for a core to go to sleep, and for a sleeping core to then be woken up by another core or the SSP.

9. If a CSI agent has a non-CSI link with another CSI agent in the system, the non-CSI link must be severed when coherence traffic is enabled.

12.12 Configuration Space and Associated Registers

This section provides a list of all the Configuration and Status registers in the CSI Configuration Region for Reset and Initialization, from a functional perspective. The Component Specifications will provide additional details.

Table 12-8. CSI Control and Status Registers Needed for Reset and Initialization

  CSR Name(s)               Function
  Reset Domain Qualifier    Registers that indicate the scope of the Reset.
  Boot Mode Indication      Latch the value of the Boot Mode indicator straps. Online Addition can be
                            one of the encoded values.
  SSP Indication            Presence of an SSP.
  Local Node IDs            Node IDs of the CSI agents on the socket.
  Link control and status   Link control CSRs to enable/disable a link, and link status values, such as
                            Link Initialization complete/in-progress, on a per-link basis.
  Neighbors’ CSI Node Id,   Report the neighbors’ characteristics for each link, on a per-link basis.
  AgentType, Port#
  POC values                Contents of the socket/system-level parameters (one per CSI socket).
  Node BSP                  Mechanism to elect an NBSP, and a CSR to hold the core ID of the NBSP.
  System BSP                Mechanism to elect an SBSP, and a CSR to hold the Node ID of the SBSP.
  Legacy IOH                CSR on an IOH that denotes that I/O Hub as being the sink for Legacy I/O
                            transactions.
  Snoop Participant List    List of agents to whom snoops should be sent; required on each caching agent.
  Lock Master Target        CSR on IA-32 processor nodes containing the Node Id of the Lock Master.
  Lock Master Scope         Participant list on a Lock Master holding the Node Ids of the processor agents
                            that should receive the CSI transactions associated with the LOCK sequence.

13 System Management Support

13.1 Introduction

The collection of mechanisms that configure and control the operation of CSI platforms constitutes the system management infrastructure. It comprises two distinct subcomponents: in-band and out-of-band system management. The out-of-band system management infrastructure consists of the service processors that operate in parallel to the main platform components.
Service processors configure and control the operation of the platform through dedicated access interfaces to processor and chipset components, such as SMBus and JTAG, distinct from the interfaces that the platform components use to communicate among themselves. The in-band system management infrastructure consists of platform firmware running on the processors that is used to configure and control the platform components by accessing the processor and chipset configuration and status registers (CSRs) through CSI. Figure 13-1 shows the paths through which the in-band and out-of-band system management agents can access system CSRs from the processor die. In-band and out-of-band accesses can be mapped to CSI transactions and carried over to local CSRs or CSI components in the system. CSI does not preclude a private on-die interface that bypasses the on-die CSI fabric for in-band and out-of-band accesses to the local CSRs.

[Figure 13-1. Logical View of Access Paths to CSI Configuration Registers: the cores (Core+$) and protocol engine on the processor die reach local CSRs through fabric access and remote CSRs over the outgoing CSI links; SMBus and JTAG ports provide out-of-band config accesses to the die.]

Out-of-band interfaces that can access local and remote CSRs may also exist in chipset components. CSI neither requires that specific out-of-band interfaces be supported by all CSI components, nor that, if such interfaces exist, they be bridged to CSI or give access only to CSRs internal to that component. Any processor or chipset component may or may not have an out-of-band interface, and if it does, the interface may be bridged to CSI, or it may just give access to internal CSRs, or both. The rest of the chapter elaborates on the in-band and out-of-band system management mechanisms, introduces the concepts of protected and unprotected configuration spaces in partitionable CSI systems, and presents the usage models associated with them.

13.2 Configuration Address Space

CSI components support a target configuration space where the system CSRs of that component reside. A detailed discussion of the CSI configuration space and its relationship with other platform configuration spaces, such as the PCI Express configuration space, can be found in Chapter 7, “Address Decode.” The system configuration register set includes the address decode registers and switch route tables, but it does not include processor core model-specific registers (MSRs) or any other processor configuration registers internal to the core and not accessible via loads/stores; core MSRs cannot be directly accessed through the CSI configuration access mechanisms. In CSI systems that support multiple partitions, the system configuration registers are classified into two sets: protected and unprotected. The protected set includes all configuration registers that can affect the operation of multiple partitions. Registers belonging to this set should not be accessible by the OS, but rather should be controlled by the out-of-band and in-band system management agents; the address decode registers and the route tables are examples of registers that belong in this set. The unprotected set may include error logging and performance monitoring registers that should be accessible by the OS. The exact membership list of each set is platform architecture and usage model dependent. For example, the protected set is likely to be an empty set on a platform that does not support multiple partitions.
The classification does not prevent the definition of CSRs on a processor die that are not directly related to CSI configuration yet which will be accessible through CSI by Processor Abstraction Layer (PAL) firmware or microcode.

13.3 Configuration Access Mechanisms

Processor and chipset components may support an SMBus/JTAG configuration access mechanism to any register in the configuration space. In addition, the configuration registers can be accessed through processor reads and writes to memory-mapped configuration space. Specifically, a processor that supports partitioning should support a protected memory-mapped configuration address space that can only be accessed by platform firmware, and an unprotected memory-mapped configuration space that can be accessed by firmware and privileged system software. Different access mechanisms are allowed for a given configuration register depending on the protection requirements of the specific register and on whether the platform implements in-band system management. Specifically, protected registers may be accessed by external management controllers through the SMBus and JTAG interfaces and, in platforms that implement in-band management, by protected firmware through the protected memory-mapped configuration (MMCFG) space. Unprotected registers can be accessed by external management controllers through the SMBus and JTAG interfaces and by firmware or privileged system software through the unprotected memory-mapped configuration space. A portion of the unprotected configuration space may also be accessible through the PCI-compatible CF8/CFC configuration access mechanism. CSI does not assume any special protections associated with such configuration spaces; therefore, such configuration spaces are not appropriate for mapping protected registers. Configuration accesses are transported over the CSI fabric as CSI configuration read and write transactions or as non-coherent memory read and write transactions. Different components may choose to support accesses to their configuration registers via either one or both transaction types. The CSI source decoders in other components must support the decoding of accesses to the transaction type appropriate for the target component.

13.3.1 CSI Configuration Agent

All configuration registers on a CSI agent or component logically belong to the configuration agent (CA) for that agent or component. A component may define a separate CA for each CSI agent on the die, or it may define a single CA to represent all the CSI agents on the die. In the latter case, configuration transactions destined to CSRs in other CSI agents are logically targeted to the CA, which in turn completes the access within the die via implementation-specific mechanisms. The CA is not necessarily a separate logic block on the agent or component; rather, it is the logical destination for CSI transactions accessing CSRs on that specific CSI agent or component. Accordingly, the CA may share a node id with other CSI blocks in the die, or it may have its own separate node id. The CA has means to connect to the CSI fabric, either directly or indirectly through some other block on the die. A core sends a configuration transaction to a local or remote CA as specified by the source address decoder attributes for the memory range where the CSRs of that component reside. The CA may also receive configuration requests from any out-of-band channels (SMBus/JTAG) that terminate on the die.
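A source address decoder of the sort just described can be sketched as a small range table that maps a physical address to the CA owning the target CSRs. The entry layout and the lookup helper are illustrative assumptions; real decoders are CSR-programmed hardware structures, not software tables.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative source-address-decoder entry: maps a physical range to
     * the node id of the configuration agent (CA) that owns those CSRs. */
    struct sad_entry {
        uint64_t base, limit;   /* [base, limit) physical range            */
        uint8_t  target_node;   /* CA / component owning the range         */
        uint8_t  is_config;     /* 1 = route as a CSI config transaction   */
    };

    /* Route an access: returns the matching entry, or NULL if no match. */
    static const struct sad_entry *
    sad_lookup(const struct sad_entry *sad, size_t n, uint64_t addr)
    {
        for (size_t i = 0; i < n; i++)
            if (addr >= sad[i].base && addr < sad[i].limit)
                return &sad[i];
        return NULL;
    }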
Requests can arrive from the in-band and out-of-band paths simultaneously. Multiple concurrent requests will be buffered and serviced one at a time, in no particular order or priority. The CA performs no explicit security checks: if a configuration transaction arrives at the CA, it is assumed to be trustworthy. Consequently, out-of-band accesses are always assumed to be secure, and non-secure code running on a processor must accordingly be prevented from modifying protected registers. Control of protected resources is enforced by the core and through address decoder programming, using mechanisms discussed in Section 13.4. If in-band system management is not implemented or enabled in a platform, then, in partitionable systems, the system management controllers should not install mappings in the core source address decoders that would allow the processor core to access protected configuration registers from the OS domain. Such mappings should also not be installed in the CSI source decoders in the I/O subsystem if the I/O devices are not trusted by the system operator to participate in the system management stack. CSI does not preclude additional protection mechanisms at the transaction target, such as lock bits, controlled by out-of-band agents, that freeze the value of a configuration register when set. To get the attention of the out-of-band management system for management events, the processor can use a path from the I/O subsystem to the service processor. In addition, error pins on the processor die may be supported to request the attention of the out-of-band management system on error conditions.

13.3.2 JTAG and SMBus

Processor and chipset components may support a JTAG test access port (IEEE Std 1149.1/1149.4). The test access port can also function as a configuration access mechanism; in this mode, it can be used by the system management controller to generate configuration accesses. Processors and chipset components may also support an SMBus Specification, Revision 2.0 compliant slave port that can be used by the system management controller to generate configuration accesses. Both JTAG and SMBus configuration accesses can target either local configuration registers in the component only, or they may also be enabled to generate CSI configuration transactions directed to any of the chips in the system. To support the latter capability, the CA may have its own source address decoder to appropriately route configuration accesses to their final destination. Alternatively, the system management controller may emulate the source decoding functionality before it injects a configuration transaction into the CSI fabric. The out-of-band interfaces are intended primarily to enable accesses to system configuration registers. The interfaces may also, but are not required to, allow the generation of transactions that access uncached physical memory. The details of the SMBus and JTAG access protocols are beyond the scope of this document. In the past, such interfaces were limited to chipset components and were documented in the chipset specification documents. With the integration of system logic into the processor die, however, it is desirable for all key platform components to share a similar command format and operation flow.

13.3.3 MMCFG and CF8/CFC

A region of memory that maps configuration registers is known as MMCFG space. Processor read and write requests in this range generate a configuration access.
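For concreteness, the sketch below shows the two classic configuration access paths the text refers to: an ECAM-style MMCFG read and the legacy CF8/CFC I/O-port mechanism. The MMCFG base address is a platform-specific assumption here (a real system reports it via firmware tables); the bus/device/function bit layout and the CF8 format are the standard PCI/PCIe definitions.

    #include <stdint.h>

    /* MMCFG (ECAM-style) read. MMCFG_BASE is an assumption for illustration. */
    #define MMCFG_BASE 0xE0000000ULL
    static inline uint32_t mmcfg_read32(uint8_t bus, uint8_t dev,
                                        uint8_t fn, uint16_t off)
    {
        uint64_t a = MMCFG_BASE | ((uint64_t)bus << 20) | ((uint64_t)dev << 15)
                   | ((uint64_t)fn << 12) | off;
        return *(volatile uint32_t *)(uintptr_t)a;
    }

    /* Legacy CF8/CFC access (x86 port I/O, GCC inline asm). */
    static inline void outl(uint16_t p, uint32_t v)
    { __asm__ volatile("outl %0, %1" : : "a"(v), "Nd"(p)); }
    static inline uint32_t inl(uint16_t p)
    { uint32_t v; __asm__ volatile("inl %1, %0" : "=a"(v) : "Nd"(p)); return v; }

    static inline uint32_t cf8_read32(uint8_t bus, uint8_t dev,
                                      uint8_t fn, uint8_t reg)
    {
        outl(0xCF8, 0x80000000u | ((uint32_t)bus << 16) | ((uint32_t)dev << 11)
                  | ((uint32_t)fn << 8) | (reg & 0xFCu));
        return inl(0xCFC);
    }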
A CSI platform may have multiple MMCFG regions to support different usage-model requirements. For example, an MMCFG region can be used to map protected configuration registers that will only be accessible through out-of-band channels and protected firmware. In addition, a platform may support the CF8/CFC access mechanism to the PCI configuration space. CSI does not assume that any special protections will be enforced by the processors for accesses through this mechanism; consequently, protected registers in partitionable systems should not be made accessible through this mechanism without additional considerations to protect them from OS software. Further details on the configuration spaces and in-band access mechanisms can be found in Chapter 7, “Address Decode.”

13.4 Protected Firmware

When in-band system management is used, the processor itself generates accesses through the coherent interconnect to the platform configuration registers. It may be desirable to provide the necessary mechanisms that will enable the creation of an execution environment where firmware can live undisturbed by the OS. For example, the configuration registers control the operation of multiple partitions; therefore, the processors cannot be allowed to modify such registers without first establishing an authentication domain for the firmware that is independent of the OS authentication domain. Otherwise, a hostile or compromised OS could adversely affect the integrity of all partitions. Consequently, the processor must support a “hardened” firmware environment in which partition management services can be implemented as part of the platform-specific firmware. A key prerequisite for establishing this trust is that the firmware be able to run undisturbed by the OS, so that it can implement an independent authentication domain. Under no circumstances should the OS be able to deduce any credentials, such as passwords, shared between this layer and the system operator. The processor must ensure that the firmware code and data cannot be compromised by a malicious OS. The following requirements must be met:

1. The protected firmware code and data must be located in a portion of the physical memory not accessible by the main OS and thus isolated from the OS. Protected configuration registers will also be placed in a protected portion of the physical address space.

2. There must be mechanisms to perform the transition between the OS and the “hardened” firmware that cannot be compromised by the OS.

3. There must be in-band and out-of-band mechanisms to force any thread or core to transition to configuration mode on demand.

Core support for a “hardened” firmware execution environment is specific to an instruction set architecture and, for Itanium processors, to the core implementation.

13.4.1 Configuration Management Mode (CM Mode)

Itanium processors on CSI platforms rely on the configuration management mode to support an execution environment for protected firmware. For the rest of the discussion, we will refer to the “hardened” Itanium processor firmware environment as CM mode, and to the OS execution environment as OS mode. CM mode is not an Itanium processor architectural feature: to minimize legacy constraints, each core implementation has the discretion of offering implementation-specific mechanisms to support a “hardened” firmware environment.
To enforce the firmware isolation, Itanium processor cores must inhibit accesses to protected physical address regions that map to protected system CSRs or contain protected firmware code and data. The exact method for enforcing the isolation is implementation-specific. However, it is highly desirable that the core can protect firmware code and data while they are resident in the processor caches across the transition to CM mode. In general terms, a portion of the implemented physical address space is defined to be inaccessible by loads and stores unless the core is executing in CM mode. The position of the protected physical address range may be fixed or relocatable through a processor MSR. For example, one potential CM mode implementation option is to rely on the “Unimplemented Address” fault mechanism available in Itanium processor implementations. Under this approach, the implemented physical address space is reduced by one bit while running in OS mode; furthermore, PAL reports the reduced value when queried by the operating system. In effect, the highest bit of implemented physical space is stolen from the OS and dedicated to protected firmware. While in OS mode, any direct or indirect reference to the protected physical address space results in a trap or fault of a type consistent with an attempt to access an unimplemented physical address.

Since the protected firmware address space is located in a fixed location in the highest portion of the implemented physical address space, special provisions must be taken when the standard address header is used. Specifically, since the standard address header may not support all the implemented physical address bits, the high-order physical bits that specify a protected physical address region must be transposed to the high-order bits in the address field of the standard address header. When a remote protocol engine receives a standard address packet, it must then move the high-order bits from their location in the packet header to the high-order bits in the physical address, and zero-fill the intermediate bits in the address used to snoop the core caches. In contrast, this bit transposing will be transparent to the memory agents in the platform. Itanium processors on CSI platforms are targeting a CM mode implementation that expands on this basic mechanism and offers better protection characteristics. The rest of the discussion focuses on this expanded CM mode.

13.4.1.1 Resource Protection Model

In the implementation of CM mode in processors on CSI platforms, the CM region is restricted to a smaller region of the physical address space, with a number of specific high-order bits set to specific values. Accesses by OS code to the protected physical memory regions will return an error consistent with the errors returned by accesses to physical memory regions where memory is not installed. Itanium processors on CSI platforms also define multiple privilege levels associated with the different firmware and system management agents, such as PAL, SAL and service processors. In this model, certain CSRs are made accessible by one agent (e.g. PAL) while remaining inaccessible to other agents (e.g. SAL or service processors). To achieve isolation between the different layers of protected code, the protected resources are further divided into three groups, and access by code to each group is restricted as indicated in Table 13-1.
Table 13-1. Resource Groups by Level of Protection

• Most protected. Code that can access the group: Protected PAL. Resources in the group: Protected MSRs; Protected CSRs that support PAL functions (processor only).
• Next most protected. Code that can access the group: Protected PAL; system management controllers. Resources in the group: Protected CSRs accessible by PAL and system management controllers only.
• Least protected. Code that can access the group: Protected PAL; system management controllers; Protected SAL. Resources in the group: Protected DRAM; flash ROM; other protected CSRs.

Both the core and the CSI physical address spaces are divided into a protected region and an unprotected region, and, in each case, the protected region is divided into four sub-regions, as indicated in Table 13-2. The protected region is 64 GB in size and lies at the top of the core physical address space; the sub-regions are 16 GB each.

Table 13-2. Sub-Regions of the Protected Region

  ID  Sub-Region Name     Resources located in region
  3   Reserved            Possibly resources owned by on-chip ROM in future IPF processor chips (processor only)
  2   PAL                 Resources (CSRs) owned by PAL (processor only)
  1   System Management   Resources (CSRs) owned by system management controllers
  0   Base                Other protected resources, including CSRs, and flash ROM and DRAM for protected PAL
                          and SAL code and data structures

Note that not all protected resources need be located in the protected address region; protected resources may be located at other protected locations, including read-only locations mapped to the on-chip ROM or flash ROM. Any component with configuration registers that may affect the operation of more than one partition must place such registers within a protected region. Chipset components, in particular, are likely to support only the base and system management regions since, by definition, they do not contain CSRs for the exclusive use of PAL or processor on-chip ROM code.

13.4.1.1.1 Conversion Between Core and CSI Addresses

Standard-header CSI packets support a 43-bit physical address; extended-header CSI packets support a 51-bit physical address. In the rest of this subsection we will assume that the core physical address space is 50 bits. However, since the CM mode support is implementation-specific, future processors have the option of moving the protected region location in the core physical address space in accordance with the number of physical address bits that they support. In both the CSI and core physical address spaces, within the protected region all bits above bit 35 are 1, bits 35:34 indicate the sub-region, and bits 33:0 are an offset within the sub-region. Although the protected regions have different locations in the core and CSI address spaces, the processor CSI protocol engine converts addresses of incoming and outgoing packets so that the protected region of the cores is mapped to the protected region of CSI.

With the standard header packet formats, the unprotected region of CSI is smaller than the unprotected region of the cores. In this case, the protocol engine maps the unprotected region of CSI to the bottom of the unprotected region of the core; no address conversion is performed. The remaining upper portion of the unprotected region of the core is unused, and the protocol engine blocks outgoing accesses by the cores to this region. With the extended header packet formats, the protected region of the core is smaller than the protected region of CSI.
In this case, the protocol engine maps the lower part of the unprotected region, equal in size to one half of the core address space (address bit 49 = 0), to the bottom of the CSI unprotected region; no address conversion is performed here either. The remaining upper portion of the unprotected region of the core is unused, and the protocol engine blocks outgoing accesses by the cores to this region, while the remaining upper portion of the unprotected region of CSI is inaccessible to the cores. Although the upper portion of the unprotected region of the cores could have been mapped to the respective region of CSI, this is not done, to simplify the hardware implementation.

The processor CSI protocol engine converts between core and CSI addresses as shown in Figure 13-2 (standard header) and Figure 13-3 (extended header). These conversion rules also specify cases where conversion is unsuccessful; in such cases, the protocol engine must not transmit the packets to the actual targets, irrespective of source address decoder programming.

Figure 13-2. Address Conversion Rules between Core and CSI Addresses (Small MP)

  CORE -> SMALL-SYSTEM CSI:  CSI[42:0] = Core[42:0]
    Conditions: If Core[49] = 0, Core[48:43] must be 000000 and Core[42:36] must not be 1111111.
                If Core[49] = 1, Core[48:36] must be 1111111111111.

  SMALL-SYSTEM CSI -> CORE:
    If CSI[42:36] = 1111111:  Core[49:43] = 1111111, Core[42:0] = CSI[42:0]
    Otherwise:                Core[49:43] = 0000000, Core[42:0] = CSI[42:0]

Figure 13-3. Address Conversion Rules between Core and CSI Addresses (Large MP)

  CORE -> LARGE-MP CSI:  CSI[52:50] = 3 copies of Core[49]; CSI[49:0] = Core[49:0]
    Conditions: If Core[49] = 1, Core[48:36] must be 1111111111111.

  LARGE-MP CSI -> CORE:  Core[49:0] = CSI[49:0]
    Conditions: Either CSI[52:37] should be 1111111111111111 or CSI[52:49] should be 0000; there should be
                no need to check.
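The rules of Figures 13-2 and 13-3 translate directly into bit manipulation. The sketch below implements the core-to-CSI direction for both formats; the function names and the use of a -1 return value for the must-not-transmit cases are illustrative choices, not part of the specification.

    #include <stdint.h>

    /* Core -> Small-MP CSI (standard header, 43-bit CSI address).
     * Returns the 43-bit CSI address, or -1 when the core address is not
     * representable (the "must not transmit" case in Figure 13-2). */
    static int64_t core_to_csi_small(uint64_t core)   /* 50-bit core address */
    {
        uint32_t bit49  = (core >> 49) & 1;
        uint32_t b48_43 = (core >> 43) & 0x3F;        /* Core[48:43] */
        uint32_t b48_36 = (core >> 36) & 0x1FFF;      /* Core[48:36] */
        uint32_t b42_36 = (core >> 36) & 0x7F;        /* Core[42:36] */

        if (bit49 == 0 && !(b48_43 == 0x00 && b42_36 != 0x7F))
            return -1;
        if (bit49 == 1 && b48_36 != 0x1FFF)
            return -1;
        return (int64_t)(core & 0x7FFFFFFFFFFULL);    /* CSI[42:0] = Core[42:0] */
    }

    /* Core -> Large-MP CSI (extended header). CSI[52:50] carries three
     * copies of Core[49]; CSI[49:0] = Core[49:0]. */
    static int64_t core_to_csi_large(uint64_t core)
    {
        uint64_t bit49  = (core >> 49) & 1;
        uint32_t b48_36 = (core >> 36) & 0x1FFF;

        if (bit49 == 1 && b48_36 != 0x1FFF)           /* protected-region encoding */
            return -1;
        return (int64_t)(((bit49 * 0x7ULL) << 50) | (core & 0x3FFFFFFFFFFFFULL));
    }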
13.4.1.2 Firmware Entry Mechanisms and Other Considerations

An Itanium processor can start executing firmware either through the procedural interface or as a result of a non-performance-critical hardware event (reset, init, machine check, platform management interrupt). The minimum requirement is to provide a mechanism through which a transition can take place when a non-performance-critical interrupt handler is launched. This requirement is sufficient to satisfy all the known usage models. First, protected firmware running on one processor should be able to force another processor into CM mode through a directed interrupt. Second, the OS should be able to directly request system management functions through the procedural firmware interface. A processor thread will implicitly transition to CM mode when it receives one of the non-performance-critical interrupts. If the OS, either directly or through ACPI, wants to invoke management operations, the processor thread can send a self-directed IPI requesting a PMI interrupt. In addition, a processor implementation may provide a faster transition mechanism for procedural firmware calls through the execution of br.pm, a new branch instruction. This instruction provides a more convenient protected gateway through which PAL and SAL code can enter protected mode. The instruction always results in a new protected-mode trap, a br.pm trap, which transfers control to one of 256 br.pm trap vectors in a new protected vector table, or PVT. The vector to which control is transferred is determined by the value of an 8-bit immediate operand, imm8. Since it is a protected-mode trap, execution enters protected mode when the trap is taken. In order to obtain protected partitions, the PVT and the routines branched to therefrom must be located at a protected location, presumably in the protected address region. A further consideration is the protection of the firmware interrupt vectors, so that the OS cannot divert the interrupts to its own handlers.

To achieve effective isolation, the processor must ensure that the CM firmware protection cannot be bypassed through accesses to core MSRs that provide non-architectural interfaces to the core MSRs and other structures such as the cache or TLB arrays. A core enforces these protections by implementing two modes, protected mode and PAL mode, which provide privileges to SAL or PAL but not to the operating system. A core can be in protected mode or not, and it can be in PAL mode or not; the two modes are orthogonal: a core can be in PAL mode but not protected mode, and it can be in protected mode but not PAL mode. But as we will see, PAL mode does not give a core any added privileges unless it is also in protected mode, so it is effectively a qualifier of protected mode. Table 13-3 summarizes the privileges obtained by the protected and PAL modes.

Table 13-3. Protected and PAL Mode Access Privileges

  Mode Combination     Accessible Resources
  (Protected / PAL)    Un-Protected   Base Protected   System Management      PAL Protected   MSRs
                       Region         Sub-Region       Protected Sub-Region   Sub-Region
  No  / Don’t care     Yes            No               No                     No              No
  Yes / No             Yes            Yes              No                     No              Yes*
  Yes / Yes            Yes            Yes              Yes                    Yes             Yes

NOTE: *Only protected PAL is expected to access the MSRs, since only PAL knows about the implementation-specific move instruction used to access these registers. Note that protected PAL need not always be in PAL mode, yet it can still access the MSRs, which it might need in order to enter PAL mode.

The core applies the following rules to enforce the protected and PAL privileges:

• If an instruction is executed out of protected mode that would result in an unimplemented address fault, as if the core supported only 49 address bits instead of 50, then the instruction is not performed and an unimplemented address fault results. This includes load, store and branch instructions as well as TLB update instructions.

• If an instruction is executed that would result in a D-stream access to the reserved protected sub-region, then an unimplemented address fault results. The core will actually perform a stronger check, and fault if address bit 49 is 1 and bits 35:34 equal 11, the ID of the reserved sub-region. The stronger check is acceptable because the upper half of the core address space (bit 49 = 1) is unused except for the protected region.

• If an instruction is executed out of protected mode that would result in a D-stream access to the protected region, then the instruction is not performed and an unimplemented address fault results. As above, the core will perform a stronger check, and fault if address bit 49 is 1; the stronger check is acceptable because the upper half of the core address space (bit 49 = 1) is unused except for the protected region.

• If an instruction is executed out of PAL mode that would result in an access to the protected PAL or system management sub-regions, then the instruction is not performed and an unimplemented address fault results. Again, the core will perform a stronger check, and fault if address bit 49 is 1 and bits 35:34 equal either 10 or 01, the IDs of the protected PAL or system management sub-regions. Note that if protected code inserts a TLB entry for the protected region, protected code should purge that entry before exiting protected mode.
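The enforcement rules above, combined with Table 13-3, can be condensed into a single predicate. The sketch below is an illustrative model, not the core's microarchitectural implementation; the function name and boolean interface are assumptions.

    #include <stdint.h>
    #include <stdbool.h>

    enum sub_region { SUB_BASE = 0, SUB_SYSMGMT = 1, SUB_PAL = 2, SUB_RESERVED = 3 };

    /* Returns true if a D-stream access to 'addr' (a 50-bit core physical
     * address) is permitted, per the rules above and Table 13-3. The
     * arguments reflect the core's current protected/PAL modes. */
    static bool cm_access_ok(uint64_t addr, bool protected_mode, bool pal_mode)
    {
        if (((addr >> 49) & 1) == 0)
            return true;                        /* lower, unprotected half   */

        /* Upper half is unused except for the protected region. */
        enum sub_region sub = (enum sub_region)((addr >> 34) & 3); /* bits 35:34 */

        if (sub == SUB_RESERVED)
            return false;                       /* reserved: always faults   */
        if (!protected_mode)
            return false;                       /* OS mode: unimplemented-address fault */
        if ((sub == SUB_PAL || sub == SUB_SYSMGMT) && !pal_mode)
            return false;                       /* these sub-regions also need PAL mode */
        return true;                            /* base sub-region, protected mode */
    }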
13.4.1.3 PMI Delivery Mechanisms

From a system management perspective, PMI interrupts are necessary in the following cases. First, CM firmware running on one core should be able to interrupt any core in the system (care should be taken to minimize interactions between partitions). Second, PMI interrupts should be triggerable through the out-of-band interfaces. A third usage model requires PMI interrupts to be generated when the state of a CSI link changes; this usage model and its requirements will be discussed in Chapter 14, “Dynamic Reconfiguration.”

Itanium processors can send interrupts by store accesses to the Processor Interrupt block residing in the processor physical address space (see Chapter 10, “Interrupt and Related Operations” for details). PMI vectors 1-3 are used by OEM SAL, and vectors 4-15 are Intel reserved. For the transition to protected firmware, the usage model assumes that SAL will send a PMI interrupt using one of the OEM SAL PMI vectors; the CM firmware will verify the validity of the request and perform the operation. In addition, previous Itanium processors respond to assertions of the PMI pin on the system bus interface, which are delivered as vector 0 PMIs. CSI-based processors have no PMI pin; equivalent functionality will be provided through CA CSRs that can trigger PMI interrupts to one or more cores in the processor die. Similar CSRs must also be provided to trigger INIT interrupts in one or more processor cores or to force a reset of all the cores. A communication protocol between the interrupting and the interrupted thread can be implemented by CM firmware through a software mailbox structure in protected memory. For system management controllers, which may not be able to access memory directly, the communication protocol between the system controller and the interrupted thread may involve the firmware communicating with the controller through the I/O subsystem or through scratch registers in the CA. The PMI mechanism can also be used to indicate certain hardware error conditions to platform firmware. More details about this usage model can be found in Chapter 11, “Fault Handling.”

13.4.2 IA-32 Processor System Management Mode (SMM)

System management mode in CSI-based systems will rely on the existing IA-32 SMM execution semantics along with CSI-specific memory protection/isolation mechanisms. Legacy SMM execution has relied on a compatibility memory region (CSEG), typically shadowed behind the VGA device region at locations 0xA0000-0xBFFFF; see Figure 13-4. In CSI-based systems this memory range still exists from a SW perspective, but it is physically dealiased from the VGA region to allow removal of the external SMM execution-mode indicator, SMMEM#; see Figure 13-5. Legacy SMM also defines at least one high memory region (TSEG) located just below the 4 GB address; CSI-based systems still retain the TSEG and allow it to be relocated and of variable length. In pre-CSI-based systems, protection of these two memory regions was implemented by IA-32 chipsets and memory controllers.
In CSI-based systems the responsibility for SMM memory-region protection is shared between the processor(s) and the I/O controllers; see Section 13.4.2.2 for more details.

[Figure 13-4. Legacy SMM Memory Layout: TSEG resides in system memory below the top of memory (ToM), or in a memory hole below 4 GB for large-memory systems; CSEG or VGA memory occupies 0xA0000-0xBFFFF, with CSEG/VGA selection based on SMMEM#.]

13.4.2.1 Memory Range Description

The SMM memory regions will be defined in the processor by CSI address decode registers. Access to the SMM address decode registers (TSEG and CSEG) will be restricted to SMM execution mode only, i.e., only SW executing in SMM will be allowed to change the register contents. SMM address decode registers will only be updated (from an architectural perspective) on execution of a Return from System Management Mode (RSM) instruction.

[Figure 13-5. IA-32 SMM Memory Layout in a CSI-Based System: the physical view of memory (TSEG and CSEG between 0 and ToM) versus the SW view (TSEG below 4 GB, CSEG or VGA at 0xA0000-0xBFFFF); address selection is based on mode and the CSI address decoder controls TSEG_BASE and CSEG_BASE.]

13.4.2.2 TSEG

The TSEG definition consists of a variable base address and a variable length or range. The base and length parameters are changeable only while the processor is in SMM (when the processor core’s SMMEM# or equivalent bit is asserted). Physical addresses generated by the processor core are compared directly against the defined base and range, and checked for being generated from SMM. If all elements of the comparison match, the accesses are directed to the relevant memory controller or CSI segment with no address remapping. If there is a base and length match but no SMMEM bit match, then the access is defined as illegal, in which case reads return 0s and writes are discarded. Note: a special case exists where a writeback of modified data from a cached TSEG line may occur outside of SMM (SMMEM clear); this operation should be allowed to occur with correct data. Accesses into the SMM regions from the I/O subsystem should be “filtered out” by the I/O controller and terminated as above, not issued to the home node for that memory location. In addition, any component with configuration registers that may affect the operation of more than one partition must place such registers within an SMM region. The TSEG address decoder defaults should be zero base and zero length.

13.4.2.3 CSEG

The CSEG consists of a variable base address and a fixed length, or range, of 128 KB. The base and control parameters are changeable only while the processor is in SMM (when the core’s SMMEM# or equivalent bit is asserted). If the physical addresses generated by the core are between 0xA0000 and 0xBFFFF and the SMMEM bit is set, then the CSI address decode logic must add the CSEG_BASE value to the address supplied by the core before issuing the resulting access, with its new address, to the relevant memory controller or CSI segment. If there is an address match (0xA0000-0xBFFFF) but no SMMEM bit match, the access is targeted to VGA and should be directed to the CSI segment that owns the VGA device. Direct software access to the memory region targeted by the CSEG_BASE offset mechanism is not supported; access is only allowed for software accesses into the 0xA0000-0xBFFFF memory range.
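The decode just described can be sketched as below. The register names mirror those in the text; the helper structure, the result encoding, and the omission of the VGA_Data_Access_Enable refinement (covered by Table 13-4, which follows) are simplifications for illustration.

    #include <stdint.h>

    #define VGA_LOW  0xA0000u
    #define VGA_HIGH 0xBFFFFu

    struct smm_decode {            /* programmable only while SMMEM is set  */
        uint64_t tseg_base, tseg_len;
        uint64_t cseg_base;        /* CSEG length is fixed at 128 KB        */
    };

    enum access_result { NORMAL, ILLEGAL, REMAPPED_CSEG, ROUTE_TO_VGA };

    static enum access_result
    smm_decode_addr(const struct smm_decode *d, uint64_t addr, int smmem,
                    uint64_t *out /* possibly remapped address */)
    {
        *out = addr;
        /* TSEG: base/range match with no remapping; illegal without SMMEM
         * (reads return 0s, writes are discarded). */
        if (addr >= d->tseg_base && addr < d->tseg_base + d->tseg_len)
            return smmem ? NORMAL : ILLEGAL;
        /* CSEG: 0xA0000-0xBFFFF is offset by CSEG_BASE when SMMEM is set;
         * otherwise the access is routed to the VGA-owning CSI segment. */
        if (addr >= VGA_LOW && addr <= VGA_HIGH) {
            if (smmem) { *out = d->cseg_base + addr; return REMAPPED_CSEG; }
            return ROUTE_TO_VGA;
        }
        return NORMAL;
    }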
There are two control bits associated with the CSEG address description and decode:

• SMM_Region_Decode_Locked - When set, locks the contents of the CSEG SMM address registers, preventing all updates until the next reset. Default is clear.
• VGA_Data_Access_Enable - When set, prevents the addition of the CSEG_BASE value to the address supplied by the core for DATA accesses only; code accesses are handled normally. Default is clear.

The CSEG address decoder default should be a zero base value. CSEG is optional from the perspective of CSI; however, it is a requirement for IA-32 processors that implement CSI, for legacy-compatibility reasons.

Table 13-4. CSEG Operating Parameters

  SMMEM  Code  Data  VGA_Data_Access_Enable  Action                                        Purpose
  1      X     X     0                       Access_address = SW_Physical_Address + CSEG_BASE  SMM operation from DRAM
  1      1     0     1                       Access_address = SW_Physical_Address + CSEG_BASE  SMM code fetch from DRAM
  1      0     1     1                       Access_address = SW_Physical_Address              SMM data access into VGA region
  0      X     X     X                       Access_address = SW_Physical_Address              Accesses to 0xA0000-0xBFFFF go to VGA region

Implementation Note: SMM initialization:
• BIOS enables memory at 0x38000 (the IA-32 processor default value), i.e., “normal” DRAM, which is accessible from SMM or normal operating mode.
• BIOS loads its SMM initialization code and data at 0x38000 and asserts a self-SMI.
• The SMM initialization code is now executing with SMMEM set and is therefore allowed to access and initialize the TSEG/CSEG address decoders to their appropriate values.
• BIOS can now load the real SMM code and data into the BIOS-assigned protected memory regions, lock them down if desired, update the SMM_BASE value in the SMM dump space (a per-processor operation with different values of SMM_BASE), and then execute an RSM.
• The next SMI will vector to the real SMM handler at whatever address was assigned by the BIOS.

14 Dynamic Reconfiguration

14.1 Introduction

Server architectures based on CSI support a number of RAS features. In this chapter, the CSI support necessary for RAS features related to dynamic reconfiguration is described. Dynamic reconfiguration includes on-line addition (OL_A), deletion (OL_D), and replacement (OL_R; collectively referred to as OL_*) of modules, to support dynamic partitioning of a system, interconnect (link) reconfiguration, memory RAS such as migration and mirroring without OS intervention, dynamic memory reinterleaving, processor and socket migration, and support for global shared memory across partitions. All reconfiguration activities happen either under the control of an out-of-band service processor or through in-band management; the firmware is involved and, in many cases, the operating system also. The scope of this chapter is to describe the CSI support and high-level flows for dynamic reconfiguration only. It is expected that firmware architects will utilize this description to design the detailed flows for dynamic-reconfiguration activities; descriptions of firmware and OS flows are outside the scope of this specification. Dynamic partitioning is a significant part of dynamic reconfiguration, and the first few sections of the chapter delve into details of partitioning models, management of OL_* events, and partitions. Note: Refer to Section 14.13 for acronyms used in this chapter.

14.2 Partitioning Models

Partitioning is always assumed to be dynamic, unless otherwise stated. Partitioning is dynamic if resources can be added or removed from a partition without the need to reboot the system or the affected partitions.
(With static partitioning, the system is partitioned at boot time, and repartitioning requires a reboot of the affected partitions.) The resource that is added or removed is usually a field replaceable unit (FRU); however, the granularity of addition/deletion is defined by OS support and the hardware implementation. More generically, the granularity at which resources can be added to or deleted from a partition is referred to as a “module”. On-line addition and deletion of a module from a running partition requires OS support. A module may be comprised of processors only, memory (including the memory controller), an I/O Hub, or some combination of the preceding, depending on the particular CSI implementation and platform configuration. Multiple partitions can exist within a system, each of which is logically isolated from the others, providing different degrees of reliability and security depending on the particular type of partitioning. Control of partitioning is done through the system service processor(s) (SSPs), baseboard management controllers (BMCs), and/or protected firmware running on the processor(s) (referred to, generically, as the SSP in the rest of the chapter). CSI platforms support multiple partitioning models with distinct RAS features. The motivation for partitioning is the isolation of the OS and applications running in one partition from similar entities in other partitions within the system. This section introduces partitioning terminology and models, and discusses the principles of support in CSI platforms for each partitioning model.

Note: In the rest of this section, the socket architecture that is shown, including the number of cores, is for illustrative purposes only. SMA refers to the on-die memory agent and XBar refers to the on-die router.

14.2.1 Hard physical partitioning (HPPAR)

The system is partitioned at platform interfaces that minimize the interactions between the multiple partitions within the system; this logical boundary is typically at a socket or FRU granularity. Interactions between different partitions are minimized so that hardware or software failures in one partition do not cause failures in other partitions. Each partition contains a full set of hardware resources, such that an operating system cannot distinguish between a partition and an unpartitioned system. The partitions may or may not share the interconnect fabric; specifically, the fabric can be shared if it supports a mechanism, such as a Transport layer, which avoids single points of failure in the shared fabric and guarantees message delivery in the presence of faults. If the partition components are not disjoint and the partitions do not share any hardware resources, partition isolation can be enhanced by disabling the CSI links between the partitions. The primary driver for the HPPAR model is to enable system consolidation and to avoid single points of failure. Figure 14-1 illustrates an example of a hard physical partitioning system.

[Figure 14-1. Hard Physical Partitioning Example: Partition A and Partition B each comprise complete sockets (cores, config agent, XBar, SMA) plus an IOH; globally shared resources on the board (a PCI-E switch and PCI-E devices) sit outside both partitions.]

14.2.2 Firm physical partitioning (FPPAR)

In this model, the hard physical partitioning model is extended to the subcomponent level.
In other words, a single component includes the necessary hardware to logically behave as two or more components; thus, one module or FRU can participate in more than one partition. The primary driver for the FPPAR model is to enable system consolidation and, to a lesser extent, to avoid single points of failure in software or hardware. Figure 14-2 illustrates an example of a firm physical partitioning system where an I/O Hub (IOH) CSI component is shared by two partitions. The IOH serves as a conduit to unique PCI Express ports for the two partitions. This model also covers the cases where a portion of the on-die fabric, such as the router, is used by more than one partition, or is used by a different partition than other agents on the same die, such as the processor cores. Other examples of firm physical partitioning models are: a) a socket containing multiple processor cores sub-divided among several OS partitions, and b) a memory agent shared by multiple partitions. The FPPAR induced by sub-dividing multiple processor cores, a memory agent, or an I/O agent within a socket (as distinct from the shared router) will be referred to as sub-socket partitioning. Firm physical partitions do not offer the same level of reliability as physical partitions at the FRU level, but they transparently extend the granularity of partitioning without OS modifications.

[Figure 14-2. Firm Physical Partitioning Example: Partition A and Partition B each contain their own cores and memory agents but share an IOH component; the PCI-E switch and PCI-E devices on the board are global resources.]

14.2.3 Logical or software partitioning (LPAR)

The OS willingly relinquishes traditional OS functionality, such as control of the page tables or device discovery, to the firmware layer (often called a firmware hypervisor). The primary driver for this model is to enable system consolidation and to tolerate software faults. Figure 14-3 illustrates an example of a logical partitioning system.

[Figure 14-3. Logical Partitioning Example: Partition A and Partition B run above a shared physical platform of cores, config agents, XBar, SMA and IOHs; the PCI-E switch and PCI-E devices on the board are global resources.]

14.2.4 Virtual partitioning (VPAR)

The virtual machine monitor (VMM), a new software component provided by a third party, sits between the OS and a physical partition. It virtualizes the platform resources and creates virtual partitions, allowing multiple OS instances to share processors, memory and I/O devices. The primary driver for this model is to enable system consolidation and to tolerate software faults. Figure 14-4 illustrates an example of a virtual partitioning system.

[Figure 14-4. Virtual Partitioning Example: multiple virtual partitions share the cores, config agents, XBar, SMA, IOHs, and the PCI-E switch and devices as global resources.]

The CSI system architecture is focused on enabling hard or firm physical partitioning. Logical and virtual partitioning, which require OS modifications or a VMM, are beyond the scope of this specification (they are mentioned here for the sake of completeness).
14.3 OL_* Support
A key feature necessary to enable dynamic partitioning is the support for on-line addition and deletion (OL_*). With OL_*, a partition can be resized by adding and removing resources without the need to reboot the system or restart the affected partition(s). CSI platforms support OL_* events on CSI links. In addition, OEMs could support OL_* through other interfaces that they define. In this document, a CSI-centric view of the functionality needed for OL_* by the processor agent, memory agent, and the I/O agent is described. The functionality is illustrated through abstract procedures for on-line addition and on-line deletion in a CSI platform. From the CSI perspective, the procedures are identical whether the OL_* nodes are physically added/removed or switched between multiple partitions within the system. A key requirement for any OL_* operation is the ability of the firmware to quiesce the domain of interest so that many system resources, such as routing tables and address decoders, can be updated in what essentially appears to be an atomic operation to the software layers above the firmware (please refer to Appendix A, “Glossary” for definitions). The quiescence and de-quiescence operations are performed in a platform dependent manner, possibly with the aid of CSI messages such as StopReq*, StartReq*, and reads and writes to the protected firmware region (NcRdPtl, NcWrPtl). To make this discussion concrete, an implementation dependent description of quiescence and de-quiescence for CSI platforms is given. It is expected that this section will be moved to the appropriate Platform Architecture Specification in the future, and that this description will be used as one of several possible ways to achieve quiescence in a CSI-based platform.

14.3.1 Implementation Dependent Quiescence/De-Quiescence
This section uses a possible CSI-based platform implementation to illustrate platform dependent flows for quiescing and de-quiescing. The domain of quiescence can span a single partition, multiple partitions, or even the entire system, depending on the sharing of the IOH and the CSI interconnect and the sophistication of the platform firmware.

14.3.1.1 Assumptions/Requirements
• The firmware ensures that at most a single quiescence operation is in progress at any time within a domain; multiple quiescence operations are permissible in a system with multiple domains.
• A quiescence operation is initiated either by a particular core (in-band) or by the SSP (out-of-band). The SSP can control the quiescence through its non-CSI network or can designate a core, which then follows the in-band flow. (To make the discussion simple, only the in-band flow is described initially; the differences arising from control by the SSP are described later.)
— In IA-32 systems, the initiating core uses a designated IOH to quiesce/de-quiesce all the CSI agents in the domain. The designated IOH is the Quiescence Master.
— In Itanium processor family systems, the initiating core quiesces/de-quiesces all the cores in the domain, while the facilitating IOH performs a similar function for all IOHs in the domain.
— IOHs are quiesced using two phases: in the first phase, the target IOH stops issuing all non-posted requests into CSI, and in the second phase, the target IOH stops issuing all requests into CSI.
• A number of CSRs belonging to the protected firmware space are needed for the “primitive” flows described below in Section 14.3.2. These are defined in Table 14-1.
Table 14-1. Control and Status Registers Needed for Quiesce/De-Quiesce (configuration space CSR name(s) and function):
• PROC_CNTL_Q, PROC_STATUS_Q: Control CSR in each processor to quiesce the socket, and status register to indicate quiescence progress.
• PROC_CNTL_DeQ, PROC_STATUS_DeQ: Control CSR in each processor to dequiesce the socket, and status register to indicate dequiescence progress.
• CS_CNTL_Q_Ph1, CS_CNTL_Q_Ph2, CS_STATUS_Q_Ph1, CS_STATUS_Q_Ph2: Control CSRs in the chipset to control each of the two phases of quiescence, and corresponding status registers to indicate quiescence progress.
• CS_CNTL_DeQ_Ph1, CS_CNTL_DeQ_Ph2, CS_STATUS_DeQ_Ph1, CS_STATUS_DeQ_Ph2: Control CSRs in the chipset to control each of the two phases of de-quiescence, and corresponding status registers to indicate de-quiescence progress.
• CS_QUIESCE_SCOPE: Chipset register defining the scope of quiescence in terms of CSI node ids.
• Firm Partition and Domain Scope Lists: Scope lists, in terms of CSI node ids, defining the scope of each firm partition and domain available to protected firmware.

• For multipartition systems, the firmware has the option of quiescing each partition (sub-domain) and declaring quiescence after each partition (sub-domain) is quiesced, or of quiescing the domain as a whole by appropriate programming of the IOH scope register and related data structures.
• Locks are not allowed while the quiescence operation is in progress (in IA-32 platforms). This can be ensured, for example, by first bringing all the processors in the domain to SMM through an SMI operation.

14.3.2 Flows
The flows are shown as a sequence of steps. Concurrent operations within a step are suffixed by a letter. Thus, if Step 1 has 3 concurrent operations, they are labeled Step 1a, Step 1b, and Step 1c. The flows described below are the basic quiesce/de-quiesce operations upon which higher level quiesce/de-quiesce operations can be built by the firmware. Hence they are referred to as “primitives”.

14.3.2.1 Itanium® Processor Family CSI_QUIESCE “Primitive”
Step 1a:
• Initiating core writes to the control CSR, PROC_CNTL_Q, of each processor (including itself) in its domain to quiesce the socket. Target processor returns write completion.
• Each target (peer) processor modifies its status CSR, PROC_STATUS_Q, to indicate quiescence in progress.
Step 1b: Initiating core writes to the control CSR in the facilitating IOH, CS_CNTL_Q_Ph1, to launch the Phase 1 quiescence flow. Target IOH returns write completion.
Step 2a: Target processor core stops issuing all requests and modifies its status CSR, PROC_STATUS_Q, to indicate quiescence complete after all issued requests complete.
Step 2b: Initiating core polls the status CSR, PROC_STATUS_Q, in each processor socket until it indicates quiescence complete.
Step 2c:
• Facilitating IOH sends StopReq1 to each target IOH in its domain (scope defined by CS_QUIESCE_SCOPE).
• Target (peer) IOH stops issuing all non-posted requests into CSI.
• Target IOH returns StopReq1 completion after all issued non-posted requests complete.
• Facilitating IOH modifies CS_STATUS_Q_Ph1 to indicate the first phase of quiescence complete.
Step 2d: Initiating core polls the facilitating IOH status CSR, CS_STATUS_Q_Ph1, until it indicates the first phase of quiescence complete.
Step 3: Initiating core writes to the control CSR, CS_CNTL_Q_Ph2, in the facilitating IOH to launch the second phase of the quiescence flow.
Facilitating IOH returns write completion.
Step 4a:
• Facilitating IOH sends StopReq2 to each target IOH in its domain.
• Target IOH stops issuing all requests into CSI.
• Target IOH returns StopReq2 completion after it has received completions for all issued requests.
• Facilitating IOH modifies its status CSR, CS_STATUS_Q_Ph2, to indicate the second phase of quiescence complete.
Step 4b: Initiating core polls the facilitating IOH’s CS_STATUS_Q_Ph2 until it indicates phase-2 quiescence complete. This detection signals the completion of the quiescence operation.

14.3.2.2 Itanium® Processor Family CSI_DEQUIESCE “Primitive”
Step 1: Initiating core writes to the control CSR in the facilitating IOH, CS_CNTL_DeQ_Ph1, to launch the Phase 1 dequiescence flow. Target IOH returns write completion.
Step 2a:
• Facilitating IOH sends StartReq1 to each target IOH in its domain (scope defined by CS_QUIESCE_SCOPE).
• Target IOH returns StartReq1 completion (the need for StartReq1 is redundant but is required to keep a common flow with other (bus lock) operations needed in CSI).
• After receiving all StartReq1 completions, the facilitating IOH modifies CS_STATUS_DeQ_Ph1 to indicate the first phase of dequiescence complete.
Step 2b: Initiating core polls the facilitating IOH status CSR, CS_STATUS_DeQ_Ph1, until it indicates the first phase of dequiescence complete.
Step 3a: Initiating core writes to the control CSR, CS_CNTL_DeQ_Ph2, in the facilitating IOH to launch the second phase of the dequiescence flow. Facilitating IOH returns write completion.
Step 3b:
• Initiating core writes to the control CSR, PROC_CNTL_DeQ, of each processor (including itself) in its domain to dequiesce the socket.
• Target processor returns write completion and resumes normal activity.
Step 4a:
• Facilitating IOH sends StartReq2 to each target IOH in its domain.
• Target IOH returns StartReq2 completion and resumes normal activity.
• After receiving all StartReq2 completions, the facilitating IOH modifies its status CSR, CS_STATUS_DeQ_Ph2, to indicate the second phase of dequiescence complete.
Step 4b: Initiating core polls the facilitating IOH’s CS_STATUS_DeQ_Ph2 until it indicates the second phase of dequiescence complete.
Step 5: The dequiescence operation completes after the initiating core receives all the write completions from the target processors and the completion of Step 4b.

14.3.2.3 IA-32 CSI_QUIESCE “Primitive”
Step 1: Initiating core writes to the control CSR in the (quiescence) master IOH, CS_CNTL_Q_Ph1, to launch the quiescence flow. Master IOH returns write completion. Note: The initiating core does not need to know about the two phases of the quiescence flow, since the IOH is the only master.
Step 2a:
• Master IOH sends StopReq1 to each target IOH and each IA-32 processor CSI agent in its domain (scope defined by CS_QUIESCE_SCOPE; includes the initiating core).
• Target IA-32 processor agent stops issuing all requests into CSI.
• Target IA-32 processor agent returns StopReq1 completion after all issued requests complete.
• Target (peer) IOH stops issuing all non-posted requests into CSI.
• Target IOH returns StopReq1 completion after all issued non-posted requests complete.
Step 3a:
• After all the completions for the StopReq1s have arrived at the master IOH, it sends StopReq2 to each target IOH and each IA-32 processor CSI agent in its domain (scope defined by CS_QUIESCE_SCOPE; includes the initiating core).
• Target IA-32 processor agent returns StopReq2 completion (since it is already quiesced).
• Target (peer) IOH stops issuing all requests into CSI.
• Target IOH returns StopReq2 completion after all issued requests complete.
• Master IOH modifies its status CSR, CS_STATUS_Q_Ph1, to indicate quiescence complete.
Steps 2b, 3b: Initiating core polls the master IOH’s CS_STATUS_Q_Ph1 until it indicates quiescence complete. This detection signals the completion of the quiescence operation.

14.3.2.4 IA-32 CSI_DEQUIESCE “Primitive”
Step 1: Initiating core writes to the control CSR in the master IOH, CS_CNTL_DeQ_Ph1, to launch the dequiescence flow. Master IOH returns write completion. Note: The initiating core does not need to know about the two phases of the dequiescence flow, since the IOH is the only master.
Step 2a:
• Master IOH sends StartReq1 to each target IOH and each processor CSI agent in its domain (scope defined by CS_QUIESCE_SCOPE; includes the initiating core).
• Target processor agent returns completion for StartReq1 (the need for StartReq1 is redundant but is required to keep a common flow with other (bus lock) operations needed in CSI).
• Target IOH returns StartReq1 completion (for the same reason).
Step 3a:
• After all the completions for the StartReq1s have arrived at the master IOH, it sends StartReq2 to each target IOH and each processor CSI agent in its domain (scope defined by CS_QUIESCE_SCOPE; includes the initiating core).
• Target processor agent returns StartReq2 completion and resumes normal operation (request generation into CSI).
• Target IOH agent returns StartReq2 completion and resumes normal operation (request generation into CSI).
• After receiving all StartReq2 completions, the master IOH modifies its status CSR, CS_STATUS_DeQ_Ph1, to indicate dequiescence complete.
Steps 2b, 3b: Initiating core polls the master IOH’s CS_STATUS_DeQ_Ph1 until it indicates dequiescence complete. This detection signals the completion of the dequiescence operation.

14.3.2.5 Quiescence/Dequiescence Using System Service Processor (SSP)
The SSP could be used instead of the inband flows described in Section 14.3.2. The SSP may employ out-of-band buses such as SMBus and JTAG to program the processor and the IOH registers (only the IOH registers with IA-32) and check their status. The flows then proceed in a manner similar to the flows described in Section 14.3.2, except that there the functions performed by the “initiating core” are now performed by the SSP. Alternatively, the SSP may send a PMI/SMI to the initiating core and then rely completely on the inband management flows to effect quiescence/dequiescence.
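The Itanium CSI_QUIESCE primitive above reduces to a short write-then-poll routine in the initiating core’s firmware. The following C sketch is illustrative only: the accessor functions, CSR address constants, and “done” bit positions are hypothetical placeholders, since the real encodings are implementation defined.

/* Hedged sketch of the Itanium CSI_QUIESCE primitive (Section 14.3.2.1),
 * as run by the initiating core. Accessors, CSR addresses and "done"
 * bits are hypothetical; real encodings are implementation defined.   */
#include <stdint.h>

#define STATUS_DONE (1ull << 0)              /* assumed completion bit */

extern uint64_t csr_read64(int node, uint64_t addr);   /* hypothetical */
extern void     csr_write64(int node, uint64_t addr, uint64_t val);

extern const uint64_t PROC_CNTL_Q, PROC_STATUS_Q;      /* CSR addresses */
extern const uint64_t CS_CNTL_Q_PH1, CS_STATUS_Q_PH1;  /* (placeholder  */
extern const uint64_t CS_CNTL_Q_PH2, CS_STATUS_Q_PH2;  /*  values)      */

static void poll_done(int node, uint64_t status_csr)
{
    while ((csr_read64(node, status_csr) & STATUS_DONE) == 0)
        ;                            /* spin; real firmware adds timeouts */
}

void csi_quiesce(const int *procs, int nprocs, int facilitating_ioh)
{
    /* Steps 1a/2a/2b: quiesce every processor socket in the domain.   */
    for (int i = 0; i < nprocs; i++)
        csr_write64(procs[i], PROC_CNTL_Q, 1);
    for (int i = 0; i < nprocs; i++)
        poll_done(procs[i], PROC_STATUS_Q);

    /* Steps 1b/2c/2d: phase 1 - StopReq1 halts non-posted requests.   */
    csr_write64(facilitating_ioh, CS_CNTL_Q_PH1, 1);
    poll_done(facilitating_ioh, CS_STATUS_Q_PH1);

    /* Steps 3/4a/4b: phase 2 - StopReq2 halts all remaining requests. */
    csr_write64(facilitating_ioh, CS_CNTL_Q_PH2, 1);
    poll_done(facilitating_ioh, CS_STATUS_Q_PH2);
    /* The domain is quiesced; route tables etc. may now be updated.   */
}

The CSI_DEQUIESCE primitive follows the same pattern with the DeQ control and status CSRs, and the IA-32 variants collapse the processor writes into a single interaction with the master IOH.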
14.3.3 Assumptions/Requirements
• CSI is the interface for on-line addition and on-line deletion.
• CSI links recognize the on-line addition of component(s).
• The addition/removal of CSI or non-CSI agents occurs with the concurrence of the SSP, the firmware and the OSs, i.e., there are no surprise additions/removals. Further, the SSP/firmware is capable of restricting OL_* events to one event at a time.
• An on-line addition/removal event may cause the simultaneous addition/removal of multiple NodeIDs into/from the system.
• A configuration agent is available either logically or physically (see Chapter 13, “System Management Support” for details).
• CSI agents in a partition are completely identified by their node ids (for an exception, see Section 14.8.2). Dynamic reconfiguration requires the use of several types of participant lists. It is assumed that the implementation supports hardware structures reflecting the following participant lists:
— NodeIDs of agents that participate in a snoop coherence domain
— NodeIDs of agents that participate in a quiesce operation.
• The CSI support required for OL_* actions is a mechanism to quiesce all CSI agents in the partition. For a partition to support OL_* events, all the CSI agents in that platform must support the quiesce operation.
• The SSP and the firmware are aware of the connection topology and can prevent OL_A operations that require transcending hard partition boundaries. A hard partition must be converted to a firm partition before the OL_A operation can be supported.
• Some types of on-line addition (OL_A) and on-line deletion (OL_D) changes may not be supported by the platform, and the firmware is capable of disallowing such configurations. For example, OL_* events on specific FRU boundaries may not be allowed, to keep the resulting topologies regular and to yield deadlock-free routing; these limitations are imposed by system management, not by CSI.
Each CSI link has several built-in functions to enable OL_*. The functional hardware support needed in the various nodes is outlined below. A variety of implementations of these functions are possible:
• CSI Interface Control and Status Register for each link: Reflects the state of a link port and the progress of link initialization. This CSR can be queried by the system management channels or by in-band firmware to verify a variety of conditions related to a link’s status: link is connected to a remote component, idle flits detected, idle flit acknowledgment detected, link initialization status, enable CSI link initialization and framing sequence, etc. This CSR must be accessible through the configuration space.
• Means to assess (through configuration space) each CSI agent’s node id, node initialization status, routing tables, and system address map tables.
• Means to interrupt the Running System (RS) after an OL_A module has been added to the system, and the capability to enable/disable such an interrupt, i.e., generation of a PMI/SMI/vectored interrupt targeted to the firmware execution on the RS.
This interrupt may be generated on the RS when the link controller hardware recognizes a change in the link state, when both ends of the link have successfully completed Link/Protocol layer initialization, or on both events. With the first option, the firmware is required to check for successful Link layer initialization before interacting with the OL_* node.
• Means to interrupt the Running System after a module has been quiesced for removal, e.g., generation of a PMI/SMI IPI to the SAL/BIOS firmware execution on the RS.
• Means to bring each component of a module on-line, as part of the current partition: processor, memory, memory hub, I/O hub, and CSI links. The quiescing and on-lining require interaction among the hardware, firmware, OS, and possibly the SSP. Hence, we illustrate the support needed by describing a fairly abstracted flow for on-line addition of a node in Section 14.5.
• Means to quiesce and off-line each component of a module: processor, memory, memory hub, I/O hub, and CSI links. The quiescing and off-lining require interaction among the hardware, firmware, OS, and possibly the SSP. Hence, we illustrate the support needed by describing a fairly abstracted flow for on-line deletion of a node in Section 14.6.
• Means (CSR) to specify which of the two routing tables (primary or alternate) is being used.

14.3.4 Configuration Space and Associated Registers
This section provides a list of all the Configuration and Status registers for dynamic reconfiguration operations from a functional perspective. The Component Specifications will provide additional details, including the register name(s) for each function listed in Table 14-2.

Table 14-2. CSI Control and Status Registers Needed for Dynamic Reconfiguration Operations (register functions):
• Interrupt generation on link events: Enable/Disable flag for interrupt generation; target core’s ID, EID, NodeID; interrupt delivery type (SMI, PMI, vectored interrupt, etc.); interrupt vector#, if applicable. Ability to generate a programmable interrupt to the Running System on a link state change due to the addition of a neighbor and on successful link initialization. These CSRs are needed on a per-link basis.
• Link status: Physical link initialization status CSR for each link; ability to poll the status of each link.
• Error notification: NodeID, interrupt type, and Enable/Disable flag for error notification; target NodeID to receive notification for errors on active/quiesced non-processor agents.
• Route table selection: Indicator for the selected route table; specifies whether the primary route tables, the alternate route tables, or both should be active.
• Memory mirroring: Mirror target, the Node Id of the secondary memory agent which mirrors the memory of the primary memory agent; maintained by the latter.
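Taken together, the Table 14-2 functions group naturally into a per-link record and a partition-scoped record. The C layout below is only a visualization aid; all names, fields and widths are editorial assumptions, since the authoritative layouts are given in the Component Specifications.

/* Illustrative grouping of the Table 14-2 register functions. All names,
 * fields and widths below are editorial assumptions; the authoritative
 * layouts are given in the Component Specifications.                   */
#include <stdint.h>

struct link_dynreconf_csrs {            /* one instance per CSI link    */
    uint8_t  link_intr_enable;          /* PMI/SMI on link state change */
    uint16_t intr_target_core;          /* core ID/EID/NodeID           */
    uint8_t  intr_delivery_type;        /* SMI, PMI or vectored         */
    uint8_t  intr_vector;               /* vector number, if applicable */
    uint32_t link_init_status;          /* pollable init/link status    */
};

struct partition_dynreconf_csrs {       /* partition/agent scoped       */
    uint8_t  err_notify_enable;         /* error notification control   */
    uint16_t err_notify_target;         /* NodeID receiving notification*/
    uint8_t  route_table_select;        /* primary, alternate, or both  */
    uint16_t mirror_target_nodeid;      /* secondary (mirror) mem agent */
};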
14.3.5 Need for a Quiesce During OL_* Events
Some procedures are required after the CSI links on the RS and the OL_A node are both working, such as updating the routing tables as described in Section 14.7. Most OL_* operations would require quiesce functionality. Some examples of where a quiesce is required are described below:
• The addition/deletion of a CSI agent that causes changes to the route tables of other source-destination pairs in a running OS partition. A quiesce is not necessary from the perspective of route table changes if the platform supports the Flexible Option (described in Section 14.7.3), which uses both the virtual networks VN0 and VN1. This option cannot be used in a few cases:
— In system topologies which use both networks for deadlock-free routing, such as ring-based networks.
— In topologies which rely on adaptive routing using VN0 and VN1.
— In non-scaled (Source Broadcast) systems that require an ordered Home channel. This is because a transaction on VN1 (using the alternate route table) can overtake an older transaction on VN0 (using the primary route table); see Section 14.7.3 for additional details.
• The addition/deletion of a CSI agent that requires changes to snoop participant lists in multiple CSI agents, or to route table entries for other source-destination pairs, in a running system.
• Memory migration situations where the source address decoders on various CSI agents need to be revised atomically.
• Replacement of a Home Node memory agent in a system with memory mirroring, which requires changes to the Home Node field in the source address decoders of multiple CSI agents.
• Replacement of the current Lock_Master.
A quiesce may be avoided during some limited OL_* situations, as below:
• Links are added or removed and the topology change does not affect any of the existing source-destination pairs (e.g., hard to firm partition, or vice versa).

14.4 Use of System Service Processor during OL_* Operations
The SSP can perform modifications to the CSI structures using service management channels. It can also invoke the in-band firmware (SAL/BIOS) by generating a PMI/SMI interrupt and request the in-band firmware to perform some of the initialization steps. The interfaces between the SSP and the in-band firmware are OEM implementation dependent. The advantages of SSP usage are:
• CSR modifications may be done in the background, particularly if the component has primary and alternate structures. Modifications in the background minimize the interruption and/or impact to the OS.
• Minimize the number of interrupts to firmware involving PMI/SMI. The firmware usage model of some of these platform interrupts involves rendezvousing all the processors in the system/partition to the PMI/SMI execution mode, and this may impact OS latency expectations (PMI does not require a rendezvous, though current firmware implementations use one). Some examples of minimizing PMI/SMI transitions are described below:
— To initiate the OL_A operation, firmware on the RS must enable the link to the OL_A, and this requires a transition to the PMI/SMI layer on the RS. The SSP can avoid this transition by programming the link controller CSRs on the RS in the background, i.e., setting up its link controllers to default values, programming them with the triplets of {NodeID, AgentType and Crossbar Port number} for each CSI agent present on the RS, and turning on the link enable flag. Similarly, the SSP can set up the corresponding CSRs on the OL_A for the links to be activated.
— Successful link initialization can be configured to generate a PMI/SMI to the RS, in order that in-band firmware on the RS may recognize the new neighbor, revise the various participant lists, route tables, etc., and provide the boot mode indicator of HotAdd to the OL_A. The PMI/SMI generation can be avoided by using the SSP. The SSP can recognize link completion by polling the relevant CSRs on both ends of the link, provide suitable boot mode indications to the OL_A, monitor the progress of the OL_A, wake up the OL_A processors as necessary, perform some of the CSR modifications on both the RS and the OL_A nodes in the background, and involve the in-band firmware on the RS only when absolutely required.
— During an OL_D operation, the Quiescing Processor (QP) signals the RS with a PMI/SMI when it reaches the low power sleep state. An alternative approach involves the SSP: the QP signals the SSP, the SSP performs some of the operations on the RS in the background, and the firmware on the RS is involved only afterwards.
• CSR resources on the OL_A can largely be programmed by the SSP without involving the firmware execution on the RS; this is particularly useful if the OL_A is a non-processor node.
• Implement domain perimeter protection. In-band firmware of various partitions can be prevented from accessing CSRs of other hard partitions by the SSP enforcing protection using the Source Address Decoders.
• The need for co-ordination between the firmware of different partitions within the system, and for fine grain synchronization between the firmware and the SSP, is minimized if most of the CSR resources are controlled by the SSP. This leads to an overall simplification of the firmware. There are some operations that must be performed by in-band firmware and cannot be done by the SSP, since the server management channels typically don’t provide the required access paths. Some examples are described below:
• Revision of firmware data structures in memory.
• Invocation of procedures in other firmware layers, such as the Itanium processor family PAL, setting up of processor registers, etc.
Some disadvantages of the SSP approach are:
• Server management channels are typically slow, and this can pose latency problems if a large number of CSRs need to be revised while the system is quiesced, as part of an OL_* operation.
The following sections describe the OL_* flows, with the operations primarily being performed by the firmware. There are multiple possible ways of performing these steps. As described above, a number of these steps can be performed by the SSP, by itself or in co-operation with the firmware. The division of responsibility and the interfaces between the SSP and the firmware are OEM implementation dependent and will be described in platform specific documents. Such details are beyond the scope of this document.

14.5 On Line Addition of a Node
In this discussion, RS is the Running System and OL_A is the module which is added. Depending on the system configuration, the OL_A could be a) processor(s) with memory and I/O, b) processor(s) with memory, c) processor(s) only, d) I/O Hub only, or e) memory only. When multiple CSI agents are added simultaneously as part of a module addition, the initial indication is received by the firmware, and the firmware can decide the integration order. The integration can be done one CSI agent at a time or all the CSI agents together.

Figure 14-5. Illustrating Addition of a Node to a Running System (resource added: a CPU node with a direct connection to the IO+FW agent joins Partition 2; each partition contains a chipset with firmware, and legacy and non-legacy I/O agents.)

For the inserted processor, and from its perspective, the initialization sequence is exactly like a cold Reset of the inserted hardware. The RS must provide a signal (hereafter referred to as the HotAdd indication) to the OL_A in an implementation dependent platform resource, such as an MMIO or CSR register. Alternatively, such a HotAdd indication may be provided by the SSP in a CSR. The HotAdd indication serves to limit the scope of platform initialization done by the OL_A node. Refer to Section 12.9, “Support for On-Line Addition” on page 12-383 for additional details. The SSP determines whether new hardware will create a new partition or join an existing partition, and the firmware/SSP will program the configuration registers on the various CSI agents accordingly. The power on reset and initialization sequences on OL_A nodes are fully described in the Reset and Initialization chapter of this document. As described in that chapter, some of the Embedded/Direct Connect firmware actions described below can be performed by hardware that is part of the configuration agent on some implementations. CSI link controllers check for the presence of a new neighbor. If they detect the addition of one, as evidenced by successful Link layer initialization, they will issue an interrupt packet with the PMI/SMI/regular interrupt delivery type to a predefined processor thread on the RS, if programmed to do so. The target processor identifier and interrupt characteristics are specified in CSR resources on the RS’s configuration agent.
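Concretely, a HotAdd indication can be as simple as a strap or CSR bit sampled early by the OL_A’s firmware. The C sketch below shows the idea; the platform resource address, bit position, and accessor are all hypothetical.

/* Hedged sketch: OL_A boot firmware sampling a HotAdd indication to
 * restrict initialization scope. Address, bit and accessor are
 * hypothetical; real platforms may use straps or an SSP-written CSR. */
#include <stdint.h>
#include <stdbool.h>

#define PLATFORM_BOOTINFO_ADDR  0xFEB00000ull  /* assumed platform resource */
#define BOOTINFO_HOTADD_BIT     (1u << 0)

extern uint32_t mmio_read32(uint64_t addr);    /* hypothetical accessor */

static bool entered_via_hot_add(void)
{
    return (mmio_read32(PLATFORM_BOOTINFO_ADDR) & BOOTINFO_HOTADD_BIT) != 0;
}

void ol_a_early_init(void)
{
    if (entered_via_hot_add()) {
        /* Limit scope: initialize only the FRU being added; the RS/SSP
         * owns the rest of the platform state.                        */
    } else {
        /* Cold boot: run the full Reset and Initialization sequence.  */
    }
}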
14.5.1 Online Addition of a Processor Node (With or Without Other Agents)
A detailed description of the CSI-specific steps is provided below. The steps for online addition are quite similar with and without the presence of an SSP. The steps performed by the SSP, the OS, the firmware, and the other software layers are described at a high level only.
Step 1: Online Node Addition (physical/logical)
• The SSP and/or the RS firmware is aware of the platform topology and the CSI links to which the OL_A FRUs will be connected. It sets up the appropriate link controllers on the RS with the triplets of {NodeID, AgentType and Crossbar Port number} for each CSI agent present on the node(s) to which the OL_A is being connected. It also programs one or more configuration agents on the RS associated with such CSI links to generate a PMI/SMI interrupt to one or more processor agents on the RS when link initialization is completed. Note that it is not necessary to arm all the links between the OL_A and the RS to generate a PMI/SMI interrupt. Multiple simultaneous PMI/SMIs targeted to the same processor will appear as a single PMI/SMI at the target’s firmware layer, and the firmware is required to handle this condition. Thus, in complex OL_A scenarios, the SSP/firmware must read the status registers of the various links to ensure that all the expected links are operational. The arming of the PMI/SMI interrupt is not performed where the OL_A node is part of the running system with already active links. Such a situation arises, for example, when a processor agent is logically removed from one partition and added to another.
• The OL_A node is inserted; this insertion is either done physically or through the SSP granting permission to include an existing node in an OS partition as the OL_A node. If a physical insertion is involved, the insertion triggers a power on reset sequence on the OL_A node, the out-of-band signals on the OL_A are enabled (SMBus, JTAG, etc.), and at this point the SSP is capable of accessing the CSR resources on the OL_A.
• If an SSP is present, it sets up the OL_A’s link controllers with the triplets of {NodeID, AgentType and Crossbar Port number} for each CSI agent present on each FRU being added to the RS. In the absence of an SSP, the firmware (or microcode) on the OL_A performs this function.
The firmware or the configuration agent within the OL_A may read some platform signals or strapping pins to derive its own NodeID information. These steps are identical to the power on reset and initialization flows described in the Reset and Initialization chapter. This step is not performed where the OL_A node is already part of the system with active links.
Step 2: CSI links between RS and OL_A enabled
• The SSP or the firmware on the RS issues a command to the link controller to perform the physical and Link layer initialization and the exchange of Link/Protocol layer parameters. At the end of a successful link initialization, each side of the link latches the information about the CSI agents present on the other end of the link, i.e., the triplets of {NodeID, AgentType and Crossbar Port number}. This step is not done if the OL_A is already part of the system and the link is active.
• One or more processor agents on the RS receive PMI/SMI in recognition of the OL_A event. There may be some CSI links between various CSI agents within the OL_A nodes that also need to be initialized. The firmware execution on the RS proceeds with the integration of the OL_A when all the expected CSI links are operational. In a multi-partition RS, the firmware execution on the RS that receives the PMI/SMI interrupt is aware of the partition to which the OL_A must be integrated, and it passes on the PMI/SMI indication to the firmware execution on the right partition, if necessary. Such an indication may be signaled by a write to a CSR on the target that generates the PMI/SMI, by using a platform resource, or by using the SSP. The PMI/SMI interrupt generation due to an OL_A event does not occur where the OL_A node is part of the running system with already active links (e.g., route-through OL_A). The OL_A can instead signal a PMI/SMI to the RS as described above. At this point, the RS is capable of accessing the CSR resources of the OL_A.
• This step is firmware and platform implementation dependent. The system interface may provide a mechanism by which the OL_A components are set to a Halt state awaiting a wake-up by the RS. Such an indication may be provided with the use of Boot mode straps that are read by the OL_A at Reset de-assertion, by using system/socket layer parameters during Link layer parameter exchange, or by the SSP setting CSR values of the OL_A’s configuration agent. The SSP and/or the firmware execution on the RS co-operate to set up the Route Table, Address Decoders and other CSI structures on both the RS and OL_A components. At the end of this programming, the OL_A can access full/limited resources on the RS. Firmware can be designed to protect common resources, such as the interconnect fabric, from errant accesses by the OL_A until the OL_A components are tested successfully. Refer to Section 12.9, “Support for On-Line Addition” on page 12-383 for additional details.
Step 3: Set the path to Firmware
• Each processor core needs a path to the firmware agent. If the OL_A complex has a firmware agent, it can be used. If the OL_A node gets connected to a firmware agent on the RS, that firmware agent can be used by the OL_A. If neither of these options is available, firmware accesses from the OL_A can be routed through a node on the RS which acts as a conduit to the firmware agent. In such a situation, the SSP or the firmware on the RS sets up the address decoders and route tables on the OL_A nodes to route firmware accesses.
There may be additional firmware/OS requirement for the firmware versions on the RS and the OL_A nodes to be identical. Such issues are beyond the scope of this document. • The SSP or the firmware execution on the RS writes to an implementation dependent CSR on the OL_A to release it from halt state, if the OL_A is waiting for such a signal. They may also provide some configuration values in platform resources, such as HotAdd indication to limit the scope of platform discovery by the OL_A. Step 4: OL_A node runs self test and performs initialization • Each processor core on the OL_A tries to become the node boot strap processor (NBSP). In one possible implementation, the winner may be decided using a simple race to set a predetermined configuration register flag. • In some architectural configurations, the firmware running on the NBSP also initializes and enables some functionality of the memory bridge or the memory controller. The NBSP initializes and tests memory, if present, on the OL_A node. The size and memory gap information is communicated to the OL_A by the SSP/firmware on the RS, using configuration registers or other implementation dependent platform resources. • Additional core initialization and testing which requires scratch pad memory is completed. The firmware on the OL_A node and the firmware on the RS communicate with each other using IPIs or implementation dependent platform resources. If the OL_A node has no local memory, its memory needs can be satisfied by memory on the RS. Such memory must be mapped as non-coherent on both RS and OL_A as the OL_A nodes are not part of the running system’s coherency domain yet. Non-coherent accesses also insulate the RS from errant accesses by the OL_A. If the OL_A is expected to boot an independent OS, the OL_A node must have its own memory and the co-ordination between the firmware executions on the RS and the OL_A is not applicable. Step 5: Indicate to firmware execution on the RS that the OL_A is ready to join the partition(s) Ref No xxxxx 413 Intel Restricted Secret Dynamic Reconfiguration Dynamic Reconfiguration • If the OL_A is expected to join an existing OS partition, the firmware execution on the OL_A sends an indication to the firmware execution on the RS. The firmware executions on either end exchange information in an implementation dependent manner. The firmware on the RS becomes aware of the health of the processors on the OL_A, the memory and I/O Hub resources added by the OL_A. • If the OL_A causes changes to the route tables of any partitions within the RS (or optionally, the entire RS), the firmware performs a quiesce of all the CSI agents on affected partitions, updates the route tables and then wakes up the quiesced CSI agents. Similarly, the Route tables of the OL_A are programmed/re-programmed as necessary. Also, refer to Section 14.3.5, “Need for a Quiesce During OL_* Events” for further details. • If the OL_A is expected to form a new partition, it is woken up if necessary and the firmware execution on the OL_A proceeds to boot its OS. The rest of the steps below describe the scenario where the OL_A will be joining an existing partition. • The OL_A and the RS update their system address maps and address decoders to incorporate the new resources. The RS may need to revise the participant lists (for snoops, interrupt broadcast, Lock, etc.) in various CSI agents of the OS partition. If these lists are not revised in an atomic or consistent manner, protocol violations may occur. 
• The firmware executions on the RS and the OL_A processor nodes interact using implementation specific mechanisms and enable coherence traffic with each other. • The firmware execution on the OL_A nodes informs the firmware execution on the RS of the boot progress, then waits in a loop for a wake up by the partition’s OS. Step 6: Indicate to the OS on the RS that the OL_A nodes are ready to join the partition(s) • The firmware execution on the RS builds appropriate ACPI tables and generates an interrupt to the OS using ACPI mechanisms, to notify the addition of resources. If resources are being added to multiple OS partitions, the firmware on the RS will send the interrupts to multiple partitions. • The OS identifies the resources of the OL_A node using ACPI methods/interfaces. If I/O Hub, bridges and devices were added, the OS will load the appropriate device drivers for the newly added resources on the I/O Hub. If memory was added, the OS may begin using such memory. • The OS wakes up the processors on the OL_A node and executes the OS initialization steps on such processors. 14.5.2 Online Addition of a Memory only Node The steps are quite similar to the addition of a processor node and only high level differences are described in detail here. If the memory node has a processor agent also, refer to Section 14.5.1, “Online Addition of a Processor Node (With or Without Other Agents)” . Any issues pertaining to re-interleaving of memory are not discussed in this section. • The memory node is powered on and the out-of-band signals, if any, are enabled. The SSP or the configuration agent within the OL_A memory node sets up its link controllers with the triplets of {NodeID, AgentType and Crossbar Port number} for each CSI agent present on the FRU being added to the RS. • During Link/Protocol layer initialization, the RS recognizes the addition of a Memory agent from the AgentType parameter exchange. • The addition of the OL_A Memory agent generates a PMI/SMI to the firmware execution on the RS. 414 Ref No xxxxx Intel Restricted Secret • If the OL_A causes changes to the route tables of any partitions within the RS, the firmware may need to perform a quiesce of all the CSI agents on such partitions (or optionally, the entire RS), update the route tables and then wake up the quiesced CSI agents. Typically, a pure Memory agent is a leaf node and route table changes to the RS can be made without a quiesce. Also, refer to Section 14.3.5, “Need for a Quiesce During OL_* Events” for further details. • The SSP/firmware on the RS initializes and tests memory on the OL_A node. • The RS updates its system address map and address decoders to incorporate the new resources. The target address decoders at the Memory agent are set up to address the newly added memory. • The firmware execution on the RS builds appropriate ACPI tables and generates an interrupt to the OS using ACPI mechanisms, to notify the addition of resources. If resources are being added to multiple OS partitions, the firmware on the RS will send the interrupts to multiple partitions. • The OS(s) on the RS integrates the memory with the OS partition(s). 14.5.3 Online Addition of an I/O Hub Node only The steps are quite similar to the addition of a memory node and only high level differences are described here. 
If the I/O node has a processor agent also, refer to Section 14.5.1, “Online Addition of a Processor Node (With or Without Other Agents)”.
• The SSP or the configuration agent within the OL_A node sets up its link controllers with the triplets of {NodeID, AgentType and Crossbar Port number} for each CSI agent present on the FRU being added to the RS.
• During Link/Protocol layer initialization, the RS recognizes the addition of an I/O agent from the AgentType parameter exchange.
• The addition of the OL_A I/O agent generates a PMI/SMI to the firmware on the RS.
• If the OL_A changes the route tables of any partitions within the RS, the firmware must perform a quiesce of all the CSI agents on such partitions (or, optionally, the entire RS) and then update the route tables.
• The RS determines the MMIO and I/O address ranges required for the downstream busses and devices. It updates its system address map and address decoders to incorporate the new resources. The target address decoders at the OL_A I/O agent are set up to address the newly added MMIO and I/O resources, and the source address decoders at the I/O agent are set up to address the RS memory and processor resources.
• The participant lists for Snoop at the various CSI agents must be revised to include the OL_A I/O Hub agent, if such an agent is a caching agent. Similarly, if the OL_A were to act as the Lock_Master, the appropriate CSRs on the various CSI agents are initialized.
• The CSI agents that were quiesced are then woken up.
• The firmware execution on the RS builds the appropriate ACPI tables and generates an interrupt to the OS using ACPI mechanisms, to notify the addition of resources. If resources are being added to multiple OS partitions, the firmware on the RS will send the interrupts to multiple partitions.
• The OS(s) on the RS recognizes the OL_A event, probes the I/O busses downstream of the I/O Hub, discovers devices, loads the appropriate device drivers for such devices, and then integrates the devices with the OS partition(s).

14.6 On Line Deletion of a Node
In this discussion, OL_D is the node which is deleted from the running system (RS). Depending on the system configuration (described in Chapter 2, “Platform Scope”), the OL_D node could be a) processor(s) with memory and I/O, b) processor(s) with memory, c) processor(s) only, d) I/O Hub only, or e) memory only. The deletion procedure is explained through a series of steps.

Figure 14-6. Illustrating Removal of a Node from a Running System (resource removed: a CPU node leaves Partition 2; each partition retains its chipset with firmware, and legacy and non-legacy I/O agents.)

Online deletion differs from online addition in the sense that the steps for removal are initiated at the OS layer prior to the firmware. When multiple resources are being removed (processor, memory, I/O Hub, etc.), the order in which the OS removes the resources is OS specific, and the firmware cannot make any assumptions about the ordering. The most likely scenario is for the OS to remove the devices downstream of the I/O Hub and the I/O Hub first, then the processor agents, followed by the Memory agents. Removal of the Crossbar, preparatory to the physical removal of a CSI component, is initiated by firmware or through system management channels.

14.6.1 On Line Deletion of a Processor Node
Step 1: OL_D Node removal is requested
• The request is made to the OS through system management channels.
Step 2: Off-line processor(s) of the OL_D node
The following actions are taken on the various processors (actions by the Quiesced Processor (QP) and by the RS are identified separately).
• (QP) The OS moves applications and interrupts away from the QP processor and stops scheduling work on the QP. It also disables interrupts on the QP.
• (QP) Stop prefetches and speculative accesses by removal of TLBs. Stopping prefetch avoids errors due to references to memory that may be mapped out by address decoder changes.
• (QP) The OS uses an ACPI mechanism to signal that it is no longer using the QP node, and the ACPI mechanism enters the firmware layer in an implementation dependent manner. The firmware layer does the following:
— On IA-32 systems, revise the appropriate participant lists to ensure that the QP is not the target of subsequent VLW, Lock and broadcast IPI transactions.
— Firmware ensures that any interrupt transactions in-transit are pended to the QP. This step is implementation dependent and may involve polling various I/O Hubs in the system for pending interrupts. The reads to the IOHs cause draining of outstanding interrupts from the IOHs. The firmware execution on the QP must send a PMI IPI to itself and verify receipt of the PMI. This ensures that any earlier interrupts and IPIs are delivered to the QP core. If new interrupts are pending, the firmware returns to the OS, and the OS is expected to initiate the set of actions starting with Step 2.
— On Itanium processors, the firmware stops pre-fetches to its memory areas accessed in writeback with limited speculation (WBL) memory attribute, using the PAL_Prefetch_Visibility procedure.
— Firmware performs a flush of the processor and platform caches, and invokes the necessary instructions to flush the Write Coalescing (WC) buffers.
— Executes from firmware space to avoid reliance on Running System memory.
— On Itanium processors, invokes the PAL_SHUTDOWN procedure with the option to write an IPI to the RS with a delivery type of PMI. On IA-32, generates an IPI to the RS with a delivery type of SMI, followed by the necessary steps to reach the lowest power system sleep state. Alternatively, the firmware on the QP may create a regular vectored interrupt (e.g., SCI) to the OS on the RS and expect the OS on the RS to enter the firmware to perform the functionality described in Step 3 below.
— After this stage, any occurrence of MCA/INIT/PMI/SMI events does not wake up the cores. Only a Reset event can wake up such sleeping cores.
Step 3: Remove QP from various participant lists
• (RS) The firmware on the RS does the following on receipt of the above PMI/SMI IPI:
— Perform a quiesce to ensure that all in-transit transactions, such as snoops to the QP, are completed.
— Remove the QP from the various participant lists on the RS.
— Remove the QP from the directory/snoop-filter structures, if any.
— Change the Address Decoders of the QP such that firmware and data accesses don’t get out of the QP node.
— Remove QP references from the firmware data structures for the partition. An MCA on the RS is no longer reflected to the QPs.
— If the QP nodes are physically removed from the system, the firmware takes actions to remove the Crossbar also. Removal of the Crossbar involves changes to the routing tables on multiple CSI nodes to bypass the QP node. Refer to Section 14.7, “Multi-Partition Management with Shared Interconnect” for multiple options for accomplishing these changes.
Further, in a system with hard partitions, if the links from the OL_D node to the RS were to be disabled, Route table updates will be necessary. In such cases, the firmware may need to perform a quiesce of the affected CSI agents on the RS, update the route tables, and then wake up the quiesced agents. Also, refer to Section 14.3.5, “Need for a Quiesce During OL_* Events” for further details.
— Wake up the quiesced agents on the RS.
— Notify the OS on the RS of successful removal completion using an ACPI mechanism (e.g., using an SCI interrupt).
Step 4: Generate a signal that the OL_D node can be removed
• (RS) The OS receives the interrupt from the firmware to indicate that the OL_D is complete. It may provide the completion indication to an SSP or system management application if the action was initiated by it. If physical removal of the OL_D node(s) is required, the OS may run some ACPI methods to provide a visual indication, e.g., turn on an LED on the removal slot.

14.6.2 On Line Deletion of a Memory Node
Following are the steps in removing a memory node from a partition. Depending on the memory topology and interleaving, some of the steps may be optional.
• The OS migrates applications and device drivers using the memory being removed to other memory segments. The OS may flush pageable memory to disk, or, if enough spare memory is available, copy the OL_D memory contents to another region of memory. There are some situations requiring special actions:
— If the memory being removed is assigned to a DMA device, the device driver must be shut down or assigned a new buffer.
— If the OS design permits, the OS can remap the kernel memory area, which is part of the memory being off-lined, to other on-line memory.
— If memory is interleaved and the memory interleave granularity (across CSI nodes) is smaller than an OS page, then memory from multiple CSI nodes that share the same OS page as the OL_D node will need to be paged to disk. In the extreme case, all of the pageable memory in the partition will need to be paged to disk.
— Special interactions with firmware are required for areas accessed by the OS in physical addressing mode, such as the OS_MCA and OS_INIT entry points registered with the SAL. A similar requirement exists for firmware memory areas accessed in physical addressing mode (merely copying to a different physical address will not work).
Note: The limitations mentioned above can be overcome using the Memory Migration techniques described in Section 14.9.1 below.
• The OS stops pre-fetches and speculative accesses to the memory being removed, using removal of TLBs, and then transfers control to the firmware layer.
• On Itanium processors, the firmware stops pre-fetches to the memory being off-lined, using the PAL_Prefetch_Visibility procedure.
• The firmware flushes the processor and platform caches for the address range.
• If I/O agents have caches, the firmware takes steps to drain such caches and cause an update of memory.
• The firmware performs a quiesce operation to ensure the completion of in-transit transactions to removed resources. A loose timeout can also work. A pure Memory node that does not perform route-through functions is a leaf node; hence a quiesce can be avoided if the route table changes on the RS are merely the removal of entries for the memory being removed.
• The firmware then changes the Address decoders on the CSI agents to avoid references to removed resources.
The OS would see a “hole” in its address space and must not generate a request to the off-lined memory in future. • Firmware then releases any quiesced CSI agents on the RS to resume their operation. • Firmware notifies the OS of removal completion using ACPI mechanisms and the OS may provide platform indications such as signaling the SSP or turning on a LED, etc. • If the Memory node is shared by multiple OS partitions, these steps are done on all such partitions. 418 Ref No xxxxx Intel Restricted Secret • Notes: — There may be some restrictions (TBD) on removal or remapping of PAL and SAL code/data associated with the protected configuration mode. — Repeated deletion and addition of memory could fragment the OS view of memory and result in non-availability of large chunks of memory or in running out of address decode registers. The whole system may then need to be quiesced so that all of the memory in the partition is reallocated (re-interleaved). 14.6.3 On Line Deletion of an I/O Hub Node The OS is typically unaware of the I/O Hub and bus bridges leading to devices. At a high level, the OS takes actions to stop using resources being removed, then calls firmware to modify the platform resources controlled by the firmware. The firmware removes references to resources in CSI structures such as Address decoders, Participant lists, etc., then notifies the OS of completion. These steps are described below with emphasis on CSI specific flows: • The OS notifies device drivers of I/O devices being removed. If a device driver cannot be shut down, it will reject the request and the OS will not proceed with the removal. Such a rejection may have impact on removal of other resources from the system. For example, if the device driver has been assigned a memory buffer for DMA operations and the device driver cannot relinquish such a buffer, the memory node associated with the buffer cannot also be removed. • If the device drivers are stopped, OS reclaims the memory associated with their code and data areas. The OS then gives control to the firmware layer using ACPI mechanisms to perform actions such as the following. — If I/O agents have caches, the firmware takes steps to drain such caches and cause update of memory. — The firmware performs a quiesce operation and ensures completion of in-transit transactions to removed resources. — On IA-32 systems, if the removed I/O Hub functioned as a Lock_Master, designate another Lock_Master, set up the Lock_Scope register on the new Lock_Master, and revise the Lock-Target registers on various CSI agents on the RS. Similar steps are required if the removed I/O Hub maintained broadcast lists for interrupt delivery (physical destination with destination shorthand or logical destination in flat/cluster addressing mode), and such broadcast functionality has to be migrated to another I/O Hub. — Revise the Address decoders on RS’s CSI Agents to avoid references to the removed resources. — Remove the I/O Hub CSI agent(s) as a target from Participant lists such as Snoop, Directory, Snoop-filter, etc. — Firmware then releases any quiesced CSI agents on the RS to resume their operation. — Firmware notifies the OS of removal completion using ACPI mechanisms. — If the I/O node is part of multiple OS partitions (such as an IOH shared by multiple OS partitions), these steps are done on all such partitions. • Notes: — Removal of Firmware Agent, Compatibility PCI Bus located on the OL_D I/O Hub, etc., must be co-ordinated with other processors on the RS. 
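The Section 14.6.2 steps compose into one firmware routine. The C sketch below is a minimal illustration under stated assumptions: every helper is a hypothetical stand-in for an OS-, PAL- or implementation-defined service (PAL_Prefetch_Visibility is the only name taken from the text, and even its wrapper here is assumed).

/* Hedged sketch of the Section 14.6.2 memory off-line sequence.
 * All helpers are hypothetical stand-ins for OS/PAL/implementation
 * services; only the ordering follows the text above.               */
#include <stdint.h>

extern void os_migrate_pages_off_node(int mem_node);  /* OS: move/flush pages   */
extern void stop_speculative_fetches(int mem_node);   /* e.g., via PAL_Prefetch_Visibility */
extern void flush_caches_for_range(uint64_t base, uint64_t len);
extern void quiesce_domain(void);                     /* drain in-transit traffic */
extern void remove_decoder_entries(int mem_node);     /* on all CSI agents      */
extern void dequiesce_domain(void);
extern void notify_os_removal_complete(void);         /* e.g., ACPI/SCI         */

void offline_memory_node(int mem_node, uint64_t base, uint64_t len)
{
    os_migrate_pages_off_node(mem_node);  /* OS vacates the memory first   */
    stop_speculative_fetches(mem_node);   /* no more prefetches to range   */
    flush_caches_for_range(base, len);    /* processor + platform caches   */
    quiesce_domain();                     /* complete in-transit accesses  */
    remove_decoder_entries(mem_node);     /* address space now has a hole  */
    dequiesce_domain();                   /* release the quiesced agents   */
    notify_os_removal_complete();
}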
Ref No xxxxx 419 Intel Restricted Secret Dynamic Reconfiguration Dynamic Reconfiguration 14.7 Multi-Partition Management with Shared Interconnect The router on the processor socket node can be used as a route through component - it is possible in such a case that the router is part of the shared interconnect of several partitions. Consequently, with OL_D, since some routing paths no longer exist, the partition from which the node is removed and possibly other partitions are affected. With OL_A, some routing paths are added, which affects the partition to which the node is added and possibly other partitions (if the node has a route- through component). Please see Section 14.3.5 for specific situations which require route table changes during OL_* events. CSI platforms support OL_* operations without the need to quiesce partitions which are part of the shared interconnect. From a routing perspective, OL_* events are handled by switching from primary routing tables to alternate routing tables. Primary routing tables handle routing in the topology prior to the OL_* event while alternate routing tables handle routing in the modified topology induced by the OL_* event. Definition of Affected Partition: If an OL_* event results in the routing path to change for any pair in a partition P, then P is an affected partition. It is assumed that the firmware is able to determine the affected partition resulting from an OL_* event. There are two primary ways in which the transition from the primary routing table to the alternate routing table can be handled. Each option is described in the following sections. 14.7.1 Restricted Option The basic idea behind this scheme is to first quiesce the affected partition - this may, in fact, be the whole system - and then switch over so that all agents belonging to the affected partition use the alternate routing tables. Quiescence is achieved by a scheme such as the one outlined in Section 14.3.2. In this section, additional details pertaining to the hardware option to quiescing are provided (details on the firmware option will be provided in a future revision). A generic description of this scheme is illustrated in Figure 14-7 and by the following steps: • After the OL_* event is requested, the firmware detects all the CSI agents in the affected partition. • The firmware achieves quiescence of the affected partition using the CSI_QUIESCE primitive described in Section 14.3.2 or an alternative scheme which achieves the same result. • Concurrently, the firmware (executing on the initiating core), also begins loading the alternate routing tables for each CSI agent in the affected partition (this would also include the routing tables at each CSI link of an intermediate router, for e.g.). • After the alternate routing tables are loaded and quiescence is achieved, the firmware dequiesces the affected partition using the CSI_DEQUIESCE primitive described in Section 14.3.2 or an alternative scheme which achieves the same result. • From the routing perspective, the OL_* event request is now complete. • The alternate routing table is now relabeled the primary; the original primary table is now ready to be updated for the next OL_* event (or any other event). 420 Ref No xxxxx Intel Restricted Secret Figure 14-7. 
Figure 14-7. Multi-Partition Management: Restricted Option. [Figure: timeline from the Alternate Routing perspective. OL* event request; FW does config write; FW sends a message requesting each “source” CSI agent in the affected partition(s) to quiesce using a mechanism such as CSI_QUIESCE; Alternate RT loading by FW (for the altered topology) begins and ends; each agent goes into suspended animation (generates no transactions) while all route-throughs complete and reach their destination using the original RT; affected partition(s) are quiesced at the end of this period; FW detects both events done (uncorrelated events); OL* event request granted; FW dequiesces the affected partition(s) using a mechanism such as CSI_DEQUIESCE; agents come out of suspended animation and start using the Alternate RT. Legend: OL*: On-line Addition/Deletion; RT: Route Table; FW: Firmware.]
Note: “source” CSI agent means each CSI protocol agent in the system or the affected partition(s). Note that in this scheme: • Two CSI transactions, one of which uses the primary routing table and the other which uses the alternate routing table, never overlap in time. • The alternate routing table uses the same set of virtual networks as the primary routing table (it is assumed that routing algorithms induced by the alternate routing tables are deadlock-free). • The routing tables at each agent need to store both the primary and the alternate routing tables. Hence extra routing table storage is needed for this option.
14.7.2 Restricted Option - Variant
This is similar to the above scheme except that the primary routing table is replaced by the alternate routing table. The routing table updates have to be done carefully by the firmware agent (initiating core), since the primary routing tables on the affected partitions are torn down as the update progresses.
Since the alternate routing tables are not yet activated, the firmware configuring agent may find that it cannot establish a route to a source agent (combined use of the primary and alternate route tables could lead to interconnect deadlocks, especially if the primary routing tables use both VN0 and VN1). Hence, the firmware has to establish a linear order among all the agents in the affected partition(s). It then needs to update the routing tables in this order, beginning with the farthest and ending with the nearest (one has to ensure that the CSI completion messages sent by a quiesced CSI agent do not follow a route where the primary routing table cannot be used). The advantage of needing a routing table which is half the size of what is needed in the previous scheme is offset by this complexity and, of course, the increased quiescing time. This variant is illustrated in Figure 14-8.
Figure 14-8. Multi-Partition Management: Restricted Option, Variant. [Figure: timeline from the Alternate Routing perspective. OL* event request; FW sends a message requesting each “source” CSI agent in the affected partition(s) to quiesce using a mechanism such as CSI_QUIESCE; each agent goes into suspended animation (generates no transactions) while all route-throughs complete and reach their destination using the original RT; affected partition(s) are quiesced at the end of this period; “Alternate” RT loading by FW (for the altered topology) begins only after the affected partition(s) are quiesced, and ends; FW does config write to all CSI agents indicating they use the Alternate RT; OL* event request granted; FW dequiesces the affected partition(s) using a mechanism such as CSI_DEQUIESCE; agents come out of suspended animation and start using the Alternate RT. Legend: OL*: On-line Addition/Deletion; RT: Route Table; FW: Firmware.]
14.7.3 Flexible Option
Unlike the Restricted Option, the basic idea behind this scheme is not to quiesce the system or the affected partition(s) at all, but to switch over seamlessly from the primary routing table to the alternate, so that eventually all agents belonging to the affected partition(s) use the alternate routing tables. A generic description of this scheme is illustrated in Figure 14-9 and by the following steps: • After the OL_* event is requested, the firmware detects all the “source” CSI agents in the affected partition(s). The firmware either discovers the affected partition(s) or has access to the list of affected partition(s) arising from the OL_* event. • The firmware then begins loading the alternate routing tables for each CSI agent in the affected partition(s) (this would also include, e.g., the routing tables at each CSI link of an intermediate router) through a series of CSI writes to protected firmware space (using NcWrPtl transactions).
Note that during this time period, the affected partition(s) continue to function as before, using the primary routing tables. • The firmware then informs each “source” agent to start using the alternate routing table (through the setting of a control and status register addressed in each agent’s configuration space using an NcWrPtl transaction). • After the last source agent has been informed, the firmware waits for a time period lower-bounded by the longest CSI transaction lifetime for that platform before it sets a flag indicating that the OL_* event is complete from a routing perspective. Note: The bound on the longest CSI transaction lifetime can be quite loose, since it is only used to determine when a new OL_* request can be granted.
Figure 14-9. Multi-Partition Management: Flexible Option. [Figure: timeline from the Alternate Routing perspective. OL* event request; Alternate RT loading by FW (for the altered topology) begins and ends; FW informs all “source” agents to begin using the Alternate RT; first “source” agent stops using the regular RT and starts using the Alternate RT; after a wait of at least the maximum CSI transaction lifetime (#), all “source” agents use the Alternate RT for all requests (the original RT is no longer used) and the OL* event request is granted; during the intervening period, both routing tables get used, i.e., alternate paths are potentially used from source to destination. Legend: OL*: On-line Addition/Deletion; RT: Route Table; FW: Firmware. (#) This bound on the maximum lifetime can be quite “loose” (hundreds of milliseconds), since the lifetime only signifies that the OL* event’s request can be granted, which is not a time-critical event.]
14.7.3.1 Handling Interconnect Deadlocks with the Flexible Option
Note from Figure 14-9 that there exists a time period over which it is possible that two routing paths exist between a source and destination, one of which uses the primary routing table and the other uses the alternate. This could give rise to interconnect deadlocks. A solution to this deadlock problem exists if the underlying topologies (before and after the OL_* event) each use only one of the two deadlock-free virtual networks: • The primary routing table uses one of VN0 or VN1 (VN0, say) and the alternate routing table uses the other deadlock-free virtual network (VN1, say). There are no restrictions on the use of VNA except as those outlined (see Section 5.2, “Routing Rules” on page 5-209). • Further, intermediate switching from a primary path to an alternate path (or vice versa) is not permitted, i.e., if a CSI packet has started its route on VN0, then it continues on VN0 or VNA until it reaches its destination.
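A minimal sketch of the Flexible Option switch-over in C follows, assuming hypothetical helpers: load_alternate_rt() writes the alternate tables via NcWrPtl, switch_to_alternate_rt() sets the per-agent control/status register that flips the agent onto the alternate table, and sleep_ns() waits; no quiescing is performed.

    /* Sketch of the Flexible Option flow (Section 14.7.3). */
    #include <stddef.h>
    #include <stdbool.h>

    struct csi_agent { int id; };

    static void load_alternate_rt(struct csi_agent *a)      { (void)a; }
    static void switch_to_alternate_rt(struct csi_agent *a) { (void)a; }
    static void sleep_ns(unsigned long long ns)             { (void)ns; }

    static bool ol_event_routing_complete;  /* gates the next OL_* request */

    void flexible_option_ol_event(struct csi_agent *sources, size_t n,
                                  unsigned long long max_txn_lifetime_ns)
    {
        /* Load alternate tables for every "source" agent; the affected
         * partition(s) keep running on the primary tables meanwhile. */
        for (size_t i = 0; i < n; i++)
            load_alternate_rt(&sources[i]);

        /* Flip each source agent onto the alternate table. */
        for (size_t i = 0; i < n; i++)
            switch_to_alternate_rt(&sources[i]);

        /* Wait at least the longest CSI transaction lifetime so that every
         * packet routed with the old table has drained; the bound may be
         * loose, since it only gates the next OL_* request grant. */
        sleep_ns(max_txn_lifetime_ns);
        ol_event_routing_complete = true;
    }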
The implementation has to ensure that the primary routing table corresponding to the primary routes is usable (for reading) while the alternate routing table is being simultaneously modified. The main advantage of this option is that it does not require that the affected partition(s) be quiesced even though they share the interconnect. Two disadvantages of this option are that the use of a single deadlock-free virtual network a) restricts the topologies that are permissible (e.g., ring topologies would not be permitted with this option, since they require both VN0 and VN1 for deadlock-free routing) and b) puts constraints on adaptive routing networks, which permit multiple paths between a source and destination. From a CSI perspective, all the options mentioned above are permitted.
14.8 Support for Sub-Socket Partitioning
Sub-socket partitioning refers to the ability for resources within a physical component to be assigned to more than one partition. Thus, sharing processor resources on a processor component, I/O resources on an I/O Hub, or memory and related resources on a memory component among multiple partitions is sub-socket partitioning. By convention, sharing only the router element on a component among partitions is not considered sub-socket partitioning, though such sharing falls under the umbrella of firm partitioning. Sub-socket partitioning implies firm partitioning, but not vice versa. CSI supports sub-socket partitioning as an optional feature, i.e., not all CSI components need support this feature. Support for sub-socket partitioning could take two forms.
14.8.1 Sub-Socket Partitioning via Node Ids
Sub-socket partitioning is supported as an extension of hard physical partitioning - each set of resources within a component which could belong to a partition is identified by a unique set of one or more node ids. Thus, for example, if a processor socket containing 4 cores supports sub-socket partitioning at core granularity, then each core and associated resources are assigned a unique set of node ids (a minimal sketch follows the list below).
14.8.1.1 Agent Responsibilities
• Coherence and other domains are identified using node ids as in the case of hard physical partitioning (see Section 14.3.3). • It is up to the implementation to partition or share additional resources such as the number of entries in the source address decoder and other interface structures such as the system miss address file. A similar comment applies to the home agent architected state repository such as the tracker. • The routing tables need to comprehend that multiple node ids can be targeted to the same component. There are no other changes to the Routing layer. Additional implementation-dependent routing on the component then routes the CSI packets to their appropriate destinations. • A memory controller could belong to multiple partitions (identified by unique node ids). It is the responsibility of the target address decoder to translate the (node id, system address) pair into a unique physical address (note that the system address is not unique across different partitions). • A component which can support up to n partitions on the socket can support fewer partitions - the node ids assigned to the fewer partitions are chosen in an implementation-dependent manner (to facilitate internal routing, for example).
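The sketch below illustrates node-id-based sub-socket partitioning, assuming a hypothetical 4-core socket split into two partitions at core granularity; the node ids and the lookup helper are illustrative only.

    /* Sketch of sub-socket partitioning via node ids (Section 14.8.1). */
    #include <stdio.h>

    #define CORES_PER_SOCKET 4

    /* Each core (and its associated resources) gets a unique node id; the
     * owning partition is recoverable from the node id alone, so routing
     * needs no changes beyond targeting multiple node ids per component. */
    static const int core_node_id[CORES_PER_SOCKET]   = { 8, 9, 10, 11 };
    static const int core_partition[CORES_PER_SOCKET] = { 0, 0, 1,  1 };

    int partition_of_node(int node_id)
    {
        for (int c = 0; c < CORES_PER_SOCKET; c++)
            if (core_node_id[c] == node_id)
                return core_partition[c];
        return -1;  /* node id not on this socket */
    }

    int main(void)
    {
        printf("node 10 belongs to partition %d\n", partition_of_node(10));
        return 0;
    }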
14.8.2 Sub-Socket Partitioning via Firm Partition ID
Another option to achieve sub-socket partitioning is to specify a firm partition id (FPId) in the CSI packet headers. In the standard header, up to fs (= 4, currently) uppermost address bits are reassigned to specify the firm partition id. With extended headers, up to p (= 4, currently) explicit bits and up to fe (= 4, currently) uppermost address bits are reassigned for the firm partition id. (fe is the least significant portion of the firm partition id; fe >= fs, to facilitate use of extended headers as FPId in the SMP profile.) The number of bits that are actually used is configured by the firmware based on the platform requirements and the FPId bits available for each CSI component in the platform. (See the end of the chapter for open issues.)
14.8.2.1 Agent Responsibilities
Every CSI agent in the system must appropriately handle the firm partition id in the CSI address field regardless of whether that agent uses the firm partition id to implement subsocket partitioning. Specifically, the following rules must be obeyed: • A caching agent (including IOHs) must tag the appropriate firm partition id when it generates a request or a snoop towards agents that support subsocket partitioning. In this way, agents that rely on the firm partition id can determine the subsocket partition that this request or snoop refers to. Thus each caching agent must know the firm partition(s) it belongs to. • Caching agents that rely on firm partition ids for subsocket partitioning must include the firm partition id before they snoop any caches that contain firm partition ids. This applies to caches shared across subsocket partitions as well as any private caches that contain the full system address with the firm partition id. • Snoops to the same firm partition id on a socket but belonging to distinct node ids should be separately targeted (by another caching agent or home agent). This is because CSI routing is based on node ids only. • An I/O agent must remove the firm partition id from the system address before it forwards a request to the I/O subsystem. • CSI agents that implement subsocket partitioning through firm partition ids should support a different destination node per firm partition id for a given physical address. This ensures interoperability between components that implement firm partitions via firm partition ids and components that implement firm partitions through multiple node ids. • Since response messages do not contain firm partition ids, CSI components that implement subsocket partitioning through firm partition ids should be able to route responses to the appropriate subsocket partition within the die based on the unique transaction id.
14.9 Memory RAS
CSI-based platforms support a number of memory RAS features. In this specification, only those features which are “visible” to CSI and are supported by CSI are described. Features such as DIMM sparing and memory scrubbing, which are not visible to CSI, are not described; the reader is referred to the particular component specification for details.
14.9.1 Memory Migration
Memory migration moves the memory contents of one memory agent to another, and reconfigures the system to use the copied contents, either because a memory node is failing or because of some other reconfiguration action that will cause that original memory node to become unavailable.
14.9.1.1 Memory Migration Assumptions
Memory migration is performed at memory agent granularity, so that the entire contents of memory on one (primary) agent is copied to the memory on another (secondary) agent. • The secondary agent is dedicated to this function; no other request traffic should be targeting the secondary node except those initiated by the migration. • The secondary agent must have at least as much memory as the primary.
14.9.1.2 Memory Migration Hardware Support
In CSI, the copy operation is performed in firmware, with hardware support to keep the copies coherent. There are several topology and configuration restrictions imposed during the migration flows: • No agent can be under directory control (if a directory is being used prior to the memory migration operation, then the directory must be disabled after taking the appropriate quiescing actions; after the memory migration is complete, the directory can be enabled after appropriate actions such as flushing of CSI agents' caches are taken). • Writes to the migration destination agent cannot share a virtual network with any other non-preallocated traffic on the NCS and NCB channels. The latter restriction may require significant reconfiguration before migration is permitted, or restrict migration only to directly connected sockets. • The HW support provided by the memory agent is: — Wt-Wt mirroring: A write request directed to the primary memory agent will result in the primary memory agent issuing an NcWr request for the same physical address and same data to the secondary memory agent. The request resources are kept in use until the secondary write request has returned a completion, at which point the primary agent returns a completion to the original requestor. If it becomes necessary to release data buffers before issuing the write to the secondary agent, the data must be re-read from memory. Handling of read errors that occur during Wt-Wt mirroring is TBD. — Rd-Wt mirroring: A read request directed to the primary memory agent will result in the primary memory agent issuing an NcWr request for the same physical address using the read data to the secondary memory agent. The request resources are kept in use until the secondary write request has returned a completion, at which point the primary agent returns the data and a completion to the original requestor. Handling of read errors that occur during Rd-Wt mirroring is TBD.
Figure 14-10. Mirroring Support for Migration: Wt-Wt and Rd-Wt Mirroring. [Figure: two message-sequence diagrams between Source, Router/Coherence Ctl, and Target. Wt-Wt: 1) source write request to target; 2) mirror request and mirror Wt to the secondary; 3) mirror write completion; 4) write completion to source. Rd-Wt: 1) source read-write mirror request to target; 2) Rd data returned; 3) mirror Wt, or mirror Rd on error, to the secondary; 4) mirror write completion, or mirror read completion; 5) read completion to source.]
14.9.1.3 Memory Migration Hardware Support
It is the responsibility of the firmware to create a physical address map of all the memory on the node in order to perform the copy operation.
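The flow below includes a cacheline flush/load copy loop over the PA map. A minimal C sketch of that loop follows, assuming hypothetical cacheline_flush()/cacheline_load() primitives, a 64-byte coherence granule, and a kick_watchdog() helper; all names are illustrative.

    /* Sketch of the firmware copy loop used in the migration flow below. */
    #include <stdint.h>

    #define CACHELINE 64

    static void cacheline_flush(uintptr_t pa) { (void)pa; } /* flush one line */
    static void cacheline_load(uintptr_t pa)  { (void)pa; } /* read one line  */
    static void kick_watchdog(void)           { }           /* avoid OS time-out */

    struct pa_range { uintptr_t base; uintptr_t len; };

    void migrate_copy(const struct pa_range *map, int nranges)
    {
        for (int r = 0; r < nranges; r++) {
            for (uintptr_t off = 0; off < map[r].len; off += CACHELINE) {
                /* Either a read or a write reaches every DIMM address; with
                 * Rd->Wt/Wt->Wt mirroring on, the hardware copies the line
                 * to the mirror atomically, so no conflicting access can
                 * slip between the read and the write. */
                cacheline_flush(map[r].base + off);
                cacheline_load(map[r].base + off);
            }
            kick_watchdog();  /* yield periodically to avoid OS time-outs */
        }
    }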
The migration flow is: • Configure Secondary Routing/Memory agent: — Configure secondary target memory decoder identically to primary agent target decoder. — Configure response routing from secondary agent back to primary agent if the route doesn’t currently exist. — This would be a new route, with no side effects. • Configure Primary Routing/Memory agent: — Configure route tables from primary memory agent to secondary memory agent. — No other source should target this node! — This would be a new route, with no side effects. — Set the nodeID of the secondary memory agent into the mirror_target CSR of the primary memory agent (using CSR writes). — Turn on Rd->Wt, Wt->Wt mirroring mode in the primary memory agent (through CSR writes). • Perform copy: — Create DIMM address->PA map (performed by firmware). — Execute a cacheline_flush/Load loop for every line in the PA map. — May need to do this periodically to avoid OS time-out. — Guarantees either a read or a write to every DIMM address. — Either one causes a copy to the mirror. — Rd->Wt mirroring makes the copy operation atomic. — No conflicting accesses can slip between read and write. — Any updates to the primary memory agent invoke the Wt->Wt mirror. — The effect is to keep mirrored memory up to date. • Configure system route table entries: — Configure response routing from secondary target decoder back to primary agent if the route doesn’t currently exist. — This would be a new route, with no side effects. — Configure route table entries from all possible requesting agents to the secondary memory agent if they don’t exist. — This would be a new route, with no side effects. • Quiesce DMA and processor traffic to the affected node: — Potential OS/device timeout issues. • Turn off mirroring mode in the primary memory agent. • Change home nodes: — Reconfigure source address decoders in every node, substituting the secondary memory agent nodeID for the primary agent nodeID in every entry that is a memory type entry. — Change the snoop list in every node, substituting the secondary nodeID for the primary nodeID. • Unquiesce and continue.
14.9.2 Memory Mirroring
Memory mirroring is a method of keeping a duplicate copy of the contents of memory, and starting to use it if the primary copy fails.
14.9.2.1 Memory Mirroring Assumptions
CSI imposes the following requirements on mirroring: • Mirroring is performed at memory agent granularity, so that the entire contents of memory on one (primary) agent is kept consistent with the contents of memory on another (secondary) agent. • The secondary agent is dedicated to this function; no other request traffic should be targeting the secondary node except those initiated by the primary. • The secondary agent has at least as much memory as the primary. • The primary agent needs to dedicate resources for WbData's to drain out to prevent deadlock. This entry can never be used to send any CSI message, or to wait for any CSI message.
• Only one mirroring primary home memory agent per socket. • No agent can be under directory control (if a directory is being used prior to the memory mirroring operation, then the directory must be disabled after taking the appropriate quiescing actions; after the memory mirroring is complete, the directory can be enabled after appropriate actions such as flushing of CSI agents' caches are taken). • Writes to the mirror destination agent cannot share a link and virtual network with any other non-preallocated traffic on the NCS and NCB channels. The latter restriction may require significant reconfiguration before mirroring is permitted, or restrict mirroring only to directly connected sockets.
14.9.2.2 Memory Mirroring Hardware Support
In order to facilitate mirroring, the coherency controller implements three different kinds of mirror operation: • Wt-Wt mirroring: A write request directed to the primary memory agent will result in the primary memory agent issuing an NcWr request for the same physical address and same data to the secondary memory agent. The request resources are kept in use until the secondary write request has returned a completion, at which point the primary agent returns a completion to the original requestor. If it becomes necessary to release data buffers before issuing the write to the secondary agent, the data must be re-read from memory. Handling of read errors that occur during Wt-Wt mirroring is TBD. • Rd-Wt mirroring: A read request directed to the primary memory agent will result in the primary memory agent issuing an NcWr request for the same physical address using the read data to the secondary memory agent. The request resources are kept in use until the secondary write request has returned a completion, at which point the primary agent returns the data and a completion to the original requestor. Handling of read errors that occur during Rd-Wt mirroring is TBD. • Rd-Rd mirroring: A read request targeting the primary is retargeted to the secondary (forwarded) as an NcRd request. The read data from the secondary agent is returned to the primary, which in turn forwards it to the requestor. Rd-Rd mirroring could be configured to forward on every read (fwd_all) once an error is found, or just on reads that are found uncached when an error (fwd_uc_err) is found.
14.9.2.3 Memory Mirroring Flow
There are four steps for setting up and switching to a mirrored copy of memory: 1. Mirroring: Initializing • Configure secondary target memory decoder identically to primary agent target decoder. • Configure response routing from secondary agent back to primary agent if the route doesn’t currently exist: — This would be a new route, with no side effects. • Configure Primary Routing/Memory agent: — Configure route tables from primary memory agent to secondary memory agent: — No other source should target this node! — This would be a new route, with no side effects. • Configure Secondary Memory Agent snoop list to snoop memory agents snooped by the primary memory agent (typically IOHs when in directory mode): — Only needed if the forwarded read request is coherent. • Perform initial copy: — Create DIMM address->PA map (performed by firmware). — Execute a cacheline_flush/Load loop for every line in the PA map: — May need to do this periodically to avoid OS time-out. — Guarantees either a read or a write to every DIMM address. — Either one causes a copy to the mirror.
— Rd->Wt mirroring makes the copy operation atomic: — No conflicting accesses can slip between read and write. — Any updates to the primary memory agent invoke the Wt->Wt mirror: — The effect is to keep mirrored memory up to date. • Turn off Rd->Wt mirroring, enable Rd->Rd fwd_UC_Err mode in the primary memory agent: — Mirror now has a copy of Primary. 2. Running: • Mirror now has a copy of Primary. • Any write to the primary will mirror to the secondary agent, keeping it up to date: — Mirror writes target the same physical address on the mirror node. 3. Error Handling • When an uncorrectable read error is detected by the primary memory agent: — Wait for all snoop responses. — If the requested data is not found in another cache (e.g., all responses are inval), then: — If the forwarding request is a coherent message, snoop agents controlled by the directory, and wait for their snoop responses as well. — Forward the read request to the secondary agent, and wait for the data response. — The secondary agent will snoop IOHs if required. — The primary agent forwards the data response to the original requestor. — If the requested data is found in another cache, then: — Data already forwarded; send a completion signal back to the original requestor. — The primary agent signals the local socket that a reconfiguration is required: — Actions to be taken when a directory update is required are TBD. • When a write error is detected: — The memory controller can optionally retry the write, if possible. — If no retry is possible, or errors recur: — The primary agent signals the local socket that a reconfiguration is required. — The primary agent goes into Rd->Rd forward_all mode. • All reads forward, since data in DRAM could now be stale and so can't be trusted, even if CRC passes. 4. Reconfiguration: • Reconfiguring is an abbreviated memory migration flow: — All memory already migrated. • Configure route table entries from all possible requesting agents to the secondary agent if they don’t already exist: — New routes, no side effects. • Configure response routing from the secondary agent back to all possible requesting agents if the routes don’t exist: — New routes, no side effects. • Set the snoop list on the secondary caching agent identically to the primary caching agent snoop list. • Quiesce DMA and processor traffic to the affected node: — Potential OS/device timeout issues. • Turn off mirroring mode in the primary agent: — Reconfigure source address decoders in every node, substituting the secondary memory agent nodeID for the primary agent nodeID in every entry that is a memory type entry. — Change the snoop list in every node, substituting the secondary nodeID for the primary nodeID. • Unquiesce and continue.
14.10 Hardware Requirements, Etc.
1. Interrupt Generation on Link events: Any CSI link can be configured so that when it completes a successful link initialization, it will trigger an interrupt to a local or remote core (some implementations may limit interrupts to be sent to the local core only). The resources needed to generate the interrupt are: a. Enable/Disable flag b. Destination processor ID, Extended ID (EID) c. Interrupt delivery type (SMI, PMI, INIT, MCA, vectored interrupt, etc.) d. Interrupt vector #, where applicable. If this vector is associated with PMI, such a vector shall be one of the vectors reserved for the SAL firmware layer. Figure 14-11, “PMI/SMI Generation Sequence During OL_A Events” on page 14-432, shows the sequence of events for generation of the PMI/SMI interrupt during OL_A events.
Figure 14-11. PMI/SMI Generation Sequence During OL_A Events. [Figure: power-on resets drive Physical Layer initialization; link-up, Protocol Layer parameter exchange, or other link actions signal the config agent on the RS from the link controller; the config agent delivers a PMI interrupt to a core on the RS.]
2. A halt state that is not woken up by MCA, INIT, or PMI events; only a Reset event or an implementation-specific CSR write by the SSP or another CSI agent can serve as the wake-up. 3. A CSR containing the NodeID to receive notification of MCA/PMI/SMI for errors on active/quiesced Memory and I/O agents. An error pin is not a good solution for recoverable errors reported by these CSI agents.
14.11 Implementation Notes
1. An OS-based management application can derive routing tables suitable for the new configuration, or provide canned tables from disk. These may be provided to the firmware layer through an ACPI OpRegion area. For extra protection, the tables can be signed with a private key, and firmware can verify them using a public key embedded within the firmware. 2. Impact of removal of PCI Root bridges from a firmware and OS standpoint: these are orthogonal to CSI.
14.12 Open Issues/Notes
1. The section on OL_R (link reconfiguration - topology change) without changes to processor, memory, or I/O resources is TBD. 2. (TBD) Sub-socket partitioning: there are several open issues, especially with the firm partition id approach, since it is a late add-on. a. Need to fix fs for the standard header, and p and fe for the extended header (currently only a hint that fs = 4, p = 4, and fe = 4). b. What are the implications of sub-socket partitioning (with FPId) on i) memory RAS flows, ii) interrupt generation and direction, iii) fault handling (needs to be captured in that chapter), and iv) global shared memory? c. With a shared SMAF at the caching agent and a tracker at the home, possible starvation of a firm partition if another firm partition floods requests. Any lurking deadlock scenarios? d. The implications on source and target address decoders (e.g., are some entries of the source address decoder shared across sub-socket partitions, and if yes, which ones?) need to be worked out for compatibility. 3. CPU migration: a solution, not yet reviewed, has been proposed - if it works we will include it in Rev. 0.8; need to close with Planning on the requirement for migration. 4. In Rev. 0.8, it would be nice to clarify those RAS operations that can proceed in parallel and those that are not permitted to (e.g., OL_*/memory mirroring/memory migration, as long as they involve different sets of CSI agents). 5. The quiescence flows show more sequentiality than necessary, since they are laid out as steps instead of a flowchart. In Rev. 0.8, it is worthwhile to make those flows more amenable to showing the maximum concurrency. 6. The memory mirroring and migration flows apply only to the SMP profile, but the conditional text indicates both SMP and LMP profiles. If these flows are enabled only for SMP, then some rewording throughout the chapter is needed.
14.13 List of Acronyms Used
ACPI   Advanced Configuration and Power Interface Specification
BIOS   Basic Input/Output System
BIST   Built In Self Test
BMC    Baseboard Management Controller
BSP    Bootstrap Processor
FW     Firmware
IPI    Inter-processor Interrupt
NBSP   Node BSP
OL_A, OL_D, OL_*   On Line Addition, On Line Deletion, On Line Addition or Deletion
PAL    Platform Abstraction Layer
PMI    Platform Management Interrupt
QP     Quiesced Processor or Quiescing Processor (depending on context)
RS     Running System
SAL    System Abstraction Layer
SCI    System Control Interrupt
SMA    On-die (“Simple”) Memory Agent
SMI    System Management Interrupt
SSP    System Service Processor
XBar   On-die Router (Crossbar)
15 Power Management
This chapter describes the power management mechanisms supported within CSI. There are two distinct categories of power management in CSI. The first, link power management, involves those mechanisms required to effectively manage the CSI interface power. The second, platform power management, is concerned with the communications required to handle coordinating the use of platform-level power states in a fair manner.
15.1 Link Power Management
Link power management involves managing the CSI Physical Layer in order to reduce the power consumption by the CSI interface. This covers how the interconnect can be reconfigured to reduce operating power, as well as the ways in which power is saved when the interconnect is idle.
15.1.1 Link Power States
The following link states are relevant to link power management. The entry and exit of these states is described in detail below and in Section 3.9.5, “Link Low Power Modes” on page 3-86.
Table 15-1. Link State Overview
                                L0            L0s                    L1              L2
  Forwarded Clocks              On            On                     Off             Off
  Reference Clock               On            On                     Optionally Off  Off
  Link Initialization Required  No            No                     No              Yes
  Estimated Wake Time           NA            Variable (5ns-200ns)   5us-15us        msecs
  Comments                      Active state  Active power mgmt      Link suspend    Power removed
                                              low power state
• L0 State - Active state — This is the power-on, active working state for the interconnect, and is therefore required to be supported by all designs. • L0s State - Configurable low power state used for active power management — L0s support is optional. Usage of this state will be determined, and optionally enabled, following link initialization by BIOS. — All CSI power supplies, reference clocks, forwarded clocks, and PLLs must remain on during the L0s state. — This state provides a flexible low power state where the time allowed for waking to L0 can be optionally adjusted on-the-fly. This allows the system to adjust its tolerance to wake latency depending on performance demands. The intention is to allow lower power L0s states when longer wake latency can be tolerated, which would in turn allow more of the Physical Layer circuits to be powered down, resulting in a lower power L0s state. With short wake tolerances, only minimal circuit shutdown might be possible in order to meet the wake time, while with a longer permitted wake to L0, other circuits taking longer to wake up could also be shut off during L0s. — L0s can be entered independently by the transmitter at each endpoint. This means that for a given endpoint, its receiver could be in an L0s state and its transmitter in the L0 state, or vice versa.
• L1 State - A low power state with longer latency and lower power than L0s — L1 support is optional. Usage of this state will be determined, and optionally enabled, following link initialization by BIOS. — As an example, in systems where it is supported, L1 may be used as the state of a processor-to-chipset link while the processor is in the C3 or C4 state. — All CSI power supplies must remain on during the L1 state. Forwarded clocks are disabled in the L1 state. PLLs may also be disabled. If PLLs are disabled, the reference clock generator may also be disabled for additional power savings. — L1 offers a lower power state than L0s, but requires additional latency to wake to L0. In L1, one key difference is that the forwarded clocks are disabled, whereas in L0s they must remain enabled. Because of this, the component PLLs, DLLs, and potentially reference clocks may also be disabled during this state. — Unlike L0s, which allows independence for the two directions of the link, L1 is a link-level state which requires the lanes in both directions to enter the L1 state. • L2 State - Zero power state with all power removed — The system has removed power from the CSI-related circuitry. — When L2 is supported, this could be, for example, the state of a processor-to-chipset link while the platform is in the S3, S4, or S5 system suspend state. — A full link reset is required to resume upon exiting this state, as all context is lost due to the power removal.
15.1.2 L0s Link State
L0s is the low power state used for active power management. It will be used to reduce power consumption during periods when a transmitter has nothing to send, but a relatively low latency is needed to bring the link back into a normal operating L0 state. To provide flexibility to adapt to the different workloads that can be experienced during normal runtime, an optional mechanism is provided to change the wake time for L0s. This change would be performed by a link endpoint in response to an internal condition change like, for instance, an ACPI power state change or a detected change in I/O traffic patterns. The intention of this configurable mechanism is to allow a link to power manage itself to the greatest extent possible while still meeting the latency goals of the system at any given time. Once a given latency tolerance is determined by a mechanism such as the system power state and other operating conditions, a wake time can be assigned to the link indicating how much time is allowed for waking from the L0s state. Once this is configured, a port can power down all circuits on the transmitters and receivers that can be guaranteed to recover in the required amount of time once a break event occurs. This can allow a relatively fast-waking L0s state that achieves minimal power savings when latency is critical, or a slower-waking L0s state that achieves very good power reductions to be used when power requirements dominate.
15.1.2.1 L0s Configuration
To execute a change in the allowed wake time, a PM.LinkL0sConfig packet is issued by an endpoint. The information contained in this packet updates only the port defined by the transmitter issuing the update and the receiver detecting this packet. No explicit change is implied for the port in the other direction on the link. Wake latency can be specified by a transmitter to determine the amount of time allowed for a wake from L0s to L0 for the corresponding receiver.
This indicates to the receiver that the transmitter will guarantee at least that amount of time from the point where electrical idle is exited until the first valid (non-Idle) packet is sent. The receiver can use this information in L0s to power down those circuits which would still allow the receiver to meet this wake-up time. Note that when specifying this wake time, a transmitter must account for its own wake time when configuring the receiver to meet system latency goals. Therefore a receiver must be configured to guarantee a wake time equal to the latency the system would allow, minus the latency that the transmitter reserves for waking its own circuits before exiting electrical idle. The wake time is an integer value specified in units of 16 Unit Intervals (which is the maximum flit rate). The value is described with a 12-bit field. Once a break event occurs, as signalled by a break from electrical idle, the receiver can be guaranteed that no non-Idle packets will be transmitted within the specified number (the value of the 12-bit wake time field multiplied by 16) of Unit Intervals. Note that the logic on each endpoint is expected to know the link frequency and to be able to translate Unit Intervals into real time.
15.1.2.1.1 Simplified L0s Implementation
For many designs, a simple L0s state with a single fixed latency may be preferable to the flexible implementation described above. These designs may need a relatively low latency power management state but are unable to justify the design and validation complexity of the flexible mechanism. CSI provides a simple way of reducing this complexity. As described in the Physical Layer chapter, Section 3.9.3.7, “L0 State” on page 3-83, the TL0S_WAKE parameter can be assigned for each port on each endpoint during initialization and can optionally be updated via a PM.LinkL0sConfig packet during runtime. Implementations not wanting to use a reconfigurable L0s policy would not allow updates to be done via the PM.LinkL0sConfig mechanism. After link initialization, BIOS will configure the TL0S_WAKE parameter (along with the other required timing assignments) without enabling the use of the reconfigurable L0s mechanism. When this mechanism is disabled for a port, the transmitter for that port should never send the PM.LinkL0sConfig packet.
15.1.2.2 Entering into the L0s State
Entry into the L0s state is managed separately for each direction of the link. It is the responsibility of each endpoint to initiate an entry into the L0s state on its transmitting lanes when needed. There is no specified policy for invoking the L0s state; it is left to the implementation to determine the appropriate amount of idle time to wait before deciding to enter the L0s state after the last non-Idle packet has been sent. Once a decision to enter L0s has been made, a transmitter sends the PM.LinkEnterL0s packet followed by a single Idle packet (to allow for the rolling CRC to complete). Following this, the transmitter disables its transmit lanes and transitions from the L0 state to the L0s state. This exact sequence and timing for entry into L0s is described in greater detail in the Physical Layer chapter, Section 3.9.5.1, “L0s Entry Sequence” on page 3-86.
15.1.2.3 Exiting from the L0s State
An endpoint with its transmitters in the L0s state must initiate an exit to L0 before being able to send packets across the link.
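The exit timing is governed by the configured wake time. A worked conversion of the 12-bit field described in Section 15.1.2.1 follows; the 6.4 GT/s rate used in the example is an assumed figure for illustration only.

    /* Convert the 12-bit L0s wake-time field to real time: the field counts
     * chunks of 16 Unit Intervals, so quiet time = field * 16 UI. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t l0s_wake_ps(uint16_t field12, uint64_t ui_ps)
    {
        return (uint64_t)(field12 & 0xFFF) * 16u * ui_ps;
    }

    int main(void)
    {
        uint64_t ui_ps = 156;   /* ~156 ps per UI at an assumed 6.4 GT/s */
        uint16_t field = 40;    /* example PM.LinkL0sConfig value        */
        printf("wake time = %llu ps (~%.1f ns)\n",
               (unsigned long long)l0s_wake_ps(field, ui_ps),
               l0s_wake_ps(field, ui_ps) / 1000.0);
        return 0;
    }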
Once a decision to wake to L0 has been made, a Break from Electrical Idle is issued, and the transmitter subsequently drives Idle packets. The transmitter must not issue any non-Idle packets until the configured time (TL0s_WAKE) has elapsed after the break from electrical idle. This ensures that the receiver is able to function properly. This is covered in more detail in the Physical Layer chapter, Section 3.9.5.2, “L0s Exit Sequence” on page 3-89.
15.1.3 L1 Link State
The L1 state is the lowest power state available to the link without removing its power supplies. It offers deeper power savings over the L0s state by removing the requirement that clocks be maintained. During the L1 state, all lane transmitters and receivers, including forwarded clocks, are disabled; PLLs and CSI reference clocks may also be disabled. This requires a longer wake time to L0 in order to allow the clocking to recover. L1 also acts as the intermediate state when a transition to L2 is desired. The link is placed in L1 prior to removing power to the link. Once power is removed, the link will be in L2. Until power is removed, the link is governed by standard L1 behavior, including break events.
15.1.3.1 Entering into the L1 State
L1 is entered as a global link state; both directions of the link must enter L1 together, unlike L0s. L1 may be initiated by either endpoint. This is usually in response to a platform-level power management event indicating that the system can tolerate the link being unavailable for long periods of time. An example would be the usage of L1 during the C3 or C4 state. Once a decision to enter L1 has been made, the endpoint wishing to initiate L1 entry completes any pending outbound transactions, sends any acknowledgements it has pending, and begins continuously transmitting PM.LinkEnterL1 packets. If the endpoint receiving the request can accept a transition into L1, then for each PM.LinkEnterL1 packet it receives it will respond with a PM.LinkReqAck to indicate that it is willing to accept this policy, after having first flushed any of its own pending outbound transactions and acknowledgements. When the endpoint that is initiating the transition to L1 detects the PM.LinkReqAck from the other endpoint, it will stop transmitting L1 requests and disable its transmit data and forwarded clock lanes and place them in electrical idle. When the disabled clocks are detected by the other endpoint, that endpoint transitions its own transmit data and forwarded clock lanes into Electrical Idle. Once both endpoints have their data and clock transmitters in Electrical Idle, the L1 state has been effectively reached and the PLLs and reference clocks may be shut down. If for any reason the endpoint receiving the request to enter L1 could not allow a transition to L1 (including awaiting a response for a recently sent request), it would instead send a PM.LinkReqNack. When the endpoint attempting to initiate the transition to L1 detects this, it stops sending requests. Further aspects of the L1 state entry are discussed in the Physical Layer chapter, Section 3.9.5.7, “L1 Low Power State” on page 3-95.
15.1.3.2 Exiting from the L1 State
Either endpoint can wake the link from L1 to L0 when it determines the link is needed again (due to a local policy change, local event, or notification through sideband from some other system agent).
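A minimal sketch of the L1 entry handshake just described follows; the PM packet names come from the text, while the send/receive and lane-control helpers are hypothetical stand-ins for the implementation's own primitives.

    /* Sketch of the L1 entry handshake (Section 15.1.3.1). */
    #include <stdbool.h>

    enum pm_pkt { PM_LINK_ENTER_L1, PM_LINK_REQ_ACK, PM_LINK_REQ_NACK };

    static void flush_outbound_and_acks(void)  { }
    static void send_pkt(enum pm_pkt p)        { (void)p; }
    static enum pm_pkt recv_pkt(void)          { return PM_LINK_REQ_ACK; } /* stub */
    static void disable_tx_and_fwd_clock(void) { }  /* lanes to Electrical Idle */

    /* Returns true if the link reached L1. */
    bool try_enter_l1(void)
    {
        flush_outbound_and_acks();       /* complete pending transactions */
        for (;;) {
            send_pkt(PM_LINK_ENTER_L1);  /* transmit continuously */
            enum pm_pkt r = recv_pkt();
            if (r == PM_LINK_REQ_ACK) {
                /* Peer accepted: idle our data and forwarded clock lanes;
                 * the peer follows when it sees our clocks stop, after
                 * which PLLs and reference clocks may be shut down. */
                disable_tx_and_fwd_clock();
                return true;
            }
            if (r == PM_LINK_REQ_NACK)
                return false;            /* peer busy: stop sending requests */
        }
    }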
Once a decision to exit L1 has been made, a transmitter enables its transmit lanes and transitions from the L1 state to the L0 state as described in the Physical Layer chapter, Section 3.9.5.7, “L1 Low Power State” on page 3-95.
15.1.4 L2 Link State
The L2 state is a full power-off state for the link.
15.1.4.1 Entering into the L2 State
Entry into the L2 state is logically exactly the same as entering the L1 state. When the link has been transitioned to the L1 state and all power is removed from the link circuits, the link is in the L2 state.
15.1.4.2 Exiting from the L2 State
Exiting from the L2 state is done by reset. This is described in the Physical Layer chapter, Section 3.9.5.8, “L2 Low Power State” on page 3-98.
15.1.5 Link Width Modulation
This mechanism allows the transmitter to reconfigure the link width in order to reduce power consumption. Using this, a portion of the lanes on the link can be left in L0s, while the active portion can be in either L0s or L0. The lane width will be reconfigured in terms of the fraction of the max width, with full, half, and quarter supported. The number of lanes used must be equal to, or a subset of, the full link width that has been initialized. Link width modulation is handled separately for each of the two receiver-transmitter pairs on the link. In other words, a downstream transmitter may decide to reconfigure to half width while the upstream transmitter continues to use full width. This is useful for links that often have asymmetric bandwidth requirements. When waking up a portion of the link to go from a narrower to a wider configuration, some designs may require that the active lanes do not continue transmitting during this waking period. The concern is related to the instantaneous current draw caused by waking the other lanes, which may pose a signal integrity problem for lanes that continue to transmit. However, since the ability to continue transmitting is a desirable property from a performance standpoint, the reconfiguration mechanism is designed to allow the flexibility for each design to determine its own requirements here. Note that there are no requirements for flushing pending acknowledgements on the link before a transition. This is due to the fact that the mapping of packets to a physical lane assignment is not comprehended in the retry buffer. In other words, the muxing for the lane assignments comes between the retry buffer and the physical transmitters, so any acknowledgements that needed to be sent, or any packet that needed to be retried, could naturally be retried properly on the new width.
15.1.5.1 Wider to Narrower Reconfiguration Steps
This is a high level description of the sequence. More details and timing requirements are discussed in the Physical Layer, Section 3.9.5.5, “Link Width Modulation for Power Savings” on page 3-92. 1. When initiating a reconfiguration to narrow the link width, the Link layer must block all outgoing packets. 2. A PM.LinkWidthConfig packet is issued to indicate the new (smaller) link fraction. 3. The transmitter begins sending Idle flits at the new width, continuing for the time specified in the Physical Layer configuration space. At this point it is also free to put the unused lanes into L0s. 4. Having completed the transition interval, the endpoint initiating the reconfiguration un-blocks outgoing packets, sending them on the newly defined active portion of the link.
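A minimal C sketch of these wider-to-narrower steps follows, assuming hypothetical link-layer helpers; the idle time at the new width comes from Physical Layer configuration space, as the text states, and the width fractions are illustrative.

    /* Sketch of wider-to-narrower width modulation (Section 15.1.5.1). */
    #include <stdint.h>

    static void block_outgoing_packets(void)            { }
    static void unblock_outgoing_packets(void)          { }
    static void send_pm_link_width_config(uint8_t frac) { (void)frac; }
    static void send_idle_flits_at_width(uint8_t frac, uint32_t t)
        { (void)frac; (void)t; }
    static void put_unused_lanes_in_l0s(void)           { }

    enum { WIDTH_FULL = 4, WIDTH_HALF = 2, WIDTH_QUARTER = 1 };

    void narrow_link(uint8_t new_frac, uint32_t idle_time_from_phys_cfg)
    {
        block_outgoing_packets();             /* 1. block Link layer output */
        send_pm_link_width_config(new_frac);  /* 2. announce new fraction   */
        send_idle_flits_at_width(new_frac,    /* 3. Idle flits at new width */
                                 idle_time_from_phys_cfg);
        put_unused_lanes_in_l0s();            /*    unused lanes may sleep  */
        unblock_outgoing_packets();           /* 4. resume on active lanes  */
    }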
15.1.5.2 Narrower to Wider Reconfiguration Steps
This is a high level description of the sequence. More details and timing requirements are discussed in the Physical Layer, Section 3.9.5.5, “Link Width Modulation for Power Savings” on page 3-92. 1. If required by the transmitter design (as mentioned above), the transmitter may need to stop traffic when initiating a transition to reconfigure to a wider link. 2. The portion of the unused lanes that are now desired to be made active are brought from L0s to L0. 3. Once the transition to L0 has completed on the lanes being brought into service, a PM.LinkWidthConfig packet is issued to indicate the new (wider) link fraction. Note that this packet is still sent on the narrower link, as the receiver is not yet aware of the reconfiguration. 4. Having sent the request for reconfiguration, the Link layer blocks all subsequent outgoing transactions and transmits Idle packets on the new width, continuing for the time specified in the Physical Layer configuration space. 5. Having completed the transition interval, the endpoint initiating the reconfiguration un-blocks outgoing packets, sending them on the newly defined active portion of the link.
15.1.5.3 Power Management for Link Width Modulation
While a port is using a subset of the full initialized link width, it must maintain L0s on the unused lanes. The active lanes may be either in L0 or L0s depending on their usage and the L0s active power management policy. A mechanism has been defined to allow a different L0s wake time on the unused portion of the link. This means that the active and unused lanes on a port can have different wake times and therefore different levels of power savings. This can allow significant power reductions on those unused lanes, while still enabling the remaining active lanes to be configured for a faster response (but higher power) L0s state. The wake time for both the used and unused portions will be configured following link initialization by BIOS. Also, this configuration can be updated as described above. When using this L0s reconfiguration, a flag in the PM.LinkL0sConfig packet indicates whether an L0s configuration update applies to the active or the unused lanes. If a link is placed in L1 while using a smaller fraction of the full link, an exit back to the L0 state will preserve this reconfiguration and the unused lanes will stay in the L0s state. If this behavior is undesirable, then the lanes must first be reconfigured to full width again before entering L1.
15.1.5.4 Transmission Error Handling During Link Width Reconfiguration
Since the link width entry redefines how packets are oriented on the lanes, special consideration needs to be given to the case where the notification of the width change is not properly received by the other endpoint. In this case, the transmitter will have begun using the new width it had requested, but the receiver is unaware of the change. To handle this, the Link Level Retry notification will always indicate the link width that the receiver is using. As link width modulation is independent for each direction on the link, the Link Level Retry can be transmitted and received correctly, except for the special case when some other link issue causes the transmission of the Link Level Retry itself to be corrupted, which is described later.
When the endpoint that initiated the width reconfiguration receives this retry and sees the discrepancy in width between its transmitter and the other endpoint’s receiver, it will change itself back to use the width that the receiver is still configured for. Having done this, it will retry any packets pending in its retry buffer and may or may not re-issue the request for a reconfiguration. [Note that the endpoint that initiated the failed width reconfiguration is not bound to re-issue this reconfiguration, since the reconfiguration packet is a Link Layer special packet and is not kept in the retry buffer or subjected to flow control credits or acknowledgements.] A further extreme error condition could occur if the Link Level Retry is itself corrupted (as mentioned earlier). In this case, the endpoint that initiated the link width modulation does not receive the indication of the width that the endpoint which failed to receive the link width modulation request is operating with, since this information was in the corrupted Link Level Retry packet. Because of this, it is unable to understand, and consequently correct, the width mismatch. Due to the very small probability of this event, there is no special mechanism to optimize for this case. Instead, each endpoint will continue to retry the other until the maximum number of retries has been reached. Once this limit is reached, the link will reset, which will naturally correct the problem. [Note that this type of error affects only the link, and will only manifest itself as a large link latency to the rest of the platform while the reset is occurring.]
15.2 Platform Power Management
This outlines the support provided within CSI for handling the platform power management. Platform power management is a set of mechanisms and states, managed usually by software, which governs the power policy and usage for a platform or a partition within that platform.
15.2.1 Platform Power States
Currently, CSI comprehends a few specific categories of states, but the coordination mechanism is generic and can be expanded to include other states in the future. One thing implicit in the coordination mechanisms described below is that all participants in power management coordination have the same concept of what each state represents. For all the states discussed, the lower the number of that state, the higher the power, so S0, C0, and P0 would have the highest power consumption and performance for their state type. When relative levels of power states are being referred to, it is natural to refer to lower-numbered states as “higher states” or “higher power states”, etc. This will be seen throughout this chapter. • C-States: Govern the power management state of the platform processor(s) as defined in the ACPI specification. The different states define various functional levels of service the processor can provide to the system. A particular C-state, for instance, might indicate that a processor is not running code, but is able to service snoops with no added delay. • P-States: Govern the relative power-versus-performance point at which a platform is operating, so as to allow a platform to make informed decisions about power-versus-performance tradeoffs. P-States are expected to represent better than linear power savings for the performance impact (i.e., a 50% performance degradation should result in more than 50% power savings).
• T-States: Throttling states, which are similar to P-States in that they represent a measure of the power-versus-performance trade-off. They differ from P-States in that they are expected to result in a linear power reduction for the resulting performance degradation (i.e. a 50% performance degradation would result in a 50% power decrease).
• S-States: Govern the system power state as defined in the ACPI specification. These states are typically different types of system suspend states.
We will differentiate between two primary categories of power states. The first category is composed of C-, P-, and T-states and involves different components or threads on the platform that can be at different levels of these states at the same time. The second category is composed of S-states and involves an entire system entering into a suspend state in an orderly manner. The mechanisms for the two categories are very similar, but there are some subtle differences that justify treating them separately.
15.2.2 P, T, and C-State Coordination
15.2.2.1 Power State Coordination Overview When software makes a decision to transition a particular processor or thread into a lower power state, it does so with the limited information available to it about that thread, and perhaps about other resources that it controls which might affect this decision. However, there may be other agents on the platform to which the software making this decision has no visibility. When these other agents share resources with the processor or thread that software is trying to transition, or, more importantly, when they depend on resources that this thread or processor controls, it is important that they provide feedback to this decision-making process. Without this, an autonomous decision by one agent to manage its own power (due to inactivity, for example) could have a debilitating effect on another agent that still needs to operate at full performance. CSI provides a hardware mechanism whereby a node on the CSI fabric can query all of the other nodes with which it has these types of dependencies, in order to decide whether the decision that the software made with its limited information is acceptable to the rest of the system. This hardware coordination mechanism provides a responsive, efficient method of performing this coordination. For a given power state, like a P-state for example, there may be many features or configurations that an agent would like to enable or disable in order to manage its own power. Some of the actions it could take might be completely independent of any other agents in the system, while others might have an impact on other agents in the system and are therefore dependent on coordination with them. Only those features that are dependent on coordination must be deferred until that coordination has been achieved. For a P-state example, a given P-state in a particular system might include both a voltage/frequency change (independent) and a change in CSI link power management policy that would affect snoop latencies (dependent). In this example system, when software indicates that hardware should use this P-state, the voltage/frequency change can be made without any coordination, but the change in link power state policy would require coordination from the other nodes that need to perform snoops utilizing this link. As a note, we separate two notions of power state.
The desired state is the state that software (or possibly some hardware mechanism determining policy) has indicated that a particular node should observe if possible. However, the state that the agent has assumed may be different from the desired state, since the agent may not always be allowed to behave at the desired level, based on responses from other nodes taking part in the coordination. This distinction will be seen frequently in the examples below.
15.2.2.2 Coordination Protocol When a node decides that it needs to make a transition to a different power state, for any of the defined power state types, it needs to get feedback from all of the nodes that share the resources it would like to manage, in order to change power consumption and performance. For instance, if a node desires to go to a C3 state, where it will not be available for snoops for many microseconds, it needs to get permission to do this from all of the other nodes that are required to snoop its caches. To do this, it sends a PMReq message to each of the nodes that its transition is dependent on. This is done with a multi-cast operation, described in more detail in Section 9.8.2, “Broadcast Mechanism” on page 9-316. For this multi-cast operation, the message is sent to each node on the dependency list. This specification does not determine how these lists are constructed, so as to provide implementation flexibility. There may be a single list that is used for every state type and level being coordinated, or there may be a unique list for each state type and level. The request made in the PMReq message indicates to the nodes receiving it that a node on which they are dependent would like to make the specified state transition. Nodes receiving this are required to send a response in the form of a CmpD message, with the data in the response indicating the level of that state type that each would be willing to allow. In most cases, though not all, the state they would be willing to allow is the state that software had assigned to them as their desired states. Once the node making the request has received a response from all of the nodes it made a request of, it determines the state that it is allowed to transition to: the highest power/performance level state among its own desired state and all of the allowed states received in the responses. The resolution policy thus always favors performance over power savings (a code sketch of this resolution rule appears at the end of Section 15.2.2). It is worth noting that the nodes receiving the request take no action (other than a possible retry, described below) in response to a request made to them. The request is simply a query, and since a node making a response to a request has no visibility into the responses made by any other nodes, it does not know how the transition was resolved.
15.2.2.3 Transitions to Lower Power States The figure below shows an example flow for a P-state transition in which one node, Node B, has a change in its desired state from P1 to P4. This change could be due, for instance, to software changing the policy and writing to the appropriate registers in hardware to make this change. To get a coordination response that allows it to manage any dependent resources, it will make a request of nodes A and C, the two nodes on its dependency list.
[Figure: P-state lowering example with nodes A, B, and C. Tags: A: P3 (P1), B: P1.P4 (P1), C: P4 (P1). B issues PMReq[P4’]; A responds CmpD[P3], C responds CmpD[P4]; B resolves to P3. Note that this transaction is not complete, as retries are not shown.]
In this example, as in others that follow, each node letter name is tagged with that node's desired state, followed in parentheses by the state at which it is behaving. For example, the tag next to node A's name in the above diagram indicates that at the start of this flow node A has a desired state of P3, but has been stuck in P1 due to the responses it received when it made a request for coordination some time earlier. Similarly, the tag for node C indicates that it has a desired state of P4, but is behaving as P1 as a result of an earlier coordination attempt. For node B, the tag indicates that its desired state is changing from P1 to P4, and at this point it is still behaving at P1. This example progresses in the following steps:
1. Node B receives a change in its desired P state from P1 to P4 from software. To get coordination for this state, it issues a PMReq message indicating that it would like to go to P4. (The apostrophe in the request in the diagram indicates that this is an Initial request, described later in the Retry description.)
2. Node A receives this Initial request from B and sends a response indicating that it would be willing to allow a transition to P3. Node C receives this Initial request from B and sends back a response indicating that it would be willing to allow P4.
3. Node B receives the responses from A and C and, based on these, transitions to a P3 state, since Node A was not willing to allow anything lower performance than P3.
This example does not show a complete transition: having seen the Initial request made by Node B, nodes A and C would retry to see if they can get closer to their desired states. This is described below.
15.2.2.4 Coordination Retry Retries are a re-querying of the dependent nodes. This is done because it is known that something has changed which might have changed the response that those nodes made to the last request. This occurs under three circumstances, described below.
To facilitate the retry mechanism, a bit is included in the request which indicates whether the request being made is an Initial request or a Retry request. The request is flagged as an Initial request only the first time a request is attempted. This is an indication to any node receiving it that something on the requesting node has caused it to change its desired power state, for instance software choosing a new power state for it. The first case where retry is used is after a transition to a lower power state has been attempted and the result was a state that was not as low power as desired (resulting in either no change or some state in between). Since it is not at its desired state, the node in this condition would like to continue the transition to the desired state once the conditions that had caused the restricted transition change. It is alerted to a possible change in conditions when it observes any other node making its own Initial request. A request tagged as Initial is always an indication that something has changed, which may have removed a blocking condition. However, a node receiving this Initial request cannot assume that the node issuing the Initial request was the one blocking it from making the desired transition.
So, in order to determine whether this change would now allow it to get to its desired state (or at least get closer to it), it will re-issue its requests, this time tagged as Retry.
The second case where retry is used is when a transition to a higher power state is desired. A transition to a lower power state is always limited by the agent that requires the highest level of service within a group of dependent nodes. Because of this, these nodes need to retry their requests once one node requires an increase in power state, as these others might now be required to perform at a higher power state in order to meet the performance goals of the node that was directed to increase its desired state. This is covered more in Section 15.2.2.5. When a node detects an Initial request to a higher power state than its own desired state, it forces that node to issue its own Retry requests.
The third case where retry is used is when a node has sent a request (either an Initial or a Retry request) and it receives an Initial request from another node before it has received completions from all of the nodes it had broadcast to. This indicates that there is a possibility that some of the responses in the system are stale, and it must issue a Retry request to all dependent nodes immediately after it finishes the Initial request sequence, in order to ensure that it has relevant responses from all. This is described more fully in the section covering conflict cases, Section 15.2.2.6.
The operation flow and messaging for a Retry request is exactly like that for the Initial request described above, except that the request is designated as a Retry rather than an Initial.
15.2.2.4.1 Simple Retry Example This example demonstrates how the retry is triggered, shown using P-states. In this case node A is not in its desired P-state, having been restricted to P1 sometime before. Node B receives notification from software that it should now change from a desired P-state of P1 to P4; B had been behaving in the P1 state. C is in its desired P-state, which is P1.
[Figure: simple retry example with nodes A, B, and C. Tags: A: P3 (P1), B: P1.P4 (P1), C: P1 (P1). B issues PMReq[P4’] and receives CmpD[P3] from A and CmpD[P1] from C; B resolves to P1. A then issues the Retry PMReq[P3] and receives CmpD[P4] from B and CmpD[P1] from C; A resolves to P1.]
This example progresses in the following steps:
1. Node B receives a change in its desired P-state from P1 to P4 from software. To get coordination for this state, it issues a PMReq message indicating that it would like to go to P4. (The apostrophe in the request in the diagram indicates that this is an Initial request.)
2. Node A receives this Initial request from B and sends a response indicating that it would be willing to allow a transition to P3. Node C receives this Initial request from B and sends back a response indicating that it would be willing to allow only P1.
3. Node B receives the responses from A and C and, based on these, must stay at P1, since Node C was not willing to allow anything below P1.
4. Node A issues a Retry request, since it saw the Initial request made by Node B and it is not in its desired state (its desired state is P3, but it was only able to get to P1 the last time it requested).
5. Node B receives this Retry request from A and sends a response indicating that it would be willing to allow a transition to P4. Node C receives this Retry request from A and sends back a response indicating that it would be willing to allow only P1.
6. Node A receives the responses from B and C and, based on these, must stay at P1, since Node C was not willing to allow anything below P1.
This simple example was ultimately unproductive, since all of the nodes end up behaving at the same P-state they started at. This is because Node C was still the restricting agent, not allowing anything below P1.
15.2.2.4.2 Full Lowering Power State Retry Example This is another example that demonstrates how the retry is triggered, again shown using P-states. Unlike the previous example, this one does result in a state change by the nodes.
In this case node A is not in its desired P-state, having been stuck in P1 sometime before. Node B receives notification from software that it should now change from a desired P-state of P1 to P4; B had been behaving in the P1 state. C is not in its desired P-state of P2, having been stuck in P1 sometime before.
[Figure 15-3. Lowering Power State with Two-Node Retry: the message flow for the steps below. Initial tags: A: P3 (P1), B: P1.P4 (P1), C: P2 (P1); final tags: A: P3 (P1.P2), B: P4 (P1.P2), C: P2 (P1.P2).]
This example progresses in the following steps:
1. Node B receives a change in its desired P-state from P1 to P4 from software. To get coordination for this state, it issues a PMReq message indicating that it would like to go to P4. (The apostrophe in the request in the diagram indicates that this is an Initial request.)
2. Node A receives this Initial request from B and sends a response indicating that it would be willing to allow a transition to P3. Node C receives this Initial request from B and sends back a response indicating that it would be willing to allow only P2.
3. Node B receives the responses from A and C and, based on these, transitions to a P2 state, since Node C was not willing to allow anything below P2.
4. Nodes A and C both issue Retry requests, since they saw the Initial request made by Node B and they were not in their desired states.
5. Node A receives the Retry request from C and sends a response indicating that it would be willing to allow a transition to P3. Node B receives Retry requests from both A and C and sends each of them a response indicating that it would be willing to allow a transition to P4. Node C receives the Retry request from A and sends back a response indicating that it would be willing to allow P2.
6. Node A receives the responses from B and C and, based on these, transitions to a P2 state, since Node C was not willing to allow anything below P2. Node C receives the responses from A and B and, based on these, transitions to a P2 state, which is its desired state.
This example results in all nodes ending in a different state than they started in. This is because, at the beginning, Node B was the bottleneck, and software changed its policy. In the end, both A and B are still not as low as they would like to be, now being limited by Node C.
15.2.2.5 Transitions to Higher Power States When going from lower to higher power states, there are a few new considerations.
• The Initial request to a node for a higher power state than its current desired state will cause that node to issue a Retry to re-coordinate its state. This is required since the resolution of state always caters to the highest power state.
Since a new higher power state is present on one of the nodes (the requester), the coordinations that had taken place previously are obsolete and must be retried.
• Note that the state information being sent by the nodes responding to an increased power state request has no effect on the resolution by the requester, since the requester in this case will by definition have the highest power state. Regardless of this, the requesting agent must wait until the responses have been returned before making its transition to this higher power/performance state.
• A node may wish to delay its response if it needs to perform some action before allowing the requesting node to make its transition. An example of this would be if the node receiving the request has control of a physical resource, like a clock generator or a voltage regulator, that needs to be enabled or adjusted before the power state change can proceed on the requester.
The following is an example of the flow for an increase in power state, shown using C-states. It also demonstrates a condition where the state that an agent will allow is different from its own desired state. In this example nodes A, B, and C are all in C4 at the start of the flow. Node B receives notification from a local interrupt source that it must wake up to C0 and start executing code. Because all three nodes share memory and maintain coherency with each other, nodes A and C must also ultimately transition to a snoopable state so that Node B will have access to their caches.
[Figure 15-4. Increasing from C4 to C0 State and Induced Retries on 2 Nodes: the message flow for the steps below. Initial tags: A: C4 (C4), B: C4.C0 (C4), C: C4 (C4). B issues PMReq[C0’], which induces Retry requests PMReq[C4’] from A and C, answered by B with CmpD[C2]; A and C end at C4 (C4.C2).]
This example progresses in the following steps:
1. Node B receives a change in its desired state from C4 to C0, due to an interrupt from a resource behind that node. To get coordination for this state, it issues a PMReq message indicating that it would like to go to C0.
2. Node A receives this Initial request from B and sends a response indicating that it would be willing to allow a transition to C4. Node C receives this Initial request from B and sends back a response indicating that it would be willing to allow C4.
3. Node B receives the responses from A and C and transitions itself to the C0 state (since its own request for C0 was higher than both responses it received). Also in this step, nodes A and C issue Retry requests, since the Initial request made by Node B for C0 is higher than their desired states of C4.
4. Node B receives the Retry requests for C4 from A and C and sends a response indicating that it would be willing to allow a transition to C2. This is important to note, since it shows the distinction between the state a node will allow and its own desired state: B's desired state is C0, but to fulfill its own requirements (namely, to be able to perform snoops at the other nodes) it only needs the others to be in C2, which is a snoopable state. Also in this step, Node C receives the Retry request from A and sends back a response indicating that it would be willing to allow only C4, and Node A receives the Retry request from C and sends back a response indicating that it would be willing to allow only C4.
5. Node A receives the responses from B and C and, based on these, must transition to the C2 state, since Node B was not willing to allow anything below C2.
Node C receives the responses from A and B and, based on these, must transition to the C2 state, since Node B was not willing to allow anything below C2.
15.2.2.6 Conflict Cases There are no ordering requirements between requests for transitions between nodes. This is due to two factors: nodes respond with the states they would be willing to allow (not the state they are currently in), and if any Initial requests are detected during a sequence, the sequence is completed but abandoned and followed by a retry. The following example illustrates these points. In this example we show the effect of a node responding to another node's request and then making its own new request, which somehow passes that response. In this example Node B has a P3 desired state but has been restricted to P1, and Node A gets a request from software to change its desired state from P1 to P4.
[Figure: conflict-case flow for nodes A and B. Tags: A: P1.P4 (P1), B: P3 (P1), later B: P3.P2 (P1); final tags: A: P4 (P1.P2), B: P2 (P1.P2). Messages: PMReq[P4’] from A; CmpD[P3] from B (delayed); PMReq[P2’] from B; CmpD[P4] from A; Retry PMReq[P4] from A; CmpD[P2] from B.]
This example progresses in the following steps:
1. Node A receives a change in its desired P-state from P1 to P4 from software. To get coordination for this state, it issues a PMReq message indicating that it would like to go to P4.
2. Node B receives this Initial request from A and sends back a response indicating that it would be willing to allow only P3, but this response (for whatever reason) gets delayed on the way to A.
3. Node B receives a change in its desired P-state from P3 to P2 from software. To get coordination for this state, it issues a PMReq message indicating that it would like to go to P2.
4. Node A receives this Initial request from B and sends a response indicating that it would be willing to allow a transition to P4. Node A at this time also marks its own sequence as invalid, due to an Initial request being detected before all responses to its own request have been received.
5. Node A finally receives the response from Node B, and even though this resolution would have indicated that it could transition to the P3 state, it throws the result away. Node A issues a Retry to Node B for a transition to P4.
6. Node B receives the response from Node A and, based on this, transitions itself to P2.
7. Node B receives the Retry request from Node A and sends back a response indicating that it would be willing to allow a transition to P2.
8. Node A receives the response from B and, based on this, transitions to a P2 state, since Node B was not willing to allow anything below P2.
15.2.2.7 Other Behavior Notes
• A node on CSI must represent the interests of all of the resources behind it. It is assumed that any sub-node coordination is done locally behind the CSI node, in order to decide when a request needs to be made to CSI and how to respond to requests made to it from other CSI nodes. Requests should be made to CSI only when none of the other resources that share the node (such as other processors or threads) are limiting the state transition.
• All nodes that can reject a transition must also be able to issue their own requests. This is because a transition for a power state can be stalled by some node on the system that is unable to handle that transition at a particular time. If that node were unable to issue its own Initial request once the blocking condition is removed, there would be no way for other nodes to know that they should retry their transitions.
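The resolution rule used throughout this section reduces to a one-line computation under the chapter's numbering convention (lower number = higher power). The following C fragment is a minimal sketch; the type and function names are illustrative assumptions, not part of this specification.

    #include <assert.h>

    /* Levels follow this chapter's convention: a lower number is a
       higher power state (C0/P0 = 0, C4/P4 = 4). "Resolve to the
       highest power state among the node's own desired state and all
       allowed states from the CmpD responses" is therefore a numeric
       minimum. */
    typedef unsigned level_t;

    static level_t resolve(level_t own_desired,
                           const level_t *allowed, unsigned n_resp)
    {
        level_t result = own_desired;
        for (unsigned i = 0; i < n_resp; i++)
            if (allowed[i] < result)   /* a node demanding more power wins */
                result = allowed[i];
        return result;
    }

    int main(void)
    {
        /* Section 15.2.2.3: B desires P4; A allows P3, C allows P4. */
        level_t r1[] = { 3, 4 };
        assert(resolve(4, r1, 2) == 3);   /* B resolves to P3 */

        /* Section 15.2.2.5: B desires C0; A and C allow only C4. The
           responses cannot restrict a move to a higher power state. */
        level_t r2[] = { 4, 4 };
        assert(resolve(0, r2, 2) == 0);   /* B resolves to C0 */
        return 0;
    }

A request is tagged Initial only the first time it is attempted; any later re-query of the same dependent nodes is tagged Retry, as described in Section 15.2.2.4.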
15.2.3 S-State Coordination
15.2.3.1 Power State Coordination Overview S-states differ significantly from the P-states and C-states in a number of ways. During an S-state suspend entry, the entire system is put into a low power state together, and the operating system is primarily responsible for most of this transition. Unlike with P-states and C-states, the OS will be aware of nearly all aspects of the platform, as it has needed to proactively disable bus-master and interrupt-generating devices on the platform prior to entering the S-state. Also, significantly, unlike P-states and C-states, the transition to the S-state is driven by a single thread on the system and changes the states of the entire platform. So in a sense there is no coordination needed for S-states, since there is only one agent capable of driving an S-state entry. However, as this tends to be a subset of the operations used for P-states and C-states, the same mechanisms will be borrowed for making an S-state transition. The S-state entry request is made to CSI to allow all of the components to take care of any final hardware operations not handled by software prior to suspend. An example of this would be ensuring that any DRAM devices connected to a CSI node are in a self-refresh state if they are to remain active during the suspend state, so as to preserve their contents. The response can be delayed to handle this housekeeping, but the request cannot be refused. Prior to going into an Sx suspend state, it is assumed that the operating system has done the following:
• Flushed all caches to a storage medium whose contents will be preserved while in that suspend state. This could be either a non-volatile memory type, or a memory like DRAM if the system can guarantee the preservation of its contents (maintaining power and a self-refresh mode for the DRAM example).
• Disabled all bus-mastering devices. No new requests will be generated from any I/O devices.
• Disabled all interrupt sources. The exception to this is that an SMI can still be generated under some circumstances during the entry sequence. This is described later.
• Halted all processor threads except the one which will initiate the actual S-state entry.
Though a single processor thread will be responsible for initiating the actual S-state entry, this may not be the agent making the request on CSI. In some platforms the entry into a suspend state will be done by that processor thread writing to a register on another I/O device on the platform, most likely a chipset component. This component would then be the device responsible for making the request to all of the CSI devices.
15.2.3.2 Coordination Protocol This uses the same mechanisms seen in the P- and C-state protocol description, with the only minor exception being the handling of the response. In the response to a coordination request for S state, the node responding must reflect the state requested in its response. This implies that all agents requested to make an S state entry must do so, with no means for rejection. A node can defer its response in order to take care of any local housekeeping tasks needed to prepare for power removal, for example, but the response must come and must be permissive. There is consequently no retry case for S states: S state requests must always be marked as Initial.
15.2.3.3 Transitions to Lower Power States An example will be used to show the flow for a three-node system for entry into S3.
In this example the IOH is a chipset component that is also a CSI node. The ICH is attached to the IOH via a non-CSI link; it contains the register with the Sleep Enable bit and also controls any other signals on the platform necessary for transitioning to S3. The Sleep Enable bit will be written by Node B, which represents the thread that the OS is using to initiate sleep entry. All threads behind Node A are halted, and Node A cannot issue any Protocol layer messages.
[Figure 15-6. S-State Entry Example: agents A, B, IOH, and ICH. Node B issues NcIOWr[SLP_EN] to the ICH (IOWr, completed with Cmp/WrCmp); the ICH sends Go-S3 to the IOH; the IOH issues PMReq[S3’] to A and B, collects the CmpD[S3] responses, resolves to S3, and returns Ack-S3 to the ICH. An IA-32 Node B may also issue NcIORd[WAK_STS] (IORd, completed with DataNC/RdCmp).]
This example progresses in the following steps:
1. Node B issues an I/O write to the Sleep Enable bit in the ICH, which is not a CSI node.
2. The ICH sends a completion for the write. This gets converted to a Cmp message to Node B by the IOH.
3. If Node B is an Itanium processor, it will stop sending Protocol layer messages until it is reset or it receives a PMReq[S0’] message. If Node B is an IA-32 processor, it may issue one or more I/O reads to the WAK_STS status bit, as well as speculative (code) reads, to memory.
4. The ICH sends a Go-S3 message to the IOH.
5. The IOH issues a PMReq message indicating that it would like to go to S3. (The apostrophe in the request in the diagram indicates that this is an Initial request.)
6. Nodes A and B receive this Initial request from the IOH and send a response indicating that they would be willing to allow a transition to S3. If Node B was issuing memory or I/O reads, it will stop sending new reads and wait for outstanding reads to complete before sending the PMReq[Sx’] completion. (Nodes A and B finish up any local housekeeping necessary prior to sending this response.)
7. The IOH receives the responses from A and B and sends an Ack-S3 message to the ICH.
8. The ICH recognizes that it can now transition the platform to S3.
15.2.3.4 Coordination Retry Since all nodes must accept a request for an S state entry, there is no application of the retry mechanism for S-states.
15.2.3.5 Transitions to Higher Power States For most suspend state exits, the transition to a higher power state is handled through a system reset, implicitly bringing the system back to the S0 active state. These transitions require no additional consideration for CSI. There is the possibility that in certain S-states the CSI links will need to resume from an S state without a reset. For these cases, from a CSI protocol standpoint, the operation is identical to transitioning to a lower power state: the state is requested by one endpoint and accepted by all others. There are many other hardware considerations with these types of implementations which are outside the scope of this specification, but the protocol does support this.
15.2.3.6 Conflict Cases Since there is only one agent on any system that is able to make an S-state request, there is by definition no conflict case to handle.
15.2.3.7 Other Behavior Notes It is assumed that in all cases, once the request for an S-state entry has been made, no other Protocol layer messages will be issued to CSI. An agent that issues PMReq[Sx’] must not issue any more Protocol layer messages except for PMReq[S0’]. Agents receiving a PMReq[Sx’] must not send any Protocol layer requests after they send CmpD[Sx], until they receive a PMReq[S0’].
There are some cases where an SMI could be generated during the sleep entry sequence in the OS, but these must all be blocked before the actual state request has been made to CSI. If the PMReq[Sx] message is triggered by an NcIOWr message to the chipset, then the chipset is required to issue the NcIOWr completion before issuing the PMReq[Sx’]. There is no ordering requirement between the PMReq[Sx’] and the NcIOWr completion at the receiving node. Although the PMReq[Sx’] is initiated by an I/O write instruction, there is no requirement that the PMReq[Sx’] arrive at the processor before the I/O write instruction retires and subsequent instructions execute.
15.2.3.8 S-States and Link State Policy There is no specified link state policy for C-states, P-states, and T-states. But the links must be in the L1 link state for an orderly shut-down of the system. So CSI nodes must have a policy of entering the L1 link state after sending a CmpD[Sx] message. This policy must remain in place until the node is powered down or it receives a PMReq[S0’] message. The node that issued the PMReq[Sx’] must implement a similar policy from the point it receives the last CmpD[Sx] until it decides to issue a PMReq[S0’]. All of the links will be in the L1 link state a deterministic amount of time after the last CmpD[Sx] is received. This is true even if a node puts its links into L1 prematurely and has to wake them up to route-through a CmpD[Sx] message from another node.
15.3 Power Management Related Messages
15.3.1 Platform Power Management Messages These messages are used for handling platform power management state coordination.
15.3.1.1 PMReq - Power Management Transition Request The PMReq message is one of the NcMsgB messages, NcMsgBPMReq. This message is used for making a request for a power state transition from other nodes on the platform. PMReq is an NCM-type packet sent in the Non-Coherent Bypass (NCB) Message Class. The 8 data flits in the NcMsgB message do not contain any information.

Table 15-2. PMReq Data Field Mapping
  Packet Data          Data Field          Comments
  Param ByteA[3:0]     State_Type[3:0]     State type being requested
  Param ByteA[6:4]     Reserved            Reserved
  Param ByteA[7]       Initial             Initial Request Tag
  Param Byte0[7:0]     State_Level[7:0]    Level for state requested
  Param Byte1[7:0]     State_Level[15:8]   Level for state requested
  Param Byte7:2[7:0]   Reserved            Reserved

Data Fields
• State_Type[3:0]: An encoded field indicating the type of state being requested in this transaction.

Table 15-3. PMReq State_Type Field Encoding
  State_Type   State      Description
  0000         C          C-states, based on ACPI processor states
  0001         P          Power/Performance states
  0010         S          S-states, based on ACPI system states
  0011         T          Throttle states
  Others       Reserved   Reserved for future use

• State_Level[15:0]: A one-hot field to indicate the level being requested within a given state type. For example, a P2 request would have State_Type[3:0] equal to “0001” and State_Level[15:0] equal to “0000 0000 0000 0100”.
• Initial: When set to ‘1’, indicates that this is the first time this state change has been attempted. If this is a retry, this bit will be cleared.
A power management transition request PMReq[.x] refers to an NcMsgBPMReq message of state type “.” (C, P, S, or T), with State Level bit x set and the Initial bit clear. PMReq[.x’] refers to the same message with the Initial bit set.
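As an illustration of Tables 15-2 and 15-3, the following C fragment packs the PMReq parameter bytes. The struct and helper names, and the choice to show only ByteA/Byte0/Byte1, are assumptions for illustration; bytes 2 through 7 are reserved and left zero.

    #include <stdint.h>

    /* State_Type encodings from Table 15-3. */
    enum state_type { ST_C = 0x0, ST_P = 0x1, ST_S = 0x2, ST_T = 0x3 };

    typedef struct {
        uint8_t byteA;   /* [3:0] State_Type, [6:4] reserved, [7] Initial */
        uint8_t byte0;   /* State_Level[7:0]  */
        uint8_t byte1;   /* State_Level[15:8] */
    } pmreq_params_t;

    static pmreq_params_t pmreq_encode(enum state_type type,
                                       unsigned level,  /* e.g. 2 for P2 */
                                       int initial)
    {
        pmreq_params_t p = { 0, 0, 0 };
        uint16_t one_hot = (uint16_t)(1u << level);   /* one-hot State_Level */

        p.byteA = (uint8_t)(((unsigned)type & 0xFu) | (initial ? 0x80u : 0u));
        p.byte0 = (uint8_t)(one_hot & 0xFFu);
        p.byte1 = (uint8_t)(one_hot >> 8);
        return p;
    }

    /* Example: PMReq[P2'] (an Initial P2 request) yields byteA = 0x81,
       byte0 = 0x04, byte1 = 0x00, matching State_Type "0001" and
       State_Level "0000 0000 0000 0100" from the text above. */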
15.3.1.2 CmpD - Completion with Data as a Power Management Transition Response This message is used as the response to a PMReq power state transition request. CmpD is an SCD-type packet sent in the Non-Data Response (NDR) Message Class.

Table 15-4. Power Management Transition Response Data Field Mapping
  Packet Data        Data Field          Comments
  Param Byte0[3:0]   State_Type[3:0]     State type being requested
  Param Byte0[7:4]   Reserved            Reserved
  Param Byte1[7:0]   State_Level[7:0]    Level for state requested
  Param Byte2[7:0]   State_Level[15:8]   Level for state requested

Data Fields
• State_Type[3:0]: An encoded field indicating the type of state being responded to in this transaction. This must match the State_Type in the PMReq packet that is being responded to.

Table 15-5. CmpD State_Type Field Encoding for Power Management
  State_Type   State      Description
  0000         C          C-states, based on ACPI processor states
  0001         P          Power/Performance states
  0010         S          S-states, based on ACPI system states
  0011         T          Throttle states
  Others       Reserved   Reserved for future use

• State_Level[15:0]: A priority-encoded field indicating the level, within a given state type, that the responding node would be willing to allow another node to transition to. At least one bit must be set. If multiple bits are set, then the lowest bit set indicates the allowed state. For example, if the node could allow a P2 transition, it would have State_Type[3:0] equal to “0001” and State_Level[15:0] equal to “XXXX XXXX XXXX X100”; State_Level bits 3 through 15, corresponding to P3 through P15, are ignored because bit 2 is set.
A power management transition request completion CmpD[.x] refers to a CmpD message of state type “.” (C, P, S, or T), with State Level bit x as the lowest State Level bit set.
15.3.2 Link Power Management Messages
15.3.2.1 PM.LinkL0sConfig - Link Power Management L0s Configuration This message is used for changing the amount of time allowed for waking from an L0s link state. This is part of the Link Layer Special Message Class.

Table 15-6. PM.LinkL0sConfig Data Field Mapping
  Packet Data         Data Field        Comments
  Data Field[11:0]    Wake_Time[11:0]   Wake time assignment
  Data Field[15:12]   Reserved          Reserved
  Data Field[16]      Cfg_NonActive     Wake time assigned for nonactive link portion
  Data Field[31:17]   Reserved          Reserved

Data Fields
• Wake_Time[11:0]: The value indicating the number of 16-Unit-Interval periods (flits) allowed for waking from L0s. The wake time is equal to (count + 1) * 16 UI; this gives a range of 16 to 65,536 Unit Intervals.
• Cfg_NonActive: When this bit is set, the configuration defines the wake time for the portion of the link that is disabled during link-width reconfiguration and not used for transactions. If this bit is cleared, the configuration relates to the normal active operation of the link used for transactions.
15.3.2.2 PM.LinkWidthConfig - Link Width Configuration This message is used for changing the width of the link, to conserve power during less active conditions. This is part of the Link Layer Special Message Class.

Table 15-7. PM.LinkWidthConfig Data Field Mapping
  Packet Data        Data Field     Comments
  Data Field[3:0]    LaneMap[3:0]   New width assignment via lane map update
  Data Field[31:4]   Reserved       Reserved

Data Fields
• LaneMap[3:0]: The value of the new Lane Map, as described in the Physical Layer [Section 3.9.1.3, “Link Map and Width Capability Indicator” on page 3-55].
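Two small computations fall out of Tables 15-4 and 15-6: the priority decode of a CmpD State_Level field (the lowest set bit wins) and the PM.LinkL0sConfig wake time. The following C fragment is a minimal sketch; the function names are illustrative assumptions.

    #include <stdint.h>

    /* Table 15-4: the lowest set bit of State_Level is the allowed level;
       e.g. "XXXX XXXX XXXX X100" decodes to 2 (P2), higher bits ignored. */
    static unsigned cmpd_allowed_level(uint16_t state_level)
    {
        unsigned level = 0;
        while (level < 16 && !(state_level & (1u << level)))
            level++;
        return level;   /* the spec requires at least one bit to be set */
    }

    /* Table 15-6: Wake_Time is a 12-bit count; the wake time is
       (count + 1) * 16 UI, spanning 16 to 65,536 Unit Intervals. */
    static uint32_t l0s_wake_time_ui(uint16_t wake_time_field)
    {
        return ((uint32_t)(wake_time_field & 0xFFFu) + 1u) * 16u;
    }

    /* Examples:
         cmpd_allowed_level(0x0004) == 2        (P2 allowed)
         l0s_wake_time_ui(0x000)    == 16       (minimum)
         l0s_wake_time_ui(0xFFF)    == 65536    (maximum)  */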
15.3.2.3 PM.LinkEnterL0s - Link Power Management Enter L0s State This message is used for making a request for the link to enter the L0s state. This is part of the Link Layer Special Message Class.
15.3.2.4 PM.LinkEnterL1 - Link Power Management Enter L1 State This message is used for making a request for the link to enter the L1 state. This is part of the Link Layer Special Message Class.
15.3.2.5 PM.LinkReqAck - Link Power Management Acknowledge This message is used for acknowledging receipt of a PM.LinkL0sConfig or PM.LinkEnterL1 message. Sending this indicates acceptance of the policy requested by one of those messages. This is part of the Link Layer Special Message Class.
15.3.2.6 PM.LinkReqNack - Link Power Management No Acknowledge This message is used for acknowledging receipt of a PM.LinkL0sConfig or PM.LinkEnterL1 message while refusing its instructions. Sending this indicates refusal to accept the policy requested by one of those messages. This is part of the Link Layer Special Message Class.
16.1 Quality of Service (QoS)/Isochronous Platform Requirements CSI provides the capability to support differentiated services or varying QoS needs of I/O devices. In this context, isochronous operation in CSI is a specific usage model of QoS. CSI defines two separate message classes at the Link layer, along with a transaction labelling mechanism at the Protocol layer, to provide differentiated service. Current platforms support three classes of isochronous sub-systems: Legacy-ISOC, PCI-E-ISOC, and INT-GFX-ISOC. Each class of isochronous transactions has a different set of coherency, ordering, and latency requirements, as described in the next few sub-sections.
16.1.1 Legacy ISOC Legacy ISOC refers to the hardware and software real-time sub-systems that predate the appearance of PCI-Express; it excludes the real-time services of external and integrated graphics. Traditionally, Legacy ISOC is used for some audio and Universal Serial Bus sub-systems. Legacy-ISOC services provide read and write transaction streams between the relevant DMA devices and system memory. Legacy-ISOC reads and writes are strictly ordered with respect to processor read and write transactions to system memory. In addition, Legacy-ISOC (write) transactions are strictly ordered with respect to completions of processor reads from the respective I/O devices; that is, read completions from an I/O device to the processor push the upstream writes issued from that I/O device. Legacy-ISOC transactions are coherent with respect to the platform's caches. In a system configuration where a Legacy-ISOC device needs to communicate with a Home Agent across the CSI fabric, the entire isochronous traffic flows through CSI. In a system configuration where a Legacy-ISOC device can communicate with the system memory controller outside of the CSI fabric, only snoop requests and the corresponding snoop responses, for cache coherency purposes, flow through CSI. Legacy-ISOC latency in a CSI-based platform must not be any higher than the latency observed in pre-CSI platforms. It is therefore required that, in CSI-based systems, the operation of legacy-ISOC transactions not be exposed to new stalling conditions arising from unrelated transactions.
16.1.2 PCI-Express* ISOC PCI-Express (PCI-E) ISOC refers to the hardware and software real-time sub-systems compatible with the PCI-E specification.
PCI-E-ISOC services provide read and write transaction streams between the relevant DMA devices and system memory. While the PCI-E specification does not preclude the usage of peer ISOC transactions, such a usage model is not yet in existence. Unlike Legacy-ISOC, PCI-E-ISOC writes are not ordered with respect to completions of processor reads from the respective I/O devices. PCI-E-ISOC transactions are not coherent with respect to the platform's caches. However, in a platform where isochronous latency and bandwidth requirements cannot be guaranteed while maintaining cache coherency, source CSI agents must not set the cache coherency attribute for PCI-E-ISOC transactions. These transactions are visible on CSI only in a system configuration where a PCI-E-ISOC device needs to communicate across the CSI fabric with an agent that provides access to its memory subsystem.
Implementation Note: When PCI-E Isoc traffic does not snoop the platform's caches, an explicit flush protocol mechanism is required for interaction between the processor and the PCI-E Isoc device. The mechanism's implementation depends on the device hardware, its driver software, and the system software. Requirements for data consistency between the processor and PCI-E Isoc devices are beyond the scope of this specification.
PCI-E-ISOC sub-system latency is controlled by system software through the standard capability and status reporting of PCI-Express. In practice, PCI-E-ISOC sub-system latency in a CSI-based platform must not be any higher than the latency observed in pre-CSI platforms. It is therefore required that, in CSI-based systems, the operation of PCI-E-ISOC transactions not be exposed to new stalling conditions arising from unrelated transactions.
16.1.3 Integrated Graphics ISOC Services Integrated Graphics (INT-GFX) ISOC refers to the hardware and software real-time sub-systems used in graphics sub-systems. INT-GFX-ISOC services provide read and write transaction streams between the relevant DMA devices and system memory. There could be one or more independent real-time DMA engines that generate INT-GFX-ISOC transactions. INT-GFX-ISOC transactions have no ordering relationship with respect to processor transactions or any other traffic in the system. This requirement enables the elimination of stalling conditions with respect to the latency and bandwidth services of these sub-systems. Similarly, INT-GFX-ISOC transactions are not coherent with respect to the platform's caches; this requirement must be met in order to eliminate the potential stalling associated with the coherency protocol. These transactions are visible on CSI only in a system configuration where a graphics controller operating within a UMA architecture needs to communicate with a Home Agent over the CSI fabric. INT-GFX-ISOC transactions can have varying latency requirements, depending on the functionality of the requesting DMA engine. CSI-based systems must not introduce any new stalling conditions as a result of unrelated transactions. An example of a usage model for INT-GFX-ISOC transactions is described later.
16.1.4 QoS Extensions - Compatible with PCI-Express PCI-Express supports differentiated services by means of a Virtual Channel mechanism and Traffic Class identification for certain classes of applications.
Since a PCI-Express device may need to communicate with an agent over the CSI fabric, CSI must have QoS extensions to provide differentiated service capability in a manner that is compliant with PCI-E mechanisms.
16.2 ISOC - Message Classes and Traffic Classes
16.2.1 Message Class Definition The CSI link layer provides two dedicated message classes for ISOC traffic: Command (ICS) and Data (IDS). The ICS and IDS message classes provide independent CSI channels for ISOC sub-systems whose real-time transactions must cross the CSI fabric. From the CSI link-layer perspective, both channels (ICS and IDS) are strictly ordered, across all addresses, from endpoint to endpoint. Requests in these channels must be treated as high priority requests at the various arbitration points of CSI traffic flow in order to meet latency requirements. The exact mechanism and arbitration policies are product specific and beyond the scope of this specification.
16.2.2 Traffic Class Definition Provisions for QoS extensions enable the support of differentiated traffic classes across the CSI fabric in future platforms. In this respect, ISOC is a specific instance of QoS. The CSI QoS extensions are compatible with the PCI-Express QoS model and are based on the "Traffic Class" concept. By providing Traffic-Class information in CSI packets, usage of Virtual Channels can be made, for traffic of different classes, across the CSI fabric and under system-software control. CSI ISOC traffic classes define independent Protocol-layer traffic, which is handled based on its own set of rules. The latency requirement for the various traffic classes can be met by means of independent and dedicated Protocol layer resources (queues/buffers and associated control logic) and an appropriate arbitration scheme for these transactions to access system memory. From the CSI protocol point of view, transactions within ISOC channels are strictly ordered within an ISOC Traffic Class, but may be out-of-order between different Traffic Classes.
• Read and write data packets of the same Traffic-Class are always delivered in exactly the same order as the respective requests.
• Read and write data packets of different Traffic-Classes can be delivered out of order.
Display, PCI-E, and Legacy ISOC are just three examples of CSI (protocol) ISOC traffic classes. The ISOC command (ICS) and data (IDS) channels are credit-based and pre-allocated from the Protocol layer point of view. In this way, it can be guaranteed that the receiver's link buffers can always be drained of the respective ISOC command and data packets, for further Protocol layer processing.
16.2.3 Mapping ISOC Transactions to ICS and IDS ICS and IDS are optional channels in CSI-based platforms. An ICS channel must be placed in the direction of ISOC requests (from source to destination nodes). IDS channels may be placed in two directions: one for write-data delivery (in the same direction as the request) and the other for read-data delivery (in the opposite direction). Table 16-1 describes the content of ICS and IDS as a function of the ISOC transaction's components.
Table 16-1. Isochronous Command and Data
  Transaction component   Message Class   Direction
  Request                 ICS             Requestor -> Completer
  Write Data              IDS             Requestor -> Completer
  Read Response           IDS             Completer -> Requestor
  Write Response          IDS             Completer -> Requestor

In a system configuration where a Legacy-ISOC device needs to communicate with a Home Agent across the CSI fabric, the isochronous requests use the HOM channel, to maintain coherency with respect to the platform's caches. In addition, a priority attribute is tagged onto such cycles to differentiate them from other HOM channel requests. In a system configuration where a Legacy-ISOC device can communicate with the system memory controller outside of the CSI fabric, the snoop requests that are visible on CSI for cache coherency use the SNP channel with the priority attribute, and the corresponding snoop responses flow via the HOM channel.
16.3 Link Layer Packet Fields and ISOC Support The following are the unique ISOC-request attributes, along with their respective usage and their mapping to the Link layer packet format. ISOC request attributes vary as a function of the request's Traffic Class and transaction length.

Table 16-2. ISOC Request Attributes
  Attribute: Cache Coherency
    Usage: Specifies the coherency requirement between a traffic class and the platform's caches.
    Link Packet Mapping: ICS Message Class opcodes. IsochCmd*Coh* opcodes indicate that the request is of a traffic class that requires a snoop of the cache; for the remaining messages of the ICS Message Class, the cache must not be snooped for the request.
  Attribute: Priority
    Usage: Four priority levels are used to differentiate among ISOC traffic classes; the traffic class with value '00' is of the highest priority.
    Link Packet Mapping: PE field of the ICS channel request, in a desktop platform. This attribute is also used to differentiate legacy-Isoc cycles on the HOM channel and legacy-Isoc snoops on the SNP channel.
  Attribute: Data Consistency
    Usage: Specifies whether an ISOC request requires a data-consistency check with respect to the “global observation point” of the system. IsochCmd*Consis* opcodes indicate that the request is of a traffic class that must perform a data-consistency check against globally observed data; the remaining messages of the ICS Message Class are not required to perform such a check.
    Link Packet Mapping: ICS Message Class opcodes.
  Attribute: Chain
    Usage: Chain enables arbiters in the destination node to service the original request (of multiple 64B fragments) atomically. This minimizes the latency impact to cacheline-sized isochronous requests that traverse CSI separately but are actually part of a larger multi-cacheline isochronous request. '1' indicates that the CSI request is not the last fragment of the original ISOC request; '0' indicates that the request in ICS is the last fragment of the original ISOC request.
    Link Packet Mapping: Bit 1 of the PH field of the ICS channel request, in a desktop platform.

Table 16-3 shows the mapping of traffic-class examples to CSI request attributes.

Table 16-3. Mapping of Traffic-Class Examples to CSI Request Attributes
  Display, VGA:                    Channel ICS/IDS; Cache Coherency: must not snoop; Priority: 00; Data Consistency: no data-consistency check required.
  PCI-E:                           Channel ICS/IDS; Cache Coherency: must not snoop; Priority: 10; Data Consistency: data-consistency check required.
  Overlay, Cursor, Video Capture:  Channel ICS/IDS; Cache Coherency: must not snoop; Priority: 11; Data Consistency: no data-consistency check required.
  Legacy ISOC:                     Channel HOM/SNP; Cache Coherency: not applicable, since all requests on HOM snoop the platform's caches (a); Priority: 10; Data Consistency: not applicable, since all requests in the HOM channel are required to obey the platform's data consistency rules.
a) Note that the PCI Express Specification allows isochronous traffic to be both snooped and non-snooped. Intel's platform imposes a restriction allowing only non-snooped Isoch traffic.
16.4 Link Layer - QoS Packet Extensions The QoS extensions are modeled on the PCI-Express "Traffic-Class" concept. This enables the usage of dedicated (PCI-Express-style) "Virtual Channels" for differentiated traffic classes, across the CSI fabric, in a manner compatible with PCI-Express. Virtual channels provide dedicated buffering and arbitration for differentiated traffic classes, under system-software control. The QoS extensions are applied to device cycles to system memory, as well as to peer-to-peer device transactions. A 4b "Traffic-Class" request attribute is provided in all channels in the "Extended-Address" header format.
16.5 Usage Models of Isochronous Traffic in Current Platforms Applications that use integrated audio and USB controllers generate Legacy-ISOC transactions. Developed before the emergence of PCI-Express, these transactions are required to snoop the platform's caches and require an average worst-case bandwidth of 100MB/s. PCI-E-ISOC sub-systems are the emerging choice for real-time applications; as a result, their bandwidth requirement is expected to grow continuously. It is expected that by the time CSI-based systems are introduced, PCI-E-ISOC bandwidth requirements will exceed 500MB/s. This does not include the graphics traffic bandwidth to a system memory controller that would have to traverse the CSI fabric in a UMA architecture. An integrated graphics subsystem can have one or more independent, active, real-time DMA engines of each of the following types:
• Display (reads)
• Overlay (reads)
• Cursor (reads)
• Video Capture (writes)
• Legacy VGA (reads and writes)
The bandwidth requirements of INT-GFX-ISOC sub-systems grow quickly in every new generation of relevant computer systems. Similarly, the required latency of INT-GFX-ISOC sub-systems varies widely, based on requirements as well as on implementation-specific details (such as FIFO sizes and arbitration details). The following lists generic guidelines for INT-GFX-ISOC latency and bandwidth:
1. The display subsystem may include multiple independent display sub-systems. The bandwidth requirement is driven by the number of displays, the supported display modes (screen resolution, screen refresh rate, and pixel color depth), and specific details of the system implementation. Generally, display bandwidth is very high (as compared to other graphics traffic). Display read latency must be in a predictable min-max range, to ensure the respective FIFOs never overrun or underrun. For these reasons, display sub-systems are serviced via dedicated system resources (buffers and buses), along with very high arbitration priority when crossing shared media (such as CSI).
2. The overlay sub-system shares many of the display attributes. It is typically considered a lower-bandwidth agent with relaxed latency requirements. The overlay latency requirements are driven by the associated, very large scan-line buffers, which give it some latency tolerance as compared to the display sub-system (where much smaller FIFO structures are used).
3. The cursor is a specific sub-set usage model of display.
4. Video capture consists of one or more write-data streams to memory. The video-capture bandwidth requirements are much smaller than the display's.
Video capture is also purely a write transaction stream, which inherently enables easier control of latency.
5. Legacy VGA is a special case of display. It is not concurrently active with other INT-GFX-ISOC streams. The VGA sub-system is extremely latency sensitive. Although VGA bandwidth requirements are lower than those of a (high-resolution) display, its usage of system memory is very inefficient, with a high percentage of page-miss accesses. It is important to note that the effective VGA access bandwidth and latency are a function of the display data pattern.
16.6 ISOC/QoS Support Restrictions
1. Isoch support is not provided for dual-/multi-socket systems with distributed memory controllers.
2. The CS Chaining mechanism is restricted from chaining transactions across 4 KB address boundaries.
17.1 LaGrande Technology Background Information The LaGrande Technology (LT) security architecture introduces changes to the processor, chipset, platform components, and the interfaces that connect them together, to enable the creation of an isolated security domain where trusted software can reside and be isolated from the rest of the system. The security architecture described in this chapter includes mechanisms to isolate the secure domain and its content, support the attestation of the secure domain to internal and external observers, and provide sealed storage where secret content can be kept and accessed only by the domain which created the content. In addition, the mechanisms needed to perform a controlled launch are described. These mechanisms define a system architecture that permits the secure installation, launch, and use of a secure kernel. The ability to authenticate the identity and integrity of the secure kernel allows the user or other agents to make an informed decision to trust the kernel and, by extension, allows the verifiable installation, launch, and use of trustable services and applications. The LT launch process relies on a measurement agent as the root of trust to perform a system measurement and to attest the hardware and software state of the platform. The root of trust consists of the platform hardware (processors, chipset, circuit board, and Trusted Platform Module (TPM)) and special software components (on-die processor microcode or PAL code). The secure launch process extends the root of trust in a transitive fashion to include chipset-specific authenticated code and the secure kernel. At each step of the process, the new components are measured and the measurement is recorded in the TPM (Figure 17-1). The LT secure launch process must perform a system measurement and record it without dependencies on untrusted configuration firmware or other software. However, standard CSI protocol operation relies on a configuration of decoders and route tables set up by untrusted platform firmware. Therefore, the system architecture must support a trustable way to check the configuration by relying only on the hardware root of trust and authenticated code. This chapter will describe the CSI mechanisms that have been defined to achieve this objective. A detailed discussion of how these mechanisms are used is documented in a separate “Security System Architecture Specification for CSI Systems” document.
[Figure 17-1. The LT launch: hardware/microcode, authenticated code, secure kernel and legacy OS, with their trusted address spaces.]

17.2 Secure Launch In CSI Systems

At the conceptual level, the CSI secure launch process is an extension of the FSB secure launch process in multi-node FSB systems. Indeed, processors in CSI systems may have a multitude of processor cores and chipset components, such as memory controllers, integrated in a single processor die; consequently, each processor die resembles a node in a multi-node FSB system. Accordingly, the CSI secure launch process closely tracks the CSI power-on configuration process, which is similar to the power-on configuration in multi-node FSB systems. During the secure launch process, the hardware root of trust makes sure that the proper System Initialization (SINIT) authenticated code (AC) modules are launched. The job of this secure code module is to check, verify and register the state of the complete system.

17.2.1 Simple CSI Systems

For CSI systems of relatively simple topology, it may be acceptable for the hardware root of trust to initiate just one SINIT AC module that checks, verifies and registers the state of the system. Whether a particular system needs to run SINIT AC modules on all processor packages or not depends on the complexity of the system configuration. In any case, the root of trust verifies that the system runs the correct SINIT AC module by computing the module's digital signature hash and verifying that the hashes match on every processor socket that runs it (this covers the case where the system needs multiple SINIT modules to run). The digital signature hash is recorded in the Trusted Platform Module (TPM). In order to perform the secure launch, the root of trust must have at its disposal a communication mechanism between processor sockets, chipset and TPM that does not rely on configurable structures. In simple CSI systems it may often be possible to ensure that all these communication mechanisms are trusted, by establishing and checking the links using the root of trust itself before a secure launch is established. In some large and complex system configurations a separate mechanism is necessary because, unlike LT-1 in FSB systems, the chipset is not directly connected to all processors; furthermore, in such systems the chipset cannot broadcast to all processor dies without using untrusted configurable routers.

17.2.2 Complex CSI Systems

The CSI secure launch process will be initiated by untrusted software through execution of SENTER, a special instruction implemented in the root of trust. In turn, the root of trust uses the intra-socket processor communication mechanism to construct a spanning tree without relying on untrusted configuration of CSI address decoders and route tables. Essentially, the hardware implements a flood mechanism that can grab all processor sockets in a partition in a way that cannot be compromised by untrusted code. Thereafter, the spanning tree forms the basis for the implementation of a family of broadcast and barrier primitives. These primitives are used to coordinate the measurement and authentication of the SINIT module by the root of trust and, subsequently, the validation of the CSI configuration by the SINIT module.
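The per-socket hash check described above can be sketched as follows. This is illustrative only: sinit_digest() is a hypothetical placeholder meaning "hash the SINIT AC module this socket ran", and the comparison against socket 0 stands in for whatever reference the root of trust actually uses.

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define DIGEST_LEN 32

/* Placeholder: produce the digital signature hash of the SINIT AC
 * module executed on the given processor socket. */
extern void sinit_digest(int socket, uint8_t out[DIGEST_LEN]);

/* Verify that every socket ran the identical SINIT AC module. */
static bool sinit_hashes_match(int num_sockets)
{
    uint8_t ref[DIGEST_LEN], cur[DIGEST_LEN];
    sinit_digest(0, ref);
    for (int s = 1; s < num_sockets; s++) {
        sinit_digest(s, cur);
        if (memcmp(ref, cur, DIGEST_LEN) != 0)
            return false;      /* mismatched SINIT module on socket s */
    }
    return true;               /* ref would then be recorded in the TPM */
}
```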
Once the CSI configuration has been validated, the SINIT module can start using the normal CSI protocols that depend on these configurable CSI structures. In addition to the intra-socket communication mechanism between processors, the secure launch process requires communication between the Initiating Logical Processor (ILP) and the chipset, to retrieve information such as the digital key that is used to validate the SINIT code. This communication mechanism also cannot rely on configurable CSI structures. Since the chipset component does not have the ability to execute programs, a separate communication mechanism is defined for communication between the initiating logical processor and the chipset. In order to reduce complexity, an assumption has been made that the initiating logical processor is directly connected to the chipset component that provides access to the primary TPM.

17.3 Link Initialization Parameters

At link initialization, information about the neighboring nodes is captured by each link controller and made available to the protocol engine and the core. For the purposes of initialization and system configuration, this information includes the node type, the node id(s) and the port number at the remote end. The secure launch process relies on the presence of these fields. It also requires additional fields to be present during link initialization, such as the LT compatibility flag and the secrets-in-system flag. Moreover, it requires a finer granularity in existing fields, such as the node type, via a capability vector to cover OEM coherent bridges and switches. For example, OEM bridges may be connected to the processor nodes and must therefore participate in the spanning tree construction in addition to their responsibilities as chipset components. The parameter exchange of adjacent packages must include a capability bit vector with bits like (I_am_IOH, I_have_TPM, I_have_primary_TPM, I_can_be_interior_node). This silicon capability and identity information is captured at the time of power-on reset and is usually recorded in the power-on configuration registers.

17.4 Interprocessor Communication: LT Link Layer Messages

The inter-processor communication mechanism is based on Link layer special packets that allow two neighbor nodes to communicate without relying on configurable CSI structures. Normally, processor CSI protocol engines can only exchange messages after the route tables have been appropriately set up. Injecting messages at the link was considered a better choice than adding complex non-configurable overrides to the address decoders and routers. Link layer messages can only be generated by trusted entities (the root of trust or authenticated code modules). Only the root of trust can create a spanning tree, and only authenticated code can use the spanning tree for barrier/summary operations. Link layer messages are considered a general-purpose "envelope", allowing the root of trust or authenticated code modules to be developed in parallel with the hardware design. Link controllers communicate with each other and with other components by exchanging link messages (1 flit per link message). A processor core can force a link controller to generate link messages to the other processor via writes to the CSI configuration registers associated with each link controller.
Next, the link controller can capture link message content and make it available to the processor core via read-only CSI configuration registers accessible through non-spoofable loads and stores. A non-spoofable event mechanism is implemented to interrupt the logical processors so that the root of trust can be informed of the arrival of new link messages. Only one receive register is provided per link controller. The root of trust and authenticated code modules must ensure that at most one LT Link layer message is in flight, on one link and in one direction, at a time.

[Figure 17-2. LT Link Layer Messages: cross-bar, multi-threaded core, LC, PE, and core interface, showing the non-spoofable access path.]

One opcode in the special message packet format is the LT Link layer message opcode. The 5-bit Type field is used to encode the various LT sub-opcodes, and the 32-bit Data field is used for the transfer of 32 bits of data. See the Link layer control message section for format details.

17.5 Processor-to-Chipset Communication: Protocol Layer Messages

LT protocol messages are used for the communication between the ILP and the chipset components during the secure launch process. In a CSI-based LT-capable system it is possible to have the LT address space distributed between various components of the system. The primary I/O hub component presents the emulation of a single system-wide image of all architectural LT chipset registers. This means that all LT configuration registers must be addressable from all the processors in the system. It also allows for the checking of all the configuration registers from a single System Initialization (SINIT) authenticated code module after the routing tables are tested. In smaller systems the routing tables can be built by the root of trust; this also means that simultaneous execution of SINIT AC modules per package is not required by a CSI-based LT platform. LT CSR write requests from software shall be implemented as atomic broadcasts to all sockets, issued as a source broadcast from the processor. These atomic broadcasts are synthesized by the processor by wrapping the write operations inside lock and unlock sequences (SpcLock / NcWr / NcWr / SpcUnlock). These LT register write operations do not complete until all nodes complete their write operations. Every CSI node has an LT CSR target and returns zeros for read operations to unused LT addresses; write operations to unused addresses are ignored. The root of trust itself, on the other hand, will run specially marked sequences that are wrapped inside locks and unlocks (e.g., SpcLock / NcLtWr / NcLtWr / SpcUnlock); these are not broadcast and can read and write chipset registers in any node. The primary I/O hub, which presents the coherent copy of all these LT addresses to the upper-layer software, accumulates all status bits from all of the distributed LT components. This also means that an LT CSR read operation (NcRd) goes to the master IOH from the processor, as this component is responsible for the coherent copy of all system-level LT registers. Debug tools such as the In-Target Probe (ITP) may invoke the root of trust to perform certain LT register operations; however, they need to obey strict security rules. A description of such rules governing the behavior of these tools is beyond the scope of this document and is left to the specification documents of such tools.
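The LT link-layer message format named in Section 17.4 above (a 5-bit Type field encoding the LT sub-opcode plus a 32-bit Data field) can be sketched as below. The packing and the example sub-opcode value are illustrative assumptions, not the normative flit layout.

```c
#include <stdint.h>

/* Hedged sketch of an LT link-layer message (one flit per message). */
typedef struct {
    unsigned type : 5;   /* 5-bit Type field: LT sub-opcode        */
    uint32_t data;       /* 32-bit Data field: message payload     */
} lt_link_msg;

/* Example sub-opcode value; purely hypothetical for illustration. */
enum { LT_SUBOP_BARRIER_SUMMARY = 0x05 };
```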
The location of the primary TPM is indicated by the parameter exchange record that is generated during the power-on sequence for a simple system. For a complex system, prior to secure launch, firmware has to set up the address decoders and routers so that every socket can address the master TPM and has access to the LT CSRs in the chipset component holding the master TPM. This setup is checked by the SINIT AC module during the secure launch operation. During the secure launch, the ILP is responsible for validating its own address decoders and route tables, to ensure that LT transactions are forwarded to a port that is connected to a chipset component. The chipset component is responsible for accepting LT transactions and allowing them to access the LT CSR space. The chipset component maps multiple TPM locality levels by using a different fixed address range for each locality. Currently, TPMs support five locality levels, three of which are used by LT: locality level four is used by the root of trust, level three is used by authenticated code, and level two is used by the secure monitor. The root of trust can control the opening or closing of the different locality levels by generating LT transactions, since the root of trust is the only agent that can generate such transactions. Different LT CSR registers are accessible in different ways at different locality levels. More details about the locality levels can be found in the Security SAS for CSI systems.

18.1 Introduction

This chapter provides requirements and guidelines for implementing the DFT and debug features needed to validate, debug, characterize, and test components implementing the Common System Interconnect (CSI) interface. Note that this chapter contains a lot of Intel-specific manufacturing- and test-related information that is for internal use only and may not apply to other third-party chipset vendors. Subsections in this chapter are separated according to each phase of the test and validation process.

18.2 Design For ATE-Based Testing and Debugging Through CSI

This section discusses the specific features that are required to enable HVM test and tester-based debug of the circuitry behind the CSI Analog Front End (AFE), or Physical layer. This includes both the logic within the CSI interface as well as the rest of the logic and arrays within the un-core and core(s) of the device. Testing and validating the CSI AFE is covered in section 1.4 of this document. Tester-based debug usually encompasses a few main areas:
• Speed-path debug
• Debug (pattern and silicon) of HVM tests that fail on the tester
• Failure analysis / fault isolation of all defective parts, including customer returns
• Debug of PSMI patterns ported from the system
HVM test includes wafer-level sort, burn-in, and post-assembly package-level screening.

18.2.1 Tester Assumptions

The following assumptions about the ATE are made for tester-based debug:
• Full-featured stored-response debug tester platform (similar to IMS Vanguard)
• The debug tester will have native access to CSI and will not use an active TIU (a.k.a. gasket)
• The debug tester platform can directly drive and sample pattern data onto and/or read data from all functional pins, including CSI, memory busses, interrupts and control signals.
• The debug tester platform must support driving PSMI patterns through all interfaces, including CSI, memory busses, interrupts and control signals
• The debug tester can drive and sample the JTAG (TAP) pins and all other special debug feature pins
• The debug tester remains the primary platform for driving the device during all physical and non-physical probing modes
• The majority of the debug tester fleet will NOT support CSI running at full frequency, so core logic and speed debug must be possible with less than full-speed CSI access. Current debug tester platform proposals support up to a 4 GHz data rate.
• Debug of the CSI itself will be done on a small subset of the debug tester fleet that is capable of running at full CSI frequency. This is most likely a DV tester from a different vendor.
• The primary vehicle for debugging HVM test content is the debug tester platform. However, since some patterns will be too large for the debug tester, the HVM test platform must also support HVM test debug.
• The expected frequency range for debug is 25-110% of Fmax.

The following assumptions are made for the HVM flow and ATE:
• Tester reuse is critically important to achieving the product test cost (i.e., upgrading the tester fleet to support native CSI would cost hundreds of millions of dollars).
• Interface speeds at sort (wafer test) will be limited to 200 MHz in order to be robust in this compromised electrical environment.
• Any native post-assembly ATE testing (class) in HVM will provide full-speed drive capability, but to manage ATE development costs and reduce dependence on ATE vendors, it will rely on on-die MISRs to validate the logical correctness of the outputs (in native mode).
• For high-volume processors, some amount of native functional testing through all interfaces is required to achieve our quality goals, and there must be a fast path to get this content from the system (PSMI) to the HVM environment to avoid limiting the ramp/product release.
• Server processors may be able to satisfy the functional test requirement via platform-based testing (PPV), as this is a standard part of their flow (not true for HVM processors), or will incorporate their requirements into the HVM processor solution.
• Chipset devices will either not require a traditional functional connection to the CSI ports (relying on functional loopback instead), or will be able to get their requirements supported within the context of the HVM processor solution.
• Burn-in will use the LCBI chamber and will only support an 8-pin interface (other signals may be able to be biased, but not actively driven) and will only provide a < 50 MHz input clock.

18.2.2 Basic Requirement: Determinism

For both HVM test and tester debug, the stimulus will be generated via a simulation model and presented to the DUT by traditional stored-response automatic test equipment (ATE). During HVM test it is adequate to determine the pass/fail status upon test completion, but for debug it is often more efficient if the failure can be observed as quickly as possible. Both of these activities share a common basic requirement: determinism. Stored-response testers are fundamentally dependent on determinism for error-free test execution. An RTL or other simulation creates a tester trace, which is formatted and replayed on the tester. If the DUT response does not match the expected values of the simulation trace in the correct reference clock cycle, the mismatch is considered an error.
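The stored-response model just described can be sketched as a simple per-cycle replay-and-compare loop. This is illustrative only: drive_inputs() and sample_outputs() are hypothetical stand-ins for the tester's function generators and logic analyzers described in the next paragraph.

```c
#include <stdbool.h>
#include <stdint.h>

extern void drive_inputs(uint64_t stimulus);   /* one value per input pin set  */
extern uint64_t sample_outputs(void);          /* captured output pin values   */

/* Replay a simulation-generated trace; any mismatch in the expected
 * reference-clock cycle is an error, and the first failing cycle is
 * what a debug engineer wants to see. */
static bool replay_trace(const uint64_t *stim, const uint64_t *expect,
                         long cycles, long *fail_cycle)
{
    for (long c = 0; c < cycles; c++) {
        drive_inputs(stim[c]);
        if (sample_outputs() != expect[c]) {
            *fail_cycle = c;
            return false;
        }
    }
    return true;
}
```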
Testers are simple playback machines. They do not interpret or understand any bus protocols, including CSI; they can be thought of simply as a combination of many function generators on the chip inputs (one per pin) and a separate set of logic analyzers capturing data on the output pins. The inputs driven in are totally separated from the outputs captured. Determinism means that we can run the same pattern multiple times, with resets between each run, and get clock-for-clock repeatability. The input stimulus must be completely defined by the clock cycle and not by any other event, so that the pattern can be applied and achieve the exact response without ever monitoring the response. This determinism must hold across all the interfaces of the DUT. Specifically, if the DUT supports both CSI and FBD, there must be some way to synchronize these interfaces with each other and with the internal logic of the DUT. Some specific requirements to enable determinism:

18.2.2.1 Deterministic Training Sequence Initiated Via External (Tester-Provided) Event
• A sideband signal is required that allows the tester to provide a synchronizing event. For existing front-side bus (FSB) implementations, the deassertion of reset is used as the synchronizing event. The PLL lock time can vary from part to part, but is bounded in time; by providing a bounded wait time for the PLL to lock and then deasserting xxReset, the tester forces the DUT back into synchronization with the test. In a similar way, a processor or chipset implementing CSI must provide a sideband synchronizing event that can be used after a bounded-time training sequence.
• The length of the training sequence must be bounded in time, and the resulting state of all logic must be deterministic with respect to a logic simulation.
• The training sequence must leave the product in an identical initial state despite any differences related to voltage or temperature variation.

18.2.2.2 Deterministic Re-Training Sequence
• The need for periodic retraining should be minimized with restricted environmental changes (< 15 degree C change in I/O Tj and < 10% variation in Vcc), and it must be possible to completely disable retraining. All retraining needs to be triggered by a deterministic event (i.e., a digital count or external event) and must be simulatable.

18.2.2.3 Deterministic Data Rate Transitions
• Any frequency or data rate transitions that are part of the CSI protocol must be deterministic (i.e., limited to fixed data rates, with all transitions predetermined during the RTL simulation).
• To be compatible with HVM ATE, it must be possible to perform almost all testing through CSI at a single bus ratio. Starting at a slow I/O data rate and then switching to a faster data rate can only be supported with tester-friendly ratios (1/2n), and only for a limited amount of test content (to fit within the limited test memory).

18.2.2.4 Must Be Able to Present Input Stimulus to the Core in a Repeatable and Controllable Fashion Across Multiple Devices, Over Vcc, Frequency and Temperature
Shmoo capability is the cornerstone of speed-path debug and design validation. It involves executing the same test sequence multiple times while varying the frequency, voltage, and temperature. The pass/fail regions are then plotted on a 2D or 3D graph. CSI interfaces must support deterministic frequency, voltage, and temperature shmoo.
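A hedged sketch of such a shmoo run is shown below; the requirements that follow elaborate on it. All function names, step sizes and ranges are illustrative placeholders, chosen only to echo the 25-110% frequency range stated earlier.

```c
#include <stdbool.h>

extern void set_operating_point(int mv, int freq_pct);  /* supply + clock  */
extern void init_and_train(void);                       /* full init/train */
extern bool run_pattern(void);                          /* deterministic   */

/* Sweep voltage and frequency, re-initializing and retraining at every
 * point, and record pass/fail for a 2-D shmoo plot. */
static void shmoo(bool pass[11][18])
{
    for (int v = 0; v < 11; v++) {            /* e.g., 900..1400 mV       */
        for (int f = 0; f < 18; f++) {        /* 25%..110% of nominal     */
            set_operating_point(900 + 50 * v, 25 + 5 * f);
            init_and_train();
            pass[v][f] = run_pattern();
        }
    }
}
```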
During the execution of a shmoo, the condition is changed and the test pattern is re-run; this includes an initialization and training. Requirements:
• The test must run deterministically at all voltage, temperature, and frequency points on the shmoo for a given configuration and/or clock-to-clock frequency ratio.
• It is required to align with the common clock boundaries, using the features/tools provided in the Physical layer chapter (Chapter 3).

18.2.3 Supporting the HVM Test Flow and Tester Fleet
• Must include a slow-speed structural tester interface to support sort and ST reuse at class.
— Slow-speed mode must support scan/dat/sbft/... and must include:
— Ability to specifically set the core-to-bus clock ratio in structural bus mode
— Non-structural port pin disable function (brain-dead mode to protect us from the problems we hit with external loops on Wmt; make sure X-loops don't hurt us)
— I/O frequency of 50 MHz to 200 MHz
• In ST mode, the DUT differential inputs must be capable of being driven both single-ended and differentially:
— The structural tester (ST) can only drive one side of the diff pair; it is okay if the other side of the diff pair needs to be biased (internal bias is preferred).
— The inputs must, however, also be able to be driven differentially in structural interface mode, to support class testing (via a gasket or an ATE with differential drivers).
• In ST mode, the DUT differential outputs must be observable both single-ended (connect only one side to the tester; the other side may be left floating or terminated on die) and differentially:
— The structural tester (ST) can only observe one side; this is at the I/O rate of 50-200 MHz.
• At any given bus ratio, the CSI must support a dynamic frequency range of 25% to 105% of full frequency:
— This is to support multiple speed bins with a single pattern. The test content is executed at a single bus ratio, and the core speeds are determined by the system clock frequency. The 105% requirement comes from the core frequency guard band (100% + 5%).
• CSI must support a static leakage current measurement mode (Isb):
— Must be able to disable all static current paths.
• To be compatible with a future test strategy of NOT strobing (checking) the outputs on a cycle-by-cycle basis (which enables test-data-volume reduction and significant tester equipment cost reduction), there need to be on-die compressors on all non-static outputs (i.e., those that convey logical information), so that the tester does not have to strobe outputs on a cycle-by-cycle basis in order to determine logical correctness of execution.
— This requires all outputs to be deterministic, or qualified at the input to the compressor, to make an X-free signature.
• ESD protection equivalent to P1262 high-volume products:
— i.e., CSI cannot be implemented using hyper-ESD-sensitive circuitry.
• The product must support fixed, tester-friendly frequency ratios on all interfaces. This means that if there are two interfaces, they should both be even multiples of the same reference clock. If one interface operates at a ratio of 1:7 and another at a ratio of 1:13, creating tester vectors can result in unmanageably long simulation and vector generation times.
• In native CSI operation, non-differential behavior will cause issues for ATE support and may require DFT for compatibility with ATE. All non-differential behavior must be identified to the DFx team, and the testability hooks must be agreed upon and documented.
• Must provide support for floating/non-contacted pins during test, as at sort and burn-in:
— It must be possible to leave un-contacted pins floating or at most statically terminated. Un-connected inputs to the DUT often have to be disabled so as not to generate nondeterministic data or excessive current draw.
— It must be possible to disable sensing on unconnected outputs and to provide a mechanism to deterministically bypass what was being sensed (i.e., detection on data pins).

18.2.4 Debug “Through” CSI – Debugging Processor or Chipset Via CSI Interface

In addition to the HVM tester requirements listed above, the following are requirements to enable debug through the CSI interface and to support existing speed-debug BKMs.

18.2.4.1 Frequency Requirements
Since most of the debug tester fleet will NOT have the capability of running the CSI at full frequency, there must be fractional CSI frequency control. At a minimum there must be a ½-frequency mode. The number of other fractional modes will be dependent on the debug tester POR for a given product.

18.2.4.2 Shmoo Support Requirement
In addition to providing fractional modes for running CSI, there must also exist "shmoo" capability. This is the ability to change the reference clock to the DUT and have the corresponding I/O and PLL frequencies change by a known amount. This allows a test to be run at one fraction, but with the frequency varied on each iteration (with a full cold reset between iterations), until the failure point is found (a.k.a. the Fmax). The range over which this testing occurs is between 25% and 110% of the nominal frequency. CSI needs to be capable of locking to the reference clock running at any frequency in between, deterministically.

18.2.4.3 Debug Pins / Ports Requirement
The CSI must not supersede or preclude the existence of other pins and ports required for debug. As an example, the JTAG TAP must not be integrated into the CSI, and it should not require the CSI to be functional in any way in order to operate. The same holds true for all other debug pins/ports. Since the number, types and usages of debug pins are product-specific, the information on them can be found in each product's architectural and/or electrical specs.

18.2.4.4 Synchronization and Looping Requirement for Probing
Debug probing modes sample data at a low frequency and build waveforms from many samples taken during different iterations of a looping test. To support this, CSI must be able to synchronize to the debug tester's external reference clock and be put into a pattern loop without needing retraining during the pattern. The pattern would contain an initial training sequence, but must not require insertion of any additional retraining during execution that was not in the original logical simulation. The HVM section on retraining above specifies the retraining determinism requirement in more detail.

18.2.5 Debug and Test of the Logic Associated with CSI

18.2.5.1 Debug Requirements
• All CSI logic registers, including counters, state machines and staging queues, must be readable via TAP-based instructions.
• Debug-accessible control registers should be made available to "defeature" or disable features of the CSI.
• Queue depths and counter values should be programmable.
• Key events, such as transmission or reception of a packet header, should be visible internally to the TAP, in order to trigger actions based on the events.
• All CSI debug features must be fully accessible without adding any additional CSI or debug pins.
• All CSI debug features must be fully accessible without requiring any additional packets or data fields within a packet.

18.2.5.2 HVM Test Requirements
• The following are required for HVM test of the logic used to implement CSI:
— Must be able to achieve 98% stuck-at fault coverage of the CSI logic on the structural tester using ATPG (i.e., it cannot rely on native functional access); this may require a "full" scan implementation.
— In order to achieve the infant mortality requirements, it must be possible to toggle the logic associated with CSI in the burn-in environment and under burn-in conditions (elevated Vcc and, with a < 50 MHz input clock, only 8 active signals for the entire device). The CSI interface does NOT have to be fully functional in this condition, as the toggle coverage can also be covered by a full scan implementation. To avoid unbalanced aging, all circuits that are required to age uniformly must either be toggled during BI or be designed with an initial imbalance that will become balanced post-BI. Note: this may require that the AFE be able to be toggled.

18.2.6 Desktop Processor Specific Requirements
The following are required in order to meet the test cost and throughput needs of HVM desktop products:
• Pseudo link agent support (PLA) to minimize the amount of traditional functional testing required
— This is a processor-specific requirement, not generic to CSI (chipsets may not need it)
• All interfaces must be able to run at tester-friendly ratios all at once (CSI, memory bus, TAP, bus clock)
— Most important for ATE support; it can only talk at friendly ratios across ports

18.2.7 Debug of HVM Patterns
To further facilitate the debug of HVM patterns, the key thing is to increase the observation capability of HVM tests. Since all of the HVM content will be relying on on-die signatures, it would be useful to have a method of making the signature from the on-die compressors observable on the outputs of the DUT at a reduced data rate. For example, a single parity bit, or a full 20 bits (full CSI width) of a selected on-die signature, could be periodically presented on the outputs in ST mode. This would enable faster recognition of internal errors. The same mechanism could be used to debug signatures accumulated during functional testing. Also avoid relying on micro-breakpoints to generate test content, as they are often needed by the debug tester tools.

18.2.8 Summary
• Determinism is the key requirement.
• Synchronization to a debug tester is required for looping capability.
• All other debug signals/ports must be accessible without using the CSI, as specified by each product.
• No additional data is required in the CSI packets.

18.3 Component and System DV/EV/AnV

The availability, as well as the measurement accuracy, of automatic test, measurement, and debug equipment is clearly lagging behind Intel CSI roadmaps; looking at the test learnings and trends of the past, it appears that this chasm will continue to deepen. For example, at the moment (years 2003-2004) all CSI testers are paper testers on foils; no doubt in a couple of years these will be real machines, but most probably at mega-dollar costs, requiring continuous upgrades with every new spin of CSI.
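Tying back to the on-die signature approach of Section 18.2.7 above, one classical compressor is a multiple-input signature register (MISR), sketched below. The width, feedback polynomial and the way output bits are folded in are illustrative assumptions, not a CSI-defined structure; the final signature would be compared against a simulation-predicted golden value.

```c
#include <stdint.h>

/* One MISR step per clock: advance a Galois LFSR and XOR in this
 * cycle's captured output bits. */
static uint32_t misr_step(uint32_t sig, uint32_t outputs)
{
    uint32_t msb = sig & 0x80000000u;
    sig <<= 1;
    if (msb)
        sig ^= 0x04C11DB7u;   /* CRC-32 polynomial, chosen as an example */
    return sig ^ outputs;     /* fold in the sampled output vector       */
}

/* Accumulate a signature over an entire test run. */
static uint32_t misr_run(const uint32_t *out_trace, long cycles)
{
    uint32_t sig = 0xFFFFFFFFu;   /* illustrative seed */
    for (long c = 0; c < cycles; c++)
        sig = misr_step(sig, out_trace[c]);
    return sig;
}
```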
Moreover, and logically so, vendors can guarantee the accuracy only of their machine and not of the overall test setup in varying conditions; this brings ample measurement inaccuracies to the table for Design Validation (DV), Electrical Validation (EV) and Analog Validation (AnV) engineers to tackle. Naturally, with increasing data rates the test setups behave electrically quite differently from actual user system environments, and this adds another significant portion to the inaccuracies in DV/EV/AnV data. It is therefore imperative that adequate Built-in Design Validation (BiDV) features and on-die instrumentation hooks are architected on CSI platforms, components and individual design blocks, so that the accuracy of DV/EV/AnV data can be improved and measurement setups can be designed with minimal dependencies on high-end external test equipment/environments. On the other hand, following Moore's Law, the fab processes are continuously shrinking and the complexities of designs are growing almost exponentially. This means manufacturing margins on CSI products will continue to shrink, and fault models will keep growing into unknown territories with every new spin of CSI. To screen defects to Intel quality standards, DV-like high-quality testing of crucial specs will be required in HVM. Since it will not be possible to deploy high-end external test equipment in all factories, due to equipment availability, accuracy and cost issues, the built-in-DV features and on-die instrumentation hooks will not only reduce the risk but also enable scalable, DV-like-quality test solutions in HVM for components and systems alike. This subsection describes the DFT requirements from the components and systems DV/EV/AnV perspectives.

18.3.1 CSI Component and System DV/EV/AnV Requirements

The CSI Electrical Specifications must provide a measure of quality for each of the components that go into the making of a typical CSI link. Individual as well as collective specifications are required, so that design boundaries can be established and the quality of manufactured components and systems can be ascertained. Thus, specifications are required to be stated at the silicon pads as well as at the component pins, at both Transmitter and Receiver ends. The availability of test and measurement equipment and the practical usage models must be comprehended while defining the electrical specifications. For example, a "golden test channel", which is easily assumed in simulations, is almost impossible to replicate in practice across hundreds of test sites; therefore, specifying with respect to a golden channel could be a post-silicon practical issue. Similarly, there is legacy terminology used in the practical test and measurement world that should either be followed or crisply explained, so that the measurement software algorithms can give correct answers. Another example is the practically diminishing availability of real-time sampling instruments: the CSI timing specifications must therefore allow for testing without needing a real-time instrument. The CSI testability architecture must support the selective lane characterization required in multiple design validation scenarios (some examples for clarification):
• For electrical characterization, an oscilloscope or tester channel is connected to any one Tx at a time, while all other Transmitters and Receivers on the device under test are either active or quiescent, and not connected to any other instrument or device.
• For differential spec characterization, one data and one clock Transmitter are observed simultaneously on a pair of oscilloscope or tester channels, while all other Transmitters and Receivers on the device under test are either active or quiescent, and not connected to any other instrument or device.
• For eye margining or bit-error-rate-type measurements on any one Tx-Rx pair (products can pick pairs of their choice) on the CSI device under test, an external traffic generator/tester is used to stimulate and observe data in a Digital Far-End Loopback mode.
• Others.

The CSI testability architecture must enable built-in design and electrical validation: because high-speed, high-accuracy, full-width test, measurement and debug equipment might not even be available from external vendors, due to technology limitations in the CSI enabling time frame, the CSI testability architecture must provide adequate on-die instruments to enable test-platform-independent CSI design validation and characterization. Also, accurate bus margining at the system level is a must for CSI. Thus, in addition to normal functionality, the CSI Transmitters/Receivers are required to also behave like stand-alone external reference testers or instruments, with accuracy and calibration features adequate for CSI across process, voltage, temperature, noise, and crosstalk conditions. DV/EV/AnV and system margining usage models such as IBIST, MARS, LB, built-in-DV, etc., can thus be developed using a common set of testability hooks.
• Each CSI Transmitter should be able to behave as a programmable, controllable traffic generator that can present calibrated signal bits of varying width, height, and equalization to another CSI component under test. The Tx impedance and termination values are also programmable.
• Each CSI Receiver is also required to behave like an external reference tester or instrument of adequate accuracy for CSI. The Receiver should be able to margin and measure the incoming bits in the time and voltage axes, and should be able to compare the incoming data against a reference and log the error count. Each CSI Receiver should also be able to send the received data back to the external device via its Transmitter (Digital Far-End LB mode), to enable bit-error-rate-type tests and augment debug/margining.

The critical analog and digital signals in the Transmitters and Receivers should be observable outside, through some mechanism (details covered elsewhere in this chapter), to enable individual design block characterization, calibration, and debug. Analog legged devices must be testable to adequate DPM levels.

18.3.2 Tx Characterization

In CSI, Tx DFT plays two roles:
1. It enables Tx design validation, electrical characterization, specification margining and debug using external and/or on-die instruments;
2. It enables the Tx to behave as an on-die instrument that can generate electrically controlled, self-calibrated stimulus for CSI Rx, link, and interconnect component design validation, electrical characterization, specification margining and debug.

DFT features required in the CSI Tx:
• Ability to read/write/lock Tx config registers (comps, Ph, EQ, etc.) for a controlled electrical behavior of the Tx (self-calibrated to band gap and PLL accuracy).
• Ability to generate self-calibrated Tx eyes small enough to margin the minimum Rx specs, and DC or AC squelch levels (if any, per CSI specs).
• Ability to support Digital Far-End Loopback (DFE-LB), Digital Near-End Loopback (DNE-LB), and External Loopback (E-LB).
• Controllability for seamless switching between the normal functional, training, and loopback modes.
• A special programmable pattern generator of adequate bit depth {to be decided}, with automatic or manually forced selectable kickoff and on-the-fly programmability of the patterns. Ability to detect the Rx (external or on-die) and, if selected, kick off a default pattern generator.
• Observability of critical analog signals, such as the PLL, band gap, and PISO, etc., for debug, validation, and calibration.

18.3.3 Rx Characterization

In CSI, Rx DFT also plays two roles:
1. It enables Rx design validation, electrical characterization, specification margining and debug using external and/or on-die instruments;
2. It enables the Rx to behave as an on-die instrument that can measure, margin and observe the received signals in a controlled, self-calibrated manner for CSI Tx, link, and interconnect component design validation, electrical characterization, specification margining and debug.

DFT features required in the CSI Rx:
• Ability to behave as a stand-alone sampler capable of measuring the received signal in the timing and voltage axes over a large number of incoming consecutive bits.
• Ability to read/write/lock Rx config registers (PI, DLL, latency/de-skew, comps, etc.) and self-calibrate the VOC control bits.
• Ability to support Digital Far-End Loopback (DFE-LB), Digital Near-End Loopback (DNE-LB), and External Loopback (E-LB).
• Controllability for seamless switching between the normal functional, training, and loopback modes.
• A special programmable pattern comparator, complementary to the pattern generator in the Tx, of adequate bit depth {to be decided}, with automatic or manually forced selectable kickoff and on-the-fly observability of per-pin pass/fail results.
• Observability of critical analog signals, such as DLL phases and VOC sampled data, etc., for debug, validation, and calibration.

18.3.4 Interconnect Characterization

Due to the challenges of probing the CSI interface, much of the validation effort relies on the inference of performance from margin testing and design validation work, to correlate results back to the kit team predictions. One of the major efforts will be the characterization of the actual interconnect, from silicon pad to silicon pad. A typical CSI channel comprises three types of interconnect components, namely package, socket and PCB. Though passive, the electrical performance of these components is of paramount importance to the overall performance of a CSI link. Thus:
• Frequency-dependent characterization of the package and socket for insertion loss and return loss, along with coupling to adjacent structures. An s-matrix shall be measured for various package skews and a typical socket. The impedance and insertion loss of each individual interconnect component must be bound by the CSI Electrical Specifications and by the topology steered by the CSI platform design guidelines.
• Frequency-dependent characterization of the PCB interconnects using a VNA to collect s-parameter data on a per-lane basis. A means to characterize the impact of adjacent conductor coupling must be included. Resonance points shall be determined and shall be specifically targeted by the link characterization activities.
• Signal-quality scope traces shall be captured using a specially designed package that permits scope probing at the die bump point under the same (or nearly the same) signal termination and loading conditions as the real silicon. These captures need to be correlated to the performance predicted by simulation.
• CSI Tx and Rx DFT must provide enough on-die instrumentation to enable the development of characterization techniques and usage models that adequately cover the individual interconnect component performance. Sub-sections 1.1.1.1 and 1.1.1.2 already cover these DFT features.

18.3.5 Link Characterization

Characterizing the performance of the link is done mainly through techniques of margining to failure. Many "knobs" shall be provided in the silicon design to enable margining the link to failure. CSI link electrical characterization, analog validation, signal integrity inspection, and CSI link margining across process, voltage, temperature, frequency, and noise conditions require the following testability features:
• Both absolute and relative control of the following circuit functions: Tx current amplitude and control of all equalization tap values; Tx source impedance; Rx termination impedance; Rx Phase Interpolator (PI) offset; Rx differential Voltage Offset Control (VOC). This requirement implies access to these controls either through a scan chain or through memory-mapped registers.
• A jitter injection block that injects jitter into all major blocks, such as the PLL, Tx, Rx and compensation circuitry, simultaneously and in any combination, and makes the amount of jitter adjustable for the worst impact. Every major block is exposed to jitter, so jitter must be injectable in all blocks simultaneously.
• Error detection circuitry to report violations of bus analog characteristics or malfunctioning of pertinent state machines that control the analog behavior of driver and/or receiver circuitry. A simple example would be the detection of a compensation circuit reaching its maximum or minimum value. This allows prompt discovery of fundamental problems with the circuits as they relate to malfunctioning of the state machine, without putting massive effort into debug.
• A means to determine the bit error rate of the link. The bit error rate, speed, and bandwidth of the CSI link must be bound in the CSI specifications.
• Ensure the ability to read/write/lock all CSI registers through the platform operating system.
• To support special cases where CSI registers cannot be accessed through the operating system, due to some bug or otherwise, CSI registers must also be accessible through special test access mechanisms (mostly product-specific, such as JTAG/TAP, SCUM/TestinB/SSI/XDP, NOA, etc.), so that software for characterization and debug techniques such as built-in-DV and IBIST can be developed independent of the platform operating system.

18.3.6 CSI Link Debug for DV/EV/AnV

Conventional debug and pico-probing techniques might not be directly applicable, especially in the high-speed, analog-sensitive blocks of CSI; direct probing on CSI platforms will also be a challenge. Though it will be product prerogative, the following DFT is highly recommended for debug during DV/EV/AnV of CSI components, links and platforms:
1. Analog observation ports (for critical analog-nature signals such as PLLs, phase interpolators, band gaps, etc.); usually dedicated pins will be required. These ports will also be used to calibrate the on-die instruments built around the CSI design blocks.
2. Mirror ports (for critical signals digital in nature); these ports can be used for logic debug and visibility into the devices.
3. Controllability and observability of all CSI DFx registers, both through dedicated test ports and through the system BIOS, are recommended. This requirement is already being satisfied in multiple sections for HVM and DV/EV/AnV; however, product uArch checklists must include this item to ensure it is actually happening.

18.4 CSI Phy Layer DFx Tools

18.4.1 Introduction

"Common System Interconnect" (CSI) is a high-speed, differential, DC-coupled, 20-bit-wide bus utilizing ground-referenced differential signaling. Each bit uses an independent differential Transmitter and Receiver pair, enabling 2-byte-wide unidirectional signaling. A simple representative diagram showing a transmitter and receiver used in a CSI link is shown in Figure 18-1.

[Figure 18-1. A DC-coupled, differential, ground-referenced Transmitter and Receiver pair in a CSI link, showing termination (Rterm), clock phases (Phi1/Phi1#), VOC offset control, the unit interval (UI, in pSec), Vcm* = Voh/2, Vp-p swing, and the transmitted (ideal) vs. received eye, which is reduced in both time and voltage; eye width < 1 UI.]

This document describes the silicon self-test hooks built into the electrical sub-blocks of the CSI PHY layer. These hooks will be built around the loopback scheme described in the CSI electrical spec, with pattern generator design and fast entry, exit and reporting for HVM efficiency. Any BIST-type function can be built on top of these features, in silicon, software and test programs. Other than as examples of how a particular self-test hook may be used, a detailed description or specification of BIST features is beyond the scope of this document.

18.4.2 Definitions

Some standard terms used throughout this document are defined below:
• Lane: A pair of Transmitter and Receiver representing 1 bit of the bus.
• Link: A collection of 20 lanes representing the full-width bus.
• UI: Unit interval, representing the nominal bit time.
• Vcm: Common mode voltage = Voh/2.
• Vp-p: Peak-to-peak voltage (or swing).
• Lane Registers: A set of registers used to read/write configuration or status information for each lane.
• Link Registers: A set of registers used to read/write configuration or status information for each link.
• Training Registers: A set of registers used for training, loop-back and control of the link.
• Near-end agent: The local agent (processor/chipset/any device) being described.
• Far-end agent: The remote agent (processor/chipset/any device) at the other end of the link.
• DF*: Common term for DFT, DFV, DFD, DOE, etc.

18.4.3 Reset Sequence

Please refer to the Physical Layer and Reset/Init chapters.

18.4.4 CSI Loopback

CSI loopback is envisioned to be the key method of enabling HVM, AV and SV testing and debug mode for all products containing the CSI Physical layer. Loopback helps to isolate exact failing conditions by looping back deterministic data patterns and looking for pattern mismatches. Loopback also gains favor for product validation and compliance because of its relatively simple concept and minimal implementation overhead. This section describes the standardized set of CSI Physical layer hooks and registers which enable loopback. This is expected to be common for all products using the CSI PHY layer.
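As a minimal sketch of these hooks (names and enum values are illustrative; the normative state machine lives in the Physical layer chapter), loopback appears as a distinct link state, entered under Link layer control, with each end playing a master or slave role as described in the subsections below:

```c
/* Illustrative link states; only LINK_LOOPBACK is specific to this
 * section, and the real encoding is defined in Chapter 3. */
enum link_state {
    LINK_DETECT,      /* part of the detect-to-L0 path           */
    LINK_POLLING,     /* loopback may be entered from this state */
    LINK_L0,          /* normal operational state                */
    LINK_LOOPBACK,    /* separate state in the link state diagram */
};

/* Master generates patterns and checks results; slave echoes them. */
enum loopback_role { LB_MASTER, LB_SLAVE };
```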
Standardized test applications are expected to make use of these Physical layer hooks and register definitions to enable higher-level BIST (IBIST, IOBIST, MARS, etc.) type software and capabilities. This method would enable cross-platform and cross-product use of standardized test applications for CSI. The loopback function is integrated into the CSI Physical layer as a separate "state" in the link state diagram, as described in Chapter 3. In normal operation, the link goes through a "detect-to-L0" transition after assertion of the "power good" signal from a higher layer. The "L0" state is defined to be the normal operational state of the link. Entry into the loopback state is controlled by the Link layer setting or clearing the loopback control bit in the "TS-x" training patterns. More details of the individual states, their function and the TS-x sequences can be found in the CSI Physical layer specification. Loopback patterns are stored in a 40-bit-deep, single pattern generator register for all 20 lanes of the link. These patterns may be a hard-wired sequence of data bits, or can be changed and set by any external means through a TAP or microcode. In order to study the effects of noise, ISI and adjacent-channel crosstalk, this pattern has the ability of being buffered or inverted before being sent out on a per-lane basis. Please refer to the CSI Physical layer specification for up-to-the-second information on the Physical layer state machine.

18.4.5 Loopback Modes

Loopback can be done in different ways and at different points in the Transmitter and Receiver paths in the PHY layer. The main motivation for the modes chosen for CSI was the choice of physical floor plan for the transmitter and receiver circuits. Although the actual details of such a floor plan are beyond the scope of this document, a brief mention of the reasons is made below to understand the thought process. The physical placement of the analog circuits in silicon dictates the loopback modes that can be implemented in silicon. Isolating the transmitters and receivers into separate islands (called a segregated transceiver floor plan) in silicon halves the clock routing for the circuits and therefore reduces the associated clock jitter. Clock skew is important between the multiple phase clocks coming from the DLL to the individual interpolators on each lane; this number is also improved by reducing the clock-routing distance between the DLL and the furthest lane, a consequence of a segregated transceiver floor plan. Such a floor plan may also lend itself to minimal switching noise between the transmitters and receivers, and reduce the clock error associated with within-die variations. These phenomena are especially exaggerated at data rates above 5 Gb/s, as these errors become a significant percentage of the total UI. Another advantage of a segregated floor plan is from a board-routing perspective: bending for a segregated link cuts the radius in half and therefore the skew between the clock and data paths, which reduces the stringent matching requirements imposed on the motherboards.
[Figure 18-2. Segregated vs. integrated transceiver floor plans in silicon: conceptual diagrams (receiver-side clock routing shown) comparing the total routing distance from the forwarded clock/DLL and comp/bias blocks to the TX/RX lanes under the two floor plans.]

A segregated floor plan, however, eliminates any easy way to do any kind of within-die loopback of the analog signal coming from the transmitter output by routing it to the receiver input; hence such loopback is not specified in this specification. Any within-die analog signal loopback also precludes the use of a pass-gate physical connection between the transmitter outputs and receiver inputs, which would have a negative impact on total pad capacitance. Based on the above discussion, the CSI Physical layer supports three modes of loopback, described below and shown in Figure 18-3:
• Digital Far-End Loopback (DFE-LB): In this mode, data received on the receiver of the near-end agent is synchronized, de-skewed and re-timed before being sent back to the Transmitter of the same near-end agent. This loopback mode will be the most widely used method of loopback for most of the test scenarios. Unless explicitly mentioned, all the information described in the sections hereafter refers to this loopback mode.
• Digital Near-End Loopback (DNE-LB): In this mode, data on the transmit path from the Link layer is looped back to the receive path in the Link layer. This happens without any interaction with any of the analog circuits, including the Transmitter and Receiver front ends.
• External Loopback (E-LB): In this mode, the analog voltage signal from the Transmitter of the near-end agent is provided a physical path on the TIU and connected back to a Receiver on the near-end agent.

[Figure 18-3. Loopback modes in CSI: digital near-end loopback (Link layer to Link layer), digital far-end loopback (after sync and SIPO de-skew), and external loopback (external on the TIU), with a 2:1 de-multiplexed receiver.]

18.4.6 Local vs. Remote Loopback

CSI supports a master-slave loopback methodology: at power-up, the Link layer configures the near-end and far-end agents as either a master or a slave. All loopback tests are performed with the master-configured agent supporting both pattern generation and results comparison and error detection. The loopback mechanism as implemented in the link training state machine does not differentiate between remote loopback and local loopback, so the training protocols and loopback handshaking are independent of the loopback type. Remote loopback is a loopback path between different links on different devices: in this configuration, one device functions as a loopback master and the other as a loopback slave, as described in the documentation. Local loopback (generally a combination of E-LB and DFE-LB) is defined as loopback between the transmitters and receivers of the same agent, and can further be sub-divided into inter-link and intra-link loopback for the agent. In inter-link loopback for the agent, the loopback happens between the transmitters and receivers of two separate links in the agent.
In this case we program one link as the master and the other link as the slave. For local intra-link loopback there is no slave agent: we program the device in master mode, and the transmitters and receivers are looped back to each other. The overrides on this device do not happen through the loopback protocol, but through an external access mechanism such as a TAP port.

[Figure 18-4. Local vs. remote loopback in CSI.]

18.4.7 Loopback Test Sequence

The high-level sequence of events for any loopback test is described below:
1. The CSI link is trained in the normal state.
2. All overrides for loopback are updated by the Link layer in the loopback control registers.
3. The Link layer gives the command for the loopback test. Loopback is initiated: the Link layer relinquishes control of the bus, and the Physical layer assumes control of the link.
4. The Physical layer starts loopback ("Loopback Entry").
5. The Physical layer stops loopback after all loopback status registers are updated and the test is complete.
6. The Physical layer enters "Loopback Exit".
7. The loopback status registers are made available to the Link layer for further processing.
8. All settings are restored to nominal values.
9. The CSI link goes to the "polling" state.

18.4.8 Loopback Entry

Loopback can be entered, as soon as any one CSI lane is active, from the polling state by command from the Link layer. The Link layer sets up the appropriate loopback count, error detection and overrides in the loopback control registers and the training patterns, and the Physical layer CSI state machine is moved to the polling state. The Link layer can also initialize the appropriate loopback registers and training patterns during the power-up sequence, before the CSI state machine reaches the polling state; in this way, the Physical layer moves through the training sequence and then directly to loopback. The loopback mode functions on the concept of a master and a slave device. Implicit handshaking, as described below, between the master and slave agents happens through the TS3 and TS5 sequences, before the link is in the loopback state. The diagram below details the TS3/TS5 handshaking that occurs during training to trigger the loopback function. TS5A+Payload is optional; loopback should run without the override mechanism.

[Figure 18-5. Loopback entry handshake timing: the master exits polling and sends TS5 to initiate loopback mode; after the flight time from master to slave, the slave recognizes the master's TS5, interrupts its transfer of TS3, and loops its transmitter back to its receiver, echoing the master's TS5; the master, having received the echoed TS5, sends TS5 with the ACK bit set, followed by the loopback data payload (TS5a); after receiving the payload the slave may implement all receiver overrides, and after re-transmitting the payload it may implement all transmitter overrides; the master implements all local transmitter overrides after sending the payload.]
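A hedged sketch of the master-side entry sequence just diagrammed is given below; the states, signals and functions are illustrative placeholders, not the CSI-defined training state machine.

```c
#include <stdbool.h>

/* Illustrative master-side loopback entry states. */
enum lb_master_state {
    LBM_SEND_TS5,          /* exit polling, transmit TS5               */
    LBM_WAIT_ECHO,         /* wait for looped-back TS5 from the slave  */
    LBM_SEND_ACK_PAYLOAD,  /* TS5 with ACK set, then loopback payload  */
    LBM_CHECKING,          /* compare looped-back data, update status  */
};

static enum lb_master_state
lb_master_step(enum lb_master_state s, bool saw_ts5_echo, bool payload_sent)
{
    switch (s) {
    case LBM_SEND_TS5:
        return LBM_WAIT_ECHO;
    case LBM_WAIT_ECHO:    /* echoed TS5 means the slave is in LB mode */
        return saw_ts5_echo ? LBM_SEND_ACK_PAYLOAD : LBM_WAIT_ECHO;
    case LBM_SEND_ACK_PAYLOAD:
        return payload_sent ? LBM_CHECKING : LBM_SEND_ACK_PAYLOAD;
    default:
        return LBM_CHECKING;
    }
}
```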
The sequence of events for Loopback entry is also explained by means of a "flow diagram", described below. Coming out of polling, the loopback master sends TS5 but the loopback slave continues to transmit TS3. Once the slave receives TS5, it immediately stops the current TS3 pattern it is transmitting and echoes back TS5. The master looks at the looped-back TS5 as an indication that the slave entered loopback mode, and sends TS5 with the ACK bit set. This TS5 training sequence is followed immediately by a test pattern. When the slave receives TS5 with the ACK bit set, it varies its RX parameters based on the data fields in TS5 and uses these new values to echo anything following TS5. It is important that the slave switches to these new parameters only after echoing back TS5, so that the master is guaranteed to receive TS5 correctly.

The loopback mechanism assumes a pre-determined transceiver pairing at both master and slave ports. Loopback on asymmetric links requires muxing/demuxing at either end to match transceiver pairs, and a detailed discussion of the loopback mechanism for asymmetric links is deferred to an implementation spec. The Master contains the necessary logic to look for pattern mismatches and appropriately updates the Loopback status register with the results. The individual Loopback control register bits and Loopback status register bits are detailed in the sections below.

Figure 18-6. Loopback Entry Flow Diagram (flow: the master TX sends the TS5 pattern; the slave RX receives, recognizes and echoes the TS5 pattern; the master then sends TS5 with ACK followed by the loop-back overrides for the slave; the slave recognizes the ACK, reads the overrides, and loops back the same header followed by the overrides it received; once the master RX receives them, loop-back entry handshaking is complete and the Link Layer updates the Loop Back Control Register and initiates the Loop Back command. Numbered callouts in the figure mark the earliest possible times at which the master and the slave can each implement their receiver and transmitter over-rides; the master's over-rides will in general differ from the slave's.)

18.4.9 Loopback Control Register

The Link layer specifies the overrides and other controls to the PHY layer through the control registers for loopback. The control register and its individual bits are described in Chapter 3; please refer to Chapter 3 for details.

Figure 18-7. Slave Agent – Receiver Input Common Mode Override
Figure 18-8. Master Agent – Receiver Strobe Override
Figure 18-9. Slave Agent – Receiver Strobe Override
Figure 18-10. Master Agent – Transmitter Driver Current Override
Figure 18-11. Slave Agent – Transmitter Driver Current Override

7. Continuous Override: 0 = implement overrides (margin offsets) only during Loopback; 1 = implement overrides continuously. The continuous setting is useful for having independent control of the override registers, making it possible to apply the offsets without having to be in a loopback mode.

8. Pattern Buffer: This 40-bit register is used to store the patterns used during Loopback. The register can be updated from the Link layer, or from any external debug port such as a TAP controller. Each lane receives 2 sets of data from the pattern buffer, one buffered and the other inverted. Individual lanes multiplex between these 2 sets depending on the Pattern Invert vector described below.

9. Pattern Invert: This is a 20-bit vector field, used per lane, to choose between either using the pattern buffer data or inverting it. This ability helps in looking for crosstalk-related issues in Loopback. A simple diagram showing the pattern buffer architecture is shown below.

Figure 18-12. A Basic and Minimal Pattern Buffer Architecture

The above definitions for the Loopback control register are meant to meet minimal requirements to enable information gathering from the CSI bus with minimum overhead. Other, more complex pattern generation and comparison schemes are to be sourced from a software program in the cache of the high performance processor. Using cache-based software to control pattern generation and checking will be the workhorse of any IBIST infrastructure.

18.4.10 Loopback Status Register

The loopback status register is updated by the PHY Layer with all the results of the Loopback test. See Chapter 3 for details.

18.4.11 Loopback Exit

Loopback is exited in one of two ways. If loopback was entered with a finite loop count, exit is seamless: when the requisite number of loopback patterns have been sent and received, the transmitter and receiver both drop out of loopback and restore the original settings. If the loopback count is infinite, the exit procedure is more complicated. Since the outbound link can be stressed, it is possible that the slave receiver is receiving nothing but garbage; we cannot depend on any data handshaking to end the loopback state. Instead, handshaking is accomplished by dropping and restarting the forwarded clocks, i.e., an Inband Reset (refer to the Physical Layer Chapter). The timing diagram associated with Loopback exit is shown below.

Figure 18-13. Loopback Exit Timing Diagram (master and slave clocks and data during exit; annotations summarized below.)

The slave detects the dropped clock and drops its own clock and data; this must not occur until all data sent before the dropped clock has been re-transmitted. The master then detects the dropped slave clock and restarts its own forwarded clock.
From the time that the forwarded clock is disabled until the slave recognizes the disabled clock and disables its transmitters, the slave "loop back data" is garbage. In the diagram, the master drops the outbound forwarded clock and idles the data; the slave detects the restarted master clock and restarts its own forwarded clock; once the master DLL is locked to the slave forwarded clock and the slave DLL is locked to the master forwarded clock, training patterns (TS1) are transmitted again.

In other words, Loopback is "exited" by the Master agent by dropping the forwarded clock. The slave senses the loss of the clock and drops its own forwarded clock. When the master senses that the slave has dropped its forwarded clock, it restarts its own clock. The slave senses the restoration of the master forwarded clock and also restarts. In this way, full handshaking is accomplished, and data transmission is restarted once the DLLs are locked.

The sequence of events described for Loopback exit can also be described using the flow diagram shown below.

Figure 18-14. Loopback Exit Flow Diagram

18.4.12 CSI Determinism

18.4.12.1 Physical layer requirements

CSI operation requires that each CSI agent synthesize the internal I/O clocks from a single clock source. Each CSI agent requires a synchronizing signal from which it will reference all determinism state and counts; an example of this is DPG's use of the de-assertion edge of reset as sampled by busclk. Each CSI agent will fix the UI (Unit Interval) latency of the interface at initialization and hold that latency, even during re-training. For PSMI collection and replay, a mechanism such as a sync counter is provided in the Physical layer for quantifying the latency of a CSI interface. The use of this mechanism will introduce no side band signals, and it will be available through TAP or processor control. The mechanism is initialized at the synchronizing event (reset in the above example) and runs freely from that point. Information from this mechanism (counter values, pointers), combined with similar information from the far end port, will be sufficient to calculate the overall latency of each direction of the CSI port.

18.4.12.1.1 Description

The key to understanding CSI determinism is to view the I/O clock domains on the two separate CSI agents as cycle-to-cycle synchronous to each other, as viewed from the common busclk. Each agent synthesizes an I/O clock in reference to, and synchronous to, the common bus clock, so these two clocks can be viewed as synchronous to each other. Since the sync counters in the different agents were initialized at the same reference event, the counters can be viewed as being in cycle-by-cycle synchronization with each other.
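Because both ends initialize their sync counters at the same reference event, a latency figure can be computed directly from counter captures at the two ends, as the next section describes in detail. The following sketch shows the arithmetic; the counter width and all names are assumptions for illustration, not CSI definitions.

/* Illustrative latency calculation from synchronized sync counters. */
#include <stdint.h>

#define SYNC_COUNTER_BITS 16u
#define SYNC_COUNTER_MASK ((1u << SYNC_COUNTER_BITS) - 1u)

/* tx_capture: local count when a packet header was transmitted.
 * rx_capture: far-end count when the same header was received.
 * Modulo arithmetic tolerates counter wrap between the two captures. */
static uint32_t link_latency_cycles(uint32_t tx_capture, uint32_t rx_capture)
{
    return (rx_capture - tx_capture) & SYNC_COUNTER_MASK;
}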
Figure 18-15. Example of Clock Synthesis (a local PLL takes the system clock (busclk) as input and synthesizes the I/O clock and core clock on each agent.)

The CSI specification requires each agent to synthesize its I/O and core clocks from a common system clock. These clocks can then be treated as a common "virtual clock", given the proper initialization and synchronization. The diagram above shows an example of a bus clock, I/O clock and core clock synthesized from the same reference clock. By choosing a common starting point, the synthesized clocks across agents can be viewed as synchronous to each other.

In order to maintain cycle-by-cycle determinism between the forwarded clock and the I/O clock domains, enough slack must be left between these domains to allow retraining without changing the UI at which data is brought into the I/O clock domain. One method to do this is to halve the frequency of the incoming data by interleaving, and to sample with the other clock, so that the resulting data lands in the middle of the cycle. The two clock domains can then drift +/- one UI and still maintain cycle-by-cycle determinism. The amount and method of actually providing the slack between the clock domains is implementation specific.

18.4.12.2 Measuring latency

One way to quantify the latency of an interface is to have the transmitting agent record the counter value at which a packet is transmitted to the receiver. The receiver notes its own count value when the packet is received and retrieved from the alignment register. Since the two count values are synchronized to the I/O clock, the difference is the latency across the interface, known and repeatable to the I/O clock cycle. For a CSI port in lock step operation, the difference between the read and write pointer in the inbound FIFO gives the latency across the interconnect. In this case, the required registers will be different from those described above.

Figure 18-16. System Level Determinism Using Counters (system-level determinism using per-lane counters, lanes 1 through 20, to time the transit of packet headers between agents.)

The counter is enabled at reset, providing a deterministic reference for future CSI events. On the transmit side, when enabled, an MSR captures the count at which a packet header was transmitted. The receiving agent has a sync counter which is also initialized at reset; when enabled, the receiver captures the count at which the packet header is received (packet-header detect logic on the outbound and inbound data paths latches the counter value).

18.4.12.3 Tester Determinism and Lock Step Operation

The CSI Physical layer will provide a mechanism to delay the incoming data to a pre-determined flit boundary, based on the synchronous de-assertion of reset, which initializes the sync counters. Enough tolerance will be built into the alignment buffer to act as a FIFO and delay the incoming data by 0 to N cycles (N being the maximum tolerance needed for tester determinism, or the required flexibility for lock step operation).

Figure 18-17. CSI Flit Synchronization to the Tester (CSI Reset as sampled by the bus clock.)

At the end of the training sequence, the bus is converted from 20 serial links to one parallel link. Incoming training packets (TS3a, TS3ra) are sent from the tester or gasket.
The receiver section locks to the training packets and establishes the flit boundary cadence based on these packets. The data can be delayed by 0, 1, 2, up to N phits relative to the CSI data as sent to the core. The tester then begins sending idle packets. The Physical layer initializes its sync counters, which count the number of UI (CSI clock unit intervals) from the reset edge capture. When the first non-idle flit is encountered, the Physical layer compares the sync count to the desired count, as supplied by hard wire, fuse option or TAP override. Enough latency is added to the alignment register to present the data to the processor core at the new flit boundary.

At the de-assertion edge of reset, as sampled by the reference clock, the CSI port initializes the sync counters. These counters are described in the section on System Determinism. When a non-idle flit is encountered, the Physical layer compares the count of the incoming flit cadence to the desired latency value and slips the pointer in the incoming alignment register by 0 to N phits in order to align the flit boundary. With this scheme, a tolerance of +/- N/2 phits can be allocated to the tester or lock step mechanism, and determinism can be maintained with no tester interaction or calculation.

18.4.13 Repeater Requirements

The specification provides for the presence of a repeater in the link for the purpose of observability or length extension. The presence of a repeater will introduce extra latency, which must be accommodated by the Link layer retransmission buffer and any other mechanism sensitive to link latency. The repeater is required to de-skew between lanes while performing the repeating function, and must reset the voltage, jitter and skew budgets in the link. The repeater should conform to the following requirements.

• Repeater should perform repeating without any loss and should preserve the relative timing of FLITs passing through. The introduction of a repeater must not perturb the traffic execution.
• The introduction of a repeater should be transparent to end CSI agents. In other words, it should ensure the preservation of end-to-end agent interaction. The repeater shall pass all initialization and traffic protocol contents unaltered from the upstream agent to the downstream agent.

Further,

• Repeaters in each unidirectional sub-link of a link should operate totally independently of each other.
• In normal operation, no content is introduced into the link by the repeater, although the repeater could be used to inject debug and training state control as part of a debug or DV process. These mechanisms are not covered as part of this spec.
• In normal operation, no content is altered in TS1 and beyond, or in any FLITs, by the repeater.
• In normal operation, no content is dropped in transmission beyond the minimum unavoidable startup delay for de-skew operation. The repeater shall delay passing on TS1s (and shall instead continue transmitting TS0 1010…) until the lane de-skew is achieved. The delay should be small compared to the Polling.1 timeout, TPOLLING.1 (less than 5% of nominal value).
• Repeater shall pass all initialization and traffic protocol content while minimizing the changes in timing.

Further,

• Added latency must be as low as practical, not to exceed the maximum value of 30 to 40 UI (TBD).
• Added latency must be the same following every initialization, within the maximum VT slew tolerance, to aid determinism.
• The sequence and the order of TSn and FLITs must be unaltered.
• The termination state detected by the TX detection circuitry in the downstream sub-link is passed on to the upstream sub-link, independently in the forwarded clock lane and in each data lane. The delay in transferring termination state from the downstream sub-link to the upstream sub-link must be small compared to the relevant timeout value, TDETECT.2 (less than 5% of nominal value).
• Repeater should transfer assertion and de-assertion of the forwarded clock (in-band reset) from the upstream sub-link to the downstream sub-link forwarded clock. The delay of transfer should be small compared to the timeout value, TINBAND_RESET_INIT (less than 5% of nominal value).
• Data on the downstream sub-link is asserted only if actively driven data is received on the upstream sub-link. Such transmission must be done in a manner that preserves RAS, power reduction modes, link width reductions, and handling of intermittent or permanent failure of lanes.
• Repeater should feature receivers and transmitters conforming to the CSI specification, including all DFx requirements for override of transmitter amplifier and equalizers.
• Repeater must perform initialization and traffic transmission without requiring real-time parameter changes by an external agent. This should not exclude external agents like TAP controllers initializing repeater functions, which is beyond the scope of this specification.
• In configurations which support these features, the repeater must support hot-plug operations of upstream and downstream agents without causing damage to itself or other agents, and without perturbing algorithm timing or semantics.
• Note: Low power mode compatibility and repeater behavior with low power modes are not finalized at this point of time and are WIP.

18.4.14 CSI Eye Margining

Eye margining refers to the ability to control the width and height of the "Eye", both from the driver side and from the receiver side of the CSI link. This control provides an elegant way to determine the available margins in the CSI bus, both in HVM and DV, and also helps in quickly isolating problems in the PHY layer which may otherwise require an elaborate test and validation setup. This also enables the "on-die oscilloscope" feature for the CSI bus, which essentially provides an automated method of "schmooing" both the Transmitted Eye and the Received Eye.

18.4.14.1 Eye Height Adjust - Transmitter

This section describes the 2 different methods required for modulating the output wave height for a CSI Transmitter. The basic assumption here is a Transmitter circuit design with an N-tap equalizer (N is an integer >= 2) and a current mode driver with line termination to "Vss". Each tap in the equalizer is also assumed to have a certain number of "resolution bits" to control the amount of de-emphasis performed by that tap. Usage of these registers does not preclude a voltage mode driver; this document does not address such an implementation.

Eye Height Adjust using the Transmit Equalizer

The first method of modulating the Eye height from a CSI transmitter involves picking a de-emphasis level by over-riding the equalizer and transmitting at that level from the Transmitter for all data. The figure below illustrates this method. The common mode voltage for the Transmitter (usually maintained by an I-Comp circuit) is independent of the de-emphasis level. This method allows us to choose an output swing value "below" the nominal output swing level.
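As a numeric illustration, and assuming the conventional 20*log10 voltage-ratio definition of de-emphasis (an assumption; the specification does not spell out the formula here), the reduced swing follows directly from the chosen level:

/* De-emphasis arithmetic sketch; assumes the usual 20*log10
 * voltage-ratio definition, which is not defined in this document. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    const double v_nominal_mv  = 500.0; /* nominal output swing       */
    const double deemphasis_db = 3.0;   /* selected de-emphasis level */

    /* 500 mV reduced by 3 dB is roughly 354 mV. */
    double v_out_mv = v_nominal_mv * pow(10.0, -deemphasis_db / 20.0);
    printf("de-emphasized swing: %.0f mV\n", v_out_mv);
    return 0;
}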
For example, if we have a 500 mV nominal output swing, then this method enables us to transmit a value only below 500 mV. This method of modulating the Transmitter wave height, when carefully performed, may be useful in margining the sensitivity of the Receiver. This feature may require "reduced frequency" mode, because it eliminates equalization from the link and therefore ISI may completely close the eye at the receiver. This method can also be useful to structurally test the transmitter in Loop-back mode.

Figure 18-18. Transmitter Eye Height Adjust Using the Equalizer

To enable this, a typical implementation with a multi-tap transmitter equalizer may choose to program the multi-tap equalization to just 1 tap and adjust the coefficient of the chosen tap to give the required programmable output levels.

Eye Height Adjust using Transmitter I-Comp Settings

The second method of modulating the Transmitter wave height is to change the settings of the I-Comp circuit. (This circuit is used to maintain a constant output swing at the output of the Transmitter by adjusting the Transmitter current source; it involves a replica Transmitter, an External Reference and a feedback Loop. It is assumed that there would be 1 I-comp control circuit per link to compensate all the lanes.) The figure below illustrates this method.

Figure 18-19. Transmitter I-Comp Settings Control (the example shows the shift of the entire waveform and the common mode level for a 2-tap equalizer with the de-emphasis value fixed at 3 dB; annotated swing levels include 500, 600 and 400 mV, de-emphasized levels of 425, 354 and 283 mV (3 dB), reduced levels of 175, 146 and 117 mV, and common-mode levels VCM of roughly 300, 250 and 200 mV.)

In this method the I-comp control settings are overridden using the override register bits (described below) from the Link registers. If the near-end agent is configured as a Loopback slave, then these settings are obtained from the training registers. This method of adjusting the Transmitter Eye height could be used to margin the common mode response of the receiver. This feature also has potential application to power management, and it can also be useful to structurally test the I-comp circuitry and check the settings.

Unlike the equalizer method, adjusting the transmitter output this way allows us to choose an output swing value both above and below the nominal output swing level. It must be kept in mind that by choosing an output swing level "above" the nominal value, the current source could be operating at a non-optimal point (the current source transistor could be going into the linear region).

18.4.15 Eye Width Adjust – Transmitter

(TBD: currently not included in Loopback register definitions.) This section describes a method of adjusting the output wave width from the Transmitter using Transmitter Jitter Injection. In this implementation, jitter is induced dynamically on the PLL clock going to the Transmitter, which causes the output data to "shrink" for one cycle.

Figure 18-20. Transmitter Eye Width Adjust Using "Jitter Injection"

A simple implementation of this feature is shown in the figure above. In this method the Transmit Clock generation circuitry (PLL) gets an 8-bit field from the Link Registers and outputs a jittery I/O clock, which is used by the transmitter to send out the data. (Need to add a block diagram here.)
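A behavioral model of the idea is sketched below: an 8-bit jitter field shortens the transmit clock period for one designated cycle. The function name, the linear scaling, and the picosecond period model are all assumptions for illustration, not CSI register semantics.

/* Behavioral sketch of one-cycle jitter injection; all names and the
 * scaling are illustrative assumptions. */
#include <stdint.h>

/* Returns the transmit clock period (ps) for cycle n: the designated
 * cycle is shortened in proportion to the 8-bit jitter field, which
 * makes the output data "shrink" for that one cycle. */
static uint32_t tx_period_ps(uint32_t n, uint32_t jitter_cycle,
                             uint8_t jitter_field,
                             uint32_t nominal_ps, uint32_t max_shrink_ps)
{
    if (n != jitter_cycle)
        return nominal_ps;
    return nominal_ps - (max_shrink_ps * (uint32_t)jitter_field) / 255u;
}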
For this method the Jitter Injection control settings are set using the control register bits (described below) from the Link registers to jitter the Near-end Transmitter.

Figure 18-21. Transmitter Eye Width Adjust Using "Jitter Injection" Control Register

This feature is useful for performing Receiver eye width margin measurement. The advantage of using this feature is that we can observe the transmitter output on a scope, thereby find out how much "closure" has happened on the eye, and calculate the margin numbers.

18.4.16 Eye Height Adjust – Receiver

This section describes a method to adjust the amount of Eye opening seen by a CSI differential receiver by moving the "common mode" point of the differential receiver. The basic assumption here is a DC-coupled differential receiver with termination to "Vss". A "programmable offset" comparator circuit is used to effectively change the common-mode point of a differential receiver by adjusting the offset of the devices associated with the comparator. Note: a scheme used in AC-coupled (PCIe-type) links, which adjusts the common mode value directly, may not work for CSI, as this is defined to be a DC-coupled bus; therefore an indirect method of using offset control to move the common-mode point is required. (Need to add waveform diagram and block diagram here.)

Figure 18-22. Receiver Eye Height Adjust Control Register

In this method the offset-control settings are overridden using the override register bits (described below) from the Link registers. If the near-end agent is configured as a Loopback slave, then these settings are obtained from the training registers. This feature of Receiver Eye Height adjust is useful to measure the amount of margin available in the bus, both for HVM and DV, with the interconnect losses taken into account.

18.4.17 Eye Width Adjust – Receiver

This section describes a method to adjust the width of the received "Eye". This can be accomplished by overriding the Phase Interpolator settings for the ideal strobe location, thereby moving the strobe in either direction within the received Eye.

In this method the PI-control settings are overridden using the override register bits (described below) from the Link registers. If the near-end agent is configured as a Loopback slave, then these settings are obtained from the training registers. (Need to add waveform diagram and block diagram here.)

18.4.18 Structural Tests

The Physical layer digital logic of CSI is easily an order of magnitude more complex than the logic found on the current Intel processor Front Side Bus (FSB). Complete coverage for high volume manufacturing test will be crucial in reducing the DPM escapes due to the I/O Physical layer. We can categorize the circuitry in the Physical layer as follows: purely digital logic, such as state machines and control logic; and digitally controlled analog structures, such as phase interpolators and termination. Most of the purely digital logic will be tested during a training sequence. If a training sequence cannot be run (such as at electrical sort) and it is determined that coverage is necessary to reduce DPM escapes, scan testing, compatible with the processor core methodology, should be used.
For the digitally controlled analog logic, running the Physical layer state machine through a complete training sequence will test a large part, say 80 percent, of the paths. The remaining paths need to be covered separately: unused legs in the compensation state machine, termination, drivers, phase interpolators, and unused clock phases in the clock distribution. Structural test on major CSI structures can be done in two ways:

• Registers to manually override individual legs or paths in the structure to be tested.
• Implementations of structural test state machines to automate the override and test of the targeted structure.

Structural test registers are defined for the control and status of individual tests. If the structural test is accomplished simply by manual override of existing registers, then only the status register is needed. If a structural test state machine is designed to automatically enable and test each leg, then both the individual test and global registers should be used. The registers as defined are very simple and are not meant to dictate specific structural test designs, but only to give the same "look and feel" to structural test control and status.

18.5 Pin Leakage Testing - Transmitter and Receiver

The transmitter differential outputs and receiver inputs must provide circuit hooks to monitor leakage current through them without the need for a dedicated tester channel. As this feature does not require a dedicated tester channel, it is called "no-touch". The RC delay method of pin leakage testing is mentioned here as an example, but it does not preclude anyone from using a different method (e.g., current-source based methods). In principle, RC leakage testing is accomplished by tri-stating or disabling any other logic paths or circuits connected to these pins. The pins, connected as the differential inputs of a comparator, are then charged/discharged to different voltages, and the output of the comparator is observed for a flip in value (after some time) indicating a leakage path present on the pin. Pin leakage testing must be possible when transmitter and receiver are hooked together as an operational channel, either in a standard or a loop-back configuration.

18.6 CSI Post-Si System Debug Requirements

CSI post-si system debug challenges are numerous, as we shift away from a centralized shared-system-bus model to the point-to-point links-based system interconnect. The challenges range from observation/probing requirements through ensuring adequate mechanisms to cause debug events and to expose debug information in packets across the various links in the system. We propose a generic Debug packet format that will account for the flexibility we need to generate and dump debug data for as-yet-unseen post-Si issues. We anticipate having to debug tough problems on many flavors of the yet-to-be-fabricated CSI agents that run the gamut from single-core processors to sophisticated multi-core processors with complex routed interconnects, as provisioned in the CSI specifications.

18.6.1 System Debug Requirements

Requirements for system level debug are divided into several categories:

1. Observation of raw CSI traffic
2. Determinism of link behavior
3. Architected Debug Features
4. Side-band signals for communication of debug/validation information/events
5. TAP/JTAG

Specific requirements in each category are enumerated in the sections that follow.
Short examples are given for some requirements to illustrate the motivation. The exact implementation of any requirement in a given CSI device will be microarchitecture dependent and hence is beyond the scope of this document.

18.6.1.1 Observation of raw CSI traffic

Observation and analysis of raw CSI traffic will be fundamental to system level validation and debug.

Probing and Observability: The electrical characteristics of CSI must accommodate an external observation agent (e.g. mirror port, or a repeater-based observation ASIC) on any CSI link. Instrumenting the link for observation should not disturb functionality or performance. There should be enough guard band to allow for instrumenting the signals.

Observation Agent Requirement: Observation agents may introduce a small, constant, deterministic flight time delay. Such delay must be measurable and constant on all observed links around a device. An observation agent must never alter the content of the raw traffic, or the relative order and timing between units of physical transport (Phits) on the CSI link. See Section 18.4.13. The observation agent must be able to deal with link initialization, flow control, re(training), idling (due to absence of pending traffic, or for power management), dynamic speed or width reduction, or any special mode of link operation (such as loop-back mode).

Capture and Triggering: It should be possible to perform a contiguous or selective capture of raw CSI traffic by a link observation agent. Placement of key information fields in header Flits, or any form of Flit/packet level interleaving, should not unduly constrain the ability, or complicate the implementation, of an observation agent to either recognize a trigger defined over multiple contiguous Flits/packets, or to filter out Flits/Packets containing a specific data pattern.

Time-stamps: A common, global timing reference for all devices in the CSI fabric is required. Alternatively, it should be possible for an external observation agent to derive a common, global timing reference using an algorithm to correlate the local timing references at individual CSI devices. A system reference clock from which the CSI forwarded clock is derived should be accessible to the observation agent. An external observer must be able to time-stamp accurate arrival/departure times for all Flits at their source/destination devices. If precise and consistent global time-stamping of all Flits across the CSI fabric is not feasible, the CSI agent must facilitate a notion of a bounded window within which such a global time reference is viable.

Reliability: Transmission errors on a CSI link should not break the ability of an observation agent to probe/monitor the link and capture raw CSI traffic; the observation agent should gracefully recover, like all other CSI agents, in case of a transmission error. In the case of an observation mechanism such as a mirror port, it is necessary to indicate via side band signals the occurrence of transmit errors on CSI.

Deterministic link behavior will be inevitable for failure repeatability and replay. It will also be required for other uses, such as lock-step functionality and failure correlation across various test/debug environments.

Reset determinism: All CSI compliant devices must guarantee well defined and consistent Arch/uArch state out of reset, so as to guarantee subsequent run-time determinism as defined below.
Run-time determinism: Raw CSI traffic on all links on a CSI device should be fully deterministic in order, content and timing for a fixed set of operations on all CSI devices on all links. Repeated runs of the same workload should result in identical traces. For example, a test program with one or more threads of execution on a processor, and the ensuing memory/IO operations in a controlled, SV-style environment, should produce identical CSI traffic between the processor and the chipset each time the test program is executed. The same would be true for CSI traffic between two processors running a test program distributed across one or more threads on each processor.

Replay determinism: Run-time deterministic behavior for a CSI compliant device, as defined above, must be repeatable with pin-level vectors extracted from raw CSI traffic. The vectors may be replayed in a test bench with an identical device or a device model (e.g. an RTL model in CSIM).

Exposure of non-deterministic behavior: Deviations from deterministic behavior as defined above may be unavoidable due to uncontrollable/unpredictable phenomena such as soft errors, charge rationing, etc. The CSI agent implementations must explicitly specify all possible sources of non-deterministic behavior. Periods of non-deterministic behavior must be exposed externally through debug special packets, or through the side-band bus if available.

18.6.1.2 Architected Debug Resources and State

Defeatures: Every CSI agent must provide control register bits that can be used to turn off/on CSI-related constructs, excluding the ones that are required for the data path. Some of the other defeatures include reducing the number of entries in a given buffer/tracker, etc., which can be used to change the traffic pattern. These kinds of experiments typically help come up with theories on failures. Some of the defeatures are also used to work around failures and many times help continue product shipment. The exact features that need to be defeatured, and the number of enable/disable bits to be provided, are all implementation specific and should be documented in Product Specifications. Access and use of these could/should be restricted to debug personnel only. Defeatures also refer to control of applicable protocol parameters and architected states/resources that can be used to exercise/stage complex validation scenarios. CSI agents must specify generic corner cases and boundary/stress conditions in CSI protocol implementations, and the same for architected protocol states/resources. Complying CSI devices must clearly specify parameterized algorithms to exercise such cases and conditions.

Microarchitectural event monitors and triggers: CSI agents must provide the capability to count and trigger on critical micro-architectural events, and must further have the capability to expose the trigger either via sideband pins or using Priority Debug Packets (as defined in the Link Layer Chapter). The exact definitions of the events, and of the actions to be taken on a triggered event, are all micro-architecture and product specific and should be documented in Product Specifications. Some implementations could choose to freeze structures based on a trigger; others may choose to expose selected internal nodes for several clocks via Debug packets, etc. Events referred to here are things like error conditions, CSI retries, no credits, etc.
Freezing of Structures: CSI agents must provide the capability to freeze structures associated with CSI, either asynchronously or based on an event trigger. Note that the freeze is considered destructive in terms of the ability of the part to continue to function until it is reset again. In addition, the capability to dump the contents of the structures using the TAP/JTAG port or Control registers is required. This would allow debuggers to get a snapshot of the internal state to aid debug.

Observability of internal nodes: CSI agents must provide a way to expose the state of internal nodes, either via Debug packets or via JTAG/TAP or other debug ports. The exact mechanism is implementation dependent, and implementations may choose the appropriate method. Some implementations may choose to support this as a snapshot using Scan; others may want to expose node values in real time via high speed debug ports or CSI Debug packets, etc.

Physical layer Settings Override: Each CSI device must provide programming means (Control registers) to override transmitter default equalization and amplitude settings, so that the presence of observation agents/probes can be adjusted independently for each link in each direction. Refer to Chapter 3, "Physical Layer", and also Section 18.4, "CSI Phy Layer DFx Tools" on page 18-483, for more details.

18.6.1.3 Debug Packets

Debug Packets are essential to expose internal states of CSI agents that are otherwise inaccessible. The contents of debug packets are implementation specific. Contents could include things like branch info (source and target IPs), time stamps, indication of an internal event trigger, internal node values on the occurrence of an internal event, information useful to create Long-Instruction Traces (LIT), etc. The exposed data is typically captured by observability agents like logic analyzers for post-processing and failure analysis. Observability agents must provide triggering capabilities that allow for matching on any bit pattern in the Debug Field, for maximum flexibility in debug/validation. In first generation products and the CSI specification, Debug packets are Link Layer packets; see Chapter 4, "CSI Link Layer", for details on Debug packets and behavior rules. Future revisions of the specification will include Debug Packets that require responses from other CSI agents.

18.6.1.3.1 Debug Packet Usage Model

This section illustrates a critical usage model that post-si will rely on initially. Using the key learnings from the debug of initial products, further evolutions/enhancements will be proposed in future CSI specification revisions.

CSI Replay: CSI Replay is a trace-based debug methodology for post-si failures observed in SV, CV, CMV/AnV, or OEM platforms. It allows a failure to be captured on a logic analyzer trace and replayed on RTL/simulator for further debug and root cause. Part of this process is to stop/interrupt the processors periodically, expose a subset of their internal state, and then continue, up to the failure point. In order to stop and expose the internal states, it is necessary to quiesce the CSI fabric, especially from the IOH side. This section describes the use of Debug packets and CSRs to accomplish the quiescing of CSI. CSI agents must support the coordination mechanisms and behaviors needed to initiate and establish quiescence, allow unambiguous state collection, and resume normal operation, eventually leading up to the observed failure. These mechanisms and behaviors are defined below.
Note that a lot of the internal handling and exposure of internal state is implementation dependent. Also, this section is intended to show a use case; details are left out to make the use case easier to understand.

• A processor agent (master) initiates a broadcast debug packet to all processor agents, signalling them to get ready to perform a CSI replay event. The debug field content, etc., are all TBD as of now.
• All receiving processors perform/execute special SW handlers and rendezvous; all other processors get to the same point.
• A processor agent (master agent) initiates quiescence on one or more of its peer agents by broadcasting a debug packet (with TBD information in the Debug Field that the target can decode and respond to) or by a remote CSR write. The user can select the target agent(s) for the quiescence command.
• A quiescence command cannot be initiated by a non-processor agent.
• A receiving agent will initiate local quiescence to suspend all outbound traffic on the corresponding transmit link, i.e., it performs the same tasks that the requester did prior to sending the debug packet. This may require draining all pending messages on that link. The last outbound message from the suspended link must acknowledge the link quiescence state to the requesting agent. The acknowledgement can be accomplished using a debug packet or a remote CSR write. Note that non-processor agents are required to do this as well.
• Once quiescence has been achieved on a processor agent, the agent will generate an internal notification to initiate a local state dump. This is an implementation specific detail, enumerated here for completion's sake. On some implementations the internal state could be exposed using debug packets before initiating the quiesce protocol, or this could be done via other high speed debug ports or JTAG. Also, as mentioned before, internal state could be exposed on the CSI fabric using/issuing Debug packets. Note: the specific state to be dumped can be user selectable (optimistic but achievable, and very useful for a unified CSI replay tool-chain). State can be dumped non-destructively, either on CSI or into local memory in a user selectable address space.
• External memory/IO agents are not required to dump their internal states.
• Any agent should not start a local state dump until all peer agents notified earlier with the quiescence command have acknowledged their local quiescence state.
• Upon completion of the state dump at the master processor agent, the master processor agent resumes normal outbound operation on previously suspended links. It then issues a link resume command (using a debug packet or CSR write) to all peer agents notified earlier with the quiescence command.
• Upon receiving a link resume command on an inbound link, an agent resumes normal outbound traffic on the corresponding transmit link. The agent then forwards the link resume command to other peer agents previously notified with the quiescence command.
• All CSI agents must provide a mechanism to override the suspended state of an outbound link.

18.6.1.4 Side-Band Signals

CSI protocols must support a set of side-band signals to bypass the normal CSI protocol stack within a device for real-time, non-blocking dispatch of critical debug/validation events across the CSI fabric (akin to BPM pins in the FSB paradigm). These signals will also be used to propagate real-time event triggers for an external observation agent. The specifics and the number of side band signals are implementation dependent.
The Physical Layer Chapter describes the electrical characteristics of these signals.

18.6.1.5 IEEE Compliant JTAG/TAP Interface

The CSI protocol must accommodate an IEEE compliant TAP/JTAG interface on any CSI device. The CSI bus should not restrict instrumentation around the TAP interface for asynchronous access of device state and resources through the TAP port.

A.1 Definition of Terms

The following terms have been defined in the CSI specification:

Device Address: This is the address generated by the target node of a CSI transaction to access the physical memory or device location. This address is used on the I/O buses or on the memory interface. This address may be the same as the physical address part of the system address, or it may be translated through some (optional) mapping mechanism at the target.

Caching Agent: A protocol agent type which can perform reads and writes into coherent memory space.

Configuration Agent: The logical owner of all platform configuration registers on a CSI agent or component. A component may define a separate CA for each CSI agent on the die, or it may define a single CA to represent all the CSI agents on the die. In the latter case, configuration transactions destined to CSRs in other CSI agents are logically targeted to the CA, which in turn completes the access within the die via implementation-specific mechanisms.

Firmware Agent: A CSI agent capable of supplying boot firmware to processor cores.

Home Agent: A protocol agent type which is responsible for guarding access to a piece of coherent memory.

I/O Agent: A protocol agent type which is responsible for non-CSI I/O devices behind it. As a CSI initiator, an I/O Agent makes CSI requests on behalf of I/O devices and returns responses back to the I/O device. As a target, an I/O Agent is responsible for translating CSI requests to the protocol native to its I/O interface and returns I/O responses back to the CSI requester.

Physical Address: This is the operating system's view of the address space in a partition. This is obtained by translating a virtual address through the operating system page translation mechanism. This is also the address used by the cache coherency mechanism, which puts certain requirements on the mapping of coherent shared address space within and across partitions.

Processor Agent: The CSI interface to a logical processor. (This definition needs to be revised and will need to change as we better understand how interrupts, VLWs, etc. are partitioned in designs.)

Routing Agent: A CSI agent which implements a routing step, routing a CSI packet from the input port of a router to the destination port based on the destination node id contained in the packet. A packet is routed from its source to its destination through a series of routing steps.

System Address: The system address is represented by the physical address and the target (home) node identifier, which points to a unique device address in a system. The addressing model allows the same physical address from different source agents to map to different system addresses (e.g., private firmware space per processor agent), or to the same system address (e.g., shared memory space in a partition or across partitions), irrespective of partition boundaries. System address also includes the scope of hardware cache coherency.
For example, a system may have identical physical memory addresses in different partitions, but with different home nodes and different scopes of coherency, and therefore distinct system addresses. Also note that in the source broadcast based cache coherency scheme, the home node identifier does not play a role in specifying the scope of coherency.

Virtual Address: This is the address used by the applications, device drivers and devices (if the I/O agents support paging).

A.2 List of Acronyms

The following acronyms have been used in the CSI specification:

8259: Programmable Interrupt Controller (PIC). The legacy Intel platform interrupt controller.
A20M: Mask for Address bit 20. Legacy mask used to emulate older systems with only 1MB of main memory.
ACPI: Advanced Configuration and Power Interface Specification.
APIC: Advanced Programmable Interrupt Controller. Controller residing in processor agents to generate and accept interrupt messages from other processors or I/O agents.
BIOS: Basic Input Output Services. Typically, code executed during the boot process and responsible for initializing the platform.
BIST: Built In Self Test.
BMC: Baseboard Management Controller.
BSP: Bootstrap Processor.
CAM: Column Address Memory.
CM: Configuration Management.
CSR: Control/Status Register.
DF: Deadlock-free.
DMA: Direct Memory Access. This is the mechanism used by I/O devices to read or write main memory.
DRAM: Dynamic Random Access Memory.
DW: DWord = 4 bytes.
EOI: Legacy End-of-Interrupt cycle. Used to signal the end of a level-triggered interrupt from the 8259 interrupt controller.
FERR: Floating Point Error. Legacy signal from older Intel math coprocessors indicating an error.
FRU: Field Replaceable Unit.
FW: Firmware.
I/O APIC: I/O Advanced Programmable Interrupt Controller. Controller residing in I/O devices to translate wire interrupts to inband interrupt messages.
IGNNE: Ignore Numeric Error. Legacy signal from the processor used to mask numeric errors initiated by old Intel math coprocessors.
INIT: Warm/Soft Reset. Legacy initialization interrupt, also initiated by legacy I/O port 92h; forces the processor to start from a known state without affecting cache states.
INTR: Legacy interrupt initiated by the 8259 interrupt controller.
INVD_Ack: IA-32 specific: special cycle transaction sent out by the processor after executing the INVD instruction; indicates to the external system that the internal caches were invalidated.
IO: Input/Output.
IPI: Inter-processor Interrupt.
ISA: Industry Standard Architecture. Industry's version of IBM's AT-bus, which was trademarked so that non-IBM PC makers could not use that name; the legacy PC bus for I/O devices pre-dating PCI.
ISOC: Isochronous.
JTAG: Joint Test Action Group.
MMCFG: Memory Mapped Configuration.
MMIO: Memory-mapped I/O. Memory space allocated to I/O devices.
MSR: Machine Specific Register.
NBSP: Node BSP.
NID: Node Identifier.
NMI: Non-maskable Interrupt, initiated by the 8259 interrupt controller.
NoDMA table: An implementation specific table used to control DMA access of main memory.
NOP: No Operation. Meaningless cycle used for various proprietary reasons.
OL_A, OL_D, OL_*: On Line Addition, On Line Deletion, On Line Addition or Deletion.
OS: Operating system.
PAM: Programmable Attribute Map. Registers that provide address decode segments and their associated memory attribute bits, like cacheable, write-protect and user-defined.
PBSP: Partition BSP.
PCI: Industry standard interconnect for computer peripheral components.
PCI Express: The latest revision of the PCI specification.
PHOLD: Legacy ISA mechanism for device bus mastering.
PMI: Platform Management Interrupt.
PROCHOT: Alert that a processor is too hot and thermal throttling has begun.
QoS: Quality of Service.
QP: Quiescent Processor.
QW: QuadWord = 8 bytes.
RF: Routing Function.
RS: Running System.
SCA: System Configuration Agent.
SCI: System Control Interrupt.
SF: Selection Function.
SMAF: System Miss Address File.
SMBus: System Management Bus.
SMI: System Management Interrupt.
SMM: System Management Mode. This is an IA-32 mode facilitating platform-specific system management code.
SPT: Snoop Pending Table.
SSP: System Service Processor.
STPCLK: Stop Clock. Legacy signal from pre-CSI chipsets to request that the processor clock be stopped.
STPCLK_Ack: Acknowledge from the processor that its clock may be stopped.
SWP: Sliding Window Protocol.
t_RTO: Transport Layer Retry Timeout.
TID: Transaction Identifier (per node).
TL: Transport Layer.
TLB: Translation Lookahead Buffer. A cache of virtual-to-physical address translation entries, typically residing within a processor.
TLR: Transport Layer Retry.
UC space: Uncacheable address attribute specified in processor page tables.
UTID: Unique Transaction Identifier (across nodes).
VLW: Virtual Legacy Wire. CSI mechanism for transmitting legacy, side-band signals as in-band messages.
VN: Virtual Network.
WBINVD_Ack: IA-32 specific: special cycle transaction sent out by the processor after executing the WBINVD instruction; indicates to the external system that the modified data from the internal caches were written back to memory or to an external higher level cache.
WC space: Write combining address attribute specified in processor page tables.

B.1 CSI Profile Attributes

Table B-1 summarizes the attributes of various platform profiles in terms of CSI features. Entries in this table are typically labelled with "N", "O" or "R" to indicate whether a feature is not to be supported, optional, or required for a platform profile, respectively. A feature labelled with "N" for a platform profile indicates that the feature is not to be supported in that profile; this is used as a mechanism to differentiate it from other platform profiles. A feature labelled with "O" for a platform profile indicates that the feature is optional in that profile, and components designed for such profiles should discover (through parameter exchange or firmware) the capabilities of other components in the platform before using the feature. A feature labelled with "R" for a platform profile is expected to be supported by all components designed for that profile; the feature may not be enabled by default.

Table B-1. CSI Profile Attributes
(Each row lists a feature followed by its attribute for the UP / DP / Small MP / Large MP platform profiles and the IA-32 / Itanium architectures, in that order, with any notes at the end.)

Processor Sockets: 1 / <=2 / <=8 / No limit / N/A / N/A
Addressing Capability: <=41 / <=41 / <=43 / <=51 / N/A / N/A

Physical Layer:
Support for 20 bit link width: O / O / R / R / N/A / N/A. Note: products with the UP or DP profiles can choose to support either 18 or 20 lanes as a base; to get 18 lanes, take off two of the L-lanes and not the CRC.
Support for 18 bit link width: R / R / N / N / N/A / N/A
Support for 10 bit link width: O / O / O / O / N/A / N/A. Note: dependent on the number of CSI nodes, address size, packet interleaving and error handling options (viral, poison) supported.
Support for 9 bit link width: O / O / N / N / N/A / N/A
Support for 5 bit link width: O / O / O / O / N/A / N/A
Self-healing link: N / O / O / R / N/A / N/A
Clock channel fail over: N / O / O / R / N/A / N/A
Lane Reversal: O / O / R / R / N/A / N/A. Note: can be used to create a differentiation; may be needed on one side only.
Polarity Reversal: O / O / R / R / N/A / N/A. Note: can be used to create a differentiation; may be needed on one side only.
Hot-Plug support: N / O / R / R / N/A / N/A
Independent control of link width in each direction: O / O / O / R / N/A / N/A. Note: required in Large MP for RAS; optional for power management.
Link Power Management - L0S support: O / O / O / O / N/A / N/A. Note: Mobile requires this; optional for desktop.
Link Power Management - L1 support: O / O / O / O / N/A / N/A. Note: Mobile requires this; optional for desktop.

Link Layer:
Node ID width: 2 / 3 / 5 / 10 / N/A / N/A. Note: the rest of the bits are reserved.
Packet Formats - SA, SCA, SCC, SDR, SDW: R / R / R / N / N/A / N/A
Packet Format - P2PTunnel: N / N / R / R / N/A / N/A
Packet Formats - EA, ECA, ECC, EDR, EDW: N / N / N / R / N/A / N/A. Note: really means no support for extended addressing.
VN1 - for thru-routing agents: N / N / O / R / N/A / N/A. Note: IOHs may not have this.
ICS message class: O / N / N / N / N/A / N/A
IDS message class: O / N / N / N / N/A / N/A
Explicit packet delimiter (header bit): O / O / O / R / N/A / N/A. Note: decode via Msg Class+OpCode.
Chunk poisoning: O / O / R / R / N/A / N/A. Note: requires 20 bit lanes.
Viral indication: O / O / R / R / N/A / N/A. Note: requires 20 bit lanes.
Simple CRC: R / R / R / R / N/A / N/A. Note: closed tech issue.
Rolling CRC: O / O / O / R / N/A / N/A. Note: closed tech issue.
Scheduled Data interleaving - Sender: O / O / N / N / N/A / N/A
Scheduled Data interleaving - Receiver: O / O / N / N / N/A / N/A
Ordered VCs - per priority in Msg Class: All VCs / CSI-order / CSI-order / None / N/A / N/A
Support for P_Encode, P_Hint: O / O / N / N / N/A / N/A. Note: if supported, then precludes an 18-wide bus.
Critical Chunk Order - Receiver of the data: R / R / O / O / N/A / N/A. Note: this is required for WBs.

Routing Layer:
Through-routing capability for processors: N/A / O / R / R / N/A / N/A
Alternate routing table: N / N / O / R / N/A / N/A
Transport layer: N / N / N / O / N/A / N/A

Coherence Protocol:
Source broadcast coherency: N/A / R / R / N / N/A / N/A
Support for F cache line state: O / O / O / N / N/A / N/A
Dual data transfer to the same link (one to requester, another to home) from peer caching agent with a modified cache line: N / O / R / R / N/A / N/A. Note: updated for DP.
UP optimizations and simplifications (early GO; ordered VCs - see above): R / N / N / N / N/A / N/A
Buried HITM protocol support: R / R / R / N / N/A / N/A. Note: closed for Whitefield.

Non-coherent Protocol:
Peer-to-peer tunnel transactions (only applicable to I/O agents; other agents just route through): N / N / O / R / N/A / N/A. Note: dependent on route-through capability.
Synchronization transactions: N / N / O / R / N/A / N/A
VLW transactions: N/A / N/A / N/A / N/A / R / O
Special cycle transactions: N/A / N/A / N/A / N/A / R / N
Locked accesses: N/A / N/A / N/A / N/A / R / N

Interrupt:
initiated Int Transaction on CSI N IntPro c Link only IntPro c Link only Any link N/A N/A Logical interrupts (IntLogical) N/A N/A N/A N/A R N Broadcast of logical and physical mode interrupts N/A N/A N/A N/A R N Logical Flat Addressing Mode (<= 8 threads) R R R R R N Logical Cluster Addressing Mode (<= 60 threads) R R R R R N EOI transaction N/A N/A N/A N/A R N Ref No xxxxx 521 Intel Restricted Secret CSI Profile Attributes CSI Profile Attributes Features/Products Platforms / CSI Profiles Architectures Notes UP DP Small MP LargeMP IA-32 Itanium® Support for INIT, NMI, SMI, and ExtINT through VLW mechanism N/A N/A N/A N/A R O Support for INIT, NMI, PMI, and ExtINT through Int transaction N/A N/A N/A N/A O R Limit on number of threads supported for inter-processor interrupts - this could be product specific and change over time 2 8 64 64K N/A N/A CPU may do this internally for UP and may not issue on CSI. First product sets it. Fault Handling Fatal error reporting through viral indication N N R R N/A N/A Machine check indication through Int O O R R N/A N/A Timeout hierarchy for fault diagnosis N O O O N/A N/A Packet elimination for error isolation between partitions N O O O N/A N/A Abort timeout response status N O O O N/A N/A Reset/Initialization Node id assignment Fixed or Chips et assign ed Chips et or straps assign ed Straps or service processorassigned N/A Processor accepting external configuration (NcRd, NcWr, CfgRd, CfgWr going to CSRs) requests N R R R N/A N/A Separation of reset domains between link and Physical layer for link self-healing N O O R N/A N/A Link Self-healing is optional for DP Separation of reset domains between routing/protocol and Link layer for hot plug N O O R N/A N/A Separation of reset domains between CSI agents and Routing layer to allow sub-socket partitioning N O O O N/A N/A Product specific fixed and configurable power on configuration values - configurable through link parameter exchange, e.g., LT enable, HT enable, BSP indication, etc R R O O N/A N/A 522 Ref No xxxxx Intel Restricted Secret Table B-1. CSI Profile Attributes (Continued) Features/Products Platforms / CSI Profiles Architectures Notes UP DP Small MP Large MP IA-32 Itanium® Flexible firmware location through discovery during link initialization N N O O N/A N/A Packet routing during initialization before route table and address decoder is initialized Fixed Config urable thru link init param Configurable through service processor and link initialization parameter N/A IOH returns resp to sender for packets System Management Protected system configuration region N N O O N/A N/A Dynamic Reconfiguration Support for various partitioning models N N O O N/A N/A Support for OL* of various CSI agents N N O O N/A N/A Security Support for LT O O O O N/A N/A Current spec only covers LT solution for UP Power Management System level power management R O O O N/A N/A QOS/Isochronous Quality of service support, I.e., support for ICS and ICB message classes and associated request/response packets O N N N N/A N/A Support for chaining O N N N N/A N/A Ref No xxxxx 523 Intel Restricted Secret CSI Profile Attributes CSI Profile Attributes 524 Ref No xxxxx Intel Restricted Secret C C.1 Introduction The Transport layer provides support for end-to-end reliable transmission between two CSI agents each implementing this layer. It relies on the services provided by the Routing layer below it, while in turn providing reliable transmission support to the Protocol layer above it. 
Unlike other CSI layers, the Transport layer is optional and is provided for systems which desire a higher degree of reliability, usually at the cost of lower performance and increased bandwidth requirements. This means that it is possible to have a platform architecture with no CSI agent implementing this layer, or with only a subset of the CSI components implementing this layer. Since this layer is optional, it does not follow the hierarchical layering of CSI from an implementation viewpoint, as explained in Section C.4.

C.2 Reliable, End-to-End Transmission
End-to-end reliable transmission from a sender CSI agent A to a destination CSI agent B is characterized by the following:
• Both A and B implement the Transport layer. It is not necessary for any intermediate CSI agents in the path from A to B to implement the Transport layer.
• The granularity for reliable transmission is a CSI packet.
• There exist at least two disjoint paths from A to B (referred to as the primary and alternate paths).
• Assuming no failures along one of these paths, the Transport layer mechanism guarantees the delivery of a CSI packet originating at A and destined to B. This means that the Transport layer provides a mechanism against one or more failures along one of the disjoint paths. In general, between any two CSI Transport layer agents, the mechanism provides protection against a single point of failure.
• Transmission and retransmission are based on the notion of Transport layer retry (TLR): CSI agent A determines that a packet it sent to agent B along the primary path may have been lost. It then retransmits the packet to B until it succeeds, or until a threshold on the number of retries is reached (see Figure C-1; for illustrative purposes, the figure shows a failed component along the primary path, though this need not be the case for retransmission to happen). The time between two such transmissions is the retry timeout (tRTO). It is important to note that TLR is distinct from the notion of link level retry (LLR), which is an independent mechanism that ensures reliable transmission of flits across an individual link, at the Link layer level. The ability of the sender A to retransmit a packet implies that the packet is buffered at A until A is guaranteed that the packet has been received at B.
• The destination agent B acknowledges the receipt of each CSI packet by sending a special CSI response to A.

Figure C-1. Transport layer agents A and B (both implementing the Transport layer), connected by a primary path and an alternate path; a failed component is shown on the primary path, and the intermediate CSI components along each path need not implement the Transport layer.

C.3 CSI Support for Reliable Transmission
In the rest of the chapter, unless otherwise stated, a CSI agent will mean an agent with the Transport layer. The Transport layer in CSI for reliable transmission is enabled through the following:
• A sequence number field in every CSI packet.
• New transaction response types: Transport Layer Acknowledgment (TL_ACK) and Transport Layer Negative Acknowledgment (TL_NACK).
• A sender node id in every CSI packet (note: without the Transport layer, this id is required for all transaction types except cache-to-cache data responses).
These fields and transactions are not visible to the CSI Protocol layer (for an exception related to the sender node id, see Section C.3.4). Also, note that use of the Transport layer implies the use of extended headers.
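As an illustration of these additions, here is a minimal sketch in C. Only the 11-bit sequence number width and the presence of a sender node id come from the text above; the field layout, type names and the width of the sender node id are assumptions, not the CSI packet format.

    #include <stdint.h>

    #define TL_SEQ_BITS 11u
    #define TL_SEQ_MOD  (1u << TL_SEQ_BITS)   /* 2048 distinct sequence numbers */

    /* Illustrative only: the spec requires a sequence number and a sender
     * node id in the extended header, but does not fix this layout. */
    struct tl_ext_header {
        uint16_t seq;         /* 11-bit sequence number, modulo TL_SEQ_MOD */
        uint16_t sender_nid;  /* sender node id; width is an assumption    */
    };

    /* The two Transport layer response types defined above. */
    enum tl_response_type { TL_ACK, TL_NACK };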
Piggybacking Transport layer ACKs (and NACKs) on the regular CSI packets was explored: the saving in interconnect bandwidth would come at the cost of enlarging the packet header to accommodate the additional sequence number field and the ACK/NACK. Penalizing all packet headers with an optional Transport layer, in order to save a relatively small amount of interconnect bandwidth, was not considered a good design choice. Transport layer protocols are typically implemented with Sliding Window Protocols (SWPs); the fields and the transactions mentioned above provide enough support to implement a wide variety of SWPs.

C.3.1 Routing
Each Transport layer agent A has a set of unique CSI (protocol) agents P in its domain. The protocol agents P associated with A launch Transport layer transactions only through A and through no other Transport layer agent in the system. Consequently, any packet targeted to a member of P which used the Transport layer is always routed through A.
• Transport layer responses, TL_ACK and TL_NACK, are treated as no-data responses (NDR) and are routed in the NDR message class.
• All other packets are routed exactly as they would be without the Transport layer.
• The Transport layer, by definition, uses multiple disjoint paths between the source and destination Transport layer agents; this implies the use of all three virtual networks (VN0, VN1, and VNA) for deadlock-free routing between Transport layer agents.
  - Note that it may still be possible to use VN0/VNA only for “leaf” CSI agents, i.e., those CSI agents whose associated routers are not used for through-routing.
  - Since the Transport layer functionality uses both deadlock-free virtual networks, some network topologies are not permissible with the additional Transport layer functionality; in particular, ring-based topologies, which use both VN0 and VN1 for their own deadlock-free routing, cannot be used as the interconnect among Transport layer agents.

C.3.2 Sequence Number
• Use: A unique sequence number is associated with each packet sent from a source CSI agent to a destination CSI agent. For each source/destination pair, the sequence number is unique as long as the packet is alive from the Transport layer's point of view. Once the sender is guaranteed that the packet has been consumed at the receiver, the sequence number is retired and can be reused.
• Retry: Each Transport layer request is to be acknowledged by the receiving agent. When an acknowledgment is not received within tRTO, the packet is presumed lost and the sender retransmits the packet with the same sequence number as before (TLR). The number of retries is implementation dependent (see Section C.5).
• Duplicate Packets: Even though a packet has been received, a sender could retry the packet, since the acknowledgment could have been lost. This results in a duplicate packet at the receiver, which must then drop the packet rather than pass it on to the Protocol layer. The receiver is able to identify the duplicate because the sender retries the packet with the same sequence number as the original.
• Width: The sequence number field is 11 bits wide and is part of each extended header in the CSI packet. The width has been chosen (a) to ensure that CSI transactions do not, in most cases, back-pressure the Protocol layer for lack of unique sequence numbers, and (b) so that sequence numbers are large enough that they do not wrap around within a packet's lifetime in the Transport layer (the round-trip latency of the packet transmission to the receiver and of its acknowledgment back to the sender). The 11-bit field implies that up to 2048 Transport layer requests can be simultaneously active for each source/destination pair; this number is quite conservative, even for large-scale systems.
• Wraparound: Even with 2048 distinct sequence numbers permitted, it is possible, without additional implementation support, that sequence number wraparound could leave a destination agent unable to distinguish a duplicate CSI packet from a new CSI packet. The implementation has to ensure that sequence number wraparound does not cause any incorrect behavior; it is permitted for the Transport layer to stop accepting transactions from the Protocol layer to prevent this.
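A sketch of receiver-side duplicate detection consistent with these rules (C; the per-pair bitmap bookkeeping is one possible implementation, not something the spec prescribes):

    #include <stdint.h>

    #define SEQ_BITS 11u
    #define SEQ_MOD  (1u << SEQ_BITS)

    /* Per source/destination pair: one bit per sequence number that has
     * been received but not yet retired (the sender may still retry it). */
    struct tl_rx_state {
        uint8_t live[SEQ_MOD / 8];
    };

    static int tl_is_duplicate(const struct tl_rx_state *rx, uint16_t seq)
    {
        seq &= SEQ_MOD - 1;
        return (rx->live[seq >> 3] >> (seq & 7)) & 1;   /* if set: drop it */
    }

    static void tl_mark_received(struct tl_rx_state *rx, uint16_t seq)
    {
        seq &= SEQ_MOD - 1;
        rx->live[seq >> 3] |= (uint8_t)(1u << (seq & 7));
    }
    /* The bit is cleared once the sequence number is retired, i.e., once
     * the sender is known to have stopped retransmitting that number. */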
Note: As mentioned earlier, the Transport layer is an optional layer; this means that active agents such as the processor and the I/O agents may not implement this layer themselves, and another set of components may implement it instead. In such a case, the source and destination agents mentioned here in this section are to be interpreted as the corresponding original agents, and not as the Transport layer agents. This is explained with an illustrative usage model in Section C.4.

C.3.3 Transport Layer CSI Transactions
There are no explicit Transport layer request transactions; all Protocol layer requests and responses are implicitly the Transport layer's requests. Two new response transactions unique to the Transport layer are defined:
• Transport Layer Acknowledgment (TL_ACK): a response sent by the CSI Transport layer agent acknowledging the error-free receipt of a Transport layer request. The response acknowledges the receipt of the request identified by the sequence number, and is directed to the agent identified by the sender node id in the request. It is up to the implementation not to acknowledge every Transport layer request individually, but to acknowledge a range of requests with a single response; this may be done, for example, to save interconnect bandwidth. Thus, assume that for a given source/destination pair, a request with sequence number S1 was previously acknowledged. Sending a TL_ACK response with sequence number Sn directed to the source then means acknowledging the receipt of all requests with sequence numbers in the range (S1, Sn], modulo 2^k, where k is the width of the sequence number field in the CSI packet (k = 11). Please see the implementation details in Section C.5.
• Transport Layer Negative Acknowledgment (TL_NACK): a response sent by the CSI Transport layer agent acknowledging an erroneous receipt of a Transport layer request. The response “negatively acknowledges” the request identified by the sequence number, and is directed to the agent identified by the sender node id in the request. This transaction is not necessary for the functioning of the Transport layer, since a sender retransmits a message on not receiving a TL_ACK response within tRTO; a TL_NACK can, however, hasten the retransmission. A component may not implement this transaction.
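A sketch of the sender-side bookkeeping for such a ranged TL_ACK (C; the structure and helper are hypothetical, and treating every TL_ACK as cumulative is one of the interpretations that the notes in Section C.6 deliberately leave open):

    #include <stdint.h>

    #define K       11u
    #define SEQ_MOD (1u << K)

    struct tl_sender {
        uint16_t last_acked;   /* S1: last acknowledged sequence number */
        /* plus retry buffers, tRTO timers, etc. */
    };

    /* Hypothetical helper: frees the buffered copy of one request. */
    static void tl_retire(struct tl_sender *s, uint16_t seq);

    /* On TL_ACK(Sn): retire every buffered request in (S1, Sn], mod 2^k. */
    static void tl_on_ack(struct tl_sender *s, uint16_t sn)
    {
        uint16_t n = (uint16_t)((sn - s->last_acked) & (SEQ_MOD - 1));
        for (uint16_t i = 1; i <= n; i++)
            tl_retire(s, (uint16_t)((s->last_acked + i) & (SEQ_MOD - 1)));
        s->last_acked = sn;
    }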
C.3.4 Sender Node ID
Since every Transport layer request needs to be acknowledged, the sender node id in the CSI packet header identifies the sender Transport layer agent (e.g., agent A in Figure C-1) either directly or indirectly.
Direct: the sender node id of the Transport layer agent is explicitly identified through this field, supplied in the extended header.
Indirect: if the Protocol layer agent uses the sender node id, then the Transport layer agent need not be identified explicitly; in such a case, the sender node id refers to the node id of the originating Protocol layer request or response corresponding to this Transport layer request. Section C.4 shows a usage model in which the sender node id does not refer to the Transport layer CSI agent. There are specific responsibilities for both the sender and the receiver TL agents in this case; they are identified in the usage model described in Section C.4. CSI only supports the indirect specification of the sender node id.

C.3.5 No Time-Out Field
There is no time-out field in the CSI packet header. While such an explicit time-out could be used to facilitate the dropping of “aged” packets in the interconnect, it would carry an overhead, since space in the CSI packet header is at a premium. Instead, it is expected that the interconnect supports mechanisms to drop CSI packets in order to prevent sequence number wraparound.

C.4 Usage Models
Unlike other CSI layers, the Transport layer is optional and is provided for systems which desire a higher degree of reliability, usually at the cost of somewhat lower performance and increased bandwidth requirements. In such systems, components which implement the Transport layer have to work seamlessly with components which do not. The rest of this section describes a model for such a usage. The description of the usage model is meant as a guide to a platform architect; other usage models which are consistent with this specification are possible. The usage model is best described through a generic example, illustrated in Figure C-2 and described below.
Consider first a system without any Transport layer components. In such a system, components such as S and R communicate with each other in the usual manner; in particular, (a) they use the routing tables, (b) they may or may not use the sender node id field in the CSI packet, since either usage is permitted, and (c) they do not use the sequence# field in the CSI packet. If reliable end-to-end transmission is desired between S and R, then special components A and B which implement the Transport layer are interposed between S and R. Such an interposition is seamless, in that neither S nor R is aware of the presence of either A or B; thus, for example, the routing tables at S and R do not change. In fact, there could be multiple components behind A, and similarly behind B.
• Consider a communication between S and R. The CSI packet routed from S towards R is “intercepted” at A. This means that a single sender-side TL agent has to be on the path from S to R. Also, assume that the sender node id field is defined by the CSI Protocol layer agent; in such a case, it is not necessary for A and B to have explicit NodeIDs.
Figure C-2. Transport layer usage model. S and R are CSI components implementing only the Protocol layer (no TL); A and B are CSI components implementing the Transport layer, interposed on the path between S and R. The sender side (A) performs sequence# generation, packet buffering and packet regeneration; the receiver side (B) performs duplicate packet detection and TL_ACK/TL_NACK generation. CSI packets between S and A carry undefined TL fields, packets between A and B carry defined TL fields, and intermediate CSI components may not implement TL. Legend: PL = Protocol Layer; TL = Transport Layer.

• The intercepted packet has its sequence# field filled in and is then sent along to R. The packet, whether it is a Protocol layer request or a response, is now a Transport layer request. In addition, the packet is buffered at A.
• The packet is then “intercepted” at B; this means that a single receiver-side TL agent has to be on the path from S to R. B generates a Transport layer acknowledgment, TL_ACK (or a negative acknowledgment, TL_NACK, on an error in the packet), directed to the sender node id. Note that the sender node id corresponds to the original sender (S in this case). It has to be guaranteed that the route to S passes through A; further, A has to know that it is acting as a proxy for components such as S for TL_ACKs and TL_NACKs. The implementation has to ensure that these conditions hold.
• If the sender node id is not defined at the CSI Protocol layer, then each Transport layer component has a unique node id, which is then used as the sender node id in the outgoing Transport layer request. In such a case, TL_ACKs and TL_NACKs are targeted explicitly to the Transport layer agent (A in this case).
• The receiving CSI component B then routes the CSI packet appropriately (to R in this case), after making sure that certain implementation-related conditions are satisfied (that the packet is not a duplicate, for example).

C.5 CSI Components’ Responsibilities and Other Implementation Issues
Since the Transport layer is optional, a number of details are left to the implementation; this means that multiple usage models are permissible. A few of the implementation details relevant to the proper interfacing of CSI components are listed below:
• The Transport layer handshake between the sender and the receiver is usually implemented with sliding window protocols (SWPs). There are many variants, and it is up to the TL agent implementation to choose the appropriate SWP.
• A Transport layer agent may not implement TL_NACK; this should be OK even if Intel were to integrate the TL functionality into its components at a later point in time.
• The interpretation of TL_ACKs: whether the ACKs are for a range of sequence numbers or for just the sequence number in the CSI packet. This should probably NOT be left to the TL agent implementation; the semantics should state whether a TL_ACK is for that sequence# only. Insisting on precise semantics now will avoid incompatibilities later, if Intel were to integrate the TL functionality into its components at a later point in time.
• Means to identify whether the Protocol layer agent or the Transport layer agent defines the sender node id: one bit of the sequence# field could be overloaded for this. The issue arises only if the sender node id field is shared between the Protocol and Transport layer agents.
• Initialization of sequence numbers for reset and OL* (on-line addition and deletion of components) events.
Related to the internal aspects of the Transport layer agent, the particular implementation dictates the following (references are to Figure C-1):
• The number of times k that A retransmits the packet along the primary path before sending the packet along the alternate path (k could be zero).
• The number of times m that A transmits the packet along the alternate path (m >= 1) before it determines that there is a permanent failure and invokes a higher-level error mechanism to intervene.
• The exact scheme used to buffer CSI packets for outstanding Transport layer requests awaiting response is up to the implementation.

C.6 Notes, Comments for Later Revisions
• The current spec says that the sender node id is not explicitly defined by the Transport layer agent (it is defined by the protocol agent and is always specified or can be inferred).
• Sequence#: 11 bits; can this be reduced? More mathematical justification of the seq# width? There are 12 bits free for the Transport layer in the extended headers, so we have the bits.
• It would be a good idea to show architectural diagrams of 16-socket and 32-socket systems with TL support.

D.1 PurgeTC Special Transaction
Note: The contents of this sub-section are under review and are very likely to change in a future revision; please do not consider them final for design or other purposes.
The purge TC operation is used to purge translation cache entries from all the processor agents in a TLB coherence domain in an Itanium processor family system environment. The operation is initiated by execution of a ptc.g or ptc.ga instruction in an Itanium processor; these instructions are used during changes in the attributes of a page table entry or changes in the mapping of virtual to physical memory. A purge TC operation is expected to purge the translation cache entries matching a specified virtual memory address range and region identifier in all processors within a TLB coherence domain. The definition of the TLB coherence domain is platform dependent: a TLB coherence domain may include all the processors in the entire system or system partition, or it may include only a subset of the processors in the system or system partition. The following assumptions are made about the behavior of the purge TC operation in the system:
• The TLB coherence domain need not be the same as the cache coherence domain.
• A TLB coherence domain may include the processor agents in an entire system hard partition or a subset of the hard partition; however, a TLB coherence domain never includes processors from multiple hard partitions.
• In the case of partitioning models that share Protocol layer system resources between partitions, the operating systems for such partitions are assumed to observe the restrictions on the number of outstanding purge TC operations. If a processor implementation and its associated platform allow only one purge TC in a TLB coherence domain, then this restriction must be observed by all OS instances within the TLB coherence domain.
Note: Completion of a purge TC operation is dependent on the completion of other memory operations initiated by processor agents in the TLB coherence domain; therefore, resource allocation for purge TC must be done such that the progress of other memory operations is not blocked.

D.1.1 Purge TC Messages and Resource Requirements
The purge TC operation uses SpcPurgeTC and SpcPurgeTCFwd requests, and relies on the concept of a purge agent, which is responsible for defining the TLB coherence domain in the system. To allow flexibility for different classes of platform, it is expected that the purge agent will be located in a component other than the processor. A SpcPurgeTC request is sent from the processor agent initiating a purge TC operation to the purge agent for its local TLB coherence domain. A SpcPurgeTCFwd request is sent by the purge agent to all the processor agents in the corresponding TLB coherence domain, including the processor that initiated the purge TC operation. Both of these requests use a Cmp response to indicate completion of the corresponding request. The message class and packet type used for SpcPurgeTC and SpcPurgeTCFwd are documented in the Link layer section. These requests carry 80 bits of information related to the purge operation, which includes 49 bits of virtual address, 24 bits of region identifier, 6 bits of page size and 1 bit of ALAT qualifier.
Each agent responsible for processing SpcPurgeTCFwd requests for processors is expected to provide at least 3 resources to handle incoming non-coherent and special requests. This is to allow progress in situations where a purge TC, synchronization (SpcSync and SpcBSync) and other non-coherent operations are in progress simultaneously. The purge agent needs to provide enough resources such that all SpcPurgeTC requests within the TLB coherence domain can be absorbed at the purge agent at any time, with enough resources remaining available to complete other non-coherent memory operations and to absorb any synchronization operations. This can be addressed by making sure that the purge agent has at least 2 more resources for non-coherent transactions than the maximum number of purge TC operations possible in its TLB coherence domain; without these resources at the purge agent, a backup on the NCS virtual channel may lead to a system deadlock.

D.1.2 Purge TC Transaction Flow
The agent responsible for handling the purge TC operation for processors takes the following steps during a purge TC operation:
• Sends a SpcPurgeTC request to the purge agent. It need not send a purge request to all local processor cores at this time.
• On receiving a SpcPurgeTCFwd request, it sends a purge request to all the local cores to perform the purge operation and, after completion of all the local purges, responds with a Cmp response to the purge agent.
• Completes the purge TC operation after receiving a Cmp response from the purge agent.
The purge agent in a TLB coherence domain takes the following steps during a purge TC operation:
• The purge agent accepts a SpcPurgeTC request and allows only one purge operation to proceed at any time.
• The purge agent sends SpcPurgeTCFwd requests to all processors (including the initiating processor) in its purge domain.
• After collecting all Cmp responses corresponding to each SpcPurgeTCFwd request, the purge agent sends a Cmp response to the source of the SpcPurgeTC request.
The transaction flow for a purge TC operation is shown in Figure D-1.
Figure D-1. Purge TC transaction flow: the initiating processor (Proc 0) sends SpcPurgeTC to the purge agent; the purge agent sends SpcPurgeTCFwd to Proc 0, Proc 1 and Proc 2; each processor responds with Cmp; the purge agent then sends the final Cmp to Proc 0.
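A sketch of the purge-agent side of this flow (C; the state layout and the send/queue helpers are illustrative, not part of the specification):

    #include <stdbool.h>

    #define MAX_PROCS 8   /* processors in this TLB coherence domain (assumed) */

    struct purge_agent {
        bool op_in_progress;          /* only one purge TC at a time       */
        int  initiator;               /* source of the current SpcPurgeTC  */
        int  cmp_pending;             /* Cmp responses still outstanding   */
        int  procs[MAX_PROCS], nprocs;
    };

    /* Hypothetical message helpers. */
    void send_spc_purge_tc_fwd(int proc);
    void send_cmp(int dest);
    void queue_purge_request(struct purge_agent *pa, int src);

    void on_spc_purge_tc(struct purge_agent *pa, int src)
    {
        if (pa->op_in_progress) {             /* serialize purge operations */
            queue_purge_request(pa, src);
            return;
        }
        pa->op_in_progress = true;
        pa->initiator = src;
        pa->cmp_pending = pa->nprocs;
        for (int i = 0; i < pa->nprocs; i++)  /* fan out, incl. initiator   */
            send_spc_purge_tc_fwd(pa->procs[i]);
    }

    void on_cmp_response(struct purge_agent *pa)
    {
        if (--pa->cmp_pending == 0) {
            send_cmp(pa->initiator);          /* completes the purge TC op  */
            pa->op_in_progress = false;
        }
    }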
SpcPurgeTC and SpcPurgeTCFwd requests can be sent to any type of agent in the system, depending on the purge agent and the agents in a TLB coherence domain, including non-processor agents. All agents must recognize these requests and respond with a Cmp response.
The configuration parameters associated with the purge TC operation include a purge agent identifier at each agent that initiates SpcPurgeTC requests and receives SpcPurgeTCFwd requests; the agents receiving SpcPurgeTCFwd requests send the corresponding Cmp response to the purge agent indicated by the purge agent identifier in that agent. The purge agent provides a way to specify the TLB coherence domain, which must identify all the agents associated with the processors responsible for handling the purge TC operation in the TLB coherence domain. These configuration parameters are programmed by the firmware during system or agent initialization. The purge agent identifiers must indicate the same purge agent within a TLB coherence domain. Run-time changes in the designation of the purge agent in the system can be done by synchronizing all processor agents in firmware, preventing any operating-system-initiated purge TC operation from being initiated while the purge agent identifier is being changed at each agent in the TLB coherence domain.

D.2 CSI Component Initialization Requirements
CSI-based components need some initialization steps before they can interact with each other or with other platform components. Some of the high-level features and initialization requirements are indicated below:
• CSRs for the various Participant Lists need to be set up before sending PTCs to other caching agents.

D.2.1 Programming of CSI Structures
6. Boot Mode Determination: An SSP can obviate a number of CSI structure programming steps that follow boot mode determination by co-operating with the configuration agent to achieve link initialization and then initializing the CSI-specific structures in the platform, including the PTC participant lists.
7. Enabling Coherence Traffic: Processors belonging to the same OS partition set up their CSI structures consistently, including the Participant Lists for PTC. Coherence traffic is enabled only after this stage.

D.2.2 Online Addition of a Processor Node (With or Without Other Agents)
The steps for online addition are quite similar with and without the presence of an SSP; a detailed description of the CSI-specific steps is given below. The steps performed by the SSP, the OS, firmware and other software layers are described at a high level only. When firmware executing on the RS is notified that the OL_A is ready to join the partition(s): if the OL_A needs to join the OS partition of the RS, the OL_A and the RS update their system address maps and address decoders to incorporate the new resources. The RS may need to revise the participant lists (for snoops, PTC, Sync, Lock, etc.) in the various CSI agents of the OS partition; if these lists are not revised in an atomic or consistent manner, protocol violations may occur.

D.2.3 On-Line Deletion of a Processor Node
After removal of the OL_D node is requested and the OL_D node is off-line, firmware will perform a quiesce to ensure that all in-transit transactions, including PTCs, to the QP are completed.
D.3 Open Issues/Notes
8. Sub-socket partitioning: There are several open issues, especially with the firm partition id approach, since it is a late add-on.
   a. ptc.g: the virtual address field is fully used; hence, there is no space for a firm partition id.

Table D-2. CSI Profile Attributes (column order as in Table B-1: UP / DP / Small MP / Large MP / IA-32 / Itanium)
Link Layer
- Packet format PTCG: N/A / N/A / N/A / N/A / N / R (Revisit when the PTCG issue is closed.)
Routing Layer
- Purge TC transaction: N/A / N/A / N/A / N/A / N / R (See above; same item.)

E.1 Post-Si Validation for CSI
FOR INTERNAL USE ONLY
CSI is a very different paradigm for system interconnects from the shared-bus (FSB) interconnect in use in current systems. In parallel system bus architectures, all system agents have visibility into every bus transaction that happens in the system. CSI introduces point-to-point links between system agents, and borrows heavily from network engineering principles and protocols to implement a layered protocol architecture, with system agents interconnected through one or more CSI links. As we move to more sophisticated interconnect protocols like CSI, it becomes very important to have post-silicon validation and debug requirements captured early in the spec generation cycle; these should expose the needs of product backend development to the designers and implementers of the various subsystems in CSI agents. This sub-section describes the requirements from the post-Si validation teams. The contents of this section have been generated with inputs submitted by post-Si validation Task Force members from the SV, CV and CMV teams in MPG, EPG and DPG. The key areas of focus are:
• Event monitoring/counting for performance measurement and statistics of link transactions and behavior.
• Stress testing of the Protocol layers, including error injection. This section also includes a proposal for an external stress agent, which would allow a great deal of flexibility in generating and injecting stress traffic through an external agent. This is an initial proposal and needs further follow-up on design and implementation by the System Validation groups.
Section E.1.1 summarizes the list of requirements, while the remaining sections describe many of those items in some detail.

E.1.1 Summarized List of CSI Post-Si Validation Requirements
• Power management validation: support for testing various levels of power management functionality, with the equivalent of signals like PROCHOT, FORCEPR, BPMx, GVxs.
• Cache coherency and conflict resolution: enough hooks/visibility to validate these system-level functions. These could be met through special debug packets containing the necessary information.
• Frequency margining/schmoo: needed for system and speed-path validation/debug. We want to margin the speed of the links, and we need to know the limits (granularity/accuracy) on this.
• Broadcast trigger: capability to generate a trigger condition from test software, and to broadcast the trigger to a probing device, to initiate capturing of transactions on all links.
• Stress event injection: to enable post-silicon validation to induce corner cases that would otherwise be difficult or impossible to create in an SV or CV environment. The injection of these events can be initiated in response to a match/mask/counter event or based upon a programmable duty cycle event.
• Error injection mechanisms covering all of the defined CSI errors. Link errors: CRC errors, framing errors, the ability to force errors during training, and the ability to force a Link Level Retry packet. All detectable and correctable errors are to be covered.
• Backpressure: the ability to create an artificial backup in one direction (inside the chip).
• Virtual channel stalls: creating conditions that will force stalls on virtual channels (a rare possibility in real systems?), in particular on the “ordered” ones (like the Home channels).
• Programmable buffer lengths: buffers sized for maximum-latency cases (including LA repeaters and external cable drivers) must provide a mechanism to reduce the buffer size. This mechanism allows SV tests to hit buffer-full conditions that would otherwise be impossible to hit in a system that does not exhibit maximum latency.

E.1.2 CSI Monitoring Events
CSI event monitoring comprises a set of on-chip counters and the instrumentation logic necessary to select and count various events related to the operation of a CSI link. The implementation details of the counters and associated logic are beyond the scope of this specification; however, the specification does define a list of events that must be implemented on each CSI port of any device. This ensures a consistent definition and implementation of monitoring events across multiple CSI devices in different product segments and from generation to generation. Traditionally, performance analysis groups have used on-chip counters and monitoring events to study system characteristics and to optimize system and application software. More recently, post-silicon system validation groups have begun using event monitoring for several purposes: test coverage and stress analysis, visibility of link behavior during system debug, and triggering of debug response functions in order to work around bugs. The following sections describe these three use models in more detail.

E.1.2.1. Test Coverage and Stress Analysis
Relative to pre-silicon validation, post-silicon system validation has much less visibility into what is occurring within a component. Without internal monitoring events, it is often impossible to know whether certain corner cases or stress levels are being hit by validation tests; and due to the expense of logic analyzers, it is also usually difficult to obtain this type of information for external interfaces. On-chip event monitoring has greatly improved system validation's ability to “look” into the component during validation and to verify that the intended conditions and stress levels are being hit.
One of the simplest monitoring functions is counting the number of occurrences of a given event. This is often used to count things like the number of packets transmitted or received, how many of each type of packet was sent, the number of data bytes transmitted, etc. When making these types of measurements, it is valuable to have multiple counters (e.g., many chipsets implement 8 counters), in order to capture multiple related events within a single sample period, especially when there is significant variation between samples.
A second, more complex monitoring function involves histogramming queue fill levels over time, which provides a good measure of how well tests are stressing internal logic paths. Often, special tuning or other changes are required in order to hit the queue-full conditions needed to exercise back-pressure mechanisms.
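A minimal sketch of such an occurrence-counter bank (C; the eight-counter size echoes the chipset example above, while the event-select encoding is an assumption):

    #include <stdint.h>

    #define NUM_COUNTERS 8

    struct event_counter {
        uint32_t event_select;   /* which monitored event increments this counter */
        uint64_t count;
    };

    static struct event_counter bank[NUM_COUNTERS];

    /* Invoked by the monitoring logic each time 'event' occurs; several
     * related events can thus be captured within one sample period. */
    static void on_event(uint32_t event)
    {
        for (int i = 0; i < NUM_COUNTERS; i++)
            if (bank[i].event_select == event)
                bank[i].count++;
    }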
Figure E-1 shows an example queue-fill histogram for the Front Side Bus In-Order Queue.
Figure E-1. Histogram for the FSB In-Order Queue: percentage of samples (0 to 35%) at each queue fill level (0 to 12 entries), for two tests, Test1 and Test2.

E.1.2.2. Visibility of System Behavior During System Debug
Most systems running post-silicon test applications are not equipped with logic analyzers. When a failure occurs, it is often beneficial to probe internal and external state through a JTAG or Extended Debug Port (XDP), which can be connected to a system after the fact. Events that provide visibility into CSI link activity and internal queue and control state are extremely valuable for giving details that can lead to quicker problem resolution.

E.1.2.3. Triggering Debug Response Functions
Debug response functions are implementation-specific hooks that are typically defined by design teams as a means to alter component behavior, usually for the purpose of inducing some corner-case event or as a potential way to work around bugs in the field. Examples would include injecting snoop stalls on the Front Side Bus, blocking a particular internal queue to create back-pressure conditions, or periodically injecting correctable errors on a CSI link. A very powerful way of controlling these functions is to tie their activation to the component's event monitoring logic, which typically may have hundreds or thousands of internal signals and events that can be monitored. Generating an “event out” signal that triggers a selected debug response function provides a tremendous amount of flexibility that is extremely useful for system validation, as well as a means of surviving bugs discovered in the field. The detail of what debug response functions are needed for a given component is beyond the scope of this specification; however, it is highly recommended that the event monitoring counters be usable as at least one means of activating any response functions that are implemented.

E.1.2.4. CSI Monitor Events
The following lists the minimum set of events which are required for each CSI link in any device.
Link Utilization:
• Header packets transmitted / received (including Requests, Snoops, Responses, IO, Isoch, etc.)
• Header packet counts by Type, Addr Range, Data Length, Txmt/Rcv, VC, etc., and by Message Class, Opcode, Virtual Network, DestNID, RHNID, Requestor TID, etc.
• Data packets transmitted / received (bandwidth)
• Interrupts
Link Efficiency:
• Stalls due to lack of available credits
• Special (Link) packets sent / received
Resource Utilization:
• Queue fill levels, to enable queue-depth histogramming (GQ, Tracker, etc.)
• Flow-control credit levels, to enable flow-control credit histogramming
• QueueNotEmpty and QueueExit/Pop events, to enable head-of-queue histogramming
These event types are tied into the PerfMon implementation, so any changes to PerfMon (from the Blackford style) must track the events used to drive them.
Quality of Service Indicators:
• Average / worst-case isochronous latency
Error Monitoring:
• Link-level retries
• Error counts for each type of link-level and Protocol layer error
Other Event Monitoring:
• Power management messages transmitted / received
• Transmitter/receiver power state (L0, L0s, etc.)
• LT reset events

E.1.2.5. Match/Mask Registers
A set of software-programmable match/mask registers is required to select the monitoring events to count.
The specific format of these registers is TBD. Ideally the CSI spec should provide a programming model that enforces consistency across CSI products, but this starts to get into implementation space. The output of the match/mask logic can be fed into the event counters and/or can be used to generate other events such as Inband Debug Packets, BPM events, error injection events, etc. Refer to the description of Inband Debug Packets elsewhere in this specification.

E.1.3 Event Counters
Each CSI link on a component is required to implement at least two event counters to count the events listed in Section E.1.2.4. The queue and credit histogramming functions described above may require a pair of counters: one to track the current fill level, and the other to count the number of clocks that the level exceeds a programmed threshold value. As an example, with such a two-counter implementation, a four-deep queue would require four samples to collect the data needed for a queue-depth histogram; this implies an assumption that the workload is such that consistent samples can be obtained over time. Additional counters provide the flexibility to count multiple events or queue threshold levels within the same sample period, which eliminates the uncertainty inherent in taking multiple samples over a period of time.

E.1.3.1. Counter Functional Requirements
Counter widths should be a minimum of 32 bits, to enable sample periods of several seconds for most events.

E.1.3.2. Histogramming Requirements
In order to generate queue-depth and head-of-queue histograms, special functionality must be designed into the counters, and it is dependent upon the way in which queue events are implemented. One common way to implement this function is to provide three event signals from each queue: QueueEnter/Push, QueueExit/Pop and QueueEmpty. One counter monitors these three signals to track the queue's current fill level; a second counter has an associated threshold value and increments whenever the first counter exceeds the threshold. Multiple counter pairs can be used, or multiple samples can be taken with different threshold values, to gather the data needed for a histogram. An alternate way to implement the histogramming function is to provide queue fill level bits from each queue (e.g., an 8-deep queue would require 4 bits to encode the values 0 to 8). Again, a programmable threshold value is needed to compare with the queue fill level bits; a counter would increment whenever the threshold is exceeded. A simplification of this second technique is to provide event signals for predefined threshold levels for each queue. For example, each queue might make the following 5 events available to the event counters: QueueEmpty, Queue>1/4Full, Queue>1/2Full, Queue>3/4Full and QueueFull. With this implementation, each counter simply increments whenever its selected queue-fill event is asserted.

E.1.3.3. Programmable Duty Cycle Generator
Another function required for the event counters is the ability to use a pair of counters to generate an “Event Out” signal with a programmable duty cycle. This Event Out signal is used to activate various debug response functions related to the link. TBD: duty cycle requirements.
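A sketch of the counter-pair scheme from Section E.1.3.2 (C): the fill level is derived from the three queue events, and the companion counter accumulates clocks over threshold; sweeping the threshold across runs, or using several pairs, yields the histogram.

    #include <stdint.h>

    struct queue_monitor {
        uint32_t fill;         /* current fill level                  */
        uint32_t threshold;    /* programmed fill-level threshold     */
        uint64_t over_clocks;  /* clocks with fill > threshold        */
    };

    static void on_queue_enter(struct queue_monitor *m) { m->fill++; }
    static void on_queue_exit(struct queue_monitor *m)  { m->fill--; }
    static void on_queue_empty(struct queue_monitor *m) { m->fill = 0; }

    /* Sampled by the monitoring logic on every clock. */
    static void on_clock(struct queue_monitor *m)
    {
        if (m->fill > m->threshold)
            m->over_clocks++;
    }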
E.1.4 Error Injection
The fundamental philosophy behind the error injection functionality is that system validation must have a means of generating any type of error that can normally be detected by agents on a CSI link; otherwise there is no way to validate the error detection logic. Past methods, such as temporarily shorting signal lines to ground, are not effective for creating the various error types that can occur on a high-speed serial interconnect like CSI. In addition, CSI supports certain correctable error types, which need to be tested much more thoroughly than fatal error types. The error detection and recovery mechanisms used for correctable errors must guarantee continued correct operation of the system regardless of when or where a correctable error may occur; validating this requires much more extensive error generation than has typically been used for fatal error types. This specification makes the following error injection requirements for CSI devices:
• Link CRC errors:
  - Force an arbitrary bit error pattern in a CSI packet.
  - Force a bit-lane error in any lane, causing a downgrade to a narrower-width link.
  - Force the link-layer retry threshold to be reached, causing re-training of the link.
  - Ability to force bit errors during training.
• Link protocol errors.
• Transaction protocol errors.
A general mechanism would allow the specification of a packet bit or bits to be inverted or set to a specified state. The mechanism must be able to selectively invert bits both before and after CRC generation, to enable injection of both Link layer-detected and Protocol layer-detected errors. When an error is injected after CRC generation, the CRC check of the packet fails and link-level recovery is initiated; when an error is injected into the packet prior to CRC calculation, the CRC check passes and higher-level errors (illegal packets, unsorted packets, etc.) can be generated.
Initiation of error events is a crucial consideration for enabling a robust error injection methodology in post-silicon testing. Unlike the relatively simple single-bit error-correction scheme used on many memory subsystems, a correctable error on CSI involves replaying a transaction sequence, which potentially involves many links and agents in the system. During system validation testing, it is important to have a means of automatically injecting correctable errors during heavy system stress workloads, without software intervention that could alter the workload. Implementing an error injection circuit as a “debug response function” (see Section E.1.2.3, “Triggering Debug Response Functions”) provides a way to use a device's on-chip counters as a programmable duty cycle generator which can trigger injection of errors into a system. Once the counters and error injection logic are set up, no additional intervention is required to get periodic errors injected on a link; this allows a validation system to run workloads for extended periods of time and verify that no harmful effects occur as a result of the correctable errors.
A method to corrupt a specific packet must also be provided: the trigger to inject the error would be based on a header match/mask, and the error must be injected into the matching packet. Finally, a mechanism for injecting errors into LLR-retried packets is required to fully validate the recovery mechanism; this mechanism must allow specification of the number of LLRs that will be corrupted.
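A sketch of the pre- versus post-CRC inversion choice (C; crc16() stands in for whatever CRC the link actually carries, and the header match/mask plumbing is assumed to have already selected the packet):

    #include <stddef.h>
    #include <stdint.h>

    uint16_t crc16(const uint8_t *buf, size_t len);   /* placeholder CRC */

    static void inject_bit_error(uint8_t *pkt, size_t len, size_t bit,
                                 int after_crc, uint16_t *crc_out)
    {
        if (!after_crc)
            pkt[bit / 8] ^= (uint8_t)(1u << (bit % 8));  /* before CRC generation:
                                                            receiver CRC check
                                                            passes; exercises
                                                            Protocol layer errors */
        *crc_out = crc16(pkt, len);
        if (after_crc)
            pkt[bit / 8] ^= (uint8_t)(1u << (bit % 8));  /* after CRC generation:
                                                            receiver CRC check
                                                            fails; exercises
                                                            link-level retry      */
    }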
E.1.5 Diagnostic Information Registers
Each CSI link is required to implement software-accessible registers that provide diagnostic information on the current state of the CSI link; these registers must be accessible via JTAG and configuration space. The state information includes:
• LLR buffer: ESeq, WrPtr, RdPtr, NumFreeBuf, NumAck
• Flow control: CREDIT_CONSUMED, CREDIT_ALLOCATED, CREDIT_LIMIT, etc.
• Software visibility of Tracker and other structure entries
The examples above are not intended to be exhaustive.

E.1.6 Programmable Configuration Overrides
• Force L0 state (i.e., disable power-saving modes)
• Force channel mirroring (Lane Reversal) on the transmit side
• Force Polarity Inversion on the transmit side
• Programmable lane skew on the transmit side
• Ability to re-initiate training via s/w, with a new width, polarity or skew, after training
• Ability to limit the number of credits to less than the device provides
• Ability to limit the quadrant combinations supported by the link
• Disable link width modulation
• Always use FrcAckCnflt response (vs. Cmp)
• Disable “F” state: the caching agent uses the “S” state whenever it receives a DataF response
• Disable “E” state: the caching agent goes directly to M
• Turn off error checking, per error type
• Programmable link width
• Programmable LLR FIFO length

E.1.7 Programmable Timer/Counter Values
In order to test certain features in post-silicon validation, it is often necessary or extremely helpful to make default timer and counter values programmable rather than hard-coded. These include:
• Programmable flow control update rate
• Programmable ACK watermark
• Programmable replay timer
• Programmable Physical layer timers
• Programmable completion timeout value
• Programmable flow control credit values

E.1.8 Event Injection
The purpose of stress event injection is to enable post-silicon validation to induce corner cases that would otherwise be difficult or impossible to create in an SV or CV environment. The injection of these events can be initiated in response to a match/mask/counter event, or based upon a programmable duty cycle event, as described below.

E.1.8.1. Artificial Backpressure
One useful event that can be injected in response to a duty-cycle counter assertion is an artificial backpressure condition in the system. Artificial backpressure is used to stress the stage previous to the one indicating that it can no longer accept new requests; this can be used to check for buffer overflow and fairness, and to artificially fill queues. The use case for this feature is not to stress the CSI interface itself, but to stress the agents that use CSI to communicate.
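A sketch of the duty-cycle counter driving such an injection (C; the period and high-time registers are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    struct duty_cycle_gen {
        uint32_t period;     /* clocks per duty cycle (must be > 0)  */
        uint32_t high_time;  /* clocks with Event Out asserted       */
        uint32_t clk;
    };

    /* Evaluated every clock; while it returns true, the selected debug
     * response function (here: artificial backpressure) is applied. */
    static bool event_out(struct duty_cycle_gen *g)
    {
        bool asserted = (g->clk < g->high_time);
        g->clk = (g->clk + 1u) % g->period;
        return asserted;
    }

    /* e.g.: if (event_out(&gen)) stall_previous_stage();  (hypothetical hook) */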
E.1.8.2. Virtual Channel Stalls
There are certain corner cases that rely upon sequencing of packets on different virtual channels in a way that would normally be rare. For example, one scenario involves the home node receiving a RspFwd* response before it receives the corresponding request from the requestor. Normally this would be a relatively rare occurrence, but with the virtual channel stall mechanism, the requestor node could be set up to periodically stall home channel requests. The mechanism should allow selection of the transactions to be delayed by virtual channel, message class and destination, in order to create conditions like that in 7-18 of CSI spec 0.55. The long delay of the SnpInvOwn from node A to B creates a condition where FrcAckCnflt is required to resolve. Because these stalls are temporal conditions, the system should continue to function correctly, although at a reduced performance level.
This mechanism also has potential value as a “survive” feature, which could be used to break a livelock condition or to work around some other bug. TBD items: need to define virtual-channel mask and control registers that select which channels are delayed, and by how much; need to develop a diagram showing the relationship between events/counters/duty-cycle counters and the event injection functions.

E.1.9 CSI HUB-Based System Validation Concept
E.1.9.1. Introduction
Link-based serial interconnect protocols force a quest for novel system validation methodologies. We are convinced that, in the era of serial-interconnect computing systems, the system validation methods must themselves become an inalienable part of the platform; in other words, they must be based on the serial-interconnect principles as well. That principle was first adopted in the Hublink-based platforms (Hublink being a predecessor of PCI-Express) for validation of the Timna processor and the Odem MCH: the Eilat Transaction Generator was implemented as a HUB-like agent plugged into the Hublink. The concept was significantly extended and grounded for PCI-Express (the Spider concept). This document commences the analysis of the applicability, and the benefits, that a HUB-based validation methodology may bring to CSI-based systems. We further discuss the concept of a CSI hub-based validation agent (hereafter abbreviated as HVA) in the subsequent sections.

E.1.9.2. HVA Validation Scope
The HVA may be used for stress-mode and highly controllable cache snooping, controllable creation of coherency conflicts, exercising of the CSI link and Protocol layers, and power management exercising. It will not require (or will require only to a minimal extent) DFT support in the processor and MCH. In addition to exercising and stress capabilities, the original link traffic may be monitored for observation and coverage analysis. Being directly linked to the chip-under-test (CPU or MCH), the HVA allows bringing about more events, virtually covering the whole CSI spec, with higher preciseness and intensity, thus implementing the principle of validating for the spec and not for the product. Eventually it allows faster identification of problematic scenarios, and it may be extremely instrumental for chips-under-test designed to operate on different platforms (e.g., a single-processor design intended to inter-operate with multiple MCHs). The HVA allows exposing the processor to an extended variety of events and transactions on a single platform, with no need to run tests on numerous platforms.

E.1.9.3. General Structure
The structure of a system utilizing the HVA for post-silicon validation is shown using the example of a processor and MCH pair (Figure E-2). Generally the principle is applicable to any two components communicating through a CSI link. The Hub Validation Agent (HVA) splits the original CSI link into two independent and physically decoupled CSI links. The original transaction stream coming from the MCH is queued in the HVA prior to being transmitted to the processor; processor-originated traffic is handled in a similar way. In this sense the HVA behaves as a real HUB, queuing incoming traffic prior to transmitting it to the destination. Neither the processor nor the MCH should be “aware” of the presence of the HVA, so that the behavior of the whole system remains undistorted; in other words, the HVA should not be visible in the configuration space.
Such transparency may be achieved by taking care of a few HVA features (detailed in E.1.9.5); providing transparency of the initialization process is the first of these. Once initialization is done, the processor perceives the HVA as the MCH component, while the MCH perceives the HVA as the processor component. Injection traffic may be transmitted in either direction (to the processor or to the MCH) as a result of a trigger event or a SW command. Triggering may be programmed as a result of either an in-band CSI transaction/event, or signaled off-band.

Figure E-2. General Validation Structure (transparent for the CPU and MCH components): the HVA sits between the CPU and the MCH on two CSI links, with a host, SRAM, injection memory, and a connection to a logic analyzer.

E.1.9.4. HVA Layered Architecture
The HVA layered structure is shown in the figure below.

Figure: HVA layered structure - an Application (SV) layer on top of two CSI layered stacks, mixing original packets with injected packets.

The HVA incorporates two CSI layered stacks, each responsible for inter-operation with its linked component. The Application layer, which is in essence an SV layer, incorporates two sub-layers for system validation of the corresponding linked components (e.g., processor and MCH). The SV layer basically incorporates the major SV machines and triggering mechanisms. The original Protocol layer traffic flowing from the processor to the MCH (shown in blue lines in the original figure) is mixed with the injected protocol packets in the SV layer. Both traffic streams must be queued, and the arbitration algorithm must be configurable, granting prioritization to the high-priority original stream (e.g., isochronous traffic). Different algorithms (round-robin, time-based prioritization like in PCI-Express, leaky bucket, token bucket) may be considered for accurate mixing of the transaction streams. A configurable choice of algorithm (as implemented in PCI-Express) is another option to be evaluated. The purpose of the algorithm choice is to balance between sufficient bandwidth for the injected transaction stream and minimal delay for the high-priority (e.g., isochronous) original traffic.
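A sketch of one of the candidate algorithms, a token bucket that rate-limits the injected stream while the original stream always keeps priority (C; all parameters are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    enum grant { GRANT_NONE, GRANT_ORIGINAL, GRANT_INJECTED };

    struct token_bucket {
        uint32_t tokens, capacity;
        uint32_t refill_per_clock;   /* bandwidth allowance for injected packets */
    };

    /* Called each clock with the state of the two input queues. */
    static enum grant arbitrate(struct token_bucket *tb,
                                bool original_pending, bool injected_pending)
    {
        tb->tokens += tb->refill_per_clock;
        if (tb->tokens > tb->capacity)
            tb->tokens = tb->capacity;
        if (original_pending)
            return GRANT_ORIGINAL;           /* high-priority original stream */
        if (injected_pending && tb->tokens > 0) {
            tb->tokens--;
            return GRANT_INJECTED;
        }
        return GRANT_NONE;
    }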
In order to inject Data Link packets for Link layer and power management exercising, the proper interface must be provided, enabling the SV layer to request that the Data Link layer transmit a specific packet over the link (the figure shows the Physical, Data Link, Routing, and Protocol layer stacks with the original packet and injection packet paths). In addition, this interface allows transparent propagation of the original in-band power management sequences mentioned earlier, flowing through the CSI links (Figure ). Notice that the reason power management link messages must be propagated from link to link through the HVA is the one explained in Section E.1.9.3: the HVA should not be a real part of the system, and it should thus enable smooth power management interoperation between the processor and the MCH.

Figure E-4. HVA Data Link Level Traffic (requests from the Application (SV) layer to the Data Link layer, and the resulting HVA Data Link layer traffic).

E.1.9.5. HVA System Impact
In order to prevent negative impact on the whole system, the HVA should satisfy several requirements, which are detailed below. Not following these rules will inevitably cause distortion of the system operations and may disqualify the whole approach for system validation.

E.1.9.5.1 Flow Control
The HVA Data Link layer queuing/buffering ability per transaction class must be at least the same as that of the MCH or the processor, accordingly. To satisfy this requirement, the HVA northbound Data Link layer should incorporate the same queuing-per-class capabilities as the MCH, while the HVA southbound Data Link layer should mimic the processor's queuing-per-class capabilities. In particular, all classes supported in the processor must be supported in the southbound Data Link layer, while all classes supported in the MCH must be supported in the northbound Data Link layer. This ensures that an HVA plugged into the link will not create flow-control problems or traffic stalls to any greater extent than would occur if the processor and MCH were linked directly to each other (a minimal code sketch of this mirroring appears at the end of this section).

Figure E-5. HVA flow-control mirroring (the northbound Data Link layer mimics the MCH queuing/classes and contains the same transaction classes; the southbound Data Link layer mimics the CPU queuing/classes and contains the same transaction classes).

E.1.9.5.2 Coherency Conflicts
The CSI protocol is full of inherent coherency conflicts. The expectation is that the presence of the HVA must not create additional conflicts, as that could make system debug more complicated. Initial analysis shows that, provided the HVA tracks the already existing conflicts between the processor and the MCH, it can easily avoid creating new ones. More analysis is needed on this feature to make sure that corner cases are not missed.
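Relating back to the flow-control requirement of E.1.9.5.1, here is a minimal C sketch of the mirroring rule: the HVA sizes its per-class buffering to match the component it stands in for. The class count and names are illustrative assumptions.

    #define NUM_CLASSES 4          /* illustrative number of transaction classes */

    typedef struct {
        int depth[NUM_CLASSES];    /* queue depth (credits) per transaction class */
        int used[NUM_CLASSES];
    } class_queues_t;

    /* Northbound side mirrors the MCH, southbound side mirrors the CPU:
     * copy the mirrored component's per-class depths verbatim so the link
     * sees exactly the buffering it would see without the HVA. */
    static void hva_mirror(class_queues_t *side, const int mirrored_depth[NUM_CLASSES]) {
        for (int c = 0; c < NUM_CLASSES; c++) {
            side->depth[c] = mirrored_depth[c];
            side->used[c]  = 0;
        }
    }

    /* Accept a packet of class c only if the mirrored component would have. */
    static int hva_can_accept(const class_queues_t *side, int c) {
        return side->used[c] < side->depth[c];
    }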
E.1.9.5.3 Initialization Sequence
It has already been mentioned that accurate behavior of the HVA during the initialization sequence is a premise of its subsequent transparency in the system. As the first step, the HVA must behave correctly to ensure that the initialization sequence completes successfully. To achieve this, the HVA northbound Physical layer should act and respond as the MCH's Physical layer would, while the HVA southbound Physical layer should act and respond as the processor's Physical layer would. Notice that the initialization process here is not transparent but runs simultaneously on two links. If the initiator of initialization (according to the CSI spec) is the southbound link component, then the processor begins initialization simultaneously with the southbound Physical layer of the HVA, while the northbound Physical layer of the HVA responds to the initialization launched toward the MCH (Figure E-6).

Figure E-6. HVA PHY Initialization Behavior (the northbound PHY layer responds to the initialization sequence launched by the CPU; the southbound PHY layer launches the initialization sequence, to which the MCH responds; the two processes run simultaneously; initialization on the CPU-to-HVA link may be launched alone, with strapping supplying the parameters).

The parameters which the HVA southbound Physical layer must use for initialization must be strapped or hard-wired at cold reset. If some parameters must instead be SW-programmed for initialization, the recommended technique is to cold-reset (power-good) the system first, bring the link up to access the specific Physical layer registers, and finally warm-reset the system to apply the programmed parameters to the initialization process. Notice that the link-level initialization process between the HVA and the MCH may be launched with no connection from the processor to the HVA link, if some parameters (e.g. the number of lanes) on the HVA-to-MCH link must be changed.

E.1.9.6. Multi-Link System Applicability
Applying the HVA approach to multi-linked server systems brings its own challenges. To keep the coherency of the whole multi-linked system intact, an additional synchronization (control) layer is probably required (Figure E-7).

Figure E-7. HVA in the Multi-linked System (four CPUs, the MCH, and a control layer implemented in an FPGA).

More analysis is required to check all aspects of multi-linked coherency. The purpose of this analysis must be a search for the optimum among three major parameters: the number of HVA(s), the complexity of the control layer, and the effectiveness of the system validation. For example, plugging an HVA into every link creates a fully-connected graph, which brings about effective validation but at the expense of a more complicated control layer. So the HVA probably must be plugged into specific links, keeping the control layer relatively simple while still reaching the POR validation goals.

E.1.9.7. HVA Implementation
The HVA implementation is layer-based and relatively straightforward. Some layers may be adopted from the processor and MCH designs, making the design process even easier. One of the implementation options is shown in the figure below.
Figure E-8. HVA Implementation Structure (STACK ASICs for both CSI links, an SV FPGA (e.g. VIIP) with interfaces to/from the Data Link and Protocol layers, SRAM(s) for injections and store-mode, and the HOST interface).

The STACK ASIC may be built based on the corresponding stack of the processor or the MCH. The northbound STACK must be based on the MCH logic, while the southbound STACK must incorporate the processor layers. The Data Link layer and the Protocol layer must support and interface with the SV logic, and thus must be re-designed in their interface portions. Flexibility is another important point of the HVA methodology: the stack of layers must be implemented in the ASIC, and most of the layer portions (or even complete layers) may be cut and pasted from the MCH and processor designs. The system validation logic (the Application layer) may be implemented in an FPGA (or in two FPGAs, depending on pin count and area utilization), thus increasing the flexibility of the whole system.

E.2 Further Information (for Intel use only)
For the latest information, engineers should refer to the following web sites:
• CSI eroom: http://eroom.fm.patch.intel.com/eRoom/DPG/CSI. This site contains relevant information about all aspects of the CSI structure.
• IO Test and DFT website: http://dt-sharepoint.sc.intel.com/IODFT_JET/. The I/O DFT JET focuses on applications of testing and characterization methodologies of the I/O pad circuitry.

E.3 DF Manufacturing Reference
It is intended that the circuit and mask designers follow the Intel layout guidelines that minimize risk to the manufacturing of the I/O circuits on the 86x process. All information is available at the following Web site; password access is required, and application directions are posted there: http://www-graphics.al.intel.com/DFM/default/htm

E.4 Tester DV Further Information
• For a detailed understanding of the DFT architecture, refer to the Physical Layer DFx features section (Section ).
• For a detailed understanding of the circuits and design blocks, refer to your product A-Specs. For concepts, start with the Physical Layer chapter in this document, which also contains the CSI Electrical Specifications. (Contact: Santanu Chaudhuri)
• For the Guideline CSI Electrical Test Plan and the latest in DV equipment (currently being worked out; Rev0.1 available ~WW50 ’03), visit the IO Testability WG website. (Contact: Sunil Jain)

F An Implementation Agnostic Model of CSI 2-Hop Source Broadcast Coherence

F.1 Introduction
Chapter 8, “CSI Cache Coherence Protocol” describes the CSI cache coherence protocol at a conceptual level and leaves the protocol details out on purpose. This appendix attempts to provide more details by defining an implementation-agnostic model (CSI-IAM, for short) for the 2-hop source broadcast coherence version of the protocol; the CSI-IAM for the 3-hop home broadcast coherence version of the protocol is described in Appendix G, “An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence”.
The primary purpose of CSI-IAM is to provide an unambiguous and maximally permissive specification of the allowed message sequences of the CSI cache coherence protocol. “Maximally permissive” means that any sequence of coherent protocol messages allowed by any CSI implementation is also allowed by CSI-IAM, though a typical CSI implementation will exercise only a portion of the behaviors allowed by CSI-IAM. CSI-IAM is specified using state machines coded in a formal description language called Murphi, which was developed at Stanford University and is publicly available (http://verify.stanford.edu/dill/murphi.html). The syntax of Murphi is similar to that of Ada or Modula, and its semantics is that of standard imperative programming languages, so any computer engineer should be able to read Murphi code without prior experience with the language. In any case only a small set of language constructs is used in CSI-IAM, and we will explain any constructs that may be new to a typical computer engineer. The Murphi manual is included in its distribution, available from the URL given above. Although CSI-IAM is ultimately specified using Murphi, an intermediate-level representation of the protocol logic called protocol tables (p-tables, for short) is introduced in this appendix. The tabular format strikes a good balance between an informal English description and formal Murphi code. Each p-table typically specifies what an agent does internally and what messages it sends out upon receiving a message of a particular type. The meaning of a p-table is made precise by a semantic mapping that attaches a Murphi code fragment to each cell in the p-table, with the understanding that each row in the p-table represents a possible atomic transition of CSI-IAM and the Murphi code fragments specify what that atomic transition is. Thus, whenever the reader is unsure what a cell in a p-table means, the precise meaning can quickly be found by consulting the semantic mapping. Our experience suggests that, once a reader gains a preliminary familiarity with CSI-IAM (in particular, with how the system state is represented), the reader will be able to stay at the p-table level almost all the time without continually looking up the semantic mappings. The set of p-tables also serves as a concise summary and quick reference of the protocol. As mentioned earlier, the primary purpose of CSI-IAM is to precisely specify the set of allowed message sequences of the CSI cache coherence protocol; it does not dictate or constrain the flexibility of CSI agent implementations. However, CSI-IAM does provide hints on how CSI agents can be implemented. In particular, the data structures used by CSI-IAM, though not directly implementable, point out what state information needs to be tracked by each CSI agent and how that information is used for interaction with other agents. Given that understanding, the optimal microarchitecture for the system configurations under consideration can be devised. CSI-IAM is deterministically executable, in the sense that given a system state and an action with its parameters (actions and parameters will be defined shortly), the next system state is uniquely determined.
Exploiting this property, Section F.10, “A C Reference Model Derived from CSI-IAM” on page F-595 describes an executable reference model in C derived from CSI-IAM that can form the basis of a protocol rule checker or a protocol traffic generator for the CSI cache coherence protocol.

F.2 What CSI-IAM does and does not cover
CSI-IAM covers all protocol rules for ensuring cache coherence in the Basic CSI protocol (with source snooping). It does not cover non-coherent transactions or the Scalable CSI protocol (with home snooping), nor does it model any features below or beyond the Protocol layer. CSI-IAM does not model implementation dependencies that may cause deadlock or livelock, since such dependencies are specific to particular microarchitectures and cannot be captured in a single, implementation-agnostic model. Furthermore, in order to make CSI-IAM maximally permissive, no dependencies between transactions to different cache line addresses that may result from the sharing of resources (buffers, queues, message channels, etc.) are modeled. Since CSI relies on queueing for liveness and fairness and does not have retries, an implementor of CSI should be extremely careful in documenting and reasoning about any dependencies that resource sharing might introduce.

F.3 Components of CSI-IAM
CSI-IAM consists of the following components:
• Data type declarations, including that of the system state.
• The initial state of the system.
• Invariants that are expected to hold.
• Actions and their parameters.
• Protocol tables (p-tables), one per action.
• Semantic mappings, one per p-table.
• Utility subroutines used by the semantic mappings.
From these components an executable Murphi model can be automatically generated. The rest of this appendix presents these components.

F.4 Data Type Declarations
The data type declarations used by CSI-IAM are listed below. The Murphi syntax should be self-explanatory. Only the following type constructions are used: finite ranges of integers, enumerated types, and (finite) arrays and records of previously defined types. The reader is referred to the comments embedded in the code, which begin with the string “--” and extend to the ends of the lines. More explanations are given after the code listing. In the sequel, “ORB” stands for “Outgoing Request Buffer” and is synonymous with the term “SMAF” used in Chapter 8, “CSI Cache Coherence Protocol”.

-- Index types.
ADDR : 0 .. (ADDR_NUM-1);  -- Addresses (of cache lines).
HID  : 0 .. (HID_NUM-1);   -- Home node ids.
NID  : 0 .. (NID_NUM-1);   -- Caching node ids.
TID  : 0 .. (TID_NUM-1);   -- ORB/SMAF indices.
WIDX : 0 .. (WIDX_NUM-1);  -- Word indices (in cache lines).
WORD : 0 .. (WORD_NUM-1);  -- Word values.

-- Cache line data and masks.
-- A cache line consists of an array of words, each of which is qualified
-- by the corresponding boolean field in a mask.
DATA : array [WIDX] of WORD;
MASK : array [WIDX] of BOOLEAN;

-- A transaction (“txn” for short) id is a NID-TID pair.
TXN : record Nid : NID; Tid : TID; end;

-- A txn queue is a total order of txn ids, represented as an array (Seq)
-- of txn ids plus a count (Cnt) of how many of the array entries are used.
-- Seq[0] contains the oldest element of the queue and Seq[Cnt] is where
-- a new youngest element is to be stored.
TXN_Q : record
  Cnt : 0 .. (NID_NUM * TID_NUM);
  Seq : array [0 .. (NID_NUM * TID_NUM - 1)] of TXN;
end;

-- Commands that a caching agent can do internally to its cache.
INT_CMD : enum {INT_Store, INT_DowngradeF, INT_DowngradeS, INT_DowngradeI};

-- Commands that a caching agent can issue externally to the CSI interface.
EXT_CMD : enum {EXT_None, EXT_RdCode, EXT_RdData, EXT_RdInvOwn, EXT_InvItoE,
                EXT_RdCur, EXT_WbMtoI, EXT_WbMtoS, EXT_WbMtoE};

-- Auxiliary variables, which are used solely for stating properties about
-- the protocol and do not affect the behavior of the protocol in any way.
AUX_ENTRY : record
  LatestData : array [ADDR] of DATA;  -- Tracks the latest data of cache line addresses.
end;

-- Cache (“cch” for short) entries.
-- There is one cache entry per address at each caching node.
CCH_STATE : enum {CCH_M, CCH_E, CCH_F, CCH_S, CCH_I};
CCH_ENTRY : record
  State : CCH_STATE;
  Mask  : MASK;
  Data  : DATA;
end;

-- Outgoing Request Buffer (a.k.a. SMAF) entries.
-- There is one ORB entry per txn id. ORBs are in caching nodes.
ORB_STATE : enum {ORB_None, ORB_SentReq, ORB_RcvdData, ORB_RcvdCmp, ORB_SentAck};
ORB_ENTRY : record
  State : ORB_STATE;
  Cmd   : EXT_CMD;
  Addr  : ADDR;
  Cnflt : BOOLEAN;  -- A conflict has been observed.
end;

-- Tracker entries.
-- There is one tracker entry per txn id at each home node.
TRK_STATE : enum {TRK_Idle, TRK_Busy};
TRK_ENTRY : record
  State    : TRK_STATE;
  Cmd      : EXT_CMD;
  Addr     : ADDR;     -- Addr is qualified by Cmd.
  NotOwn   : BOOLEAN;  -- Cmd cannot be an owner.
  WaitFwd  : BOOLEAN;  -- Waiting for response to Cmp_Fwd*.
  Shrd     : BOOLEAN;  -- A shared response has been received.
  Ifwd     : BOOLEAN;  -- An implicit forward has been received.
  WbMark   : BOOLEAN;  -- A writeback marker has been received.
  WbData   : BOOLEAN;  -- A writeback data has been received.
  CnfltOwn : BOOLEAN;  -- A RspCnfltOwn has been received.
  Rcvd     : array [NID] of BOOLEAN;  -- Tracks from which node a request/response has been received.
  Cnflt    : array [NID] of array [TID] of BOOLEAN;
    -- Trk[n][i].Cnflt[p][j] means that txn ids (n,i) and (p,j) are
    -- in conflict with each other; conflicts are always symmetric.
end;

-- Coherence queues.
-- There is one coherence queue per address at each home node, which contains
-- the ids of txns to the address that have been ordered in front of all other
-- txns to the address due to RspFwd* or WbMto*.
COH_Q : TXN_Q;

-- Memory entries.
-- There is one memory entry per address at each home node.
MEM_ENTRY : record Data : DATA; end;

-- Snoop messages.
SNP_CMD : enum {SNP_None, SNP_SnpCode, SNP_SnpData, SNP_SnpInvOwn, SNP_SnpInvItoE, SNP_SnpCur};
SNP_MSG : record Cmd : SNP_CMD; Addr : ADDR; end;

-- Data response messages.
DATA_CMD : enum {DATA_None, DATA_DataC_M, DATA_DataC_E, DATA_DataC_F, DATA_DataC_I, DATA_GntE};
DATA_MSG : record Cmd : DATA_CMD; Data : DATA; end;

-- Completion and completion-forward response messages.
-- “FwdTo” points to the requestor to which the Data*/Gnt* should be sent.
CMP_CMD : enum {CMP_None, CMP_Cmp, CMP_FrcAckCnflt, CMP_Cmp_FwdCode, CMP_Cmp_FwdInvOwn, CMP_Cmp_FwdInvItoE};
CMP_MSG : record Cmd : CMP_CMD; FwdTo : TXN; end;

-- Writeback data messages.
WBD_CMD : enum {WBD_None, WBD_WbIData, WBD_WbSData, WBD_WbEData, WBD_WbIDataPtl};
WBD_MSG : record Cmd : WBD_CMD; Addr : ADDR; Mask : MASK; Data : DATA; end;

-- Transaction nets.
-- There is one txn net per txn id which contains all unordered messages
-- associated with that txn. Snoop messages are associated with the txn
-- that sends the snoops.
-- Data, completion, and completion-forward messages are associated with
-- the txn that receives these responses. Writeback data messages are
-- associated with the txn that caused the writeback (in particular, an
-- implicit writeback data is associated with the requestor, not the
-- snooped node).
TXN_NET : record
  SnpMsg  : array [NID] of SNP_MSG;
  DataMsg : DATA_MSG;
  CmpMsg  : CMP_MSG;
  WbDMsg  : WBD_MSG;
end;

-- Home channel messages.
-- The “From” field is only used by RspCnflt* to indicate the txn id of the
-- ORB entry that conflicts with the snoop.
HOME_CMD : enum {HOME_None, HOME_RdCode, HOME_RdData, HOME_RdInvOwn, HOME_InvItoE,
                 HOME_RdCur, HOME_WbMtoI, HOME_WbMtoS, HOME_WbMtoE,
                 HOME_RspFwdI, HOME_RspFwdIWb, HOME_RspFwdS, HOME_RspFwdSWb,
                 HOME_RspFwd, HOME_RspI, HOME_RspIWb, HOME_RspS, HOME_RspSWb,
                 HOME_RspCnflt, HOME_RspCnfltOwn, HOME_AckCnflt};
HOME_MSG : record Cmd : HOME_CMD; Addr : ADDR; From : TXN; end;

-- Home nets.
-- There is one home net per (source node, home node) pair which represents
-- the home channel from the source node to the home node.
-- The set of messages in the channel and their ordering are tracked
-- separately (Msg and Ord), the latter of which is on a per-address basis.
HOME_NET : record
  Msg : array [NID] of array [TID] of HOME_MSG;
  Ord : array [ADDR] of TXN_Q;
end;

-- The system state.
STATE : record
  -- Mapping from addresses to home nodes.
  Home : array [ADDR] of HID;
  -- The following variables are purely auxiliary.
  Aux : AUX_ENTRY;
  -- The following variables belong to the caching nodes.
  Cch : array [NID] of array [ADDR] of CCH_ENTRY;
  Orb : array [NID] of array [TID] of ORB_ENTRY;
  -- The following variables belong to the home nodes.
  Trk : array [HID] of array [NID] of array [TID] of TRK_ENTRY;
  CoQ : array [HID] of array [ADDR] of COH_Q;
  Mem : array [HID] of array [ADDR] of MEM_ENTRY;
  -- The following variables belong to the interconnect network.
  TxnNet  : array [NID] of array [TID] of TXN_NET;
  HomeNet : array [NID] of array [HID] of HOME_NET;
end;

Some remarks about how CSI-IAM represents the system state are in order:
• Caching nodes and home nodes are conceptually distinct from each other (see Figure 8.1 “Protocol Architecture” on page 8-245), and CSI-IAM has separate index sets (NID and HID) to range over them.
• Enumerated constants are always prefixed by the names of their types, to ensure that the same constant does not appear in more than one type. These prefixes are omitted in English comments and p-tables (but not in semantic mappings).
• It is always true that whenever the State or Cmd field of a record has the value None, the values of its other fields are don’t-cares. However, when the State of a Tracker (Trk) entry is Idle, its other fields still have meaningful values; in particular, the conflict list (Cnflt) of a Tracker entry is not “qualified” by its State in any way.
• It is worth repeating that the primary purpose of CSI-IAM is to precisely specify the set of allowed message sequences of the CSI cache coherence protocol. The system state representation shown above is geared toward making this specification as simple and clear as possible; it is not meant to be directly implementable or “realistic” in any sense.
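For readers who prefer C, the following sketch shows how a few of the above Murphi types might be rendered in a C reference model of the kind mentioned in Section F.10. The sizing constants and exact struct layouts are illustrative assumptions; the actual F.10 model may differ.

    /* Illustrative C renderings of a few CSI-IAM types (sizes assumed). */
    #define WIDX_NUM 8

    typedef int WORD;                                  /* word value          */
    typedef enum { CCH_M, CCH_E, CCH_F, CCH_S, CCH_I } CCH_STATE;

    typedef struct {                                   /* CCH_ENTRY           */
        CCH_STATE state;
        int       mask[WIDX_NUM];                      /* MASK: one BOOLEAN per word */
        WORD      data[WIDX_NUM];                      /* DATA: one word per mask bit */
    } CCH_ENTRY;

    typedef struct { int nid; int tid; } TXN;          /* txn id = (NID, TID) */

    typedef struct {                                   /* ORB_ENTRY (SMAF)    */
        int state;                                     /* ORB_STATE           */
        int cmd;                                       /* EXT_CMD             */
        int addr;                                      /* ADDR                */
        int cnflt;                                     /* conflict observed   */
    } ORB_ENTRY;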
The most “unrealistic” aspect of how CSI-IAM represents the system state is the several data structures indexed by addresses: Cch, CoQ, Mem, and the tracking of message ordering in the home channels. Sometimes this is necessitated by the goal of making CSI-IAM maximally permissive in terms of message ordering. For instance, Section 8.4, “Source Broadcast Home Agent Algorithm” on page 8-262 states that “the home channel is only architecturally required to be kept in-order to a given address”, so CSI-IAM tracks the home channel ordering literally on a per-address basis. At other times the per-address representation is motivated by a desire for simplicity. For instance, though the cache (Cch) at each caching agent contains only a small subset of the possible cache line addresses at any time, modeling the cache as it is actually implemented would include too many details irrelevant to our purpose. So we simply model each cache as indexed by the full set of addresses and use the cache state I to model those lines that are not in the cache.
• One Murphi construct used later is perhaps best explained at this point. The command “undefine ” assigns a don’t-care value to the variable . This is a true don’t-care, in the sense that CSI-IAM never needs to read or test a don’t-care value. Thus an implementation is free to use any (legal) value to represent don’t-care, including leaving the value of unchanged.

F.5 The Initial State of the System
The following procedure specifies a legal initial state of the system. It is assumed that the assignment of home nodes to addresses is initialized separately.

procedure InitState(var Sta: STATE);
  for a : ADDR do
    for w : WIDX do
      Sta.Aux.LatestData[a][w] := Sta.Mem[Sta.Home[a]][a].Data[w];
    end;
  end;
  for n : NID do
    for a : ADDR do
      Sta.Cch[n][a].State := CCH_I;
      for w : WIDX do Sta.Cch[n][a].Mask[w] := FALSE; end;
    end;
  end;
  for n : NID do
    for i : TID do Sta.Orb[n][i].State := ORB_None; end;
  end;
  for h : HID do
    for n : NID do
      for i : TID do
        Sta.Trk[h][n][i].State := TRK_Idle;
        Sta.Trk[h][n][i].Cmd := EXT_None;
        Sta.Trk[h][n][i].NotOwn := FALSE;
        Sta.Trk[h][n][i].WaitFwd := FALSE;
        Sta.Trk[h][n][i].Shrd := FALSE;
        Sta.Trk[h][n][i].Ifwd := FALSE;
        Sta.Trk[h][n][i].WbMark := FALSE;
        Sta.Trk[h][n][i].WbData := FALSE;
        Sta.Trk[h][n][i].CnfltOwn := FALSE;
        for p : NID do Sta.Trk[h][n][i].Rcvd[p] := FALSE; end;
        for p : NID do
          for j : TID do Sta.Trk[h][n][i].Cnflt[p][j] := FALSE; end;
        end;
      end;
    end;
  end;
  for h : HID do
    for a : ADDR do Sta.CoQ[h][a].Cnt := 0; end;
  end;
  for n : NID do
    for i : TID do
      for p : NID do Sta.TxnNet[n][i].SnpMsg[p].Cmd := SNP_None; end;
      Sta.TxnNet[n][i].DataMsg.Cmd := DATA_None;
      Sta.TxnNet[n][i].CmpMsg.Cmd := CMP_None;
      Sta.TxnNet[n][i].WbDMsg.Cmd := WBD_None;
    end;
  end;
  for p : NID do
    for h : HID do
      for n : NID do
        for i : TID do Sta.HomeNet[p][h].Msg[n][i].Cmd := HOME_None; end;
      end;
      for a : ADDR do Sta.HomeNet[p][h].Ord[a].Cnt := 0; end;
    end;
  end;
end;

F.6 Invariants
The following invariants should hold at all times for CSI-IAM. Some of the invariants use functions defined in Section F.9, “Utility Sub-Routines” on page F-588.

-- Whenever the state of a cache is valid (i.e., not I), its valid words
-- (i.e., those whose corresponding mask entries are TRUE) contain the
-- latest data, which are tracked by Aux.LatestData (this is the only
-- place in CSI-IAM where Aux.LatestData is read).
invariant “CacheDataProp”
  forall a : ADDR do forall n : NID do forall w : WIDX do
    ( Sta.Cch[n][a].State = CCH_M | Sta.Cch[n][a].State = CCH_E |
      Sta.Cch[n][a].State = CCH_F | Sta.Cch[n][a].State = CCH_S )
    & Sta.Cch[n][a].Mask[w] = TRUE
    -> Sta.Cch[n][a].Data[w] = Sta.Aux.LatestData[a][w]
  end end end;

-- Consistency conditions between the state and mask of a cache line.
invariant “CacheMaskProp”
  forall a : ADDR do forall n : NID do
    ( Sta.Cch[n][a].State = CCH_M -> MaskFull(Sta.Cch[n][a].Mask) | MaskPartial(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_E -> MaskFull(Sta.Cch[n][a].Mask) | MaskEmpty(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_F -> MaskFull(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_S -> MaskFull(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_I -> MaskEmpty(Sta.Cch[n][a].Mask) )
  end end;

-- Consistency conditions between the states of caches at different nodes.
invariant “CacheStateProp”
  forall a : ADDR do forall n : NID do forall p : NID do
    n != p ->
      ( Sta.Cch[n][a].State = CCH_M -> Sta.Cch[p][a].State = CCH_I )
      & ( Sta.Cch[n][a].State = CCH_E -> Sta.Cch[p][a].State = CCH_I )
      & ( Sta.Cch[n][a].State = CCH_F -> Sta.Cch[p][a].State = CCH_S | Sta.Cch[p][a].State = CCH_I )
      & ( Sta.Cch[n][a].State = CCH_S -> Sta.Cch[p][a].State = CCH_F | Sta.Cch[p][a].State = CCH_S | Sta.Cch[p][a].State = CCH_I )
  end end end;

-- Implications of an ORB entry being in state None, which essentially say
-- that there is no activity with its txn id anywhere in the system.
invariant “OrbNoneProp”
  forall a : ADDR do forall n : NID do forall i : TID do
    Sta.Orb[n][i].State = ORB_None ->
      Sta.Trk[Sta.Home[a]][n][i].State = TRK_Idle
      & forall p : NID do forall j : TID do
          Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j] = FALSE
        end end
      & !TxnQueueHas(Sta.CoQ[Sta.Home[a]][a], n, i)
      & forall p : NID do Sta.TxnNet[n][i].SnpMsg[p].Cmd = SNP_None end
      & Sta.TxnNet[n][i].DataMsg.Cmd = DATA_None
      & Sta.TxnNet[n][i].CmpMsg.Cmd = CMP_None
      & Sta.TxnNet[n][i].WbDMsg.Cmd = WBD_None
      & forall p : NID do
          Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd = HOME_None
          & !TxnQueueHas(Sta.HomeNet[p][Sta.Home[a]].Ord[a], n, i)
        end
  end end end;

-- Implications of a Tracker entry being in state Idle.
-- Note that a Tracker entry being Idle does NOT imply that its
-- WbData or NotOwn field is FALSE or that its conflict list is empty.
invariant “TrkIdleProp”
  forall a : ADDR do forall n : NID do forall i : TID do
    Sta.Trk[Sta.Home[a]][n][i].State = TRK_Idle ->
      Sta.Trk[Sta.Home[a]][n][i].Cmd = EXT_None
      & Sta.Trk[Sta.Home[a]][n][i].WaitFwd = FALSE
      & Sta.Trk[Sta.Home[a]][n][i].Shrd = FALSE
      & Sta.Trk[Sta.Home[a]][n][i].Ifwd = FALSE
      & Sta.Trk[Sta.Home[a]][n][i].WbMark = FALSE
      & Sta.Trk[Sta.Home[a]][n][i].CnfltOwn = FALSE
      & forall p : NID do Sta.Trk[Sta.Home[a]][n][i].Rcvd[p] = FALSE end
  end end end;
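As an illustration of how such an invariant might be checked in a C reference model like the one of Section F.10, here is a self-contained sketch of CacheStateProp over a simple array of cache states. The sizing constants and the checker itself are assumptions for illustration only.

    #include <assert.h>
    #define ADDR_NUM 4
    #define NID_NUM  3
    typedef enum { ST_M, ST_E, ST_F, ST_S, ST_I } cch_state_t;

    /* CacheStateProp: M or E at one node forces I everywhere else;
     * F tolerates only S/I elsewhere; S tolerates F/S/I. */
    static void check_cache_state_prop(cch_state_t st[NID_NUM][ADDR_NUM]) {
        for (int a = 0; a < ADDR_NUM; a++)
            for (int n = 0; n < NID_NUM; n++)
                for (int p = 0; p < NID_NUM; p++) {
                    if (n == p) continue;
                    cch_state_t sn = st[n][a], sp = st[p][a];
                    if (sn == ST_M || sn == ST_E)
                        assert(sp == ST_I);
                    else if (sn == ST_F)
                        assert(sp == ST_S || sp == ST_I);
                    else if (sn == ST_S)
                        assert(sp == ST_F || sp == ST_S || sp == ST_I);
                }
    }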
F.7 Actions and Their Parameters

Table F-3. Actions of CSI-IAM. The original table marks each required parameter with an “X”; here each action is listed with its approximate meaning and its parameters, grouped by the agent (caching or home) that performs it.

Caching agent actions:
  CacheNewReqInt   Requestor generates internal request (Store/Downgrade*).
                   Parameters: ReqAddr, ReqNid, IntCmd, StWidx, StWord.
  CacheNewReqExt   Requestor generates external request (Rd*/Inv*/WbMto*).
                   Parameters: ReqAddr, ReqNid, ReqTid, ExtCmd.
  CacheRecvData    Requestor receives Data*/Gnt*.
                   Parameters: ReqAddr, ReqNid, ReqTid.
  CacheRecvCmp     Requestor receives Cmp/FrcAckCnflt.
                   Parameters: ReqAddr, ReqNid, ReqTid.
  CacheRecvFwd     Requestor receives Cmp_Fwd* for Peer.
                   Parameters: ReqAddr, ReqNid, ReqTid, PeerNid, PeerTid, BiasFwd.
  CacheSnpOrbMiss  Requestor snoops Peer's cache and misses its ORB.
                   Parameters: ReqAddr, ReqNid, ReqTid, PeerNid, BiasFwd, BiasToI.
  CacheSnpOrbHit   Requestor snoops Peer's cache and hits its ORB.
                   Parameters: ReqAddr, ReqNid, ReqTid, PeerNid, PeerTid.

Home agent actions (parameter counts as marked in the original table):
  HomeRecvReq      Home receives Rd*/Inv*/WbMto* from Requestor.
                   Parameters: ReqAddr, ReqNid, ReqTid.
  HomeRecvRsp      Home receives Rsp* to Requestor's snoop or forward (four parameters).
  HomeRecvAckCmp   Home receives AckCnflt from Requestor and replies with Cmp (three parameters).
  HomeRecvAckFwd   Home receives AckCnflt from Requestor and replies with Cmp_Fwd* (five parameters).
  HomeRecvWbData   Home receives Wb*Data for Requestor (three parameters).
  HomeSendDataCm   Home sends Data*/Gnt* + Cmp/FrcAckCnflt to Requestor (three parameters).

An action is a set of state transitions with a common theme. Typically it consists of the state transitions caused by a (caching or home) node receiving a message, performing internal processing, and sending messages to other nodes. An action is uniquely identified by its name and parameters, the latter of which specify for which address, at which Nid(s) and Tid(s), and (possibly) for which internal or external command the action happens. CSI-IAM is specified by 13 actions, 7 of which are for caching agents and 6 for home agents. Their names, parameters, and approximate meanings are listed in Table F-3. Note that this table is just a summary and gives only approximate meanings of actions and their parameters; their precise meanings will be given in the protocol tables and semantic mappings in the next section. Action executions are atomic, in that the execution of an action does not overlap with that of another action, and an execution of the system consists of a sequence of action executions, each completely finished before the next is started. The atomicity requirement is, of course, only conceptual: the implementor is free to use any scheme that supports the illusion of atomicity but in reality may overlap the executions of different actions in time.

F.8 Protocol Tables and Their Semantic Mappings
Each action of CSI-IAM is specified by a protocol table (p-table), whose precise meaning is given by a semantic mapping that attaches a Murphi code fragment to each table entry. The Murphi code fragment will refer to the system state representation given in Section F.4 and possibly also use the utility subroutines given in Section F.9. Each p-table has three header rows (which are printed in boldface) and one or more body rows. Each body row specifies a state transition. Each nonempty entry in the body under the header “Current State” (respectively, “Next State”) represents a condition on the system state (resp., a set of assignments to the system state). (Empty entries have no meaning and can be ignored.) Exactly what conditions and assignments the entries represent is given by the semantic mapping, in which “Sta” (resp. “NxtSta”) denotes the current (resp., next) system state.
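The atomic, non-overlapping execution of actions can be mirrored directly in a C reference model: a top-level loop repeatedly picks one enabled (action, parameters) pair and applies it whole before picking the next. The sketch below shows the control structure only; the state type and the two helper functions are hypothetical, not the actual F.10 code.

    /* One action execution at a time: pick any enabled (action, params)
     * combination and apply it atomically to the state. */
    typedef struct { int cch[3][4]; /* Orb, Trk, CoQ, Mem, nets elided */ } state_t;
    typedef struct { int action_id; int params[6]; } choice_t;

    /* Hypothetical helpers: enumerate one enabled p-table row, and apply
     * that row's assignments (all at once) to 'next'. */
    extern int  pick_enabled_choice(const state_t *s, choice_t *c);
    extern void action_apply(state_t *next, const state_t *cur, const choice_t *c);

    void run_model(state_t *s, int max_steps) {
        for (int k = 0; k < max_steps; k++) {
            choice_t c;
            if (!pick_enabled_choice(s, &c))
                break;                      /* nothing enabled: quiesce */
            state_t next = *s;              /* NxtSta starts as Sta     */
            action_apply(&next, s, &c);     /* assignments in parallel  */
            *s = next;                      /* commit atomic transition */
        }
    }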
A row is enabled if all of its conditions are true and, when executed, performs all of its assignments in parallel (so the assignments are not ordered and the effect of one assignment is not seen by another assignment). To improve readability, there are often “mega-cells” that straddle multiple basic cells in the p-tables. A “mega-cell” has exactly the same meaning as if its content were duplicated in each of its constituent basic cells. Each semantic mapping has three parts:
• The PARAMETERS section lists the parameters of the action together with their types. The parameters can be thought of as free variables whose instantiation uniquely identifies an action.
• The ALIASES section introduces local abbreviations that are used in the rest of the semantic mapping and often appear in the second header row of the p-table (entries in the third header row are fields of variables or aliases in the second row).
• One or more COLUMN sections, each of which uniquely identifies a column in the p-table by the texts in its header entries (listed horizontally rather than vertically) and specifies what condition or assignment (as a Murphi code fragment after “=>”) is attached to each possible entry in the column (before “=>”).
Since some of the p-tables and many of the semantic mappings are wide, we’ll switch to landscape mode in the rest of this section.

Table F-4. Action CacheNewReqInt

  Current State                            Next State
  IntCmd      ReqCch.State  ReqCch.Mask    ReqCch.State  Mask[StWidx]         Data[StWidx]  Aux LatestData[StWidx]
  Store       M, E          -              M             TRUE                 StWord        StWord
  DowngradeF  E             Full           F
  DowngradeS  E             Full           S
  DowngradeS  F             Full           S
  DowngradeI  E, F, S       -              I             Empty (all of Mask)

This is the action of a caching agent performing an internal operation (store or downgrade) on its cache. For simplicity, each store changes only one word in the cache line (i.e., the one indexed by StWidx); a full-line store can be easily modelled by a sequence of single-word stores. Note that the word stored (StWord) is also recorded in the auxiliary variable Aux.LatestData; this is the only place in CSI-IAM where Aux.LatestData is written. Note also that CSI-IAM allows a cache state to be downgraded at any time, and that the state E can be downgraded to F or S only when the cache line contains valid data; otherwise it can only be downgraded to I.
PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  IntCmd : INT_CMD;  StWidx : WIDX;  StWord : WORD

ALIASES
  ReqCch     : Sta.Cch[ReqNid][ReqAddr]
  AuxData    : Sta.Aux.LatestData[ReqAddr]
  NxtReqCch  : NxtSta.Cch[ReqNid][ReqAddr]
  NxtAuxData : NxtSta.Aux.LatestData[ReqAddr]

COLUMN |Current State|IntCmd|
  |Store|      => IntCmd = INT_Store
  |DowngradeF| => IntCmd = INT_DowngradeF
  |DowngradeS| => IntCmd = INT_DowngradeS
  |DowngradeI| => IntCmd = INT_DowngradeI

COLUMN |Current State|ReqCch|State|
  |E|       => ReqCch.State in {CCH_E}
  |F|       => ReqCch.State in {CCH_F}
  |M, E|    => ReqCch.State in {CCH_M, CCH_E}
  |E, F, S| => ReqCch.State in {CCH_E, CCH_F, CCH_S}

COLUMN |Current State|ReqCch|Mask|
  |Full| => MaskFull(ReqCch.Mask)

COLUMN |Next State|ReqCch|State|
  |M| => NxtReqCch.State := CCH_M
  |F| => NxtReqCch.State := CCH_F
  |S| => NxtReqCch.State := CCH_S
  |I| => NxtReqCch.State := CCH_I

COLUMN |Next State|ReqCch|Mask[StWidx]|
  |TRUE| => NxtReqCch.Mask[StWidx] := TRUE
  |Empty (all of Mask)| => SetMaskEmpty(NxtReqCch.Mask); UndefineData(NxtReqCch.Data)

COLUMN |Next State|ReqCch|Data[StWidx]|
  |StWord| => NxtReqCch.Data[StWidx] := StWord

COLUMN |Next State|Aux|LatestData[StWidx]|
  |StWord| => NxtAuxData[StWidx] := StWord

Table F-5. Action CacheNewReqExt. In every row, ReqOrb available = TRUE holds, ReqOrb.State := SentReq, ReqOrb.Cmd records the ExtCmd, ReqOrb.Addr := ReqAddr, ReqOrb.Cnflt := FALSE, and the same command is sent to Home:
  RdCode                              -> snoop Peers with SnpCode
  RdData                              -> snoop Peers with SnpData
  RdInvOwn                            -> snoop Peers with SnpInvOwn
  InvItoE                             -> snoop Peers with SnpInvItoE
  RdCur (ReqCch.State = I)            -> snoop Peers with SnpCur
  WbMtoI (ReqCch = M, Mask Full)      -> ReqCch := I, Empty; send WbIData(ReqCch.Data) to Home
  WbMtoI (ReqCch = M, Mask Partial)   -> ReqCch := I, Empty; send WbIDataPtl(ReqCch.Data, ReqCch.Mask) to Home
  WbMtoS (Mask Full)                  -> ReqCch.State := S; send WbSData(ReqCch.Data) to Home
  WbMtoE (Mask Full)                  -> ReqCch.State := E; send WbEData(ReqCch.Data) to Home

This is the action of a caching agent issuing an external request to the CSI interface. Note that any Rd*/Inv* request except RdCur can be issued from any cache state, including M; RdCur can only be issued from I.
PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  ReqTid : TID;  ExtCmd : EXT_CMD

ALIASES
  ReqOrb    : Sta.Orb[ReqNid][ReqTid]
  ReqCch    : Sta.Cch[ReqNid][ReqAddr]
  NxtReqOrb : NxtSta.Orb[ReqNid][ReqTid]
  NxtReqCch : NxtSta.Cch[ReqNid][ReqAddr]

COLUMN |Current State|ReqOrb available|
  |TRUE| => OrbAvail(Sta, ReqAddr, ReqNid, ReqTid) = TRUE

COLUMN |Current State|ExtCmd|
  |RdCode|   => ExtCmd = EXT_RdCode
  |RdData|   => ExtCmd = EXT_RdData
  |RdInvOwn| => ExtCmd = EXT_RdInvOwn
  |InvItoE|  => ExtCmd = EXT_InvItoE
  |RdCur|    => ExtCmd = EXT_RdCur
  |WbMtoI|   => ExtCmd = EXT_WbMtoI
  |WbMtoS|   => ExtCmd = EXT_WbMtoS
  |WbMtoE|   => ExtCmd = EXT_WbMtoE

COLUMN |Current State|ReqCch|State|
  |I| => ReqCch.State = CCH_I
  |M| => ReqCch.State = CCH_M

COLUMN |Current State|ReqCch|Mask|
  |Full|    => MaskFull(ReqCch.Mask)
  |Partial| => MaskPartial(ReqCch.Mask)

COLUMN |Next State|ReqOrb|State|
  |SentReq| => NxtReqOrb.State := ORB_SentReq

COLUMN |Next State|ReqOrb|Cmd|
  |RdCode|   => NxtReqOrb.Cmd := EXT_RdCode
  |RdData|   => NxtReqOrb.Cmd := EXT_RdData
  |RdInvOwn| => NxtReqOrb.Cmd := EXT_RdInvOwn
  |InvItoE|  => NxtReqOrb.Cmd := EXT_InvItoE
  |RdCur|    => NxtReqOrb.Cmd := EXT_RdCur
  |WbMtoI|   => NxtReqOrb.Cmd := EXT_WbMtoI
  |WbMtoS|   => NxtReqOrb.Cmd := EXT_WbMtoS
  |WbMtoE|   => NxtReqOrb.Cmd := EXT_WbMtoE

COLUMN |Next State|ReqOrb|Addr|
  |ReqAddr| => NxtReqOrb.Addr := ReqAddr

COLUMN |Next State|ReqOrb|Cnflt|
  |FALSE| => NxtReqOrb.Cnflt := FALSE

COLUMN |Next State|ReqCch|State|
  |I| => NxtReqCch.State := CCH_I
  |S| => NxtReqCch.State := CCH_S
  |E| => NxtReqCch.State := CCH_E

COLUMN |Next State|ReqCch|Mask|
  |Empty| => SetMaskEmpty(NxtReqCch.Mask); UndefineData(NxtReqCch.Data)

COLUMN |Next State|HomeNet|Send to Home|
  |RdCode|   => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_RdCode)
  |RdData|   => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_RdData)
  |RdInvOwn| => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_RdInvOwn)
  |InvItoE|  => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_InvItoE)
  |RdCur|    => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_RdCur)
  |WbMtoI|   => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_WbMtoI)
  |WbMtoS|   => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_WbMtoS)
  |WbMtoE|   => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_WbMtoE)

COLUMN |Next State|TxnNet|Send to Peers|
  |SnpCode|    => SendTxnSnps(NxtSta, Sta, ReqNid, ReqTid, SNP_SnpCode, ReqAddr)
  |SnpData|    => SendTxnSnps(NxtSta, Sta, ReqNid, ReqTid, SNP_SnpData, ReqAddr)
  |SnpInvOwn|  => SendTxnSnps(NxtSta, Sta, ReqNid, ReqTid, SNP_SnpInvOwn, ReqAddr)
  |SnpInvItoE| => SendTxnSnps(NxtSta, Sta, ReqNid, ReqTid, SNP_SnpInvItoE, ReqAddr)
  |SnpCur|     => SendTxnSnps(NxtSta, Sta, ReqNid, ReqTid, SNP_SnpCur, ReqAddr)

COLUMN |Next State|TxnNet|Send to Home|
  |WbIData(ReqCch.Data)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbIData, ReqAddr, ReqCch.Mask, ReqCch.Data)
  |WbSData(ReqCch.Data)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbSData, ReqAddr, ReqCch.Mask, ReqCch.Data)
  |WbEData(ReqCch.Data)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbEData, ReqAddr, ReqCch.Mask, ReqCch.Data)
  |WbIDataPtl(ReqCch.Data, ReqCch.Mask)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbIDataPtl, ReqAddr, ReqCch.Mask, ReqCch.Data)
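To illustrate how one body row plus its semantic mapping becomes executable, here is an assumed C rendering of the Table F-5 RdCode row: a guard on the Current State cells followed by the Next State assignments applied together. The types and helper functions (orb_avail, send_home_req, send_txn_snps) are illustrative, not the actual F.10 code.

    /* Table F-5, RdCode row, as guard + parallel assignments. */
    typedef struct { int state, cmd, addr, cnflt; } orb_t;
    typedef struct { orb_t orb[3][2]; /* rest of the state elided */ } sta_t;

    enum { ORB_NONE, ORB_SENTREQ };
    enum { EXT_RDCODE = 1, HOME_RDCODE = 1, SNP_SNPCODE = 1 };  /* tags only */

    extern int  orb_avail(const sta_t *s, int a, int n, int t);      /* OrbAvail    */
    extern void send_home_req(sta_t *x, int a, int n, int t, int c); /* SendHomeReq */
    extern void send_txn_snps(sta_t *x, int n, int t, int c, int a); /* SendTxnSnps */

    int cache_new_req_ext_rdcode(sta_t *nxt, const sta_t *cur,
                                 int addr, int nid, int tid) {
        if (!orb_avail(cur, addr, nid, tid)) return 0;   /* guard: |ReqOrb available| */
        *nxt = *cur;                                     /* NxtSta := Sta             */
        orb_t *o = &nxt->orb[nid][tid];
        o->state = ORB_SENTREQ;                          /* |SentReq| */
        o->cmd   = EXT_RDCODE;                           /* |RdCode|  */
        o->addr  = addr;                                 /* |ReqAddr| */
        o->cnflt = 0;                                    /* |FALSE|   */
        send_home_req(nxt, addr, nid, tid, HOME_RDCODE); /* Send to Home  */
        send_txn_snps(nxt, nid, tid, SNP_SNPCODE, addr); /* Send to Peers */
        return 1;
    }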
Table F-6. Action CacheRecvData. In every row the data message is removed from TxnNet; rows are grouped by Msg.Cmd:
  DataC_M:
    ReqOrb = SentReq                -> ReqOrb := RcvdData; ReqCch := M, Full, Msg.Data
    ReqOrb = RcvdCmp, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdCmp, Cnflt = FALSE -> ReqOrb := None
  DataC_E:
    ReqCch in {E, F, S, I}, ReqOrb = SentReq -> ReqOrb := RcvdData; ReqCch := E, Full, Msg.Data
    ReqCch = M,             ReqOrb = SentReq -> ReqOrb := RcvdData (received state and data ignored)
    ReqOrb = RcvdCmp, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdCmp, Cnflt = FALSE -> ReqOrb := None
  DataC_F:
    ReqCch in {F, S, I}, ReqOrb = SentReq -> ReqOrb := RcvdData; ReqCch := F, Full, Msg.Data
    ReqCch in {M, E},    ReqOrb = SentReq -> ReqOrb := RcvdData (received state and data ignored)
    ReqOrb = RcvdCmp, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdCmp, Cnflt = FALSE -> ReqOrb := None
  DataC_I:
    ReqOrb = SentReq                -> ReqOrb := RcvdData
    ReqOrb = RcvdCmp, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdCmp, Cnflt = FALSE -> ReqOrb := None
  GntE:
    ReqCch in {E, F, S, I}, ReqOrb = SentReq -> ReqOrb := RcvdData; ReqCch.State := E
    ReqCch = M,             ReqOrb = SentReq -> ReqOrb := RcvdData (received state ignored)
    ReqOrb = RcvdCmp, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdCmp, Cnflt = FALSE -> ReqOrb := None

This is the action of a caching agent receiving a Data* response. Note that if the caching agent already has the data in a “high enough” cache state, the received cache state and data are ignored.

PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  ReqTid : TID

ALIASES
  Msg       : Sta.TxnNet[ReqNid][ReqTid].DataMsg
  ReqOrb    : Sta.Orb[ReqNid][ReqTid]
  ReqCch    : Sta.Cch[ReqNid][ReqAddr]
  NxtReqOrb : NxtSta.Orb[ReqNid][ReqTid]
  NxtReqCch : NxtSta.Cch[ReqNid][ReqAddr]

COLUMN |Current State|Msg|Cmd|
  |DataC_M| => PendTxnData(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = DATA_DataC_M
  |DataC_E| => PendTxnData(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = DATA_DataC_E
  |DataC_F| => PendTxnData(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = DATA_DataC_F
  |DataC_I| => PendTxnData(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = DATA_DataC_I
  |GntE|    => PendTxnData(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = DATA_GntE

COLUMN |Current State|ReqCch|State|
  |M|          => ReqCch.State in {CCH_M}
  |M, E|       => ReqCch.State in {CCH_M, CCH_E}
  |F, S, I|    => ReqCch.State in {CCH_F, CCH_S, CCH_I}
  |E, F, S, I| => ReqCch.State in {CCH_E, CCH_F, CCH_S, CCH_I}

COLUMN |Current State|ReqOrb|State|
  |SentReq| => ReqOrb.State = ORB_SentReq
  |RcvdCmp| => ReqOrb.State = ORB_RcvdCmp

COLUMN |Current State|ReqOrb|Cnflt|
  |TRUE|  => ReqOrb.Cnflt = TRUE
  |FALSE| => ReqOrb.Cnflt = FALSE

COLUMN |Next State|ReqOrb|State|
  |RcvdData| => NxtReqOrb.State := ORB_RcvdData
  |SentAck|  => NxtReqOrb.State := ORB_SentAck
  |None|     => OrbClear(NxtSta, Sta, ReqNid, ReqTid)

COLUMN |Next State|ReqCch|State|
  |M| => NxtReqCch.State := CCH_M
  |E| => NxtReqCch.State := CCH_E
  |F| => NxtReqCch.State := CCH_F

COLUMN |Next State|ReqCch|Mask|
  |Full| => SetMaskFull(NxtReqCch.Mask)

COLUMN |Next State|ReqCch|Data|
  |Msg.Data| => CopyData(NxtReqCch.Data, Msg.Data)

COLUMN |Next State|HomeNet|Send to Home|
  |AckCnflt| => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_AckCnflt)

COLUMN |Next State|TxnNet|Msg|
  |Remove| => RecvTxnData(NxtSta, Sta, ReqNid, ReqTid)

Table F-7. Action CacheRecvCmp. In every row the completion message is removed from TxnNet:
  Cmp:
    ReqOrb = SentReq, Cmd = WbMto*, Cnflt = TRUE  -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = SentReq, Cmd = WbMto*, Cnflt = FALSE -> ReqOrb := None
    ReqOrb = SentReq, Cmd != WbMto*               -> ReqOrb := RcvdCmp
    ReqOrb = RcvdData, Cnflt = TRUE               -> ReqOrb := SentAck; send AckCnflt to Home
    ReqOrb = RcvdData, Cnflt = FALSE              -> ReqOrb := None
    ReqOrb = SentAck                              -> ReqOrb := None
  FrcAckCnflt:
    ReqOrb = SentReq, Cmd = WbMto*  -> ReqOrb := SentAck, Cnflt := TRUE; send AckCnflt to Home
    ReqOrb = SentReq, Cmd != WbMto* -> ReqOrb := RcvdCmp, Cnflt := TRUE
    ReqOrb = RcvdData               -> ReqOrb := SentAck, Cnflt := TRUE; send AckCnflt to Home

This is the action of a caching agent receiving a Cmp or FrcAckCnflt response. Since each transaction has at most one AckCnflt phase, FrcAckCnflt cannot be received when the ORB state is SentAck.
PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  ReqTid : TID

ALIASES
  Msg       : Sta.TxnNet[ReqNid][ReqTid].CmpMsg
  ReqOrb    : Sta.Orb[ReqNid][ReqTid]
  NxtReqOrb : NxtSta.Orb[ReqNid][ReqTid]

COLUMN |Current State|Msg|Cmd|
  |Cmp|         => PendTxnCmpCmp(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = CMP_Cmp
  |FrcAckCnflt| => PendTxnCmpCmp(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = CMP_FrcAckCnflt

COLUMN |Current State|ReqOrb|State|
  |SentReq|  => ReqOrb.State = ORB_SentReq
  |RcvdData| => ReqOrb.State = ORB_RcvdData
  |SentAck|  => ReqOrb.State = ORB_SentAck

COLUMN |Current State|ReqOrb|Cmd|
  |WbMto*|    => IsExtWb(ReqOrb.Cmd) = TRUE
  |!= WbMto*| => IsExtWb(ReqOrb.Cmd) = FALSE

COLUMN |Current State|ReqOrb|Cnflt|
  |TRUE|  => ReqOrb.Cnflt = TRUE
  |FALSE| => ReqOrb.Cnflt = FALSE

COLUMN |Next State|ReqOrb|State|
  |RcvdCmp| => NxtReqOrb.State := ORB_RcvdCmp
  |SentAck| => NxtReqOrb.State := ORB_SentAck
  |None|    => OrbClear(NxtSta, Sta, ReqNid, ReqTid)

COLUMN |Next State|ReqOrb|Cnflt|
  |TRUE| => NxtReqOrb.Cnflt := TRUE

COLUMN |Next State|HomeNet|Send to Home|
  |AckCnflt| => SendHomeReq(NxtSta, Sta, ReqAddr, ReqNid, ReqTid, HOME_AckCnflt)

COLUMN |Next State|TxnNet|Msg|
  |Remove| => RecvTxnCmp(NxtSta, Sta, ReqNid, ReqTid)

Table F-8. Action CacheRecvFwd. In every row Msg.FwdTo = Peer holds, ReqOrb := None, ReqCch := I, Empty, and the message is removed from TxnNet; rows are grouped by Msg.Cmd:
  Cmp_FwdCode:
    ReqCch = M, Full        -> send RspIWb to Home; WbIData(ReqCch.Data) to Home
    ReqCch = M, Partial     -> send RspIWb to Home; WbIDataPtl(ReqCch.Data, ReqCch.Mask) to Home
    ReqCch = E, Full        -> send RspFwdI to Home; DataC_F(ReqCch.Data) to Peer
    ReqCch = E, Empty       -> send RspI to Home
    ReqCch = F              -> send RspFwdI to Home; DataC_F(ReqCch.Data) to Peer
    ReqCch = S or I         -> send RspI to Home
  Cmp_FwdInvOwn:
    ReqCch = M, Full, BiasFwd = TRUE  -> send RspFwdI to Home; DataC_M(ReqCch.Data) to Peer
    ReqCch = M, Full, BiasFwd = FALSE -> send RspIWb to Home; WbIData(ReqCch.Data) to Home
    ReqCch = M, Partial               -> send RspIWb to Home; WbIDataPtl(ReqCch.Data, ReqCch.Mask) to Home
    ReqCch = E, Full                  -> send RspFwdI to Home; DataC_E(ReqCch.Data) to Peer
    ReqCch = E, Empty                 -> send RspI to Home
    ReqCch = F, S, or I               -> send RspI to Home
  Cmp_FwdInvItoE:
    ReqCch = M, Full        -> send RspIWb to Home; WbIData(ReqCch.Data) to Home
    ReqCch = M, Partial     -> send RspIWb to Home; WbIDataPtl(ReqCch.Data, ReqCch.Mask) to Home
    ReqCch = E, F, S, or I  -> send RspI to Home

This is the action of a caching agent receiving a Cmp_Fwd* response, which can only be received when the ORB state is SentAck. Msg.FwdTo is the txn id to which the cache data should be forwarded; it is here “exposed” at the action level as a pair of action parameters (PeerNid, PeerTid). The parameter BiasFwd selects between two alternative responses when Cmp_FwdInvOwn is received in cache state M. An implementation is free to choose either alternative, and the choice need not be consistent from agent to agent (in fact, an agent can even make a different choice each time this table is exercised).
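Since BiasFwd is a free parameter, an executable model or traffic generator may pick it arbitrarily on every execution of this action. Below is a minimal C sketch of such a nondeterministic choice using a pseudo-random pick; a model checker would instead explore both branches. Names are illustrative assumptions.

    #include <stdlib.h>

    /* Cmp_FwdInvOwn received in state M with a full mask: either forward
     * DataC_M to the peer and respond RspFwdI, or write the line back
     * with RspIWb/WbIData. Both are legal; BiasFwd picks one. */
    typedef enum { RESP_RSPFWDI_DATAC_M, RESP_RSPIWB_WBIDATA } m_full_resp_t;

    static m_full_resp_t choose_m_full_response(void) {
        int bias_fwd = rand() & 1;   /* free choice; may differ on every call */
        return bias_fwd ? RESP_RSPFWDI_DATAC_M : RESP_RSPIWB_WBIDATA;
    }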
PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  ReqTid : TID;  PeerNid : NID;  PeerTid : TID;  BiasFwd : BOOLEAN

ALIASES
  Msg       : Sta.TxnNet[ReqNid][ReqTid].CmpMsg
  ReqOrb    : Sta.Orb[ReqNid][ReqTid]
  ReqCch    : Sta.Cch[ReqNid][ReqAddr]
  NxtReqOrb : NxtSta.Orb[ReqNid][ReqTid]
  NxtReqCch : NxtSta.Cch[ReqNid][ReqAddr]

COLUMN |Current State|Msg|Cmd|
  |Cmp_FwdCode|    => PendTxnCmpFwd(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = CMP_Cmp_FwdCode
  |Cmp_FwdInvOwn|  => PendTxnCmpFwd(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = CMP_Cmp_FwdInvOwn
  |Cmp_FwdInvItoE| => PendTxnCmpFwd(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = CMP_Cmp_FwdInvItoE

COLUMN |Current State|Msg|FwdTo|
  |Peer| => Msg.FwdTo.Nid = PeerNid & Msg.FwdTo.Tid = PeerTid

COLUMN |Current State|ReqCch|State|
  |M|          => ReqCch.State in {CCH_M}
  |E|          => ReqCch.State in {CCH_E}
  |F|          => ReqCch.State in {CCH_F}
  |S, I|       => ReqCch.State in {CCH_S, CCH_I}
  |F, S, I|    => ReqCch.State in {CCH_F, CCH_S, CCH_I}
  |E, F, S, I| => ReqCch.State in {CCH_E, CCH_F, CCH_S, CCH_I}

COLUMN |Current State|ReqCch|Mask|
  |Full|    => MaskFull(ReqCch.Mask)
  |Empty|   => MaskEmpty(ReqCch.Mask)
  |Partial| => MaskPartial(ReqCch.Mask)

COLUMN |Current State|BiasFwd|
  |TRUE|  => BiasFwd = TRUE
  |FALSE| => BiasFwd = FALSE

COLUMN |Next State|ReqOrb|State|
  |None| => OrbClear(NxtSta, Sta, ReqNid, ReqTid)

COLUMN |Next State|ReqCch|State|
  |I| => NxtReqCch.State := CCH_I

COLUMN |Next State|ReqCch|Mask|
  |Empty| => SetMaskEmpty(NxtReqCch.Mask); UndefineData(NxtReqCch.Data)

COLUMN |Next State|HomeNet|Send to Home|
  |RspI|    => SendHomeRsp(NxtSta, Sta, ReqNid, ReqAddr, PeerNid, PeerTid, HOME_RspI)
  |RspIWb|  => SendHomeRsp(NxtSta, Sta, ReqNid, ReqAddr, PeerNid, PeerTid, HOME_RspIWb)
  |RspFwdI| => SendHomeRsp(NxtSta, Sta, ReqNid, ReqAddr, PeerNid, PeerTid, HOME_RspFwdI)

COLUMN |Next State|TxnNet|Send to Peer|
  |DataC_M(ReqCch.Data)| => SendTxnData(NxtSta, Sta, PeerNid, PeerTid, DATA_DataC_M, ReqCch.Data)
  |DataC_E(ReqCch.Data)| => SendTxnData(NxtSta, Sta, PeerNid, PeerTid, DATA_DataC_E, ReqCch.Data)
  |DataC_F(ReqCch.Data)| => SendTxnData(NxtSta, Sta, PeerNid, PeerTid, DATA_DataC_F, ReqCch.Data)

COLUMN |Next State|TxnNet|Send to Home|
  |WbIData(ReqCch.Data)| => SendTxnWbD(NxtSta, Sta, PeerNid, PeerTid, WBD_WbIData, ReqAddr, ReqCch.Mask, ReqCch.Data)
  |WbIDataPtl(ReqCch.Data, ReqCch.Mask)| => SendTxnWbD(NxtSta, Sta, PeerNid, PeerTid, WBD_WbIDataPtl, ReqAddr, ReqCch.Mask, ReqCch.Data)

COLUMN |Next State|TxnNet|Msg|
  |Remove| => RecvTxnCmp(NxtSta, Sta, ReqNid, ReqTid)
Table F-9. Action CacheSnpOrbMiss. In every row PeerOrb miss = TRUE holds and the snoop message is removed from TxnNet; rows are grouped by snoop type:
  SnpCode:
    PeerCch = M, Full, BiasFwd, BiasToI    -> PeerCch := I, Empty; send RspFwdIWb; DataC_F(PeerCch.Data) to Msg.Req; WbIData(PeerCch.Data) to Home
    PeerCch = M, Full, BiasFwd, !BiasToI   -> PeerCch := S; send RspFwdSWb; DataC_F(PeerCch.Data) to Msg.Req; WbSData(PeerCch.Data) to Home
    PeerCch = M, Full, !BiasFwd, BiasToI   -> PeerCch := I, Empty; send RspIWb; WbIData(PeerCch.Data) to Home
    PeerCch = M, Full, !BiasFwd, !BiasToI  -> PeerCch := S; send RspSWb; WbSData(PeerCch.Data) to Home
    PeerCch = M, Partial                   -> PeerCch := I, Empty; send RspIWb; WbIDataPtl(PeerCch.Data, PeerCch.Mask) to Home
    PeerCch = E, Full                      -> PeerCch := S; send RspFwdS; DataC_F(PeerCch.Data) to Msg.Req
    PeerCch = E, Empty                     -> PeerCch := I, Empty; send RspI
    PeerCch = F                            -> PeerCch := S; send RspFwdS; DataC_F(PeerCch.Data) to Msg.Req
    PeerCch = S                            -> PeerCch := S; send RspS
    PeerCch = I                            -> PeerCch := I, Empty; send RspI
  SnpData: as SnpCode, except that in the (M, Full, BiasFwd, BiasToI) row the forwarded data is DataC_E(PeerCch.Data)
  SnpInvOwn:
    PeerCch = M, Full, BiasFwd   -> PeerCch := I, Empty; send RspFwdI; DataC_M(PeerCch.Data) to Msg.Req
    PeerCch = M, Full, !BiasFwd  -> PeerCch := I, Empty; send RspIWb; WbIData(PeerCch.Data) to Home
    PeerCch = M, Partial         -> PeerCch := I, Empty; send RspIWb; WbIDataPtl(PeerCch.Data, PeerCch.Mask) to Home
    PeerCch = E, Full            -> PeerCch := I, Empty; send RspFwdI; DataC_E(PeerCch.Data) to Msg.Req
    PeerCch = E, Empty           -> PeerCch := I, Empty; send RspI
    PeerCch = F, S, or I         -> PeerCch := I, Empty; send RspI
  SnpInvItoE:
    PeerCch = M, Full            -> PeerCch := I, Empty; send RspIWb; WbIData(PeerCch.Data) to Home
    PeerCch = M, Partial         -> PeerCch := I, Empty; send RspIWb; WbIDataPtl(PeerCch.Data, PeerCch.Mask) to Home
    PeerCch = E, F, S, or I      -> PeerCch := I, Empty; send RspI
  SnpCur:
    PeerCch = M, Full, BiasFwd             -> send RspFwd; DataC_I(PeerCch.Data) to Msg.Req (cache state unchanged)
    PeerCch = M, Full, !BiasFwd, BiasToI   -> PeerCch := I, Empty; send RspIWb; WbIData(PeerCch.Data) to Home
    PeerCch = M, Full, !BiasFwd, !BiasToI  -> PeerCch := S; send RspSWb; WbSData(PeerCch.Data) to Home
    PeerCch = M, Partial                   -> PeerCch := I, Empty; send RspIWb; WbIDataPtl(PeerCch.Data, PeerCch.Mask) to Home
    PeerCch = E, Full                      -> send RspFwd; DataC_I(PeerCch.Data) to Msg.Req
    PeerCch = E, Empty                     -> PeerCch := I, Empty; send RspI
    PeerCch = F                            -> send RspFwd; DataC_I(PeerCch.Data) to Msg.Req
    PeerCch = S                            -> send RspS
    PeerCch = I                            -> PeerCch := I, Empty; send RspI

This is the action of a caching agent processing an incoming snoop that does not conflict with any outgoing request in its ORB. The two parameters BiasFwd and BiasToI select among alternative responses to SnpCode, SnpData, SnpInvOwn, and SnpCur when the snoop is received in cache state M. An implementation is free to use any of these alternatives, and the choice need not be consistent from agent to agent (in fact, an agent can even make a different choice each time this table is exercised).
PARAMETERS
  ReqAddr : ADDR;  ReqNid : NID;  ReqTid : TID;  PeerNid : NID;  BiasFwd : BOOLEAN;  BiasToI : BOOLEAN

ALIASES
  Msg        : Sta.TxnNet[ReqNid][ReqTid].SnpMsg[PeerNid]
  PeerCch    : Sta.Cch[PeerNid][ReqAddr]
  NxtPeerCch : NxtSta.Cch[PeerNid][ReqAddr]

COLUMN |Current State|PeerOrb miss|
  |TRUE| => OrbMiss(Sta, ReqAddr, PeerNid)

COLUMN |Current State|Msg|Cmd|
  |SnpCode|    => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd = SNP_SnpCode
  |SnpData|    => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd = SNP_SnpData
  |SnpInvOwn|  => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd = SNP_SnpInvOwn
  |SnpInvItoE| => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd = SNP_SnpInvItoE
  |SnpCur|     => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd = SNP_SnpCur

COLUMN |Current State|PeerCch|State|
  |M|          => PeerCch.State in {CCH_M}
  |E|          => PeerCch.State in {CCH_E}
  |F|          => PeerCch.State in {CCH_F}
  |S|          => PeerCch.State in {CCH_S}
  |I|          => PeerCch.State in {CCH_I}
  |F, S, I|    => PeerCch.State in {CCH_F, CCH_S, CCH_I}
  |E, F, S, I| => PeerCch.State in {CCH_E, CCH_F, CCH_S, CCH_I}

COLUMN |Current State|PeerCch|Mask|
  |Full|    => MaskFull(PeerCch.Mask)
  |Empty|   => MaskEmpty(PeerCch.Mask)
  |Partial| => MaskPartial(PeerCch.Mask)

COLUMN |Current State|BiasFwd|
  |TRUE|  => BiasFwd = TRUE
  |FALSE| => BiasFwd = FALSE

COLUMN |Current State|BiasToI|
  |TRUE|  => BiasToI = TRUE
  |FALSE| => BiasToI = FALSE

COLUMN |Next State|PeerCch|State|
  |S| => NxtPeerCch.State := CCH_S
  |I| => NxtPeerCch.State := CCH_I

COLUMN |Next State|PeerCch|Mask|
  |Empty| => SetMaskEmpty(NxtPeerCch.Mask); UndefineData(NxtPeerCch.Data)

COLUMN |Next State|HomeNet|Send to Home|
  |RspI|      => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspI)
  |RspS|      => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspS)
  |RspIWb|    => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspIWb)
  |RspSWb|    => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspSWb)
  |RspFwd|    => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspFwd)
  |RspFwdI|   => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspFwdI)
  |RspFwdS|   => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspFwdS)
  |RspFwdIWb| => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspFwdIWb)
  |RspFwdSWb| => SendHomeRsp(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspFwdSWb)

COLUMN |Next State|TxnNet|Send to Msg.Req|
  |DataC_M(PeerCch.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_M, PeerCch.Data)
  |DataC_E(PeerCch.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_E, PeerCch.Data)
  |DataC_F(PeerCch.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_F, PeerCch.Data)
  |DataC_I(PeerCch.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_I, PeerCch.Data)

COLUMN |Next State|TxnNet|Send to Home|
  |WbIData(PeerCch.Data)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbIData, ReqAddr, PeerCch.Mask, PeerCch.Data)
  |WbSData(PeerCch.Data)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbSData, ReqAddr, PeerCch.Mask, PeerCch.Data)
  |WbIDataPtl(PeerCch.Data, PeerCch.Mask)| => SendTxnWbD(NxtSta, Sta, ReqNid, ReqTid, WBD_WbIDataPtl, ReqAddr, PeerCch.Mask, PeerCch.Data)

COLUMN |Next State|TxnNet|Msg|
  |Remove| => RecvTxnSnp(NxtSta, Sta, ReqNid, ReqTid, PeerNid)
Table F-10. Action CacheSnpOrbHit
Current State / Next State: PeerOrb hit Msg PeerOrb PeerCch PeerOrb PeerCch HomeNet TxnNet / Cmd State State Mask Cnflt State Mask Send to Home Msg
TRUE SnpCode, SnpData SentReq M TRUE RspCnfltOwn Remove F, S S RspCnflt E Full Empty I Empty I RcvdData RcvdCmp SentAck Buffer
SnpInvOwn, SnpInvItoE SentReq M TRUE RspCnfltOwn Remove E, F, S, I I Empty RspCnflt RcvdData RcvdCmp SentAck Buffer
SnpCur SentReq M TRUE RspCnfltOwn Remove E, F, S, I RspCnflt RcvdData RcvdCmp SentAck Buffer
This is the action of a caching agent processing an incoming snoop that conflicts with an outgoing request in ORB. The txn id of the outgoing request is sent to the home agent in the From field of RspCnflt*. If a RspCnflt is sent, the cache state (which must not be M) is downgraded to the “highest” possible state allowed by the snoop type; more aggressive cache state downgrading can be modeled by following this action with a CacheNewReqInt action with an appropriate IntCmd (see Table F-4). If a RspCnfltOwn is sent, the cache state (which must be M) is unchanged; the RspCnfltOwn will guarantee that the outgoing request wins the conflict race. Note that a snoop that is received after a Data* response is received (i.e., in the RcvdData state) does not change the cache state, and that a snoop received after the caching agent has sent an AckCnflt is not processed at all until the ORB entry is deallocated.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID PeerNid: NID PeerTid: TID
ALIASES Msg: Sta.TxnNet[ReqNid][ReqTid].SnpMsg[PeerNid] PeerCch: Sta.Cch[PeerNid][ReqAddr] PeerOrb: Sta.Orb[PeerNid][PeerTid] NxtPeerCch: NxtSta.Cch[PeerNid][ReqAddr] NxtPeerOrb: NxtSta.Orb[PeerNid][PeerTid]
COLUMN |Current State|PeerOrb hit| |TRUE| => OrbHit(Sta, ReqAddr, PeerNid, PeerTid)
COLUMN |Current State|Msg|Cmd| |SnpCode, SnpData| => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd in {SNP_SnpCode, SNP_SnpData} |SnpInvOwn, SnpInvItoE| => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd in {SNP_SnpInvOwn, SNP_SnpInvItoE} |SnpCur| => PendTxnSnp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid) & Msg.Cmd in {SNP_SnpCur}
COLUMN |Current State|PeerOrb|State| |SentReq| => PeerOrb.State = ORB_SentReq |RcvdData| => PeerOrb.State = ORB_RcvdData |RcvdCmp| => PeerOrb.State = ORB_RcvdCmp |SentAck| => PeerOrb.State = ORB_SentAck
COLUMN |Current State|PeerCch|State| |M| => PeerCch.State in {CCH_M} |E| => PeerCch.State in {CCH_E} |I| => PeerCch.State in {CCH_I} |F, S| => PeerCch.State in {CCH_F, CCH_S} |E, F, S, I| => PeerCch.State in {CCH_E, CCH_F, CCH_S, CCH_I}
COLUMN |Current State|PeerCch|Mask| |Full| => MaskFull(PeerCch.Mask) |Empty| => MaskEmpty(PeerCch.Mask)
COLUMN |Next State|PeerOrb|Cnflt| |TRUE| => NxtPeerOrb.Cnflt:= TRUE
COLUMN |Next State|PeerCch|State| |S| => NxtPeerCch.State:= CCH_S |I| => NxtPeerCch.State:= CCH_I
COLUMN |Next State|PeerCch|Mask| |Empty| => SetMaskEmpty(NxtPeerCch.Mask); UndefineData(NxtPeerCch.Data)
COLUMN |Next State|HomeNet|Send to Home| |RspCnflt| => SendHomeRspFrom(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspCnflt, PeerTid) |RspCnfltOwn| => SendHomeRspFrom(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid, HOME_RspCnfltOwn, PeerTid)
COLUMN |Next State|TxnNet|Msg| |Remove| => RecvTxnSnp(NxtSta, Sta, ReqNid, ReqTid, PeerNid) |Buffer| => BuffTxnSnp(NxtSta, Sta, ReqNid, ReqTid, PeerNid)
Table F-11. Action HomeRecvReq
Current State / Next State: Msg ReqTrk HomeCoQ HomeNet / Cmd State Cmd NotOwn Addr Rcvd WbMark Msg.Req Msg
RdCode Busy RdCode FALSE Msg.Addr + {Msg.From} Remove
RdData RdData
RdInvOwn RdInvOwn
InvItoE InvItoE
RdCur RdCur TRUE
WbMtoI WbMtoI TRUE Enqueue
WbMtoS WbMtoS
WbMtoE WbMtoE FALSE
This is the action of a home agent receiving a request from a caching agent, which is pointed to by Msg.From. The NotOwn bit records whether the requestor is a “non-owner” in the sense that it cannot have or acquire a forwardable copy of the cache line; this is the only place the NotOwn bit is written. Note that a WbMto* is immediately enqueued onto CoQ.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID
ALIASES Msg: Sta.HomeNet[ReqNid][Sta.Home[ReqAddr]].Msg[ReqNid][ReqTid] ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] HomeCoQ: Sta.CoQ[Sta.Home[ReqAddr]][ReqAddr] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtHomeCoQ: NxtSta.CoQ[Sta.Home[ReqAddr]][ReqAddr]
COLUMN |Current State|Msg|Cmd| |RdCode| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RdCode |RdData| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RdData |RdInvOwn| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RdInvOwn |InvItoE| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_InvItoE |RdCur| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RdCur |WbMtoI| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_WbMtoI |WbMtoS| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_WbMtoS |WbMtoE| => PendHomeReq(Sta, ReqNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_WbMtoE
COLUMN |Next State|ReqTrk|State| |Busy| => NxtReqTrk.State:= TRK_Busy
COLUMN |Next State|ReqTrk|Cmd| |RdCode| => NxtReqTrk.Cmd:= EXT_RdCode |RdData| => NxtReqTrk.Cmd:= EXT_RdData |RdInvOwn| => NxtReqTrk.Cmd:= EXT_RdInvOwn |InvItoE| => NxtReqTrk.Cmd:= EXT_InvItoE |RdCur| => NxtReqTrk.Cmd:= EXT_RdCur |WbMtoI| => NxtReqTrk.Cmd:= EXT_WbMtoI |WbMtoS| => NxtReqTrk.Cmd:= EXT_WbMtoS |WbMtoE| => NxtReqTrk.Cmd:= EXT_WbMtoE
COLUMN |Next State|ReqTrk|NotOwn| |TRUE| => NxtReqTrk.NotOwn:= TRUE |FALSE| => NxtReqTrk.NotOwn:= FALSE
COLUMN |Next State|ReqTrk|Addr| |Msg.Addr| => NxtReqTrk.Addr:= Msg.Addr
COLUMN |Next State|ReqTrk|Rcvd| |+ {Msg.From}| => NxtReqTrk.Rcvd[ReqNid]:= TRUE
COLUMN |Next State|ReqTrk|WbMark| |TRUE| => NxtReqTrk.WbMark:= TRUE
COLUMN |Next State|HomeCoQ|Msg.Req| |Enqueue| => TxnEnqueue(NxtHomeCoQ, HomeCoQ, ReqNid, ReqTid)
COLUMN |Next State|HomeNet|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqAddr, ReqNid, ReqTid)
Table F-12. Action HomeRecvRsp
Current State / Next State: Msg ReqTrk ReqTrk PeerTrk HomeCoQ HomeNet / Cmd State = Busy and Msg.From in Rcvd State Rcvd WaitFwd Ifwd Shrd WbMark CnfltOwn Cnflt Cnflt Msg.Req Msg
RspFwdI FALSE Busy + {Msg.From} TRUE Enqueue Remove TRUE FALSE
RspFwdIWb FALSE Busy + {Msg.From} TRUE TRUE Enqueue TRUE FALSE
RspFwdS Busy + {Msg.From} TRUE TRUE Enqueue
RspFwdSWb Busy + {Msg.From} TRUE TRUE TRUE Enqueue
RspFwd Busy + {Msg.From} TRUE Enqueue
RspI FALSE Busy + {Msg.From} TRUE FALSE
RspIWb FALSE Busy + {Msg.From} TRUE Enqueue TRUE FALSE
RspS Busy + {Msg.From} TRUE
RspSWb Busy + {Msg.From} TRUE TRUE Enqueue
RspCnflt Busy + {Msg.From} TRUE + {Msg.From} + {Msg.Req}
RspCnfltOwn Busy + {Msg.From} TRUE + {Msg.From} + {Msg.Req}
This is the action of a home agent receiving a response from a caching agent (pointed to by Msg.From) that is generated by a snoop (Snp*) or a forward (Cmp_Fwd*). The two cases are distinguished by testing whether the requestor’s Tracker entry (ReqTrk) is Busy and has received a response from Msg.From already (see the second column of “Current State”). This test is not needed for Rsp*S* and RspCnflt*, since they cannot be generated by a forward. A response generated by a snoop makes ReqTrk become Busy and is recorded in Rcvd; a response generated by a forward resets the WaitFwd bit, which is set in Table F-14, “Action HomeRecvAckFwd”. A conflict (RspCnflt*) is registered in both the requestor’s and the conflictor’s Tracker entries, and RspCnfltOwn also sets the CnfltOwn bit, which ensures that this transaction will lose the conflict race to the transaction that sends the RspCnfltOwn and hence has an M copy (see Table F-10, “Action CacheSnpOrbHit” and Table F-16, “Action HomeSendDataCmp”). Note that since a RspCnflt may leave its sender’s cache in the S state (see Table F-10, “Action CacheSnpOrbHit”), the Shrd bit needs to be set when RspCnflt is received.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID PeerNid: NID
ALIASES Msg: Sta.HomeNet[PeerNid][Sta.Home[ReqAddr]].Msg[ReqNid][ReqTid] ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] PeerTrk: Sta.Trk[Sta.Home[ReqAddr]][PeerNid] HomeCoQ: Sta.CoQ[Sta.Home[ReqAddr]][ReqAddr] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtPeerTrk: NxtSta.Trk[Sta.Home[ReqAddr]][PeerNid] NxtHomeCoQ: NxtSta.CoQ[Sta.Home[ReqAddr]][ReqAddr]
COLUMN |Current State|Msg|Cmd| |RspFwdI| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspFwdI |RspFwdIWb| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspFwdIWb |RspFwdS| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspFwdS |RspFwdSWb| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspFwdSWb |RspFwd| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspFwd |RspI| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspI |RspIWb| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspIWb |RspS| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspS |RspSWb| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspSWb |RspCnflt| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspCnflt |RspCnfltOwn| => PendHomeRsp(Sta, PeerNid, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = HOME_RspCnfltOwn
COLUMN |Current State|ReqTrk|State = Busy and Msg.From in Rcvd| |TRUE| => (ReqTrk.State = TRK_Busy & ReqTrk.Rcvd[PeerNid] = TRUE) = TRUE |FALSE| => (ReqTrk.State = TRK_Busy & ReqTrk.Rcvd[PeerNid] = TRUE) = FALSE
COLUMN |Next State|ReqTrk|State| |Busy| => NxtReqTrk.State:= TRK_Busy
COLUMN |Next State|ReqTrk|Rcvd| |+ {Msg.From}| => NxtReqTrk.Rcvd[PeerNid]:= TRUE
COLUMN |Next State|ReqTrk|WaitFwd| |FALSE| => NxtReqTrk.WaitFwd:= FALSE
COLUMN |Next State|ReqTrk|Ifwd| |TRUE| => NxtReqTrk.Ifwd:= TRUE
COLUMN |Next State|ReqTrk|Shrd| |TRUE| => NxtReqTrk.Shrd:= TRUE
COLUMN |Next State|ReqTrk|WbMark| |TRUE| => NxtReqTrk.WbMark:= TRUE
COLUMN |Next State|ReqTrk|CnfltOwn| |TRUE| => NxtReqTrk.CnfltOwn:= TRUE
COLUMN |Next State|ReqTrk|Cnflt| |+ {Msg.From}| => NxtReqTrk.Cnflt[Msg.From.Nid][Msg.From.Tid]:= TRUE
COLUMN |Next State|PeerTrk|Cnflt| |+ {Msg.Req}| => NxtPeerTrk[Msg.From.Tid].Cnflt[ReqNid][ReqTid]:= TRUE
COLUMN |Next State|HomeCoQ|Msg.Req| |Enqueue| => TxnEnqueue(NxtHomeCoQ, HomeCoQ, ReqNid, ReqTid)
COLUMN |Next State|HomeNet|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, PeerNid, ReqAddr, ReqNid, ReqTid)
Table F-13. Action HomeRecvAckCmp
Current State / Next State: Msg ReqTrk No conflictor of ReqTrk All conflictors of Msg.Req TxnNet HomeNet / Cmd NotOwn Msg.Req has received Rsp* from Msg.Req Cnflt Cnflt CnfltOwn Send to Msg.Req Msg
AckCnflt TRUE Clear Remove Msg.Req and connect disconnected conflictors Cmp Remove FALSE TRUE FALSE
This is the action of a home agent receiving AckCnflt and responding with Cmp, which happens either when the home agent knows that the source of AckCnflt (= Msg.Req) cannot have a forwardable copy of the cache line because the NotOwn bit is set (see Table F-11, “Action HomeRecvReq”), or when none of Msg.Req’s conflictors has received a snoop response from Msg.Req.
The latter case actually contains two sub-cases: either Msg.Req has a nonempty set of conflictors, in which case their snoops to Msg.Req would have been blocked because Msg.Req has sent AckCnflt (see Table F-10, “Action CacheSnpOrbHit”) and the Cmp will release them, or Msg.Req has no conflictors at all, in which case it obviously makes no sense to send a Cmp_Fwd* and a simple Cmp suffices. In addition to sending Cmp, the home also removes Msg.Req from the conflict graph (the first two columns of “Next State”) and resets the CnfltOwn bits of Msg.Req’s conflictors (if any).
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID
ALIASES Msg: Sta.HomeNet[ReqNid][Sta.Home[ReqAddr]].Msg[ReqNid][ReqTid] ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid]
COLUMN |Current State|Msg|Cmd| |AckCnflt| => PendHomeAck(Sta, ReqNid, ReqAddr, ReqNid, ReqTid)
COLUMN |Current State|ReqTrk|NotOwn| |TRUE| => ReqTrk.NotOwn = TRUE |FALSE| => ReqTrk.NotOwn = FALSE
COLUMN |Current State|No conflictor of Msg.Req has received Rsp* from Msg.Req| |TRUE| => TrkCnfltNoRsp(Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|ReqTrk|Cnflt| |Clear| => TrkCnfltClearSelf(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|All conflictors of Msg.Req|Cnflt| |Remove Msg.Req and connect disconnected conflictors| => TrkCnfltClearOther(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|All conflictors of Msg.Req|CnfltOwn| |FALSE| => TrkCnfltClearOwn(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|TxnNet|Send to Msg.Req| |Cmp| => SendTxnCmp(NxtSta, Sta, ReqNid, ReqTid, CMP_Cmp)
COLUMN |Next State|HomeNet|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqAddr, ReqNid, ReqTid)
Table F-14. Action HomeRecvAckFwd
Current State / Next State: Msg ReqTrk Peer is a conflictor of Msg.Req and has received Rsp* from Msg.Req PeerTrk ReqTrk All conflictors of Msg.Req PeerTrk TxnNet HomeNet / Cmd NotOwn Cmd Cnflt Cnflt CnfltOwn WaitFwd Send to Msg.Req to forward to Peer Msg
AckCnflt FALSE TRUE RdCode Clear Remove Msg.Req and connect disconnected conflictors FALSE TRUE Cmp_FwdCode Remove RdData RdInvOwn Cmp_FwdInvOwn InvItoE Cmp_FwdInvItoE RdCur None
This is the action of a home agent receiving AckCnflt and responding with Cmp_Fwd* on behalf of a peer conflictor, which happens when both the NotOwn bit is not set and there exists at least one peer conflictor which has received a Rsp* from the source of AckCnflt (= Msg.Req). Any such conflictor may be chosen and a Cmp_Fwd* of the appropriate type is sent to Msg.Req (known as the “owner”) on its behalf. If the chosen conflictor’s request has not reached the home (i.e., when PeerTrk.Cmd = None), a Cmp_FwdInvItoE is sent, which is always safe to use when a Cmp_Fwd* is called for. In addition, the home also removes Msg.Req from the conflict graph (the first two columns of “Next State”), resets the CnfltOwn bits of Msg.Req’s conflictors (if any), and sets the chosen conflictor’s WaitFwd bit. This and the previous action (Table F-13, “Action HomeRecvAckCmp”) are the only places where the NotOwn bit is read.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID PeerNid: NID PeerTid: TID
ALIASES Msg: Sta.HomeNet[ReqNid][Sta.Home[ReqAddr]].Msg[ReqNid][ReqTid] ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] PeerTrk: Sta.Trk[Sta.Home[ReqAddr]][PeerNid][PeerTid] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtPeerTrk: NxtSta.Trk[Sta.Home[ReqAddr]][PeerNid][PeerTid]
COLUMN |Current State|Msg|Cmd| |AckCnflt| => PendHomeAck(Sta, ReqNid, ReqAddr, ReqNid, ReqTid)
COLUMN |Current State|ReqTrk|NotOwn| |FALSE| => ReqTrk.NotOwn = FALSE
COLUMN |Current State|Peer is a conflictor of Msg.Req and has received Rsp* from Msg.Req| |TRUE| => TrkCnfltSomeRsp(Sta, ReqAddr, ReqNid, ReqTid, PeerNid, PeerTid)
COLUMN |Current State|PeerTrk|Cmd| |RdCode| => PeerTrk.Cmd = EXT_RdCode |RdData| => PeerTrk.Cmd = EXT_RdData |RdInvOwn| => PeerTrk.Cmd = EXT_RdInvOwn |InvItoE| => PeerTrk.Cmd = EXT_InvItoE |RdCur| => PeerTrk.Cmd = EXT_RdCur |None| => PeerTrk.Cmd = EXT_None
COLUMN |Next State|ReqTrk|Cnflt| |Clear| => TrkCnfltClearSelf(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|All conflictors of Msg.Req|Cnflt| |Remove Msg.Req and connect disconnected conflictors| => TrkCnfltClearOther(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|All conflictors of Msg.Req|CnfltOwn| |FALSE| => TrkCnfltClearOwn(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|PeerTrk|WaitFwd| |TRUE| => NxtPeerTrk.WaitFwd:= TRUE
COLUMN |Next State|TxnNet|Send to Msg.Req to forward to Peer| |Cmp_FwdCode| => SendTxnCmpFwd(NxtSta, Sta, ReqNid, ReqTid, CMP_Cmp_FwdCode, PeerNid, PeerTid) |Cmp_FwdInvOwn| => SendTxnCmpFwd(NxtSta, Sta, ReqNid, ReqTid, CMP_Cmp_FwdInvOwn, PeerNid, PeerTid) |Cmp_FwdInvItoE| => SendTxnCmpFwd(NxtSta, Sta, ReqNid, ReqTid, CMP_Cmp_FwdInvItoE, PeerNid, PeerTid)
COLUMN |Next State|HomeNet|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqAddr, ReqNid, ReqTid)
Table F-15. Action HomeRecvWbData
Current State / Next State: Msg ReqTrk HomeMem TxnNet / Cmd WbData Data Msg
WbIData TRUE Msg.Data Remove WbSData WbEData WbIDataPtl ViaMask(Msg.Data, Msg.Mask)
This is the action of a home agent receiving Wb*Data and Wb*DataPtl. It is the simplest of all actions. ViaMask means that only those words in Msg.Data qualified by Msg.Mask are written into memory.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID
ALIASES Msg: Sta.TxnNet[ReqNid][ReqTid].WbDMsg ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] HomeMem: Sta.Mem[Sta.Home[ReqAddr]][ReqAddr] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtHomeMem: NxtSta.Mem[Sta.Home[ReqAddr]][ReqAddr]
COLUMN |Current State|Msg|Cmd| |WbIData| => PendTxnWbD(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = WBD_WbIData |WbSData| => PendTxnWbD(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = WBD_WbSData |WbEData| => PendTxnWbD(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = WBD_WbEData |WbIDataPtl| => PendTxnWbD(Sta, ReqAddr, ReqNid, ReqTid) & Msg.Cmd = WBD_WbIDataPtl
COLUMN |Next State|ReqTrk|WbData| |TRUE| => NxtReqTrk.WbData:= TRUE
COLUMN |Next State|HomeMem|Data| |Msg.Data| => CopyData(NxtHomeMem.Data, Msg.Data) |ViaMask(Msg.Data, Msg.Mask)| => CopyDataViaMask(NxtHomeMem.Data, Msg.Data, Msg.Mask)
COLUMN |Next State|TxnNet|Msg| |Remove| => RecvTxnWbD(NxtSta, Sta, ReqNid, ReqTid)
Table F-16. Action HomeSendDataCmp
Current State / Next State: HomeCoQ ReqTrk No conflictor ReqTrk TxnNet HomeCoQ / Empty or Head = Req Cmd WbMark => WbData All Rcvd CnfltOwn WaitFwd Shrd Ifwd Cnflt = Empty of Req is Idle or WaitFwd All except Cnflt and NotOwn Send Data to Req Send Cmp to Req
TRUE WbMto* TRUE Clear Cmp Dequeue if not Empty
RdCode TRUE FALSE FALSE TRUE TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE DataC_F(HomeMem.Data) Cmp FALSE FrcAckCnflt
RdData TRUE TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE DataC_F(HomeMem.Data) Cmp FALSE FrcAckCnflt FALSE TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE DataC_E(HomeMem.Data) Cmp FALSE FrcAckCnflt
RdInvOwn TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE DataC_E(HomeMem.Data) Cmp FALSE FrcAckCnflt
InvItoE TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE GntE Cmp FALSE FrcAckCnflt
RdCur TRUE TRUE Cmp FALSE FrcAckCnflt FALSE TRUE DataC_I(HomeMem.Data) Cmp FALSE FrcAckCnflt
This is the action of a home agent selecting a “ready” transaction of a given address (ReqAddr) to complete by clearing its Tracker entry (ReqTrk) and sending Cmp/FrcAckCnflt and (possibly) Data*/Gnt* messages to its source (Req). If the CoQ for ReqAddr is empty, then any “ready” transaction to ReqAddr can be selected; otherwise, the head of CoQ must be selected before all other transactions to ReqAddr. A WbMto* transaction is “ready” once the writeback data has been received; a Cmp response is sent to it. A non-WbMto* transaction is “ready” only after several conditions are met: both its request and all its snoop responses have been received (All Rcvd); if there is any writeback then the data has been received (WbMark => WbData); it is not blocked by a conflictor with buried M (CnfltOwn) or itself waiting for the response to a Cmp_Fwd* (WaitFwd); and none of its conflictors is Idle (which means that the conflictor is waiting for an AckCnflt) or waiting for the response to a Cmp_Fwd*. The last condition is to ensure that if a transaction has an “owner” conflictor, the conflictor is queried (by Cmp_Fwd*) before the transaction can become “ready”. All fields of the Tracker entry of the selected “ready” transaction (ReqTrk) are cleared except the conflict list and NotOwn bit; the former, if not already cleared, will be cleared when AckCnflt is received (see Table F-13, “Action HomeRecvAckCmp” and Table F-14, “Action HomeRecvAckFwd”); the latter is needed if and when AckCnflt is received and will be overwritten when ReqTrk is occupied by a new request (see Table F-11, “Action HomeRecvReq”). If the selected “ready” transaction has not received an implicit forward response (Ifwd), a Data*/Gnt* + Cmp/FrcAckCnflt response is sent to it; otherwise, only a Cmp/FrcAckCnflt response is sent to it. The type of Data*/Gnt* response is determined by the transaction type and (in the RdData case) whether a Rsp*S* or RspCnflt has been received (see Table F-12, “Action HomeRecvRsp”). Whether Cmp or FrcAckCnflt response is sent depends on whether there are no remaining conflictors (Cnflt = Empty). Note that there is no need to send FrcAckCnflt to WbMto* even if there are remaining conflictors, because WbMto* does not snoop and hence can have conflictors only if it has itself sent RspCnflt*, in which case Cmp will trigger AckCnflt just like FrcAckCnflt (see Table F-7, “Action CacheRecvCmp”).
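The readiness conditions enumerated above can be compressed into a single predicate. The following C fragment is an illustrative sketch only; the structure and field names are hypothetical and not part of CSI-CRM. It restates the conditions for one Tracker entry, with the CoQ empty-or-head check assumed to be performed separately.

    #include <stdbool.h>

    /* Hypothetical flattened view of one Tracker entry; field names mirror
       the p-table columns above, not actual CSI-CRM declarations. */
    typedef struct {
        bool is_wbmto;                 /* ReqTrk.Cmd is WbMtoI/WbMtoS/WbMtoE */
        bool all_rcvd;                 /* request and all snoop responses received */
        bool wbmark;                   /* a writeback is expected */
        bool wbdata;                   /* the writeback data has arrived */
        bool cnflt_own;                /* blocked by a conflictor with buried M */
        bool wait_fwd;                 /* waiting for the response to a Cmp_Fwd* */
        bool no_idle_or_waitfwd_cnflt; /* no conflictor is Idle or WaitFwd */
    } trk_view;

    /* Readiness test described above (CoQ empty-or-head checked separately). */
    static bool trk_ready(const trk_view *t)
    {
        if (t->is_wbmto)
            return t->wbdata;              /* WbMto*: ready once data received */
        return t->all_rcvd
            && (!t->wbmark || t->wbdata)   /* WbMark => WbData */
            && !t->cnflt_own
            && !t->wait_fwd
            && t->no_idle_or_waitfwd_cnflt;
    }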
Note also that, in CSI-IAM, although Data*/Gnt* and Cmp/FrcAckCnflt can be sent together (as in this action), they are always received separately (see Table F-6, “Action CacheRecvData” and Table F-7, “Action CacheRecvCmp”) and the combined versions of these messages are not modeled. No generality is lost, since receiving the two messages together can be modeled by receiving them back-to-back in either order. We choose this way of modeling because Data*/Gnt* and Cmp/FrcAckCnflt are sent and received separately in the implicit forward case anyway, and modeling the combined versions as well only adds superfluous details.
PARAMETERS ReqAddr: ADDR ReqNid: NID ReqTid: TID
ALIASES HomeMem: Sta.Mem[Sta.Home[ReqAddr]][ReqAddr] HomeCoQ: Sta.CoQ[Sta.Home[ReqAddr]][ReqAddr] ReqTrk: Sta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid] NxtHomeCoQ: NxtSta.CoQ[Sta.Home[ReqAddr]][ReqAddr] NxtReqTrk: NxtSta.Trk[Sta.Home[ReqAddr]][ReqNid][ReqTid]
COLUMN |Current State|HomeCoQ|Empty or Head = Req| |TRUE| => (TxnQueueEmpty(HomeCoQ) | TxnQueueHead(HomeCoQ, ReqNid, ReqTid))
COLUMN |Current State|ReqTrk|Cmd| |WbMto*| => IsExtWb(ReqTrk.Cmd) & ReqTrk.Addr = ReqAddr |RdCode| => ReqTrk.Cmd = EXT_RdCode & ReqTrk.Addr = ReqAddr |RdData| => ReqTrk.Cmd = EXT_RdData & ReqTrk.Addr = ReqAddr |RdInvOwn| => ReqTrk.Cmd = EXT_RdInvOwn & ReqTrk.Addr = ReqAddr |InvItoE| => ReqTrk.Cmd = EXT_InvItoE & ReqTrk.Addr = ReqAddr |RdCur| => ReqTrk.Cmd = EXT_RdCur & ReqTrk.Addr = ReqAddr
COLUMN |Current State|ReqTrk|WbMark => WbData| |TRUE| => (ReqTrk.WbMark = FALSE | ReqTrk.WbData = TRUE)
COLUMN |Current State|ReqTrk|All Rcvd| |TRUE| => TrkAllRcvd(Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Current State|ReqTrk|CnfltOwn| |FALSE| => ReqTrk.CnfltOwn = FALSE
COLUMN |Current State|ReqTrk|WaitFwd| |FALSE| => ReqTrk.WaitFwd = FALSE
COLUMN |Current State|ReqTrk|Shrd| |TRUE| => ReqTrk.Shrd = TRUE |FALSE| => ReqTrk.Shrd = FALSE
COLUMN |Current State|ReqTrk|Ifwd| |TRUE| => ReqTrk.Ifwd = TRUE |FALSE| => ReqTrk.Ifwd = FALSE
COLUMN |Current State|ReqTrk|Cnflt = Empty| |TRUE| => TrkCnfltEmpty(Sta, ReqAddr, ReqNid, ReqTid) = TRUE |FALSE| => TrkCnfltEmpty(Sta, ReqAddr, ReqNid, ReqTid) = FALSE
COLUMN |Current State|No conflictor of Req is Idle or WaitFwd| |TRUE| => TrkCnfltNoIdleOrWaitFwd(Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|ReqTrk|All except Cnflt and NotOwn| |Clear| => TrkClear(NxtSta, Sta, ReqAddr, ReqNid, ReqTid)
COLUMN |Next State|TxnNet|Send Data to Req| |DataC_E(HomeMem.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_E, HomeMem.Data) |DataC_F(HomeMem.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_F, HomeMem.Data) |DataC_I(HomeMem.Data)| => SendTxnData(NxtSta, Sta, ReqNid, ReqTid, DATA_DataC_I, HomeMem.Data) |GntE| => SendTxnGnt(NxtSta, Sta, ReqNid, ReqTid, DATA_GntE)
COLUMN |Next State|TxnNet|Send Cmp to Req| |Cmp| => SendTxnCmp(NxtSta, Sta, ReqNid, ReqTid, CMP_Cmp) |FrcAckCnflt| => SendTxnCmp(NxtSta, Sta, ReqNid, ReqTid, CMP_FrcAckCnflt)
COLUMN |Next State|HomeCoQ| |Dequeue if not Empty| => if (!TxnQueueEmpty(HomeCoQ)) then TxnDequeue(NxtHomeCoQ, HomeCoQ) end
Utility Sub-Routines
The utility subroutines used by the semantic mappings are listed below. Please read the interspersed comments for their meanings. Note that many assertions are inserted to detect errors.
For instance, whenever a message is sent (respectively, received), an assertion checks whether the message already exists (resp., the message does not exist) in the network and, if so, raises an error flag. Also note that, whenever the State or Cmd field of a record is set to None, its other fields are “cleared” by being assigned don’t-care values.
-- In the following, (n,i), (p,j), etc. denote txn ids.
-- Subroutines for handling external commands.
-- Check whether external command c is a WbMto*.
function IsExtWb(c: EXT_CMD): BOOLEAN; return (c = EXT_WbMtoI | c = EXT_WbMtoS | c = EXT_WbMtoE); endfunction;
-- Subroutines for handling data and masks.
-- Check whether mask M is full.
function MaskFull(M: MASK): BOOLEAN; return (forall w: WIDX do M[w] = TRUE endforall); endfunction;
-- Check whether mask M is empty.
function MaskEmpty(M: MASK): BOOLEAN; return (forall w: WIDX do M[w] = FALSE endforall); endfunction;
-- Check whether mask M is partial.
function MaskPartial(M: MASK): BOOLEAN; return (exists w: WIDX do M[w] = TRUE endexists); endfunction;
-- Set mask to full.
procedure SetMaskFull(var M: MASK); for w: WIDX do M[w]:= TRUE; endfor; endprocedure;
-- Set mask to empty.
procedure SetMaskEmpty(var M: MASK); for w: WIDX do M[w]:= FALSE; endfor; endprocedure;
-- Undefine mask.
procedure UndefineMask(var M: MASK); for w: WIDX do undefine M[w]; endfor; endprocedure;
-- Undefine data.
procedure UndefineData(var D: DATA); for w: WIDX do undefine D[w]; endfor; endprocedure;
-- Copy mask.
procedure CopyMask(var M: MASK; M1: MASK); for w: WIDX do M[w]:= M1[w]; endfor; endprocedure;
-- Copy data.
procedure CopyData(var D: DATA; D1: DATA); for w: WIDX do D[w]:= D1[w]; endfor; endprocedure;
-- Copy data via a mask.
procedure CopyDataViaMask(var D: DATA; D1: DATA; M1: MASK); for w: WIDX do if (M1[w] = TRUE) then D[w]:= D1[w]; endif; endfor; endprocedure;
-- Subroutines for handling txn queues.
-- Check whether Q is empty.
function TxnQueueEmpty(Q: TXN_Q): BOOLEAN; return (Q.Cnt = 0); endfunction;
-- Check whether (n,i) is at the head of Q.
function TxnQueueHead(Q: TXN_Q; n: NID; i: TID): BOOLEAN; return (Q.Cnt > 0 & Q.Seq[0].Nid = n & Q.Seq[0].Tid = i); endfunction;
-- Check whether (n,i) is somewhere in Q.
function TxnQueueHas(Q: TXN_Q; n: NID; i: TID): BOOLEAN; return (exists k:= 0 to Q.Cnt - 1 do Q.Seq[k].Nid = n & Q.Seq[k].Tid = i endexists); endfunction;
-- Remove the head of Q.
procedure TxnDequeue(var NxtQ: TXN_Q; Q: TXN_Q); assert (Q.Cnt > 0) “TxnDequeue: Queue empty”; NxtQ.Cnt:= Q.Cnt - 1; for i:= 0 to Q.Cnt - 2 do NxtQ.Seq[i].Nid:= Q.Seq[i + 1].Nid; NxtQ.Seq[i].Tid:= Q.Seq[i + 1].Tid; endfor; undefine NxtQ.Seq[Q.Cnt - 1].Nid; undefine NxtQ.Seq[Q.Cnt - 1].Tid; endprocedure;
-- Add (n,i) to the tail of Q.
procedure TxnEnqueue(var NxtQ: TXN_Q; Q: TXN_Q; n: NID; i: TID); assert (Q.Cnt < NID_NUM * TID_NUM) “TxnEnqueue: Queue full”; NxtQ.Cnt:= Q.Cnt + 1; NxtQ.Seq[Q.Cnt].Nid:= n; NxtQ.Seq[Q.Cnt].Tid:= i; endprocedure;
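In an implementation, the per-word mask loops above would typically map onto bitmask operations. The following C sketch is illustrative only; it assumes WIDX_NUM is less than 64 so that a MASK fits in one machine word, and all names are hypothetical (not part of CSI-CRM).

    #include <stdbool.h>
    #include <stdint.h>

    #define WIDX_NUM 8                              /* words per cache line (assumed) */
    #define MASK_ALL ((UINT64_C(1) << WIDX_NUM) - 1)

    typedef uint64_t mask_t;                        /* bit w set <=> word w valid */

    static bool mask_full(mask_t m)    { return (m & MASK_ALL) == MASK_ALL; }
    static bool mask_empty(mask_t m)   { return (m & MASK_ALL) == 0; }
    static bool mask_partial(mask_t m) { return !mask_empty(m) && !mask_full(m); }

    /* Analogue of CopyDataViaMask: only words qualified by the mask are written. */
    static void copy_data_via_mask(uint64_t *dst, const uint64_t *src, mask_t m)
    {
        for (int w = 0; w < WIDX_NUM; w++)
            if (m & (UINT64_C(1) << w))
                dst[w] = src[w];
    }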
-- Subroutines for handling ORBs.
-- Clear the i-th entry of the ORB at node n.
procedure OrbClear(var NxtSta: STATE; Sta: STATE; n: NID; i: TID); assert (Sta.Orb[n][i].State!= ORB_None) “OrbClear: Already cleared”; NxtSta.Orb[n][i].State:= ORB_None; undefine NxtSta.Orb[n][i].Cmd; undefine NxtSta.Orb[n][i].Addr; undefine NxtSta.Orb[n][i].Cnflt; endprocedure;
-- Check whether the i-th entry of the ORB at node n is valid with address a
-- and is not WbMtoI.
function OrbHit(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.Orb[n][i].State!= ORB_None & Sta.Orb[n][i].Addr = a); endfunction;
-- Check whether the ORB at node n contains no valid entry of address a
-- (except for WbMtoI).
function OrbMiss(Sta: STATE; a: ADDR; n: NID): BOOLEAN; return (forall i: TID do Sta.Orb[n][i].State = ORB_None | Sta.Orb[n][i].Addr!= a endforall); endfunction;
-- Check whether the ORB at node n has its i-th entry empty and
-- there is no valid entry of address a in the ORB (except for WbMtoI).
function OrbAvail(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.Orb[n][i].State = ORB_None & OrbMiss(Sta, a, n)); endfunction;
-- Subroutines for handling Trackers.
-- Clear the (n,i)-th entry of the Tracker at address a’s home node.
procedure TrkClear(var NxtSta: STATE; Sta: STATE; a: ADDR; n: NID; i: TID); assert (Sta.Trk[Sta.Home[a]][n][i].State!= TRK_Idle) “TrkClear: Already cleared”; NxtSta.Trk[Sta.Home[a]][n][i].State:= TRK_Idle; NxtSta.Trk[Sta.Home[a]][n][i].Cmd:= EXT_None; undefine NxtSta.Trk[Sta.Home[a]][n][i].Addr; NxtSta.Trk[Sta.Home[a]][n][i].WaitFwd:= FALSE; NxtSta.Trk[Sta.Home[a]][n][i].Shrd:= FALSE; NxtSta.Trk[Sta.Home[a]][n][i].Ifwd:= FALSE; NxtSta.Trk[Sta.Home[a]][n][i].WbMark:= FALSE; NxtSta.Trk[Sta.Home[a]][n][i].WbData:= FALSE; NxtSta.Trk[Sta.Home[a]][n][i].CnfltOwn:= FALSE; for p: NID do NxtSta.Trk[Sta.Home[a]][n][i].Rcvd[p]:= FALSE; endfor; endprocedure;
-- Check whether the (n,i)-th entry of the Tracker at address a’s home node
-- has received all request/responses.
function TrkAllRcvd(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (forall p: NID do Sta.Trk[Sta.Home[a]][n][i].Rcvd[p] = TRUE endforall); endfunction;
-- Clear the conflict list of (n,i) of address a.
procedure TrkCnfltClearSelf(var NxtSta: STATE; Sta: STATE; a: ADDR; n: NID; i: TID); for p: NID do for j: TID do if (Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j]) then NxtSta.Trk[Sta.Home[a]][n][i].Cnflt[p][j]:= FALSE; endif; endfor; endfor; endprocedure;
-- Clear (n,i) of address a from its conflictor’s conflict lists:
-- (1) Remove (n,i) from all its conflictor’s conflict lists.
-- (2) Connect all pairs of conflictors of (n,i).
procedure TrkCnfltClearOther(var NxtSta: STATE; Sta: STATE; a: ADDR; n: NID; i: TID); for p: NID do for j: TID do if (Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j]) then NxtSta.Trk[Sta.Home[a]][p][j].Cnflt[n][i]:= FALSE; for q: NID do for k: TID do if (Sta.Trk[Sta.Home[a]][n][i].Cnflt[q][k] &!(q = p & k = j)) then NxtSta.Trk[Sta.Home[a]][p][j].Cnflt[q][k]:= TRUE; endif; endfor; endfor; endif; endfor; endfor; endprocedure;
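The effect of TrkCnfltClearOther on the conflict graph (remove the departing transaction, then connect all pairs of its former conflictors) can be seen more directly on an adjacency matrix. The C sketch below is illustrative only; it flattens (nid, tid) pairs into a single index and uses hypothetical names.

    #include <stdbool.h>

    #define TXN_NUM 8   /* total (nid, tid) pairs, flattened (assumed) */

    /* cnflt[x][y] == true <=> transaction y is on x's conflict list. */
    static bool cnflt[TXN_NUM][TXN_NUM];

    /* Sketch of TrkCnfltClearOther: remove transaction t from its
       conflictors' lists, then connect all pairs of t's conflictors. */
    static void clear_other(int t)
    {
        for (int p = 0; p < TXN_NUM; p++) {
            if (!cnflt[t][p])
                continue;
            cnflt[p][t] = false;                 /* (1) remove t from p's list */
            for (int q = 0; q < TXN_NUM; q++)    /* (2) reconnect the others   */
                if (cnflt[t][q] && q != p)
                    cnflt[p][q] = true;
        }
    }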
-- Clear CnfltOwn of the conflictors of (n,i) of address a.
procedure TrkCnfltClearOwn(var NxtSta: STATE; Sta: STATE; a: ADDR; n: NID; i: TID); for p: NID do for j: TID do if (Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j]) then NxtSta.Trk[Sta.Home[a]][p][j].CnfltOwn:= FALSE; endif; endfor; endfor; endprocedure;
-- Check whether the (n,i)-th entry of the Tracker at address a’s home node
-- has an empty conflict list.
function TrkCnfltEmpty(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (forall p: NID do forall j: TID do Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j] = FALSE endforall endforall); endfunction;
-- Check whether the (n,i)-th entry of the Tracker at address a’s home node
-- has no conflictor whose Tracker entry is Idle or WaitFwd.
function TrkCnfltNoIdleOrWaitFwd(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (forall p: NID do forall j: TID do Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j] = FALSE | !(Sta.Trk[Sta.Home[a]][p][j].State = TRK_Idle | Sta.Trk[Sta.Home[a]][p][j].WaitFwd = TRUE) endforall endforall); endfunction;
-- Check whether the (n,i)-th entry of the Tracker at address a’s home node
-- has no conflictor which has received a response from n.
function TrkCnfltNoRsp(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (forall p: NID do forall j: TID do Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j] = FALSE | Sta.Trk[Sta.Home[a]][p][j].Rcvd[n] = FALSE endforall endforall); endfunction;
-- Check whether the (n,i)-th entry of the Tracker at address a’s home node
-- has a conflictor (p,j) which has received a response from n.
function TrkCnfltSomeRsp(Sta: STATE; a: ADDR; n: NID; i: TID; p: NID; j: TID): BOOLEAN; return (Sta.Trk[Sta.Home[a]][n][i].Cnflt[p][j] = TRUE & Sta.Trk[Sta.Home[a]][p][j].Rcvd[n] = TRUE); endfunction;
-- Subroutines for handling snoop channels.
-- Check whether there is a snoop of address a from (n,i) pending at node p.
function PendTxnSnp(Sta: STATE; a: ADDR; n: NID; i: TID; p: NID): BOOLEAN; return (Sta.TxnNet[n][i].SnpMsg[p].Cmd!= SNP_None & Sta.TxnNet[n][i].SnpMsg[p].Addr = a); endfunction;
-- Receive a snoop from (n,i) pending at node p.
procedure RecvTxnSnp(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; p: NID); assert (Sta.TxnNet[n][i].SnpMsg[p].Cmd!= SNP_None) “RecvTxnSnp: No msg to receive”; NxtSta.TxnNet[n][i].SnpMsg[p].Cmd:= SNP_None; undefine NxtSta.TxnNet[n][i].SnpMsg[p].Addr; endprocedure;
-- Buffer a snoop from (n,i) pending at node p.
procedure BuffTxnSnp(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; p: NID); assert (Sta.TxnNet[n][i].SnpMsg[p].Cmd!= SNP_None) “BuffTxnSnp: No msg to buffer”; endprocedure;
-- Send snoops of address a for (n,i) to all of n’s peers.
procedure SendTxnSnps(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: SNP_CMD; a: ADDR); assert (forall p: NID do p = n | Sta.TxnNet[n][i].SnpMsg[p].Cmd = SNP_None endforall) “SendTxnSnps: Msg already exists”; for p: NID do if (p!= n) then NxtSta.TxnNet[n][i].SnpMsg[p].Cmd:= c; NxtSta.TxnNet[n][i].SnpMsg[p].Addr:= a; endif; endfor; endprocedure;
-- Subroutines for handling data response channels.
-- Check whether there is a Data*/Gnt* response of address a to (n,i) pending.
function PendTxnData(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.TxnNet[n][i].DataMsg.Cmd!= DATA_None & Sta.Orb[n][i].Addr = a); endfunction;
-- Receive a Data*/Gnt* response to (n,i).
procedure RecvTxnData(var NxtSta: STATE; Sta: STATE; n: NID; i: TID); assert (Sta.TxnNet[n][i].DataMsg.Cmd!= DATA_None) “RecvTxnData: No msg to receive”; NxtSta.TxnNet[n][i].DataMsg.Cmd:= DATA_None; UndefineData(NxtSta.TxnNet[n][i].DataMsg.Data); endprocedure;
-- Send a Data* response to (n,i).
procedure SendTxnData(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: DATA_CMD; d: DATA); assert (Sta.TxnNet[n][i].DataMsg.Cmd = DATA_None) “SendTxnData: Msg already exists”; NxtSta.TxnNet[n][i].DataMsg.Cmd:= c; if (c = DATA_DataC_I) then UndefineData(NxtSta.TxnNet[n][i].DataMsg.Data); else CopyData(NxtSta.TxnNet[n][i].DataMsg.Data, d); endif; endprocedure;
-- Send a Gnt* response to (n,i).
procedure SendTxnGnt(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: DATA_CMD); assert (Sta.TxnNet[n][i].DataMsg.Cmd = DATA_None) “SendTxnGnt: Msg already exists”; NxtSta.TxnNet[n][i].DataMsg.Cmd:= c; endprocedure;
-- Subroutines for handling completion/completion-forward response channels.
-- Check whether c is Cmp/FrcAckCnflt.
function IsCmpCmp(c: CMP_CMD): BOOLEAN; return (c = CMP_Cmp | c = CMP_FrcAckCnflt); endfunction;
-- Check whether c is Cmp_Fwd*.
function IsCmpFwd(c: CMP_CMD): BOOLEAN; return (c = CMP_Cmp_FwdCode | c = CMP_Cmp_FwdInvOwn | c = CMP_Cmp_FwdInvItoE); endfunction;
-- Check whether there is a completion/completion-forward response
-- of address a to (n,i) pending.
function PendTxnCmp(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.TxnNet[n][i].CmpMsg.Cmd!= CMP_None & Sta.Orb[n][i].Addr = a); endfunction;
-- Check whether there is a Cmp/FrcAckCnflt of address a to (n,i) pending.
function PendTxnCmpCmp(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (PendTxnCmp(Sta, a, n, i) & IsCmpCmp(Sta.TxnNet[n][i].CmpMsg.Cmd)); endfunction;
-- Check whether there is a Cmp_Fwd* of address a to (n,i) pending.
function PendTxnCmpFwd(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (PendTxnCmp(Sta, a, n, i) & IsCmpFwd(Sta.TxnNet[n][i].CmpMsg.Cmd)); endfunction;
-- Receive a completion/completion-forward response to (n,i).
procedure RecvTxnCmp(var NxtSta: STATE; Sta: STATE; n: NID; i: TID); assert (Sta.TxnNet[n][i].CmpMsg.Cmd!= CMP_None) “RecvTxnCmp: No msg to receive”; NxtSta.TxnNet[n][i].CmpMsg.Cmd:= CMP_None; undefine NxtSta.TxnNet[n][i].CmpMsg.FwdTo.Nid; undefine NxtSta.TxnNet[n][i].CmpMsg.FwdTo.Tid; endprocedure;
-- Send a Cmp/FrcAckCnflt to (n,i).
procedure SendTxnCmp(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: CMP_CMD); assert (Sta.TxnNet[n][i].CmpMsg.Cmd = CMP_None) “SendTxnCmp: Msg already exists”; NxtSta.TxnNet[n][i].CmpMsg.Cmd:= c; endprocedure;
-- Send a Cmp_Fwd* to (n,i) to forward data to (p,j).
procedure SendTxnCmpFwd(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: CMP_CMD; p: NID; j: TID); assert (Sta.TxnNet[n][i].CmpMsg.Cmd = CMP_None) “SendTxnCmpFwd: Msg already exists”; NxtSta.TxnNet[n][i].CmpMsg.Cmd:= c; NxtSta.TxnNet[n][i].CmpMsg.FwdTo.Nid:= p; NxtSta.TxnNet[n][i].CmpMsg.FwdTo.Tid:= j; endprocedure;
-- Subroutines for handling data writeback channels.
-- Check whether there is a WbData of address a for (n,i) pending.
function PendTxnWbD(Sta: STATE; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.TxnNet[n][i].WbDMsg.Cmd!= WBD_None & Sta.TxnNet[n][i].WbDMsg.Addr = a); endfunction;
-- Receive a WbData for (n,i).
procedure RecvTxnWbD(var NxtSta: STATE; Sta: STATE; n: NID; i: TID); assert (Sta.TxnNet[n][i].WbDMsg.Cmd!= WBD_None) “RecvTxnWbD: No msg to receive”; NxtSta.TxnNet[n][i].WbDMsg.Cmd:= WBD_None; undefine NxtSta.TxnNet[n][i].WbDMsg.Addr; UndefineMask(NxtSta.TxnNet[n][i].WbDMsg.Mask); UndefineData(NxtSta.TxnNet[n][i].WbDMsg.Data); endprocedure;
-- Send a WbData for (n,i) with address a and data d.
procedure SendTxnWbD(var NxtSta: STATE; Sta: STATE; n: NID; i: TID; c: WBD_CMD; a: ADDR; m: MASK; d: DATA); assert (Sta.TxnNet[n][i].WbDMsg.Cmd = WBD_None) “SendTxnWbD: Msg already exists”; NxtSta.TxnNet[n][i].WbDMsg.Cmd:= c; NxtSta.TxnNet[n][i].WbDMsg.Addr:= a; CopyMask(NxtSta.TxnNet[n][i].WbDMsg.Mask, m); CopyData(NxtSta.TxnNet[n][i].WbDMsg.Data, d); endprocedure;
-- Subroutines for handling home channels.
-- Check whether home channel command c is a request.
function IsHomeReq(c: HOME_CMD): BOOLEAN; return (c = HOME_RdCode | c = HOME_RdData | c = HOME_RdInvOwn | c = HOME_InvItoE | c = HOME_RdCur | c = HOME_WbMtoI | c = HOME_WbMtoS | c = HOME_WbMtoE); endfunction;
-- Check whether home channel command c is a response.
function IsHomeRsp(c: HOME_CMD): BOOLEAN; return (c = HOME_RspFwdI | c = HOME_RspFwdIWb | c = HOME_RspFwdS | c = HOME_RspFwdSWb | c = HOME_RspFwd | c = HOME_RspI | c = HOME_RspIWb | c = HOME_RspS | c = HOME_RspSWb | c = HOME_RspCnflt | c = HOME_RspCnfltOwn); endfunction;
-- Check whether home channel command c is an AckCnflt.
function IsHomeAck(c: HOME_CMD): BOOLEAN; return (c = HOME_AckCnflt); endfunction;
-- Check whether there is a message for (n,i) pending
-- in the home channel from node p to the home node of address a.
function PendHomeMsg(Sta: STATE; p: NID; a: ADDR; n: NID; i: TID): BOOLEAN; return (Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd!= HOME_None & TxnQueueHead(Sta.HomeNet[p][Sta.Home[a]].Ord[a], n, i)); endfunction;
-- Check whether there is a request for (n,i) pending
-- in the home channel from node p to the home node of address a.
function PendHomeReq(Sta: STATE; p: NID; a: ADDR; n: NID; i: TID): BOOLEAN; return (PendHomeMsg(Sta, p, a, n, i) & IsHomeReq(Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd)); endfunction;
-- Check whether there is a response for (n,i) pending
-- in the home channel from node p to the home node of address a.
function PendHomeRsp(Sta: STATE; p: NID; a: ADDR; n: NID; i: TID): BOOLEAN; return (PendHomeMsg(Sta, p, a, n, i) & IsHomeRsp(Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd)); endfunction;
-- Check whether there is an AckCnflt for (n,i) pending
-- in the home channel from node p to the home node of address a.
function PendHomeAck(Sta: STATE; p: NID; a: ADDR; n: NID; i: TID): BOOLEAN; return (PendHomeMsg(Sta, p, a, n, i) & IsHomeAck(Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd)); endfunction;
-- Receive a message for (n,i) from the home channel from node p to
-- the home node of address a.
procedure RecvHomeMsg(var NxtSta: STATE; Sta: STATE; p: NID; a: ADDR; n: NID; i: TID); assert (Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd!= HOME_None & TxnQueueHead(Sta.HomeNet[p][Sta.Home[a]].Ord[a], n, i)) “RecvHomeMsg: No msg to receive or out-of-order msg”; NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd:= HOME_None; undefine NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].Addr; undefine NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].From.Nid; undefine NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].From.Tid; TxnDequeue(NxtSta.HomeNet[p][Sta.Home[a]].Ord[a], Sta.HomeNet[p][Sta.Home[a]].Ord[a]); endprocedure;
-- Send a request or AckCnflt of command c and address a for (n,i)
-- over the home channel from node n to the home node of address a.
procedure SendHomeReq(var NxtSta: STATE; Sta: STATE; a: ADDR; n: NID; i: TID; c: HOME_CMD); assert (Sta.HomeNet[n][Sta.Home[a]].Msg[n][i].Cmd = HOME_None) “SendHomeReq: Msg already exists”; NxtSta.HomeNet[n][Sta.Home[a]].Msg[n][i].Cmd:= c; NxtSta.HomeNet[n][Sta.Home[a]].Msg[n][i].Addr:= a; TxnEnqueue(NxtSta.HomeNet[n][Sta.Home[a]].Ord[a], Sta.HomeNet[n][Sta.Home[a]].Ord[a], n, i); endprocedure;
-- Send a non-RspCnflt* response of command c and address a for (n,i)
-- over the home channel from node p to the home node of address a.
procedure SendHomeRsp(var NxtSta: STATE; Sta: STATE; p: NID; a: ADDR; n: NID; i: TID; c: HOME_CMD); assert (Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd = HOME_None) “SendHomeRsp: Msg already exists”; NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd:= c; TxnEnqueue(NxtSta.HomeNet[p][Sta.Home[a]].Ord[a], Sta.HomeNet[p][Sta.Home[a]].Ord[a], n, i); endprocedure;
-- Send a RspCnflt* of command c, address a, and From.Tid j for (n,i)
-- over the home channel from node p to the home node of address a.
procedure SendHomeRspFrom(var NxtSta: STATE; Sta: STATE; p: NID; a: ADDR; n: NID; i: TID; c: HOME_CMD; j: TID); assert (Sta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd = HOME_None) “SendHomeRspFrom: Msg already exists”; NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].Cmd:= c; NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].From.Nid:= p; NxtSta.HomeNet[p][Sta.Home[a]].Msg[n][i].From.Tid:= j; TxnEnqueue(NxtSta.HomeNet[p][Sta.Home[a]].Ord[a], Sta.HomeNet[p][Sta.Home[a]].Ord[a], n, i); endprocedure;
F.10 A C Reference Model Derived from CSI-IAM
This section describes a C reference model for CSI cache coherence protocol (CSI-CRM, for short) that is the executable version of CSI-IAM and can form the basis of a protocol rule checker or a protocol traffic generator for the CSI cache coherence protocol. CSI-CRM consists of three files:
• csi_params.h contains configuration parameters and a few data type declarations of CSI-CRM that the user is more likely to customize.
• csi_decls.h contains preprocessor macros, data type declarations, and function prototypes.
• csi_procs.c contains the actual code and the initialization of global variables.
Any program that uses CSI-CRM should #include both csi_params.h and csi_decls.h.
F.10.1 Configuration parameters
The configuration parameters of CSI-CRM are #defined in csi_params.h; their meanings have been explained at the beginning of the code listing in Section F.4, “Data Type Declarations” on page F-554. If any of these parameters is changed, CSI-CRM must be re-compiled.
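For illustration, a csi_params.h-style configuration might look as follows. The parameter names are the ones used throughout this appendix, but the values shown are arbitrary examples; the authoritative definitions and explanations are in csi_params.h and Section F.4.

    /* Illustrative configuration only; not a copy of csi_params.h. */
    #define ADDR_NUM 4      /* number of cache-line addresses modeled */
    #define NID_NUM  4      /* number of caching-agent node ids       */
    #define TID_NUM  2      /* transaction ids per node               */
    #define WIDX_NUM 8      /* words per cache line                   */

    /* Changing any of these requires re-compiling CSI-CRM. */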
It is worth pointing out that since many data structures in CSI-IAM, and hence in CSI-CRM, are indexed by addresses (ADDR), CSI-CRM is likely to become inefficient and consume too much memory if there are too many addresses (i.e., when ADDR_NUM is too large). It is hard to say exactly how many are too many, but several thousand addresses should not cause any problems.
F.10.2 Data Type Declarations
All data objects manipulated by CSI-CRM are contiguous blocks made up of fields of type either ADDR or CELL, both of which are defined in csi_params.h. A field containing an address is of type ADDR, which is by default a signed short (i.e., two bytes). All other fields are of type CELL, which is by default a signed char (i.e., one byte). Either of ADDR and CELL can be freely changed to a signed integer of any size that can fit into a machine word and can accommodate the largest subrange and enumeration values in the model. The default type declarations should be big enough for almost all applications. Any ADDR or CELL field can have a special UNDEFINED value, which is declared to be -1 in csi_decls.h. Note that since both ADDR and CELL are signed integer types and all non-UNDEFINED values are non-negative (see the discussions about subrange and enumeration types below), the maximum numbers of distinct non-UNDEFINED values of ADDR and CELL are 2^(A-1) and 2^(C-1), respectively, where A and C are the numbers of bits in ADDR and CELL (by default, A = 16 and C = 8). There is a third type INDEX defined in csi_params.h, which is only used for loop counters in the code and never in data type definitions. CSI-IAM, and hence CSI-CRM, use only the following kinds of data type definitions:
• Subranges of integers.
• Enumeration types.
• Records whose fields have previously defined types.
• Arrays whose index types are previously defined subrange types and whose entry types are any previously defined types.
Each of these is detailed below.
F.10.2.1. Subrange Types
Every subrange used by CSI-IAM starts from 0 and extends to the size of the subrange minus one, e.g.: ADDR: 0.. (ADDR_NUM - 1); NID: 0.. (NID_NUM - 1);
In CSI-CRM every subrange type other than ADDR is simply a synonym of CELL: #define NID CELL because C has no subrange types. By construction CSI-CRM should never violate the bounds of a subrange type, but the user needs to be careful when setting variables and fields of subrange types directly.
F.10.2.2. Enumeration Types
An enumeration type definition in CSI-IAM like the following:
CCH_STATE: enum {CCH_M, CCH_E, CCH_F, CCH_S, CCH_I};
is modeled in CSI-CRM by the following declarations in csi_decls.h:
#define CCH_STATE CELL
#define CCH_STATE_num 5
extern const char *CCH_STATE_str[];
enum {CCH_M, CCH_E, CCH_F, CCH_S, CCH_I};
and the following global array of strings in csi_procs.c:
const char *CCH_STATE_str[] = {“CCH_M”, “CCH_E”, “CCH_F”, “CCH_S”, “CCH_I”};
The purpose of the array is to store the print names of enumeration constants, e.g., the print name of CCH_F is CCH_STATE_str[CCH_F]. The *_num and *_str naming conventions are consistently followed for all enumeration types. The type of boolean values is also defined as an enumeration type:
BOOLEAN: enum {FALSE, TRUE};
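As a usage sketch, a print name can be looked up directly by enumeration value. The fragment below restates the declarations shown above in a self-contained form (with CELL given its default definition as a signed char); it is illustrative only, not a copy of csi_decls.h or csi_procs.c.

    #include <stdio.h>

    typedef signed char CELL;   /* default CELL definition, per F.10.2 */
    #define CCH_STATE CELL
    #define CCH_STATE_num 5
    enum { CCH_M, CCH_E, CCH_F, CCH_S, CCH_I };
    const char *CCH_STATE_str[] = { "CCH_M", "CCH_E", "CCH_F", "CCH_S", "CCH_I" };

    int main(void)
    {
        CCH_STATE s = CCH_F;
        printf("cache state: %s\n", CCH_STATE_str[s]);  /* prints "cache state: CCH_F" */
        return 0;
    }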
F.10.2.3. Record Types
A record type definition in CSI-IAM like the following:
TXN: record Nid: NID; Tid: TID; end;
is modeled in CSI-CRM in a straightforward manner:
typedef struct { NID Nid; TID Tid; } TXN;
F.10.2.4. Array Types
An array type definition in CSI-IAM like the following:
DATA: array [WIDX] of WORD;
is modeled in CSI-CRM by:
typedef WORD DATA[WIDX_NUM];
where WIDX_NUM is the size of the subrange type WIDX. Note that we are relying on the assumption that all subranges start from 0.
F.10.3 API Functions
CSI-CRM manipulates three types of data objects: states, inputs, and outputs. A state is a global system state, as defined at the end of the code listing in Section F.4. An input is a record consisting of an action command and a list of action parameters:
typedef struct { ACT_CMD ActCmd; ADDR ReqAddr; NID ReqNid; TID ReqTid; NID PeerNid; TID PeerTid; EXT_CMD ExtCmd; INT_CMD IntCmd; WIDX StWidx; WORD StWord; BOOLEAN BiasFwd; BOOLEAN BiasToI; } INPUT;
For different values of ActCmd different sets of action parameters are needed; see Table F-3, “Actions of CSI-IAM” on page F-561 for a summary of which action parameters are needed for which action command. An output is a record with two fields:
typedef struct { BOOLEAN Fired; BOOLEAN Error; } OUTPUT;
whose meanings will be explained in Section F.10.3.3 below. The API functions of CSI-CRM take arguments that are pointers to data objects of these three types. Memory management is completely the user’s responsibility: CSI-CRM contains no code that either allocates or deallocates these data objects.
F.10.3.1. Undefining, Comparing, Copying, and Printing States
Unless otherwise stated, all API functions described in this section have counterparts for inputs and outputs, whose names are obtained by substituting “Input” or “Output” for “State” in the API names. The API function:
int csi_UndefineState(STATE *Sta);
sets every CELL of the state *Sta to UNDEFINED. The API function:
int csi_CompareState(STATE *Sta1, STATE *Sta2);
compares the two states *Sta1 and *Sta2 and returns 0 if and only if the two states are identical. The comparison is implemented using C’s memcmp function. The API function:
void csi_CopyState(STATE *Sta1, STATE *Sta2);
copies the state *Sta1 over the state *Sta2. The copying is implemented using C’s memcpy function (though note that the direction of copying is reversed in memcpy). The API function:
void csi_FprintState(FILE *stream, char *root, STATE *Sta);
prints the contents of the state *Sta to the output stream, where root is the print name of *Sta. This API is smart enough to print enumeration constants using their print names and produce outputs that can be included into C programs. The API function:
int csi_SprintState(char *buffer, char *root, STATE *Sta);
is the same as csi_FprintState except that the output is written into the string buffer, where the return count has exactly the same meaning as in C’s sprintf function. These printing APIs also have the following variants:
void csi_FprintStateAddr(FILE *stream, char *root, STATE *Sta, ADDR addr);
int csi_SprintStateAddr(char *buffer, char *root, STATE *Sta, ADDR addr);
which limit the printing to the part of the state *Sta that is indexed by the address addr. There are no address-limited printing APIs for inputs and outputs, as both data structures are quite small.
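Putting these pieces together, a minimal (illustrative) usage pattern is to undefine a freshly allocated state before anything else, initialize it, and print it; see also the note on uninitialized CELLs that follows. The function names and include files are those described above; everything else in this sketch is an assumption, and error handling is kept minimal.

    #include <stdio.h>
    #include <stdlib.h>

    #include "csi_params.h"   /* configuration parameters           */
    #include "csi_decls.h"    /* STATE, csi_* prototypes (per F.10) */

    int main(void)
    {
        STATE *sta = malloc(sizeof *sta);   /* memory management is the user's job */
        if (sta == NULL)
            return 1;
        csi_UndefineState(sta);              /* every CELL becomes UNDEFINED      */
        csi_InitState(sta);                  /* initial state per Section F.5     */
        csi_FprintState(stdout, "Sta", sta); /* symbolic dump using print names   */
        free(sta);
        return 0;
    }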
Note: Uninitialized CELLs may contain garbage values that can break the printing APIs and cause a program crash, so it is a good idea to csi_Undefine* a state, input, or output data object immediately after it is allocated (either in the heap or on the stack).
F.10.3.2. Initializing a State
The API function:
void csi_InitState(STATE *Sta);
initializes the state *Sta according to Section F.5, “The Initial State of the System” on page F-558, except that:
• the assignment of home nodes to addresses is uninitialized, and
• all words in all data lines in memory are uninitialized.
These and all other uninitialized state components are set to UNDEFINED. The user is free to initialize the data in the memory after calling csi_InitState, but if he does so then he should also initialize the corresponding words in the auxiliary variable Aux.LatestData, so that the invariant CacheDataProp in Section F.6 will not be violated when the data are loaded into caches.
F.10.3.3. Making a State Transition
The API function:
void csi_Transition(INPUT *Inp, OUTPUT *Out, STATE *Sta, STATE *NxtSta);
attempts to execute the action (= action command + action parameters) specified by *Inp in the state *Sta. If the action is enabled, then Out->Fired is set to TRUE and the effects of the action (i.e., the changes to state components) are written into *NxtSta. If the action is not enabled, then Out->Fired is set to FALSE and *NxtSta is not changed. *Sta and *Inp are never changed. Out->Error is set to TRUE only when one of the assertions in the code of CSI-IAM is violated, which should never happen if CSI-IAM and hence CSI-CRM are correct. Should Out->Error be set to TRUE, an error message is also printed to stderr. The user can change where such error messages are printed by setting the global variable ErrStream (by default, ErrStream = NULL, which indicates stderr). Two things should always be remembered when using csi_Transition:
• In *NxtSta, only the state components that are changed by the action are written. Thus the user should copy *Sta to *NxtSta (using csi_CopyState) before calling this API if he wants *NxtSta to be really the next state. On the other hand, if Out->Fired equals FALSE after calling this API, the user does not have to do the copying again before trying a different action.
• Sta and NxtSta should never point to the same data object. In other words, csi_Transition is not capable of doing in-place modification of a state; *Sta and *NxtSta should always be separate copies.
Given the above, a possible way of driving CSI-CRM through a sequence of actions is the following:
INPUT Inp; OUTPUT Out; STATE Sta, NxtSta;
csi_Undefine(&Inp);
csi_Undefine(&Out);
csi_Undefine(&Sta);
csi_Undefine(&NxtSta);
csi_InitState(&Sta);
/* Set up home node assignments and memory contents (see Section F.10.3.2.) */
csi_CopyState(&Sta, &NxtSta);
Inp.ActCmd =... /* Action 1 */
/* Set up other relevant fields of Inp for action 1 */
csi_Transition(&Inp, &Out, &Sta, &NxtSta);
csi_CopyState(&NxtSta, &Sta);
Inp.ActCmd =... /* Action 2 */
/* Set up other relevant fields of Inp for action 2 */
csi_Transition(&Inp, &Out, &Sta, &NxtSta);
csi_CopyState(&NxtSta, &Sta);
...
/* and so on */
Of course, it may be desirable to test Out.Fired after each call of csi_Transition to decide what to do if the attempted action was not enabled. It may also be desirable to print out the contents of Sta or NxtSta at certain points using the printing APIs.
G.1 Introduction
Chapter 8, “CSI Cache Coherence Protocol” describes the CSI cache coherence protocol at a conceptual level and leaves the protocol details out on purpose. This appendix attempts to provide more details by defining an implementation-agnostic model (CSI-IAM, for short) for the 3-hop home broadcast coherence version of the protocol; the CSI-IAM for the 2-hop source broadcast coherence version of the protocol is described in Appendix F, “An Implementation Agnostic Model of CSI 2-Hop Source Broadcast Coherence.” The primary purpose of CSI-IAM is to provide an unambiguous and maximally permissive specification of the allowed message sequences of the CSI cache coherence protocol. “Maximally permissive” means that any sequence of coherent protocol messages allowed by any CSI implementation is also allowed by CSI-IAM, though a typical CSI implementation will exercise only a portion of the behaviors allowed by CSI-IAM. CSI-IAM is specified using state machines coded in a formal description language called Murphi, which was developed at Stanford University and is publicly available (http://verify.stanford.edu/dill/murphi.html). The syntax of Murphi is similar to that of Ada or Modula, and its semantics is that of standard imperative programming languages, so any computer engineer should be able to read Murphi code without prior experience with the Murphi language. In any case only a small set of language constructs is used in CSI-IAM, and we will explain any constructs that may be new to a typical computer engineer. The Murphi manual is included in its distribution, available from the URL given above. Although CSI-IAM is ultimately specified using Murphi, an intermediate-level representation of the protocol logic called protocol tables (p-tables, for short) is introduced in this appendix. The tabular format strikes a good balance between informal English description and formal Murphi code. Each p-table typically specifies what an agent does internally and what messages it sends out upon receiving a message of a particular type. The meaning of a p-table is made precise by a semantic mapping that attaches a Murphi code fragment to each cell in the p-table, with the understanding that each row in the p-table represents a possible atomic transition of CSI-IAM and the Murphi code fragments specify what the atomic transition is. Thus, whenever the reader is unsure about what a cell in a p-table means, he can quickly find out the precise meaning by consulting the semantic mapping. Our experience suggests that, once a reader gains a preliminary familiarity with CSI-IAM (in particular, how the system state is represented), the reader will be able to stay at the p-table level almost all the time without continually looking up the semantic mappings. The set of p-tables also serves as a concise summary and quick reference of the protocol. As mentioned earlier, the primary purpose of CSI-IAM is to precisely specify the set of allowed message sequences of the CSI cache coherence protocol. It does not dictate or constrain the flexibility of CSI agent implementations. However, CSI-IAM does provide hints on how CSI agents can be implemented.
In particular, the data structures used by CSI-IAM, though not directly implementable, point out what state information needs to be tracked by each CSI agent and how that information is used for interaction with other agents. Given that understanding, the optimal microarchitecture for the system configurations under consideration can be devised.

G.2 What CSI-IAM Does and Does Not Cover

CSI-IAM covers all protocol rules for ensuring cache coherence in the CSI protocol (home snooping). It does not cover non-coherent transactions, nor does it model any features below or beyond the Protocol layer. CSI-IAM does not model implementation dependencies that may cause deadlock or livelock, since such dependencies are specific to particular microarchitectures and cannot be captured in a single, implementation-agnostic model. Furthermore, in order to make CSI-IAM maximally permissive, no dependencies between transactions to different cache line addresses that may result from the sharing of resources (buffers, queues, message channels, etc.) are modeled. Since CSI relies on queueing for liveness and fairness and does not have retries, an implementor of CSI should be extremely careful in documenting and reasoning about any dependencies that resource sharing might introduce.

G.3 Components of CSI-IAM

CSI-IAM consists of the following components:
• Data type declarations, including that of the system state.
• The initial state of the system.
• Invariants that are expected to hold.
• Actions and their parameters.
• Protocol tables (p-tables), one per action.
• Semantic mappings, one per p-table.
• Utility subroutines used by the semantic mappings.
From the above components, an executable Murphi model can be automatically generated. The rest of this appendix presents these components, a brief description of each action, and the generated Murphi code.

G.3.1 IAM Component Details

The efficacy of this procedure, in which formal models are derived automatically from protocol tables, depends greatly on limiting the number of variables in the definition. Even though the coherence protocol is complex and needs a large number of protocol tables, the underlying structures and distinct actions are limited. The state variables, actions, and predicate information tracked by the different protocol structures are listed below.

At Home
• SPT: Snoop pending table, responsible for sending snoops, collecting responses, and updating and reading from memory. The variables and the values they can take are listed below.
— State: {None, SentSnp, SentPOSnp, SentFAC, WaitWbData, WaitRspFwd, WaitAckCnflt, ReadyToRespond}
— Command: The request from the caching agent. {RdCode, RdData, RdCur, RdInvOwn, InvItoE, WbMtoI, WbMtoS}
— Data Fwded: Whether a RspFwd* was received. {YES, NO}; ex: Data Fwded = YES => RspFwd* was received.
— SRP[i]: A vector to track outstanding snoop responses. {0, 1, 2}; ex: SRP[i] = 1 => home is expecting a snoop response from node i.
— Num SRP: Number of snoops that are pending; used in coarse directory mode only.
— Conflict Vector (CV[i]): A bit vector that tracks the sender of a RspCnflt* response. {0, 1}; ex: CV[i]=1 => node i sent a RspCnflt*. This is not used in the coarse directory mode.
— AckCnflt Vector (ACV[i]): A bit vector that tracks the sender of an AckCnflt response.
ex: ACV[i]=1 => node i sent an AckCnflt. This is not used in the coarse directory mode.
— Predicates that are derived from the SPT state: ‘SPT Full’; ‘(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from’; ‘Exists Cnflt Req’: a request to the same address is present in the SPT.
• PRB: Pending request buffer (a spill FIFO without ordering constraints); all new requests, after updating the directory, are placed in the PRB. Requests in the PRB compete to enter the SPT.
• Directory: Each of the following fields is included for each memory block.
— State: State of the line: M, S, or I.
— Presence Vector (bit vector): PV[i]=1 indicates that the line may be cached by node i.
— New Request Vector: A bit vector that tracks the source of a new request. Dir State = S and NRV[i]=1 is equivalent to the S’ state. The NRV vector is not required in coarse directory mode.
— Coarse Dir Mode: Indicates that the line is in shared mode and the number of sharers is more than the number of directory bits. {TRUE, FALSE}
— Predicates used are: ‘Pointer Overflow’: a condition that is checked to determine if the addition of one more sharer would exceed the number of presence vector bits available in full map directory mode. {YES, NO} ‘PV[i]=1 only for i=nid’: only the requester may have a cached copy. ‘exists i != nid: PV[i]=1’: there is a cache other than the requester’s that has a copy.
• Net:
— Msg: All possible requests and responses.
• Mem: Home memory interface.
• The various actions performed are:
— ‘Remove’: A message is removed from the network.
— ‘Place in PRB’: Place the request into the PRB.
— ‘Send to Peers’: Send snoop requests to peer nodes. A peer node with respect to a transaction is defined as a node that is not the requester.
— ‘Send to Requester’: Send a response to the requester.
— ‘Send to msg.from’: Send a response to the sender of the message.
At the Caching Agent
• ORB: Outgoing request buffer; this may be called the SMAF (system miss address file) in some implementations. The various fields in this structure are:
— State: {None, SentReq, RcvdData, RcvdCmp, SentAck}
— Command: {RdCode, RdData, RdCur, RdInvOwn, InvItoE, WbIData, WbSData, WbIDataPtl}
— Addr: Address in the request.
— Cnflt: A bit indicating whether an incoming snoop has found a conflicting request in the ORB.
• Cache:
— State: {M, E, S, I}
— Data: The data, which may be partial.
— Mask: Identifies the valid words in a cache line.
• The actions at the caching agent are:
— ‘Store a line’: Write a value into the cache.
— ‘DowngradeS or DowngradeI’: Downgrade the state of the cache line.
— ‘Send to Home’: Send the request or snoop response to the home.
— ‘Send Wb to Home’: Send write-back data to the home.
— ‘Send to Msg.FwdTo’: Forward data to a requester identified in the Cmp_Fwd* message.
— ‘Send to Msg.Req’: Send the cache line to the requester corresponding to the snoop request.
G.4 Data Type Declaration
The data type declarations used by CSI-IAM are listed below. The Murphi syntax should be self-explanatory. Only the following type constructions are used: finite ranges of integers, enumerated types, and (finite) arrays and records of previously defined types. The reader is referred to the comments embedded in the code, which begin with the string “--” and extend to the ends of the lines.
-- Index types.
ADDR: scalarset(ADDR_NUM); -- Addresses (of cache lines).
HID: scalarset(HID_NUM); -- Home node ids.
NID: scalarset(NID_NUM); -- Caching node ids. TAG: scalarset(TAG_NUM); -- ORB/SMAF tags. WIDX: scalarset(WIDX_NUM); -- Word indices (in cache lines). WORD: scalarset(WORD_NUM); -- Word values. SPTX: scalarset(SPT_NUM); -- SPT entry index. -- Cache line data and masks. -- A cache line consists of an array of words, each of which is qualified -- by the corresponding boolean field in a mask. DATA : array [WIDX] of WORD; MASK : array [WIDX] of BOOLEAN; 604 Ref No xxxxx Intel Restricted Secret -- A transaction (“txn” for short) id is a NID-TAG pair. TXN : record Nid : NID; Tag : TAG; end; -- Commands that a caching agent can do internally to its cache. INT_CMD: enum {INT_Store, INT_DowngradeS, INT_DowngradeI}; -- Commands that a caching agent can issue externally to the CSI interface. EXT_CMD: enum {EXT_None, EXT_RdCode, EXT_RdData, EXT_RdInvOwn, EXT_InvItoE, EXT_RdCur, EXT_WbMtoI, EXT_WbMtoS, EXT_WbMtoE}; -- Auxiliary variables, which are used solely for stating properties about-- the protocol and do not affect the behavior of the protocol in any way. AUX_ENTRY : record LatestData: array [ADDR] of DATA; -- Tracks the latest data value of cache line addresses. end; -- Cache (“cch” for short) entries -- There is one cache entry per address at each caching node. CCH_STATE : enum {CCH_M, CCH_E, CCH_S, CCH_I}; CCH_ENTRY : record State : CCH_STATE; Mask : MASK; Data : DATA; end; -- Outgoing Request Buffer (a.k.a. SMAF) entries. -- There is one ORB entry per txn id. ORBs are in caching nodes. ORB_STATE: enum {ORB_None, ORB_SentReq, ORB_RcvdData, ORB_RcvdCmp, ORB_SentAck}; ORB_ENTRY : record State : ORB_STATE; Cmd: EXT_CMD; Addr : ADDR; Cnflt : BOOLEAN; -- A conflict has been observed. end; -- Snoop messages. SNP_CMD: enum {SNP_None, SNP_SnpCode, SNP_SnpData, SNP_SnpInvOwn, SNP_SnpInvItoE, SNP_SnpCur, SNP_RspCnflt, SNP_RspFwdI, SNP_RspFwdIWb, SNP_RspFwdS, SNP_RspFwdSWb, SNP_RspFwd, SNP_RspI, SNP_RspIWb, SNP_RspS, SNP_RspSWb}; Ref No xxxxx 605 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence SNP_MSG : record Cmd : SNP_CMD; Addr : ADDR; end; -- Data response messages. DATA_CMD: enum {DATA_None, DATA_DataC_M, DATA_DataC_E, DATA_DataC_S, DATA_DataC_I, DATA_GntE}; DATA_MSG : record Cmd : DATA_CMD; Addr : ADDR; Data : DATA; end; -- WBData messages. WBD_CMD: enum {WBD_None, WBD_WbEData, WBD_WbSData, WBD_WbIData, WBD_PtlWbData}; WBD_MSG : record Cmd : WBD_CMD; Addr : ADDR; Mask : MASK; Data : DATA; From : NID; end; -- Home messages. HOME_CMD: enum {HOME_None, HOME_RdCode, HOME_RdData, HOME_RdInvOwn, HOME_InvItoE, HOME_RdCur, HOME_AckCnflt, HOME_Cmp, HOME_FrcAckCnflt, HOME_Cmp_FwdCode, HOME_Cmp_FwdInvOwn, HOME_Cmp_FwdInvItoE}; HOME_MSG: record Cmd: HOME_CMD; Addr: ADDR; FwdTo: TXN; end; -- PRB entries. -- There is one PRB entry per txn id at each home node. PRB_ENTRY: record Cmd: EXT_CMD; Addr: ADDR; end; -- SPT entries. -- There is at most one SPT entry per address at each home node for -- the transaction that the home is currently processing for that address. 606 Ref No xxxxx Intel Restricted Secret SPT_STATE: enum {SPT_None, SPT_Valid, SPT_SentSnp, SPT_SentPOSnp, SPT_SentFAC, SPT_ReadyToRespond, SPT_WaitRspFwd, SPT_WaitWbData}; SPT_ENTRY: record State: SPT_STATE; Txn: TXN; Cmd: EXT_CMD; Addr: ADDR; DataFwded: BOOLEAN; CV: array [NID] of BOOLEAN; -- Conflict Vector ACV: array [NID] of BOOLEAN; -- AckConflict Vector SRP: array [NID] of 0..2; NumSRP: 0.. 
NID_NUM; -- Snoop Response Pending-- SRP in Coarse Dir Mode end; -- Memory entries. -- There is one memory entry per address at each home node. MEM_ENTRY: record Data: DATA; end; -- Directory entries. -- There is one directory entry per address at the home node. DIR_ENTRY: record CoarseMode: Boolean; DirSta: enum {DIR_M, DIR_S, DIR_I}; PV: array [NID] of Boolean; -- Presence Vector NRV: array [NID] of Boolean; -- New Request Vector end; -- Transaction nets. -- There is one txn_net per transaction id which contains all messages -- (request and response) associated with that txn. -- HomeMsg contains a message to/from the home such as original request, -- completion, AckCnflt, and FAC messages. -- SnpMsg contains snoop request/response to/from peer nodes. -- DataMsg contains a message with data. TXN_NET: record HomeMsg: HOME_MSG; SnpMsg: array [NID] of SNP_MSG; SnpMsg2: array [NID] of SNP_MSG; -- snoop response to FwdCmp* DataMsg: DATA_MSG; WbDMsg: WBD_MSG; end; -- The system state. STATE: record -- Mapping from addresses to home nodes, to be set at initial Ref No xxxxx 607 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence Home: array [ADDR] of HID; -- The following variables are purely auxiliary. Aux: AUX_ENTRY; -- The following variables belong to the caching nodes. Cch: array [NID] of array [ADDR] of CCH_ENTRY; Orb: array [NID] of array [TAG] of ORB_ENTRY; -- The following variables belong to the home nodes. Prb: array [HID] of array [NID] of array [TAG] of PRB_ENTRY; Spt: array [HID] of array [SPTX] of SPT_ENTRY; Dir: array [ADDR] of DIR_ENTRY; Mem: array [ADDR] of MEM_ENTRY; -- The following variables belong to the interconnect network. Net: array [NID] of array [TAG] of TXN_NET; end; Some remarks about system states represented in CSI are: • Caching nodes and home nodes are conceptually distinct from each other and CSI-IAM has separate index sets (NID and HID) to range over them. • Enumerated constants are always prefixed by the names of their types, to ensure that the same constant does not appear in more than one type. These prefixes are omitted in English comments and p-tables (but not in semantic mappings). • It is always true that whenever the State or Cmd field of a record has the value None, the values of its other fields are don’t-cares. As stated earlier the primary purpose of CSI-IAM is to precisely specify the set of allowed message sequences of CSI cache coherence protocol. The system state representation shown above is geared toward making this specification as simple and clear as possible. It is not supposed to be directly implementable or “realistic” in any sense. One Murphi construct used later is perhaps best explained at this point. The command “undefine ” assigns a don’t-care value to the variable . This is a true don’t-care, in the sense that CSI-IAM never needs to read or test a don’t-care value. Thus an implementation is free to use any (legal) value to represent don’t-care, including leaving the value of unchanged. G.5 The Initial State of the System The following procedure specifies a legal initial state of the system. It is assumed that the assignment of home nodes to addresses is initialized separately. 
procedure InitState(var Sta: STATE; idata: WORD);
undefine Sta;
for a : ADDR do
  for h : HID do Sta.Home[a] := h; end; end;
for a : ADDR do
  for w : WIDX do
    Sta.Mem[a].Data[w] := idata;
    Sta.Aux.LatestData[a][w] := Sta.Mem[a].Data[w];
  end; end;
for n : NID do
  for a : ADDR do
    Sta.Cch[n][a].State := CCH_I;
    for w : WIDX do Sta.Cch[n][a].Mask[w] := FALSE; end;
  end; end;
for n: NID do
  for i: TAG do
    Sta.Orb[n][i].State:= ORB_None; Sta.Orb[n][i].Cnflt:= FALSE;
  end; end;
for h: HID do for n: NID do for i: TAG do
  Sta.Prb[h][n][i].Cmd:= EXT_None;
end; end; end;
for h: HID do for s: SPTX do
  Sta.Spt[h][s].State:= SPT_None; Sta.Spt[h][s].Cmd:= EXT_None;
  Sta.Spt[h][s].DataFwded:= FALSE; Sta.Spt[h][s].NumSRP:= 0;
  for p: NID do
    Sta.Spt[h][s].CV[p]:= FALSE; Sta.Spt[h][s].ACV[p]:= FALSE;
    Sta.Spt[h][s].SRP[p]:= 0;
  end;
end; end;
for a: ADDR do
  Sta.Dir[a].CoarseMode:= FALSE; Sta.Dir[a].DirSta:= DIR_I;
  for p: NID do Sta.Dir[a].PV[p]:= FALSE; Sta.Dir[a].NRV[p]:= FALSE; end;
end;
for n: NID do for i: TAG do
  for p: NID do
    Sta.Net[n][i].SnpMsg[p].Cmd:= SNP_None; Sta.Net[n][i].SnpMsg2[p].Cmd:= SNP_None;
  end;
  Sta.Net[n][i].HomeMsg.Cmd:= HOME_None; Sta.Net[n][i].DataMsg.Cmd:= DATA_None;
  Sta.Net[n][i].WbDMsg.Cmd:= WBD_None;
end; end;
endprocedure; -- end of procedure InitState()

ruleset InitData: WORD do
startstate “start state: I”
  InitState(Sta, InitData);
endstartstate;
endruleset;

ruleset InitData: WORD; Addr: ADDR do
startstate “start state: S”
  InitState(Sta, InitData);
  for n: NID do
    Sta.Cch[n][Addr].State:= CCH_S;
    for w: WIDX do
      Sta.Cch[n][Addr].Mask[w]:= TRUE;
      Sta.Cch[n][Addr].Data[w]:= Sta.Mem[Addr].Data[w];
    endfor;
  end;
  -- set directory
  Sta.Dir[Addr].DirSta:= DIR_S; Sta.Dir[Addr].CoarseMode:= TRUE;
  for p: NID do Sta.Dir[Addr].PV[p]:= TRUE; end;
endstartstate;
endruleset;

ruleset InitData: WORD; Addr: ADDR; Nid: NID do
startstate “start state: E/I”
  InitState(Sta, InitData);
  Sta.Cch[Nid][Addr].State:= CCH_E;
  for w: WIDX do
    Sta.Cch[Nid][Addr].Mask[w]:= TRUE;
    Sta.Cch[Nid][Addr].Data[w]:= Sta.Aux.LatestData[Addr][w];
  end;
  -- set directory
  Sta.Dir[Addr].DirSta:= DIR_M; Sta.Dir[Addr].PV[Nid]:= TRUE;
endstartstate;
endruleset;

ruleset InitData: WORD; Addr: ADDR; Nid: NID; NewData: WORD do
startstate “start state: M/I”
  InitState(Sta, InitData);
  Sta.Cch[Nid][Addr].State:= CCH_M;
  for w: WIDX do
    Sta.Cch[Nid][Addr].Mask[w]:= TRUE;
    Sta.Cch[Nid][Addr].Data[w]:= NewData;
    Sta.Aux.LatestData[Addr][w]:= NewData;
  end;
  -- set directory
  Sta.Dir[Addr].DirSta:= DIR_M; Sta.Dir[Addr].PV[Nid]:= TRUE;
endstartstate;
endruleset;

ruleset InitData: WORD; Addr: ADDR; Nid: NID; NewIdx: WIDX; NewData: WORD do
startstate “start state: M_PD/I”
  InitState(Sta, InitData);
  Sta.Cch[Nid][Addr].State:= CCH_M;
  Sta.Cch[Nid][Addr].Mask[NewIdx]:= TRUE;
  Sta.Cch[Nid][Addr].Data[NewIdx]:= NewData;
  Sta.Aux.LatestData[Addr][NewIdx]:= NewData;
  -- set directory
  Sta.Dir[Addr].DirSta:= DIR_M; Sta.Dir[Addr].PV[Nid]:= TRUE;
endstartstate;
endruleset;

G.6 The Invariants
-- Whenever the state of a cache is valid (i.e., not I), its valid words
-- (i.e., those whose corresponding mask entries are TRUE) contain the
-- latest data, in Aux.LatestData.
invariant “CacheDataProp”
  forall a: ADDR do forall n: NID do forall w: WIDX do
    (Sta.Cch[n][a].State = CCH_M | Sta.Cch[n][a].State = CCH_E | Sta.Cch[n][a].State = CCH_S)
    & Sta.Cch[n][a].Mask[w] = TRUE
    -> Sta.Cch[n][a].Data[w] = Sta.Aux.LatestData[a][w]
  endforall endforall endforall;

-- Consistency conditions between the states of caches at different nodes.
invariant “CacheStateProp”
  forall a: ADDR do forall n: NID do forall p: NID do
    n != p ->
    (Sta.Cch[n][a].State = CCH_M | Sta.Cch[n][a].State = CCH_E -> Sta.Cch[p][a].State = CCH_I)
    & (Sta.Cch[n][a].State = CCH_S -> Sta.Cch[p][a].State = CCH_S | Sta.Cch[p][a].State = CCH_I)
  endforall endforall endforall;

-- Consistency conditions between the state and mask of a cache line.
invariant “CacheMaskProp”
  forall a : ADDR do forall n : NID do
    ( Sta.Cch[n][a].State = CCH_M -> MaskFull(Sta.Cch[n][a].Mask) | MaskPartial(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_E -> MaskFull(Sta.Cch[n][a].Mask) | MaskEmpty(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_S -> MaskFull(Sta.Cch[n][a].Mask) )
    & ( Sta.Cch[n][a].State = CCH_I -> MaskEmpty(Sta.Cch[n][a].Mask) )
  endforall endforall;

-- Implications of an ORB entry being in state None, which essentially say
-- that there is no activity with its txn id anywhere in the system.
invariant “OrbNoneProp”
  forall a : ADDR do forall n : NID do forall i : TAG do
    Sta.Orb[n][i].State = ORB_None ->
    forall p : NID do
      Sta.Net[n][i].SnpMsg[p].Cmd = SNP_None & Sta.Net[n][i].SnpMsg2[p].Cmd = SNP_None
    end
    & Sta.Net[n][i].HomeMsg.Cmd = HOME_None
    & Sta.Net[n][i].DataMsg.Cmd = DATA_None
    & Sta.Net[n][i].WbDMsg.Cmd = WBD_None
  endforall endforall endforall;

G.7 Actions and Their Parameters

An action is a set of state transitions with a common theme. Typically it consists of the state transitions caused by a (caching or home) node receiving a message, performing internal processing, and sending messages to other nodes. An action is uniquely identified by its name and parameters, the latter of which specify for which address, at which node(s) and tag(s), and (possibly) for which internal or external command the action happens. CSI-IAM is specified by 16 actions, 7 of which are for caching agents and 9 for home agents. Their names, parameters, and approximate meanings are listed in Table G-1 “Actions of CSI-IAM” on page G-613, where an “X” indicates that a parameter is needed for an action. Action executions are atomic, in that the execution of an action does not overlap with that of another action, and the execution of the system consists of a sequence of action executions, each completely finished before the next is started. The atomicity requirement is, of course, only conceptual: the implementor is free to use any scheme that supports the illusion of atomicity but in reality may overlap the executions of different actions in time. Table G-1.
Actions of CSI-IAM Action meaning Action name Action parameters ReqAddrReqNidReqTagPeerNidPeerTagSPTxHome nidExtCmdIntCmdStWidxStWord Caching Agent Requestor generates internal request (Store/Downgrade*) CacheNewReqInt X X X X X Requestor generates external request (Rd*/Inv*/WbMto*) CacheNewReqExt X X X X Requestor receives Data*/Gnt* CacheRecvData X X Requestor receives Cmp/FrcAckCnflt CacheRecvCmp X X Requestor receives Cmp_Fwd* CacheRecvFwd X X Peer cache responds to a snoop request CacheSnpOrbMiss X X X CacheSnpOrbHit X X X X Home agent Home receives Rd*/Inv* from requestor HomeRecvReq X X X Home receives Wb*Data request HomeRecvExplicitWbReq X X X Home selects a valid PRB entry and places it in SPT and sends snoops HomePRBtoSPTNoCDMa X X X X HomePRBtoSPTCDM X X X X Home receives a snoop response from a peer node, which is not Wb or RspCnflt HomeRecvSnpRspNoCDM X X X X HomeRecvSnpRspCDM X X X X Home receives writeback snoop response HomeRecvWbSnpRsp X X X X Home receives (implicit) write back data HomeRecvImplicitWb X X X Home receives a RspCnflt from a peer node HomeRecvRspCnfltNoCDM X X X X X HomeRecvRspCnfltCDM X X X X X Home receives AckCnflt from a peer node HomeRecvAckCnflt X X X Home SPT entry sends response to the requester HomeSPTReadyToRespond NoCDM X X HomeSPTReadyToRespond CDM X X a. CDM = Coarse Directory Mode Ref No xxxxx 613 Intel Restricted Secret Table G-2. Action CacheNewReqInt Current State Next State IntCmd ReqCch ReqCch Aux State Mask State Data[StWidx] Mask[StWidx] LatestData[StWidx] Store M, E M StWord TRUE StWord DowngradeS E Full S DowngradeI E, Sa I a. for E mask must be non partial and for S it should be full. no other cases apply. Table G-2 actions internal to the cache, such as writing to the cache or downgrading the state of a cache line. Semantic Mapping for ‘CacheNewReqInt’ PARAMETERS ReqAddr: ADDRReqNid: NID IntCmd: INT_CMD StWidx: WIDX StWord: WORD ALIASES ReqCch: Sta.Cch[ReqNid][ReqAddr] AuxData : Sta.Aux.LatestData[ReqAddr] NxtReqCch : NxtSta.Cch[ReqNid][ReqAddr] NxtAuxData: NxtSta.Aux.LatestData[ReqAddr] COLUMN |Current State|IntCmd| |Store| => IntCmd = INT_Store |DowngradeS| => IntCmd = INT_DowngradeS |DowngradeI| => IntCmd = INT_DowngradeI COLUMN |Current State|ReqCch|State| |E| => ReqCch.State in {CCH_E} |M, E| => ReqCch.State in {CCH_M, CCH_E} |E, S| => ReqCch.State in {CCH_S, CCH_E} COLUMN |Current State|ReqCch|Mask| |Full| => MaskFull(ReqCch.Mask) COLUMN |Next State|ReqCch|State| |M| => NxtReqCch.State:= CCH_M |S| => NxtReqCch.State:= CCH_S |I| => CchClear(NxtSta, Sta, ReqNid, ReqAddr, CCH_I) COLUMN |Next State|ReqCch|Data[StWidx]| |StWord| => NxtReqCch.Data[StWidx]:= StWord COLUMN |Next State|ReqCch|Mask[StWidx]| |TRUE| => NxtReqCch.Mask[StWidx]:= TRUE 614 Ref No xxxxx Intel Restricted Secret COLUMN |Next State|Aux|LatestData[StWidx]| |StWord| => NxtAuxData[StWidx]:= StWord Table G-3. Action CacheNewReqExt Current State Next State ReqOrb ExtCmd ReqCch ReqOrb ReqCch Net State Mask State Cmd Addr Cnflt State Send to Home Available RdCode S, I SentReq RdCode ReqAddr FALSE RdCode RdData RdData RdData RdInvOwn RdInvOwn RdInvOwn InvItoE InvItoE InvItoE RdCur I RdCur RdCur WbMtoI M Full WbMtoI I WbIDataa Partial WbIDataPtl WbMtoS M Full WbMtoS S WbSData a. WbData has a cache state indication, A processor may also send a separate write back marker which should be dropped by home. Table G-3 lists the actions of a caching agent issuing an external action to the CSI interface. 
The CSI interface places the outgoing request in the ORB as well as sending it to the home node.
Semantic Mapping for ‘CacheNewReqExt’
PARAMETERS ReqAddr: ADDR; ReqNid: NID; ReqTag: TAG; ExtCmd: EXT_CMD
ALIASES
ReqOrb: Sta.Orb[ReqNid][ReqTag]
ReqCch: Sta.Cch[ReqNid][ReqAddr]
NxtReqOrb: NxtSta.Orb[ReqNid][ReqTag]
NxtReqCch: NxtSta.Cch[ReqNid][ReqAddr]
COLUMN |Current State|ReqOrb| |Available| => OrbAvail(Sta, ReqAddr, ReqNid, ReqTag) = TRUE
COLUMN |Current State|ExtCmd| |RdCode| => ExtCmd = EXT_RdCode |RdData| => ExtCmd = EXT_RdData |RdInvOwn| => ExtCmd = EXT_RdInvOwn |InvItoE| => ExtCmd = EXT_InvItoE |RdCur| => ExtCmd = EXT_RdCur |WbMtoI| => ExtCmd = EXT_WbMtoI |WbMtoS| => ExtCmd = EXT_WbMtoS
COLUMN |Current State|ReqCch|State| |M| => ReqCch.State in {CCH_M} |I| => ReqCch.State in {CCH_I} |S, I| => ReqCch.State in {CCH_S, CCH_I}
COLUMN |Current State|ReqCch|Mask| |Full| => MaskFull(ReqCch.Mask) |Partial| => MaskPartial(ReqCch.Mask)
COLUMN |Next State|ReqOrb|State| |SentReq| => NxtReqOrb.State := ORB_SentReq
COLUMN |Next State|ReqOrb|Cmd| |RdCode| => NxtReqOrb.Cmd := EXT_RdCode |RdData| => NxtReqOrb.Cmd := EXT_RdData |RdInvOwn| => NxtReqOrb.Cmd := EXT_RdInvOwn |InvItoE| => NxtReqOrb.Cmd := EXT_InvItoE |RdCur| => NxtReqOrb.Cmd := EXT_RdCur |WbMtoI| => NxtReqOrb.Cmd := EXT_WbMtoI |WbMtoS| => NxtReqOrb.Cmd := EXT_WbMtoS
COLUMN |Next State|ReqOrb|Addr| |ReqAddr| => NxtReqOrb.Addr := ReqAddr
COLUMN |Next State|ReqOrb|Cnflt| |FALSE| => NxtReqOrb.Cnflt := FALSE
COLUMN |Next State|ReqCch|State| |I| => CchClear(NxtSta, Sta, ReqNid, ReqAddr, CCH_I) |M| => NxtReqCch.State := CCH_M |S| => NxtReqCch.State := CCH_S
COLUMN |Next State|Net|Send to Home| |RdCode| => SendHomeMsg(NxtSta, Sta, ReqAddr, ReqNid, ReqTag, HOME_RdCode) |RdData| => SendHomeMsg(NxtSta, Sta, ReqAddr, ReqNid, ReqTag, HOME_RdData) |RdInvOwn| => SendHomeMsg(NxtSta, Sta, ReqAddr, ReqNid, ReqTag, HOME_RdInvOwn) |InvItoE| => SendHomeMsg(NxtSta, Sta, ReqAddr, ReqNid, ReqTag, HOME_InvItoE) |RdCur| => SendHomeMsg(NxtSta, Sta, ReqAddr, ReqNid, ReqTag, HOME_RdCur) |WbIData| => SendWbD(NxtSta, Sta, ReqAddr, ReqNid, ReqNid, ReqTag, WBD_WbIData, ReqCch.Mask, ReqCch.Data) |WbSData| => SendWbD(NxtSta, Sta, ReqAddr, ReqNid, ReqNid, ReqTag, WBD_WbSData, ReqCch.Mask, ReqCch.Data) |WbIDataPtl| => SendWbD(NxtSta, Sta, ReqAddr, ReqNid, ReqNid, ReqTag, WBD_PtlWbData, ReqCch.Mask, ReqCch.Data)
Table G-4. Action CacheRecvData
Current State Next State Msg ReqCch ReqOrb ReqOrb ReqCch Net Cmd State State Cnflt State State Data Mask Msg Send to Home DataC_M SentReq RcvdData M Msg.Data Full Remove RcvdCmp TRUE SentAck AckCnflt FALSE None DataC_E E, S, I SentReq RcvdData E RcvdCmp TRUE SentAck AckCnflt FALSE None DataC_S S, I SentReq RcvdData S RcvdCmp TRUE SentAck AckCnflt FALSE None DataC_I SentReq RcvdData RcvdCmp TRUE SentAck AckCnflt FALSE None GntE E, S, I SentReq RcvdData E RcvdCmp TRUE SentAck AckCnflt FALSE None
Table G-4: This action is for a caching agent receiving a Data* response from either the home or a peer caching agent. The Cmp* message and the corresponding Data* message may be received in any order. The order in which they are received is indicated by the ReqOrb state (RcvdCmp, RcvdData, or SentReq, respectively).
If a conflicting snoop request was received before both the Cmp* and the Data* were received, then after receiving both messages the pending request sends an ‘AckCnflt’ message to the home. To simplify the implementation, the home may send Data*_Cmp as a single message.
Semantic Mapping for ‘CacheRecvData’
PARAMETERS ReqNid : NID; ReqTag : TAG
ALIASES
Msg : Sta.Net[ReqNid][ReqTag].DataMsg
ReqOrb : Sta.Orb[ReqNid][ReqTag]
ReqCch : Sta.Cch[ReqNid][Msg.Addr]
NxtReqOrb : NxtSta.Orb[ReqNid][ReqTag]
NxtReqCch : NxtSta.Cch[ReqNid][Msg.Addr]
COLUMN |Current State|Msg|Cmd| |DataC_M| => PendDataRsp(Sta, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = DATA_DataC_M |DataC_E| => PendDataRsp(Sta, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = DATA_DataC_E |DataC_S| => PendDataRsp(Sta, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = DATA_DataC_S |DataC_I| => PendDataRsp(Sta, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = DATA_DataC_I |GntE| => PendDataRsp(Sta, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = DATA_GntE
COLUMN |Current State|ReqOrb|State| |SentReq| => ReqOrb.State = ORB_SentReq |RcvdCmp| => ReqOrb.State = ORB_RcvdCmp
COLUMN |Current State|ReqCch|State| |S, I| => ReqCch.State in {CCH_S, CCH_I} |E, S, I| => ReqCch.State in {CCH_E, CCH_S, CCH_I} |M, E| => ReqCch.State in {CCH_M, CCH_E} |M| => ReqCch.State = CCH_M
COLUMN |Current State|ReqOrb|Cnflt| |TRUE| => ReqOrb.Cnflt = TRUE |FALSE| => ReqOrb.Cnflt = FALSE
COLUMN |Next State|ReqOrb|State| |RcvdData| => NxtReqOrb.State := ORB_RcvdData |SentAck| => NxtReqOrb.State := ORB_SentAck |None| => OrbClear(NxtSta, Sta, ReqNid, ReqTag)
COLUMN |Next State|ReqCch|State| |M| => NxtReqCch.State := CCH_M |E| => NxtReqCch.State := CCH_E |S| => NxtReqCch.State := CCH_S |I| => CchClear(NxtSta, Sta, ReqNid, Msg.Addr, CCH_I)
COLUMN |Next State|ReqCch|Data| |Msg.Data| => CopyData(NxtReqCch.Data, Msg.Data)
COLUMN |Next State|ReqCch|Mask| |Full| => SetMaskFull(NxtReqCch.Mask) |Empty| => SetMaskEmpty(NxtReqCch.Mask)
COLUMN |Next State|Net|Send to Home| |AckCnflt| => SendHomeMsg(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, HOME_AckCnflt)
COLUMN |Next State|Net|Msg| |Remove| => RecvDataRsp(NxtSta, Sta, ReqNid, ReqTag)
Table G-5. Action CacheRecvCmp
Current State Next State Msg ReqOrb ReqOrb Net Cmd State Cmd Cnflt State Cnflt Msg Send to Home Cmp SentReq WbMto* TRUE SentAck Remove AckCnflt FALSE None !=WbMto* RcvdCmp RcvdData TRUE SentAck AckCnflt FALSE None SentAck None FrcAckCnflt SentReq WbMto*a SentAck TRUE AckCnflt !=WbMto* RcvdCmp RcvdData SentAck AckCnflt
a. Example invariant: WbMto* should never receive Data.
Table G-5 “Action CacheRecvCmp” on page G-619 is for actions taken by a cache when it receives a Cmp or a FrcAckCnflt response. As mentioned in the last table, a Cmp or FrcAckCnflt may be received before the corresponding Data* or Grant*.
ORB state ‘RcvdData’ indicates that the Data* or Gnt* response was received earlier Semantic Mapping for ‘CacheRecvCmp’ PARAMETERS ReqNid : NID ReqTag : TAG ALIASES Msg : Sta.Net[ReqNid][ReqTag].HomeMsg ReqOrb : Sta.Orb[ReqNid][ReqTag] NxtReqOrb : NxtSta.Orb[ReqNid][ReqTag] COLUMN |Current State|Msg|Cmd| |Cmp| => PendCmp(Sta, ReqNid, ReqTag) & Msg.Cmd = HOME_Cmp |FrcAckCnflt| => PendCmp(Sta, ReqNid, ReqTag) & Msg.Cmd = HOME_FrcAckCnflt COLUMN |Current State|ReqOrb|State| |SentReq| => ReqOrb.State = ORB_SentReq |RcvdData| => ReqOrb.State = ORB_RcvdData |SentAck| => ReqOrb.State = ORB_SentAck COLUMN |Current State|ReqOrb|Cmd| |WbMto*| => (ReqOrb.Cmd = EXT_WbMtoI | ReqOrb.Cmd = EXT_WbMtoS | ReqOrb.Cmd= EXT_WbMtoE) |!=WbMto*| => (ReqOrb.Cmd = EXT_RdCode | ReqOrb.Cmd = EXT_RdData | ReqOrb.Cmd= EXT_RdCur | ReqOrb.Cmd= EXT_RdInvOwn | ReqOrb.Cmd= EXT_InvItoE) COLUMN |Current State|ReqOrb|Cnflt| |TRUE| => ReqOrb.Cnflt = TRUE Ref No xxxxx 619 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence |FALSE| => ReqOrb.Cnflt = FALSE COLUMN |Next State|ReqOrb|State| |RcvdCmp| => NxtReqOrb.State := ORB_RcvdCmp |RcvdFAC| => NxtReqOrb.State := ORB_RcvdFAC |SentAck| => NxtReqOrb.State := ORB_SentAck|None| => OrbClear(NxtSta, Sta, ReqNid, ReqTag) COLUMN |Next State|ReqOrb|Cnflt| |TRUE| => NxtReqOrb.Cnflt := TRUE COLUMN |Next State|Net|Send to Home| |AckCnflt| => SendHomeMsg(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, HOME_AckCnflt) COLUMN |Next State|Net|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqTag) Table G-6. Action CacheRecvFwda Current State Next State Msg ReqCch ReqOrb ReqCch Net Cmd State Mask BiasFwd State State Msg Send to Home Send Wb to Home Send to Msg.FwdTo Cmp_FwdInvItoE M Full None I Remove RspIWb WbIData Partial RspIWb WbIDataPtl E, S, I RspI Cmp_FwdInvOwn M Full TRUE RspFwdI DataC_M FALSE RspIWb WbIData Partial RspIWb WbIDataPtl E Full TRUE RspFwdI DataC_E FALSE RspI Empty RspI S, I RspI Cmp_FwdCode M Full RspIWb WbIData Partial WbIDataPtl E Full TRUE RspFwdI DataC_S FALSE RspI Empty RspI S, I RspI a. *FwdCode does not forward to peer but writes back the data to home. The state of ORB must be SentAck Table G-7 is for a cache receiving *Fwd* message from home. In response to this the caching agent may send Data to the peer node directly or send it to home and let home forward the Data to the peer node. If the copy at the caching agent was silently downgraded to S or I, RspI response is sent to home. ‘Bias Fwd’ condition is used to explore both forwarding as well as non forwarding cases. 
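As a concrete illustration of how a test harness built on the driver API of Section F.10 might exercise both settings of the ‘Bias Fwd’ parameter, consider the following C sketch. It is only a sketch: the INPUT field names ActCmd and BiasFwd and the action tag ACT_CacheRecvFwd are hypothetical placeholders, and the STATE/INPUT/OUTPUT/BOOLEAN types and the csi_* functions are assumed to come from the header generated from the model.

/* Hypothetical harness fragment: drive the model through CacheRecvFwd
 * twice, once per BiasFwd setting. Field and tag names are illustrative. */
void try_recv_fwd(STATE *Sta, STATE *NxtSta, INPUT *Inp, OUTPUT *Out)
{
    BOOLEAN bias[2] = { TRUE, FALSE };
    for (int k = 0; k < 2; k++) {
        csi_CopyState(Sta, NxtSta);      /* make *NxtSta a full copy first */
        Inp->ActCmd  = ACT_CacheRecvFwd; /* hypothetical action tag */
        Inp->BiasFwd = bias[k];          /* hypothetical parameter field */
        csi_Transition(Inp, Out, Sta, NxtSta);
        if (Out->Fired)
            csi_CopyState(NxtSta, Sta);  /* enabled: commit the outcome */
        /* if !Out->Fired, *NxtSta was not written; nothing to undo */
    }
}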
620 Ref No xxxxx Intel Restricted Secret Semantic Mapping for ‘CacheRecvFwd’ PARAMETERS ReqNid : NID ReqTag : TAG BiasFwd: BOOLEAN ALIASES Msg: Sta.Net[ReqNid][ReqTag].HomeMsg ReqOrb: Sta.Orb[ReqNid][ReqTag] ReqCch: Sta.Cch[ReqNid][Msg.Addr] NxtReqOrb: NxtSta.Orb[ReqNid][ReqTag] NxtReqCch: NxtSta.Cch[ReqNid][Msg.Addr] COLUMN |Current State|Msg|Cmd| |Cmp_FwdCode| => PendFwd(Sta, ReqNid, ReqTag) & Msg.Cmd = HOME_Cmp_FwdCode |Cmp_FwdInvItoE| => PendFwd(Sta, ReqNid, ReqTag) & Msg.Cmd = HOME_Cmp_FwdInvItoE |Cmp_FwdInvOwn| => PendFwd(Sta, ReqNid, ReqTag) & Msg.Cmd = HOME_Cmp_FwdInvOwn COLUMN |Current State|ReqCch|State| |M| => ReqCch.State in {CCH_M} |E| => ReqCch.State in {CCH_E} |S| => ReqCch.State in {CCH_S} |I| => ReqCch.State in {CCH_I} |S, I| => ReqCch.State in {CCH_S, CCH_I} |E, S, I| => ReqCch.State in {CCH_E, CCH_S, CCH_I} COLUMN |Current State|ReqCch|Mask| |Full| => MaskFull(ReqCch.Mask) |Empty| => MaskEmpty(ReqCch.Mask) |Partial| => MaskPartial(ReqCch.Mask) COLUMN |Current State|BiasFwd| |TRUE| => BiasFwd = TRUE |FALSE| => BiasFwd = FALSE COLUMN |Next State|ReqOrb|State| |None| => OrbClear(NxtSta, Sta, ReqNid, ReqTag) COLUMN |Next State|ReqCch|State| |M| => NxtReqCch.State := CCH_M |E| => NxtReqCch.State := CCH_E |S| => NxtReqCch.State := CCH_S |I| => CchClear(NxtSta, Sta, ReqNid, Msg.Addr, CCH_I) COLUMN |Next State|Net|Send to Home| |RspFwdI| => SendSnpRspOwn(NxtSta, Sta, ReqNid, Msg.FwdTo, Msg.Addr, SNP_RspFwdI) |RspIWb| => SendSnpRspOwn(NxtSta, Sta, ReqNid, Msg.FwdTo, Msg.Addr, SNP_RspIWb) |RspI| => SendSnpRspOwn(NxtSta, Sta, ReqNid, Msg.FwdTo, Msg.Addr, SNP_RspI) |RspFwd| => SendSnpRspOwn(NxtSta, Sta, ReqNid, Msg.FwdTo, Msg.Addr, SNP_RspFwd) COLUMN |Next State|Net|Send to Msg.FwdTo| |DataC_M| => SendData(NxtSta, Sta, Msg.Addr, Msg.FwdTo.Nid, Msg.FwdTo.Tag, DATA_DataC_M, ReqCch.Data) Ref No xxxxx 621 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence |DataC_E| => SendData(NxtSta, Sta, Msg.Addr, Msg.FwdTo.Nid, Msg.FwdTo.Tag, DATA_DataC_E, ReqCch.Data) |DataC_S| => SendData(NxtSta, Sta, Msg.Addr, Msg.FwdTo.Nid, Msg.FwdTo.Tag, DATA_DataC_S, ReqCch.Data) COLUMN |Next State|Net|Send Wb to Home| |WbIData| => SendWbD(NxtSta, Sta, Msg.Addr, ReqNid, Msg.FwdTo.Nid, Msg.FwdTo.Tag, WBD_WbIData, ReqCch.Mask, ReqCch.Data) |WbSData| => SendWbD(NxtSta, Sta, Msg.Addr, ReqNid, Msg.FwdTo.Nid, Msg.FwdTo.Tag, WBD_WbSData, ReqCch.Mask, ReqCch.Data) |WbIDataPtl| => SendWbD(NxtSta, Sta, Msg.Addr, ReqNid, Msg.FwdTo.Nid, Msg.FwdTo.Tag, WBD_PtlWbData, ReqCch.Mask, ReqCch.Data) COLUMN |Next State|Net|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqTag). Table G-7. Action CacheSnpOrbMiss Current State Next State Orb Msg PeerCch PeerCch Net Conflict Cmd State Mask BiasFwd BiasToI State Msg Send to Home Send Wb to Home Send to Msg.Req Miss SnpCode M Full TRUE I Remove RspIWb WbIData FALSE S RspSWb WbSData Partial I RspIWb WbIDataPtl E Full TRUE S RspFwdS DataC_S FALSE S RspS Empty I RspI S S RspS I I RspI Miss SnpData M Full TRUE TRUE I Remove RspFwdIWb WbIData DataC_E FALSE S RspFwdSWb WbSData DataC_S FALSE TRUE I RspIWb WbIData FALSE S RspSWb WbSData Partial I RspIWb WbIDataPtl E Full TRUE S RspFwdS DataC_S FALSE S RspS Empty I RspI S S RspS I I RspI 622 Ref No xxxxx Intel Restricted Secret Table G-7. 
Action CacheSnpOrbMiss (Continued) Current State Next State Orb Msg PeerCch PeerCch Net Conflict Cmd State Mask BiasFwd BiasToI State Msg Send to Home Send Wb to Home Send to Msg.Req Miss SnpInvOwn M Full TRUE I Remove RspFwdI DataC_M FALSE RspIWb WbIData Partial RspIWb WbIDataPtl E Full TRUE RspFwdI DataC_E FALSE RspI Empty RspI S, I RspI Miss SnpInvItoE M Full I Remove RspIWb WbIData Partial RspIWb WbIDataPtl E, S, I RspI Miss SnpCur M Full TRUE Remove RspFwd DataC_I FALSE TRUE I RspIWb WbIData FALSE S RspSWb WbSData Partial I RspIWb WbIDataPtl E Full TRUE RspFwd DataC_I Full FALSE RspI Empty I RspI S RspS I I RspI Table G-7: This table defines cache responses to a snoop request. ORB of the cache does not have any conflicting outgoing request to the same address. The case where ORB has a conflicting request is handled in Table G-8. All possible state transitions such as M to I or M to S for SnpCode, similarly for SnpData etc. are explored in this table. A caching agent may implement subset of these actions Semantic Mapping for ‘CacheSnpORBMiss’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID BiasFwd : Boolean BiasToI : Boolean ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] PeerCch : Sta.Cch[PeerNid][Msg.Addr] NxtPeerCch : NxtSta.Cch[PeerNid][Msg.Addr] COLUMN |Current State|Orb|Conflict| Ref No xxxxx 623 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence |Miss| => OrbMiss(Sta, Msg.Addr, PeerNid) COLUMN |Current State|Msg|Cmd| |SnpCode| => PendSnp(Sta,ReqNid, ReqTag, PeerNid) & Msg.Cmd = SNP_SnpCode|SnpData| => PendSnp(Sta,ReqNid, ReqTag, PeerNid) & Msg.Cmd = SNP_SnpData|SnpInvOwn| => PendSnp(Sta,ReqNid, ReqTag, PeerNid) & Msg.Cmd = SNP_SnpInvOwn|SnpInvItoE| => PendSnp(Sta,ReqNid, ReqTag, PeerNid) & Msg.Cmd = SNP_SnpInvItoE|SnpCur| => PendSnp(Sta,ReqNid, ReqTag, PeerNid) & Msg.Cmd = SNP_SnpCur COLUMN |Current State|PeerCch|State| |M| => PeerCch.State in {CCH_M} |E| => PeerCch.State in {CCH_E} |S| => PeerCch.State in {CCH_S} |I| => PeerCch.State in {CCH_I} |S, I| => PeerCch.State in {CCH_S, CCH_I} |E, S, I| => PeerCch.State in {CCH_E, CCH_S, CCH_I} COLUMN |Current State|PeerCch|Mask| |Full| => MaskFull(PeerCch.Mask) |Empty| => MaskEmpty(PeerCch.Mask) |Partial| => MaskPartial(PeerCch.Mask) COLUMN |Current State|BiasFwd| |TRUE| => BiasFwd = TRUE |FALSE| => BiasFwd = FALSE COLUMN |Current State|BiasToI| |TRUE| => BiasToI = TRUE |FALSE| => BiasToI = FALSE COLUMN |Next State|PeerCch|State| |S| => NxtPeerCch.State := CCH_S |I| => CchClear(NxtSta, Sta, PeerNid, Msg.Addr, CCH_I) COLUMN |Next State|Net|Send to Home| |RspI| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspI) |RspS| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspS) |RspIWb| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspIWb) |RspSWb| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspSWb) |RspFwd| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspFwd) |RspFwdI| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspFwdI) |RspFwdS| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspFwdS) |RspFwdIWb| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspFwdIWb) |RspFwdSWb| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspFwdSWb) COLUMN |Next State|Net|Send to Msg.Req| |DataC_M| => SendData(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, DATA_DataC_M, 
PeerCch.Data) |DataC_E| => SendData(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, DATA_DataC_E, 624 Ref No xxxxx Intel Restricted Secret PeerCch.Data) |DataC_S| => SendData(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, DATA_DataC_S, PeerCch.Data) |DataC_I| => SendData(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, DATA_DataC_I, PeerCch.Data) COLUMN |Next State|Net|Send Wb to Home| |WbIData| => SendWbD(NxtSta, Sta, Msg.Addr, PeerNid, ReqNid, ReqTag, WBD_WbIData, PeerCch.Mask, PeerCch.Data) |WbSData| => SendWbD(NxtSta, Sta, Msg.Addr, PeerNid, ReqNid, ReqTag, WBD_WbSData, PeerCch.Mask, PeerCch.Data) |WbIDataPtl| => SendWbD(NxtSta, Sta, Msg.Addr, PeerNid, ReqNid, ReqTag, WBD_PtlWbData, PeerCch.Mask, PeerCch.Data) COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) Table G-8. Action CacheSnpOrbHit Current State Next State PeerOrb hit Msg PeerOrb PeerCch PeerOrb PeerCch Net Cmd State State Mask Cnflt State Msg Send to Home TRUE SnpCode, SnpData SentReq S TRUE S Remove RspCnflt E Full Partial I I RcvdData RcvdCmp SentAck Buffer TRUE SnpInvOwn, SentReq E, S, I TRUE I Remove RspCnflt SnpInvItoE RcvdData RcvdCmp SentAck Buffer TRUE SnpCur SentReq E, S, I TRUE Remove RspCnflt RcvdData RcvdCmp SentAck Buffer Semantic Mapping for ‘CacheSnpOrbHit’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID PeerTag : TAG ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Ref No xxxxx 625 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence PeerCch : Sta.Cch[PeerNid][Msg.Addr] PeerOrb : Sta.Orb[PeerNid][PeerTag] NxtPeerCch : NxtSta.Cch[PeerNid][Msg.Addr] NxtPeerOrb : NxtSta.Orb[PeerNid][PeerTag] COLUMN |Current State|PeerOrb hit| |TRUE| => OrbHit(Sta, Msg.Addr, PeerNid, PeerTag) COLUMN |Current State|Msg|Cmd| |SnpCode, SnpData| => (Msg.Cmd = SNP_SnpCode | Msg.Cmd = SNP_SnpData) |SnpInvOwn, SnpInvItoE| => (Msg.Cmd = SNP_SnpInvOwn | Msg.Cmd = SNP_SnpInvItoE) |SnpCur| => Msg.Cmd = SNP_SnpCur COLUMN |Current State|PeerOrb|State| |SentReq| => PeerOrb.State = ORB_SentReq |RcvdData| => PeerOrb.State = ORB_RcvdData |RcvdCmp| => PeerOrb.State = ORB_RcvdCmp |SentAck| => PeerOrb.State = ORB_SentAck COLUMN |Current State|PeerCch|State| |E| => PeerCch.State in {CCH_E} |S| => PeerCch.State in {CCH_S} |I| => PeerCch.State in {CCH_I} |E, S, I| => PeerCch.State in {CCH_E, CCH_S, CCH_I} COLUMN |Current State|PeerCch|Mask| |Full| => MaskFull(PeerCch.Mask) |Empty| => MaskEmpty(PeerCch.Mask) |Partial| => MaskPartial(PeerCch.Mask) COLUMN |Next State|PeerOrb|Cnflt| |TRUE| => NxtPeerOrb.Cnflt := TRUE COLUMN |Next State|PeerCch|State| |S| => NxtPeerCch.State := CCH_S |I| => CchClear(NxtSta, Sta, PeerNid, Msg.Addr, CCH_I) COLUMN |Next State|Net|Send to Home| |RspCnflt| => SendSnpRsp(NxtSta, Sta, PeerNid, Msg.Addr, ReqNid, ReqTag, SNP_RspCnflt) COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) |Buffer|=>BuffSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) 626 Ref No xxxxx Intel Restricted Secret Ref No xxxxx 627 Ref No xxxxx 627 Table G-9. Action HomeRecvReq Current State Next State Msg Dir SPT SPT Dir PRB Net Cmd Coarse Dir Mode PV[ req. nid] State Exists Cnflt Req CV[ req. nid] Cmd (All SnpRsp Rcvd & All CV[i]=0) Except msg.from State CV[ req. nid] State PV[ req. nid] NRV [req. 
nid] Place in PRB Msg !=WbMto* FALSE 0 FALSE YES Remove 1 1 0 TRUE 1 0 1 1 M, S TRUE 1 RdCode, RdData, RdCur YES ReadyTo Respond 0 Sa 1 S NO FALSE 1 M, S TRUE 1 RdInvOwn, InvItoE YES ReadyTo Respond 0 I if all PV[i] = 0 except msg.from and SPT.Cmd != RdCur 0 0 S NO TRUE YES a. Can be split into {inv req in SPT} and {non inv req in SPT} PV=1 or 0 respectively An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence Table G-9 “Action HomeRecvReq” on page G-627, Table B-10 and Table B-11 describe actions taken by home after receiving a request. ‘Exists Cnflt Req’ checks if there is a request in SPT to the same address. ‘(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from’ is to check if all messages effecting the SPT request are received, for example if a node had sent a RspCnflt has either a request or an AckCnflt from that node is received. Semantic Mapping for ‘HomeRecvReq’ PARAMETERS ReqNid : NID ReqTag : TAG SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].HomeMsg Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] COLUMN |Current State|Msg|Cmd| |!=WbMto*| => (Msg.Cmd = HOME_RdCode | Msg.Cmd = HOME_RdData | Msg.Cmd = HOME_RdCur | Msg.Cmd = HOME_RdInvOwn | Msg.Cmd = HOME_InvItoE) COLUMN |Current State|Dir|Coarse Dir Mode| |FALSE| => Sta.Dir[Msg.Addr].CoarseMode = FALSE |TRUE| => Sta.Dir[Msg.Addr].CoarseMode = TRUE COLUMN |Current State|Dir|PV[req.nid]| |1| => Sta.Dir[Msg.Addr].PV[ReqNid] = TRUE |0| => Sta.Dir[Msg.Addr].PV[ReqNid] = FALSE COLUMN |Current State|Dir|State| |M| => Sta.Dir[Msg.Addr].DirSta = DIR_M |M, S| => (Sta.Dir[Msg.Addr].DirSta = DIR_M | Sta.Dir[Msg.Addr].DirSta = DIR_S) |S| => Sta.Dir[Msg.Addr].DirSta = DIR_S|I| => Sta.Dir[Msg.Addr].DirSta = DIR_I COLUMN |Current State|SPT|Cmd| |RdCode, RdData, RdCur| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur) |RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) COLUMN |Current State|SPT|Exists Cnflt Req| |TRUE| => SPTHasCnfltReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) = TRUE |FALSE| => SPTHasReqAny(Sta, Sta.Home[Msg.Addr], Msg.Addr) = FALSE COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, ReqNid) & AllCVExcept(Sta, Spt, ReqNid)) = TRUE|NO| => (AllSnpRspRcvdExcept(Sta, Spt, ReqNid) & AllCVExcept(Sta, Spt, ReqNid)) = FALSE COLUMN |Current State|SPT|CV[req.nid]| |1| => Spt.CV[ReqNid] = TRUE |0| => Spt.CV[ReqNid] = FALSE COLUMN |Next State|SPT|State| |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond 628 Ref No xxxxx Intel Restricted Secret COLUMN |Next State|SPT|CV[req.nid]| |1| => NxtSpt.CV[ReqNid] := TRUE |0| => NxtSpt.CV[ReqNid] := FALSE COLUMN |Next State|Dir|State| |I if all PV[i] = 0 except msg.from and SPT.Cmd != RdCur| => if forall i:NID doi=ReqNid | Sta.Dir[Msg.Addr].PV[i]=FALSE endforall & Spt.Cmd !=EXT_RdCur then NxtSta.Dir[Msg.Addr].DirSta := DIR_I endif |I| => NxtSta.Dir[Msg.Addr].DirSta := DIR_I |S| => NxtSta.Dir[Msg.Addr].DirSta := DIR_S COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtSta.Dir[Msg.Addr].PV[ReqNid] := TRUE |0| => NxtSta.Dir[Msg.Addr].PV[ReqNid] := FALSE COLUMN |Next State|Dir|NRV[req.nid]| |1| => NxtSta.Dir[Msg.Addr].NRV[ReqNid] := TRUE |0| => NxtSta.Dir[Msg.Addr].NRV[ReqNid] := FALSE COLUMN |Next State|PRB|Place in PRB| |YES| => EnterPRB(NxtSta, Sta, Msg.Addr, 
ReqType(Msg.Cmd), ReqNid, ReqTag) COLUMN |Next State|Net|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqTag) Ref No xxxxx 629 Intel Restricted Secret Table G-10. Action HomeRecvExplicitWbReq Current State Next State Msg SPT PRB SPT Dir Net Cmd State Exists Cnflt Req CV [req.nid] Cmd (All Snp RspRcvd & All CV[i]=0) Exceptmsg.from Place in PRB State CV [req.nid] State PV [req.nid] NRV [msg.from] Home Msg WbIData, WbIDataPtl None FALSE YES I 0 Write Remove TRUE 0 1 1 YES ReadyToRespond 0 0 NO WbSData None FALSE YES S 1 1 TRUE 0 1 RdCode, RdData, RdCur YES ReadyToRespond 0 1 NO RdInvOwn, InvItoE YES ReadyToRespond I 0 0 NO E An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence 630 Ref No xxxxx Intel Restricted Secret Semantic Mapping for ‘HomeRecvExplicitWbReq’ PARAMETERS ReqNid : NID ReqTag : TAG SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].WbDMsg Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] COLUMN |Current State|Msg|Cmd| |WbIData, WbIDataPtl| => (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_WbSData | Msg.Cmd = WBD_WbEData | Msg.Cmd = WBD_PtlWbData) & (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_PtlWbData) |WbIData| => (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_WbSData | Msg.Cmd = WBD_WbEData | Msg.Cmd = WBD_PtlWbData) & Msg.Cmd = WBD_WbIData|WbSData| => (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_WbSData | Msg.Cmd = WBD_WbEData | Msg.Cmd = WBD_PtlWbData) & Msg.Cmd = WBD_WbSData|WbEData| => (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_WbSData | Msg.Cmd = WBD_WbEData | Msg.Cmd = WBD_PtlWbData) & Msg.Cmd = WBD_WbEData COLUMN |Current State|SPT|State| |None| => Spt.State = SPT_None COLUMN |Current State|SPT|Cmd| |RdCode, RdData, RdCur| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur) |RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) COLUMN |Current State|SPT|Exists Cnflt Req| |TRUE| => SPTHasCnfltReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) = TRUE|FALSE| => SPTHasCnfltReq(Sta, Sta.Home[Msg.Addr], Msg.Addr, ReqNid, ReqTag) = FALSE COLUMN |Current State|SPT|CV[req.nid]| |1| => Spt.CV[ReqNid] = TRUE |0| => Spt.CV[ReqNid] = FALSE COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, ReqNid) & AllCVExcept(Sta, Spt, ReqNid)) = TRUE|NO| => (AllSnpRspRcvdExcept(Sta, Spt, ReqNid) & AllCVExcept(Sta, Spt, ReqNid)) = FALSE COLUMN |Next State|SPT|State| |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|CV[req.nid]| |1| => NxtSpt.CV[ReqNid] := TRUE |0| => NxtSpt.CV[ReqNid] := FALSE COLUMN |Next State|Dir|State| |I| => NxtSta.Dir[Msg.Addr].DirSta := DIR_I |I if SPT req!=RdCur| => if Spt.Cmd != EXT_RdCur then NxtSta.Dir[Msg.Addr].DirSta := DIR_I endif|S| => NxtSta.Dir[Msg.Addr].DirSta := DIR_S Ref No xxxxx 631 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtSta.Dir[Msg.Addr].PV[ReqNid] := TRUE |0| => NxtSta.Dir[Msg.Addr].PV[ReqNid] := FALSE |0 if SPT req!=RdCur| => if Spt.Cmd != EXT_RdCur then NxtSta.Dir[Msg.Addr].PV[ReqNid] := FALSE endif COLUMN |Next State|Dir|NRV[msg.from]| |0| => NxtSta.Dir[Msg.Addr].NRV[ReqNid] := FALSE |1| => NxtSta.Dir[Msg.Addr].NRV[ReqNid] := TRUE COLUMN |Next State|PRB|Place in PRB| |YES| => EnterPRB(NxtSta, Sta, Msg.Addr, WbReqType(Msg.Cmd), ReqNid, ReqTag) COLUMN |Next State|Net|Msg| |Remove| => RecvWbDMsg(NxtSta, 
Sta, ReqNid, ReqTag) COLUMN |Next State|Home| |Write| => CopyDataViaMask(NxtSta.Mem[Msg.Addr].Data, Msg.Data, Msg.Mask 632 Ref No xxxxx Intel Restricted Secret ) Table G-11. Action HomePRBtoSPTNoCDMa Current State Next State PRB Dir SPT Dir PRB Net Cmd Pointer Overflow State PV[req. nid] State State PV[req. nid] NRV[req. nid] State Send to Peers Send to Requester RdCode NO M 0 SentSnp 0 None SnpCode 1 None M 1 DataC_S_C mp S, I S 1 DataC_S_C mp YES SentPOSnp SnpCode RdData NO M 0 SentSnp 0 SnpData 1 None M 1 DataC_E_C mp S S 1 DataC_S_C mp I M 1 DataC_E_C mp YES SentPOSnp SnpCode RdCur M 0 SentSnp 0 SnpCur 1 None I 0 DataC_I_Cm pS, I RdInvOwn M 0 SentSnp 0 SnpInvOwn 1 None 1 DataC_E_C mp S exists i != nid: PV[i]=1 SentSnp SnpInvOwn PV[i]=1 only for i=nid None M 1 DataC_E_C mp I None M 1 InvItoE M 0 SentSnp 0 None SnpInvItoE 1 None 1 GntE_Cmp S exists i != nid: PV[i]=1 SentSnp SnpInvItoE PV[i]=1 only for i=nid None M 1 GntE_Cmp I WbMto* None 0 Cmp a. Coarse Dir Mode = FALSE; Pointer Overflow = NO Ref No xxxxx 633 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence In Table G-11 and Table G-12, a valid PRB entry is moved into SPT and processed. Predicate ‘exists i != nid: PV[i]=1’ checks if a node other than the requester has a copy of the requested line. NoCDM and CDM in the titles indicates the table is for full directory mode and coarse directory mode respectively. Semantic Mapping for ‘HomePRBtoSPTNoCDM’ PARAMETERS ReqNid : NID ReqTag : TAG SptHid : HID SptIndex : SPTX ALIASES Spt : Sta.Spt[SptHid][SptIndex] Prb : Sta.Prb[SptHid][ReqNid][ReqTag] Dir : Sta.Dir[Prb.Addr] NxtDir : NxtSta.Dir[Prb.Addr] NxtPrb : NxtSta.Prb[SptHid][ReqNid][ReqTag] NxtSpt : NxtSta.Spt[SptHid][SptIndex] COLUMN |Current State|PRB|Cmd| |RdCode| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdCode|RdData| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdData|RdCur| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdCur |RdInvOwn| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdInvOwn |InvItoE| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_InvItoE|WbMto*| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & !Sta.Dir[Prb.Addr].CoarseMode & (Prb.Cmd = EXT_WbMtoI | Prb.Cmd = EXT_WbMtoS | Prb.Cmd = EXT_WbMtoE) COLUMN |Current State|Dir|Pointer Overflow| |YES| => MaxSharers(Sta, Prb.Addr, ReqNid) = TRUE |NO| => MaxSharers(Sta, Prb.Addr, ReqNid) = FALSE COLUMN |Current State|Dir|State| |M| => Dir.DirSta = DIR_M |S| => Dir.DirSta = DIR_S |I| => Dir.DirSta = DIR_I |S, I| => (Dir.DirSta = DIR_S | Dir.DirSta = DIR_I) COLUMN |Current State|Dir|PV[req.nid]| |0| => Dir.PV[ReqNid] = FALSE |1| => 
Dir.PV[ReqNid] = TRUE |exists i != nid: PV[i]=1| => exists i:NID do i!=ReqNid & Dir.PV[i] endexists |PV[i]=1 only for i=nid| => Dir.PV[ReqNid] & forall i:NID do i=ReqNid | !Dir.PV[i] endforall 634 Ref No xxxxx Intel Restricted Secret COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, SptHid, SptIndex) |SentPOSnp| => EnterSPT(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag); NxtSpt.State := SPT_SentPOSnp |SentSnp| => EnterSPT(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag); NxtSpt.State := SPT_SentSnp COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S |I| => NxtDir.DirSta := DIR_I COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE |0| => NxtDir.PV[ReqNid] := FALSE COLUMN |Next State|Dir|NRV[req.nid]| |0| => NxtDir.NRV[ReqNid] := FALSE COLUMN |Next State|Net|Send to Peers| |SnpCode| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, Spt.Txn, Prb.Addr, SNP_SnpCode) |SnpData| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, Spt.Txn, Prb.Addr, SNP_SnpData) |SnpCur| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, Spt.Txn, Prb.Addr, SNP_SnpCur) |SnpInvOwn| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, Spt.Txn, Prb.Addr, SNP_SnpInvOwn) |SnpInvItoE| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, Spt.Txn, Prb.Addr, SNP_SnpInvItoE) COLUMN |Next State|Net|Send to Requester| |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Prb.Addr, DATA_DataC_S, Sta.Mem[Prb.Addr].Data) |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Prb.Addr, DATA_DataC_E, Sta.Mem[Prb.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Prb.Addr, DATA_DataC_I, Sta.Mem[Prb.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn, Prb.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Prb.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) COLUMN |Next State|PRB|State| |None| => ClearPRB(NxtSta, Sta, SptHid, ReqNid, ReqTag) Ref No xxxxx 635 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence Table G-12. Action HomePRBtoSPTCDM Current State Next State PRB SPT Dir PRB Net Cmd State PV[req. 
nid] State Send to Peers Send to Requester RdCode SentFAC 1 None DataC_S_FA CRdData RdCur DataC_I_FA C RdInvOwn SentSnp None SnpInvOwn InvItoE SentSnp SnpInvItoE WbMto* SentFAC FAC Semantic Mapping for ‘HomePRBtoSPTCDM’ PARAMETERS ReqNid : NID ReqTag : TAG SptHid : HID SptIndex : SPTX ALIASES Prb : Sta.Prb[SptHid][ReqNid][ReqTag] Spt : Sta.Spt[SptHid][SptIndex] Dir : Sta.Dir[Prb.Addr] NxtPrb : NxtSta.Prb[SptHid][ReqNid][ReqTag] NxtDir : NxtSta.Dir[Prb.Addr] NxtSpt : NxtSta.Spt[SptHid][SptIndex] COLUMN |Current State|PRB|Cmd| |RdCode| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdCode|RdData| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdData|RdCur| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdCur |RdInvOwn| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_RdInvOwn |InvItoE| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & Prb.Cmd = EXT_InvItoE|WbMto*| => Prb.Cmd != EXT_None & Sta.Spt[SptHid][SptIndex].State = SPT_None & SPTFull(Sta, SptHid) = FALSE & SPTHasReqAny(Sta, SptHid, Prb.Addr) = FALSE & Sta.Dir[Prb.Addr].CoarseMode & (Prb.Cmd = EXT_WbMtoI | Prb.Cmd = EXT_WbMtoS | Prb.Cmd = EXT_WbMtoE) COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, SptHid, SptIndex) 636 Ref No xxxxx Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence Intel Restricted Secret |SentSnp| => EnterSPT(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag); NxtSpt.State := SPT_SentSnp |SentFAC| => EnterSPT(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag); NxtSpt.State := SPT_SentFAC COLUMN |Next State|Dir|PV[req.nid] | |1| => NxtDir.PV[ReqNid] := TRUE |0| => NxtDir.PV[ReqNid] := FALSE COLUMN |Next State|Net|Send to Peers| |SnpInvOwn| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag, Prb.Addr, SNP_SnpInvOwn) |SnpInvItoE| => SendSnpReq(NxtSta, Sta, SptHid, SptIndex, ReqNid, ReqTag, Prb.Addr, SNP_SnpInvItoE) COLUMN |Next State|Net|Send to Requester| |DataC_S_FAC| => SendDataFAC(NxtSta, Sta, ReqNid, ReqTag, Prb.Addr, DATA_DataC_S, Sta.Mem[Prb.Addr].Data) |DataC_I_FAC| => SendDataFAC(NxtSta, Sta, ReqNid, ReqTag, Prb.Addr, DATA_DataC_I, Sta.Mem[Prb.Addr].Data) |FAC| => SendHomeMsg(NxtSta, Sta, Prb.Addr, ReqNid, ReqTag, HOME_FrcAckCnflt) COLUMN |Next State|PRB|State| |None| => ClearPRB(NxtSta, Sta, SptHid, ReqNid, ReqTag) . Table G-13. Action HomeRecvSnpRspNoCDM Current State Next State Msg SPT SPT Dir Net Cmd ACV[ msg. from] Cmd State SRP[ msg. from] (All SnpRsp Rcvd& All CV[i]=0) Except msg.from Pointer Overflow State SRP[ msg. from] ACV[ msg. 
from] Data Fwded Coarse Dir Mode State PV[ req.nid ] PV[msg.from] Msg Send to Requester RspFwdI 0 RdCode, RdData, SentSnp 1 YES None M 1 0 Remove Cmp 1 RdInvOwn 2 YES 1 YES None 0 1 Cmp NO -1 0 YES I RspFwdS 0 RdCode, RdData a 1 YES None S 1 1 Cmp RspFwd RdCur Cmp Table G-13. Action HomeRecvSnpRspNoCDM (Continued) Current State Next State Msg SPT SPT Dir Net Cmd ACV[ msg. from] Cmd State SRP[ msg. from] (All SnpRsp Rcvd & All CV[i]=0) Except msg.from Pointer Overflow State SRP[ msg. from] ACV[ msg. from] Data Fwded Coarse Dir Mode State PV[ req.nid ] PV[msg.from] Msg Send to Requester RspI 0 !=WbMto* SentSnp, 1 NO -1 I if all PV[i] = 0 except msg.from 0 if SPT.Cmd != RdCur Remove SentPOSnp b YES ReadyTo Respond RspI 1 !=WbMto* SentSnp, 2 -1 and SPT.Cmd != RdCur SentPOSnp b 1 YES ReadyTo Respond 0 NO WaitWbData 1 WaitRspFwd 2 1 YES ReadyTo Respond 0 NO RspS c 0 RdCode, RdData SentSnp 1 YES None -1 S 1 Remove DataC_S_Cmp NO SentPOSnp YES YES SentFAC TRUE S 1 DataC_S_FAC NO None DataC_S_Cmp NO RdCur SentSnp YES None DataC_I_Cmp NO a. RdCur receives RspFwd. b. RdCur, RdInvOwn and InvItoE will never have SentPOSnp as the current SPT state. c. RdInvOwn and InvItoE cannot receive an RspS response. Table G-13 and Table G-14 are for home receiving a snoop response; the snoop response does not include an implicit write back. Again, the two tables are for non-coarse directory mode and coarse directory mode, respectively. The action ‘I if all PV[i] = 0 except msg.from and SPT.Cmd != RdCur’ states that the directory state is set to I if all the nodes except the responder do not have a copy and the command is not RdCur. If the command is RdCur, RspI does not imply that the responder has given up its copy of the line.
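To make that next-state rule concrete, the following is a minimal C sketch of the ‘I if all PV[i] = 0 except msg.from and SPT.Cmd != RdCur’ predicate. The type names (NID, MAX_NID, dir_entry_t) are illustrative assumptions, not part of the specification.

#include <stdbool.h>

typedef int NID;
#define MAX_NID 8                 /* illustrative node count */

typedef struct {
    bool PV[MAX_NID];             /* directory presence vector */
} dir_entry_t;

/* True when the directory may be set to I after an RspI: every node
   other than the responder (msg_from) has PV[i] = 0 and the tracked
   command is not RdCur. For RdCur the responder may retain the line,
   so the directory must not be invalidated. */
static bool dir_goes_invalid(const dir_entry_t *dir, NID msg_from,
                             bool cmd_is_rdcur)
{
    if (cmd_is_rdcur)
        return false;
    for (NID i = 0; i < MAX_NID; i++)
        if (i != msg_from && dir->PV[i])
            return false;         /* some other node still holds a copy */
    return true;
}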
Semantic Mapping for ‘HomeRecvSnpRspNoCDM’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |RspFwdI| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspFwdI |RspFwdS| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspFwdS|RspFwd| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspFwd|RspI| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspI|RspS| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspS COLUMN |Current State|SPT|Cmd| |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |RdCode, RdData| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData) |RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) |RdCur| => Spt.Cmd = EXT_RdCur |RdCode| => Spt.Cmd = EXT_RdCode |RdData| => Spt.Cmd = EXT_RdData |InvItoE| => Spt.Cmd = EXT_InvItoE |RdCode, RdData, RdInvOwn| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdInvOwn) |RdCode, RdData, RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) |!=WbMto*| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur | Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) COLUMN |Current State|SPT|Pointer Overflow| |YES| => MaxSharers(Sta, Spt.Addr, ReqNid) = TRUE |NO| => MaxSharers(Sta, Spt.Addr, ReqNid) = FALSE Ref No xxxxx 639 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = TRUE|NO| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = FALSE COLUMN |Current State|SPT|State| |SentSnp| => Spt.State = SPT_SentSnp|SentPOSnp| => Spt.State = SPT_SentPOSnp |SentSnp, SentPOSnp| => (Spt.State = SPT_SentSnp | Spt.State = SPT_SentPOSnp) |WaitWbData| => Spt.State = SPT_WaitWbData|WaitRspFwd| => Spt.State = SPT_WaitRspFwd COLUMN |Current State|SPT|SRP[msg.from]| |1| => Spt.SRP[PeerNid] = 1 |2| => Spt.SRP[PeerNid] = 2 COLUMN |Current State|SPT|ACV[msg.from]| |0| => Spt.ACV[PeerNid] = FALSE |1| => Spt.ACV[PeerNid] = TRUE COLUMN |Next State|Dir|Coarse Dir Mode| |TRUE| => NxtDir.CoarseMode := TRUE; for i:NID do NxtDir.NRV[i]:=FALSE endfor COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond |SentSnp| => NxtSpt.State := SPT_SentSnp |SentFAC| => NxtSpt.State := SPT_SentFAC COLUMN |Next State|SPT|ACV[msg.from]| |0| => NxtSpt.ACV[PeerNid] := FALSE |1| => NxtSpt.ACV[PeerNid] := TRUE COLUMN |Next State|SPT|SRP[msg.from]| |-1| => NxtSpt.SRP[PeerNid] := Spt.SRP[PeerNid] - 1 
COLUMN |Next State|SPT|Data Fwded| |YES| => NxtSpt.DataFwded := TRUE |NO| => NxtSpt.DataFwded := FALSE COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S |I| => NxtDir.DirSta := DIR_I |I if all PV[i] = 0 except msg.from and SPT.Cmd != RdCur| => if forall i:NID do i=PeerNid | Sta.Dir[Msg.Addr].PV[i]=FALSE endforall & Spt.Cmd != EXT_RdCur then NxtDir.DirSta := DIR_I endif COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE COLUMN |Next State|Dir|PV[msg.from]| |0| => NxtDir.PV[PeerNid] := FALSE |0 if SPT.Cmd != RdCur| => if Spt.Cmd != EXT_RdCur then NxtDir.PV[PeerNid] := FALSE; endif |1| => NxtDir.PV[PeerNid] := TRUE 640 Ref No xxxxx Intel Restricted Secret COLUMN |Next State|Dir|NRV[msg.from]| |0| => NxtDir.NRV[PeerNid] := FALSE COLUMN |Next State|PRB|Place in PRB| |Move SPT req into PRB| => MoveSPTintoPRB(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) COLUMN |Next State|Net|Send to Requester| |DataC_S_FAC| => SendDataFAC(NxtSta, Sta, ReqNid, ReqTag, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, ReqNid, ReqTag, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, ReqNid, ReqTag, Spt.Addr, DATA_DataC_E, Sta.Mem[Spt.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, ReqNid, ReqTag, Spt.Addr, DATA_DataC_I, Sta.Mem[Spt.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, ReqNid, ReqTag, Spt.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) Table G-14. Action HomeRecvSnpRspCDM Current State Next State Msg SPT SPT Net Cmd Cmd State Num_SRP State Num_SRP Msg RspI RdInvOwn, InvItoE SentSnp > 1 -1 Remove 1 ReadyTo Respond Ref No xxxxx 641 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence Semantic Mapping for ‘HomeRecvSnpRspCDM’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |RspI| => PendFwdNoWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspI COLUMN |Current State|SPT|Cmd| |RdInvOwn, InvItoE| => (Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].Cmd = EXT_RdInvOwn | Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].Cmd = EXT_InvItoE) COLUMN |Current State|SPT|State| |SentSnp| => Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].State = SPT_SentSnp COLUMN |Current State|SPT|Num_SRP| |1| => Spt.NumSRP = 1 |> 1| => Spt.NumSRP > 1 COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|Num_SRP| |-1| => NxtSpt.NumSRP := Spt.NumSRP - 1 COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) 642 Ref No xxxxx Intel Restricted Secret Ref No xxxxx 643 Ref No xxxxx 643 Table G-15. Action HomeRecvWbSnpRsp Current State Next State Msg SPT SPT Dir Net Cmd ACV[ msg. from] Cmd State SRP[ msg. from] (All Snp Rsp Rcvd & All CV[i]=0) Except msg.froma State SRP[msg.from] ACV[ msg. from] Data Fwded State PV[ req. 
nid] PV[msg.from] Msg Send to Requester RspFwdIWb RdCode SentSnp WaitWbData -1 YES S 1 0 Remove RdData SentSnp WaitWbData M WaitRspFwd YES None Cmp NO YES InvItoE SentSnp WaitWbData RspFwdSWb RdCode SentSnp WaitWbData YES S 1 RdData SentSnp WaitRspFwd YES None Cmp NO YES Table G-15. Action HomeRecvWbSnpRsp (Continued) RspIWb b 0 RdCode SentSnp WaitWbData -1 NO I 0 Remove WaitRspFwd YES None S 1 DataC_S_Cmp NO NO I RdData SentSnp WaitWbData NO I WaitRspFwd YES None M 1 DataC_E_Cmp NO NO I RdCur SentSnp WaitWbData NO I c WaitRspFwd YES None DataC_I_Cmp NO NO RdInvOwn SentSnp WaitWbData NO M 1 WaitRspFwd YES None DataC_E_Cmp NO NO I InvItoE SentSnp WaitWbData NO I WaitRspFwd YES None M 1 GntE_Cmp NO NO I RspIWb 1 !=WbMto* SentSnp 2 WaitWbData -1 NO I 0 Remove 1 WaitRspFwd 2 1 YES ReadyToRespond 0 NO 0 RspSWb 0 RdCode, RdData SentSnp WaitWbData -1 NO S Remove WaitRspFwd YES None 1 1 DataC_S_Cmp NO NO RdCur SentSnp WaitWbData NO WaitRspFwd YES None DataC_I_Cmp NO NO a. For all i except msg.from, CV[i] = NRV[i] = 0; there is no need to check NRV[i] = 0 separately, since all CV = 0 implies that all snoop responses have been received, and any node with CV[i] = 1 has received its corresponding new request or AckCnflt. b. Includes the response for a partial-data write back, since home does not yet know whether the data is partial. c. In certain cases SnpCur changes the cache state, and this can be determined from the type of the received snoop response. These are the home actions upon receiving a snoop response that indicates an implicit write back in progress. These actions are not divided into coarse and non-coarse directory modes, because an implicit write back can occur only when there is a single cached copy, and the protocol requires explicit pointers to track such a node.
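As an illustration of the four writeback snoop responses just tabulated, here is a hedged C sketch that classifies them by whether data is also forwarded to the requester; all four imply that a WbData message will follow to home, which is why the SPT moves toward WaitWbData. The enum and helper names are hypothetical, not spec-defined.

#include <stdbool.h>

/* The four snoop responses of Table G-15; each implies an implicit
   writeback (WbData) to home. The Fwd variants additionally forward
   data to the requester; the I/S suffix is the responder's final
   cache state. */
typedef enum {
    RSP_FWD_I_WB,   /* RspFwdIWb */
    RSP_FWD_S_WB,   /* RspFwdSWb */
    RSP_I_WB,       /* RspIWb    */
    RSP_S_WB        /* RspSWb    */
} wb_snp_rsp_t;

static bool forwards_data_to_requester(wb_snp_rsp_t r)
{
    return r == RSP_FWD_I_WB || r == RSP_FWD_S_WB;
}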
Semantic Mapping for ‘HomeRecvWbSnpRsp’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |RspFwdIWb| => PendFwdWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = SNP_RspFwdIWb |RspFwdSWb| => PendFwdWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = SNP_RspFwdSWb|RspIWb| => PendFwdWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = SNP_RspIWb|RspSWb| => PendFwdWb(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Msg.Cmd = SNP_RspSWb COLUMN |Current State|SPT|State| |SentSnp| => Spt.State = SPT_SentSnp |WaitRspFwd| => Spt.State = SPT_WaitRspFwd COLUMN |Current State|SPT|Cmd| |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |RdCur| => Spt.Cmd = EXT_RdCur |RdCode| => Spt.Cmd = EXT_RdCode |RdData| => Spt.Cmd = EXT_RdData |InvItoE| => Spt.Cmd = EXT_InvItoE |RdCode, RdData| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData) |RdData, RdInvOwn| => (Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdInvOwn) |RdData, RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) |!=WbMto*| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur | Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) Ref No xxxxx 645 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence COLUMN |Current State|SPT|SRP[msg.from] | |1| => Spt.SRP[PeerNid] = 1 |2| => Spt.SRP[PeerNid] = 2 COLUMN |Current State|SPT|ACV[msg.from] | |0| => Spt.ACV[PeerNid] = FALSE |1| => Spt.ACV[PeerNid] = TRUE COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = TRUE |NO| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = FALSE COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |WaitWbData| => NxtSpt.State := SPT_WaitWbData |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|Data Fwded| |YES| => NxtSpt.DataFwded := TRUE |NO| => NxtSpt.DataFwded := FALSE COLUMN |Next State|SPT|ACV[msg.from]| |0| => NxtSpt.ACV[PeerNid] := FALSE COLUMN |Next State|SPT|SRP[msg.from] | |-1| => NxtSpt.SRP[PeerNid] := Spt.SRP[PeerNid] - 1 COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S |I| => NxtDir.DirSta := DIR_I |I if all PV[i] = 0 Except msg.from| => if forall i:NID do i=PeerNid | Sta.Dir[Msg.Addr].PV[i]=FALSE endforall then NxtDir.DirSta := DIR_I endif COLUMN |Next State|Dir|PV[req.nid] | |1| => NxtDir.PV[ReqNid] := TRUE COLUMN |Next State|Dir|PV[msg.from] | |0| => NxtDir.PV[PeerNid] := FALSE |1| => NxtDir.PV[PeerNid] := TRUE 646 Ref No xxxxx Intel Restricted Secret COLUMN |Next State|Dir|NRV[msg.from] | |0| => NxtDir.NRV[PeerNid] := FALSE COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) COLUMN |Next State|Net|Send to Requester| |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn.Nid, Spt.Txn.Tag, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_E_Cmp| => 
SendDataCmp(NxtSta, Sta, Spt.Txn.Nid, Spt.Txn.Tag, Spt.Addr, DATA_DataC_E, Sta.Mem[Spt.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn.Nid, Spt.Txn.Tag, Spt.Addr, DATA_DataC_I, Sta.Mem[Spt.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn.Nid, Spt.Txn.Tag, Spt.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) Table G-16. Action HomeRecvImplicitWbData Current State Next State Msg SPT SPT Dir Net Cmd Exists Cnflt Req State ACV[ msg. from ] Cmd SRP[ msg. from ] (All Snp Rsp Rcvd & All CV[i]=0) Except msg.from Data Fwded State ACV[ msg.from] State PV[ req. nid] Home Msg Send to Requester WbIData, WbIDataPtl FALSE SentSnp 0 !=WbMto* WaitRspFwd Write Remove 1 2 1 1 WaitWbData 1 !=WbMto* 1 WaitRspFwd 0 YES ReadyToRespond 0 NO WaitRspFwd 0 0 RdCode YES NO a None S 1 DataC_S_Cmp YES 1 Cmp NO WaitRspFwd RdData YES NO None M 1 DataC_E_Cmp YES None 1 Cmp NO WaitRspFwd RdCur YES NO None DataC_I_Cmp YES Cmp NO WaitRspFwd RdInvOwn YES NO None M 1 DataC_E_Cmp NO WaitRspFwd InvItoE YES NO None M 1 GntE_Cmp NO WaitRspFwd Table G-16. Action HomeRecvImplicitWbData (Continued) WbSData FALSE SentSnp 0 !=WbMto* WaitRspFwd 0 Write Remove WaitWbData RdCode YES NO None S 1 DataC_S_Cmp NO WaitRspFwd RdData YES NO None S 1 DataC_S_Cmp YES YES None S 1 Cmp NO WaitRspFwd RdCur YES NO None DataC_I_Cmp NO WaitRspFwd a. Assertion: for WaitWbData.WbIData and SPT.RdCode, Data Fwded must be NO. Similar to the previous tables, Table G-16 gives the home actions when implicit write back data is received by home.
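The ‘Write’ action in Table G-16 (CopyDataViaMask in the semantic mapping) merges the incoming writeback into memory byte by byte under the message mask, which is what lets a partial writeback (WbIDataPtl) coexist with the current memory image. A small C sketch under assumed types follows; the line size and struct layout are illustrative only.

#include <stdbool.h>

#define LINE_BYTES 64             /* assumed cache-line size */

typedef struct {
    unsigned char data[LINE_BYTES];
    bool          mask[LINE_BYTES];   /* valid-byte mask of the WbData msg */
} wb_data_msg_t;

/* Only bytes whose mask bit is set overwrite memory; for a full
   writeback the mask is all TRUE and the whole line is replaced. */
static void merge_wb_data(unsigned char mem[LINE_BYTES],
                          const wb_data_msg_t *msg)
{
    for (int w = 0; w < LINE_BYTES; w++)
        if (msg->mask[w])
            mem[w] = msg->data[w];
}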
Semantic Mapping for ‘HomeRecvWbData’ PARAMETERS ReqNid : NID ReqTag : TAG SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].WbDMsg Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |WbIData| => PendWbD(Sta, ReqNid, ReqTag) & Msg.Cmd = WBD_WbIData|WbSData| => PendWbD(Sta, ReqNid, ReqTag) & Msg.Cmd = WBD_WbSData |WbIData, WbIDataPtl| => PendWbD(Sta, ReqNid, ReqTag) & (Msg.Cmd = WBD_WbIData | Msg.Cmd = WBD_PtlWbData) COLUMN |Current State|SPT|State| |SentSnp| => Spt.State = SPT_SentSnp |WaitWbData| => Spt.State = SPT_WaitWbData COLUMN |Current State|SPT|Cmd| |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |RdCur| => Spt.Cmd = EXT_RdCur |RdCode| => Spt.Cmd = EXT_RdCode |RdData| => Spt.Cmd = EXT_RdData |InvItoE| => Spt.Cmd = EXT_InvItoE |!=WbMto*| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur |Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) COLUMN |Current State|SPT|Exists Cnflt Req| |FALSE| => SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) = TRUE |TRUE| => SPTHasCnfltReqAt(Sta,Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) = TRUE COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, Msg.From) & AllCVExcept(Sta, Spt, Msg.From)) = TRUE|NO| => (AllSnpRspRcvdExcept(Sta, Spt, Msg.From) & AllCVExcept(Sta, Spt, Msg.From)) = FALSE COLUMN |Current State|SPT|Data Fwded| |NO| => Spt.DataFwded = FALSE |YES| => Spt.DataFwded = TRUE COLUMN |Current State|SPT|CV[req.nid]| |1| => Spt.CV[ReqNid] = TRUE |0| => Spt.CV[ReqNid] = FALSE COLUMN |Current State|SPT|SRP[msg.from]| |0| => Spt.SRP[Msg.From] = 0 |1| => Spt.SRP[Msg.From] = 1 650 Ref No xxxxx Intel Restricted Secret |2| => Spt.SRP[Msg.From] = 2 COLUMN |Current State|SPT|ACV[msg.from]| |0| => Spt.ACV[Msg.From] = FALSE |1| => Spt.ACV[Msg.From] = TRUE COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |SentSnp| => NxtSpt.State := SPT_SentSnp |WaitRspFwd| => NxtSpt.State := SPT_WaitRspFwd |Valid| => NxtSpt.State := SPT_Valid |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|ACV[msg.from]| |0| => NxtSpt.ACV[Msg.From] := FALSE COLUMN |Next State|SPT|SRP[msg.from]| |0| => NxtSpt.SRP[Msg.From] := 0 COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S |I| => NxtDir.DirSta := DIR_I COLUMN |Next State|Dir|PV[msg.from]| |0| => NxtDir.PV[Msg.From] := FALSE |1| => NxtDir.PV[Msg.From] := TRUE COLUMN |Next State|Net|Msg| |Remove| => RecvWbDMsg(NxtSta, Sta, ReqNid, ReqTag) COLUMN |Next State|Home| |Write| => CopyDataViaMask(NxtSta.Mem[Msg.Addr].Data, Msg.Data, Msg.Mask) COLUMN |Next State|Net|Send to Requester| |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_S, NxtSta.Mem[Msg.Addr].Data) |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_E, NxtSta.Mem[Msg.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_I, NxtSta.Mem[Msg.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn, Spt.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) Ref No xxxxx 651 Intel Restricted Secret Current State Next State Msg SPT Dir SPT Dir Net Cmd Cmd State SRP[ msg.from] ACV[ msg. from] (All Snp Rsp Rcvd & All CV[i]=0) Exceptmsg.from NRV[ msg. 
from] State State SRP[msg.from] CV[msg.from] ACV[ msg. from] State PV[msg.from] NRV[ msg. from] Msg RspCnflt RdCode, RdData, RdCur a SentSnp, SentPOSnp 2 1 -1 Remove 1 1 YES ReadyTo Respond 0 1 NO 0 0 1 NO 1 M, S S I YES 1 M, S ReadyTo Respond S I WaitRspFwd 2 1 -1 1 1 YES ReadyTo Respond 0 NO WaitWbData 1 1 Table G-17. Action HomeRecvRspCnfltNoCDM (Continued) Current State Next State Msg SPT Dir SPT Dir Net Cmd Cmd State SRP[ msg.from] ACV[ msg. from] (All Snp RspRcvd & All CV[i]=0) Except msg.from NRV[ msg. from] State State SRP[msg.from] CV[msg.from] ACV[ msg. from] State PV[msg.from] NRV[ msg. from] Msg RspCnflt RdInvOwn, InvItoE SentSnp 2 1 -1 Remove 1 YES ReadyTo Respond 0 NO 0 0 0 1 YES 1 ReadyTo Respond I if all PV[i] = 0 Except msg.from 0 0 NO 1 WaitRspFwd 2 1 1 YES ReadyTo Respond 0 NO 0 WaitWbData 1 a. The current state SentPOSnp does not apply to RdCur. Table G-17 and Table G-18 are for home actions when it receives a RspCnflt response. In coarse directory mode the action depends on whether home is still waiting for more than one response, i.e. on whether ‘Num SRP’ > 1 or ‘Num SRP’ = 1. Semantic Mapping for ‘HomeRecvRspCnfltNoCDM’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Dir : Sta.Dir[Msg.Addr] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |RspCnflt| => PendCnfltRsp(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & !Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspCnflt COLUMN |Current State|SPT|Cmd| |RdCur| => Spt.Cmd = EXT_RdCur |RdCode, RdData| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData) |RdCode, RdData, RdCur| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur) |RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) |RdCode, RdData, RdCur, RdInvOwn, InvItoE| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData | Spt.Cmd = EXT_RdCur | Spt.Cmd = EXT_RdInvOwn | Spt.Cmd = EXT_InvItoE) COLUMN |Current State|SPT|State| |SentSnp| => Spt.State = SPT_SentSnp |SentPOSnp| => Spt.State = SPT_SentPOSnp |SentSnp, SentPOSnp| => (Spt.State = SPT_SentSnp | Spt.State = SPT_SentPOSnp) |WaitWbData| => Spt.State = SPT_WaitWbData |WaitRspFwd| => Spt.State = SPT_WaitRspFwd COLUMN |Current State|SPT|SRP[msg.from]| |0| => Spt.SRP[PeerNid] = 0 |1| => Spt.SRP[PeerNid] = 1 |2| => Spt.SRP[PeerNid] = 2 COLUMN |Current State|SPT|ACV[msg.from]| |0| => Spt.ACV[PeerNid] = FALSE |1| => Spt.ACV[PeerNid] = TRUE COLUMN |Current State|SPT|(All Snp Rsp Rcvd & All CV[i]=0) Except msg.from| |YES| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = TRUE |NO| => (AllSnpRspRcvdExcept(Sta, Spt, PeerNid) & AllCVExcept(Sta, Spt, PeerNid)) = FALSE COLUMN |Current State|Dir|NRV[msg.from]| |1| => Dir.NRV[PeerNid] = TRUE |0| => Dir.NRV[PeerNid] = FALSE COLUMN |Current State|Dir|State| |M| => Dir.DirSta = DIR_M |S| => Dir.DirSta = DIR_S |I| => Dir.DirSta = DIR_I |M, S| => (Dir.DirSta = DIR_M | Dir.DirSta = DIR_S) COLUMN |Next
State|SPT|State| |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|SRP[msg.from]| |-1| => NxtSpt.SRP[PeerNid] := Spt.SRP[PeerNid] - 1 COLUMN |Next State|SPT|CV[msg.from]| |0| => NxtSpt.CV[PeerNid] := FALSE |1| => NxtSpt.CV[PeerNid] := TRUE COLUMN |Next State|SPT|ACV[msg.from]| |0| => NxtSpt.ACV[PeerNid] := FALSE |1| => NxtSpt.ACV[PeerNid] := TRUE COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S |I| => NxtDir.DirSta := DIR_I |I if all PV[i] = 0 Except msg.from| => if forall i:NID do i=PeerNid | Sta.Dir[Msg.Addr].PV[i]=FALSE endforall then NxtSta.Dir[Msg.Addr].DirSta := DIR_I endif COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE COLUMN |Next State|Dir|PV[msg.from]| |0| => NxtDir.PV[PeerNid] := FALSE |1| => NxtDir.PV[PeerNid] := TRUE COLUMN |Next State|Dir|NRV[msg.from]| |0| => NxtDir.NRV[PeerNid] := FALSE COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) COLUMN |Next State|Net|Send to Requester| |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_E, Sta.Mem[Spt.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_I, Sta.Mem[Spt.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn, Spt.Addr) Ref No xxxxx 655 Intel Restricted Secret Table G-18. Action HomeRecvRspCnfltCDM Current State Next State Msg SPT SPT Net Cmd Cmd State Num_SRP State Num_SRP Msg RspCnflt RdInvOwn, InvItoE SentSnp > 1 -1 Remove 1 ReadyTo Respond Semantic Mapping for ‘HomeRecvRspCfltCDM’ PARAMETERS ReqNid : NID ReqTag : TAG PeerNid : NID SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].SnpMsg[PeerNid] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] Dir : Sta.Dir[Msg.Addr] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |RspCnflt| => PendCnfltRsp(Sta, ReqNid, ReqTag, PeerNid) & SPTHasTheReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) & Sta.Dir[Msg.Addr].CoarseMode & Msg.Cmd = SNP_RspCnflt COLUMN |Current State|SPT|State| |SentSnp| => Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].State = SPT_SentSnp COLUMN |Current State|SPT|Cmd| |RdInvOwn, InvItoE| => (Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].Cmd = EXT_RdInvOwn | Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].Cmd = EXT_InvItoE) COLUMN |Current State|SPT|Num_SRP| |> 1| => Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].NumSRP > 1 |1| => Sta.Spt[Sta.Home[Msg.Addr]][SptIndex].NumSRP = 1 COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |ReadyToRespond| => NxtSpt.State := SPT_ReadyToRespond COLUMN |Next State|SPT|Num_SRP| |-1| => NxtSpt.NumSRP := Spt.NumSRP - 1 COLUMN |Next State|Net|Msg| |Remove| => RecvSnp(NxtSta, Sta, ReqNid, ReqTag, PeerNid) 656 Ref No xxxxx Intel Restricted Secret Table G-19. Action HomeRecvAckCnflt Current State Next State Msg SPT Dir SPT Net Cmd Exists Req Exists Cnflt Req State Cmd State State CV[ msg. from] ACV[ msg. 
from] SRP Msg Send to msg.from AckCnflt FALSE Remove Cmp TRUE FALSE None TRUE SentSnp RdCode M 0 1 +1 Cmp_FwdCode RdData Cmp_FwdCode RdCur Cmp_FwdInvItoE RdInvOwn M, S Cmp_FwdInvOwn InvItoE Cmp_FwdInvItoE AckCnflt TRUE TRUE SentPOSnp RdCode, RdData S 0 1 +1 Cmp_FwdCode Semantic Mapping for HomeRecvAckCnflt PARAMETERS ReqNid : NID ReqTag : TAG SptIndex : SPTX ALIASES Msg : Sta.Net[ReqNid][ReqTag].HomeMsg Dir : Sta.Dir[Msg.Addr] Spt : Sta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtSpt : NxtSta.Spt[Sta.Home[Msg.Addr]][SptIndex] NxtDir : NxtSta.Dir[Msg.Addr] COLUMN |Current State|Msg|Cmd| |AckCnflt| => Msg.Cmd = HOME_AckCnflt COLUMN |Current State|SPT|Exists Req| |TRUE| => SPTHasReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr) = TRUE |FALSE| => SPTHasReqAny(Sta, Sta.Home[Msg.Addr], Msg.Addr) = FALSE COLUMN |Current State|SPT|Exists Cnflt Req| |TRUE| => SPTHasCnfltReqAt(Sta, Sta.Home[Msg.Addr], SptIndex, Msg.Addr, ReqNid, ReqTag) = TRUE |FALSE| => SPTHasCnfltReq(Sta, Sta.Home[Msg.Addr], Msg.Addr, ReqNid, ReqTag) = FALSE COLUMN |Current State|SPT|State| |SentSnp, SentPOSnp| => (Spt.State = SPT_SentSnp | Spt.State = SPT_SentPOSnp) |SentSnp| => Spt.State = SPT_SentSnp |SentPOSnp| => Spt.State = SPT_SentPOSnp COLUMN |Current State|SPT|Cmd| |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |RdCur| => Spt.Cmd = EXT_RdCur |RdCode| => Spt.Cmd = EXT_RdCode |RdData| => Spt.Cmd = EXT_RdData |RdCode, RdData| => (Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData) |InvItoE| => Spt.Cmd = EXT_InvItoE COLUMN |Current State|Dir|State| |M| => Dir.DirSta = DIR_M |M, S| => (Dir.DirSta = DIR_M | Dir.DirSta = DIR_S) |S| => Dir.DirSta = DIR_S COLUMN |Current State|SPT|CV[msg.from]| |1| => Spt.CV[ReqNid] = TRUE |0| => Spt.CV[ReqNid] = FALSE COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, Sta.Home[Msg.Addr], SptIndex) |SentSnp| => NxtSpt.State := SPT_SentSnp COLUMN |Next State|SPT|SRP| |+1| => if Dir.CoarseMode then NxtSpt.NumSRP := Spt.NumSRP + 1 else NxtSpt.SRP[ReqNid] := Spt.SRP[ReqNid] + 1 COLUMN |Next State|SPT|CV[msg.from]| |0| => NxtSpt.CV[ReqNid] := FALSE COLUMN |Next State|SPT|ACV[msg.from]| |1| => NxtSpt.ACV[ReqNid] := TRUE COLUMN |Next State|Net|Msg| |Remove| => RecvHomeMsg(NxtSta, Sta, ReqNid, ReqTag) COLUMN |Next State|Net|Send to msg.from| |Cmp_FwdInvItoE| => SendHomeFwdMsg(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, HOME_Cmp_FwdInvItoE, Spt.Txn) |Cmp_FwdCode| => SendHomeFwdMsg(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, HOME_Cmp_FwdCode, Spt.Txn) |Cmp_FwdInvOwn| => SendHomeFwdMsg(NxtSta, Sta, Msg.Addr, ReqNid, ReqTag, HOME_Cmp_FwdInvOwn, Spt.Txn) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, ReqNid, ReqTag, HOME_Cmp) Table G-20. Action HomeSPTReadyToRespondNoCDM Current State Next State SPT Dir SPT Dir Net State Pointer Overflow Cmd Data Fwded State State State PV[req. nid] Coarse Dir Mode Send to Requester ReadyTo Respond NO RdCode NO None S 1 DataC_S_Cmp YES Cmp RdData NO M, I M 1 DataC_E_Cmp S DataC_S_Cmp YES M, I M Cmp S Cmp RdCur NO None DataC_I_Cmp YES Cmp RdInvOwn NO M 1 DataC_E_Cmp YES Cmp InvItoE M 1 GntE_Cmp YES RdCode, RdData SentFAC S 1 TRUE DataC_S_FAC Table G-20: Once all snoop responses are received, home either sends a Data*_Cmp or, if the data was forwarded by a snooped node, just a Cmp to the requester. The exception is InvItoE, in which case home sends GntE_Cmp.
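A hedged C sketch of that completion choice follows. It compresses Table G-20 considerably (for example, the S-versus-E choice for read requests, which in the table depends on directory state, is folded into a single flag here), and all names are illustrative assumptions.

#include <stdbool.h>

typedef enum { REQ_RDCODE, REQ_RDDATA, REQ_RDCUR,
               REQ_RDINVOWN, REQ_INVITOE } req_cmd_t;
typedef enum { MSG_DATAC_S_CMP, MSG_DATAC_E_CMP, MSG_DATAC_I_CMP,
               MSG_GNTE_CMP, MSG_CMP } home_rsp_t;

/* data_fwded: a snooped node already forwarded the data, so home only
   completes. share_exists: another sharer remains, forcing S instead
   of E for RdData. */
static home_rsp_t ready_to_respond(req_cmd_t cmd, bool data_fwded,
                                   bool share_exists)
{
    if (cmd == REQ_INVITOE)
        return MSG_GNTE_CMP;      /* ownership grant, no data from home */
    if (data_fwded)
        return MSG_CMP;           /* data already supplied by a peer */
    switch (cmd) {
    case REQ_RDCODE:   return MSG_DATAC_S_CMP;
    case REQ_RDCUR:    return MSG_DATAC_I_CMP;
    case REQ_RDINVOWN: return MSG_DATAC_E_CMP;
    default:           return share_exists ? MSG_DATAC_S_CMP
                                           : MSG_DATAC_E_CMP;
    }
}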
Table G-21 is when home is in coarse directory mode. Semantic Mapping for ‘HomeSPTReadyToRespondNoCDM’ PARAMETERS SptHid : HID SptIndex : SPTX ALIASES Spt : Sta.Spt[SptHid][SptIndex] ReqNid : Spt.Txn.Nid ReqTag : Spt.Txn.Tag Dir : Sta.Dir[Spt.Addr] NxtDir : NxtSta.Dir[Spt.Addr] NxtSpt : NxtSta.Spt[SptHid][SptIndex] COLUMN |Current State|SPT|State| |ReadyToRespond| => Spt.State = SPT_ReadyToRespond & !Sta.Dir[Spt.Addr].CoarseMode & Spt.State = SPT_ReadyToRespond COLUMN |Current State|SPT|Pointer Overflow| |YES| => MaxSharers(Sta, Spt.Addr, ReqNid) = TRUE |NO| => MaxSharers(Sta, Spt.Addr, ReqNid) = FALSE COLUMN |Current State|SPT|Cmd| |RdCode| => Spt.Cmd = EXT_RdCode Ref No xxxxx 659 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence |RdData| => Spt.Cmd = EXT_RdData |RdCur| => Spt.Cmd = EXT_RdCur |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |InvItoE| => Spt.Cmd = EXT_InvItoE |RdCode, RdData| => Spt.Cmd = EXT_RdCode | Spt.Cmd = EXT_RdData COLUMN |Current State|SPT|Data Fwded| |YES| => Spt.DataFwded = TRUE |NO| => Spt.DataFwded = FALSE COLUMN |Current State|Dir|State| |M| => Dir.DirSta = DIR_M |S| => Dir.DirSta = DIR_S |I| => Dir.DirSta = DIR_I |M, I| => (Dir.DirSta = DIR_M | Dir.DirSta = DIR_I) COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, SptHid, SptIndex) |SentSnp| => NxtSpt.State := SPT_SentSnp |SentFAC| => NxtSpt.State := SPT_SentFAC COLUMN |Next State|SPT|SRP[i] where ACV[i]=1| |+1| => for i:NID do if Spt.ACV[i] then NxtSpt.SRP[i] := Spt.SRP[i] + 1; endif endfor COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M |S| => NxtDir.DirSta := DIR_S COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE |0| => NxtDir.PV[ReqNid] := FALSE COLUMN |Next State|Dir|Coarse Dir Mode| |TRUE| => NxtDir.CoarseMode := TRUE; for i:NID do NxtDir.NRV[i]:=FALSE endfor COLUMN |Next State|Net|Send to Requester| |DataC_S_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_S_FAC| => SendDataFAC(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_S, Sta.Mem[Spt.Addr].Data) |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_E, Sta.Mem[Spt.Addr].Data) |DataC_I_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_I, Sta.Mem[Spt.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn, Spt.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) COLUMN |Next State|Net|Send to i where ACV[i]=1| |Cmp_FwdCode| => for i:NID do if Spt.ACV[i] then SendHomeFwdMsg(NxtSta, Sta, Spt.Addr, i, ReqTag, HOME_Cmp_FwdCode, Spt.Txn); NxtSpt.ACV[i] := FALSE; endif endfor; |Cmp_FwdInvItoE| => for i:NID do if Spt.ACV[i] then SendHomeFwdMsg(NxtSta, Sta, Spt.Addr, i, ReqTag, HOME_Cmp_FwdInvItoE, Spt.Txn); NxtSpt.ACV[i] := FALSE; endif endfor; 660 Ref No xxxxx Intel Restricted Secret Table G-21. Action HomeSPTReadyToRespondCDM Current State Next State SPT SPT Dir Net State Cmd State State Coarse Dir Mode All PV except req.nid PV[req. 
nid] Send to Requester ReadyToRespond RdInvOwn None M FALSE 0 1 DataC_E_Cmp InvItoE GntE_Cmp Semantic Mapping for ‘HomeSPTReadyToRespondCDM’ PARAMETERS SptHid : HID SptIndex : SPTX ALIASES Spt : Sta.Spt[SptHid][SptIndex] ReqNid : Spt.Txn.Nid ReqTag : Spt.Txn.Tag Dir : Sta.Dir[Spt.Addr] NxtDir : NxtSta.Dir[Spt.Addr] NxtSpt : NxtSta.Spt[SptHid][SptIndex] COLUMN |Current State|SPT|State| |ReadyToRespond| => Spt.State = SPT_ReadyToRespond & Sta.Dir[Spt.Addr].CoarseMode COLUMN |Current State|SPT|Cmd| |RdInvOwn| => Spt.Cmd = EXT_RdInvOwn |InvItoE| => Spt.Cmd = EXT_InvItoE COLUMN |Next State|SPT|State| |None| => ClearSPT(NxtSta, Sta, SptHid, SptIndex) COLUMN |Next State|Dir|State| |M| => NxtDir.DirSta := DIR_M COLUMN |Next State|Dir|PV[req.nid]| |1| => NxtDir.PV[ReqNid] := TRUE |0| => NxtDir.PV[ReqNid] := FALSE COLUMN |Next State|Dir|All PV except req.nid| |0| => for i:NID do if i != ReqNid then NxtDir.PV[i] := FALSE endif endfor COLUMN |Next State|Dir|Coarse Dir Mode| |FALSE| => NxtDir.CoarseMode := FALSE COLUMN |Next State|Net|Send to Requester| |DataC_E_Cmp| => SendDataCmp(NxtSta, Sta, Spt.Txn, Spt.Addr, DATA_DataC_E, Sta.Mem[Spt.Addr].Data) |GntE_Cmp| => SendGntECmp(NxtSta, Sta, Spt.Txn, Spt.Addr) |Cmp| => SendHomeMsg(NxtSta, Sta, Spt.Addr, Spt.Txn.Nid, Spt.Txn.Tag, HOME_Cmp) G.8 Utility Sub-Routines The utility subroutines used by the semantic mappings are listed below. Please read the interspersed comments for their meanings. Note that many assertions are inserted to detect errors. For instance, whenever a message is sent (respectively, received), an assertion checks whether the message already exists (resp., the message does not exist) in the network and, if so, raises an error flag. Also note that, whenever the State or Cmd field of a record is set to None, its other fields are “cleared” by being assigned don’t-care values. -- Subroutines for handling data and masks. -- Check if mask M is full. function MaskFull(M : MASK) : BOOLEAN; return (forall w : WIDX do M[w] = TRUE endforall); endfunction; -- Check if mask M is empty. function MaskEmpty(M : MASK) : BOOLEAN; return (forall w : WIDX do M[w] = FALSE endforall); endfunction; -- Check if mask M is partial. function MaskPartial(M : MASK) : BOOLEAN; return (exists w : WIDX do M[w] = TRUE endexists); endfunction; -- Set mask to full. procedure SetMaskFull(var M : MASK); for w : WIDX do M[w] := TRUE; endfor; endprocedure; -- Set mask to empty. procedure SetMaskEmpty(var M : MASK); for w : WIDX do M[w] := FALSE; endfor; endprocedure; -- Undefine mask. procedure UndefineMask(var M : MASK); for w : WIDX do undefine M[w]; endfor; endprocedure; -- Undefine data. procedure UndefineData(var D : DATA); for w : WIDX do undefine D[w]; endfor; endprocedure; -- Copy mask. procedure CopyMask(var M : MASK; M1 : MASK); for w : WIDX do M[w] := M1[w]; endfor; endprocedure; -- Copy data. procedure CopyData(var D : DATA; D1 : DATA); for w : WIDX do D[w] := D1[w]; endfor; endprocedure; -- Copy data via a mask. procedure CopyDataViaMask(var D : DATA; D1 : DATA; M1 : MASK); for w : WIDX do if (M1[w] = TRUE) then D[w] := D1[w]; endif; endfor; endprocedure; -- Subroutines for handling caches. -- Clear the cache at node n for address a.
procedure CchClear(var NxtSta : STATE; Sta : STATE; n : NID; a : ADDR; s : CCH_STATE); NxtSta.Cch[n][a].State := s; SetMaskEmpty(NxtSta.Cch[n][a].Mask); UndefineData(NxtSta.Cch[n][a].Data); endprocedure; -- Subroutines for handling ORBs. -- Clear the i-th entry of the ORB at node n. procedure OrbClear(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG); assert (Sta.Orb[n][i].State != ORB_None) "OrbClear : Aleady cleared"; NxtSta.Orb[n][i].State := ORB_None; undefine NxtSta.Orb[n][i].Cmd; undefine NxtSta.Orb[n][i].Addr; undefine NxtSta.Orb[n][i].Cnflt; endprocedure; -- Check if the i-th entry of the ORB at node n is valid with address a. function OrbHit(Sta : STATE; a : ADDR; n : NID; i : TAG) : BOOLEAN; return (Sta.Orb[n][i].State != ORB_None & Sta.Orb[n][i].Addr = a); endfunction; Ref No xxxxx 663 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence -- Check if the ORB at node n contains no valid entry of address a. function OrbMiss(Sta : STATE; a : ADDR; n : NID) : BOOLEAN; return (forall i : TAG do Sta.Orb[n][i].State = ORB_None | Sta.Orb[n][i].Addr != a endforall); endfunction; -- Check if the ORB at node n has its i-th entry empty and -- there is no valid entry of address a in the ORB. function OrbAvail(Sta : STATE; a : ADDR; n : NID; i : TAG) : BOOLEAN; return (Sta.Orb[n][i].State = ORB_None & OrbMiss(Sta, a, n)); endfunction; -- Check if there is a snoop request for (n,i) pending at node p. function PendSnp(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return (Sta.Net[n][i].SnpMsg[p].Cmd = SNP_SnpCode | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_SnpData | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_SnpInvOwn | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_SnpInvItoE | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_SnpCur); endfunction; -- Check if there is a conflict snoop response for (n,i) from node p. function PendCnfltRsp(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return (Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspCnflt); endfunction; -- Check if there is a Data*/Gnt* response for (n,i) pending. function PendDataRsp(Sta : STATE; a: ADDR; n : NID; i : TAG) : BOOLEAN; return ( Sta.Orb[n][i].Addr = a & ( Sta.Net[n][i].DataMsg.Cmd = DATA_DataC_M | Sta.Net[n][i].DataMsg.Cmd = DATA_DataC_E | Sta.Net[n][i].DataMsg.Cmd = DATA_DataC_S | Sta.Net[n][i].DataMsg.Cmd = DATA_DataC_I | Sta.Net[n][i].DataMsg.Cmd = DATA_GntE ) ); endfunction; -- Check if there is a Cmp or FrcAckCnflt response for (n,i) pending. function PendCmp(Sta : STATE; n : NID; i : TAG) : BOOLEAN; return (Sta.Net[n][i].HomeMsg.Cmd = HOME_Cmp | Sta.Net[n][i].HomeMsg.Cmd = HOME_FrcAckCnflt); endfunction; -- Check if there is a Cmp or FrcAckCnflt response for (n,i) pending. function PendFwd(Sta : STATE; n : NID; i : TAG) : BOOLEAN; return (Sta.Net[n][i].HomeMsg.Cmd = HOME_Cmp_FwdCode | 664 Ref No xxxxx Intel Restricted Secret Sta.Net[n][i].HomeMsg.Cmd = HOME_Cmp_FwdInvOwn | Sta.Net[n][i].HomeMsg.Cmd = HOME_Cmp_FwdInvItoE); endfunction; -- Check if there is a Fwd response (not WB) pending. function PendFwdNoWb(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return ( Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspFwdI | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspFwdS | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspFwd | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspI | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspS ); endfunction; -- Check if there is a Fwd Wb Response pending. 
function PendFwdWb(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return ( Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspFwdIWb | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspFwdSWb | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspIWb | Sta.Net[n][i].SnpMsg[p].Cmd = SNP_RspSWb ); endfunction; function PendFwdNoWb2(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return ( Sta.Net[n][i].SnpMsg2[p].Cmd = SNP_RspFwdI | Sta.Net[n][i].SnpMsg2[p].Cmd = SNP_RspI ); endfunction; function PendFwdWb2(Sta : STATE; n : NID; i : TAG; p : NID) : BOOLEAN; return ( Sta.Net[n][i].SnpMsg2[p].Cmd = SNP_RspIWb ); endfunction; -- Check if there is a WbData for (n,i) pending. function PendWbD(Sta : STATE; n : NID; i : TAG) : BOOLEAN; return Sta.Net[n][i].WbDMsg.Cmd != WBD_None; endfunction; function ReqType(c: HOME_CMD) : EXT_CMD; if c = HOME_RdCode then return EXT_RdCode; elsif c = HOME_RdData then return EXT_RdData; elsif c = HOME_RdInvOwn then return EXT_RdInvOwn; elsif c = HOME_RdCur then return EXT_RdCur; elsif c = HOME_InvItoE then return EXT_InvItoE; else assert false "ReqType Mismatch"; endif; endfunction; function WbReqType(c: WBD_CMD) : EXT_CMD; if c = WBD_WbIData then return EXT_WbMtoI; elsif c = WBD_WbSData then return EXT_WbMtoS; elsif c = WBD_WbEData then return EXT_WbMtoE; elsif c = WBD_PtlWbData then return EXT_WbMtoI; else assert false "WbReqType Mismatch"; endif; endfunction; -- Receive a snoop from (n,i) pending at node p. procedure RecvSnp(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG; p : NID); assert (Sta.Net[n][i].SnpMsg[p].Cmd != SNP_None) "RecvSnp : No msg to receive"; NxtSta.Net[n][i].SnpMsg[p].Cmd := SNP_None; undefine NxtSta.Net[n][i].SnpMsg[p].Addr; endprocedure; -- Buffer a snoop from (n,i) pending at node p. procedure BuffSnp(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG; p : NID); assert (Sta.Net[n][i].SnpMsg[p].Cmd != SNP_None) "BuffTxnSnp : No msg to buffer"; endprocedure; -- Receive a Data*/Gnt* response to (n,i). procedure RecvDataRsp(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG); assert (Sta.Net[n][i].DataMsg.Cmd != DATA_None) "RecvTxnData : No msg to receive"; NxtSta.Net[n][i].DataMsg.Cmd := DATA_None; UndefineData(NxtSta.Net[n][i].DataMsg.Data); undefine NxtSta.Net[n][i].DataMsg.Addr; endprocedure; -- Receive a WbData for (n,i). procedure RecvWbDMsg(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG); assert (Sta.Net[n][i].WbDMsg.Cmd != WBD_None) "RecvWbD : No msg to receive"; NxtSta.Net[n][i].WbDMsg.Cmd := WBD_None; undefine NxtSta.Net[n][i].WbDMsg.Addr; undefine NxtSta.Net[n][i].WbDMsg.From; UndefineMask(NxtSta.Net[n][i].WbDMsg.Mask); UndefineData(NxtSta.Net[n][i].WbDMsg.Data); endprocedure; -- Receive a message for (n,i) from the home channel from node p to -- the home node of address a.
procedure RecvHomeMsg(var NxtSta : STATE; Sta : STATE; n : NID; i : TAG); assert (Sta.Net[n][i].HomeMsg.Cmd != HOME_None ) "RecvHomeMsg : No msg to receive"; NxtSta.Net[n][i].HomeMsg.Cmd := HOME_None; 666 Ref No xxxxx Intel Restricted Secret undefine NxtSta.Net[n][i].HomeMsg.Addr; undefine NxtSta.Net[n][i].HomeMsg.FwdTo; endprocedure; procedure SendHomeMsg(var NxtSta : STATE; Sta : STATE; a : ADDR; n : NID; i : TAG; c : HOME_CMD); assert (NxtSta.Net[n][i].HomeMsg.Cmd = HOME_None) "SendHomeMsg : Msg already exists"; NxtSta.Net[n][i].HomeMsg.Cmd := c; NxtSta.Net[n][i].HomeMsg.Addr := a; endprocedure; procedure SendHomeFwdMsg(var NxtSta : STATE; Sta : STATE; a : ADDR; n : NID; i : TAG; c : HOME_CMD; txn : TXN); assert (NxtSta.Net[n][i].HomeMsg.Cmd = HOME_None) "SendHomeFwdMsg : Msg already exists"; NxtSta.Net[n][i].HomeMsg.Cmd := c; NxtSta.Net[n][i].HomeMsg.Addr := a; NxtSta.Net[n][i].HomeMsg.FwdTo.Nid := txn.Nid; NxtSta.Net[n][i].HomeMsg.FwdTo.Tag := txn.Tag; endprocedure; -- Send a WbData for (n,i) with address a and data d. procedure SendWbD(var NxtSta : STATE; Sta : STATE; a : ADDR; f: NID; n : NID; i : TAG; c : WBD_CMD; m : MASK; d : DATA); assert (NxtSta.Net[n][i].WbDMsg.Cmd = WBD_None) "SendWbD : Msg already exists"; NxtSta.Net[n][i].WbDMsg.Cmd := c; NxtSta.Net[n][i].WbDMsg.Addr := a; NxtSta.Net[n][i].WbDMsg.Mask := m; NxtSta.Net[n][i].WbDMsg.Data := d; NxtSta.Net[n][i].WbDMsg.From := f; endprocedure; procedure SendSnpReq(var NxtSta : STATE; Sta : STATE; h : HID; s : SPTX; n : NID; t : TAG; a : ADDR; c : SNP_CMD); for p : NID doif p != n & Sta.Dir[a].PV[p] then assert (NxtSta.Net[n][t].SnpMsg[p].Cmd = SNP_None) "SendSnpReq: Net[n][i].SnpMsg is occupied"; NxtSta.Net[n][t].SnpMsg[p].Cmd := c; NxtSta.Net[n][t].SnpMsg[p].Addr := a; if Sta.Dir[a].CoarseMode then NxtSta.Spt[h][s].NumSRP := NxtSta.Spt[h][s].NumSRP + 1; else NxtSta.Spt[h][s].SRP[p] := Sta.Spt[h][s].SRP[p] + 1; endif; endif; endfor; endprocedure; procedure SendSnpRsp(var NxtSta : STATE; Sta : STATE; p : NID; a : ADDR; n : NID; i : TAG; c : SNP_CMD); NxtSta.Net[n][i].SnpMsg[p].Cmd := c; Ref No xxxxx 667 Intel Restricted Secret An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence An Implementation Agnostic Model for CSI 3-Hop Home Broadcast Coherence NxtSta.Net[n][i].SnpMsg[p].Addr := a; endprocedure; procedure SendSnpRspOwn(var NxtSta : STATE; Sta : STATE; onid : NID; t : TXN; a : ADDR; c : SNP_CMD); assert (NxtSta.Net[t.Nid][t.Tag].SnpMsg2[onid].Cmd = SNP_None) "SendSnpRspOwn: Msg already exists"; NxtSta.Net[t.Nid][t.Tag].SnpMsg2[onid].Cmd := c; NxtSta.Net[t.Nid][t.Tag].SnpMsg2[onid].Addr := a; endprocedure; -- Send a Data* Reply to Requester Procedure SendData(var NxtSta : STATE; Sta : STATE; addr : ADDR; n : NID; i : TAG; cmd : DATA_CMD; data : DATA); assert (NxtSta.Net[n][i].DataMsg.Cmd = DATA_None) "SendData: Net[n][i].DataMsg is occupied"; NxtSta.Net[n][i].DataMsg.Cmd := cmd; NxtSta.Net[n][i].DataMsg.Addr := addr; NxtSta.Net[n][i].DataMsg.Data := data; endprocedure; -- Send a Data* and Cmp Reply to RequesterProcedure SendDataCmp(var NxtSta : STATE; Sta : STATE; n : NID; t : TAG; addr : ADDR; cmd : DATA_CMD; data : DATA); assert (NxtSta.Net[n][t].DataMsg.Cmd = DATA_None) "SendDataCmp: Net[n][i].DataMsg is occupied"; NxtSta.Net[n][t].DataMsg.Cmd := cmd; NxtSta.Net[n][t].DataMsg.Addr := addr; NxtSta.Net[n][t].DataMsg.Data := data; assert (NxtSta.Net[n][t].HomeMsg.Cmd = HOME_None) "SendDataCmp: Net[n][i].HomeMsg is occupied"; NxtSta.Net[n][t].HomeMsg.Cmd := HOME_Cmp; NxtSta.Net[n][t].HomeMsg.Addr := addr; 
endprocedure; -- Send a Data* and FrcAckCnflt Reply to Requester Procedure SendDataFAC(var NxtSta : STATE; Sta : STATE; n : NID; t : TAG; addr : ADDR; cmd : DATA_CMD; data : DATA); assert (NxtSta.Net[n][t].DataMsg.Cmd = DATA_None) "SendDataFAC: Net[n][i].DataMsg is occupied"; NxtSta.Net[n][t].DataMsg.Cmd := cmd; NxtSta.Net[n][t].DataMsg.Addr := addr; NxtSta.Net[n][t].DataMsg.Data := data; assert (NxtSta.Net[n][t].HomeMsg.Cmd = HOME_None) "SendDataFAC: Net[n][i].HomeMsg is occupied"; NxtSta.Net[n][t].HomeMsg.Cmd := HOME_FrcAckCnflt; NxtSta.Net[n][t].HomeMsg.Addr := addr; endprocedure; -- Send GntE and Cmp Reply to Requester Procedure SendGntECmp(var NxtSta : STATE; Sta : STATE; n : NID; t : TAG; addr : ADDR); assert (NxtSta.Net[n][t].DataMsg.Cmd = DATA_None) "SendGntECmp: Net[n][i].DataMsg is occupied"; NxtSta.Net[n][t].DataMsg.Cmd := DATA_GntE; NxtSta.Net[n][t].DataMsg.Addr := addr; undefine NxtSta.Net[n][t].DataMsg.Data; assert (NxtSta.Net[n][t].HomeMsg.Cmd = HOME_None) "SendGntECmp: Net[n][i].HomeMsg is occupied"; NxtSta.Net[n][t].HomeMsg.Cmd := HOME_Cmp; NxtSta.Net[n][t].HomeMsg.Addr := addr; endprocedure; -- Send GntE and FrcAckCnflt Reply to Requester Procedure SendGntEFAC(var NxtSta : STATE; Sta : STATE; t : TXN; addr : ADDR); assert (NxtSta.Net[t.Nid][t.Tag].DataMsg.Cmd = DATA_None) "SendGntEFAC: Net[n][i].DataMsg is occupied"; NxtSta.Net[t.Nid][t.Tag].DataMsg.Cmd := DATA_GntE; NxtSta.Net[t.Nid][t.Tag].DataMsg.Addr := addr; undefine NxtSta.Net[t.Nid][t.Tag].DataMsg.Data; assert (NxtSta.Net[t.Nid][t.Tag].HomeMsg.Cmd = HOME_None) "SendGntEFAC: Net[n][i].HomeMsg is occupied"; NxtSta.Net[t.Nid][t.Tag].HomeMsg.Cmd := HOME_FrcAckCnflt; NxtSta.Net[t.Nid][t.Tag].HomeMsg.Addr := addr; endprocedure; -- Check if there is a conflict request to the address a being -- processed in SPT function SPTHasReqAny(Sta : STATE; h : HID; a : ADDR): BOOLEAN; assert ( h = Sta.Home[a] ) "SPTHasReq: home for the address"; return ( exists s: SPTX do Sta.Spt[h][s].State != SPT_None & Sta.Spt[h][s].Addr = a endexists ); endfunction; function SPTHasReqAt(Sta : STATE; h : HID; s : SPTX; a : ADDR): BOOLEAN; return ( Sta.Spt[h][s].State != SPT_None & Sta.Spt[h][s].Addr = a ); endfunction; function SPTHasTheReqAt(Sta : STATE; h : HID; s : SPTX; a : ADDR; n : NID; i : TAG) : BOOLEAN; return ( Sta.Spt[h][s].State != SPT_None & Sta.Spt[h][s].Addr = a & Sta.Spt[h][s].Txn.Nid = n & Sta.Spt[h][s].Txn.Tag = i ); endfunction; function SPTHasCnfltReqAt(Sta : STATE; h : HID; s : SPTX; a : ADDR; n : NID; i : TAG) : BOOLEAN; return ( Sta.Spt[h][s].State != SPT_None & Sta.Spt[h][s].Addr = a & (Sta.Spt[h][s].Txn.Nid != n | Sta.Spt[h][s].Txn.Tag != i) ); endfunction; function SPTHasCnfltReq(Sta : STATE; h : HID; a : ADDR; n : NID; i : TAG): BOOLEAN; assert ( h = Sta.Home[a] ) "SPTHasReq: home for the address"; return ( exists s: SPTX do Sta.Spt[h][s].State != SPT_None & Sta.Spt[h][s].Addr = a & (Sta.Spt[h][s].Txn.Nid != n | Sta.Spt[h][s].Txn.Tag != i) endexists ); endfunction; function SPTFull(Sta : STATE; h : HID): BOOLEAN; return ( forall s: SPTX do Sta.Spt[h][s].State != SPT_None endforall ); endfunction; function AllSnpRspRcvdExcept(Sta : STATE; spt : SPT_ENTRY; p : NID) : BOOLEAN; return ( forall n : NID do n = p | spt.SRP[n]=0 endforall ); endfunction; function AllCVExcept(Sta : STATE; spt : SPT_ENTRY; p : NID) :
BOOLEAN; return ( forall n : NID do n = p | !spt.CV[n] endforall ); endfunction; function MaxSharers(Sta : STATE; addr : ADDR; n : NID) : BOOLEAN; var num : 0 .. NID_NUM; begin num := 0; for i : NID do if Sta.Dir[addr].PV[i] & i != n then num := num + 1; endif; endfor; if num >= MAX_SHARERS then return true; else return false; endif; endfunction; -- Enter a HomeMsg(n,i) into PRB procedure EnterPRB(var NxtSta : STATE; Sta : STATE; a: ADDR; c: EXT_CMD; n : NID; i : TAG); assert (Sta.Prb[Sta.Home[a]][n][i].Cmd = EXT_None) "EnterPRB: PRB(n,i) is occupied"; NxtSta.Prb[Sta.Home[a]][n][i].Cmd := c; NxtSta.Prb[Sta.Home[a]][n][i].Addr := a; endprocedure; procedure ExchangePRB(var NxtSta : STATE; Sta : STATE; s : SPTX; a: ADDR; c: EXT_CMD; n : NID; i : TAG); assert (Sta.Prb[Sta.Home[a]][n][i].Cmd = EXT_None) "ExchangePRB: PRB(n,i) is occupied"; assert (Sta.Prb[Sta.Home[a]][Sta.Spt[Sta.Home[a]][s].Txn.Nid][Sta.Spt[Sta.Home[a]][s].Txn.Tag].Cmd = EXT_None) "ExchangePRB: PRB(spt(n),spt(i)) is occupied"; assert (Sta.Spt[Sta.Home[a]][s].Addr = a) "ExchangePRB: Addr not same"; NxtSta.Prb[Sta.Home[a]][Sta.Spt[Sta.Home[a]][s].Txn.Nid][Sta.Spt[Sta.Home[a]][s].Txn.Tag].Cmd := Sta.Spt[Sta.Home[a]][s].Cmd; NxtSta.Prb[Sta.Home[a]][Sta.Spt[Sta.Home[a]][s].Txn.Nid][Sta.Spt[Sta.Home[a]][s].Txn.Tag].Addr := a; NxtSta.Spt[Sta.Home[a]][s].Txn.Nid := n; NxtSta.Spt[Sta.Home[a]][s].Txn.Tag := i; NxtSta.Spt[Sta.Home[a]][s].Cmd := c; NxtSta.Spt[Sta.Home[a]][s].Addr := a; NxtSta.Spt[Sta.Home[a]][s].DataFwded := FALSE; -- NxtSta.Spt[Sta.Home[a]][s].WbRcvd := Wb_None; for j : NID do NxtSta.Spt[Sta.Home[a]][s].CV[j] := FALSE; NxtSta.Spt[Sta.Home[a]][s].ACV[j] := FALSE; NxtSta.Spt[Sta.Home[a]][s].SRP[j] := 0; endfor; endprocedure; procedure ClearSPT(var NxtSta : STATE; Sta : STATE; h : HID; s : SPTX); -- assert (Sta.Spt[h][s].Cmd != EXT_None) "ClearSPT: SPT(n,s) is empty"; NxtSta.Spt[h][s].State := SPT_None; NxtSta.Spt[h][s].Cmd := EXT_None; undefine NxtSta.Spt[h][s].Addr; undefine NxtSta.Spt[h][s].Txn; NxtSta.Spt[h][s].DataFwded := FALSE; -- NxtSta.Spt[h][s].WbRcvd := Wb_None; for j : NID do NxtSta.Spt[h][s].CV[j] := FALSE; NxtSta.Spt[h][s].ACV[j] := FALSE; NxtSta.Spt[h][s].SRP[j] := 0; endfor; endprocedure; procedure MoveSPTintoPRB(var NxtSta, Sta : STATE; h : HID; s : SPTX); assert (Sta.Prb[h][Sta.Spt[h][s].Txn.Nid][Sta.Spt[h][s].Txn.Tag].Cmd = EXT_None) "MoveSPTintoPRB: PRB(spt(n),spt(i)) is occupied"; NxtSta.Prb[h][Sta.Spt[h][s].Txn.Nid][Sta.Spt[h][s].Txn.Tag].Cmd := Sta.Spt[h][s].Cmd; NxtSta.Prb[h][Sta.Spt[h][s].Txn.Nid][Sta.Spt[h][s].Txn.Tag].Addr:= Sta.Spt[h][s].Addr; ClearSPT(NxtSta, Sta, h, s); endprocedure; procedure ClearPRB(var NxtSta : STATE; Sta : STATE; h: HID; n : NID; i : TAG); assert (Sta.Prb[h][n][i].Cmd != EXT_None) "ClearPRB: PRB(n,i) is empty"; NxtSta.Prb[h][n][i].Cmd := EXT_None; undefine NxtSta.Prb[h][n][i].Addr; endprocedure; -- Enter a PRB entry into SPT procedure EnterSPT(var NxtSta : STATE; Sta : STATE; h : HID; s : SPTX; n : NID; i : TAG); assert (Sta.Spt[h][s].State = SPT_None) "EnterSPT: SPT[h][s] is occupied"; assert (Sta.Prb[h][n][i].Cmd != EXT_None) "EnterSPT: PRB[h][n][i] is empty"; NxtSta.Spt[h][s].State := SPT_Valid; NxtSta.Spt[h][s].Addr := Sta.Prb[h][n][i].Addr; NxtSta.Spt[h][s].Cmd := Sta.Prb[h][n][i].Cmd; NxtSta.Spt[h][s].Txn.Nid := n; NxtSta.Spt[h][s].Txn.Tag := i;
NxtSta.Spt[h][s].DataFwded := FALSE; -- NxtSta.Spt[h][s].WbRcvd := Wb_None; for j : NID do NxtSta.Spt[h][s].CV[j] := FALSE; NxtSta.Spt[h][s].ACV[j] := FALSE; NxtSta.Spt[h][s].SRP[j] := 0; endfor; endprocedure;
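For readers who prefer a compilable reference, the following is an illustrative C translation of the mask helpers from G.8 above (MaskFull, MaskEmpty, MaskPartial). WIDX_NUM stands in for the word-index range and is an assumption, not a spec constant.

#include <stdbool.h>

#define WIDX_NUM 8                       /* assumed size of the WIDX range */
typedef bool MASK[WIDX_NUM];

static bool mask_full(const MASK m)      /* forall w: M[w] = TRUE */
{
    for (int w = 0; w < WIDX_NUM; w++)
        if (!m[w])
            return false;
    return true;
}

static bool mask_empty(const MASK m)     /* forall w: M[w] = FALSE */
{
    for (int w = 0; w < WIDX_NUM; w++)
        if (m[w])
            return false;
    return true;
}

static bool mask_partial(const MASK m)   /* exists w: M[w] = TRUE */
{
    return !mask_empty(m);
}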