diff --git a/.github/workflows/pycodestyle.yml b/.github/workflows/pycodestyle.yml new file mode 100644 index 000000000..5dbf05cf5 --- /dev/null +++ b/.github/workflows/pycodestyle.yml @@ -0,0 +1,23 @@ +name: Pycodestyle + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pycodestyle + - name: Analysing the code with pycodestyle + run: | + pycodestyle --ignore=E501,W503 $(git ls-files 'learning_observer/*.py' 'modules/*.py') diff --git a/.gitignore b/.gitignore index b25c15b81..b53507a71 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,30 @@ *~ +\#* +.\#* +*__pycache__* +webapp/logs +webapp/static_data/teachers.yaml +creds.yaml +CREDS.YAML +uncommitted +extension.crx +extension.pem +extension.zip +*egg-info* +public_key +*/dist +learning_observer/learning_observer/static_data/teachers.yaml +learning_observer/learning_observer/logs/ +learning_observer/learning_observer/static/3rd_party/ +learning_observer/learning_observer/static_data/course_lists/ +learning_observer/learning_observer/static_data/course_rosters/ +learning_observer/learning_observer/static_data/repos/ +learning_observer/learning_observer/static_data/dash_assets/ +learning_observer/learning_observer/static_data/courses.json +learning_observer/learning_observer/static_data/students.json +learning_observer/passwd.lo +--* +.venv/ +.vscode/ +build/ +dist/ diff --git a/CONTRIBUTORS.TXT b/CONTRIBUTORS.TXT new file mode 100644 index 000000000..48602bc9c --- /dev/null +++ b/CONTRIBUTORS.TXT @@ -0,0 +1,3 @@ +Piotr Mitros +Oren Livne +Paul Deane diff --git a/LICENSE.TXT b/LICENSE.TXT new file mode 100644 index 000000000..be3f7b28e --- /dev/null +++ b/LICENSE.TXT @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. 
+ + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
+ + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. 
diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..9cd02d01b --- /dev/null +++ b/Makefile @@ -0,0 +1,65 @@ +PYTHONFILES = $(wildcard \ + learning_observer/learning_observer/*py \ + learning_observer/util/*py \ + gitserve/gitserve/*py \ + gitserve/*py \ + learning_observer/learning_observer/pubsub/*py \ + learning_observer/learning_observer/stream_analytics/*py \ +) + +run: + # If you haven't done so yet, run: make install + # Also, run: workon learning_observer + cd learning_observer && python learning_observer + +# Build browser extension +extension-package: + # I won't build this for you, but I will give you instructions, since + # the set of options will vary. + # + # On my last system, I needed: + # google-chrome --pack-extension=extension --pack-extension-key=extension.pem --disable-setuid-sandbox --no-gpu --no-sandbox --headless + # On my current system + # google-chrome --pack-extension=extension --pack-extension-key=extension.pem + # The pem file is your private key. You'll need to make one. + # The output will be called extension.crx + +codestyle: + # Check code style quality + # + # In pycodestyle, we ignore E501 (line too long) and W503 (which + # requires a choice between W503 and W504; see the link below). + # https://stackoverflow.com/questions/57074300/what-is-the-recommended-way-to-break-long-if-statement-w504-line-break-after-b + # + # In pylint, we ignore: + # 1. W0613: unused arguments (common for e.g. `request` parameter) + # 2. E501/C0301: line too long (obsolete with modern computers + # ref: https://lkml.org/lkml/2020/5/29/1038) + # We still aim for shorter lines, but it's not a showstopper if + # we break the limit occasionally. We may re-examine this decision + # if we have sufficient developers working on VT100 terminals. + # 3. W0511: TODO: We're gonna have a lot of these, and we want to + # encourage leaving these around. As a coding style, we want to + # get interface and overall structures right, and then clean + # up e.g. performance/scaling, exception handling, test coverage, + # etc. once we know what we're doing. + # 4. R0913: Too many arguments. That's a relic of web apps. Sorry. + # + # Generally, pycodestyle issues are showstoppers for pushing to + # upstream. Pylint issues are worth an occasional cleanup pass, but we + # can tolerate them. + + pycodestyle --ignore=E501,W503 $(PYTHONFILES) + pylint -d W0613,W0511,C0301,R0913,too-few-public-methods $(PYTHONFILES) + +install: + # Run: + # mkvirtualenv learning_observer + # pip install -r requirements.txt + # cd learning_observer + # python setup.py develop + # python learning_observer + # And then clean up this Makefile to do it for you automatically :) + +sphinx: + cd learning_observer/docs; make html diff --git a/README.md b/README.md index 2bff5383e..2c8292eed 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,143 @@ -# Writing analysis -This repository is for a writing analysis project. There isn't much to see here yet. +# Writing Observer and Learning Observer -Contact/maintainer: Piotr Mitros (pmitros@ets.org) +![Writing Observer Logo](learning_observer/learning_observer/static/media/logo-clean.jpg) -Licensing: Open source / free software. License TBD. \ No newline at end of file +This repository is part of a project to provide an open source +learning analytics dashboard to help instructors manage student +learning processes, and in particular, student writing +processes.
+ +![linting](https://github.com/ETS-Next-Gen/writing_observer/actions/workflows/pycodestyle.yml/badge.svg) + +## Learning Observer + +_Learning Observer_ is designed as an open source, open science learning +process data dashboarding framework. You write reducers to handle +per-student writing data, and aggregators to make dashboards. We've +tested this in math and writing, but our focus is on writing process +data. + +It's not finished, but it's moving along quickly. + +## Writing Observer + +_Writing Observer_ is a plug-in for Google Docs which visualizes writing +data for teachers. Our immediate goal was to provide a dashboard which +gives rapid, actionable insights to educators supporting remote +learning during this pandemic. We're working to expand this to support +a broad range of write-to-learn and collaborative learning techniques. + +## Status + +There isn't much to see here for external collaborators yet. This +repository has a series of prototypes to confirm we can: + +* collect the data we want; +* extract what we need from it; and +* route it to where we want it to go (there's *a lot* of data, with + complex dependencies, so this is actually a nontrivial problem) + +This mitigates most of the technical risk. We also now integrate with +Google Classroom, and we have prototype APIs for making dashboards, plus +a few prototype dashboards. + +For this to be useful, we'll need to provide some basic documentation +for developers to navigate this repo (in particular, +explaining *why* this approach works). + +This system is designed to be *massively* scalable, but it is not +currently implemented to be so (mostly for trivial reasons; +e.g. scaffolding code which uses static files as a storage model). It +will take work to flush out all of these performance issues, but we'd +like to do that work once we better understand what we're doing and +that the core approach and APIs are correct. + +Getting Started +=============== + +As an early prototype, getting started isn't seamless. Run: + +~~~~~ +make install +~~~~~ + +And follow the instructions. You'll probably run into bugs. Work around the bugs. Then fix up the makefile and make a PR to address those bugs :) + +Once that's done, run: + +~~~~ +make +~~~~ + +Again, fix up the makefile, and make a PR. + +You can also go into the devops directory, which has scripts in +progress for spinning up a cloud instance and managing flocks of +_Learning Observer_ instances. + + + +Writing Observer +================ + + +To set up writing_observer on top of the Learning Observer platform, go into modules/writing_observer and run: + + sudo python setup.py develop + + + + +System requirements +=================== + +It depends on what you're planning to use the system for. + +The core _Learning Observer_ system works fine on an AWS nano +instance, and that's how we do most of our testing and small-scale +pilots. These instances have 512MB of RAM, and minimal CPU. It's +important that this configuration remains usable. + +For deployment and more sophisticated uses (e.g. NLP) in larger +numbers of classrooms, we expect to need **heavy** metal. As we're +playing with algorithms, deep learning is turning out to work +surprisingly well, and at the same time, requires surprisingly large +amounts of computing power. A GPGPU with plenty of RAM is helpful if +you want to work with more sophisticated algorithms, and is likely to +be a requirement for many types of uses.
+ +All _Learning Observer_ development has been on Linux-based platforms +(including Ubuntu and RHEL). There are folks outside of the core team +who have tried to run it on Mac or on WSL, with mixed success. + +Running on RHEL requires the following services: + * REDIS + * nginx. + + +bcrypt +----------------------------------------------- +A note on bcrypt. The code uses bcrypt for some internal password management. +We are not including it directly in the install because it acts oddly across +platforms so you may need to install some version manually. + + +Additional Notes +=================== +At present the system also uses static content that is served from a repo. +This allows us to actually select different sources for the static data. +This can also point to the current copy if necessary and can be configured +as part of the creds.yaml file to generate the repo or add it on startup. + +This is not ideal but it is a mechanism for embedding the alternatives. + + +Contributing or learning more +============================= + +We're still a small team, and the easiest way is to shoot us a quick +email. We'll gladly walk you through anything you're interested in. + +Contact/core maintainer: Piotr Mitros + +Licensing: Open source / free software. License: AGPL. diff --git a/configuration/files/default b/configuration/files/default deleted file mode 100644 index e2a5bd428..000000000 --- a/configuration/files/default +++ /dev/null @@ -1,94 +0,0 @@ -## -# You should look at the following URL's in order to grasp a solid understanding -# of Nginx configuration files in order to fully unleash the power of Nginx. -# https://www.nginx.com/resources/wiki/start/ -# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ -# https://wiki.debian.org/Nginx/DirectoryStructure -# -# In most cases, administrators will remove this file from sites-enabled/ and -# leave it as reference inside of sites-available where it will continue to be -# updated by the nginx packaging team. -# -# This file will automatically load configuration files provided by other -# applications, such as Drupal or Wordpress. These applications will be made -# available underneath a path with that package name, such as /drupal8. -# -# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. -## - -# Default server configuration -# -server { - listen 80 default_server; - listen [::]:80 default_server; - - server_name [[[[[[SERVERNAME]]]]]; # managed by Certbot - listen [::]:443 ssl ipv6only=on; # managed by Certbot - listen 443 ssl; # managed by Certbot - ssl_certificate /etc/letsencrypt/live/writing.hopto.org/fullchain.pem; # managed by Certbot - ssl_certificate_key /etc/letsencrypt/live/writing.hopto.org/privkey.pem; # managed by Certbot - include /etc/letsencrypt/options-ssl-nginx.conf; # managed by Certbot - ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; # managed by Certbot - - - root /var/www/html; - - # Add index.php to the list if you are using PHP - index index.html index.htm index.nginx-debian.html; - - server_name writing.hopto.org; - - location / { - # First attempt to serve request as file, then - # as directory, then fall back to displaying a 404. 
- try_files $uri $uri/ =404; - } - - location /webapi/ { - proxy_pass http://localhost:8888/webapi/; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - if ($request_method = OPTIONS ) { - add_header "Access-Control-Allow-Origin" *; - add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; - add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; - return 200; - } - } - - location /wsapi/ { - proxy_pass http://localhost:8888/wsapi/; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - proxy_read_timeout 86400; - - if ($request_method = OPTIONS ) { - add_header "Access-Control-Allow-Origin" *; - add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; - add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; - return 200; - } - } - - - # pass PHP scripts to FastCGI server - # - #location ~ \.php$ { - # include snippets/fastcgi-php.conf; - # - # # With php-fpm (or other unix sockets): - # fastcgi_pass unix:/var/run/php/php7.0-fpm.sock; - # # With php-cgi (or other tcp sockets): - # fastcgi_pass 127.0.0.1:9000; - #} - - # deny access to .htaccess files, if Apache's document root - # concurs with nginx's one - # - #location ~ /\.ht { - # deny all; - #} -} \ No newline at end of file diff --git a/devops/README.md b/devops/README.md new file mode 100644 index 000000000..352ae056f --- /dev/null +++ b/devops/README.md @@ -0,0 +1,13 @@ +Dev-ops scripts +=============== + +This contains machinery for spinning up, shutting down, and managing +Learning Observer servers. It's usable, but very much not done yet. We +can spin up, spin down, and list machines, but this ought to be more +fault-tolerant, better logged, less hard-coded, etc. + +We would like to be cross-platform, and evenually support both +Debian-based distros and RPM-based distros. We're not there yet +either. We'd also like to support multiple cloud providers. We're not +there yet either. However, we probably won't accept PRs which move us +away from this goal. \ No newline at end of file diff --git a/devops/ansible/files/default b/devops/ansible/files/default new file mode 100644 index 000000000..4c633a57b --- /dev/null +++ b/devops/ansible/files/default @@ -0,0 +1,74 @@ +server { + # We listen for HTTP on port 80. This is helpful for debugging + listen 80 default_server; + listen [::]:80 default_server; + + # We listen for HTTPS on port 443 too. This is managed when we set up certbot. + + # Set this up when installing: + server_name {SERVER_NAME}; + + # We're mostly not using static web files right now, but it's good to have these around. + root /var/www/html; + index index.html index.htm index.nginx-debian.html; + + # We will eventually want to split our (non-CORS) data intake and our (CORS) dashboards + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. + add_header "Access-Control-Allow-Origin" *; + add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; + add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; + + try_files $uri $uri/ =404; + } + + location /app/ { + # For now, this is for debugging and development. 
We'd like to be able to launch arbitrary + # web apps. In the longer-term, it's likely the whole system might move here (and who knows + # if this comment will update). + # + # Note we don't add CORS headers for now, but we eventually will need to. We'll need to sort + # through where we add them, though. + proxy_pass http://localhost:8080/; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + #rewrite ^/app/(.*)$ $1 last; + if ($request_method = OPTIONS ) { + return 200; + } + } + + # This is our HTTP API + # Note that we disable CORS. We may want to have a version with and without CORS + location /webapi/ { + proxy_pass http://localhost:8888/webapi/; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + if ($request_method = OPTIONS ) { + add_header "Access-Control-Allow-Origin" *; + add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; + add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; + return 200; + } + } + + # And our websockets API + # We are migrating our streaming analytics to web sockets. + location /wsapi/ { + proxy_pass http://localhost:8888/wsapi/; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_read_timeout 86400; + + if ($request_method = OPTIONS ) { + add_header "Access-Control-Allow-Origin" *; + add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; + add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; + return 200; + } + } +} diff --git a/configuration/files/nginx-locations b/devops/ansible/files/nginx-locations similarity index 100% rename from configuration/files/nginx-locations rename to devops/ansible/files/nginx-locations diff --git a/configuration/local.yaml b/devops/ansible/local.yaml similarity index 71% rename from configuration/local.yaml rename to devops/ansible/local.yaml index d12102f87..3d502ca93 100644 --- a/configuration/local.yaml +++ b/devops/ansible/local.yaml @@ -2,5 +2,5 @@ hosts: localhost connection: local tasks: - - include: tasks/writing.yaml + - include: tasks/writing-apt.yaml diff --git a/configuration/scripts/add_nginx_locations.py b/devops/ansible/scripts/add_nginx_locations.py similarity index 100% rename from configuration/scripts/add_nginx_locations.py rename to devops/ansible/scripts/add_nginx_locations.py diff --git a/devops/ansible/scripts/rhel b/devops/ansible/scripts/rhel new file mode 100644 index 000000000..130a66c0f --- /dev/null +++ b/devops/ansible/scripts/rhel @@ -0,0 +1 @@ +yum install ansible emacs nginx redis curl git links lynx screen whois nginx postgresql diff --git a/configuration/tasks/writing.yaml b/devops/ansible/tasks/writing-apt.yaml similarity index 79% rename from configuration/tasks/writing.yaml rename to devops/ansible/tasks/writing-apt.yaml index c178b50ee..e00d6b134 100644 --- a/configuration/tasks/writing.yaml +++ b/devops/ansible/tasks/writing-apt.yaml @@ -15,8 +15,10 @@ - screen - wipe - build-essential - - awscli + - net-tools +# We don't need all of this per se, but it's convenient. 
If nothing +# else, it gives prereqs for `pip` - name: Python apt: name={{ item }} with_items: @@ -42,10 +44,14 @@ - python3-tornado - python3-yaml - python3-asyncpg + - python3-bcrypt - name: Server apt: name={{ item }} with_items: - - postgresql + - redis - nginx - certbot + - apache2-utils + - fcgiwrap + - python3-certbot-nginx \ No newline at end of file diff --git a/devops/requirements.txt b/devops/requirements.txt new file mode 100644 index 000000000..fbb305608 --- /dev/null +++ b/devops/requirements.txt @@ -0,0 +1,5 @@ +chevron +boto3 +pyyaml +fabric + diff --git a/devops/tasks/README.md b/devops/tasks/README.md new file mode 100644 index 000000000..156977314 --- /dev/null +++ b/devops/tasks/README.md @@ -0,0 +1,124 @@ +Deployment Scripts +================== + +Our goals are: + +* We'd like to have a flock of LO servers for dynamic assessment, + Writing Observer, random demos, etc. These should have a common + configuration, with variations. +* We'd like to have a log of how these are configured at every point + in time, and any changes, so we can have context for any process + data we collect. +* We'd like this representation to be interoperable with our process + data storage formats +* We'd like configuation data to be moderately secure. Device + configuration won't allow exploits in itself, but it can make + vulnerabilities more serious. While things like IDs and locations of + resources don't present an attack vector in themselves, knowing them + is sometimes the limiting factor on being able to exploit an attack + vector (for example, if I have an exploit where I can read one + arbitrary file on your system, being able to leverage that attack + hinges on knowing what files you have where) +* However, configuration data also sometimes needs to stores things + which are super-sensitive, like security tokens and similar. +* Making changes should be fast and easy. This happens all the time. +* Digging into archives doesn't need to be easy, just possible. For + research, only a few types of analysis need it. For operations, you + usually only need it for debugging or disaster recovery. + +Our **planned** architecture is: + +* A set of `fabric` script which can spin up / spin down / update + machines (with appropriate logging) +* A baseline configuration in `ansible`. +* Deltas from that configuration stored in an independent `git` repo +* Security tokens stored in a seperate TBD data store. We'll populate + these with templates. +* Log files of when new versions are updated/deployed/brought down, in + the same system as our process data +* The tagging process data with `git` hashes of what state the system + was in when it generated it. + +We're making the baseline `ansible` configuration pretty featureful, +since as a research project, it's helpful to be able to `ssh` into +machines, and e.g. run `Python` scripts locally. + +Whether or not we need `ansible`, `fabric`, or both is a bit of an +open question. + +Where we are +------------ + +This will be out-of-date quickly, but as of this writing: + +* We can provision, terminate, and update machines with a baseline + configuration. +* A lot of stuff is hardcoded, which would make this difficult for + others to use (e.g. learning-observer.org). +* We install packages, grab things from `git`, etc, but don't handle + configuration well yet. +* We don't log. 
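The deployment scripts described above are orchestrated with `invoke` and `fabric` (both listed in `devops/requirements.txt`); the `inv` commands they expose are listed next. As a rough, hypothetical illustration of how such tasks are typically wired up (this is not the repository's actual `tasks.py`, and the host naming, paths, and service name are placeholders), a minimal sketch might look like:

```python
# Hypothetical sketch, not the project's tasks.py. Assumes Ubuntu hosts
# reachable over SSH as the "ubuntu" user; all names below are placeholders.
from fabric import Connection   # SSH connection to a managed machine
from invoke import task         # exposes functions as `inv <name>` commands


@task
def connect(c, machine):
    """Open an interactive SSH session to a machine (cf. `inv connect`)."""
    c.run(f"ssh ubuntu@{machine}", pty=True)


@task
def update(c, machine):
    """Pull the latest code and restart the service on one machine."""
    conn = Connection(f"ubuntu@{machine}")
    conn.run("cd writing_observer && git pull")
    conn.sudo("systemctl restart learning_observer")
```

The project's real tasks additionally handle AWS provisioning (via `boto3`), configuration placement, and logging, per the roadmap above.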
+ +We orchestrate servers with [invoke](https://www.pyinvoke.org/): + +* `inv list` will show a listing of deployed machines +* `inv provision [machine]` will spin up a new AWS machine +* `inv update` will update all machines +* `inv terminate [machine]` will shut down a machine +* `inv connect [machine]` will open up an `ssh` session to a machine +* `inv configure [machine]` is typically run after provision, and + will place configuration files (which might vary + machine-by-machine) (mostly finished) +* `inv certbot [machine]` will set up SSL (unfinished) +* `inv downloadconfig [machine]` will copy the configuration back. +* `inv create [machine]` is a shortcut to do everything for a new instance in one step (provision, configure, certbotify, and download the SSL config) + +A lot of this is unfinished, but still, it's already ahead of the AWS +GUI and doing things by hand. The key missing functionality is: + +* High-quality logging +* Fault recovery +* Version control of configurations + +To set up a new machine, run: + +``` +inv provision [machine] +inv configure [machine] +inv certbot [machine] +inv downloadconfig [machine] +``` + +From there, edit configuration files in `config`; to update the +machine to a new version, run: + +``` +inv configure [machine] +``` + +Debugging +--------- + +The most annoying part of this setup is getting `systemd` working, +which is poorly documented, inconsistent, and poorly engineered. The +tools are `journalctl -xe |tail -100`, looking at `lo.err` (currently +in `/home/ubuntu/`, but should move to `/var/log/` eventually), and +`systemctl status --full learning_observer`. The most common issues +are permissions (e.g. running as the wrong user, log files generated +as `root:root` at some point, etc), running from the wrong directory, +and similar sorts of environment issues. + +Logging +------- + +We are logging system configuration with `git`. Note that this is +**NOT** atomic or thread-safe. This is perhaps a bug, and perhaps by +design: + +* Tasks take a _while_ to run, and they need to run in parallel when + managing many machines. +* A better (much more complex) approach would use branches or do + atomic commits at the end (e.g. download to a temporary dir, and + move right before the commit). +* However, it is possible to reverse-engineer exactly what happened, + roughly when. This is good enough for now. \ No newline at end of file diff --git a/devops/tasks/config/creds.yaml b/devops/tasks/config/creds.yaml new file mode 100644 index 000000000..dab0fe6d8 --- /dev/null +++ b/devops/tasks/config/creds.yaml @@ -0,0 +1,33 @@ +hostname: {{hostname}}.{{domain}} +xmpp: + sink: # Receives messages. We'll need many of these. + jid: sink@localhost + password: {{RANDOM1}} + source: # Sends messages. + jid: source@localhost + password: {{RANDOM1}} + stream: # For debugging + jid: stream@localhost + password: {{RANDOM1}} +auth: + password_file: passwd.lo +pubsub: + type: redis +kvs: + type: redis +roster_data: + source: all +aio: + session_secret: {{RANDOM2}} + session_max_age: 3600 +config: + run_mode: dev + debug: [] +theme: + server_name: Learning Observer + front_page_pitch: Learning Observer is an experimental dashboard. If you'd like to be part of the experiment, please contact us. If you're already part of the experiment, log in!
+ logo_big: /static/media/logo-clean.jpg +event_auth: + local_storage: + userfile: students.yaml + allow_guest: true \ No newline at end of file diff --git a/devops/tasks/config/hostname b/devops/tasks/config/hostname new file mode 100644 index 000000000..8c9fff80a --- /dev/null +++ b/devops/tasks/config/hostname @@ -0,0 +1 @@ +{{hostname}} \ No newline at end of file diff --git a/devops/tasks/config/init.d b/devops/tasks/config/init.d new file mode 100644 index 000000000..6d8b816ad --- /dev/null +++ b/devops/tasks/config/init.d @@ -0,0 +1,42 @@ +#!/bin/bash + +# The world's simplest, stupidest init script. +# +# THIS IS CURRENTLY UNUSED, SINCE WE USE A SYSTEMD SCRIPT + +### BEGIN INIT INFO +# Provides: learning_observer +# Required-Start: mountkernfs $local_fs +# Required-Stop: +# Should-Start: +# X-Start-Before: +# Default-Start: S +# Default-Stop: +# Short-Description: Runs the Learning Observer platform +# Description: This is a part of a larger dev-ops infrastructure. This is unlikely to work in isolation. +### END INIT INFO +# +# written by Piotr Mitros + + +case "$1" in +start) + cd /home/ubuntu/writing_observer/learning_observer/ + setsid -f su ubuntu ./lo.sh +;; +status) + printf "For status, run: ps aux | grep learning_observer\n" +;; +stop) + pkill -f learning_observer +;; + +restart) + $0 stop + $0 start +;; + +*) + echo "Usage: $0 {status|start|stop|restart}" + exit 1 +esac diff --git a/devops/tasks/config/lo.sh b/devops/tasks/config/lo.sh new file mode 100644 index 000000000..d9a05d850 --- /dev/null +++ b/devops/tasks/config/lo.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash + +# This is a script to start up Learning Observer with it's own process +# name. This is convenient for being able to start / stop the process. + +. /usr/share/virtualenvwrapper/virtualenvwrapper.sh +workon learning_observer +cd /home/ubuntu/writing_observer/learning_observer +bash -c "exec -a learning_observer python learning_observer" >> /home/ubuntu/lo.log 2>> /home/ubuntu/lo.err diff --git a/devops/tasks/config/nginx b/devops/tasks/config/nginx new file mode 100644 index 000000000..229ba6c4a --- /dev/null +++ b/devops/tasks/config/nginx @@ -0,0 +1,40 @@ +server { + # We listen for HTTP on port 80. When we set up certbot, this changes to 443. + listen 80 default_server; + listen [::]:80 default_server; + + server_name {{hostname}}.{{domain}}; + + location / { + # Generally, used to configure permissions. E.g. http basic auth, allow/deny + # IP blocks, etc. Note that for deploy, this should be broken out into several + # blocks (e.g. incoming event, dashboards, etc.) + {{nginx_root_options}} + + proxy_pass http://localhost:8888/; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # We disable CORS globally. This should be more granular. 
+ add_header "Access-Control-Allow-Origin" *; + add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; + add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; + } + location /wsapi/ { + proxy_pass http://localhost:8888/wsapi/; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_read_timeout 86400; + + add_header "Access-Control-Allow-Origin" *; + add_header "Access-Control-Allow-Methods" "GET, POST, OPTIONS, HEAD"; + add_header "Access-Control-Allow-Headers" "Authorization, Origin, X-Requested-With, Content-Type, Accept"; + + if ($request_method = OPTIONS ) { + return 200; + } + } +} \ No newline at end of file diff --git a/devops/tasks/config/passwd.lo b/devops/tasks/config/passwd.lo new file mode 100644 index 000000000..e69de29bb diff --git a/devops/tasks/config/postuploads b/devops/tasks/config/postuploads new file mode 100644 index 000000000..1f7da6ebe --- /dev/null +++ b/devops/tasks/config/postuploads @@ -0,0 +1,9 @@ +sudo hostnamectl set-hostname {hostname} +sudo rm -f /etc/nginx/sites-available/default +sudo rm -f /etc/nginx/sites-enabled/default +sudo ln -f /etc/nginx/sites-available/{hostname} /etc/nginx/sites-enabled/{hostname} +sudo chown -R ubuntu:ubuntu /home/ubuntu/writing_observer +sudo systemctl daemon-reload +sudo service learning_observer stop +sudo service learning_observer start +sudo service nginx restart diff --git a/devops/tasks/config/rsyslog.conf b/devops/tasks/config/rsyslog.conf new file mode 100644 index 000000000..47ee22ba5 --- /dev/null +++ b/devops/tasks/config/rsyslog.conf @@ -0,0 +1 @@ +if $programname == 'learning_observer' then /var/log/lo.log \ No newline at end of file diff --git a/devops/tasks/config/sync.csv b/devops/tasks/config/sync.csv new file mode 100644 index 000000000..e3592df21 --- /dev/null +++ b/devops/tasks/config/sync.csv @@ -0,0 +1,6 @@ +creds.yaml,root:root,644,/home/ubuntu/writing_observer/learning_observer/creds.yaml,"Learning Observer settings file" +nginx,root:root,644,/etc/nginx/sites-enabled/{hostname},"nginx site configuration" +passwd.lo,root:root,644,/home/ubuntu/writing_observer/learning_observer/passwd.lo,"(Generally blank) passwords file" +lo.sh,ubuntu:ubuntu,744,/home/ubuntu/writing_observer/learning_observer/lo.sh,"Script to start Learning Observer with a nice process name" +systemd,root:root,644,/etc/systemd/system/learning_observer.service,"Systemd init script" +rsyslog.conf,root:root,644,/etc/rsyslog.d/learning_observer.conf,"rsyslog script (for stdout/stderr)" \ No newline at end of file diff --git a/devops/tasks/config/systemd b/devops/tasks/config/systemd new file mode 100644 index 000000000..673d3ffa2 --- /dev/null +++ b/devops/tasks/config/systemd @@ -0,0 +1,11 @@ +[Unit] +Description=Learning Observer + +[Service] +ExecStart=/home/ubuntu/writing_observer/learning_observer/lo.sh +Type=simple +StandardOutput=syslog +StandardError=syslog +SyslogIdentifier=learning_observer +User=ubuntu +Group=ubuntu \ No newline at end of file diff --git a/devops/tasks/orchlib/__init__.py b/devops/tasks/orchlib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/devops/tasks/orchlib/aws.py b/devops/tasks/orchlib/aws.py new file mode 100644 index 000000000..ce76b14ba --- /dev/null +++ b/devops/tasks/orchlib/aws.py @@ -0,0 +1,248 @@ +''' +Tools to bring up an AWS nano instance, 
and to connect it to DNS via +Route 53. We do not want to be AWS-specific, and this file should be +the only place where we import boto. +''' + +import time +import yaml + +import boto3 + +import orchlib.config +import orchlib.fabric_flock +from orchlib.logger import system + + +session = boto3.session.Session() +ec2 = session.resource('ec2') +ec2client = boto3.client('ec2') +r53 = boto3.client('route53') + +UBUNTU_20_04 = "ami-09e67e426f25ce0d7" + +def create_instance(name): + ''' + Launch a machine on EC2. Return the boto instance object. + ''' + blockDeviceMappings = [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "DeleteOnTermination": True, + "VolumeSize": 32, + "VolumeType": "gp2" + } + } + ] + + # Baseline set of tags.... + tags = [ + { + 'Key': 'Name', + 'Value': name + }, + { + 'Key': 'Owner', + 'Value': orchlib.config.creds['owner'] + }, + { + 'Key': 'deploy-group', + 'Value': orchlib.config.creds['deploy-group'] + } + ] + + # And we allow extra tags from the config file. + # + # This should be handled more nicely at some point. We want + # a global config, with per-machine overrides, and we want + # this config common for all templates, etc. + for (key, value) in orchlib.config.creds.get("ec2_tags", {}).items(): + tags.append({ + 'Key': key, + 'Value': value + }) + + # This is kind of a mess. + # Good command to help guide how to make this: + # `aws ec2 describe-instances > template` + # It doesn't correspond 1:1, but it's a good starting + # point. + response = ec2.create_instances( + ImageId=UBUNTU_20_04, + InstanceType='t2.small', + BlockDeviceMappings=blockDeviceMappings, + KeyName=orchlib.config.creds['aws_keyname'], + MinCount=1, + MaxCount=1, + Placement={ + "AvailabilityZone": "us-east-1b" + }, + NetworkInterfaces=[ + { + 'SubnetId': orchlib.config.creds['aws_subnet_id'], + 'DeviceIndex': 0, + 'AssociatePublicIpAddress': True, + 'Groups': [orchlib.config.creds['aws_security_group']] + } + ], + TagSpecifications=[ + { + 'ResourceType': 'instance', + 'Tags': tags + } + ] + ) + + instance = response[0] + instance.wait_until_running() + # Reload, to update with assigned IP, etc. + instance = ec2.Instance(instance.instance_id) + + # Switch to IMDS v2, hopefully, due to security improvements + ec2client.modify_instance_metadata_options( + InstanceId=instance.instance_id, + HttpTokens='required', + HttpEndpoint='enabled' + ) + + print("Launched ", instance.instance_id) + print("IP: ", instance.public_ip_address) + return instance + + +def list_instances(): + ''' + List all of the `learning-observer` instances, in a compact + format, with just the: + + * Instance ID + * Tags + * Public IP Address + ''' + reservations = ec2client.describe_instances(Filters=[ + { + 'Name': 'tag:deploy-group', + 'Values': [orchlib.config.creds['deploy-group']] + }, + ])['Reservations'] + instances = sum([i['Instances'] for i in reservations], []) + summary = [{ + 'InstanceId': i['InstanceId'], + 'Tags': {tag['Key']: tag['Value'] for tag in i['Tags']}, + 'PublicIpAddress': i.get('PublicIpAddress', "--.--.--.--") + } for i in instances] + return summary + +def terminate_instances(name): + ''' + Terminate all instances give the name. + + Returns the number of instances terminated. We might kill more + than one if we assign several the same name. + + Also, wipes their associated DNS. + ''' + instances = list_instances() + print("All instances: ", instances) + matching_instances = [ + i for i in instances if i['Tags']['Name'] == name + ] + # Set to `None` so we don't accidentally touch this again! 
+ instances = None + print("Matching instances: ", matching_instances) + for i in range(10): + print(10-i) + time.sleep(1) + print("Removing DNS") + for instance in matching_instances: + register_dns( + name, + orchlib.config.creds['domain'], + instance['PublicIpAddress'], + unregister=True + ) + print("Terminating") + ec2client.terminate_instances( + InstanceIds = [i['InstanceId'] for i in matching_instances] + ) + system("ssh-keygen -R {host}.{domain}".format( + host=name, + domain=orchlib.config.creds['domain'] + )) + return len(matching_instances) + + +def register_dns(subdomain, domain, ip, unregister=False): + ''' + Assign a domain name to a machine. + ''' + action = 'UPSERT' + if unregister: + action = 'DELETE' + zones = r53.list_hosted_zones_by_name( + DNSName=domain + )['HostedZones'] + + # AWS seems to ignore DNSName=domain. We filter down to the right + # domain AWS does include a dot at the end + # (e.g. 'learning-observer.org.'), and we don't right now + # (e.g. `learning-observer.org`). We don't need the first test, + # but we included it so we don't break if we ever do pass a domain + # in with the dot. + zones = [ + z for z in zones # Take all the zone where.... + if z['Name'].upper() == domain.upper() # The domain name is correct + or z['Name'].upper() == (domain+".").upper() # With a dot at the end + ] + + if len(zones)!= 1: + raise Exception("Wrong number of hosted zones!") + zoneId = zones[0]['Id'] + request = r53.change_resource_record_sets( + HostedZoneId = zoneId, + ChangeBatch = { + 'Changes': [ + { + 'Action': action, + 'ResourceRecordSet' : { + 'Name' : '{subdomain}.{domain}.'.format( + subdomain=subdomain, + domain=domain + ), + 'Type' : 'A', + 'TTL' : 15, + 'ResourceRecords' : [ + {'Value': ip} + ] + } + }, + ] + } + ) + + # If we're setting DNS, wait for changes to propagate, so we + # can use DNS later in the script + while True and not unregister: + print("Propagating DNS....", request['ChangeInfo']['Status']) + time.sleep(1) + id = request['ChangeInfo']['Id'] + request = r53.get_change(Id=id) + if request['ChangeInfo']['Status'] == 'INSYNC': + break + return True + + +def name_to_group(machine_name): + ''' + For a machine name, return a fabric ssh group of machines with + that name. + ''' + pool = [ + i['PublicIpAddress'] + for i in list_instances() + if i['Tags']['Name'] == machine_name + ] + print(pool) + group = orchlib.fabric_flock.machine_group(*pool) + return group diff --git a/devops/tasks/orchlib/config.py b/devops/tasks/orchlib/config.py new file mode 100644 index 000000000..330de195c --- /dev/null +++ b/devops/tasks/orchlib/config.py @@ -0,0 +1,100 @@ +import os +import os.path + +import json +import yaml + +creds_file = "settings/CREDS.YAML" + +if not os.path.exists(creds_file): + print("No credentials file. I'll need a bit of info from you") + print("to make one.") + info = { + "user": "Your username on the remote machine (probably ubuntu)", + "key_filename": "Your AWS key filename (something like /home/me/.ssh/aws.pem)", + "aws_keyname": "Your AWS key id (as AWS knows it; e.g. aws.pem)", + "aws_subnet_id": "AWS subnet (e.g. subnet-012345abc)", + "aws_security_group": "AWS security group (e.g. sg-012345abc)", + "owner": "Your name", + "email": "Your email", + "domain": "Domain name (e.g. learning-observer.org)", + "flock-config": "Path to git repo where we'll store machine config.", + "deploy-group": "Tag to identify all machines (typically, learning-observer)", + "ec2_tags": "JSON dictionary of any additional tags you'd like on your machines. 
If you're not sure, type {}" + } + print("I'll need:") + for key, value in info.items(): + print("* {value}".format(value=value)) + print("Let's get going") + d = {} + for key, value in info.items(): + print(value) + d[key] = input("{key}: ".format(key=key)).strip() + d['ec2_tags'] = json.loads(d['ec2_tags']) + if not os.path.exists(d['flock-config']): + os.system("git init {path}".format(path=d['flock-config'])) + os.mkdir(os.path.join(d['flock-config'], "config")) + with open("settings/CREDS.YAML", "w") as fp: + yaml.dump(d, fp) + +creds = yaml.safe_load(open(creds_file)) + +def config_filename(machine_name, file_suffix, create=False): + ''' + Search for the name of a config file, checking + * Per-machine config + * System-wide defaults + * Defaults for this for the Learning Observer (defined in this repo) + + Absolute paths (e.g. beginning with '/') are returned as-is. + ''' + if file_suffix.startswith("/"): + return file_suffix + + paths = [ + # First, we try per-machine configuration + os.path.join( + creds["flock-config"], "config", machine_name, file_suffix + ), + # Next, we try the per-machine override + os.path.join( + creds["flock-config"], "config", machine_name, file_suffix+".base" + ), + # Then, system-wide configuration + os.path.join( + creds["flock-config"], "config", file_suffix + ), + # And finally, as a fallback, default files + os.path.join( + "config", file_suffix + ) + ] + + # For making new versions, always return the per-machine git repo + # directory + if create == True: + return paths[0] + + for fn in paths: + print(fn) + if os.path.exists(fn): + return fn + + +def config_lines(machine_name, file_suffix): + ''' + Kind of like a smart `open().readlines()` for reading config files. + + Handle paths, prefixes, missing files (return nothing), + `strip()`ing lines, comments, etc. + ''' + fn = config_filename(machine_name, file_suffix) + # No config file + if fn is None: + print("Skipping; no file for: ", file_suffix) + return + print("Config file: ", fn) + for line in open(fn).readlines(): + line = line.strip() + if len(line) > 0: + yield line diff --git a/devops/tasks/orchlib/fabric_flock.py b/devops/tasks/orchlib/fabric_flock.py new file mode 100644 index 000000000..0876dcd77 --- /dev/null +++ b/devops/tasks/orchlib/fabric_flock.py @@ -0,0 +1,81 @@ +''' +These are baseline script to help orchestrate a flock of machines +via ssh. This is a thin wrapper around `fabric`. +''' + +import yaml +import fabric + +import orchlib.config +import orchlib.logger + +def machine_group(*pool): + # Skip terminated machines. + # Sadly, also skips recently-created machines.... + pool = [ip for ip in pool if ip!="--.--.--.--"] + group = fabric.SerialGroup( + *pool, + user=orchlib.config.creds['user'], + connect_kwargs={"key_filename": orchlib.config.creds['key_filename']} + ) + + class GroupWrapper: + ''' + This is a thin wrapper, designed for logging commands, and in the + future, perhaps return values. 
+ ''' + def __init__(self, group): + self._group = group + + def run(self, command): + command = "source ~/.profile; " + command + orchlib.logger.grouplog( + "run", + [command], + {} + ) + + self._group.run(command) + + def get(self, *args, **kwargs): + orchlib.logger.grouplog( + "get", + args, + kwargs + ) + self._group.get(*args, **kwargs) + + def put(self, *args, **kwargs): + orchlib.logger.grouplog( + "put", + args, + kwargs + ) + self._group.put(*args, **kwargs) + + def sudo(self, *args, **kwargs): + orchlib.logger.grouplog( + "sudo", + args, + kwargs + ) + self._group.sudo(*args, **kwargs) + + wrapper = GroupWrapper(group) + + return wrapper + + +def connection_group(pool = None): + ''' + Return a Fabric connection group + ''' + if pool is None: + pool = machine_pool() + + return fabric.SerialGroup( + pool, + user=orchlib.config.creds['user'], + connect_kwargs={"key_filename": orchlib.config.creds['key_filename']} + ) + diff --git a/devops/tasks/orchlib/logger.py b/devops/tasks/orchlib/logger.py new file mode 100644 index 000000000..2622b4d62 --- /dev/null +++ b/devops/tasks/orchlib/logger.py @@ -0,0 +1,41 @@ +''' +We'd like to log which actions we take. + +This isn't done, but it's a start +''' + +import os + +log = [ +] + +def system(command): + ''' + Run a command on the local system (`os.system`) + + Log the command and return code. + ''' + rc = os.system(command) + log.append({ + 'event': 'system', + 'command': command, + 'return_code': rc + }) + return rc + +def grouplog(command, args, kwargs): + log.append({ + 'event': 'group', + 'command': command, + 'args': args, + 'kwargs': kwargs + }) + + +def exitlog(): + ''' + Not done. + ''' + os.path.join( + orchlib.config.creds["flock-config"], "logs" + ) diff --git a/devops/tasks/orchlib/repos.py b/devops/tasks/orchlib/repos.py new file mode 100644 index 000000000..5ec3ea0e0 --- /dev/null +++ b/devops/tasks/orchlib/repos.py @@ -0,0 +1,54 @@ +import os + +import orchlib.config + +import remote_scripts.gitpaths + + +# Working command: GIT_SSH_COMMAND="ssh -i KEY.pem" git --git-dir=/tmp/foo/.git push -f --mirror ssh://ubuntu@SOME_SERVER/home/ubuntu/baregit/foo + + +# This command will forcefully push a local repo to a remote server, including all branches +GIT_PUSH =''' +GIT_SSH_COMMAND="ssh -i {key} -o 'StrictHostKeyChecking no'" git + --git-dir={localrepo}/.git + push -f + --mirror + ssh://ubuntu@{mn}.{domain}/home/ubuntu/baregit/{reponame} +'''.strip().replace('\n', '') + + +def force_push(machine, localrepo): + print("LOCAL REPO: ", localrepo) + command = GIT_PUSH.format( + mn=machine, + domain=orchlib.config.creds['domain'], + key=orchlib.config.creds['key_filename'], + localrepo=localrepo, + reponame=remote_scripts.gitpaths.gitpath_to_name(localrepo) + ) + print(command) + os.system(command) + + +def remote_invoke(group, command): + remote_command = "cd writing_observer/devops/tasks/remote_scripts; inv {command}".format(command=command) + print(remote_command) + group.run(remote_command) + + +def update(group, machine_name): + # In most cases, these would correspond to static sites, or + # Learning Observer modules + print("Grabbing public git packages") + for package in orchlib.config.config_lines(machine_name, "gitclone"): + remote_invoke(group, "cloneupdate {package}".format(package=package)) + print("Pushing private git packages") + + # We can only push to bare repos. 
+ for package in orchlib.config.config_lines(machine_name, "gitpush"): + print("Configuring: ", package) + remote_invoke(group, "init {package}".format(package=package)) + print("Force pushing: ", package) + force_push(machine_name, package) + remote_invoke(group, "cloneupdatelocal {package}".format(package=package)) diff --git a/devops/tasks/orchlib/templates.py b/devops/tasks/orchlib/templates.py new file mode 100644 index 000000000..bce617079 --- /dev/null +++ b/devops/tasks/orchlib/templates.py @@ -0,0 +1,138 @@ +import base64 +import io +import os.path +import uuid + +import chevron +import filetype + +import orchlib.config + +def secure_guid(): + ''' + Mix up a few entropy sources with a few system identifiers... + + This should really be built-in. + ''' + os_random = str(base64.b64encode(os.urandom(32))) + uuid1 = uuid.uuid1() + uuid4 = uuid.uuid4().hex + return uuid.uuid5(uuid1, uuid4).hex + + +def render_file_for_transfer(filename, config): + ''' + This converts a filename and a dictionary into a file-like + object, ready for upload. + ''' + # We don't render binary files. This is not a complete set, and we might extend this + # later + BINARY_TYPES = filetype.audio_matchers + filetype.image_matchers + filetype.video_matchers + endings = [".js", ".css", ".ttf", ".ogg", ".jpg", ".png", ".webm", ".mp4"] + def skip_encode(filename): + '''We don't want to run most binary files, code, etc. through our + templating engine. These are heuristics. + + We probably should be explicit and add a field to the config + file, so we don't need heuristics. This is a little bit more + complex and ad-hoc than I like. + ''' + for e in endings: + if filename.endswith(e): + return True + if filetype.guess(filename) in BINARY_TYPES: + return True + return False + + if skip_encode(filename): + return open(filename, "rb") + + # Other files, we run through our templating engine + with open(filename) as fp: + # We convert to bytes as a hack-around for this bug: https://github.com/paramiko/paramiko/issues/1133 + return io.BytesIO(chevron.render(fp, config).encode('utf-8')) + + +def upload( + group, + machine_name, + filename, + remote_filename, + config, + username=None, + permissions=None): + ''' + This will upload a file to an AWS machine. It will: + + * Find the right file. It might be a system-wide default, + or a machine-specific one. + * Generate a set of secure tokens for use in templates (e.g. for + initial passwords) + * Render the file through `mustache` templates, based on the + configuration + * Upload to the server + * Move to the right place, and set permissions. + ''' + # We can use these for security tokens in templates. + # We should save these at some point + for i in range(10): + key = "RANDOM"+str(i) + if key not in config: + config["RANDOM"+str(i)] = secure_guid() + + local_filename = orchlib.config.config_filename(machine_name, filename) + + # This seems like an odd place, but latest `fabric` has no way + # to handle uploads as root. 
+ group.put( + render_file_for_transfer( + local_filename, + config + ), + "/tmp/inv-upload-tmp" + ) + + group.run("sudo mv /tmp/inv-upload-tmp {remote_filename}".format( + remote_filename=remote_filename, + mn=machine_name + )) + if username is not None: + group.run("sudo chown {username} {remote_filename}".format( + username=username, + remote_filename=remote_filename + )) + if permissions is not None: + group.run("sudo chmod {permissions} {remote_filename}".format( + permissions=permissions, + remote_filename=remote_filename + )) + + +def download( + group, + machine_name, + filename, + remote_filename): + ''' + This will download a configuration file from an AWS machine, as + specified in the machine configuration. It's a simple parallel + to `upload` + ''' + print("Remote file: ", remote_filename) + + local_filename = orchlib.config.config_filename( + machine_name, + filename, + create=True + ) + + print("Local filename: ", local_filename) + + pathname = os.path.split(local_filename)[1] + if not os.path.exists(pathname): + os.mkdir(pathname) + + group.get( + remote_filename, + local_filename + ) diff --git a/devops/tasks/orchlib/ubuntu.py b/devops/tasks/orchlib/ubuntu.py new file mode 100644 index 000000000..af4327d40 --- /dev/null +++ b/devops/tasks/orchlib/ubuntu.py @@ -0,0 +1,55 @@ +''' +These are scripts for preparing an Ubuntu 20.04 machine to run the +Learning Observer +''' + +import fabric.exceptions + +import orchlib.fabric_flock +import orchlib.config + + +def run_script(scriptfile): + ''' + Helper which executes a series of commands on set of machines + ''' + script = open("scripts/{fn}.fab".format(fn=scriptfile)).read() + def run(*machines): + group = orchlib.fabric_flock.machine_group(*machines) + + for line in ['hostname'] + script.split("\n"): + line = line.strip() + if len(line) > 0 and line[0] != "#": + print(line) + group.run(line) + return run + +update = run_script("update") +baseline_packages = run_script("baseline_packages") +python_venv = run_script("python_venv") + + +def reboot(machine): + ''' + Run the reboot script. We expect an exception since the remote machine + reboots while Fabric is connected. + ''' + try: + print("Trying to reboot (this doesn't always work") + run_script("reboot")(machine) + except fabric.exceptions.GroupException: + pass + +def provision(ip): + group = fabric.SerialGroup( + ip, + user=orchlib.config.creds['user'], + connect_kwargs={"key_filename": orchlib.config.creds['key_filename']} + ) + update() + baseline_packages() + python_venv() + reboot() + +if __name__=='__main__': + provision() diff --git a/devops/tasks/remote_scripts/__init__.py b/devops/tasks/remote_scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/devops/tasks/remote_scripts/gitpaths.py b/devops/tasks/remote_scripts/gitpaths.py new file mode 100644 index 000000000..80801ca85 --- /dev/null +++ b/devops/tasks/remote_scripts/gitpaths.py @@ -0,0 +1,56 @@ +import os.path +import os + + +WORKING_REPO_PATH='/home/ubuntu/' +BARE_REPO_PATH='/home/ubuntu/baregit/' + + +def gitpath_to_name(packagepath): + ''' + Convert a git path to the name of the repo. For example: + + `https://github.com/ETS-Next-Gen/writing_observer.git` ==> `writing_observer` + ''' + package = os.path.split(packagepath)[1] + if package.endswith(".git"): + return package[:-4] + else: + return package + + +def working_repopath(repo=None): + ''' + Switch to the path where *working* `git` repo is located. E.g. one + with a working tree, if it exists. 
+ ''' + if repo is None: + os.chdir(WORKING_REPO_PATH) + return WORKING_REPO_PATH + + path = os.path.join(WORKING_REPO_PATH, repo) + if os.path.exists(path): + os.chdir(path) + return path + return False + + +def bare_repopath(repo=None): + ''' + Switch to the path where *bare* `git` repo is located. E.g. one + without a working tree, for pushing and pulling. + ''' + # If we don't have a path for bare repos, create it. + if(os.system("mkdir -p "+BARE_REPO_PATH)): + print("Error creating or accessing bare repository directory") + sys.exit(-1) + + if repo is None: + os.chdir(BARE_REPO_PATH) + return BARE_REPO_PATH + + path = os.path.join(BARE_REPO_PATH, repo) + if os.path.exists(path): + os.chdir(path) + return path + return False diff --git a/devops/tasks/remote_scripts/tasks.py b/devops/tasks/remote_scripts/tasks.py new file mode 100644 index 000000000..73b697427 --- /dev/null +++ b/devops/tasks/remote_scripts/tasks.py @@ -0,0 +1,123 @@ +''' +This is a remote script for random `git` operations (e.g. running +on machines in the Learning Observer flock). + +This is a bit awkward, but we maintain: + +- Public `git` repositories in `/home/ubuntu/` +- Private `git` repositories in `/home/ubuntu/baregit` cloned into + `/home/ubuntu` + +The reason for this design is: + +- Pushing a nonpublic repo to a remote server is a bit awkward. Versions + of `git` in current distros do *not* support `push`ing into a non-bare + repo (although this functionaly was added to bleeding edge git). If + we're pushing, we want to push into a bare repo +- For use (e.g. for + +We would like to do this (relatively) statelessly, so that if a repo +exists, we can do an update. If it's up-to-date, we can do nothing. If +it's not there, we create it. + +As of this writing, this is not fully tested. We're going to test more +fully by finishing the side from where we're orchestrating. + +Note that these scripts are designed to be as flexible as possible in terms +of how a path is specified. E.g.: + + inv init https://gitserver.example.com/a/foo.git + inv init /temp/foo + inv init foo + +Will all do the same thing. They will go into the bare repo path, and crete +an empty repository called `foo` if one doesn't already exist, ready for +pushing. + +In the future, we should have a desired version and perhaps give warnings if +the wrong one is used. +''' + +import os +import os.path + +import sys + +from invoke import task + + +# We would like to use these on the remote machine, but also on the local +# machine. +try: + from gitpaths import bare_repopath, working_repopath, gitpath_to_name +except: + from orchlib.gitpaths import bare_repopath, working_repopath, gitpath_to_name + + +@task +def branch(c, repo, branch): + ''' + Switch to a branch in a repo. + ''' + repo = gitpath_to_name(repo) + print("Going to to: ", working_repopath(repo)) + command = "git checkout "+branch + print(command) + os.system(command) + + +@task +def init(c, repo): + ''' + Create a new bare repo, if one does not exist already. + + Otherwise, continue on silently. + + This is for force pushes of remote repos. + ''' + repo = gitpath_to_name(repo) + path = bare_repopath(repo) + if not path: + bare_repopath() + command = "git --bare init "+repo + print(command) + os.system(command) + print(bare_repopath(repo)) + + +@task +def cloneupdate(c, fullrepo): + ''' + Clone a remote repo. 
+ ''' + repo = gitpath_to_name(fullrepo) + barepath = bare_repopath(repo) + + working_repopath() + if not working_repopath(repo): + print("Cloning...") + command = "git clone "+fullrepo + print(command) + os.system(command) + working_repopath(repo) + + print("Updating all branches") + os.system("git fetch --all") + os.system("git pull") + +@task +def cloneupdatelocal(c, repo): + repo = gitpath_to_name(repo) + cloneupdate(c, bare_repopath(repo)) + + +@task +def pull(c, repo): + ''' + Update a repo to the latest version. + ''' + path = working_repopath(repo) + command = "git pull --all" + print(command) + os.system(command) + return path diff --git a/devops/tasks/scripts/baseline_packages.fab b/devops/tasks/scripts/baseline_packages.fab new file mode 100644 index 000000000..fbd88ece4 --- /dev/null +++ b/devops/tasks/scripts/baseline_packages.fab @@ -0,0 +1,4 @@ +cd +sudo apt-get -y install git ansible awscli +git clone https://github.com/ETS-Next-Gen/writing_observer.git +cd writing_observer/devops/ansible ; sudo ansible-playbook local.yaml diff --git a/devops/tasks/scripts/python_venv.fab b/devops/tasks/scripts/python_venv.fab new file mode 100644 index 000000000..629c773eb --- /dev/null +++ b/devops/tasks/scripts/python_venv.fab @@ -0,0 +1,7 @@ +cd +echo . /usr/share/virtualenvwrapper/virtualenvwrapper.sh >> ~/.profile +source ~/.profile; mkvirtualenv learning_observer +echo workon learning_observer >> ~/.profile +source ~/.profile; pip install --upgrade pip +source ~/.profile; cd writing_observer/ ; pip install -r requirements.txt +source ~/.profile; cd ~/writing_observer/learning_observer/; python setup.py develop diff --git a/devops/tasks/scripts/reboot.fab b/devops/tasks/scripts/reboot.fab new file mode 100644 index 000000000..4abead54f --- /dev/null +++ b/devops/tasks/scripts/reboot.fab @@ -0,0 +1 @@ +sudo init 6 diff --git a/devops/tasks/scripts/update.fab b/devops/tasks/scripts/update.fab new file mode 100644 index 000000000..96978f997 --- /dev/null +++ b/devops/tasks/scripts/update.fab @@ -0,0 +1,5 @@ +sudo apt-get update +sleep 1 +sudo apt-get -y upgrade +sleep 1 +sudo apt-get -y dist-upgrade diff --git a/devops/tasks/settings/README.md b/devops/tasks/settings/README.md new file mode 100644 index 000000000..55e5591ea --- /dev/null +++ b/devops/tasks/settings/README.md @@ -0,0 +1,2 @@ +Add a file called CREDS.YAML here and add your security tokens. Docs in +progress diff --git a/devops/tasks/tasks.py b/devops/tasks/tasks.py new file mode 100644 index 000000000..9339e993d --- /dev/null +++ b/devops/tasks/tasks.py @@ -0,0 +1,351 @@ +import atexit +import csv +import datetime +import itertools +import os +import shlex +import sys + +from invoke import task + +import fabric.exceptions + +import orchlib.aws +import orchlib.config +import orchlib.fabric_flock +import orchlib.templates +import orchlib.ubuntu +import orchlib.repos +from orchlib.logger import system + +import remote_scripts.gitpaths + + +@task +def list(c): + ''' + Give a human-friendly listing of all provisioned machines + ''' + for instance in orchlib.aws.list_instances(): + print("{:21} {:21} {:16} {}".format( + instance['InstanceId'], + instance['Tags']['Name'], + instance['PublicIpAddress'], + instance['Tags'].get("use", "") + )) + + +@task +def provision(c, machine_name): + ''' + Set up a baseline image with all the packages needed for + Learning Observer. Note that this will **not** configure + the machine. 
+ ''' + print("Provisioning...") + machine_info = orchlib.aws.create_instance(machine_name) + print("Updating...") + ip = machine_info.public_ip_address + print("DNS....") + orchlib.aws.register_dns(machine_name, orchlib.config.creds['domain'], ip) + print("IP", ip) + orchlib.ubuntu.update(ip) + print("Baseline...") + orchlib.ubuntu.baseline_packages(ip) + print("Venv...") + orchlib.ubuntu.python_venv(ip) + + +@task +def update(c): + ''' + Update all machines with the latest systems updates and security + patches + ''' + addresses = [i['PublicIpAddress'] for i in orchlib.aws.list_instances()] + # Machines without IPs don't get updates + addresses = [i for i in addresses if i != "--.--.--.--"] + print(addresses) + orchlib.ubuntu.run_script("update")(*addresses) + + +@task +def create(c, machine_name): + ''' + Create a machine end-to-end. This is a shortcut for: + * Provision + * Configure + * Certbot + * Download + * Reboot + ''' + print("Provisioning EC2 instance") + provision(c, machine_name) + print("Configuring the Learning Observer") + configure(c, machine_name) + print("Setting up SSL") + certbot(c, machine_name) + print("Saving config") + downloadconfig(c, machine_name) + print("Rebooting") + reboot(c, machine_name) + + +@task +def terminate(c, machine_name): + ''' + Shut down a machine. + ''' + a = input("Are you sure? ") + if a.strip().lower() not in ['y', 'yes']: + sys.exit(-1) + orchlib.aws.terminate_instances(machine_name) + + +@task +def connect(c, machine_name): + ''' + `ssh` to a machine + ''' + command = "ssh -i {key} ubuntu@{machine_name}".format( + key=orchlib.config.creds['key_filename'], + machine_name = machine_name+"."+orchlib.config.creds['domain'] + ) + print(command) + system(command) + + +@task +def configure(c, machine_name): + ''' + Configure a machine + ''' + group = orchlib.aws.name_to_group(machine_name) + + # We start be setting up `git` repos. This will fail if done later, + # since we need these to install pip packages, etc. + orchlib.repos.update(group, machine_name) + + # Set up Python packages. We need git repos for this, but we might + # want to us these in scripts later. + print("Installing Python packages") + for package in orchlib.config.config_lines(machine_name, "pip"): + group.run("source ~/.profile; pip install {package}".format( + package=package + )) + + template_config = { + "nginx_root_options": "", + "hostname": machine_name, + "domain": orchlib.config.creds['domain'] + } + + print("Uploading files") + uploads = [ + l.strip().split(',') + for l in itertools.chain( + orchlib.config.config_lines(machine_name, "sync.csv"), + orchlib.config.config_lines(machine_name, "uploads.csv"), + ) + ] + # We should consider switching back to csvreader, so we handle commas in + # the description + for [local_file, owner, perms, remote_file, description] in uploads: + print("Uploading: ", description) + remote_path = os.path.dirname(remote_file) + group.run("mkdir -p "+remote_path) + orchlib.templates.upload( + group=group, + machine_name=machine_name, + filename=local_file, + remote_filename=remote_file.format(**template_config), + config=template_config, + username=owner, + permissions=perms + ) + + for command in open("config/postuploads").readlines(): + group.run(command.format(**template_config).strip()) + + +@task +def downloadconfig(c, machine_name): + ''' + After setting up certbot, it's helpful to download the nginx config + file. 
We also don't want to make changes remotely directly in deploy + settings, but if we have, we want to capture those changes. + ''' + template_config = { + "nginx_root_options": "", + "hostname": machine_name, + "domain": orchlib.config.creds['domain'] + } + + group = orchlib.aws.name_to_group(machine_name) + downloads = [ + l.strip().split(',') + for l in itertools.chain( + orchlib.config.config_lines(machine_name, "sync.csv"), + orchlib.config.config_lines(machine_name, "downloads.csv"), + ) + ] + # We should consider switching back to csvreader, so we handle commas in + # the description + for [local_file, owner, perms, remote_file, description] in downloads: + print("Downloading: ", description) + try: + orchlib.templates.download( + group=group, + machine_name=machine_name, + filename=local_file, + remote_filename=remote_file.format(**template_config) + ) + except fabric.exceptions.GroupException: + # This usually means the file is not found. In most cases, + # this happens when we've added a new file to the config, + # and we're grabbing from an old server. + # + # We should handle this more gracefully. How is TBD + print("Could not download file!") + +@task +def certbot(c, machine_name): + ''' + This sets up SSL. Note that: + - SSL will generally NOT work until everything else is set up + - This change nginx config. You don't want to override config + files later. + - This is untested :) + ''' + group = orchlib.aws.name_to_group(machine_name) + CERT_CMD = "sudo certbot -n --nginx --agree-tos --redirect " \ + "--email {email} --domains {hostname}.{domain}" + group.run(CERT_CMD.format( + email=orchlib.config.creds['email'], + hostname = machine_name, + domain=orchlib.config.creds['domain'] + )) + + +@task +def reboot(c, machine_name): + ''' + Untested: This doesn't seem to work yet.... + ''' + print("Trying to reboot... no promises.") + orchlib.ubuntu.reboot(machine_name) + + +@task +def downloadfile(c, machine_name, remote_filename, local_filename): + ''' + Helper to download a single file. + + This is verbose, and doesn't do wildcards. Perhaps better a helper to + `scp`? Don't use this in scripts until we've figured this out.... + ''' + group = orchlib.aws.name_to_group(machine_name) + group.get( + remote_filename, + local_filename + ) + + +@task +def uploadfile(c, machine_name, remote_filename, local_filename): + ''' + Helper to upload a single file. + + This is verbose, and doesn't do wildcards. Perhaps better a helper to + `scp`? Don't use this in scripts until we've figured this out.... + ''' + group = orchlib.aws.name_to_group(machine_name) + group.put( + remote_filename, + local_filename + ) + + +@task +def runcommand(c, machine_name, command): + ''' + Run a remote command. Don't forget quotes! + ''' + group = orchlib.aws.name_to_group(machine_name) + group.run(command) + + +@task +def hello(c): + ''' + For testing! + + For example, hooks. + ''' + print("Hello, world!") + + +@task +def backup(c, machine_name, target): + ''' + Grab a backup of a given directory by name + ''' + targets = { + 'nginx': "/var/log/nginx/", + 'certs': "/etc/letsencrypt/" + } + + if target not in targets: + print("Invalid target. 
Should be one of:") + print("\n".join(targets)) + sys.exit(-1) + + ts = datetime.datetime.utcnow().isoformat().replace(":", "-") + filebase = "{ts}-{mn}-{tg}".format( + ts=ts, + mn=machine_name, + tg=target + ) + + command = "tar /tmp/{filebase} {target}".format( + filebase=filebase, + target=target + ) + + group = orchlib.aws.name_to_group(machine_name) + group.get( + remote_filename, + local_filename + ) + + +@task +def commit(c, msg): + ''' + This should probably not be a task but a utility function. It's + helpful for debuggin, though. + ''' + system( + "cd {gitpath} ; git add -A; git commit -m {msg}".format( + gitpath=orchlib.config.creds["flock-config"], + msg=msg + ) + ) + + +START_TIME = datetime.datetime.utcnow().isoformat() + +def committer(): + ''' + On exit, commit changes to repo. This code is not finished. + ''' + command_options = shlex.quote(" ".join(sys.argv)) + stop_time = datetime.datetime.utcnow().isoformat() + log = { + 'start_time': START_TIME, + 'stop_time': stop_time, + 'command_options': command_options + } + + +atexit.register(committer) diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 000000000..94e6dd0e2 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,62 @@ +# Architecture +## Piotr Mitros + +# Introduction + +Like all such documents, this document should be taken with a grain of +salt. It my be out-of-date, or not fully implemented. + +# Overview + +1. Events come from a web browser over a web socket. +2. The server performs a reduce operation of some kind on these + events. This operation maintains a per-student state (for each + plug-in) inside of a KVS. +3. A subset of the internal state is used to compute state as sent to + an educator dashboard. +3. Whenever an event is processed, consumers are notified via a pubsub. +4. Consumers can aggregate these notifications, inspect the external state, + and make a dashboard. + +## Technology choices + +1. Generic student information (e.g. names, auth, etc.) cn live in + flat files on the disk, sqlite, or postgres. As of this writing, this + is not built. +2. The KVS for the reduce can either be an in-memory queue or + redis. Redis can be persistent (for deploy) or ephemeral (for + development). As of this writing, all three work. +3. The pub-sub can be an in-memory queue (for development), redis (fo + easy deployment), or xmpp (for scalable deployment). As of this writing, + all three work, but xmpp is buggy/unfinished. +4. The front-end uses bulma and d3.js. + +## Architectural Constraints + +1. By design, this system should be in a usable (although not + necessarily scalable or reliable) state with just a `pip + install`. There should be no complex webs of dependencies. +2. However, there can be a complex web of dependencies for robust, + scalable deployment. For example, we might use an in-memory + queue in a base system, and switch to a high-performance data + store for deployment. +3. For most abstractions, we want to initially build 2-3 plug-ins. For + example, we're initially building this with 2-3 streaming + modules. We support 2-3 pubsubs, and 2-3 key-balue stores. This is + enough to, in most cases, guarantee the abstractions aren't + specific to one thing. However, it's small enough we can change + both sides of an API boundary. +4. Once we know we have the right set of abstractions, we can open up + the flood gates to more plug-ins. + +## Process constraints + +It's better to say "no" to a feature than to break the +architecture. We're in this for the long haul. 
It's okay to have +scaffolding, though. Half-built things are okay if they're in the +right place, and can be incrementally evolved to be right. + +We try to avoid any technical debt which carries high interest (higher +maintenance costs down the line) -- bad APIs, etc. We don't mind +low-interest technical debt nearly as much (things which need to get +finished later, but won't blow up). \ No newline at end of file diff --git a/docs/auth.md b/docs/auth.md new file mode 100644 index 000000000..446a31b5e --- /dev/null +++ b/docs/auth.md @@ -0,0 +1,114 @@ +Authentication Framework +======================== + +We have two types of authentication: + +* We would like to know that events coming into the system are coming + from where we believe they are. +* We would like to know that users log into the system who can view + student data are who we think they are. + +For the most part, these have very different security profiles. If a +user can spoof events, the worst-case outcome is: + +* A disrupted study +* A disrupted teacher dashboard + +In small-scale studies, demos, and similar, a high level of security +is not required, especially when running on `localhost`, VPN, or in an +IP-restricted domain. + +On the other hand, we **cannot** leak student data. Authenticating +teachers and staff requires a high level of security. + +Event authentication +-------------------- + +Events are authenticated in the file `events.py`. This is +semi-modular. We have several authentication schemes, most of which +rely on a special header. We used to include auth information with +each event, and we have some backwards-compatibility code there as +well. + +Event authentication isn't super-modular yet; it's all in one file, +but the schemes are pluggable. Schemes include: + +* `guest`. Each session is assigned a unique guest ID. This is nice + for demos, studies, and coglabs. +* `local_storage`. Designed for Chromebooks. Each user is given a + unique token, usually stored in the extension's local storage. The + header sends a unique, secure token mapping to one user. +* `chromebook`. The Chromebook sends a user ID. This is *not secure* + and vulnerable to spoofing. It can be combined with `local_storage` + to be secure. +* `hash_identify`. User sends an identity, which is not + authenticated. This is typically for small coglabs, where we might + have a URL like `http://myserver/user-study-5/#user=zihan` +* `testcase_auth`. Quick, easy, and insecure for running testcases. + +We do maintain providence with events, so we can tell which ones came +from secure or insecure sources. + +We need Google OAuth. + +Teacher authentication +---------------------- + +As authentication schemes, we support: + +* Password authentication +* Trusting HTTP basic auth from nginx +* Google OAuth + +We need to be particularly careful with the second of +these. Delegating authentications to `nginx` means that we need to +have nginx properly configured, or we can be attacked. + +User authentication is intended to be fully modular, and we intend to +support more schemes in the future. Right now, each scheme is in its +own file, with `handlers.py` defining a means to log users in, out, as +well as a middleware which annotates the request with user +information. + +Session framework +----------------- + +We keep track of users through +[aiohttp_session](https://aiohttp-session.readthedocs.io/en/stable/). We +store tokens encrypted client-side, which eliminates the needed for +database fields. 
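For orientation, a minimal sketch of this pattern (illustrative only; the function below and the key derivation are not the project's actual wiring -- the real system configures these values via the `aio` section of `creds.yaml`, i.e. `session_secret` and `session_max_age`):

```python
# Illustrative sketch of client-side encrypted sessions with aiohttp_session.
import hashlib

from aiohttp import web
import aiohttp_session
from aiohttp_session.cookie_storage import EncryptedCookieStorage


def setup_sessions(app, session_secret, max_age=3600):
    # EncryptedCookieStorage wants a 32-byte key; hashing the configured
    # secret is one simple way to derive one.
    key = hashlib.sha256(session_secret.encode("utf-8")).digest()
    aiohttp_session.setup(app, EncryptedCookieStorage(key, max_age=max_age))


async def whoami(request):
    # Session data round-trips through an encrypted cookie; nothing needs to
    # be stored server-side.
    session = await aiohttp_session.get_session(request)
    return web.json_response({"user": session.get("user")})
```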
+ +User information +---------------- + +We keep track of user information in a dictionary. Eventually, this will +probably be a dictionary-like object. + +Current fields: + +* `name`: We keep full name, since not all languages have a first name / + last name order and breakdown. +* `nick`: Short name. For a teacher, this might be "Mrs. Q" or "李老师." + For a student, this might by "Timmy." In the future, we might think + through contexts and relationships (e.g. a person might be a teacher, + a coworker, and student) +* `user_id`: Our internal user ID. In most cases, this is the authentication + scheme, followed by the ID within that scheme. For example, Google user + 1234, we might call 'gc-1234.' Test case user 65432, we might call + `tc-65432` +* `safe_user_id`: An escaped or scrubbed version of the above. In some cases, + we have data from unauthenticated sources, and we don't want injection + attacks. There is an open question as to which of these is canonical, + and whether these ought to be swapped (e.g. `source_user_id` and + `user_id`). It depends on where we do JOIN-style operations more often. +* `picture`: Avatar or photo for the user +* `google_id`: Google username + +We will want to think about how to handle joins. Users often have multiple +accounts which should be merged: + +* A user signs up through two mechanisms (e.g. signs up with passwords and + then Google) +* Users are autoenrolled (e.g. through two educational institutions) +* Automatic accounts convert into permanent accounts (e.g. data begins + streaming in for an unauthenticated / guest user) \ No newline at end of file diff --git a/docs/backlog.md b/docs/backlog.md new file mode 100644 index 000000000..e2b7de766 --- /dev/null +++ b/docs/backlog.md @@ -0,0 +1,59 @@ +Project Backlog +=============== + +* Figure out why LO doesn't start on reboot, or how to make it restart + on crashes +* Figure out if/when document ID is missing +* Switch to the annotated canvas +* Be able to pull a document associated with a specific assignment in + Google Classroom +* Implement roll-offs for whole-document operations (e.g. long-running + NLP operations, which should be run periodically) + - Implement simple algorithm, comment on complex algorithms + +Robustness +---------- + +* Confirm what happens with students working in groups +* How do we capture formatting? +* How do we handle an Outline view? +* What happens with large documents? +* What happens with editing outside of the system + +Plumbing +------- + +* Robust queues client-side +* Client auth/auth +* Handle server disconnects +* Proper test frameworks + - Replay +* Refactor rosters + +Additional features +------------------- + +* How do we handle peer groups? +* Create more dashboards +1. Flagging students in need of help? +2. Providing information about use of academic vocabulary? + +APIs +---- + +* Generate dashboards with generic aggregate operations +* Handle client config robustly +* Figure out how to integrate slooow NLP algorithm calls into the + real-time server architecture + +Logging +------- + +* Implement robust data store + +Scaling +------- + +* Database / database schemas for user management if we wish to move + beyond pilot +* Online settings management? 
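For the roll-off item near the top of this backlog, the "simple algorithm" might be little more than a per-document timer (hypothetical sketch; the names and the 60-second interval are placeholders):

```python
# Hypothetical sketch: run a slow whole-document operation (e.g. a long-running
# NLP pass) at most once per interval per document.
import time

_last_run = {}  # document id -> timestamp of the last expensive run


def should_run(doc_id, interval=60):
    """Return True if the expensive operation is due for this document."""
    now = time.time()
    if now - _last_run.get(doc_id, 0) >= interval:
        _last_run[doc_id] = now
        return True
    return False
```

More sophisticated policies (e.g. backing off for idle documents) could be layered on later; that is the "comment on complex algorithms" part of the item.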
\ No newline at end of file diff --git a/docs/block.png b/docs/block.png new file mode 100755 index 000000000..734a1c4c8 Binary files /dev/null and b/docs/block.png differ diff --git a/docs/block.svg b/docs/block.svg new file mode 100755 index 000000000..9eda47d49 Binary files /dev/null and b/docs/block.svg differ diff --git a/docs/code_quality.md b/docs/code_quality.md new file mode 100644 index 000000000..b4488cd4e --- /dev/null +++ b/docs/code_quality.md @@ -0,0 +1,120 @@ +Code Quality +=========== + +In general, we develop code in multiple passes. We try to build +proofs-of-concept and prototypes to figure out what we're doing. These +try to explore: + +- Product issues. E.g. mockups to show teachers in focus groups +- Understanding capabilities. E.g. can we build an NLP algorithm + to do something? +- Integrations. What information does Google Classroom give us? + +These help us understand what we're doing and mitigate risks. Once we +have a clear idea, we move code into the system, either rewriting from +scratch or reusing. Here, the goal is to get the overall architecture +right: + +- Put big pieces in the right modules +- Put correct interfaces between those modules + +Usually, in this stage, the goal is to get to a minimum working (not +necessarily viable) system to iterate from. This often involves a lot +of _scaffolding code_. Scaffolding code is intended to be thrown away. + +Once that's done, we make successive cleanups to make the code +readable, deployable, and production-ready. + +We generally don't do a lot of test-driven development. We usually add +tests towards the end of the process for two reasons: + +1. Tests can make code less agile, before we know what we're doing and +have the right interfaces. Big changes involve modifying tests. At early +stages, broken / non-working code is a lot less harmful. +2. TDD sometimes leads to code which mirror bugs in tests. Tests should +independently validate code. + +We do have a lot of simple tests (`doctest` or +`if __name__ == '__main__:' kinds of things) purely for +speed-of-development. As of this writing, we need many more system tests. + +Code Goals and Invariants +----------- + +As a research system, our goal is to have an **archival log of +everything that happened**. We'd like to be able to remove pieces of +that (e.g. to comply with GDPR requests), which is documented in more +detail in the Merkle tree docs. You'll see a lot of code which will, +for example, annotate with data from `git` so we know what version of +the system generated a particular piece of data. That's important. The +level of integration with tools like `git` is not a hack. We pick +technologies (like `jupyter notebooks`) which allow good logging as +well. + +We would like the system to be **simple to develop, deploy, and +maintain**. +* For the most part, when we add external technologies, we want to + include simple alternatives which don't require a lot of dev-ops. We + won't tie ourselves to SaaS services, and if we add a scalable data + store, we'll usually have a disk-based or memory-based alternative. +* Anything which needs to be done to set up the system should either + be done automatically at startup, or give clear instructions at + startup. + +We would like the system to be modular, and scalable to a broad set of +analytics modules. The *Writing Observer* is just one module plugged +into the system. We aim (but don't yet achieve) a level of simplicity +where undergrads can develop such modules, and they can work reliably +and scalably. 
Here, the customer is the developer. + +Proofs of concepts and prototypes +----------- + +**Proof-of-concepts and prototypes**. We have no particular +standards. The goal is to show a new UX, NLP algorithm, visualization, +integration, or what-not. However, this should be isolated from the +main codebase. We might have a `prototypes` directory, forks, +branches, or simply keep these on our device. We do like to have a +version history here, since it's often helpful to look back (things we +decide not to do sometimes turn out to be useful later). + +System code +----------- + +As we move code into the main system, standards go up a little bit. + +1. We expect code to comply with `pycodestyle`, ignoring the + restriction on line length. We do run `pylint` as well, but we + don't expect 100 percent compliance. Before making a PR, please + check your code with `make codestyle`. +2. We are starting to work hard to have a clean commit record. Each + commit and each PR should, to the extent possible, do one thing and + one thing only. + +We don't expect initial versions of code to be perfect, but we do +expect successive passes over code to iteratively improve code quality +(documentation, robustness, modularity, etc.). We may increase +standards for initial code quality as the system matures. Initial +low-quality code is best kept behind a feature flag. + +Scaffolding code +----------- + +Getting interface right and having a working system to develop in +often requires scaffolding code. Scaffolding code shouldn't be used in +production, but is often critical during development. For example, if +we need a high-performance queue, we might use a Python list in the +interrim. + +**The major problem we've had is with developers treating scaffolding +code as either a prototype or as final code. Again, scaffolding code +is intended to be thrown away.** + +Taking time to improve code quality on scaffolding is wasted time, +since it is going away. If you have time, please work to replace it. + +Documentation +----------- + +We need a lot more. This makes more sense to do once the system is +more mature. \ No newline at end of file diff --git a/docs/events.md b/docs/events.md new file mode 100644 index 000000000..8eb8bfec7 --- /dev/null +++ b/docs/events.md @@ -0,0 +1,106 @@ +Event Format Notes +================== + +Our event format is inspired in part by: + +* IMS Caliper +* xAPI/Tincan +* edX tracking log events + +None of these are _quite_ right for our application, but several are +close. They're pretty good standards! + +Limitations of industry formats +------------------------------- + +*Verbosity* Both Caliper and xAPI require a lot of cruft to be +appended to the events. For example, we have random ID GUIDs, URLs, +and all sorts of other redundancy on each event. Having things have +either *a little* bit of context (e.g. a header) or *a little* +rationale (e.g. IDs which point into a data store) is sometimes good, +but too much is a problem. With too much redundancy, events can get +massive: + +* Our speed in handling large data scales with data size. Megabytes + can be done instantly, gigabytes in minutes, and terabytes in + hours. Cutting data sizes makes working with data easier. +* Repeating oneself can lead to inconsistent data. Data formats where + data goes one place (or where redundancy is *intentional* and + *engineered* for data correction) is more robust and less bug-prone. + +*Envelopes* Caliper payloads are bundled in JSON envelopes. 
This is
+a horrible format since:
+
+* It results in a lot of additional parsing...
+* ... of very large JSON objects
+* If there's an error or incompatibility anywhere, you can easily lose
+  a whole block of data
+* You can't process events in realtime, for example, for formative
+  feedback
+
+Text files with one JSON event per line are more robust and more
+scalable:
+
+* They can be processed as a stream, without loading the whole file
+* Individual corrupt events don't break the entire pipeline -- you can
+  skip bad events
+* They can be streamed over a network
+* They can be preprocessed without decoding. For example, one can
+  filter a file for a particular type of event, student ID, or
+  otherwise with a plain text search. The primary goal of first-stage
+  preprocessing is simply to quickly cut down data size, so it doesn't
+  need to reject 100% of irrelevant events.
+
+*Details* In many cases, the details of a format are inappropriate for
+a given purpose. There are event types which are in neither
+Tincan/xAPI nor Caliper, and don't fit neatly into their
+frameworks. For example:
+
+* Formats specify timestamps with great precision, while coarse events
+  (such as a student graduating) don't maintain that precision.
+* In one of our clients, events are generated without a user
+  identifier, which is then added by the server once the user is
+  authenticated. For these events, validation fails.
+* Related to the above, fields are sometimes repeated (e.g. client-side
+  timestamp, server-side timestamp, and further timestamps as the event
+  is processed by downstream systems). Much of this redundancy exists
+  for security; downstream systems _should not_ trust data from
+  upstream systems. For example, a student shouldn't be able to fake
+  submitting a homework assignment earlier than they did, and a school
+  should not be able to backdate a state exam response.
+
+There are similar minor mismatches for e.g. group events, very frequent
+events (such as typing), and other types of events not fully
+anticipated when the standards were created.
+
+I'd like to emphasize that, in contrast to most industry formats, these
+are quite good. They're not fundamentally broken.
+
+How we'd like to leverage industry formats
+------------------------------------------
+
+Fortunately, we don't need 100% compatibility for pretty good
+interoperability. Our experience is that event formats are almost
+never interchangeable between systems; even with standardized formats,
+the meaning changes based on the pedagogical design. This level of
+compatibility is enough to give pretty good interoperability, without
+being constrained by details of these formats.
+
+Our goal is to be compatible where convenient. Pieces we'd like to
+borrow:
+
+* Critically, the universe has converged on events as JSON lines. This
+  already allows for common data pipelines.
+* We can borrow vocabulary -- verbs, nouns, and similar.
+* We can borrow field formats, where sensible.
+
+With this level of standardization, adapting to data differences is
+typically already less work than adapting to differences in underlying
+pedagogy.
+
+Where we are
+------------
+
+We have not yet done more careful engineering of our event
+format. Aside from a JSON-event-per-line, the above level of
+compatibility is mostly aspirational.
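+
+As an illustration of the JSON-event-per-line convention, a log can be
+filtered as a stream, skipping corrupt lines, without parsing most of the
+file. This is a minimal sketch in Python; the `google_docs_save` event type
+and the field layout are illustrative assumptions, not a spec:
+
+```python
+import json
+
+def google_docs_saves(path):
+    """Yield save-like events from a JSON-lines log, skipping bad lines."""
+    with open(path) as log:
+        for line in log:
+            # Cheap first-stage filter: a plain text search before JSON decoding.
+            # It doesn't need to reject 100% of irrelevant events.
+            if 'google_docs_save' not in line:
+                continue
+            try:
+                yield json.loads(line)
+            except json.JSONDecodeError:
+                continue  # One corrupt event shouldn't break the pipeline
+```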
\ No newline at end of file diff --git a/docs/extension.md b/docs/extension.md new file mode 100644 index 000000000..0b7c27ac4 --- /dev/null +++ b/docs/extension.md @@ -0,0 +1,41 @@ +# Writing Observer Extension + +This is an extension which collects data from the client. + +## Google Churn and Breakage + +1. This extension is already obsolete due to the transition from + [Manifest V2 to Manifest V3](https://developer.chrome.com/docs/extensions/mv3/mv2-sunset/). + Here, Google is apparently trying to cripple ad blockers, which + makes extensions like these much harder to write. For now, Manifest + V2 still works, but we will need to transition to worse code at + some point. [More info](https://www.eff.org/deeplinks/2021/12/chrome-users-beware-manifest-v3-deceitful-and-threatening) +2. Google changed + [rendering on the front-end](https://workspaceupdates.googleblog.com/2021/05/Google-Docs-Canvas-Based-Rendering-Update.html) + such that our code to grab text is broken. On the whole, this is + less harmful, since we never relied on this code path. We grabbed + visible on-screen text to have ground truth data for debugging how + we reconstruct documents. We can make due without that, but it'd be + nice to fix. In the design of the system, we did not count on this + to be stable. + +## System Design + +* `writing_common.js` has common utility functions +* `writing.js` sits on each page, and listens for keystrokes. It also + collects other data, such as document title, or commenting activity. + Only the keystroke logging is well-tested. This is sent onto + `background.js` +* `background.js` is once per browser, and maintains a websocket + connection to the server. It also listens for Google AJAX events + which contain document edits. + +The document edits are short snippets, which aggregate a small number +of keystrokes (e.g. a couple hundred milliseconds or typically 1-2 +keystrokes). These are our primary source of data. The keystroke +collection on each page is more precise (we have timestamps for each +keystroke), and helpful for some typing speed estimations, but +currently lacks a lot of context we would need to e.g. reconstruct +documents. + +Each file has more in-depth documentation. \ No newline at end of file diff --git a/docs/history.md b/docs/history.md new file mode 100644 index 000000000..5b3e89e97 --- /dev/null +++ b/docs/history.md @@ -0,0 +1,29 @@ +History +======= + +Second prototype +------- + +The second prototype integrated with Google Classroom, and presented a +(less pretty, more cluttered) consolidated view with: + +* Current student typing +* Time-on-task, idle time, and text length + +First prototype +------- + +Our first version of the tool was a UX mockup (with real front-end +code, but no backend). We had five tabs, of which two are shown. The +typing view showed a block of text around the student cursor in +real-time + +The outline view showed section titles, and how much text students had +written in each section. + +In addition, we had a view which showed summary stats (e.g. amount of +text written), contact info for students, as well as a visualization +of the students' writing process. Teachers wanted a streamlines view +which showed just text around the cursor and three of the summary +stats (amount of text written, idle time, and time-on-task). Most of +the other stuff felt like too much. 
diff --git a/INSTALL.md b/docs/install.md similarity index 97% rename from INSTALL.md rename to docs/install.md index 1bef1ef9f..0d3099820 100644 --- a/INSTALL.md +++ b/docs/install.md @@ -1,4 +1,4 @@ -# Formative Process for Writing - INstallation Instructions +# Formative Process for Writing - Installation Instructions Last updated: 16-JAN-2020 ## Install Chrome Extension diff --git a/docs/kafka_notes.md b/docs/kafka_notes.md new file mode 100644 index 000000000..db01e179e --- /dev/null +++ b/docs/kafka_notes.md @@ -0,0 +1,55 @@ +# Structure notes + +Table for students + +student | sessions | hashes +---|---|--- +example@email.com | \[kafka_topic_id_00, kafka_topic_id_23\] | \[hash_of_kafka_topic_id_00\] + +Create session + +* Create Kafka stream +* Add student to student table with session name (REDIS or KAFKA) + +Closing behavior + +* Socket gets closed + * Times out? + * Could we determine when the document is closed? +* Message sent to data server + * The number of requests between servers will be dramatically fewer than messages between client and WO + * Could use a simple REST API +* Data server will: + * Compute the Hash + * Read data from Kafka stream + * Compute `hash = hash(data)` from our own hash function + * SHA 256 with some salt + * Create a Merkle tree where each leaf is a log event from the session. We use the root hash instead + * Append `hash` to a student's `hashes` + * Backup data + * Determine file location, `f = str(hash[:2]/hash[2:])` or some other way of hashing + * Create file at `f` + * There is some note that says Linux handles a lot of directories better than a lot of files in WYAG + * Write data to `f` + +Things TODO: + +* Turn on Kafka +* Write some of our data to Kafka stream + * Make the naming scheme easier to understand, for testing + * Just run it localy and collect the data through yourself +* Create simple REST API + * Post command to student + * Compute the hash, pull data, and store it + +How do we create the Merkle tree? + +* What is the purpose of the Merkle Tree? + * Where does it fit in? + * How will it be used? + * What is the purpose of the DAG portion? + +* Is it autostored in a merkle tree? i.e. the directory structure of our store is a merkle tree +* How are Merkle tree being used here? + * Do we store each log event as a Merkle Tree? Why? + * Do we store each log file as a Merkle Tree? Why? diff --git a/docs/lo_block.png b/docs/lo_block.png new file mode 100755 index 000000000..cb5bb92c4 Binary files /dev/null and b/docs/lo_block.png differ diff --git a/docs/lo_block.svg b/docs/lo_block.svg new file mode 100755 index 000000000..df6dd3236 Binary files /dev/null and b/docs/lo_block.svg differ diff --git a/docs/messaging.md b/docs/messaging.md new file mode 100644 index 000000000..ada9d6e69 --- /dev/null +++ b/docs/messaging.md @@ -0,0 +1,25 @@ +Messaging Technologies +====================== + +We may need to route a lot of messages. The best protocol is +XMPP. [ejabberd](https://www.ejabberd.im/) is super-scalable, but +requires a lot maintance. [prosody](https://prosody.im/) is +(relatively) quick and easy. + +There are a lot of clients for Python. We did an eval of a few. If I +recall, we tried [xmpppy](https://github.com/xmpppy/xmpppy), +[SleekXMPP](http://sleekxmpp.com/), and eventually settled on +[Slixmpp](https://slixmpp.readthedocs.io/en/latest/). + +A step up in simplicty are AWS hosted services like SQS/SNS; those +have proprietary lock-in, and for our use-case, rather high pricing. 
+
+Another step up in simplicity are the pub-subs built into redis and
+postgresql. These work fine in moderate-size installs, but it's not
+clear these would scale to where we hope this system goes.
+
+For development, we support in-memory pubsub.
+
+In the current use-case (classroom dashboards), polling beats
+pub-sub. Students generate many events per second, and teachers need
+updates perhaps every few hundred milliseconds at most.
\ No newline at end of file
diff --git a/docs/mmnd.png b/docs/mmnd.png
new file mode 100755
index 000000000..360459dcd
Binary files /dev/null and b/docs/mmnd.png differ
diff --git a/docs/mmnd.svg b/docs/mmnd.svg
new file mode 100755
index 000000000..24b0a6d73
Binary files /dev/null and b/docs/mmnd.svg differ
diff --git a/docs/ncsu_setup.md b/docs/ncsu_setup.md
new file mode 100644
index 000000000..79725bccd
--- /dev/null
+++ b/docs/ncsu_setup.md
@@ -0,0 +1,304 @@
+# NCSU system setup guide
+# ==================================================
+
+Currently the system is set up for use with RHEL 8 on the NCSU systems. We are running with Python 3.9 and connected to the AWEWorkbench code. This assumes that we are also installing it into a VM with those tools installed as packages. An installation script has been added to the servermanagement directory.
+
+
+Installation on RHEL 8 requires:
+
+- python 3.9 or 3.10 (3.9 is still the default).
+- redis.x86_64 5.0.3-5.module+el8.4.0+12927+b9845322 @rhel-8-for-x86_64-appstream-rpms
+- redis-devel.x86_64 5.0.3-5.module+el8.4.0+12927+b9845322 @rhel-8-for-x86_64-appstream-rpms
+
+
+
+
+# Older RHEL 7 Notes.
+# ==================================================
+
+
+The following is a guide to help with the installation of Learning Observer (LO) on NCSU systems.
+
+This guide assumes you are using an RHEL system.
+Additionally, depending on where on the system you place the repository, you may need to run all commands as a sudo user.
+
+## Requirements
+
+LO is confirmed to work on `Python 3.8`.
+Along with the base install of Python, LO requires the Python developer tools.
+These can be installed with the following commands:
+
+```bash
+sudo yum install rh-python38     # base python
+sudo yum install python38-devel  # developer tools for python 3.8
+```
+
+On RHEL 7, `python38-devel` is no longer recognized as a package.
+To properly fetch the developer tools, use the following:
+
+```bash
+sudo subscription-manager repos --enable rhel-7-server-optional-rpms --enable rhel-server-rhscl-7-rpms
+sudo yum install rh-python38-python-devel.x86_64
+```
+
+The Python installation should be located at `/opt/rh/rh-python38`.
+Note this location for future sections.
+
+There is a chance you'll encounter an issue when installing the requirements, specifically `py-bcrypt`.
+The developer tools do not show up in exactly the right place, so we need to create a soft symbolic link between the correct location and where they are located.
+To create this link, use the following:
+
+```bash
+cd /opt/rh/rh-python38/root
+sudo ln -s usr/include/ .  # check that Python.h exists in usr/include/python3.8/Python.h
+```
+
+Note, we are creating a link between the subdirectory `/opt/rh/rh-python38/root/usr/include` and `/opt/rh/rh-python38/root`.
+Using `/usr/include` will result in the incorrect link.
+
+## Install
+
+### Virtual Environment
+
+To make sure we are using the proper installation of Python, we will use a virtual environment.
+To do this, run the following command: + +```bash +/path/to/python3.8/ -m venv /path/of/desired/virtual/environment +``` + +Again, keep note of where the virtual environment is located for future steps. + +### Config files + +For each system, you'll need to create a new `creds.yaml` file within the `/path/to/repo/learning_observer` directory. +This file defines what type of connections are allowed to be made to the system. +Luckily, there is an example file you can copy located in the `/path/to/repo/learning_observer/learning_observer` directory. +When attempting to run the system later on in this setup guide, if you have any misconfigured here, then the system will tell you what's wrong. + +Some of the main changes that need to be made are: + +1. types of `auth` allowed, for simple setup, just remove the `google` child and all its subchildren +1. `aio` session secret and max age +1. `event_auth` to allow access from various locations (like Chromebooks) +1. `server` for reconfiguring the port information +1. `config:logging` for determining the `max_size` (in bytes) of each log file and total `backups` to keep around before rotating. + +More configurables are expected to be included in this config file in the future. + +### Package installation + +Before we get started installing packages, we must ensure that the `pip` in our virtual environment is up to date. +Some of the packages located in the `requirements.txt` file require `wheel` to be installed first. +After the base requirements are installed, we will also need to install the local packages (the Writing Observer module and the Learning Observer module). +To handle all the installs, use the following: + +```bash +cd writing_observer # cd into the top level of the repository +/path/to/venv/bin/pip install --upgrade pip # upgrade pip +/path/to/venv/bin/pip install wheel # install wheel +/path/to/venv/bin/pip install -r requirements.txt # install package requirements +/path/to/venv/bin/pip install -e learning_observer/ # install learning observer module +/path/to/venv/bin/pip install -e modules/writing_observer/ # install writing observer module +``` + +### Needed directories + +When installing Learning Observer for the first time, we need to create a few directories. +Use the following commands: + +```bash +mkdir /path/to/repo/learning_observer/learning_observer/static_data/course_lists +mkdir /path/to/repo/learning_observer/learning_observer/static_data/course_rosters +mkdir /path/to/repo/learning_observer/learning_observer/logs +mkdir /path/to/repo/learning_observer/learning_observer/logs/startup +``` + +### Proxy server + +By default, LO runs on port 8888. +Configure nginx, or another proxy server, for LO's port. + +### Executable files + +If this is the first time you are running the server on your system, you might need to make the shell scripts in the `servermanagement` directory executable. +To do this, use the following commands + +```bash +chmod +x /path/to/repo/servermanagement/RunLearningObserver.sh +chmod +x /path/to/repo/servermanagement/BackupWebSocketLogs.sh +``` + +## System specific changes + +There are various lines of code that point to specific servers. +For each setup, we need to make sure these are pointing to the proper place. + +### Server + +#### Auth information + +On the server, we need to point the redirect uri to the server we are working with. +Depending on how the credentials files was handled, this change may not be necessary to get the system running. +The redirect uri is used with the Google login. 
+If that is not used, then this step is not needed. +This is located in `/path/to/repo/learning_observer/learning_observer/auth/social_sso.py`. + +#### Server management + +Additionally, we need to set up the server management files in the `/path/to/repo/servermanagement` direcotry. + +In the `RunLearningObserver.sh` file, you'll want to set the system variables to match the current system. + +```bash +VIRTUALENV_PYTHON="/full/path/to/venv/bin/pip" +LEARNING_OBSERVER_LOC="/full/path/to/repo/learning_observer" +LOGFILE_DEST="/path/to/log/storage" +``` + +In the `BackupWebsocketLogs.sh` file, you'll want to set log directory to the same place as you set in `RunLearningObserver.sh` and set where the logs should be backed up. + +```bash +LOGFILE_SRC="/path/to/log/storage" +LOGFILE_DEST="/path/to/log/backups" +``` + +### Client + +On the clientside, we need to add the correct server to the `websocket_logger()` method in the `/path/to/repo/extension/extension/background.js` file. +If the server has SSL enabled, then the address we add should start with `wss://`. +If SSL is not enabled, then the address should start with `ws://`. +If a proxy server is not setup yet, make sure to include the port number (default 8888) on the end of the address. +An example of each instance is shown below: + +```js +websocket_logger("wss://writing.csc.ncsu.edu/wsapi/in/") // SSL enabled, nginx set +websocket_logger("ws://writing.csc.ncsu.edu:8888/wsapi/in/") // SSL not enabled, nginx not setup +``` + +## Running the server + +There are 2 different ways we can run the system. +One is better for debugging, whereas the other is best for when you want to run the server and leave it up. +We suggest completely testing the installation with the debugging steps first. + +### For debugging + +To run the system for debugging, we will just run the Learning Observer module. +This will output all the log information to the console. +To do this, use the following command: + +```bash +/path/to/venv/bin/python /learning_observer/learning_observer/ # run the learning observer module from within the learning observer directory +``` + +You should see any errors printed directly to the console. + +### As a server + +To run the system as a server, we will run the `RunLearningObserver.sh` script. +This fetches the virtual environment, runs the server, and pipes files into the proper log location we setup during the **System specific changes** section. +Run the following commands: + +```bash +./servermanagement/RunLearningObserver.sh +``` + +Check the logs for any errors. + +## Connecting the client + +The client is run through a Google Chrome extension. +To properly use the client, you must sign into Chrome and use the same account to access to Google Docs. + +From there, navigate to the extension manage located in settings. +Turn on Developer Mode (top right), then click the `Load Unpacked` button. +This opens a file explorer, where you should locate the repository. +More specifically, select the `writing_observer/extension/extension` directory. +This will unpack the extension and make it available for use in Google Chrome. + +To make sure it is working, click on the `background page` link on extension card from within the extension manager. +This opens an inspect window. +On this window, select the `Console` tab. +Next, open a Google doc and start typing. +You should see events within the console. +Ensure there are no logs sprinkled in. 
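+
+To double-check on the server side, you can watch the main event log for new
+JSON lines while typing. The path below is an assumption based on the log
+directory created earlier and the default logging settings in `creds.yaml`;
+adjust it to your installation:
+
+```bash
+tail -f /path/to/repo/learning_observer/learning_observer/logs/event_logger.json
+```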
+
+## Backing up logs
+
+Whenever a websocket connection is made, the server creates a new log file for that connection on top of the primary log files.
+We need to back up both the generic log files as well as all the websocket-specific logs.
+
+### General logs
+
+The main logger for events is located in `event_logger.json`.
+This is automatically backed up via the built-in Python logging module.
+The settings for this file are handled via the `creds.yaml` file that you previously set up.
+Simply changing the values and restarting the server will update the logging process.
+
+### Websocket logs
+
+The websocket logs take a little more setting up.
+We will set up a daily `cron` job to run a backup script, `/path/to/repo/servermanagement/BackupWebsocketLogs.sh`.
+The backup script will search the log directory for any logs that match the websocket pattern and were last modified in the last **60 minutes**.
+Next, the backup script will remove any files that match the pattern and were modified in the last **120 minutes**.
+
+To set up the cron job, we first enter the crontab utility, then add a line for the backup script.
+
+```bash
+crontab -e  # open the cron job menu
+
+0 1 * * * /usr/bin/sh /full/path/to/repo/servermanagement/BackupWebsocketLogs.sh  # line to add to the cronjob
+# Run it daily at 1:00 AM (minute 0, hour 1, every day of every month)
+```
+
+
+
+
+# Usage Notes
+# =================================================================
+# Instructions for Configuring Writing Observer on RHEL Installations
+### Install Global Dependencies
+1. sudo yum install redis
+2. sudo yum install git
+3. sudo yum install nginx
+
+## Install Required RH Python 3.8
+4. sudo subscription-manager repos --enable rhel-7-server-optional-rpms \
+   --enable rhel-server-rhscl-7-rpms
+5. sudo yum -y install @development
+6. sudo yum -y install rh-python38
+
+* rh-python38 dev tools are also required
+
+## Setup RH Python 38 and Virtual Envs
+7. scl enable rh-python38 bash
+8. python --version
+* The output should indicate that python 3.8 is active
+9. sudo pip install virtualenvwrapper
+10. sudo source `/opt/rh/rh-python38/root/usr/local/bin/virtualenvwrapper.sh`
+
+## Install Local Dependencies
+11. sudo git clone `https://github.com/ArgLab/writing_observer`
+12. cd writing_observer
+13. make install
+14. sudo mkvirtualenv learning_observer
+15. pip install -r requirements.txt
+16. cd learning_observer
+17. python setup.py develop
+18. python learning_observer
+
+* At this point, follow the system's further instructions until the process runs on port 8888 by default
+
+## Server Setup
+19. Populate creds.yaml with required Google Cloud Parameters
+20. Configure nginx on `port 80` as a proxy for Learning Observer on `port 8888`
+21. Replace all instances of `writing.csc.ncsu.edu` with custom server address in all files in directory `~/writing_observer/learning_observer/learning_observer/auth`
+
+## Client/Extension Setup
+22. Replace all instances of `writing.csc.ncsu.edu` with custom server address in `~/writing_observer/extension/background.js`
+* If SSL is not enabled for the server, all websocket protocols should begin with `ws://` as opposed to `wss://`
+23. Open Chrome and navigate to `chrome://extensions`
+24. Click on "Load Unpacked". Select `~/writing_observer/extensions` and ensure that it is enabled
+25. Select `background page` on the extension section and ensure no errors are present
+26.
Open a Google Docs document while signed into Chrome and ensure websocket communication between client and server is active \ No newline at end of file diff --git a/docs/privacy.md b/docs/privacy.md new file mode 100644 index 000000000..e428d3024 --- /dev/null +++ b/docs/privacy.md @@ -0,0 +1,291 @@ +# Draft Thoughts on Privacy +## Piotr Mitros + +**Disclaimer:** This is a work-in-progress. It is a draft for +discussion. It should not be confused with a legal document (policy, +contract, license, or otherwise). It has no legal weight. As of the +time of this disclaimer, has not been reviewed by anyone other than +Piotr Mitros, who does not speak for either ETS or anyone else. I'm +soliciting feedback fron collaborators (hence, it is here), but it +shouldn't be confused with any sort of policy (yet). It's a working +document to figure stuff out. + +This was written when we were sprinting to respond to COVID19 remote +learning in spring 2020. + +## Introduction + +It is our goal to treat educational data with a hybrid model between +that of data as +[public good](https://en.wikipedia.org/wiki/Public_good_(economics)) +and that of personal data belonging to the individuals to whom the data +pertains. Specifically, we would like to balance the demands of: + +* Family rights and student privacy; +* Transparency and the use of data for advancement of public policy + and education; and +* Scientific integrity and replicability + +This approach contrasts with the current industry trend of treating +student data as a proprietary corporate resource for the maximization +of profit. + +These are fundamental tensions between the demands on any student data +framework. For example, family rights and student privacy suggest that +we should remove student data when asked. On the other hand, +scientific replicability suggests that we maintain data which went +into experiments so people can independently confirm research results. + +Building out both the technology and legal frameworks to do this will +take more time than possible for a pilot project. Until we have built +out such frameworks, student data should be governed by a standard +research Privacy framework, along the lines of what's outlined below. + +If and when appropriate frameworks are available, we hope to +transition and extended research privacy framework described +below. Our thoughts was that we would define a set of guiding +principles and boundaryies right now. If we can find a way to respect +those (build out computer code, legal code, and funding), we would +transition over to this, notifing schools and/or families, giving an +opportunity to opt-out. Should we be unable to implement this +framework within five years, or should we decide to build a different +privacy framework, student data will move over only on an opt-in basis. + +## Standard Research Privacy Framework + +In the short term, this dashboard is intended to address immediate +needs related to COVID19. During the sprint to bring this dashboard to +a pilot, we cannot build out the legal and technical frameworks for +student data management and family rights (e.g. to inspect and erase +data). We would initially use a simple, conservative data policy: + +* Until and unless we have the framework described below (“Extended + Framework”) in place, all student data will be destroyed at most + five years after it was collected. +* The data will live on secure machines controlled by the research + team (currently, ETS and NCSU). 
+* For the purposes of this project, we can and will share student data
+  with the student's school. Beyond the school, the parents, and the
+  student, we would not share student data with anyone outside of the
+  research team, except as required by law (e.g. in the case of
+  subpoenas or law enforcement warrants).
+* We may perform research and publish based on such data, but only to
+  the extent that any published results are aggregated to such a level
+  that it is impossible to re-identify students.
+
+## Extended Research Privacy Framework
+
+In order to keep data beyond the five-year window, we would have
+technological and organizational frameworks to provide for:
+
+1. The right of guardians (defined below) to inspect all student data.
+2. The right of guardians to have student data removed upon request.
+3. The right of guardians to understand how data is used, both at
+   a high level and at a code level.
+4. Reasonable and non-discriminatory access to deidentified data with
+   sufficient protections to preserve privacy (for example, for
+   purposes such as research on how students learn or policy-making).
+5. Transparent and open governance of such data.
+6. Checks-and-balances to ensure data is managed primarily for the
+   purpose of helping students and student learning (as opposed to
+   e.g. as a proprietary corporate resource).
+7. An opportunity for guardians to review these frameworks, and to
+   opt-out if they choose.
+8. Review by the ETS IRB.
+
+Helping students is broadly defined, and includes, for example:
+
+1. Driving interventions for individual students (for example,
+   student and teacher dashboards).
+2. Allowing interoperability of student records (for example, if a
+   student completes an assignment in one system, allowing another
+   system to know about it).
+3. Research for the purpose of school improvements (for example,
+   providing for insights about how students learn, or how different
+   school systems compare, in ways analogous to state exams, NAEP, or
+   PISA).
+
+It does not include advertising or commercial sale of data (although
+it does include basic cost recovery, for example on a cost-plus
+basis).
+
+Depending on context, 'guardian' may refer to:
+
+1. The student who generated the data;
+2. The student's legal parent/guardian; or
+3. The student's school / educational institution (for example, acting
+   as the parent/guardian's agent, as per COPPA).
+
+We would reserve the right to make the determination of who acts as
+the guardian at our own judgement, based on the context.
+
+## Any other changes
+
+Any changes to the privacy policy which do not follow all of the above
+would require affirmative **opt-in** by the guardian.
+
+## Rationale and Discussion
+
+To help contextualize and interpret the above policies.
+
+### Definitions of deidentification, anonymization, and aggregation
+
+* Student data is **deidentified** by removing obvious identifiers,
+  such as names, student ID numbers, or social security numbers. Note
+  that deidentified learning data can often be reidentified through
+  sophisticated algorithms, for example comparing writing style,
+  skills, typing patterns, etc., often correlating with other
+  sources. Although such techniques are complex, they tend to be
+  available as automated tools once discovered.
+
+* Student data is **anonymized** by removing any data from which a
+  student may be reidentified. Anonymization involves sophisticated
+  techniques, such as the use of protocols like k-anonymity /
+  l-diversity, or maintaining privacy budgets.
+
+* Student data may be **aggregated** by providing statistics about
+  students, for example in the form of averages and standard
+  deviations. Some care must still be taken that those
+  aggregations cannot be combined to make deductions about individual
+  students.
+
+For learning data, simple deidentification **cannot** be counted on to
+provide security. With data of any depth, it is possible to
+re-identify students. However, such obfuscation of obvious identifiers
+can still significantly reduce risk in some contexts, since it prevents
+casual, careless errors (such as a researcher accidentally including
+the name of a student in a paper, or chancing upon someone they know
+in a dataset). With obfuscated identifiers, re-identifying students
+generally requires affirmative effort.
+
+### Scientific integrity and open science
+
+Over the past few decades, there have been significant changes in
+scientific methodology. Two key issues include:
+
+* **The ability to replicate results.** When a paper is published,
+  scientists need access to both data and methods (source code) to be
+  able to confirm results.
+
+* **Understanding the history of research.** Confidence in results
+  depends not just on the final data and its analysis, but the steps
+  taken to get there. Scientists need to understand steps taken on
+  data prior to final publication.
+
+These suggest maintaining a log of all analyses performed on the data
+(which in turn suggests open source code).
+
+### Educational transparency
+
+Historically, the general public has had a lot of access to
+educational information:
+
+* PPRA provides for families to have access to school curricular
+  materials.
+* FERPA provides for families to have access to student records, as
+  well as the ability to correct errors in such records.
+* Public records laws (FOIA and state equivalents) provide for
+  access to substantially everything which does not impact
+  student privacy or the integrity of assessments.
+* In Europe, GDPR provides for people to have the right to
+  inspect their data, to understand how it is processed, and
+  to have data removed.
+
+While FERPA, PPRA, and FOIA were drafted in the seventies (with only
+modest reforms since) and do not apply cleanly in digital settings,
+the spirit of these laws was grounded in a philosophy that the general
+public ought to be able to understand school systems. State exams,
+NAEP, PISA, and similar exams were likewise created to provide for
+transparency.
+
+This level of transparency has led to improvements to both the
+learning experiences of individual students and to school systems as a
+whole, by enabling academic researchers, parent advocates,
+policymakers, journalists, and others to understand our schools.
+
+One of our goals is to translate and to reassert these rights as
+education moves into the digital era. With digital learning materials,
+in many cases, parents, researchers, and others have lost the ability
+to meaningfully inspect student records (which are often treated as
+corporate property) or curricular materials (which sit on corporate
+servers). Increasingly, students' lives are governed by machine
+models, to which families have no access.
+
+Again, this dictates that analysis algorithms (including ML models
+where possible without violating student privacy) ought to be open to
+inspection, both at a high level (human-friendly descriptions) as well
+as at a detailed level (source code).
In addition, there ought to be a
+way to analyze student data, to the extent this is possible without
+violating student privacy.
+
+### Guardianship and Proxy
+
+Guardianship is a complex question, and hinges on several issues:
+
+* Age. For example, young elementary school students are unlikely to
+  be able to make sense of educational data, or the complex issues
+  therein. High school students may be able to explore such issues in
+  greater depth, but may have limited legal rights as minors.
+
+* Identity. Releasing data to an unauthorized party carries high
+  risk. Robust online identity verification is potentially expensive
+  and / or brittle. Working through institutions with whom we have
+  relationships, and who in turn have relationships with students and
+  families, can mitigate that risk.
+
+* COPPA provides for
+  [schools to act on behalf of parents](https://www.ftc.gov/tips-advice/business-center/guidance/complying-coppa-frequently-asked-questions#Schools).
+  First, schools frequently have legal resources and expertise (either
+  acting individually or in consortia) which parents lack. Second,
+  reviewing the number of technologies typical students interact with
+  would be overwhelming to parents.
+
+However, ultimately, there is a strong argument that access ought to
+rest as close to the individual as possible. Where schools act as
+agents for families, and parents for students, there is a growing
+level of security and competence. On the other hand, there is also a
+growing level of trust required that those parties are acting in the
+best interests of those they are representing. It is incumbent on us,
+at all levels, to ensure we have appropriate transparency, incentive
+structures, and organizational structures to guarantee that proxies do
+act for stakeholder benefit, and to balance these based on the
+context.
+
+### Minimizing security perimeter
+
+Even when all parties act in good faith, broad data sharing exposes
+students to high levels of risk of data compromises, whether through
+deliberate attacks, disgruntled employees, or human error.
+
+### Models for data access
+
+In light of the above constraints, several models for data access have
+emerged which both allow for transparency and protect student
+privacy.
+
+* In the FSDRC model, deidentified (but not anonymized) data would be
+  kept in a physically-secure facility. People could visit the
+  facility and crunch data within the facility. Visitors would be
+  under contractual bounds, and subject to physical security, not to
+  remove data, except for sufficiently aggregated results so as to
+  make reidentification impossible. Access is provided on a cost-plus
+  basis.
+
+* People can develop algorithms on synthetic data, and upload
+  algorithms to a data center, where those algorithms run on student
+  data. Both code and data are inspected prior to releasing results,
+  again, on a cost-plus basis.
+
+* Corporations can run real-time algorithms (such as to drive learning
+  dashboards) in a data center on a cost-plus basis. Educational
+  applications can work on shared models of student expertise, without
+  providing access to student data to the organizations which
+  developed them.
+
+* If a student (or proxy thereof) asks to have data removed, that
+  data is removed within some timeframe. However, for scientific
+  replicability, there is a before-and-after snapshot of how study
+  results changed when student data was removed. Note that this has
+  implications for both performance and re-identification.
+
+...
to be continued \ No newline at end of file diff --git a/docs/system_design.md b/docs/system_design.md new file mode 100644 index 000000000..7387b2715 --- /dev/null +++ b/docs/system_design.md @@ -0,0 +1,55 @@ +Learning Observer System Design +------------------------------- +Piotr Mitros. 2021-07-11 + +This lays out the system design, as planned. This design does not +fully reflect the current implementation, yet. + +Our goal is to build a system which will: + +* Take in process data from a diversity of sources + +* Perform real-time processing on that data, in order to support + teachers in real-time. This is done through a series of pluggable + analytics modules. + +* Archive that data for research purposes and archival analysis + +* Provide open science tools to log such analyses + +In other words: + +![](block.png) + +Internally, the system takes a stream of events from each learner, and +routes it to one or more analytics modules. Each of these modules +performs a `reduce` operation over that stream in realtime. The +reduced state is stored in a KVS (currently `redis`, although this is +pluggable). These modules run as asynchronous Python co-routines, +which makes them quite scalable. We ought to be able to handle large +numbers of simultanious connections. + +Each time an instructor connects, periodically, such data is +aggregated from redis, and sent back to the instructor. This would be +a logical place to be more clever about scaling; ideally, we'd cycle +through instructors for such an aggregation, and only aggregate where +data has changed, so that with large numbers of instructors, the +system merely updates dashboards less quickly: + +![](lo_block.png) + +Although at present, reduce operations are per-student, and +aggregations per-class, in the future, we envision: + +* Other ways to shard (e.g. per-resource, per-document, etc.). +* Being able to cascade events, either by generating new events, or in + much the same way as we handle the current aggregation +* Potentially, being more clever about routing the same student to a + common process each time. Right now, we're connected per-session, + but we may have concurrency issues if a student connects twice. + +Data will be stored in a git-like Merkle tree format: + +![](mmnd.png) + +We'll document this in more detail later. \ No newline at end of file diff --git a/docs/technologies.md b/docs/technologies.md new file mode 100644 index 000000000..5e5e7f0a9 --- /dev/null +++ b/docs/technologies.md @@ -0,0 +1,58 @@ +# Technologies in the _Learning Observer_ + +Several potential contributors have asked for a list of technologies +needed to be productive helping developing the *Learning Observer* or +modules for the *Learning Observer*. A short list: + +* We use [Python](https://www.python.org/) on the server side, and + JavaScript on the client side. We do rely on current Python (dev + systems are 3.8 or 3.9 as of this writing). +* We use [D3](https://d3js.org/) for displaying data in real-time + on the client, and otherwise, as a front-end framework. D3 is a + relatively small and simple library with a fairly steep learning + curve (in much the same way as Go is a small and simple game). We + recommend going through any short tutorial _before_ doing any + front-end work to get a feel for it. We don't recommend a _long_ + tutorial; beyond that, it's best to learn in-context. 
+* Since we're managing large numbers of web socket connections, we + make heavy use of [asynchronous + Python](https://docs.python.org/3/library/asyncio.html), and our web + framework is [aiohttp](https://docs.aiohttp.org/en/stable/). If you + haven't done async programming before, there is deep theory behind + it. However, we again recommend any short tutorial for aiohttp, and + then learning in context. +* We make heavy use of `git`, as well as of data structures which are + `git`-like. I recommend reading [Git + Internals](https://git-scm.com/book/en/v2/Git-Internals-Plumbing-and-Porcelain) + and following [Write Yourself a Git](https://wyag.thb.lt/) +* Our CSS framework is [Bulma](https://bulma.io/) +* Our icon library is [Font Awesome](https://fontawesome.com/) +* For rapid prototyping, we use [P5.js](https://p5js.org/), although + we hope to avoid this beyond the prototype phase. This is super-easy + to learn (even for little kids), and super-fast to develop in. It + doesn't do to production-grade software, though (responsive, i18n, + a11y, testability, etc.). The best way to learn this is by helping a + child do the Khan Academy JavaScript courses :) +* Our web server is [nginx](https://nginx.org/en/), but that's easy to + change. +* Our dev-ops framework is home baked, but uses [boto](http://boto.cloudhackers.com/), [invoke](https://www.pyinvoke.org/), [Fabric](https://www.fabfile.org/), and a + little bit of [ansible](https://docs.ansible.com/ansible/latest/dev_guide/developing_python_3.html). +* We recommend Debian/Ubuntu, but run on Fedora/Red Hat. We'd like to + run on Mac and Windows someday too. +* At some point, we do plan do add [postgresql](https://www.postgresql.org/). +* For a while, when we thought we'd need queues, we used an XMPP + server. I don't think we need queues, but if we do, it will come + back. + +For grad students, interns, student volunteers, and other contributors +who are here primarily to learn: One of the fun things here is that +most of these are _deeply interesting tools_ with a strong theoretical +basis in their design. + +On the whole, our goal is to keep a *small set of dependencies*. To +add a new tool to the system, it will need to do something +_substantially_ different than what's in the system already. We do +plan on adding Postgresql once needed, but not too much beyond that. + +Note that some modules within the system (including the _Writing +Observer_) do have more extensive dependencies. \ No newline at end of file diff --git a/docs/usagenotes.md b/docs/usagenotes.md new file mode 100644 index 000000000..fe1d8305b --- /dev/null +++ b/docs/usagenotes.md @@ -0,0 +1,8 @@ +## Usage Notes. + +For the moment this doc will act as some shared working knowledge for the use of the system with students. + + +# Extension Security. + +NOTE: The current working version of the chrome extension requires that the user is logged into both a Google account (which is registered to the classroom) *and* Chrome. This requirement stems from the security model used by the extension. On Chromebook devices this is the default behavior. In future work other devices will require a change. \ No newline at end of file diff --git a/extension/background.js b/extension/background.js deleted file mode 100644 index 3c2f476c9..000000000 --- a/extension/background.js +++ /dev/null @@ -1,212 +0,0 @@ -/* -Background script. This works across all of Google Chrome. 
-*/ - - -var event_queue = []; - -/* To avoid race conditions, we keep track of events we've successfully sent */ -var sent_events = new Set(); - -var webSocket = null; - -//var WRITINGJS_AJAX_SERVER = "https://writing.hopto.org/webapi/"; -//var WRITINGJS_WSS_SERVER = "https://writing.hopto.org/webapi/"; - -var WRITINGJS_AJAX_SERVER = null; -var EXPERIMENTAL_WEBSOCKET = false; - -/* - FSM - - +----------------------+ - | Load server settings | - | from chrome.storage | - +----------------------+ - | - v - +-------------------+ - | Connect to server | - +-------------------+ - - Load events queue - from chrome.storage - - -Dequeue events -*/ - -function dequeue_events() { - // If we have not yet initialized, we rely on the queue to be - // flushed once we are initialized. - if(!WRITINGJS_AJAX_SERVER) { - return - } - while(event_queue.length > 0) { - writingjs_ajax(event_queue.shift()); - } - /* - if(EXPERIMENTAL_WEBSOCKET) { - if((webSocket == null) || (webSocket.readyState != 1) ) { - window.setTimeout(reset_websocket, 1000); - return; - } - var event = event_queue.shift(); - webSocket.send(JSON.stringify(event)); - }*/ -} - - -function writingjs_ajax(data) { - /* - Helper function to send a logging AJAX request to the server. - This function takes a JSON dictionary of data. - - TODO: Convert to a queue for offline operation using Chrome - Storage API? Cache to Chrome Storage? Chrome Storage doesn't - support meaningful concurrency, - */ - - httpRequest = new XMLHttpRequest(); - //httpRequest.withCredentials = true; - httpRequest.open("POST", WRITINGJS_AJAX_SERVER); - httpRequest.send(JSON.stringify(data)); -} - -function enqueue_event(event) { - event_queue.push(event); - dequeue_events(); -} - -function send_chrome_identity() { - /* - We sometimes may want to log the user's identity, as stored in - Google Chrome. Note that this is not secure; we need oauth to do - that. oauth can be distracting in that (at least in the workflow - we used), it requires the user to confirm permissions. - - Perhaps want to do oauth exactly once per device, and use a - unique token stored as a cookie or in browser.storage? - - Note this function is untested, following a refactor. - */ - chrome.identity.getProfileInfo(function(userInfo) { - enqueue_event("chrome_identity", {"email": userInfo.email, - "id": userInfo.id - }); - }); -} - -function reset_websocket() { - if((webSocket == null) || (webSocket.readyState != 1) ) { - webSocket = new WebSocket("wss://writing.hopto.org/wsapi/"); - webSocket.onopen = dequeue_events; - } -} - -function this_a_google_docs_save(request) { - /* - Check if this is a Google Docs save request. Return true for something like: - https://docs.google.com/document/d/1lt_lSfEM9jd7Ga6uzENS_s8ZajcxpE0cKuzXbDoBoyU/save?id=dfhjklhsjklsdhjklsdhjksdhkjlsdhkjsdhsdkjlhsd&sid=dhsjklhsdjkhsdas&vc=2&c=2&w=2&smv=2&token=lasjklhasjkhsajkhsajkhasjkashjkasaajhsjkashsajksas&includes_info_params=true - And false otherwise - */ - if(request.url.match(/.*:\/\/docs\.google\.com\/document\/d\/([^\/]*)\/save/i)) { - return true; - } - return false; -} - -var RAW_DEBUG = false; // Do not save debug requests. We flip this frequently. Perhaps this should be a cookie or browser.storage? - -// Figure out the system settings. Note this is asynchronous, so we -// chain dequeue_events when this is done. 
-chrome.storage.sync.get(['process-server'], function(result) { - //WRITINGJS_AJAX_SERVER = result['process-server']; - if(!WRITINGJS_AJAX_SERVER) { - WRITINGJS_AJAX_SERVER = "https://writing.hopto.org/webapi/"; - } - dequeue_events(); -}); - -// Listen for the keystroke messages from the page script and forward to the server. -chrome.runtime.onMessage.addListener( - function(request, sender, sendResponse) { - chrome.extension.getBackgroundPage().console.log("Got message"); - chrome.extension.getBackgroundPage().console.log(request); - enqueue_event(request); - } -); - -// Listen for web loads, and forward relevant ones (e.g. saves) to the server. -chrome.webRequest.onBeforeRequest.addListener( - /* - This allows us to log web requests. There are two types of web requests: - * Ones we understand (SEMANTIC) - * Ones we don't (RAW/DEBUG) - - There is an open question as to how we ought to handle RAW/DEBUG - events. We will reduce potential issues around collecting data - we don't want (privacy, storage, bandwidth) if we silently drop - these. On the other hand, we significantly increase risk of - losing user data should Google ever change their web API. If we - log everything, we have good odds of being able to - reverse-engineer the new API, and reconstruct what happened. - - Our current strategy is to: - * Log the former requests in a clean way, extracting the data we - want - * Have a flag to log the debug requests (which includes the - unparsed version of events we want). - We should step through and see how this code manages failures, - - For development purposes, both modes of operation are - helpful. Having these is nice for reverse-engineering, - especially new pages. They do inject a lot of noise, though, and - from there, being able to easily ignore these is nice. - */ - function(request) { - chrome.extension.getBackgroundPage().console.log("Web request:"+request.url); - var formdata = {}; - if(request.requestBody) { - formdata = request.requestBody.formData; - } - if(!formdata) { - formdata = {}; - } - if(RAW_DEBUG) { - enqueue_event({ - 'event_type': 'raw_request', - 'url': request.url, - 'form_data': formdata - }); - } - if(this_a_google_docs_save(request)){ - chrome.extension.getBackgroundPage().console.log("Google Docs bundles "+request.url); - console.log(formdata.bundles); - event = { - 'event_type': 'google_docs_save', - 'doc_id': googledocs_id_from_url(request.url), - 'bundles': JSON.parse(formdata.bundles), - 'rev': formdata.rev, - 'timestamp': parseInt(request.timeStamp, 10) - }; - chrome.extension.getBackgroundPage().console.log(event); - enqueue_event(event); - } else { - chrome.extension.getBackgroundPage().console.log("Not a save: "+request.url); - } - }, - { urls: ["*://docs.google.com/*"/*, "*://mail.google.com/*"*/] }, - ['requestBody'] -) - -// Let the server know we've loaded. -enqueue_event({"event": "extension_loaded"}); - -// Send the server the user info. This might not always be available. 
-chrome.identity.getProfileUserInfo(function callback(userInfo) { - enqueue_event(userInfo); -}); - -// And let the console know we've loaded -chrome.extension.getBackgroundPage().console.log("Loaded"); diff --git a/extension/3rdparty/sha256.js b/extension/extension/3rdparty/sha256.js similarity index 100% rename from extension/3rdparty/sha256.js rename to extension/extension/3rdparty/sha256.js diff --git a/extension/extension/background.js b/extension/extension/background.js new file mode 100644 index 000000000..246d3eb5b --- /dev/null +++ b/extension/extension/background.js @@ -0,0 +1,450 @@ +/* +Background script. This works across all of Google Chrome. +*/ + +// Do not save debug requests. We flip this frequently. Perhaps this +// should be a cookie or browser.storage? +var RAW_DEBUG = false; + + +/* + TODO: FSM + + +----------------------+ + | Load server settings | + | from chrome.storage | + +----------------------+ + | + v + +-------------------+ + | Connect to server | + +-------------------+ + + Load events queue + from chrome.storage + + +Dequeue events +*/ + +function profileInfoWrapper(callback) { + /* Workaround for this bug: + https://bugs.chromium.org/p/chromium/issues/detail?id=907425#c6 + */ + try { + chrome.identity.getProfileUserInfo({accountStatus: 'ANY'}, callback); + } catch (e) { + // accountStatus not supported + chrome.identity.getProfileUserInfo(callback); + } +} + +function console_logger() { + /* + Log to browser JavaScript console + */ + return console.log; +} + + +function add_event_metadata(event_type, event) { + /* + TODO: Should we add user identity? + */ + event['event'] = event_type; + + // Add the event_type if not present + if (!event.hasOwnProperty('event_type')) { + event['event_type'] = event_type; + } + + event['source'] = 'org.mitros.writing_analytics'; + event['version'] = 'alpha'; + event['ts'] = Date.now(); + event['human_ts'] = Date(); + event['iso_ts'] = new Date().toISOString; + return event; +} + + +function websocket_logger(server) { + /* + Log to web socket server. + + Optional: + * We could send queued events on socket open (or on a timeout) + * Or we could just wait for the next event (what we do now) + + The former would be a little bit more robust. + */ + var socket; + var state = new Set() + var queue = []; + + function new_websocket() { + socket = new WebSocket(server); + socket.onopen=prepare_socket; + socket.onerror = function(event) { + console.log("Could not connect"); + var event = { "issue": "Could not connect" }; + event = add_event_metadata("warning", event); + event = JSON.stringify(event); + queue.push(event); + }; + socket.onclose = function(event) { + console.log("Lost connection"); + var event = { "issue": "Lost connection", "code": event.code }; + event = add_event_metadata("warning", event); + event = JSON.stringify(event); + queue.push(event); + }; + return socket; + } + + socket = new_websocket(); + + function are_we_done() { + if (state.has("chrome_identity") && + state.has("local_storage")) { + event = {}; + event = add_event_metadata('metadata_finished', event); + socket.send(JSON.stringify(event)); + state.add("ready"); + } + } + + function prepare_socket() { + // Send the server the user info. This might not always be available. 
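+        // prepare_socket sends two pieces of metadata to the server: the Chrome
+        // identity and the settings stored in chrome.storage. Each is recorded in
+        // `state` once sent; when both are present, are_we_done() emits a
+        // metadata_finished event and marks the connection ready, which allows
+        // dequeue() to start flushing queued events.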
+ state = new Set(); + let event; + profileInfoWrapper(function callback(userInfo) { + event = { + "chrome_identity": userInfo + }; + event = add_event_metadata("chrome_identity", event); + socket.send(JSON.stringify(event)); + state.add("chrome_identity"); + are_we_done(); + }); + let storage_keys = [ + "teacher_tag", // Unused. In the future: whom do we authorize to receive data? + "user_tag", // Unique per-user tag in the settings page + "process_server", // Unused. Which server should we send to? + "unique_id", // Unique ID set in the settings page + "generated_id" // Autogenerated, for future forensics + ] + chrome.storage.sync.get(storage_keys, function(result) { + if(result !== undefined) { + event = {'local_storage': result}; + } else { + event = {'local_storage': {}}; + } + /*if("generated_id" not in event['local_storage']){ + event['local_storage']['generated_id'] = unique_id(); + chrome.storage.sync.set( + 'generated_id', + event['local_storage']['generated_id'] + ); + }*/ + console.log(event); + event = add_event_metadata("local_storage", event); + console.log(event); + socket.send(JSON.stringify(event)); + state.add("local_storage"); + are_we_done(); + }); + } + + function dequeue() { + if(socket === null) { + // Do nothing. We're reconnecting. + console.log("Event squelched; reconnecting"); + } else if(socket.readyState === socket.OPEN && + state.has("ready")) { + while(queue.length > 0) { + var event = queue.shift(); + socket.send(event); /* TODO: We should do receipt confirmation before dropping events */ + } + } else if((socket.readyState == socket.CLOSED) || (socket.readyState == socket.CLOSING)) { + /* + If we lost the connection, we wait a second and try to open it again. + + Note that while socket is `null` or `CONNECTING`, we don't take either + branch -- we just queue up events. We reconnect after 1 second if closed, + or dequeue events if open. + */ + console.log("Re-opening connection in 1s"); + socket = null; + state = new Set(); + setTimeout(function() { + console.log("Re-opening connection"); + socket = new_websocket(); + }, 1000); + } + } + + return function(data) { + queue.push(data); + dequeue(); + } +} + +function ajax_logger(ajax_server) { + /* + HTTP event per request. + + To do: Handle failures / dropped connections + */ + var server = ajax_server; + return function (data) { + /* + Helper function to send a logging AJAX request to the server. + This function takes a JSON dictionary of data. + */ + + httpRequest = new XMLHttpRequest(); + //httpRequest.withCredentials = true; + httpRequest.open("POST", ajax_server); + httpRequest.send(data); + } +} + +/* +List of loggers. For example, if we want to send to the server twice, and log on console: + +loggers_enabled = [ + console_logger(), + ajax_logger("https://localhost/webapi/"), + websocket_logger("wss://localhost/wsapi/in/") +]; +*/ +let loggers_enabled = [ + console_logger(), + //ajax_logger("https://writing.learning-observer.org/webapi/")//, + + // Adapted to NCSU Setup. + websocket_logger("wss://observer.csc.ncsu.edu/wsapi/in/") + //websocket_logger("wss://writing.learning-observer.org/wsapi/in/") +]; + +function log_event(event_type, event) { + /* + This sends an event to the server. 
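+
+    The event is stamped with common metadata (source, version, and
+    timestamps) by add_event_metadata, JSON-encoded, and then passed to
+    every logger in loggers_enabled; each logger handles its own queueing
+    and reconnection.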
+ */ + event = add_event_metadata(event_type, event); + + if(event['wa_source'] = null) { + event['wa_source'] = 'background_page'; + } + var json_encoded_event = JSON.stringify(event); + + for (var i=0; i { + for (const contentScript of chrome.runtime.getManifest().content_scripts) { + for (const tab of await chrome.tabs.query({url: contentScript.matches})) { + // Unload the dead content script by removing its code from the page + chrome.scripting.executeScript({ + target: { tabId: tab.id, allFrames: true }, + func: function() { + var scripts = document.getElementsByTagName('script'); + for(var i = scripts.length - 1; i >= 0; i--) { + if(scripts[i].src === `chrome-extension://${chrome.runtime.id}/writing.js`) { + scripts[i].remove(); + } + } + } + }); + + // re-inject content script + chrome.scripting.executeScript({ + target: {tabId: tab.id, allFrames: true}, + files: contentScript.js, + }); + } + } +}); + +// Let the server know we've loaded. +log_event("extension_loaded", {}); + +// Send the server the user info. This might not always be available. +profileInfoWrapper(function callback(userInfo) { + log_event("chrome_identity", userInfo); +}); + +// And let the console know we've loaded +// chrome.extension.getBackgroundPage().console.log("Loaded"); remove +logFromServiceWorker("Loaded"); diff --git a/extension/icons/lousy-fountain-pen-48.png b/extension/extension/icons/lousy-fountain-pen-48.png similarity index 100% rename from extension/icons/lousy-fountain-pen-48.png rename to extension/extension/icons/lousy-fountain-pen-48.png diff --git a/extension/extension/inject.js b/extension/extension/inject.js new file mode 100644 index 000000000..4581c03a3 --- /dev/null +++ b/extension/extension/inject.js @@ -0,0 +1,20 @@ +/* + Inject script. This is a web_accessible_resources used to pass the id + of the document as a globally accessible variable to the extension. 
+ It is called by the injectScript function in writing.js to make the result + accessible using an event listener +*/ + +let script = document.createElement('script') +script.id = 'tmpScript' + +const code = "_docs_flag_initialData.info_params.token" +script.textContent = 'document.getElementById("tmpScript").textContent = JSON.stringify(' + code + ')' + +document.documentElement.appendChild(script) + +let result = script.textContent + +window.postMessage({ from: 'inject.js', data: result }) + +script.remove() diff --git a/extension/extension/manifest.json b/extension/extension/manifest.json new file mode 100644 index 000000000..578ed4f1c --- /dev/null +++ b/extension/extension/manifest.json @@ -0,0 +1,48 @@ +{ + "author": "Piotr Mitros", + "manifest_version": 3, + "name": "Writing Process", + "homepage_url": "https://github.com/ETS-Next-Gen/writing_observer", + "incognito": "not_allowed", + "offline_enabled": true, + "version": "0.0.0.1", + "description": "Tracks writing in Google Docs, and provides nifty insights to you and your teachers!", + "action": { + "default_title": "Writing Process", + "default_popup": "pages/settings.html", + "default_icon": { + "48": "icons/lousy-fountain-pen-48.png" + } + }, + "content_scripts": [{ + "matches": ["*://docs.google.com/document/*"], + "js": ["3rdparty/sha256.js", "writing_common.js", "writing.js"] + }], + "web_accessible_resources": [{ + "resources": ["inject.js"], + "matches": ["*://docs.google.com/*"], + "use_dynamic_url": true + }], + "background": { + "service_worker": "service_worker.js" + }, + "permissions": [ + "webRequest", + "declarativeNetRequest", + "identity", + "identity.email", + "storage", + "nativeMessaging", + "scripting", + "activeTab" + ], + "host_permissions": [ + "*://docs.google.com/document/*" + ], + "icons": { + "48": "icons/lousy-fountain-pen-48.png" + }, + "options_ui": { + "page": "pages/options.html" + } +} diff --git a/extension/pages/action.css b/extension/extension/pages/action.css similarity index 100% rename from extension/pages/action.css rename to extension/extension/pages/action.css diff --git a/extension/extension/pages/options.html b/extension/extension/pages/options.html new file mode 100644 index 000000000..3d0538753 --- /dev/null +++ b/extension/extension/pages/options.html @@ -0,0 +1,42 @@ + + + + + + + + +
Options for debugging

Server lets us tweak where we send data. User tag lets us tweak user
ID (so we can e.g. pretend to be two users without multiple Google
accounts). Teacher tag lets us tweak who we route messages to.

Server: no value found
User tag: no value found
Teacher tag: no value found
+ + + diff --git a/extension/extension/pages/options.js b/extension/extension/pages/options.js new file mode 100644 index 000000000..184dff3c2 --- /dev/null +++ b/extension/extension/pages/options.js @@ -0,0 +1,69 @@ +/* + Documentation on how to create an options page + + TODO: Add logging of when options change + */ + +const option_keys = ["teacher_tag", "user_tag", "process_server", "unique_id"]; + +function saveOptions(key) { + /* + Callback when user hits "save" on the options page + + We save to storage. When we're done, we refresh the + text (and the input) to make sure we've saved right and + to show current status. + */ + const value = document.querySelector("input.input-text."+key).value; + let new_setting={}; + new_setting[key] = value; + chrome.storage.sync.set( + new_setting, + (e)=>restoreOptions([key]) + ); +} + +function removeOptions(key) { + /* + Callback when user hits "remove" on the options page + + We just remove they key. + */ + chrome.storage.sync.remove( + key, + (e)=>restoreOptions([key]) + ); +} + +function restoreOptions(keys = option_keys) { + /* + Initialize the options page for the extension. Eventually, we'd + like to also use chrome.storage.managed so that school admins + can set these settings up centrally, without student overrides + */ + chrome.storage.sync.get(keys, function(result){ + for(const key_index in keys) { + const key = keys[key_index]; + console.log(key); + const r=result[key] || "none"; + console.log(r); + document.querySelector(".value-display."+key).innerText = r; + document.querySelector("input."+key).value = r; + } + }); +} + +function initialize() { + for(const key_index in option_keys) { + const key = option_keys[key_index]; + console.log(key); + document.querySelector("button.save-button."+key) + .addEventListener("click", (e) => saveOptions(key)); + document.querySelector("button.remove-button."+key) + .addEventListener("click", (e) => removeOptions(key)); + } + restoreOptions(option_keys); +} + +document.addEventListener('DOMContentLoaded', initialize); + diff --git a/extension/extension/pages/settings.html b/extension/extension/pages/settings.html new file mode 100644 index 000000000..a856be7a2 --- /dev/null +++ b/extension/extension/pages/settings.html @@ -0,0 +1,16 @@ + + + + + + +
Writing Process

Hi! This is an extension which captures writing process data in
Google Docs. It's part of a research project designed to help your
classrooms work better during COVID-19. If you have any feedback or
questions, please don't hesitate to reach out to the researchers or
to talk to your teachers.
+ + + diff --git a/extension/extension/service_worker.js b/extension/extension/service_worker.js new file mode 100644 index 000000000..138ba9421 --- /dev/null +++ b/extension/extension/service_worker.js @@ -0,0 +1,8 @@ +// Combining the two background scripts into one to serve +// as a single service worker script + +try { + importScripts("./writing_common.js", "./background.js"); +} catch (e) { + console.log(e); +} diff --git a/extension/extension/wo_front_end.png b/extension/extension/wo_front_end.png new file mode 100644 index 000000000..9b6e33a62 Binary files /dev/null and b/extension/extension/wo_front_end.png differ diff --git a/extension/extension/writing.js b/extension/extension/writing.js new file mode 100644 index 000000000..289219f30 --- /dev/null +++ b/extension/extension/writing.js @@ -0,0 +1,761 @@ +/* + Page script. This is injected into each web page on associated web sites. +*/ + +/* For debugging purposes: we know the extension is active */ +document.body.style.border = "5px solid blue"; + +/* + General Utility Functions +*/ + +function log_error(error_string) { + /* + We should send errors to the server, but for now, we + log to the console. + */ + console.trace(error_string); +} + +function log_event(event_type, event) { + /* + We pass an event, annotated with the page document ID and title, + to the background script + */ + // This is a compromise. We'd like to be similar to xAPI / Caliper, both + // of which use the 'object' field with a bunch of verbose stuff. + // + // Verbosity is bad for analytics, but compatibility is good. + // + // This is how Caliper thinks of this: https://www.imsglobal.org/spec/caliper/v1p2#entity + // This is how Tincan/xAPI thinks of this: https://xapi.com/statements-101/ + // + // "Object" is a really bad name. Come on. Seriously? + event["object"] = { + "type": "http://schema.learning-observer.org/writing-observer/", + "title": google_docs_title(), + "id": doc_id(), + "url": window.location.href, + } + + event['event'] = event_type; + // We want to track the page status during events. For example, + // Google Docs inserts comments during the document load. + event['readyState'] = document.readyState; + + // uncomment to watch events being logged from the client side with devtools + // console.log(event); + + // Check if the extension runtime still has its context + if (chrome.runtime?.id !== undefined) { + chrome.runtime.sendMessage(event); + } +} + +function doc_id() { + /* + Extract the Google document ID from the window + */ + try { + return googledocs_id_from_url(window.location.href); + } catch(error) { + log_error("Couldn't read document id"); + return null; + } +} + + +function this_is_a_google_doc() { + /* + Returns 'true' if we are in a Google Doc + */ + return window.location.href.search("://docs.google.com/") != -1; +} + +function google_docs_title() { + /* + Return the title of a Google Docs document. + + Note this is not guaranteed 100% reliable since Google + may change the page structure. + */ + try { + return document.getElementsByClassName("docs-title-input")[0].value; + } catch(error) { + log_error("Couldn't read document title"); + return null; + } +} + +function google_docs_partial_text() { + /* + Return the *loaded* text of a Google Doc. Note that for long + documents, this may not be the *complete* text since off-screen + pages may be lazy-loaded. The text omits formatting, which is + helpful for many types of analysis + + We want this for redundancy: we'd like to confirm we're correctly + reconstructing text. 
+ */ + try { + return document.getElementsByClassName("kix-page")[0].innerText; + } catch(error) { + log_error("Could not get document text"); + return null; + } +} + +function google_docs_partial_html() { + /* + Return the *loaded* HTML of a Google Doc. Note that for long + documents, this may not be the *complete* HTML, since off-screen + pages may be lazy-loaded. This includes HTML formatting, which + may be helpful, but is incredibly messy. + + I hate Google's HTML. What's wrong with simple, clean, semantic + tags and classes? Why do we need something like this instead: + + + + Seriously, Google? + + And yes, if you download documents from Google, it's a mess like + this too. + */ + return document.getElementsByClassName("kix-page")[0].innerHTML; +} + +function is_string(myVar) { + /* + Utility function to check whether a variable is a string. + We need that because some Google docs graphical object classes + are not strings. + */ + if (typeof myVar === 'string' || myVar instanceof String) { + return true; + } else { + return false; + } +} + +function injectScript(file_path, tag) { + /* + This function is to inject a script from 'file_path' + into a specific DOM tag passed in as 'tag' + */ + var node = document.getElementsByTagName(tag)[0]; + var script = document.createElement('script'); + script.setAttribute('type', 'text/javascript'); + script.setAttribute('src', file_path); + node.appendChild(script); +} + +function execute_on_page_space(code){ + /* This is from + https://stackoverflow.com/questions/9602022/chrome-extension-retrieving-global-variable-from-webpage + + It is used to run code outside of the extension isolation box, + for example to access page JavaScript variables. + */ + + if (!document.getElementById('tmpScript')) { + injectScript(chrome.runtime.getURL('inject.js'), 'body'); + } +} + +function google_docs_version_history(token) { + /* + Grab the _complete_ version history of a Google Doc. We do this + on page load. Note that this may lead to a lot of data. But this + lets us do most of our analytics on documents created or edited + without our extension. + + Note that if Google changes their implementation, this may + break. We don't want to promise to users this will always + work. But it's good to have for the pilot. + + It also lets us debug the system. + + NOTE (CL) in past cases use of the execute on page space by itself triggered + an error. If it creates excessive delays or error due to history use the + following code block in lieu of the next call. + + try { + var token = executeOnPageSpace("_docs_flag_initialData.info_params.token"); + } catch (error) { + log_event("Error on Page History.", {"ERROR" : error}) + return -1; + } + */ + + metainfo_url = "https://docs.google.com/document/d/"+doc_id()+"/revisions/tiles?id="+doc_id()+"&start=1&showDetailedRevisions=false&filterNamed=false&token="+token+"&includes_info_params=true" + + fetch(metainfo_url).then(function(response) { + response.text().then(function(text) { + tiles = JSON.parse(text.substring(5)); // Google adds a header to prevent JavaScript injection. This removes it. 
+ var first_revision = tiles.firstRev; + var last_revision = tiles.tileInfo[tiles.tileInfo.length - 1].end; + version_history_url = "https://docs.google.com/document/d/"+doc_id()+"/revisions/load?id="+doc_id()+"&start="+first_revision+"&end="+last_revision; + fetch(version_history_url).then(function(history_response) { + history_response.text().then(function(history_text) { + log_event( + "document_history", + {'history': JSON.parse(history_text.substring(4))} + ); + }); + }); + }); + }); +} + +/* + Event Logging Code Block +*/ + +// Data structure specifying the events we want to capture from the browser. +// For keystroke and mouseclick events, we capture target and parent target info +// because it gives us info about what exactly got clicked on/changed. + +EVENT_LIST = { + "keystroke": { + "events": [ + "keypress", "keydown", "keyup" + ], + "properties": [ + 'altKey', 'buttons', + 'charCode', 'code', + 'ctrlKey', 'isComposing', + 'isTrusted', 'key', + 'keyCode', 'location', + 'metaKey', 'repeat', + 'shiftKey', 'target.className', + 'target.id', 'target.nodeType', + 'target.localName', 'timeStamp', + 'type', 'which' + ], + "target": "document" + }, + "mouseclick": { + "events": [ + "mouseclick", "mousedown", "mouseup" + ], + "properties": [ + "button", "buttons", + "clientX", "clientY", + "layerX", "layerY", + "offsetX", "offsetY", + "screenX", "screenY", + "movementX", "movementY", + 'altKey', 'ctrlKey', + 'metaKey', 'shiftKey', + 'which', 'isTrusted', + 'timeStamp', 'type', + 'target.id', 'target.className', + 'target.innerText', 'target.nodeType','target.localName', + 'target.parentNode.id', 'target.parentNode.className', + 'target.parentNode.nodeType', 'target.parentNode.localName' + ], + "target": "document" + }, + "attention": { + "events": ["focusin", "focusout"], + // Not all of these are required for all events... + "properties": [ + 'bubbles', 'cancelable', + 'isTrusted', 'timeStamp', + 'relatedTarget.className', 'relatedTarget.id', + 'target.className', 'target.id', + 'target.innertext', 'target.nodeType', + 'target.localName', 'target.parentNode.className', + 'target.parentNode.id', 'target.parentNode.innerText', + 'target.parentNode.nodeType', 'target.parentNode.localName', + 'type', + ], + "target": "window" + }, + "visibility": { + "events": ["visibilitychange"], + "properties": [ + 'target', 'bubbles', + 'cancelable', 'isTrusted', + 'timeStamp', 'type' + ], + "target": "document" + }, + "save": { + "events": ["google_docs_save"], + "properties": [ + "doc_id", "bundles", + "event", "timestamp" + ], + "target": "window" + }, + "load": { + "events": ["document_loaded"], + "properties": [ + "doc_id", "event", + "history", "title", + "timestamp" + ], + "target": "window" + }, +}; + +// By having these, we have references to allow us to remove listeners later +// See refresh_stream_view_listeners +for(var event_type in EVENT_LIST) { + EVENT_LIST[event_type]['listener'] = generic_eventlistener(event_type, -1); +} + +function generic_eventlistener(event_type, frameindex) { + /* + This function calls eventlistener_prototype on setup, then + calls the `refresh_stream_view_listeners` function, which dynamically + adds listeners after focus events to handle events for dynamically + created nodes in `docos-stream-view`. + */ + + return function(event) { + /* + Listen for events, and pass them back to the background page. 
+ */ + var event_data = {}; + event_data["event_type"] = event_type; + properties = EVENT_LIST[event_type].properties; + var property_data = {}; + for (var property in properties) { + const prop = treeget(event, properties[property]); + if(prop !== null) { + property_data[properties[property]] = treeget(event, properties[property]); + } + } + event_data[event_type] = property_data; + event_data['frameindex'] = frameindex; + log_event(event_type, event_data); + + // Dynamic updates of `docos-stream-view` means our initial set + // of listeners doesn't always catch events that happen in the + // comments div. Specifically, if the user clicks on the + // 'Comments' button, or if they click on certain fields in + // displayed comments, events don't get registered without the + // extra step called by `refresh_stream_view_listeners()`. + + // TODO: figure out way to limit the number of times + // `refresh_stream_view_listeners()` is called. + + // We don't really want to call it every time focus shifts, + // but I'm not sure what specifically to listen for to + // minimize the number of times we swap out event listeners on + // the `docos-stream-view` element. + if (event_type=='attention') { + refresh_stream_view_listeners(); + } + } +} + +function refresh_stream_view_listeners() { + /* + This function supports dynamic refreshing of the listeners + associated with docos-stream-view, which is the div in which + comments are placed. + */ + // Grab the comments div + el = document.getElementById('docos-stream-view'); + if (!el) { + return; + } + + // Refresh mouseclick events + for(var eventNo in EVENT_LIST["mouseclick"].events) { + event = EVENT_LIST["mouseclick"].events[eventNo]; + el.removeEventListener(event, EVENT_LIST["mouseclick"]["listener"]); + el.addEventListener(event, EVENT_LIST["mouseclick"]["listener"]); + } + + // Refresh keystroke events + for(var eventNo in EVENT_LIST["keystroke"].events) { + event = EVENT_LIST["keystroke"].events[eventNo]; + el.removeEventListener(event, EVENT_LIST["keystroke"]["listener"]); + el.addEventListener(event, EVENT_LIST["keystroke"]["listener"], true); + } +} + +var editor = document.querySelector('.kix-appview-editor'); + +// Function definitions completed. +// Now we initialize the generic event listener. + +//We will listen to events in all iFrames, as well as the main content document. +var frames = Array.from(document.getElementsByTagName("iframe")); + +// TODO: We should really make a list of documents instead of a fake iframe.... +frames.push({'contentDocument': document}) + +// Add an event listener to each iframe in the iframes under frames. +for(var event_type in EVENT_LIST) { + for(var event_idx in EVENT_LIST[event_type]['events']) { + js_event = EVENT_LIST[event_type]['events'][event_idx]; + target = EVENT_LIST[event_type]['target'] + if(target === 'document') { + for(var iframe in frames) { + if(frames[iframe].contentDocument) { + frames[iframe].contentDocument.addEventListener(js_event, generic_eventlistener(event_type, iframe)); + } + } + } else if (target === 'window') { + window.addEventListener(js_event, generic_eventlistener(event_type, iframe)); + } + } +} + +//////////////////////////////////// +// MUTATION OBSERVER CODE BLOCK //// +//////////////////////////////////// + +// NOTE: The following code is designed to observe changes in the document, +// not just html events. (Right now we're not observing CSS changes +// such as setting an element to display: none. 
Some of those may be +// worth watching for Google Docs; for instance, when a comment is +// "resolved", it is merely hidden.) + +// MUTATIONS_OBSERVED is the data structure where we store information +// about which html change events to log how. This functions as a rule +// base that governs what changes in the html document are logged and +// sent back to the server. This code is based on the MutationObserver +// and mutationRecord classes. See: +// +// https://developer.mozilla.org/en-US/docs/Web/API/MutationObserver +// https://developer.mozilla.org/en-US/docs/Web/API/MutationRecord +// +// The format works like this: +// "insert": { +// ^^^ +// Category. +// A term we +// made up. +// It describes +// the type of +// change made +// by the +// mutationRecord. +// +// +// [ { “target”: "bif", “added”: "bar", “label”:"foo", “watch”: "hoo"] , ... ] +// (or "removed") +// ^^^ ^^^ ^^^ ^^^ +// Class of the Class of the Type label we Class of the parent +// target node where node added made up that node whose inner +// the change took or removed is sent to the text we want to +// place. Writing Observer monitor. The innerText +// server. will be sent to the +// } WO server. +var MUTATIONS_OBSERVED = { + "insert": [ + { + "target": "docos-stream-view", + "added": "docos-docoview-resolve-button-visible", + "label": "add-comment", + "watch": "kix-discussion-plugin" + }, + { + "target": "docos-anchoreddocoview-content", + "added": "docos-replyview-comment", + "label": "add-reply", + "watch": "kix-discussion-plugin" + } + ], + "addtext": [ + { + "target": "kix-spell-bubble-suggestion-text", + "label": "view_spelling_suggestion", + "watch": "" + }, + ], + "delete": [ + { + "target": "docos-docoview-rootreply", + "removed": "docos-replyview-suggest", + "label": "resolve-suggestion", + "watch": "kix-discussion-plugin" + }, + { + "target": "docos-docoview-rootreply", + "removed": "docos-replyview-first", + "label": "delete-comment", + "watch": "kix-discussion-plugin" + }, + { + "target": "docos-docoview-replycontainer", + "removed": "docos-replyview-comment", + "label": "delete-reply", + "watch": "kix-discussion-plugin" + } + ], + "input": [ + { + "target": "docos-input-textarea", + "label": "type-input", + } + ], + "clear": [ + { + "target": "docos-input-textarea", + "label": "clear-input", + } + ], + "replace": [ + { + "target": "docos-replyview-static", + "label": "edit-comment", + "watch": "kix-discussion-plugin" + }, + { + "target": "kix-spell-bubble-suggestion-text", + "label": "view-suggestion-text", + }, + { + "target": "kix-spell-bubble", + "label": "view_spelling_suggestion", + "watch": "" + }, + ], + "suggest": [ + { + "target": "docos-replyview-static", + "label": "add-suggestion", + "watch": "kix-discussion-plugin" + } + ], + "other": [ + ] +} + +function classify_mutation(mutation) { + /* + Determine what kind of change is being made: `insert`, `addtext`, + `delete`, `replace`, `input`, `suggest`, or `other`. + + We will use the category label returned by this function as the + key to the mutationObserved variable to get a list of relevant + rules to apply. 
+ */ + if (mutation.addedNodes.length > 0 && mutation.removedNodes.length == 0) { + if (mutation.addedNodes[0].nodeType == Node.TEXT_NODE) { + return "addtext"; + } else { + return "insert"; + } + } + else if (mutation.addedNodes.length == 0 && mutation.removedNodes.length > 0) { + if (mutation.removedNodes[0].nodeType == Node.TEXT_NODE) { + return "clear"; + } + else { + return "delete"; + } + } + else if (mutation.addedNodes.length > 0 && mutation.removedNodes.length > 0 && + mutation.removedNodes[0].nodeType == Node.TEXT_NODE && + mutation.addedNodes[0].nodeType == Node.TEXT_NODE + ) { + return "replace"; + } + else if (mutation.type=='characterData') { + return "input"; + } + else if (mutation.addedNodes.length > 0 && mutation.removedNodes.length > 0 ) { + return "suggest"; + } + else { + return "other"; + } +} + +function find_ancestor (el, cls) { + /* + Utility function to find an ancestor node of a specified class. + */ + while ((el = el.parentNode) && el.className.indexOf(cls) < 0) {} + return el; +} + +function fire_rule(mutation, event, actions, rule) { + /* + Common script to run when a mutationObserver rule has been matched. + */ + + event['event_type'] = actions[rule]['label']; + + // If we specify a window we want to watch, get the innerText + if ('watched' in actions[rule] + && find_ancestor(mutation.target,actions[rule]['watched'])) { + event['context_content'] = + find_ancestor(mutation.target,actions[rule]['watched']).innerText; + } + + // Then send the logged event to the WO server. + log_event(mutation.type,event); +} + +function prepare_mutation_observer() { + /* + Set up a MutationObserver that will use the mutationObserved dictionary + to tell it which changes to log and what label to log it as. + */ + var observer = new MutationObserver(function (mutations) { + mutations.forEach(function (mutation) { + event = {} + + // This list guarantees that we'll have the information we need + // to understand what happened in a change event. + properties = [ + 'addedNodes.length', 'addedNodes[0].className', + 'addedNodes[0].data', 'addedNodes[0].id', + 'addedNodes[0].innerText', 'addedNodes[0].nodeType', + 'removedNodes.length', 'removedNodes[0].className', + 'removedNodes[0].data', 'removedNodes[0].id', + 'removedNodes[0].innerText', 'removedNodes[0].nodeType', + 'target.className', 'target.data', + 'target.innerText', 'target.parentNode.id', + 'target.parentNode.className','type' + ]; + + // Populate the mutation_data subdictionary that we use to + // pass the details of the mutation back to the WO sever. + var mutation_data = {}; + for (var property in properties) { + const prop = treeget(mutation, properties[property]); + if (prop !== null) { + mutation_data[properties[property]] = + treeget(mutation, properties[property]); + } + } + event['change'] = mutation_data; + + // uncomment this to observe all mutations in the console log. + // console.log(mutation); + + // Now we apply the rules defined by MUTATIONS_OBSERVED to record + // watched events. 
+ + // First, check what kind of event this is + category = classify_mutation(mutation); + + // Then record that category as event_type + event['event_type']=category; + + // Filter the templates to those that are relevant to this category + actions = MUTATIONS_OBSERVED[category]; + + // Then loop through the available templates + for (var rule in actions) { + if (category=='insert' + && is_string(event.change['addedNodes[0].className']) + && event.change['addedNodes[0].className'].indexOf(actions[rule]['added'])>=0 + && event.change['target.className'].indexOf(actions[rule]['target'])>=0 + ) { + fire_rule(mutation, event, actions, rule); + break; + } + else if (category=='delete' + && is_string(event.change['removedNodes[0].className']) + && event.change['removedNodes[0].className'].indexOf(actions[rule]['removed'])>=0 + && event.change['target.className'].indexOf(actions[rule]['target'])>=0 + ) { + fire_rule(mutation, event, actions, rule); + break; + } + else if (category=='addtext' + && event.change['target.className'].indexOf(actions[rule]['target'])>=0 + ) { + fire_rule(mutation, event, actions, rule); + break; + } + else if (is_string(event.change['target.parentNode.className']) + && event.change['target.parentNode.className'].indexOf(actions[rule]['target'])>=0 + ) { + fire_rule(mutation, event, actions, rule); + break; + } + } + }); + }); + return observer; +} + +// Set mutation observer options +var MUTATION_OBSERVER_OPTIONS = { + // We don't want to watch attribute changes + attributes: false, + + // but we do want to watch tree and character changes. + childList: true, + characterData: true, + subtree: true +}; + +// OK, now create the MutationObserver and start running it +// on the document. +observer = prepare_mutation_observer(); +chrome.runtime.onMessage.addListener(function(message) { + if (message === 'startObserving') { + // Start observing the target node for configured mutations + observer.observe(editor, MUTATION_OBSERVER_OPTIONS); + } else if (message === 'stopObserving') { + // Stop observing the target node + observer.disconnect(); + } +}); + +/* + Document Load Code Block +*/ +function writing_onload() { + if(this_is_a_google_doc()) { + log_event("document_loaded", { + "partial_text": google_docs_partial_text() + }) + execute_on_page_space("_docs_flag_initialData.info_params.token") + const handleFromWeb = async (event) => { + if (event.data.from && event.data.from === "inject.js") { + const data = event.data.data; + var token = JSON.parse(data); + google_docs_version_history(token); + } + }; + + window.addEventListener('message', handleFromWeb); + } +} + +/* +This is code which, if executed on the page space, will capture HTTP +AJAX responses. + +This is impossible to do directly from within an extension. + +This is currently unused. +*/ +const LOG_AJAX = "\n\ +const WO_XHR = XMLHttpRequest.prototype;\n\ +const wo_send = WO_XHR.send;\n\ +\n\ +\n\ +WO_XHR.send = function () {\n\ + this.addEventListener('load', function () {\n\ + console.log(this); console.log(this.getAllResponseHeaders());\n\ + }); return wo_send.apply(this, arguments); }\n\ +" + +window.addEventListener("load", writing_onload); + +// This event listener is to used to detect changes in the document's +// visibility. E.g. when a user switches tabs and back. 
+window.addEventListener("visibilitychange", () => { + if (!document.hidden) { + console.log("I got reloaded again...") + } +}); diff --git a/extension/extension/writing_common.js b/extension/extension/writing_common.js new file mode 100644 index 000000000..49c3a648d --- /dev/null +++ b/extension/extension/writing_common.js @@ -0,0 +1,92 @@ +function treeget(tree, key) { + /* + Retrieve an element from a tree with dotted notation + + e.g. treeget( + {"hello": {"bar":"biff"}}, + "hello.bar" + ) + + Modified by PD to also deal with embbedded lists identified + using notations like addedNodes[0].className. + + If not found, return null + */ + let keylist = key.split("."); + let subtree = tree; + for(var i=0; i0) { + item = keylist[i].split('[')[0]; + idx = keylist[i].split('[')[1]; + idx = idx.split(']')[0]; + if (item in subtree) { + if (subtree[item][idx]!==undefined) { + subtree =subtree[item][idx]; + } else { + return null; + } + } else { + return null; + } + } else { + return null; + } + } + } + return subtree; +} + + +function googledocs_id_from_url(url) { + /* + Given a URL like: + https://docs.google.com/document/d/jkldfhjklhdkljer8934789468976sduiyui34778dey/edit/foo/bar + extract the associated document ID: + jkldfhjklhdkljer8934789468976sduiyui34778dey + Return null if not a valid URL + */ + var match = url.match(/.*:\/\/docs\.google\.com\/document\/d\/([^\/]*)\/.*/i); + if(match) { + return match[1]; + } + return null; +} + +var writing_lasthash = ""; +function unique_id() { + /* + This function is used to generate a (hopefully) unique ID for + each event. This isn't designed to be cryptosecure, since an + untrusted client can set this to whatever it likes in either + case. If used by a server, it ought to be rehashed with + server-side info. + + The major planned use is debugging. In the future, this might be + helpful for things like negotiating with the server too + (e.g. "Have you seen this event yet?") + */ + var shaObj = new jsSHA("SHA-256", "TEXT"); + shaObj.update(writing_lasthash); + shaObj.update(Math.random().toString()); + shaObj.update(Date.now().toString()); + shaObj.update(document.cookie); + shaObj.update("NaCl"); /* Salt? 
*/ + shaObj.update(window.location.href); + writing_lasthash = shaObj.getHash("HEX"); + return writing_lasthash; +} diff --git a/extension/lousier-icon-128.png b/extension/lousier-icon-128.png new file mode 100644 index 000000000..c1cf0bd31 Binary files /dev/null and b/extension/lousier-icon-128.png differ diff --git a/extension/icons/lousy-fountain-pen-48.xcf b/extension/lousy-fountain-pen-48.xcf similarity index 100% rename from extension/icons/lousy-fountain-pen-48.xcf rename to extension/lousy-fountain-pen-48.xcf diff --git a/extension/manifest.json b/extension/manifest.json deleted file mode 100644 index 89e670b43..000000000 --- a/extension/manifest.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "author": "Piotr Mitros", - "manifest_version": 2, - "name": "Writing Process", - "homepage_url": "http://mitros.org/", - "incognito": "not_allowed", - "offline_enabled": true, - "version": "1.0", - "description": "Tracks writing in Google Docs, and provides nifty insights to you and your teachers!", - - "browser_action": { - "default_title": "Writing Process", - "default_popup": "pages/settings.html", - "default_icon": { - "48": "icons/lousy-fountain-pen-48.png" - } - }, - - "content_scripts": [ { - "matches": ["*://docs.google.com/*", "*://*.mozilla.org/*", "*://mail.google.com/*"], - "js": ["3rdparty/sha256.js", "writing_common.js", "writing.js"] - }], - "background": { - "scripts": ["writing_common.js", "background.js"] - }, - "permissions": [ - "webRequest", - "identity", - "identity.email", - "*://docs.google.com/*", - "*://mail.google.com/*", - "clipboardRead", - "storage" - ], - "icons": { - "48": "icons/lousy-fountain-pen-48.png" - }, - "options_ui": { - "page": "pages/options.html", - "chrome_style": true - } -} diff --git a/extension/pages/options.html b/extension/pages/options.html deleted file mode 100644 index 0813e6dca..000000000 --- a/extension/pages/options.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - -
Server: no value found
- - - - diff --git a/extension/pages/options.js b/extension/pages/options.js deleted file mode 100644 index 8dd7206fa..000000000 --- a/extension/pages/options.js +++ /dev/null @@ -1,39 +0,0 @@ -/* - Documentation on how to create an options page - */ - -function saveServerToStorage(new_server) { - console.log("Saving: "+new_server); - chrome.storage.sync.set({ - "process-server": new_server - }, restoreOptions); -} - -function saveOptions(e) { - /* - Callback when user hits "save" on the options page - */ - var new_server = document.querySelector("#process-server").value; - saveServerToStorage(new_server); - e.preventDefault(); -} - -function restoreOptions() { - /* - Initialize the options page for the extension. Eventually, we'd - like to also use chrome.storage.managed so that school admins - can set these settings up centrally, without student overrides - */ - chrome.storage.sync.get(['process-server'], function(result){ - var sync_storage_server = result['process-server']; - console.log("Loaded saved server: " + sync_storage_server); - if(!sync_storage_server) { - sync_storage_server = "writing.mitros.org"; - } - document.querySelector("#current-process-server").innerText = sync_storage_server; - document.querySelector("#process-server").value = sync_storage_server; - }); -} - -document.addEventListener('DOMContentLoaded', restoreOptions); -document.querySelector("form").addEventListener("submit", saveOptions); diff --git a/extension/pages/settings.html b/extension/pages/settings.html deleted file mode 100644 index 381fb76dc..000000000 --- a/extension/pages/settings.html +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - -
Writing Process

• See my stats
• Manage my data
- - diff --git a/extension/writing.js b/extension/writing.js deleted file mode 100644 index 120600dc3..000000000 --- a/extension/writing.js +++ /dev/null @@ -1,189 +0,0 @@ -/* - Page script. This is injected into each web page on associated web sites. -*/ - -/* For debugging purposes: we know the extension is active */ -document.body.style.border = "5px solid blue"; - -function log_error(error_string) { - /* - We should send errors to the server, but for now, we - log to the console. - */ - console.log(error_string); -} - -function doc_id() { - /* - Extract the Google document ID from the window - */ - try { - return googledocs_id_from_url(window.location.href); - } catch(error) { - log_error("Couldn't read document id"); - return null; - } -} - -function this_is_a_google_doc() { - /* - Returns 'true' if we are in a Google Doc - */ - return window.location.href.search("://docs.google.com/") != -1; -} - -function log_event(event_type, event) { - /* - We pass an event, annotated with the page document ID and title, - to the background script - */ - event["title"] = google_docs_title(); - event["doc_id"] = doc_id(); - event['date'] = new Date().toLocaleString('en-US'); - - chrome.runtime.sendMessage(event); -} - -function writing_eventlistener(event) { - /* - Listen for keystroke events, and pass them back to the background page. - */ - var event_data = {}; - event_data["event_type"] = "keypress"; - properties = ['altKey', 'charCode', 'code', 'ctrlKey', 'isComposing', 'key', 'keyCode', 'location', 'metaKey', 'repeat', 'shiftKey', 'which', 'isTrusted', 'timeStemp', 'type']; - for (var property in properties) { - event_data[properties[property]] = event[properties[property]]; - } - log_event("keystroke", event_data); -} - - - -document.addEventListener("keypress", writing_eventlistener); -document.addEventListener("keydown", writing_eventlistener); -document.addEventListener("keyup", writing_eventlistener); - -var iframes = document.getElementsByTagName("iframe") -for(iframe in iframes){ - if(iframes[iframe].contentDocument) { - iframes[iframe].contentDocument.addEventListener("keypress", writing_eventlistener); - iframes[iframe].contentDocument.addEventListener("keydown", writing_eventlistener); - iframes[iframe].contentDocument.addEventListener("keyup", writing_eventlistener); - } -} - -function gmail_text() { - /* - This function returns all the editable text in the current gmail - window. Note that in a threaded discussion, it's possible to - have several open on the same page. - - This is brittle; Google may change implementation and this will - break. - */ - var documents = document.getElementsByClassName("editable"); - for(document in documents) { - documents[document] = { - 'text': documents[document].innerHTML - }; - } - return documents; -} - -function google_docs_title() { - /* - Return the title of a Google Docs document. - - Note this is not guaranteed 100% reliable. - */ - try { - return document.getElementsByClassName("docs-title-input")[0].value; - } catch(error) { - log_error("Couldn't read document title"); - return null; - } -} - -function google_docs_partial_text() { - /* - Return the *loaded* text of a Google Doc. Note that for long - documents, this may not be the *complete* text since off-screen - pages may be lazy-loaded. 
The text omits formatting, which is - helpful for many types of analysis - */ - try { - return document.getElementsByClassName("kix-page")[0].innerText; - } catch(error) { - log_error("Could get document text"); - return null; - } -} - -function google_docs_partial_html() { - /* - Return the *loaded* HTML of a Google Doc. Note that for long - documents, this may not be the *complete* HTML, since off-screen - pages may be lazy-loaded. This includes HTML formatting, which - may be helpful, but is incredibly messy. - - I hate Google's HTML. What's wrong with simple, clean, semantic - tags? Why do we need something like this instead: - - Seriously, Google? - */ - return document.getElementsByClassName("kix-page")[0].innerHTML; -} - -function executeOnPageSpace(code){ - /* This is from - https://stackoverflow.com/questions/9602022/chrome-extension-retrieving-global-variable-from-webpage - - It is used to run code outside of the extension isolation box, - for example to access page JavaScript variables. - */ - // create a script tag - var script = document.createElement('script') - script.id = 'tmpScript' - // place the code inside the script. later replace it with execution result. - script.textContent = - 'document.getElementById("tmpScript").textContent = JSON.stringify(' + code + ')' - // attach the script to page - document.documentElement.appendChild(script) - // collect execution results - let result = document.getElementById("tmpScript").textContent - // remove script from page - script.remove() - return JSON.parse(result) -} - -function google_docs_version_history() { - var token = executeOnPageSpace("_docs_flag_initialData.info_params.token"); - metainfo_url = "https://docs.google.com/document/d/"+doc_id()+"/revisions/tiles?id="+doc_id()+"&start=1&showDetailedRevisions=false&filterNamed=false&token="+token+"&includes_info_params=true" - fetch(metainfo_url).then(function(response) { - response.text().then(function(text) { - tiles = JSON.parse(text.substring(5)); // Google adds a header to prevent JavaScript injection. This removes it. - var first_revision = tiles.firstRev; - var last_revision = tiles.tileInfo[tiles.tileInfo.length - 1].end; - version_history_url = "https://docs.google.com/document/d/"+doc_id()+"/revisions/load?id="+doc_id()+"&start="+first_revision+"&end="+last_revision; - fetch(version_history_url).then(function(history_response) { - history_response.text().then(function(history_text) { - log_event( - "document_history", - {'history': JSON.parse(history_text.substring(4))} - ); - }); - }); - }); - }); -} - -function writing_onload() { - if(this_is_a_google_doc()) { - log_event("document_loaded", { - "partial_text": google_docs_partial_text() - }) - google_docs_version_history(); - } -} - -window.addEventListener("load", writing_onload); diff --git a/extension/writing_common.js b/extension/writing_common.js deleted file mode 100644 index 097d38a54..000000000 --- a/extension/writing_common.js +++ /dev/null @@ -1,31 +0,0 @@ -function googledocs_id_from_url(url) { - /* - Given a URL like: - https://docs.google.com/document/d/jkldfhjklhdkljer8934789468976sduiyui34778dey/edit/foo/bar - extract the associated document ID: - jkldfhjklhdkljer8934789468976sduiyui34778dey - Return null if not a valid URL - */ - var match = url.match(/.*:\/\/docs\.google\.com\/document\/d\/([^\/]*)\/.*/i); - if(match) { - return match[1]; - } - return null; -} - -var writing_lasthash = ""; -function unique_id() { - /* - This function is used to generate a (hopefully) unique ID for - each event. 
- */ - var shaObj = new jsSHA("SHA-256", "TEXT"); - shaObj.update(writing_lasthash); - shaObj.update(Math.random().toString()); - shaObj.update(Date.now().toString()); - shaObj.update(document.cookie); - shaObj.update("NaCl"); /* Salt? */ - shaObj.update(window.location.href); - writing_lasthash = shaObj.getHash("HEX"); - return writing_lasthash; -} diff --git a/gitserve/gitserve/README.md b/gitserve/gitserve/README.md new file mode 100644 index 000000000..0f0c94965 --- /dev/null +++ b/gitserve/gitserve/README.md @@ -0,0 +1,12 @@ +gitserve +======== + +This is a small module to (t.b.d. access? serve?) files from git. + +Perhaps it belongs in a different repo? + +It's not designed to be particular fast, scalable, or robust in the +current implementation. It wouldn't be hard to make it be so, if that +were a requirement down-the-line. For now, I'd like to be able to do +small-scale studies where I can serve up files from a git repo, and +have versions appropriately managed. diff --git a/gitserve/gitserve/aio_gitserve.py b/gitserve/gitserve/aio_gitserve.py new file mode 100644 index 000000000..dc9a550b7 --- /dev/null +++ b/gitserve/gitserve/aio_gitserve.py @@ -0,0 +1,78 @@ +''' +Handler to serve files from a git repo from an aiohttp server. + +For experimental use only. This code would need to be optimized for +production use. It is blockingm, non-caching, and generally, not +designed for scale. It would be a few days work to make this code +scalable. + +It doesn't do auth/auth, but that could be handled in the calling code +via a middleware or decorator, or in the web server. + +It would be nice to have more graceful error handling. There are +files like media and 3rd party libraries which we need to serve up +too. Some kind of callback? +''' + +import mimetypes +import os.path + +import aiohttp.web + +import gitserve.gitaccess + + +WARNED = False + + +def git_handler_wrapper( + repo, + cookie_prefix="", + prefix="", + bare=True, + working_tree_dev=False +): + ''' + Returns a handler which can serve files from a git repo, from + different branches. This should obviously only be used with + non-private repos. It also sets a cookie with the hash from git, + so it's nice for science replicability. If we're serving data + for a coglab, we can record which version we served from. + + Parameters: + repo: git URL of the repo + cookie_prefix: + ''' + repo = gitserve.gitaccess.GitRepo(repo, bare=bare) + + def git_handler(request): + global WARNED + branch = request.match_info['branch'] + if working_tree_dev: + branch = gitserve.gitaccess.WORKING_DIR + if not WARNED: + print("Serving from working tree. " + "This should not be used in prod.") + WARNED = True + filename = os.path.join(prefix, request.match_info['filename']) + body = repo.show(branch, filename) + mimetype = mimetypes.guess_type(filename)[0] + if mimetype is None: + mimetype = "text/plain" + + if mimetype.startswith("text/"): + response = aiohttp.web.Response( + text=body.decode('utf-8'), + content_type=mimetype + ) + else: + response = aiohttp.web.Response( + body=body, + content_type=mimetype + ) + response.set_cookie( + cookie_prefix + "githash", + repo.rev_hash(branch) + ) + return response + return git_handler diff --git a/gitserve/gitserve/gitaccess.py b/gitserve/gitserve/gitaccess.py new file mode 100644 index 000000000..c5331691b --- /dev/null +++ b/gitserve/gitserve/gitaccess.py @@ -0,0 +1,181 @@ +'''This is a small library to allow us to browse git repos. 
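Illustrative usage (a sketch only; the repository path is a placeholder, and
this simply mirrors the `__main__` smoke test at the bottom of this file):

    repo = GitRepo("/path/to/repo")              # wraps /path/to/repo/.git
    print(repo.branches())                       # list local branches
    readme = repo.show(repo.branches()[-1], "README.md")
    print(readme.decode("utf-8"))                # show() returns bytes
    print(repo.rev_hash(repo.branches()[-1]))    # commit hash for that branch
    print(repo.show(WORKING_DIR, "README.md"))   # working tree; development only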
+ +Next steps: + +* It's a little bit over-conservative in terms of sanitizing parameters + to git. We should be more precise so we allow more filenames and branch + names. +* We should really browse the git repo directly, without subprocess. I + mean, really? +* Perhaps add more git commands? +''' + +import enum +import os.path +import string +import subprocess +import sys + +import pathvalidate + + +# Special tag used to browse files in the working directory. This is +# helpful during **DEVELOPMENT**. This may be dangerous in production, +# since working directories sometimes have random files sitting +# around. We use an enum so that users explicitly need to ask to use +# this feature. + +WORKING_DIR = enum.Enum("Special Branches", "Working").Working + + +class FileExists(Exception): + ''' + For now, raised when cloning a repo to a location which exists. + ''' + + +def sanitize(filename): + ''' + Confirm that a filename is valid for using in a shell command to + pull a file out of git. + + We'll be overly-conservative. We don't want security exploits. At + some point, this should be made more narrow, but since we're calling + into shell code in this version, we're super-careful + + ''' + valid_characters = "-_/." + string.ascii_letters + string.digits + newname = "".join(c for c in filename if c in valid_characters) + newname = pathvalidate.sanitize_filepath( + newname, + platform='Linux', normalize=True + ) + if newname.startswith("/"): + raise ValueError("Suspicious operation: String starts with /") + if newname.startswith("-"): + raise ValueError("Suspicious operation: String starts with -") + if ".." in newname: + raise ValueError("Suspicious operation: String contains ..") + if newname != filename: + raise ValueError("Suspicious operation: Sanitized string != original") + return newname + + +class GitRepo: + ''' + Class for managing a git repo. We could make this functional, but + OO is helpful in case we ever want to e.g. use remote git repos. In + those cases, we may want to e.g. maintain open ssh connections and + whatnot. It can also help with caching. + ''' + def __init__(self, gitdir, bare=False): + ''' + We store where the repo is, and return an object we can + use to browse it. + ''' + self.bare = bare + self.workingdir = None + # We should probably store the working dir too, for + # non-bare directories. We'll add that once we need + # it. + if not gitdir.endswith("/.git"): + self.gitdir = os.path.join(gitdir, ".git") + else: + self.gitdir = gitdir + self.workingdir = self.gitdir[:-4] + + def clone(self, url, mirror=False): + ''' + Clone the repo. Hopefully raise an exception if it already exists. + ''' + if os.path.exists(self.gitdir): + raise FileExists() + options = "" + if mirror: + options += "--mirror" + command = "git clone {options} {url} {path}".format( + options=options, + url=url, + path=self.gitdir + ) + # TODO: Test error handling. + # *Should* raise a subprocess.CalledProcessError on failure. + return subprocess.check_output(command, shell=True).decode('utf-8') + + def branches(self): + ''' + Return a list of all local branches in the repo + ''' + command = "git --git-dir={gitdir} branch".format( + gitdir=self.gitdir + ) + try: + branch_list = subprocess.check_output( + command, shell=True + ).decode('utf-8').split('\n') + except subprocess.CalledProcessError: + # This is a bit redundant, since most of this is in the exception, + # but it's helpful for quickly fixing config issues. + print("Failed to git branch. 
Command:") + print(command) + print("in gitaccess.py") + raise + branch_list = [b.replace('*', '').strip() for b in branch_list] + branch_list = [b for b in branch_list if b != ''] + return branch_list + + def show(self, branch, filename): + ''' + Return the contents of a file in the repo + + Note that this is not UTF8, and needs to be decoded. + ''' + if branch != WORKING_DIR: + sanitized_branch = sanitize(branch) + sanitized_filename = sanitize(filename) + if branch in self.branches(): + data = subprocess.check_output( + "git --git-dir={gitdir} show {branch}:{filename}".format( + gitdir=self.gitdir, + branch=sanitized_branch, + filename=sanitized_filename + ), shell=True + ) + elif branch == WORKING_DIR: + data = open( + os.path.join(self.workingdir, sanitized_filename), + "rb" + ).read() + else: + raise ValueError("No such branch") + return data + + def rev_hash(self, branch): + ''' + Return the git hash of a branch. + ''' + if branch == WORKING_DIR: + data = "[NO_HASH_WORKING_TREE]" + else: + sanitized_branch = sanitize(branch) + data = subprocess.check_output( + "git --git-dir={gitdir} rev-parse {branch}".format( + gitdir=self.gitdir, + branch=sanitized_branch + ), + shell=True + ).decode('utf-8').strip() + if not all(c in string.hexdigits for c in data): + raise ValueError("Not a valid branch / hash") + + return data.strip() + + +# Simple test case. Show README.md from a repo specified on the command line. +if __name__ == "__main__": + repo = GitRepo(sys.argv[1]) + branches = repo.branches() + print(branches) + print(repo.show(branches[-1], 'README.md')) + print(repo.show(WORKING_DIR, 'README.md')) + print(repo.rev_hash(branches[-1])) diff --git a/gitserve/gitserve/test_server.py b/gitserve/gitserve/test_server.py new file mode 100644 index 000000000..b4bdc2430 --- /dev/null +++ b/gitserve/gitserve/test_server.py @@ -0,0 +1,47 @@ +''' +Minimal test server. Serves up a git repo passed on the command line. + +Usage: + +>>> python test_server /home/ubuntu/repo/ +''' + +import sys + +import asyncio +import aiohttp.web + +import gitserve.aio_gitserve + +if len(sys.argv) == 1: + print("Usage:") + print(" python test_server.py /home/ubuntu/repo") + sys.exit(-1) + +gitrepo = sys.argv[1] +if len(sys.argv) > 2: + PREFIX = sys.argv[2] +else: + PREFIX = "" + +if "--working" in sys.argv: + WORKING = True +else: + WORKING = False + +loop = asyncio.get_event_loop() +app = aiohttp.web.Application(loop=loop) +app.router.add_get( + '/', + lambda request: aiohttp.web.Response(text='Test / example server!') +) +app.router.add_get( + r'/browse/{branch:[^{}/]+}/{filename:[^{}]+}', + gitserve.aio_gitserve.git_handler_wrapper( + gitrepo, + prefix=PREFIX, + cookie_prefix="content_", + working_tree_dev=WORKING + ) +) +aiohttp.web.run_app(app, host='127.0.0.1', port=8080) diff --git a/gitserve/setup.py b/gitserve/setup.py new file mode 100644 index 000000000..49553bc6c --- /dev/null +++ b/gitserve/setup.py @@ -0,0 +1,10 @@ +''' +Rather minimalistic install script. 
To install, run `python +setup.py develop` or just install via requirements.txt +''' + +from setuptools import setup, find_packages + +setup( + name="gitserve" +) diff --git a/gitserve/test.sh b/gitserve/test.sh new file mode 100755 index 000000000..cb1d0df9f --- /dev/null +++ b/gitserve/test.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python gitserve/test_server.py $1 diff --git a/learning_observer/docs/Makefile b/learning_observer/docs/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/learning_observer/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/learning_observer/docs/adapters.md b/learning_observer/docs/adapters.md new file mode 100644 index 000000000..e51b2088b --- /dev/null +++ b/learning_observer/docs/adapters.md @@ -0,0 +1,50 @@ +Format Adapters +=============== + +We would like to be able to take in data from a diversity of formats, such as: + +* IMS Caliper +* TinCan/xAPI +* edX events +* Writing Observer +* Various sorts of ad-hoc formats + +In addition, we would like to be transparently backwards-compatible with +older versions of formats. + +We don't yet know the best way to architect this, but on a high level, this +directory is where we'll put in adapters. Our goal is to have something along +the lines of: + +```mermaid +graph TD + Q[\ /] --> A + A[Source] --> B{Router} + A --> Z[(Archive)] + B -.-> C(Tincan Adapter) + B -.-> D(xAPI Adapter) + B --> E(V0 Adapter) + E --> F(V1 Adapter) + F --> G{{Student Event Pipeline}} +``` + +Our goal is to have source events archived in the source format, so as not to +lose information. However, individual reducers and event pipelines should not +be responsible for maintaining backwards-compatibility. + +Experiences / thoughts: + +* The majority of changes are minor. For example, in an early version of this + system, we had inconsistent use of dashes versus underscores. Eventually, + we'd like to adopt more terminology from formats like Tincan and xAPI. In + Open edX, there was an early breaking change due to a timestamp format + issue. These sorts of changes are best handled transparently. +* In most cases, aside from a small number of high-level features (such as + time-on-task or event count), most analytics need to be customized to each + source. However, it's still helpful to use common formats and terminology + where convenient. +* We'd like some kind of explicit versioning. How we do that is TBD. +* It's nice if old events just work. The migrations can stick around. Ideally, + they'll only run on old events, so as not to impact performance. +* We'll need migrations on other parts of the system too (e.g. reduced data + in the KVS, etc.). That's TBD. 
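To make the pipeline above concrete, here is a minimal, self-contained sketch
of the kind of adapter chain this document describes. The dash-to-underscore
rename mirrors the `FIELD_RENAMES` transformer in
`learning_observer/adapters/adapter.py` (added elsewhere in this patch); the
tiny field list and the chaining function here are illustrative
simplifications, not the module's actual API.

```python
# Sketch of a chain of event adapters. Each migration is a pure function that
# takes an event dict and returns a canonicalized event dict.

FIELD_RENAMES = {
    "event-type": "event_type",   # early events used dashes...
    "wa-source": "wa_source",     # ...reducers expect underscores
    "user-tag": "user_tag",
}


def dash_to_underscore(event):
    """Rename top-level dashed keys to their underscore equivalents."""
    return {FIELD_RENAMES.get(key, key): value for key, value in event.items()}


# Oldest migrations first; new-format events pass through unchanged.
COMMON_TRANSFORMERS = [dash_to_underscore]


def canonicalize(event):
    """Run an incoming event through every adapter in order."""
    for transform in COMMON_TRANSFORMERS:
        event = transform(event)
    return event


print(canonicalize({"event-type": "keystroke", "wa-source": "client-page"}))
# {'event_type': 'keystroke', 'wa_source': 'client-page'}
```

Keeping each migration as a pure function means old events keep working: they
are rewritten on the way in, while events already in the current format pass
through the chain untouched.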
\ No newline at end of file diff --git a/learning_observer/docs/conf.py b/learning_observer/docs/conf.py new file mode 100644 index 000000000..8f507aad5 --- /dev/null +++ b/learning_observer/docs/conf.py @@ -0,0 +1,53 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('..')) + + +# -- Project information ----------------------------------------------------- + +project = 'Learning Observer and Writing Observer' +copyright = '2021, ETS' +author = 'Piotr Mitros and Team' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc" +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/learning_observer/docs/index.rst b/learning_observer/docs/index.rst new file mode 100644 index 000000000..9ec204f6d --- /dev/null +++ b/learning_observer/docs/index.rst @@ -0,0 +1,71 @@ +Welcome to Learning Observer and Writing Observer's documentation! +================================================================== + +This is a prototype. This file was autogenerated with:: + + ls ../learning_observer/*py |sed -e s/..\\/learning_observer\\//..\ automodule::\ learning_observer./g + +And then replacing `.py` with `\n :members:`. It's not too usable. + +To make this usable, we'll want multiple files. We'll also want to clean up +code we run on import. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + +.. automodule:: learning_observer.admin + :members: +.. automodule:: learning_observer.auth_handlers + :members: +.. automodule:: learning_observer.authutils + :members: +.. automodule:: learning_observer.client_config + :members: +.. automodule:: learning_observer.dashboard + :members: +.. automodule:: learning_observer.exceptions + :members: +.. automodule:: learning_observer.filesystem_state + :members: +.. automodule:: learning_observer.incoming_student_event + :members: +.. automodule:: learning_observer.init + :members: +.. automodule:: learning_observer.kvs + :members: +.. 
automodule:: learning_observer.log_event + :members: +.. automodule:: learning_observer.__main__ + :members: +.. automodule:: learning_observer.main + :members: +.. automodule:: learning_observer.module_loader + :members: +.. automodule:: learning_observer.module + :members: +.. automodule:: learning_observer.paths + :members: +.. automodule:: learning_observer.restream + :members: +.. automodule:: learning_observer.rosters + :members: +.. automodule:: learning_observer.run + :members: +.. automodule:: learning_observer.settings + :members: +.. automodule:: learning_observer.stream_writing + :members: +.. automodule:: learning_observer.synthetic_student_data + :members: +.. automodule:: learning_observer.util + :members: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/learning_observer/docs/make.bat b/learning_observer/docs/make.bat new file mode 100644 index 000000000..2119f5109 --- /dev/null +++ b/learning_observer/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/learning_observer/learning_observer/Makefile b/learning_observer/learning_observer/Makefile new file mode 100644 index 000000000..0b5c4e1f4 --- /dev/null +++ b/learning_observer/learning_observer/Makefile @@ -0,0 +1,19 @@ +# Bundle up all the JavaScript into a single file +# +# We don't shrink / minify the file yet. This requires more RAM than a +# nano AWS instance can handle. If we want to do this, remove +# `optimize=none` + +# We're doing something ugly and horrible with paths, but we just +# wanted minimum viable / thin red line with all the pieces in +# place. We'll clean up paths at some point. + +# To use this bundle, switch from webapp to webapp-built. + +js-bundle: + rm -f static/webapp-built.js # Remove the old file, if it exists + rm -f static/static # Remove hack, if left over from last run + ln -s ../static static/static # Hack to make paths work. TODO: Fix. 
+ node static/3rd_party/r.js -o build.js optimize=none # Actual build + rm -f static/static # Remove hack + mv webapp-built.js static # And our final file diff --git a/learning_observer/learning_observer/__init__.py b/learning_observer/learning_observer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/learning_observer/learning_observer/__main__.py b/learning_observer/learning_observer/__main__.py new file mode 100644 index 000000000..cbbd9fbe7 --- /dev/null +++ b/learning_observer/learning_observer/__main__.py @@ -0,0 +1,7 @@ +''' +Thin wrapper around main, so we can run this by writing: +`python learning_observer` +''' + +# pylint: disable=W0611 +import learning_observer.main diff --git a/learning_observer/learning_observer/adapters/README.md b/learning_observer/learning_observer/adapters/README.md new file mode 120000 index 000000000..59467070c --- /dev/null +++ b/learning_observer/learning_observer/adapters/README.md @@ -0,0 +1 @@ +../../docs/adapters.md \ No newline at end of file diff --git a/learning_observer/learning_observer/adapters/__init__.py b/learning_observer/learning_observer/adapters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/learning_observer/learning_observer/adapters/adapter.py b/learning_observer/learning_observer/adapters/adapter.py new file mode 100644 index 000000000..aacf8f12f --- /dev/null +++ b/learning_observer/learning_observer/adapters/adapter.py @@ -0,0 +1,109 @@ +''' +The Adapter class is designed to transform the data from the source into the +data that the reducers expect. + +A lot of this is TBD. See the README file for more details. + +There's not too much here. It's a placeholder to get the architectural pieces +in the right place. +''' + +import learning_observer.adapters.helpers + + +# In the original code, we had dashes. We want underscores. This is a mapping +# from the old to the new, not just in events, but also in the KVS (but not +# in the config file). +FIELD_RENAMES = { + "event-type": "event_type", + "org.mitros.writing-analytics": "org.mitros.writing_analytics", + "teacher-tag": "teacher_tag", + "user-tag": "user_tag", + "process-server": "process_server", + "unique-id": "unique_id", + "generated-id": "generated_id", + "local-storage": "local_storage", + "wa-source": "wa_source", + "background-page": "background_page", + "client-page": "client_page", + "stream-test-script": "stream_test_script", + "character-count": "character_count", + "total-time-on-task": "total_time_on_task", + "summary-stats": "summary_stats", + "student-data": "student_data", + "org.mitros.dynamic-assessment": "org.mitros.dynamic_assessment", + "da-guest": "da_guest", +} + + +def dash_to_underscore(event): + ''' + Convert dashes in events from the alpha version to underscores. + ''' + event = learning_observer.adapters.helpers.rename_json_keys( + event, FIELD_RENAMES + ) + + if 'client' in event and 'source' in event['client']: + event['client']['source'] = event['client']['source'].replace('-', '_') + if 'source' in event: + event['source'] = event['source'].replace('-', '_') + + return event + + +common_transformers = [ + dash_to_underscore +] + + +class EventAdapter: + def __init__(self, metadata=None): + self.metadata = metadata + + def canonicalize_event(self, event): + ''' + Transform the event into the format that the reducers expect. + + This may modify the event in place. + + This is a lousy API since we may want to split and re-combine + events. 
At some point, we'll want to figure out how to combine + `async` and `yield` to do this right. + ''' + for transformer in common_transformers: + event = transformer(event) + return event + + def set_metadata(self, metadata): + ''' + Set the metadata for the adapter. + + Not implemented, but a placeholder for how the API will work. + ''' + self.metadata = metadata + raise NotImplementedError() + + +"""" +We probably want something like: + + async def transform_events(self, events): + ''' + Transform a list of events. + + This is a generator. + ''' + async for event in events: + yield self.canonicalize_event(event) + +And to use this as: + +ws = aiohttp.web.WebSocketResponse() +json_events = decode_json_events(ws) +adapted_events = adapter.transform_events(json_events) +... + +I'm not sure if the above syntax is right; we will try this once we have a +working baseline version. +""" diff --git a/learning_observer/learning_observer/adapters/helpers.py b/learning_observer/learning_observer/adapters/helpers.py new file mode 100644 index 000000000..5a311168f --- /dev/null +++ b/learning_observer/learning_observer/adapters/helpers.py @@ -0,0 +1,95 @@ +''' +This is a module which can migrate json events from one naming convention to +another. +''' + + +import json + + +def rename_json_keys(source, replacements): + ''' + Rename the keys in a json object using a dictionary of replacements. + + The replacements dictionary maps keys to new keys. + + The source object is transformed in place. + + >>> source = { + ... "event-type": "blog", + ... "writing-log": "foobar", + } + >>> replacements = { + ... "event-type": "event_type", + ... "writing-log": "writing_log", + ... } + >>> rename_json_keys(source, replacements) + { + "event_type": "blog", + "writing_log": "foobar", + } + ''' + if isinstance(source, dict): + for key, value in list(source.items()): + if key in replacements: + source[replacements[key]] = source.pop(key) + rename_json_keys(value, replacements) + elif isinstance(source, list): + for item in source: + rename_json_keys(item, replacements) + return source + + +# Write a test case for rename_json_keys. 
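A minimal usage sketch for `rename_json_keys`, relying only on the behavior shown above (in-place renaming, with recursion into nested dicts and lists); the import path assumes the module lives at `learning_observer/adapters/helpers.py` as in this diff:

```python
# Usage sketch; adjust the import if running against a standalone copy of helpers.py.
from learning_observer.adapters.helpers import rename_json_keys

event = {
    "event-type": "keystroke",
    "event-data": {"writing-log": "foobar"},
}
replacements = {
    "event-type": "event_type",
    "writing-log": "writing_log",
}

rename_json_keys(event, replacements)  # modifies `event` in place and returns it

assert event == {
    "event_type": "keystroke",
    "event-data": {"writing_log": "foobar"},
}
```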
+def dict_compare(d1, d2): + s1 = json.dumps(d1, sort_keys=True) + s2 = json.dumps(d2, sort_keys=True) + if False: # Turn on for debugging + print(s1) + print(s2) + return s1 == s2 + + +def test_rename_json_keys(): + replacements = { + "event-type": "event_type", + "writing-log": "writing_log", + } + + data = { + "event-type": "blog", + "writing-log": "foobar", + "log-level": "info", + "text": "The old man and the sea", + "timestamp": "yesterday", + "event-data": { + "event-type": "log", + "writing-log": "foobar", + "log-level": "info", + "text": "The old man and the sea", + "event-time": "tomorrow" + } + } + + desired_output = { + "event_type": "blog", + "writing_log": "foobar", + "log-level": "info", + "text": "The old man and the sea", + "timestamp": "yesterday", + "event-data": { + "event_type": "log", + "writing_log": "foobar", + "log-level": "info", + "text": "The old man and the sea", + "event-time": "tomorrow" + } + } + + transformed_data = rename_json_keys(data, replacements) + + assert dict_compare(transformed_data, desired_output) + + +if __name__ == '__main__': + test_transform_json() diff --git a/learning_observer/learning_observer/admin.py b/learning_observer/learning_observer/admin.py new file mode 100644 index 000000000..b5b75eeba --- /dev/null +++ b/learning_observer/learning_observer/admin.py @@ -0,0 +1,131 @@ +''' +Administrative Views +==================== + +Views for monitoring overall system operation, and eventually, for +administering the system. +''' +import copy +import numbers +import psutil +import sys + +import aiohttp +import aiohttp.web + +import dash.development.base_component + +import learning_observer.module_loader +from learning_observer.log_event import debug_log + +from learning_observer.auth.utils import admin + + +def machine_resources(): + ''' + A dictionary of information about memory, CPU, etc. usage + ''' + mountpoints = [p.mountpoint for p in psutil.disk_partitions()] + disk_space = {p: psutil.disk_usage(p).percent for p in mountpoints} + + return { + "usage": { + "cpu_percent": psutil.cpu_percent(), + "virtual_mem": psutil.virtual_memory().percent, + "swap_memory": psutil.swap_memory().percent, + "disk_space": disk_space + } + } + + +@admin +async def system_status(request): + ''' + View for a system status screen. This shows: + - Loaded modules + - Available URLs + - System resource usage + + This returns JSON, which renders very nicely in Firefox, but might + be handled by a client-side app at some point. If that happens, we + might change the API a bit to make it more computer-friendly and + less Firefox-friendly. + ''' + def routes(app): + ''' + A list of routes. We compactify this quite a bit for pretty + rendering in Firefox. If a client ever handles this, we might + want to standardize this a bit more, though (it can return + strings and dictionaries right now). 
+ ''' + resources = [] + for resource in app.router.resources(): + info = resource.get_info() + if 'path' in info: + resources.append(info['path']) + elif 'formatter' in info: + resources.append(info['formatter']) + else: + sinfo = {} + for key in info: + sinfo[key] = str(info[key]) + resources.append(sinfo) + return resources + + def clean_json(json_object): + ''' + * Deep copy a JSON object + * Convert list-like objects to lists + * Convert dictionary-like objects to dicts + * Convert functions to string representations + ''' + if isinstance(json_object, str): + return str(json_object) + if isinstance(json_object, numbers.Number): + return json_object + if isinstance(json_object, dict): + return {key: clean_json(value) for key, value in json_object.items()} + if isinstance(json_object, list): + return [clean_json(i) for i in json_object] + if isinstance(json_object, learning_observer.stream_analytics.fields.Scope): + # We could make a nicer representation.... + return str(json_object) + if callable(json_object): + return str(json_object) + if json_object is None: + return json_object + if str(type(json_object)) == "": + return str(json_object) + if isinstance(json_object, dash.development.base_component.Component): + return f"Dash Component {json_object}" + raise ValueError("We don't yet handle this type in clean_json: {} (object: {})".format(type(json_object), json_object)) + + status = { + "status": "Alive!", + "resources": machine_resources(), + "modules": { + "course_aggregators": clean_json(learning_observer.module_loader.course_aggregators()), + "reducers": clean_json(learning_observer.module_loader.reducers()), + "static_repos": learning_observer.module_loader.static_repos(), + "dash_pages": clean_json(learning_observer.module_loader.dash_pages()) + }, + "routes": routes(request.app) + } + + debug_log(status) + + return aiohttp.web.json_response(status) + + +@admin +async def die(request): + ''' + Shut down the server. + + TODO: Replace this with a clean shutdown which closes all sockets, + etc. But this still beats killing the process. + ''' + sys.exit(-1) + return aiohttp.web.json_response({ + 'status': 'dead' # Just like this code :) + }) diff --git a/learning_observer/learning_observer/all_students_roster.py b/learning_observer/learning_observer/all_students_roster.py new file mode 100644 index 000000000..43793da29 --- /dev/null +++ b/learning_observer/learning_observer/all_students_roster.py @@ -0,0 +1,45 @@ +''' +This creates a roster with all students in Redis +''' + +import learning_observer.kvs + +from learning_observer.log_event import debug_log + + +async def all_students(): + ''' + This crawls the keys of the KVS, and creates a list of all + student IDs in the KVS. 
+ ''' + keys = await learning_observer.kvs.KVS.keys() + internal_keys = [k for k in keys if k.startswith("Internal:")] + split_keys = [k.split(":") for k in internal_keys] + valid_keys = [k for k in split_keys if len(k) > 2] + user_ids = sorted(set([k[2] for k in valid_keys])) + debug_log(user_ids) + return user_ids + + +async def all_students_course_list(): + return [ + { + "id": "12345678901", + "name": "All Students", + "descriptionHeading": "For easy small-scale deploys", + "alternateLink": "https://www.ets.org/", + "teacherGroupEmail": "", + "courseGroupEmail": "", + "teacherFolder": { + "id": "", + "title": "All Students", + "alternateLink": "" + }, + "calendarId": "NA" + } + ] + + +if __name__ == '__main__': + loop = asyncio.get_event_loop() + loop.run_until_complete(all_students_roster()) diff --git a/learning_observer/learning_observer/auth/__init__.py b/learning_observer/learning_observer/auth/__init__.py new file mode 100644 index 000000000..264a4f2a8 --- /dev/null +++ b/learning_observer/learning_observer/auth/__init__.py @@ -0,0 +1,110 @@ +'''Authorization / authentication subsystem. + +Our goal is to keep things simple. We would like a few types of accounts: + +- Student (guest) +- Student (authorized) +- Teacher +- System admin +- In the future, researchers + +We really don't want this blowing up into massive ACLs and +what-not. There is a broad set of use-cases for Learning Observer, +including: + +* Web pages on the internet with no log-ins, and per-session "accounts" +* Small-scale deploys, with config in flat files for individual classes +* Research coglabs and studies +* Large-scale deploys integrated with school subsystems through single + sign-on + +Currently, we're developing the system to handle all of these +use-cases through configuration (e.g. different deploy for each of +these). Eventually, we'd like to handle all of these in one common +deploy, so we can cross-link, aggregate, and understand what's going +on across contexts, but there's a lot of architecture and planning work +to get there. + +It's worth noting that there are two types of security: + +1) **Students injecting data into the system**. Here, in many + use-cases, lack of auth is low-stakes. The worst-case outcome in, + for example, a cog lab, is a DoS attack. Stakes only go up if data + is used for e.g. decisionmaking. For this reason, student auth/auth + supports modes which are pretty lax. On the other hand, in many + contexts, we won't have good data (e.g. open web pages without + sign-ins). + +2) **Access to student data**. For authenticating teachers and sys + admins, we want full paranoia. + +We do want to be aware of corner-cases (e.g. students wanting access +to their own data). + +We haven't figured out all of the data models here. +''' + +import sys + +# Decorators to confirm requests are authenticated +# +# We might consider breaking these out into AJAXy ones, which return +# an error object, and HTMLy ones, which take users to a log-in page. +# e.g. 
@admin_ajax @admin_html, or @admin(type=ajax) +from learning_observer.auth.utils import admin +from learning_observer.auth.utils import teacher + +# Utility functions +from learning_observer.auth.utils import fernet_key +from learning_observer.auth.utils import google_id_to_user_id + +# Utility handlers +from learning_observer.auth.handlers import logout_handler +from learning_observer.auth.handlers import user_info_handler + +from learning_observer.auth.handlers import auth_middleware + +# Specific authentication schemes +from learning_observer.auth.social_sso import social_handler +from learning_observer.auth.password import password_auth + +# Code below does sanity checks on configuration +# +# Importing settings isn't perfect, since this should not depend on learning_observer, +# but it's better than the alternatives +import learning_observer.prestartup +import learning_observer.settings as settings + + +@learning_observer.prestartup.register_startup_check +def verify_auth_precheck(): + ''' + This is a pre-startup check to make sure that the auth system is configured + correctly. + ''' + # We need some auth + if 'auth' not in settings.settings: + raise learning_observer.prestartup.StartupCheck( + "Please configure auth") + + # If we have Google oauth, we need it properly configured. + # TODO: Confirm everything works with Google Oauth missing + if 'google_oauth' in settings.settings['auth']: + if 'web' not in settings.settings['auth']['google_oauth'] or \ + 'client_secret' not in settings.settings['auth']['google_oauth']['web'] or \ + 'project_id' not in settings.settings['auth']['google_oauth']['web'] or \ + 'client_id' not in settings.settings['auth']['google_oauth']['web'] or \ + isinstance(settings.settings['auth']['google_oauth']['web']['client_secret'], dict) or \ + isinstance(settings.settings['auth']['google_oauth']['web']['project_id'], dict) or \ + isinstance(settings.settings['auth']['google_oauth']['web']['client_id'], dict): + error = \ + "Please configure (or disable) Google oauth\n" + \ + "\n" + \ + "Go to:\n" + \ + " https://console.developers.google.com/ \n" + \ + "And set up an OAuth client for a web application. Make sure that configuration\n" + \ + "mirrors the one here.\n" + \ + "\n" + \ + "If you are not planning to use Google auth (which is the case for most dev\n" + \ + "settings), please disable Google authentication in creds.yaml" + raise learning_observer.prestartup.StartupCheck(error) diff --git a/learning_observer/learning_observer/auth/auth.md b/learning_observer/learning_observer/auth/auth.md new file mode 120000 index 000000000..f38fd6889 --- /dev/null +++ b/learning_observer/learning_observer/auth/auth.md @@ -0,0 +1 @@ +../../../docs/auth.md \ No newline at end of file diff --git a/learning_observer/learning_observer/auth/events.py b/learning_observer/learning_observer/auth/events.py new file mode 100644 index 000000000..df7be787b --- /dev/null +++ b/learning_observer/learning_observer/auth/events.py @@ -0,0 +1,365 @@ +''' +We would like to have multiple means of assigning identity to incoming student events: + +- Assigned per-connection +- Header in the events, unverified (e.g. for use during a coglab, + during quasi-anonymous browser sessions) +- Header in the events, verified from a password file +- HTTP basic auth +- Google oauth +- Etc. + +One piece of nuance: + +- Some schemes will have a header sent once per connection, with no + student data (view header, then discard) +- Some schemes will include identity with each event. 
+ +We're still figuring out the best ways to do this. + +Some of these code paths are untested. Please test and debug before using. +''' + +import asyncio +import urllib.parse +import secrets +import sys + +import aiohttp_session +import aiohttp.web + +import learning_observer.paths +import learning_observer.prestartup +import learning_observer.settings + +import learning_observer.auth.http_basic + +from learning_observer.log_event import debug_log + +AUTH_METHODS = {} + + +def register_event_auth(name): + ''' + Decorator to register a method to authenticate events + ''' + def wrapper(f): + ''' + The decorator does not change the function + ''' + AUTH_METHODS[name] = f + return f + return wrapper + + +def find_event(event_type, event_list): + ''' + Find the first event of type `event` in the `event_list` + + Return `None` if no event found. + + >>> find_event('this-one', [{'event': 'not-this-one'}, {'event': 'not-this-one'}, {'event': 'this-one'}]) + {'event': 'this-one'} + >>> find_event('missing-event', [{'event': 'not-this-one'}, {'event': 'not-this-one'}, {'event': 'this-one'}]) + ''' + for e in event_list: + if e.get('event', None) == event_type: + return e + return None + + +def encode_id(source, unsafe_id): + ''' + This is a bit of encoding logic to generically encode IDs from + unknown sources. We want to avoid the problem of Little Bobby + Tables (https://xkcd.com/327/). + + It's not clear this is needed long-term (we put this in when we + were using Google emails rather than numeric IDs), but we're + keeping it here for now for the test data sources. This just + generically sanitizes everything in case we either missed + something above, or just want to have a sane default before + implementing something fancy. + + We also want to avoid overlapping UIDs between sources. For + example, we don't want an attack where e.g. a user carefully + creates an account on one auth provide to collide with a + pre-existing account on another auth provider. So we append + providence. Note that we don't want to do this twice (so + `authutils` does this already for Google) + + >>> encode_id("gcu", "1234; DROP TABLE *") + 'gcu-1234%3B+DROP+TABLE+%2A' + ''' + return "{source}-{uid}".format( + source=source, + uid=urllib.parse.quote_plus( + unsafe_id, + safe='@' # Keep emails more readable + ) + ) + + +def token_authorize_user(auth_method, user_id_token): + ''' + Authorize a user based on a list of allowed user ID tokens + ''' + am_settings = learning_observer.settings.settings['event_auth'][auth_method] + if 'userfile' in am_settings: + userfile = am_settings['userfile'] + users = [u.strip() for u in open(learning_observer.paths.data(userfile)).readlines()] + if user_id_token in users: + return "authenticated" + if am_settings.get("allow_guest", False): + return "unauthenticated" + raise aiohttp.web.HTTPUnauthorized() + + +@register_event_auth("http_basic") +async def basic_auth(request, headers, first_event, source): + ''' + Authenticate with HTTP Basic through nginx. + ''' + (username, password) = learning_observer.auth.http_basic.http_basic_extract_username_password(request) + print(f"Authenticated as {username}") + if username is None: + # nginx shouldn't pass requests without + # auth headers. We are logging, but + # with red flags; we don't want to lose + # data. 
In more secure settings, we + # might want to raise an exception + # instead + print("Event auth missing: This should never happen") + return { + 'sec': 'unauthorized', + 'user_id': 'guest', + 'providence': 'error' + } + return { + 'sec': 'authenticated', + 'user_id': username, + 'providence': 'nginx' + } + + +@register_event_auth("guest") +async def guest_auth(request, headers, first_event, source): + ''' + Guest users. + + We assign a cookie on first visit, but we have no guarantee + the browser will keep cookies around. + + >>> a = asyncio.run(guest_auth(TestRequest(), [], {}, 'org.mitros.test')) + >>> a['user_id'] = len(a['user_id']) # Different user_id each time, and we want doctest to match exact string. + >>> a + {'sec': 'none', 'user_id': 32, 'providence': 'guest'} + ''' + session = await aiohttp_session.get_session(request) + guest_id = session.get('guest_id', None) + if guest_id is None: + guest_id = secrets.token_hex(16) + session['guest_id'] = guest_id + return { + 'sec': 'none', + 'user_id': guest_id, + 'providence': 'guest' + } + + +@register_event_auth("local_storage") +async def local_storage_auth(request, headers, first_event, source): + ''' + This authentication method is used by the browser extension, based + on configuration options. Each Chromebook is given a unique ID + token, which is stored in local_storage. + + This can be authenticated (if we have a list of such tokens), + unauthenticated (if we don't), or allow for both, with a tag for + guest versus non-guest accounts. + + >>> auth_event = {'event': 'local_storage', 'user_tag': 'bob'} + >>> a = asyncio.run(local_storage_auth(TestRequest(), [], auth_event, 'org.mitros.test')) + >>> a + {'sec': 'authenticated', 'user_id': 'ls-bob', 'providence': 'ls'} + >>> auth_event['user_tag'] = 'jim' + >>> a = asyncio.run(local_storage_auth(TestRequest(), [auth_event], {}, 'org.mitros.test')) + >>> a + {'sec': 'unauthenticated', 'user_id': 'ls-jim', 'providence': 'ls'} + ''' + authdata = find_event('local_storage', headers + [first_event]) + + if authdata is None or 'user_tag' not in authdata: + return False + + user_id = "ls-" + authdata['user_tag'] + authenticated = token_authorize_user('local_storage', user_id) + + return { + 'sec': token_authorize_user('local_storage', user_id), + 'user_id': user_id, + 'providence': 'ls' # local storage + } + + +@register_event_auth("chromebook") +async def chromebook_auth(request, headers, first_event, source): + ''' + Authenticate student Chromebooks. + + TODO: We should have some way to do this securely -- to connect + the identity token to the Google ID. + TODO: See about client-side oauth on Chromebooks + ''' + authdata = find_event('chrome_identity', headers + [first_event]) + + if authdata is None or 'chrome_identity' not in authdata: + return False + + # If we have an auth key, we are authenticated! 
+ lsa = await local_storage_auth(request, headers, first_event, source) + + if lsa and lsa['sec'] == 'authenticated': + auth = 'authenticated' + else: + auth = 'unauthenticated' + + untrusted_google_id = authdata.get('chrome_identity', {}).get('id', None) + debug_log("untrusted_google_id", untrusted_google_id) + + if untrusted_google_id is None: + return False + + gc_uid = learning_observer.auth.utils.google_id_to_user_id(untrusted_google_id) + return { + 'sec': auth, + 'user_id': gc_uid, + 'safe_user_id': gc_uid, + 'providence': 'gcu' # Google Chrome, unauthenticated + } + + +@register_event_auth("hash_identify") +async def hash_identify(request, headers, first_event, source): + ''' + It's sometimes convenient to point folks to pages where the + user ID is encoded in the URL e.g. by hash: + + `http://myserver.ets.org/user-study-5/#user=zihan` + + This fails for even modest-scale use; even in an afterschool + club, experience shows that at least one child WILL mistype + a URL, either unintentionally or as a joke. + + But it is nice for one-offs where you're working directly + with a subject. + + This could be made better by providing an authenticated user + list. Then, it'd be okay for the math team example + ''' + authdata = find_event('hash_auth', headers + [first_event]) + debug_log("authdata", authdata) + + if authdata is None or 'hash' not in authdata: + return False + + return { + 'sec': 'unauthenticated', + 'user_id': "hi-" + authdata['hash'], + 'providence': 'mch' # Math contest hash -- toying with plug-in archicture + } + + +@register_event_auth("testcase_auth") +async def test_case_identify(request, headers, first_event, source): + ''' + This is for test cases. It's quick, easy, insecure, and shouldn't + be used in production. + ''' + authdata = find_event('test_framework_fake_identity', headers + [first_event]) + + if authdata is None or 'user_id' not in authdata: + return False + + return { + 'sec': "unauthenticated", + 'user_id': "testcase-" + authdata['user_id'], + 'providence': 'tc' + } + + +@register_event_auth("http_auth") +async def http_auth_identify(request, headers, first_event, source): + ''' + TODO: Allow events to be authorized by HTTP basic authentication + ''' + raise NotImplementedError("Not yet built; sorry") + + +async def authenticate(request, headers, first_event, source): + ''' + Authenticate an event stream. + + Parameters: + request: aio_http request object + headers: list of headers from event stream + first_event: first non-header event + source: where the events are coming from (e.g. `org.mitros.writing`) + + TODO: Allow configuring authentication methods based on event + type (e.g. require auth for writing, but not for dynamic assessment) + + Our thoughts are that the auth metadata ought to contain: + 1. Whether the user was authenticated (`sec` field): + * `authenticated` -- we trust who they are + * `unauthenticated` -- we think we know who they are, without security + * `guest` -- we don't know who they are + 2. Providence: How they were authenticated (if at all), or how we believe + they are who they are. + 3. 
`user_id` -- a unique user identifier + ''' + for auth_method in learning_observer.settings.settings['event_auth']: + auth_metadata = await AUTH_METHODS[auth_method](request, headers, first_event, source) + if auth_metadata: + if "safe_user_id" not in auth_metadata: + auth_metadata['safe_user_id'] = encode_id( + source=auth_metadata["providence"], + unsafe_id=auth_metadata['user_id'] + ) + return auth_metadata + + print("All authentication methods failed. Unauthorized.") + raise aiohttp.web.HTTPUnauthorized() + + +@learning_observer.prestartup.register_startup_check +def check_event_auth_config(): + ''' + Check that all event auth methods are correctly configured, + before events come in. + ''' + if 'event_auth' not in learning_observer.settings.settings: + raise learning_observer.prestartup.StartupCheck("Please configure event authentication") + for auth_method in learning_observer.settings.settings['event_auth']: + if auth_method not in AUTH_METHODS: + raise learning_observer.prestartup.StartupCheck( + "Please configure event authentication for {}\n(Methods: {})".format( + auth_method, + list(AUTH_METHODS.keys()) + )) + + +if __name__ == "__main__": + import doctest + print("Running tests") + + class TestRequest: + pass + + session = {} + + async def get_session(request): + return session + + aiohttp_session.get_session = get_session + doctest.testmod() diff --git a/learning_observer/learning_observer/auth/handlers.py b/learning_observer/learning_observer/auth/handlers.py new file mode 100644 index 000000000..51f6449fa --- /dev/null +++ b/learning_observer/learning_observer/auth/handlers.py @@ -0,0 +1,268 @@ +''' +This file should contain handlers and all the other aio_http stuff for auth/auth. + +We should give it a better name, since it also contains middlewares +''' + +import base64 +import json +import random + +import aiohttp +import aiohttp.web +import aiohttp_session + +import learning_observer.auth.utils +import learning_observer.auth.http_basic +import learning_observer.settings + +import learning_observer.graphics_helpers + +import names + + +async def logout_handler(request): + """ + Handles sign out. This is generic - does not depend on which + log-in method is used (password, social, etc.) + """ + session = await learning_observer.auth.utils.logout(request) + return aiohttp.web.HTTPFound("/") # TODO: Make a proper logout page + + +async def user_info_handler(request): + ''' + This is a handler which currently shows: + * Google user ID + * E-mail + * First and family name + * Google avatar + * And whether the user is authorized + + This is helpful for things like the little avatar when rendering the + page. + + TODO: Think through what info we want to give as we add authentication + methods. We don't want to leak data accidentally. + ''' + return aiohttp.web.json_response(request['user']) + + +async def user_from_session(request): + ''' + Get the user object from the session. + ''' + session = await aiohttp_session.get_session(request) + session_user = session.get('user', None) + if 'auth_headers' in session: + request['auth_headers'] = session['auth_headers'] + return session_user + + +async def test_case_user(request): + ''' + Return a test user, if we are in test mode + + This is a short circuit for test cases without logging in. 
+ THIS SHOULD NEVER BE ENABLED ON A LIVE SERVER + ''' + tci = learning_observer.settings.settings['auth'].get("test_case_insecure", False) + if not tci: + return None + if not isinstance(tci, dict): + tci = {} + + user_info = { + "name": tci.get("name", "Test Case"), + "picture": "testcase.jpg", + "authorized": True, + "google_id": 12345, + "email": "testcase@localhost" + } + await learning_observer.auth.utils.update_session_user_info(request, user_info) + return user_info + + +async def demo_user(request): + ''' + Return a demo user, if we are in demo mode + + This short circuits authentication for demos, cog-labs, development, etc. + + This should not be enabled on long-running live server, beyond spinning one up + for a demo or something. + + In contrast to the test case user, this assigns a dummy name and similar. That's + bad for testing, where we want determinism, but it's good for demos. + ''' + if not learning_observer.settings.settings['auth'].get("demo_insecure", False): + return None + + def name_to_email(name): + ''' + Convert a name to an email address. + + Args: + name (str): The name to convert. + + Returns: + str: The email address. + + Example: "John Doe" -> "jdoe@localhost" + ''' + + name = name.split() + return name[0][0].lower() + name[-1].lower() + "@localhost" + + demo_auth_setting = learning_observer.settings.settings['auth']["demo_insecure"] + if isinstance(demo_auth_setting, dict) and 'name' in demo_auth_setting: + name = demo_auth_setting['name'] + else: + name = names.get_full_name() + + user_info = { + "name": name, + "picture": "/auth/default-avatar.svg", + "authorized": True, + "google_id": random.randint(10000, 99999), + "email": name_to_email(name) + } + await learning_observer.auth.utils.update_session_user_info(request, user_info) + return user_info + + +async def set_user_info_cookie(request, response): + ''' + Set a cookie with the user's info. + + This is a helper function for the login handlers. + + This is *obsolete*, We now pass this through an AJAX call + from the client to get the user info. This is because we + found cookies were not working consistently on all browsers. + They're probable fine for deployment, but we ran into + heisenbugs on `localhost`. + + Args: + request (aiohttp.web.Request): The request object. + + Returns: + None + ''' + # This is a dumb way to sanitize data and pass it to the front-end. + # + # Cookies tend to get encoded and decoded in ad-hoc strings a lot, often + # in non-compliant ways (to see why, try to find the spec for cookies!) + # + # This avoids bugs (and, should the issue come up, security issues + # like injections) + # + # This should really be abstracted away into a library which passes state + # back-and-forth, but for now, this works. + session = await aiohttp_session.get_session(request) + session_user = session.get('user', None) + + response.set_cookie( + "userinfo", + base64.b64encode(json.dumps(session_user).encode('utf-8')).decode('utf-8') + ) + + +async def http_auth_user(request): + ''' + Authenticate a user by HTTP Basic Auth. + ''' + if not learning_observer.auth.http_basic.http_auth_middleware_enabled(): + return None + if not learning_observer.auth.http_basic.has_http_auth_headers(request): + return None + raise NotImplementedError( + "HTTP Basic Auth is not tested yet. Most of the code is there, but it\n" + "should be tested. Since this is a security issue, it should be\n" + "tested before we remove this exception." 
+ ) + request['auth_headers'] = session.get('auth_headers', None) + + +@aiohttp.web.middleware +async def auth_middleware(request, handler): + ''' + This is a middleware which: + + * Moves the user information into the request. + * Sets the user's info cookie. + * Handles authentication modes which don't require an explicit login. (e.g. + demo mode, test case mode, http basic auth) + + Save user into a cookie + ''' + user = None + + # This sets the order in which we check for user info + user_sources = [ + user_from_session, + http_auth_user, + test_case_user, + demo_user + ] + + for user_source in user_sources: + user = await user_source(request) + if user is not None: + break + + # If we didn't find a user, we're not authorized. We don't raise an error, + # because we want to allow the user to log in from the main page. We just + # don't want to allow them to access sensitive pages. + + request['user'] = user + resp = await handler(request) + + # We want to be able to e.g. show the user's name in the header on the + # page from the front-end, so we need to pass the user's info to the + # front-end. We do this by setting a cookie. + # + # This retrieves the user info from the session, since the user info + # might have changed in the request (in particular, if the user logged + # out) + await set_user_info_cookie(request, resp) + return resp + + +def serve_user_icon(request): + ''' + Serve a user's default icon: + * A user's icon if available. + * An SVG of initials if no other icon is available. + * A default icon if no other icon is available. + + Args: + request (aiohttp.web.Request): The request object. + + Returns: + aiohttp.web.Response: The response object. + ''' + + # Good idea once we have a good icon + # if request['user'] is None: + # return aiohttp.web.FileResponse( + # learning_observer.settings.settings['auth']['default_icon'] + # ) + + # In the future, we might want something along the lines of: + # if 'picture' in request['user']: + # return aiohttp.web.FileResponse( + # request['user']['picture'] + # ) + # We don't do this now -- we encode the URL and don't call this function + # if we have a picture -- since we often serve avatars from Google. + + user = request.get('user', {}) + if user is None: + user = {} + name = user.get('name', None) + + return aiohttp.web.Response( + body=learning_observer.graphics_helpers.default_user_icon(name), + content_type="image/svg+xml" + ) diff --git a/learning_observer/learning_observer/auth/http_basic.py b/learning_observer/learning_observer/auth/http_basic.py new file mode 100644 index 000000000..13896a815 --- /dev/null +++ b/learning_observer/learning_observer/auth/http_basic.py @@ -0,0 +1,192 @@ +''' +Handle HTTP basic authentication + +Curiously, the Wikipedia article is the best reference on this. + +We would like to support two modes of operation: + +1) Rely on nginx +2) Rely on our own password file + +For now, we only support #1. We do not verify passwords, and let +our web server do it for us. + +Well, technically, for now we support neither since this file is IN +DEVELOPMENT, and NOT YET WORKING. 
+''' +import base64 +import json +import yaml +import sys + +import bcrypt + +import aiohttp.web + +import learning_observer.settings +import learning_observer.prestartup + +from learning_observer.log_event import debug_log + + +def http_basic_extract_username_password(request): + ''' + Based on an HTTP request, return a username / password tuple + + Return `None` if missing + ''' + auth_header = request.headers.get('Authorization', None) + if auth_header is None: + return None + + if not auth_header.startswith("Basic "): + raise aiohttp.web.HTTPBadRequest("Malformed header authenticating.") + split_header = auth_header.split(" ") + if len(split_header) != 2: + raise aiohttp.web.HTTPBadRequest("Malformed header authenticating.") + decoded_header = base64.b64decode(split_header[1]).decode('utf-8') + (username, password) = decoded_header.split(":") + return (username, password) + + +def has_http_auth_headers(request): + ''' + Check if the request has HTTP basic auth headers + ''' + auth_header = request.headers.get('Authorization', None) + if auth_header is not None: + return True + return False + + +def http_auth_middleware_enabled(): + ''' + Check if the authentication middleware should be enabled. + + This should ONLY be set for sites where ALL pages use HTTP + auth. If we only use http_auth for the auth page, this should be + disabled. + + We may rely on nginx for http auth. We do NOT want the middleware + to accidentally receive requests with auth headers on pages which + nginx has not secured. + ''' + if 'http_basic' not in learning_observer.settings.settings['auth']: + return False + auth_basic_settings = learning_observer.settings.settings['auth']['http_basic'] + return auth_basic_settings.get("full_site_auth", False) + + +def http_auth_page_enabled(): + ''' + Check if the system has a dedicated HTTP basic authentication login + page configured in the settings file. This is typically used if we + want to work with multiple authentication schemes, including both http + basic and other schemes. If we only use HTTP basic auth, we don't need + this. + ''' + # Is http basic auth enabled? + if 'http_basic' not in learning_observer.settings.settings['auth']: + return False + auth_basic_settings = learning_observer.settings.settings['auth']['http_basic'] + # And is it configured with a dedicated login page? + if not auth_basic_settings.get("login_page_enabled", False): + return False + return True + + +def http_basic_auth_verify_password(request, filename): + ''' + Checks if a user is authorized, based on the filename of a + password file, and a request. This is abstracted out since we'd + like to potentially use http basic auth for authorize: + + 1) Accesses to repos (e.g. for serving static content to students) + 2) Event streams + 3) Instructors + + Each of these has their own auth workflow. + + * Return the username if authorized. + * Return `None` if unauthorized. + ''' + if not has_http_auth_headers(request): + return None + + (username, password) = http_basic_extract_username_password(request) + password_data = yaml.safe_load(open(filename)) + + if (data['username'] not in password_data['users'] + or not bcrypt.checkpw( + password, + password_data['users'][username]['password'] + )): + raise aiohttp.web.HTTPUnauthorized(text="Invalid username / password") + return username + + +def http_basic_auth(filename=None, response=lambda: None): + ''' + Takes a password file, or `None` if authorization is handled by nginx. Returns + a function which authorizes the user. 
+ + This function also works as a handler if you pass a response object. E.g. + + `http_basic_auth( + filename=None, + response=lambda: aiohttp.web.json_response({"status": "authorized"}) + )` + + or `response=lambda: aiohttp.web.HTTPFound(location="/")` + + or similar. + ''' + async def password_auth_handler(request): + debug_log("Password Auth Handler") + if filename is not None: + # We should check this codepath before we run it.... + raise aiohttp.web.HTTPNotImplemented(body="Password file http basic unverified.") + username = http_basic_auth_verify_password(request, filename) + else: + (username, password) = http_basic_extract_username_password(request) + + # TODO: We should sanitize the username. + # That's a bit of paranoia, but just in case something goes very wrong elsewhere... + debug_log("Authorizing") + await learning_observer.auth.utils.update_session_user_info( + request, { + 'user_id': "httpauth-" + username, + 'email': "", + 'name': "", + 'family_name': "", + 'picture': "", + 'authorized': True + } + ) + # This is usually ignored, but just in case... + return response() + return password_auth_handler + + +@learning_observer.prestartup.register_startup_check +def http_basic_startup_check(): + if http_auth_page_enabled() and http_auth_middleware_enabled(): + raise learning_observer.prestartup.StartupCheck( + "Your HTTP Basic authentication is misconfigured.\n" + + "\n" + + "You want EITHER auth on every page, OR a login page,\n" + + "not both. Having both setting may be a security risk.\n" + + "Please fix this." + ) + + if ( + 'http_basic' in learning_observer.settings.settings['auth'] + and learning_observer.settings.settings['auth']['http_basic'].get("delegate_nginx_auth", False) + and learning_observer.settings.settings['auth']['http_basic'].get("password_file", False) + ): + raise learning_observer.prestartup.StartupCheck( + "Your HTTP Basic authentication is misconfigured.\n" + + "\n" + + "You should EITHER rely on nginx for password authentication OR Learning Observer," + + "not both." + ) diff --git a/learning_observer/learning_observer/auth/password.py b/learning_observer/learning_observer/auth/password.py new file mode 100644 index 000000000..8432ff9dd --- /dev/null +++ b/learning_observer/learning_observer/auth/password.py @@ -0,0 +1,87 @@ +''' +Password log-in handler +''' +from distutils.log import debug +import bcrypt +import json +import yaml + +import aiohttp.web + +import learning_observer.auth.handlers +import learning_observer.auth.utils + +from learning_observer.log_event import debug_log + + +def password_auth(filename): + ''' + Authentication handler for logging in with username and password + based on a password file. + + By placing this in a closure, we eliminate the dependency on + settings. + + Todo: + * Flag to load file on startup versus on request + * Support storing this in a database + * Log out after time-out from last request + + For convenience, this can be called directly as: + + `curl -X POST \ + -F "username=test_user" \ + -F "password=test_password" \ + http://localhost:8888/23auth/login/password` + ''' + async def password_auth_handler(request): + data = await request.post() # Web form + body = await request.text() # AJAX + if 'username' not in data: + data = json.loads(body) + with open(filename) as f: + password_data = yaml.safe_load(f) + + # If you run into errors on the line below, you *probably* + # have a dependency issue. 
Errors: + # * AttributeError: module 'bcrypt' has no attribute 'checkpw' + # * AttributeError: module 'bcrypt._bcrypt' has no attribute 'ffi' + # Try uninstalling bcrypt and reinstalling / upgrading pi-bcrypt + # + # If you run into unicode errors, see if you can debug them. There + # is randomness about whether we do or don't need to .encode('utf-8'). + # + # It reliably either works or doesn't, but it doesn't change. If your + # environment has a happy `bcrypt`, it will keep on working. + if data['username'] not in password_data['users']: + debug_log("User not found") + return aiohttp.web.json_response({"status": "unauthorized"}) + user_data = password_data['users'][data['username']] + try: + password_check = bcrypt.checkpw( + data['password'].encode('utf-8'), + user_data['password'].encode('utf-8') + ) + except: # noqa: E722 TODO figure out which errors to catch + debug_log("Error verifying password hash") + debug_log("Hint: Try reinstalling / upgrading bcrypt") + debug_log("For some reason, bcrypt tends to sometimes install incorrectly") + debug_log("or get into an inconsistent state with ffi.") + raise + if not password_check: + debug_log("Password check failed") + return aiohttp.web.json_response({"status": "unauthorized"}) + debug_log("Password check authorized") + await learning_observer.auth.utils.update_session_user_info( + request, { + 'user_id': "pwd-" + user_data['username'], # Perhaps data['username']? + 'email': user_data.get('email', ''), + 'name': user_data.get('name', ''), + 'family_name': user_data.get('family_name', ''), + 'picture': user_data.get('picture', '/auth/default-avatar.svg'), + 'authorized': True + } + ) + return aiohttp.web.json_response({"status": "authorized"}) + + return password_auth_handler diff --git a/learning_observer/learning_observer/auth/social_sso.py b/learning_observer/learning_observer/auth/social_sso.py new file mode 100644 index 000000000..55b114824 --- /dev/null +++ b/learning_observer/learning_observer/auth/social_sso.py @@ -0,0 +1,209 @@ +""" +Authentication for Google. + +This was based on +[aiohttp-login](https://github.com/imbolc/aiohttp-login/), which at +the time worked with outdated Google APIs and require Jinja2. Oren +modernized this. Piotr integrated this into the system. + +Portions of this file, from aiohttp-login, are licensed as: + +Copyright (c) 2011 Imbolc. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Eventually, this should be broken out into its own module. +""" + +import yarl + +import aiohttp +import aiohttp.web +import aiohttp_session + +# TODO: We might want to not import this, but pass this info, to make +# this file generic, and not specific to learning_observer. 
+import learning_observer.settings as settings +import learning_observer.auth.handlers as handlers +import learning_observer.auth.utils + +import learning_observer.exceptions + + +DEFAULT_GOOGLE_SCOPES = [ + 'https://www.googleapis.com/auth/userinfo.profile', + 'https://www.googleapis.com/auth/userinfo.email', + 'https://www.googleapis.com/auth/classroom.courses.readonly', + 'https://www.googleapis.com/auth/classroom.rosters.readonly', + 'https://www.googleapis.com/auth/classroom.profile.emails', + 'https://www.googleapis.com/auth/classroom.profile.photos', + 'https://www.googleapis.com/auth/classroom.coursework.students.readonly', + 'https://www.googleapis.com/auth/classroom.courseworkmaterials.readonly', + 'https://www.googleapis.com/auth/classroom.guardianlinks.students.readonly', + 'https://www.googleapis.com/auth/classroom.student-submissions.students.readonly', + 'https://www.googleapis.com/auth/classroom.topics.readonly', + 'https://www.googleapis.com/auth/drive.metadata.readonly', + 'https://www.googleapis.com/auth/drive.readonly', + 'https://www.googleapis.com/auth/documents.readonly', + 'https://www.googleapis.com/auth/classroom.announcements.readonly' +] + + +async def social_handler(request): + """Handles Google sign in. + + Provider is in `request.match_info['provider']` (currently, only Google) + """ + if request.match_info['provider'] != 'google': + raise learning_observer.exceptions.SuspiciousOperation( + "We only handle Google logins. Non-google Provider" + ) + + user = await _google(request) + + if 'user_id' in user: + await learning_observer.auth.utils.update_session_user_info(request, user) + + if user['authorized']: + url = user['back_to'] or "/" + else: + url = "/" + + return aiohttp.web.HTTPFound(url) + + +async def _google(request): + ''' + Handle Google login + ''' + if 'error' in request.query: + return {} + + common_params = { + 'client_id': settings.settings['auth']['google_oauth']['web']['client_id'], + 'redirect_uri': "https://{hostname}/auth/login/google".format( + hostname=settings.settings['hostname'] + ) + } + + # Step 1: redirect to get code + if 'code' not in request.query: + url = 'https://accounts.google.com/o/oauth2/auth' + params = common_params.copy() + # We can override the scopes in the settings file entirely... + scopes = settings.settings['auth']['google_oauth']['web'].get( + 'base_scopes', + DEFAULT_GOOGLE_SCOPES + ) + # Or keep the default scopes and just add a few new ones.... + scopes += settings.settings['auth']['google_oauth'].get( + 'additional_scopes', + [] + ) + params.update({ + 'response_type': 'code', + 'scope': " ".join(scopes), + }) + if 'back_to' in request.query: + params['state'] = request.query['back_to'] + url = yarl.URL(url).with_query(params) + raise aiohttp.web.HTTPFound(url) + + # Step 2: get access token + url = 'https://accounts.google.com/o/oauth2/token' + params = common_params.copy() + params.update({ + 'client_secret': settings.settings['auth']['google_oauth']['web']['client_secret'], + 'code': request.query['code'], + 'grant_type': 'authorization_code', + }) + async with aiohttp.ClientSession(loop=request.app.loop) as client: + async with client.post(url, data=params) as resp: + data = await resp.json() + assert 'access_token' in data, data + + # get user profile + headers = {'Authorization': 'Bearer ' + data['access_token']} + session = await aiohttp_session.get_session(request) + session["auth_headers"] = headers + request["auth_headers"] = headers + + # Old G+ URL that's no longer supported. 
+ url = 'https://www.googleapis.com/oauth2/v1/userinfo' + async with client.get(url, headers=headers) as resp: + profile = await resp.json() + + return { + 'user_id': profile['id'], + 'email': profile['email'], + 'name': profile['given_name'], + 'family_name': profile['family_name'], + 'back_to': request.query.get('state'), + 'picture': profile['picture'], + # TODO: Should this be immediate? + 'authorized': await learning_observer.auth.utils.verify_teacher_account(profile['id'], profile['email']) + } + + +async def show_me_my_auth_headers(request): + """ + Show the auth headers that are set in the session. For convenience, we also + show other headers that were sent to the server and might add other + information. + + This is handy for debugging and development. I'd often like to use the + server registered with Google to log in, but then use this information + in a development environment or a script. + + This is behind a feature flag. On a live server, it should be disabled + as set right now. In the future, we might want to make this a feature + that can be enabled for specific users. This is not a huge security + risk, as the user can only access the information they have access to, + but a user's patterns might look suspicious to Google's (often broken) + algorithms, and we don't want to get flagged. + + There is a setting, `allow_override`, which allows setting auth headers + in a development environment. + """ + flag = settings.feature_flag('auth_headers_page') + + if not flag: + # The route should not have been added... + raise aiohttp.web.HTTPForbidden( + "This feature is disabled. We should never get here. Please debug this." + ) + + # This is so that we can use the headers from the Google-approved server in + # my local development environment. Google has all sorts of validation that + # make it hard to retrieve the headers from the server directly to protect + # users from phishing, so we can't just implement oauth locally. + if request.method == 'POST': + if not (isinstance(flag, dict) or isinstance(flag, list)) or 'allow_override' not in flag: + raise aiohttp.web.HTTPForbidden("Overriding headers is disabled") + if not request.can_read_form: + raise aiohttp.web.HTTPForbidden("Cannot read form") + + auth_headers = request.form.get('auth_headers') + if not auth_headers: + raise aiohttp.web.HTTPBadRequest( + text="Missing auth_headers" + ) + session = await aiohttp_session.get_session(request) + session["auth_headers"] = auth_headers + request["auth_headers"] = auth_headers + session.save() + + return aiohttp.web.json_response({ + "auth_headers": request.get("auth_headers", None), + "headers": dict(request.headers) + }) diff --git a/learning_observer/learning_observer/auth/utils.py b/learning_observer/learning_observer/auth/utils.py new file mode 100644 index 000000000..0416d5bfa --- /dev/null +++ b/learning_observer/learning_observer/auth/utils.py @@ -0,0 +1,169 @@ +''' +Authutils +========= + +We will need to support IDs from multiple systems. These are helper +functions to convert IDs. For example, we would convert a Google ID +like `72635729500910017892163494291` to +`gc-72635729500910017892163494291`. In the process, we also +double-check to make sure these are well-formed (in the above case, by +converting to int and back). + +The whole auth system ought to be reorganized at some point. 
+''' + +import hashlib +import functools + +import bcrypt +import yaml + +import aiohttp.web +import aiohttp_session + +import learning_observer.paths + +from learning_observer.log_event import debug_log + + +def google_id_to_user_id(google_id): + ''' + Convert a Google ID like: + `72635729500910017892163494291` + to: + `gc-72635729500910017892163494291` + ''' + try: + return "gc-" + str(int(google_id)) + except ValueError: + debug_log("Error handling:", google_id) + raise + + +def fernet_key(secret_string): + ''' + Generate key for our cookie storage based on the `session_secret` + in our config file. + ''' + md5_hash = hashlib.md5() + md5_hash.update(secret_string.encode('utf-8')) + return md5_hash.hexdigest().encode('utf-8') + + +async def verify_teacher_account(user_id, email): + ''' + Confirm the teacher is registered with the system. Eventually, we will want + 3 versions of this: + * Always true (open system) + * Text file backed (pilots, small deploys) + * Database-backed (large-scale deploys) + + For now, we have the file-backed version + ''' + teachers = yaml.safe_load(open(learning_observer.paths.data("teachers.yaml"))) + if email not in teachers: + # email is untrusted; repr prevents injection of newlines + debug_log("Email not found in teachers", repr(email)) + return False + if teachers[email]["google_id"] != user_id: + # user_id is untrusted; repr prevents injection of newlines + debug_log("Non-matching Google ID", teachers[email]["google_id"], repr(user_id)) + return False + debug_log("Teacher account verified") + return True + + +async def update_session_user_info(request, user): + """ + This will update the (encrypted) user session with the user's + identity, and whether they are authorized. This is typically used + to log a user into our session. + + :param request: web request. + :param user_id: provider's user ID (e.g., Google ID). + + """ + session = await aiohttp_session.get_session(request) + session["user"] = user + + +async def logout(request): + ''' + Log the user out + ''' + session = await aiohttp_session.get_session(request) + session.pop("user", None) + session.pop("auth_headers", None) + request['user'] = None + + +class InvalidUsername(aiohttp.web.HTTPUnauthorized): + ''' + Raised when we try to verify an invalid username + + We have custom exceptions since: + * We'd like the user to see the same error whether for + invalid username or password + * We'd like to programmatically be able to distinguish the + two + ''' + + +class InvalidPassword(aiohttp.web.HTTPUnauthorized): + ''' + Raised when we try to verify an invalid password + ''' + + +async def verify_password(filename, username, password): + ''' + Check if user is in password file. If so, return associated user + information as a JSON dictionary. If not, raise an exception. + ''' + password_data = yaml.safe_load(open(filename)) + if username not in password_data['users']: + raise InvalidUsername(text="Invalid username or password") + user_data = password_data['users'][username] + if not bcrypt.checkpw( + password, + user_data['password'] + ): + raise InvalidUsername(text="Invalid username or password") + del user_data['password'] + return user_data + +# Account decorators below. +# +# We don't want a complex authentication scheme. In the short term, +# we plan to have teacher, student, and admin accounts. +# +# In the long term, we will probably want a little more, but not full ACLs. + + +def admin(func): + ''' + Decorator to mark a view as an admin view. 
+ + This should be moved to the auth/auth framework, and have more + granular levels (e.g. teachers and sys-admins). We probably don't + want a full ACL scheme (which overcomplicates things), but we will + want to think through auth/auth. + ''' + @functools.wraps(func) + def wrapper(request): + if learning_observer.settings.settings['auth'].get("test_case_insecure", False): + return func(request) + if 'user' in request and \ + request['user'] is not None and \ + 'authorized' in request['user'] and \ + request['user']['authorized']: + return func(request) + # Else, if unauthorized + raise aiohttp.web.HTTPUnauthorized(text="Please log in") + return wrapper + + +# Decorator +# +# For now, we don't have seperate teacher and admin accounts. +teacher = admin diff --git a/learning_observer/learning_observer/client_config.py b/learning_observer/learning_observer/client_config.py new file mode 100644 index 000000000..4b1c0d20e --- /dev/null +++ b/learning_observer/learning_observer/client_config.py @@ -0,0 +1,43 @@ +''' +Client Configuration +==================== + +This module creates a client-side configuration. This might include +things such as: + +- Relative URL paths +- Per-server UX tweaks +- Etc. +''' + +import aiohttp + +import learning_observer.settings +import learning_observer.auth.http_basic + + +async def client_config_handler(request): + ''' + Return a configuration JSON response to the client. This: + - Tells the client this is running from a live server + - Includes any system-specific configuration + + For debugging / devel, it's helpful to be able to mock the API + with static files. Those won't do things like web sockets. In that + case, we can host this client-side configuration on a local + server as a static file, with `mode` set to `static`. + ''' + client_config = { + "mode": "server", + "modules": { # Per-module config + 'wobserver': { + 'hide_labels': False # TODO: Should be loaded from config file + } + }, + "google_oauth": "google_oauth" in learning_observer.settings.settings['auth'], + "password_auth": "password_file" in learning_observer.settings.settings['auth'], + "http_basic_auth": learning_observer.auth.http_basic.http_auth_page_enabled(), + "theme": learning_observer.settings.settings['theme'] + } + + return aiohttp.web.json_response(client_config) diff --git a/learning_observer/learning_observer/creds.yaml.example b/learning_observer/learning_observer/creds.yaml.example new file mode 100644 index 000000000..31a6d13d4 --- /dev/null +++ b/learning_observer/learning_observer/creds.yaml.example @@ -0,0 +1,93 @@ +# We use curly-braces for things which ought to be filled in like passwords and secret keys: +# +# 1) These are friendly to templating languages and Python's str.format() +# 2) These generate dictionaries, and we can throw an exception if unset. If we forget to +# throw an exception, these won't give a valid string +hostname: learning-observer.org +xmpp: + sink: # Receives messages. We'll need many of these. + jid: sink@localhost + password: {xmpp-sink-password} + source: # Sends messages. 
+ jid: source@localhost + password: {xmpp-source-password} + stream: # For debugging + jid: stream@localhost + password: {xmpp-stream-password} +auth: + google_oauth: # Remove if you're not using Google auth + web: + client_id: {google-oauth-client-id} + project_id: {google-oauth-project-id} + auth_uri: https://accounts.google.com/o/oauth2/auth + token_uri: https://oauth2.googleapis.com/token + auth_provider_x509_cert_url: https://www.googleapis.com/oauth2/v1/certs + client_secret: {google-client-secret} + javascript_origins: ["{url}"] + # base_scopes: [] # We can override the scopes we ask Google for here + # additional_scopes: [] # Or just add a few new ones. Not very tested. + password_file: passwd.lo # Remove if you're not using a password file + # Otherwise, create one with lo_passwd.py + http_basic_auth: # HTTP auth. You probably want to remove this. + # Your server WILL be insecure if you configure + # this incorrectly. Either nginx can verify passwords, + # or we can. The latter isn't fully implemented/tested. + # The former requires nginx to be correctly set up. + mode: remove-this # Can be 'nginx,' or 'password-file' + password_file: passwd.lo # Set this to 'null' if authentication is done by nginx +# test_case_insecure: false # For testing -- no log-in required. NEVER enable on a live server. +# demo_insecure: false # Similar to test-case, but stochastically give name, etc. +pubsub: + type: stub # stub for in-memory debugging, redis for small-scale prod. xmpp will bring scale +kvs: + # stub for in-memory debugging + # redis_ephemeral for redis / debugging (object disappear) + # redis for deploy + type: stub + # If using redis_ephemeral, persist objects for 60s = 1 minute + # I typically use: + # * 1-10s for test cases + # * 1-5 minutes for interactive debugging + # * 6-24 hours for development + expiry: 60 +roster_data: + source: filesystem # Can be set to google-api, all, test, or filesystem +aio: # User session; used for log-ins. + session_secret: {unique-aio-session-key} # This should be a unique secret key for YOUR deployment + session_max_age: 3600 # In seconds. This may be short for auth dev (e.g. <5 mins), intermediate for deploy (a few hours?), and long for e.g. testing other parts of the system (or set to null, for lifetime of the browser) +config: + run_mode: dev # "dev" versus "deploy". E.g. Do we crash on errors? Or log them and keep going? + debug: [] # add "tracemalloc" to the list to enable memory leak debugging +logging: + # Note that EXTENDED logging includes a bit of the stack trace. This + # is super-valuable for debugging, but also super-slow. As of this writing, + # with EXTENDED logging, on a fast computer, you'll probably cap out + # at e.g. maybe 3 students using the Writing Observer. All of this time + # will be spent inside of inspect (which is slow enough I'm almost inclined + # to consider this a bug in inspect, so perhaps in newer Python versions, + # this will be faster). + debug_log_level: SIMPLE # NONE, SIMPLE, or EXTENDED + debug_log_destinations: # List of where they go. CONSOLE or FILE + - CONSOLE + - FILE +theme: + server_name: Learning Observer + front_page_pitch: Learning Observer is an experimental dashboard. If you'd like to be part of the experiment, please contact us. If you're already part of the experiment, log in! + logo_big: /static/media/logo-clean.jpg +event_auth: + local_storage: + userfile: students.yaml + allow_guest: true + # chromebook: # May be necessary to support execution on chromebooks. 
+ # allow_guest: true # uncomment for testing or deployment as needed. + # testcase_auth: {} +feature_flags: {} # For enabling / disabling features in development. Useful + # ones include: + # * google_routes (for debugging / developing Google APIs) + # * save_google_ajax (for saving Google API calls -- NOT FOR PROD) + # * use_google_ajax (for using saved Google API calls -- AGAIN, NOT FOR PROD) +server: + port: 8888 # Optional. Pick a different port. +modules: + writing_observer: + use_nlp: true diff --git a/learning_observer/learning_observer/dash_integration.py b/learning_observer/learning_observer/dash_integration.py new file mode 100644 index 000000000..02ff404f8 --- /dev/null +++ b/learning_observer/learning_observer/dash_integration.py @@ -0,0 +1,217 @@ +''' +We plan to use `dash` for most of our dashboards. We are prototyping +integration here. + +Right now, we use a common Dash app for all modules. It's likely that we'll want +a Dash app for each module. Dash doesn't give a great way to tease apart things +like static assets, JavaScript files, css, etc. on a per-layout basis. + +On the other hand, there's a lot of stuff like `dash.register_page` or global +`client_side_callback`, which seem to presume only one dash app. +''' + +import os.path +import shutil + +import dash +from dash import Dash, html, clientside_callback, Output, Input + +from dash_extensions import WebSocket +import dash_bootstrap_components as dbc +from learning_observer_components import LOConnection + +import learning_observer.prestartup +import learning_observer.paths + + +app = None + + +def get_app(): + return app + + +# Should we have a namespace for dash than module, or vice-versa? +# +# dash/ makes designing learning observer easier and URL routing, since +# we just add a route for /dash/ +# +# {module}/ makes designing apps easier, since they can use relative +# paths. +PATH_TEMPLATE = "/{module}/dash/{subpath}/" + + +def local_register_page( + module, + layout, + path, + title, + description +): + dash.register_page( + module, + layout=layout, + title=title, + description=description, + path=path + ) + + +def thirdparty_url(filename): + return "/static/3rd_party/{filename}".format(filename=filename) + + +def static_url(filename): + return f"/static/{filename}" + + +test_layout = html.Div(children=[ + html.H1(children='Test Case for Dash'), + LOConnection( + id='ws', + data_scope={ + "module": "writing_observer", + "course": 12345 + }, + ), + html.Div(id='output') +]) + +clientside_callback( + """function(msg) { + if(!msg) { + return "No Data" + } + return msg.data; + } """, + Output('output', 'children'), + Input('ws', 'message') +) + + +def all_dash_resources(resource_type): + """ + First, we want to compile together CSS/Scripts sheets from modules. + HACK: These are compiled for all dash pages together, and can + fight. We will want per-module resources later. This is good enough + as scaffolding, though. + + `resource_type` should be: 'CSS' or 'SCRIPTS' + """ + modules = learning_observer.module_loader.dash_pages() + resources = [] + for module in modules: + # Pull the CSS out of the modules + resource = sum([m.get(resource_type, []) for m in modules[module]], []) + resources.extend(resource) + return resources + + +def compile_dash_assets(): + ''' + We want to dump all dash assets into a common directory. Eventually, + this should be on a per-layout or per-module basis, but again, this + is good enough for scaffolding. 
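+
+    Roughly, asset files are copied like this (the module path below is
+    illustrative only):
+
+        <module>/<ASSETS dir>/*  ->  learning_observer.paths.dash_assets()/*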
+ ''' + modules = learning_observer.module_loader.dash_pages() + + def asset_paths(): + ''' + Return paths to all the asset directories + ''' + for m in modules: + module = modules[m] + for layout in module: + if 'ASSETS' in layout: + asset_path = os.path.join(layout['_BASE_PATH'], layout['ASSETS']) + yield asset_path + + def copy_files(source, destination): + ''' + Copy all the files, non-recursively, from the source path to the + destination path. Raise an exception if this would cause a file + overwrite. + ''' + for filename in os.listdir(source): + source_path = os.path.join(source, filename) + destination_path = os.path.join(destination, filename) + print("Copying", destination_path) + if os.path.exists(destination_path): + raise Exception(f'File {destination_path} already exists compiling Dash assets') + shutil.copy(source_path, destination_path) + + def delete_files(directory, for_real=False): + ''' + Delete all the files in a directory. + + To avoid accidental bugs, we have an option to do a dry run, + enabled by default. We disable once we're confident this is + doing the right thing, but we'd like to do a dry run by default. + + We don't recurse for now. + ''' + for file in os.listdir(directory): + file_path = os.path.join(directory, file) + if for_real: + print("Removing", file_path) + os.unlink(file_path) + else: + print("Would unlink: {path}".format(path=file_path)) + + destination = learning_observer.paths.dash_assets() + delete_files(destination, for_real=True) + for source_path in asset_paths(): + copy_files(source_path, destination) + + # Delete all the previous files. + # + # In the future, we should only do this if necessary. + # + # We don't recurse for now, to avoid dangerous bugs, etc. but this might be + # a logical thing to consider in the future. + print(list(asset_paths())) + + return destination + + +@learning_observer.prestartup.register_startup_check +def load_dash_pages(): + global app + import learning_observer.module_loader + modules = learning_observer.module_loader.dash_pages() + + app = Dash( + __name__, + use_pages=True, + pages_folder="", + external_stylesheets=all_dash_resources('CSS'), + external_scripts=all_dash_resources('SCRIPTS'), + assets_folder=compile_dash_assets(), + assets_url_path='dash/assets' + ) + + dash.register_page( + __name__, + path="/dash/test", + name="Test Page", + layout=test_layout + ) + + for module_id in modules: + for page in modules[module_id]: + print(module_id) + path = PATH_TEMPLATE.format( + module=module_id, + subpath=page['SUBPATH'] + ) + + # TODO: Make an API to do this cleanly. + page['path'] = path # <== Bad form. We're breaking abstractions + + local_register_page( + module=page['MODULE'].__name__, + layout=page['LAYOUT'], + title=page['TITLE'], + description=page['DESCRIPTION'], + path=path + ) diff --git a/learning_observer/learning_observer/dash_wrapper.py b/learning_observer/learning_observer/dash_wrapper.py new file mode 100644 index 000000000..52a2e91e0 --- /dev/null +++ b/learning_observer/learning_observer/dash_wrapper.py @@ -0,0 +1,40 @@ +''' +This file is a bit of a hack. + +Here's the deal: We'd like to be able to import from dash apps: + +- We'd like those apps to keep working on dash on their end stand-alone. That's + nice for dev +- We'd like to be able to import those apps, without messing up our dash + +We do this by replacing: + + from dash import ... + +with: + + from learning_observer.dash_wrapper import ... + +Then, if we're called in a stand-alone system, we just proxy on through to +dash. 
If we're running within the Learning Observer, we replace calls like +`register_page` either with no-ops, or with our own versions. + +We check if we're inside LO stupidly. We should figure this out better. +''' + +import sys + +from dash import * + +# So we're going to be a bit stupid. If we **just** import this file, +# the only learning_observer modules will be: +# ['learning_observer', 'learning_observer.dash_wrapper'] +# +# If we're running in the main system, it will import learning_observer.main +# +# This way, we can tell which version of register_page to use +lo_modules = [m for m in sys.modules if "learning" in m] + +if 'learning_observer.main' in lo_modules: + def register_page(*args, **kwargs): + pass diff --git a/learning_observer/learning_observer/dashboard.py b/learning_observer/learning_observer/dashboard.py new file mode 100644 index 000000000..f130b238c --- /dev/null +++ b/learning_observer/learning_observer/dashboard.py @@ -0,0 +1,381 @@ +''' +This generates dashboards from student data. +''' + +import asyncio +import inspect +import json +import numbers +import queue +import time + +import aiohttp + +import learning_observer.util as util + +import learning_observer.synthetic_student_data as synthetic_student_data + +import learning_observer.stream_analytics.helpers as sa_helpers +import learning_observer.kvs as kvs + +import learning_observer.paths as paths + +import learning_observer.auth +import learning_observer.rosters as rosters + +from learning_observer.log_event import debug_log + + +def timelist_to_seconds(timelist): + ''' + [5, "seconds"] ==> 5 + [5, "minutes"] ==> 300 + etc. + ''' + if timelist is None: + return None + if len(timelist) != 2: + raise Exception("Time lists should have number and units") + if not isinstance(timelist[0], numbers.Number): + raise Exception("First element should be a number") + if not isinstance(timelist[1], str): + raise Exception("Second element should be a string") + units = { + "seconds": 1, + "minutes": 60, + "hours": 3600 + } + if timelist[1] not in units: + raise Exception("Second element should be a time unit") + return timelist[0] * units[timelist[1]] + + +@learning_observer.auth.teacher +async def generic_dashboard(request): + ''' + We would like to be able to support pretty arbitrary dashboards, + where the client asks for a subset of data and we send it. + + This is probably the wrong abstraction, but our goal is to allows + arbitrary dashboards client-side. + + We're figuring out what we're doing. This view is behind a feature + flag, since we have no clear idea. + + Our goal is to be able to set up appropriate queries to deliver + pretty generic aggregations. + + The current model has the client ask for specific data, and for us + to send it back. However, the concept of doing this more server-side + makes a lot of sense too. + + GraphQL looks super-relevant. Implementing it is a big lift, and + it might need to be slightly adapted to the context. + + The test case for this is in `util/generic_websocket_dashboard.py` + ''' + # We never send data more than twice per second, because performance. + MIN_REFRESH = 0.5 + + teacherkvs = kvs.KVS() + ws = aiohttp.web.WebSocketResponse() + await ws.prepare(request) + subscriptions = queue.PriorityQueue() + + def timeout(): + ''' + Calculate the time until we need to send the next message. 
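+
+        Returns None when there are no subscriptions; otherwise, the number
+        of seconds (possibly negative, if we are already overdue) until the
+        earliest subscription should next be sent.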
+ ''' + if subscriptions.empty(): + return None + else: + Δt = subscriptions.queue[0][0] - time.time() + return Δt + + count = [0] + + def counter(): + count[0] += 1 + return count[0] + + running = False # Are we streaming data? + next_subscription = None # What is the next item to send? + + while True: + # Wait for the next message, with an upper bound of when we + # need to get back to the client. + try: + if subscriptions.empty() or not running: + msg = await ws.receive() + else: + msg = await ws.receive(timeout=timeout()) + debug_log("msg", msg) + if msg.type == aiohttp.WSMsgType.CLOSE: + debug_log("Socket closed!") + # By this point, the client is long gone, but we want to + # return something to avoid confusing middlewares. + return aiohttp.web.Response(text="This never makes it back....") + elif msg.type == aiohttp.WSMsgType.TEXT: + message = json.loads(msg.data) + debug_log(message) + if message['action'] == 'subscribe': + subscriptions.put([ + time.time(), + { + 'keys': message['keys'], + 'ids': [sa_helpers.make_key_from_json(key) for key in message['keys']], + 'refresh': timelist_to_seconds(message['refresh']), + 'subscription_id': message.get('subscription_id', counter()) + } + ]) + elif message['action'] == 'start': + await ws.send_json( + {'subscribed': [i[1] for i in subscriptions.queue]} + ) + running = True + elif message['action'] == 'hangup': + break + # If we didn't get a message before we need to send one, + # just keep going. + except asyncio.exceptions.TimeoutError: + pass + if ws.closed: + debug_log("Socket closed") + return aiohttp.web.Response(text="This never makes it back to the client....") + # Now, we send any messages we need to + while timeout() is not None and timeout() < 0: + response = {} + t, s = subscriptions.get() + for key, json_key in zip(s['ids'], s['keys']): + response[key] = await teacherkvs[key] + if isinstance(response[key], dict): + response[key]['key'] = json_key + response['subscription_id'] = s['subscription_id'] + if 'refresh' in s and s['refresh'] is not None: + subscriptions.put([time.time() + max(s['refresh'], MIN_REFRESH), s]) + await ws.send_json(response) + + return aiohttp.web.Response(text="This should never happen....") + + +def fetch_student_state( + course_id, module_id, + agg_module, roster, + default_data={} +): + ''' + This closure will compile student data from a roster of students. + + Closure remembers course roster, and redis KVS. + + Reopening connections to redis every few seconds otherwise would + run us out of file pointers. + ''' + teacherkvs = kvs.KVS() + + async def student_state_fetcher(): + ''' + Poll redis for student state. This should be abstracted out into a + generic aggregator API, much like we have a reducer on the + incoming end. + ''' + students = [] + for student in roster: + student_state = { + # We're copying Google's roster format here. + # + # It's imperfect, and we may want to change it later, but it seems + # better than reinventing our own standard. + # + # We don't copy verbatim, since we do want to filter down any + # extra stuff. + 'profile': { + 'name': { + 'full_name': student['profile']['name']['full_name'] + }, + 'photo_url': student['profile'].get('photo_url', ''), + 'email_address': student['profile'].get('email_address', ''), + 'external_ids': student['profile'].get('external_ids', []), + }, + "course_id": course_id, + "user_id": student['user_id'], # TODO: Encode? + } + + student_state.update(default_data) + + # TODO/HACK: Only do this for Google data. 
Make this do the right thing + # for synthetic data. + google_id = student['user_id'] + if google_id.isnumeric(): + student_id = learning_observer.auth.google_id_to_user_id(google_id) + else: + student_id = google_id + # TODO: Evaluate whether this is a bottleneck. + # + # mget is faster than ~50 gets. But some online benchmarks show both taking + # microseconds, to where it might not matter. + # + # For most services (e.g. a SQL database), this would be a huge bottleneck. redis might + # be fast enough that it doesn't matter? Dunno. + for sa_module in agg_module['sources']: + key = sa_helpers.make_key( + sa_module, + {sa_helpers.KeyField.STUDENT: student_id}, + sa_helpers.KeyStateType.EXTERNAL) + debug_log(key) + data = await teacherkvs[key] + # debug_log(data) <-- Useful, but a lot of stuff is spit out. + if data is not None: + student_state[sa_helpers.fully_qualified_function_name(sa_module)] = data + cleaner = agg_module.get("cleaner", lambda x: x) + students.append(cleaner(student_state)) + + return students + return student_state_fetcher + + +def find_course_aggregator(module_id): + ''' + Find a course aggregator based on a `module_id` + + * This should move to the modules package. + * We should support having a list of these + ''' + course_aggregator_module = None + default_data = {} + course_aggregator_candidates = learning_observer.module_loader.course_aggregators() + for candidate_module in course_aggregator_candidates: + if course_aggregator_candidates[candidate_module]['short_id'] == module_id: + # TODO: We should support multiple modules here. + if course_aggregator_module is not None: + raise aiohttp.web.HTTPNotImplemented(text="Duplicate module: " + candidate_module) + course_aggregator_module = course_aggregator_candidates[candidate_module] + default_data = course_aggregator_module.get('default-data', {}) + return (course_aggregator_module, default_data) + + +@learning_observer.auth.teacher +async def websocket_dashboard_view(request): + ''' + Handler to aggregate student data, and serve it back to the client + every half-second to second or so. + ''' + # Extract parameters from the URL + # + # Note that we need to do auth/auth. At present, we always want a + # course ID, even for a single student. If a teacher requests a + # students' data, we want to make sure that sutdnet is in that + # teacher's course. + course_id = request.rel_url.query.get("course") + # module_id should support a list, perhaps? + module_id = request.rel_url.query.get("module") + # For student dashboards + student_id = request.rel_url.query.get("student", None) + # For individual resources + resource_id = request.rel_url.query.get("resource", None) + # How often do we refresh? Default is 0.5 seconds + refresh = 0.5 # request.match_info.get('refresh', 0.5) + + # Find the right module + course_aggregator_module, default_data = find_course_aggregator(module_id) + + if course_aggregator_module is None: + debug_log("Bad module: ", module_id) + available = learning_observer.module_loader.course_aggregators() + debug_log("Available modules: ", [available[key]['short_id'] for key in available]) + raise aiohttp.web.HTTPBadRequest(text="Invalid module: {}".format(module_id)) + + # We need to receive to detect web socket closures. + ws = aiohttp.web.WebSocketResponse(receive_timeout=0.1) + await ws.prepare(request) + + roster = await rosters.courseroster(request, course_id) + + # If we're grabbing data for just one student, we filter the + # roster down. 
This pathway ensures we only serve data for + # students on a class roster. I'm not sure this API is + # right. Should we have a different URL? A set of filters? A lot + # of that is TBD. Once nice property about this is that we have + # the same data format for 1 student as for a classroom of + # students. + if student_id is not None: + roster = [r for r in roster if r['user_id'] == student_id] + # Grab student list, and deliver to the client + student_state_fetcher = fetch_student_state( + course_id, + module_id, + course_aggregator_module, + roster, + default_data + ) + aggregator = course_aggregator_module.get('aggregator', lambda x: {}) + async_aggregator = inspect.iscoroutinefunction(aggregator) + args_aggregrator = inspect.getfullargspec(aggregator)[0] + client_data = None + + while True: + sd = await student_state_fetcher() + data = { + "student_data": sd # Per-student list + } + # Prep the aggregator function to be called. + # Determine if we should pass the client_data in or not/async capability + # Currently options is a list of strings (what we want returned) + # In the futuer this should be some form of communication protocol + if 'options' in args_aggregrator: + agg = aggregator(sd, client_data) + else: + agg = aggregator(sd) + if async_aggregator: + agg = await agg + data.update(agg) + await ws.send_json(data) + # First try to receive a json, if you receive something that can't be json'd + # check for closing, otherwise timeout will fire + # This is kind of an awkward block, but aiohttp doesn't detect + # when sockets close unless they receive data. We try to receive, + # and wait for an exception or a CLOSE message. + try: + client_data = await ws.receive_json() + except (TypeError, ValueError): + if (await ws.receive()).type == aiohttp.WSMsgType.CLOSE: + debug_log("Socket closed!") + # By this point, the client is long gone, but we want to + # return something to avoid confusing middlewares. + return aiohttp.web.Response(text="This never makes it back....") + except asyncio.exceptions.TimeoutError: + # This is the normal code path + pass + await asyncio.sleep(0.5) + # This never gets called, since we return above + if ws.closed: + debug_log("Socket closed. This should never appear, however.") + return aiohttp.web.Response(text="This never makes it back....") + + +# Obsolete code -- we should put this back in after our refactor. 
Allows us to use +# dummy data +# @learning_observer.auth.teacher +# async def static_student_data_handler(request): +# ''' +# Populate static / mock-up dashboard with static fake data +# ''' +# # module_id = request.match_info['module_id'] +# # course_id = int(request.match_info['course_id']) + +# return aiohttp.web.json_response({ +# "new_student_data": json.load(open(paths.static("student_data.js"))) +# }) + + +# @learning_observer.auth.teacher +# async def generated_student_data_handler(request): +# ''' +# Populate static / mock-up dashboard with static fake data dynamically +# ''' +# # module_id = request.match_info['module_id'] +# # course_id = int(request.match_info['course_id']) + +# return aiohttp.web.json_response({ +# "new_student_data": synthetic_student_data.synthetic_data() +# }) diff --git a/learning_observer/learning_observer/exceptions.py b/learning_observer/learning_observer/exceptions.py new file mode 100644 index 000000000..285ca00b7 --- /dev/null +++ b/learning_observer/learning_observer/exceptions.py @@ -0,0 +1,25 @@ +''' +These aren't used much yet, but these are the sorts of exceptions +our system raises, beyond the basic Python ones. +''' + + +class DeployException(Exception): + ''' + E.g. Bad config file. Missing paths. Etc. + + Errors at startup should **not** raise this exception, but print a + meaningful error message. This exception should only be raised at + run-time. + ''' + + +class SuspiciousOperation(Exception): + ''' + E.g.: + * A user types in a URL by hand with "/../" + * Someone hand-crafts an invalid AJAX request + * etc. + + These happen when a platform is running, but they are suspicious. + ''' diff --git a/learning_observer/learning_observer/filesystem_state.py b/learning_observer/learning_observer/filesystem_state.py new file mode 100644 index 000000000..6b98a425c --- /dev/null +++ b/learning_observer/learning_observer/filesystem_state.py @@ -0,0 +1,92 @@ +''' +This allows us to capture what state we start the server in, for +replicability. We'd like to work from SHA hashes eventually. + +We'll probably want something like: + +FILESTRING = """{filename}: +\thash:{hash} +\tst_mode:{st_mode} +\tst_size:{st_size} +\tst_atime:{st_atime} +\tst_mtime:{st_mtime} +\tst_ctime:{st_ctime} +""" +''' + +import datetime +import hashlib +import os +import platform +import socket +import subprocess + +import learning_observer.paths as paths + + +extensions = [ + ".py", + ".js", + ".html", + ".md" +] + + +def filesystem_state(): + ''' + Make a snapshot of the file system. Return a json object. Best + usage is to combine with `yaml.dump`, or `json.dump` with a + specific indent. This is helpful for knowing which version was running. + + Snapshot contains list of Python, HTML, JSON, and Markdown files, + together with their SHA hashes and modified times. It also + contains a `git` hash of the current commit. + + This ought to be enough to confirm which version of the tool is + running, and if we are running from a `git` commit (as we ought to + in production) or if changes were made since git commited. + ''' + file_info = {} + # We need have dirs, even if we don't use it. + # pylint: disable=W0612 + for root, dirs, files in os.walk(paths.base_path()): + for name in files: + for extension in extensions: + # Check if the file has an appropriate extension, and + # is not a temporary file or backup. 
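+                # e.g. "module.py" is kept, while "#module.py#", "module.py~",
+                # and ".module.py.swp" are skipped.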
+ if name.endswith(extension) and \ + "#" not in name and \ + "~" not in name and \ + not name.startswith("."): + filename = os.path.join(root, name) + stat = os.stat(filename) + file_info[filename] = { + "hash": hashlib.sha3_512(open(filename, "rb").read()).hexdigest(), + "st_mode": stat.st_mode, + "st_size": stat.st_size, + "st_atime": stat.st_atime, + "st_mtime": stat.st_mtime, + "st_ctime": stat.st_ctime + } + try: + file_info['::git-head::'] = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip() + except subprocess.CalledProcessError: + print("Learning Observer Startup Warning: Not in a git repo") + print("We will not log the system state.") + file_info['::git-head::'] = "Not a git repo" + file_info['::pid::'] = os.getpid() + file_info['::hostname::'] = socket.gethostname() + file_info['::platform::'] = platform.version() + file_info['::python::'] = platform.python_version() + file_info['::timestamp::'] = datetime.datetime.utcnow().isoformat() + return file_info + + +if __name__ == '__main__': + # We normally do JSON, but we'll do YAML here, just to test in a different context + # + # By convention: + # * We always output JSON in logs/snapshots (which are read by machines) + # * We always input YAML in configuration files (which are written by humans) + import yaml + print(yaml.dump(filesystem_state())) diff --git a/learning_observer/learning_observer/google.py b/learning_observer/learning_observer/google.py new file mode 100644 index 000000000..dfaa26ba4 --- /dev/null +++ b/learning_observer/learning_observer/google.py @@ -0,0 +1,418 @@ +''' +We will gradually move all of the Google-specific code into here. + +Our design goals: +- Easily call into Google APIs (Classroom, Drive, Docs, etc.) +- Be able to preprocess the data into standard formats + +On a high level, for each Google request, we plan to have a 4x4 grid: +- Web request and function call +- Cleaned versus raw data + +The Google APIs are well-designed (if poorly-documented, and with occasional +bugs), but usually return more data than we need, so we have cleaner functions. + +For a given call, we might have several cleaners. For example, for a Google Doc, +Google returns a massive JSON object containing everything. For most purposes, +we don't need all of that, and it's more convenient to work with a plain +text representation, and for downstream code to not need to understand this +JSON. However, for some algorithms, we might need additonal data of different +sorts. It's still more convenient to hand this back in something simplified for +analysis. +''' + +import collections +import itertools +import json +import recordclass +import string +import re + +import aiohttp +import aiohttp.web + +import learning_observer.settings as settings +import learning_observer.log_event +import learning_observer.util +import learning_observer.auth + + +cache = None + + +GOOGLE_FIELDS = [ + 'alternateLink', 'calculationType', 'calendarId', 'courseGroupEmail', + 'courseId', 'courseState', 'creationTime', 'descriptionHeading', + 'displaySetting', 'emailAddress', 'enrollmentCode', 'familyName', + 'fullName', 'givenName', 'gradebookSettings', 'guardiansEnabled', + 'ownerId', 'photoUrl', 'teacherFolder', 'teacherGroupEmail', 'updateTime', + 'userId' +] + +# On in-take, we want to convert Google's CamelCase to LO's snake_case. This +# dictionary contains the conversions. +camel_to_snake = re.compile(r'(?>> ("hello {hi} my {bye}")] + ['hi', 'bye'] + ''' + # The parse returns a lot of context, which we discard. 
In particular, the + # last item is often about the suffix after the last parameter and may be + # `None` + return [f[1] for f in string.Formatter().parse(format_string) if f[1] is not None] + + +async def raw_google_ajax(request, target_url, **kwargs): + ''' + Make an AJAX call to Google, managing auth + auth. + + * request is the aiohttp request object. + * default_url is typically grabbed from ENDPOINTS + * ... and we pass the named parameters + ''' + url = target_url.format(**kwargs) + cache_key = "raw_google/" + learning_observer.util.url_pathname(url) + if settings.feature_flag('use_google_ajax') is not None: + value = await cache[cache_key] + if value is not None: + return learning_observer.util.translate_json_keys( + json.loads(value), + GOOGLE_TO_SNAKE + ) + async with aiohttp.ClientSession(loop=request.app.loop) as client: + if 'auth_headers' not in request: + raise aiohttp.web.HTTPUnauthorized(text="Please log in") # TODO: Consistent way to flag this + async with client.get(url, headers=request["auth_headers"]) as resp: + response = await resp.json() + learning_observer.log_event.log_ajax(target_url, response, request) + if settings.feature_flag('use_google_ajax') is not None: + await cache.set(cache_key, json.dumps(response, indent=2)) + return learning_observer.util.translate_json_keys( + response, + GOOGLE_TO_SNAKE + ) + + +def raw_access_partial(remote_url, name=None): + ''' + This is a helper which allows us to create a function which calls specific + Google APIs. + + To test this, try: + + print(await raw_document(request, documentId="some_google_doc_id")) + ''' + async def caller(request, **kwargs): + ''' + Make an AJAX request to Google + ''' + return await raw_google_ajax(request, remote_url, **kwargs) + setattr(caller, "__qualname__", name) + + return caller + + +def initialize_and_register_routes(app): + ''' + This is a big 'ol function which might be broken into smaller ones at some + point. We: + + - Created debug routes to pass through AJAX requests to Google + - Created production APIs to have access to cleaned versions of said data + - Create local function calls to call from other pieces of code + within process + + We probably don't need all of this in production, but a lot of this is + very important for debugging. Having APIs is more useful than it looks, since + making use of Google APIs requires a lot of infrastructure (registering + apps, auth/auth, etc.) which we already have in place on dev / debug servers. + ''' + # # For now, all of this is behind one big feature flag. In the future, + # # we'll want seperate ones for the debugging tools and the production + # # staff + # if 'google_routes' not in settings.settings['feature_flags']: + # return + + for key in ['save_google_ajax', 'use_google_ajax', 'save_clean_ajax', 'use_clean_ajax']: + if key in settings.settings['feature_flags']: + global cache + cache = learning_observer.kvs.FilesystemKVS(path=learning_observer.paths.data('google'), subdirs=True) + + # Provide documentation on what we're doing + app.add_routes([ + aiohttp.web.get("/google", api_docs_handler) + ]) + + def make_ajax_raw_handler(remote_url): + ''' + This creates a handler to forward Google requests to the client. It's used + for debugging right now. We should think through APIs before relying on this. + ''' + async def ajax_passthrough(request): + ''' + And the actual handler.... 
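+            Forwards the URL match parameters to `raw_google_ajax` and
+            returns Google's (snake_case-keyed) JSON response to the caller.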
+ ''' + response = await raw_google_ajax( + request, + remote_url, + **request.match_info + ) + + return aiohttp.web.json_response(response) + return ajax_passthrough + + def make_cleaner_handler(raw_function, cleaner_function, name=None): + async def cleaner_handler(request): + ''' + ''' + response = cleaner_function( + await raw_function(request, **request.match_info) + ) + if isinstance(response, dict) or isinstance(response, list): + return aiohttp.web.json_response( + response + ) + elif isinstance(response, str): + return aiohttp.web.Response( + text=response + ) + else: + raise AttributeError(f"Invalid response type: {type(response)}") + if name is not None: + setattr(cleaner_handler, "__qualname__", name + "_handler") + + return cleaner_handler + + def make_cleaner_function(raw_function, cleaner_function, name=None): + async def cleaner_local(request, **kwargs): + google_response = await raw_function(request, **kwargs) + clean = cleaner_function(google_response) + return clean + if name is not None: + setattr(cleaner_local, "__qualname__", name) + return cleaner_local + + for e in ENDPOINTS: + function_name = f"raw_{e.name}" + raw_function = raw_access_partial(remote_url=e.remote_url, name=e.name) + globals()[function_name] = raw_function + cleaners = e._cleaners() + for c in cleaners: + app.add_routes([ + aiohttp.web.get( + cleaners[c]['local_url'], + make_cleaner_handler( + raw_function, + cleaners[c]['function'], + name=cleaners[c]['name'] + ) + ) + ]) + globals()[cleaners[c]['name']] = make_cleaner_function( + raw_function, + cleaners[c]['function'], + name=cleaners[c]['name'] + ) + app.add_routes([ + aiohttp.web.get( + e._local_url(), + make_ajax_raw_handler(e.remote_url) + ) + ]) + + +def api_docs_handler(request): + ''' + Return a list of available endpoints. + + Eventually, we should also document available function calls + ''' + response = "URL Endpoints:\n\n" + for endpoint in ENDPOINTS: + response += f"{endpoint._local_url()}\n" + cleaners = endpoint._cleaners() + for c in cleaners: + response += f" {cleaners[c]['local_url']}\n" + response += "\n\n Globals:" + if False: + response += str(globals()) + return aiohttp.web.Response(text=response) + + +def register_cleaner(data_source, cleaner_name): + ''' + This will register a cleaner function, for export both as a web service + and as a local function call. + ''' + def decorator(f): + found = False + for endpoint in ENDPOINTS: + if endpoint.name == data_source: + found = True + endpoint._add_cleaner( + cleaner_name, + { + 'function': f, + 'local_url': f'{endpoint._local_url()}/{cleaner_name}', + 'name': cleaner_name + } + ) + + if not found: + raise AttributeError(f"Data source {data_source} invalid; not found in endpoints.") + return f + + return decorator + + +# Rosters +@register_cleaner("course_roster", "roster") +def clean_course_roster(google_json): + ''' + Retrieve the roster for a course, alphabetically + ''' + students = google_json.get('students', []) + students.sort( + key=lambda x: x.get('name', {}).get('fullName', 'ZZ'), + ) + # Convert Google IDs to internal ideas (which are the same, but with a gc- prefix) + for student_json in students: + google_id = student_json['profile']['id'] + local_id = learning_observer.auth.google_id_to_user_id(google_id) + student_json['user_id'] = local_id + del student_json['profile']['id'] + + # For the present there is only one external id so we will add that directly. 
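+        # e.g. student_json['profile']['external_ids'] ends up as
+        # [{"source": "google", "id": "<the original Google ID>"}]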
+ if 'external_ids' not in student_json['profile']: + student_json['profile']['external_ids'] = [] + student_json['profile']['external_ids'].append({"source": "google", "id": google_id}) + return students + + +@register_cleaner("course_list", "courses") +def clean_course_list(google_json): + ''' + Google's course list is one object deeper than we'd like, and alphabetic + sort order is nicer. This will clean it up a bit + ''' + courses = google_json.get('courses', []) + courses.sort( + key=lambda x: x.get('name', 'ZZ'), + ) + return courses + + +# Google Docs +def _force_text_length(text, length): + ''' + Force text to a given length, either concatenating or padding + + >>> force_text_length("Hello", 3) + >>> 'Hel' + + >>> force_text_length("Hello", 13) + >>> 'Hello ' + ''' + return text[:length] + " " * (length - len(text)) + + +@register_cleaner("document", "doctext") +def extract_text_from_google_doc_json( + j, align=True, + EXTRACT_DEBUG_CHECKS=False): + ''' + Extract text from a Google Docs JSON object, ignoring formatting. + + There is an alignment issue between Google's and Python's handling + of Unicode. We can either: + * extract text faithfully (align=False) + * extract text with aligned indexes by cutting text / adding + spaces (align=True) + + This issue came up in text with a Russian flag unicode symbol + (referencing the current conflict). I tried various encodings, + and none quite matched Google 100%. + + Note that align=True doesn't necessarily give perfect local alignment + within text chunks, since we do have different lengths for something like + this flag. It does work okay globally. + ''' + length = j['body']['content'][-1]['endIndex'] + elements = [a.get('paragraph', {}).get('elements', []) for a in j['body']['content']] + flat = sum(elements, []) + text_chunks = [f['textRun']['content'] for f in flat] + if align: + lengths = [f['endIndex'] - f['startIndex'] for f in flat] + text_chunks = [_force_text_length(chunk, length) for chunk, length in zip(text_chunks, lengths)] + text = ''.join(text_chunks) + + if EXTRACT_DEBUG_CHECKS: + print("Text length versus Google length:") + print(len(text), length) + print("We expect these to be off by one, since Google seems to starts at 1 (and Python at 0)") + if align: + print + print("Offsets (these should match):") + print(list(zip(itertools.accumulate(map(len, text_chunks)), itertools.accumulate(lengths)))) + + return text + + +if __name__ == '__main__': + import json + import sys + j = json.load(open(sys.argv[1])) + extract_text_from_google_doc_json(j, align=False, EXTRACT_DEBUG_CHECKS=True) + extract_text_from_google_doc_json(j, align=True, EXTRACT_DEBUG_CHECKS=True) diff --git a/learning_observer/learning_observer/graphics_helpers.py b/learning_observer/learning_observer/graphics_helpers.py new file mode 100644 index 000000000..b130ce21b --- /dev/null +++ b/learning_observer/learning_observer/graphics_helpers.py @@ -0,0 +1,111 @@ +''' +Helpers to make things look pretty. +''' + +import colorsys +import hashlib +import svgwrite + + +φ = 1.61803 + + +class ColorWheel: + ''' + Returns colors circling around value, keeping hue and saturation fixed. 
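+
+    A hypothetical usage sketch (the string key is illustrative):
+
+        wheel = ColorWheel()
+        wheel.color_from_hash("someuser@example.com")  # seed the hue from a stable string
+        css_color = wheel.rgb_format()                 # a CSS-ready 'rgb(r,g,b)' string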
+ + We move by the golden ratio, which keeps an optimal distribution of distance + between colors + ''' + def __init__(self): + self.h = 0 + self.s = 0.5 + self.v = 0.5 + + def next_color(self): + ''' + Move onto the next color + ''' + self.h = self.h + φ + self.h = self.h % 1 + + def color_from_hash(self, s): + ''' + Return a color from a string hash + ''' + hash = int(hashlib.sha1(s.encode('utf-8')).hexdigest(), 16) % 2**16 + self.h = (hash * φ) % 1 + + def rgb_format(self): + ''' + Return a color in an RGB format, appropriate for plotly or css + ''' + return 'rgb({0},{1},{2})'.format(*map(lambda x: int(x * 255), colorsys.hsv_to_rgb(self.h, self.s, self.v))) + + def hex_format(self): + ''' + Return a color in an hex format + ''' + return '#{0:02x}{1:02x}{2:02x}'.format(*map(lambda x: int(x * 255), colorsys.hsv_to_rgb(self.h, self.s, self.v))) + + +def default_user_icon(name): + ''' + Get a default user icon as an SVG. + + Args: + name (str): The name of the user. + + Returns: + str: The default user icon. + ''' + if name is None or name == "": + name = 'Anonymous Anonymous' + if len(name) <= 2: + intials = name + else: + initials = name.split()[0][0].upper() + name.split()[-1][0].upper() + d = svgwrite.Drawing(height=200, width=200) + fill = ColorWheel() + fill.color_from_hash(name) + d.add( + d.circle( + center=(100, 100), + r=100, + fill=fill.hex_format() + ) + ) + d.add( + d.text( + initials, + insert=(100, 125), + font_size=100, + fill="white", + text_anchor="middle", + alignment_baseline="middle", + font_family="sans-serif", + font_weight="bold", + text_rendering="optimizeLegibility", + style="text-rendering: optimizeLegibility; font-family: sans-serif; font-weight: bold;", + ) + ) + + # Workaround for svgwrite bug. It ignores the height and width attributes + # when rendering the SVG, and sets them to 100% instead. + buggy_image = d.tostring() + if "200px" in buggy_image: + print("Huzza! svgwrite bug fixed!") + print("Please remove the bug fix") + else: + fixed_image = buggy_image.replace('100%"', '200px"') + return fixed_image + + +if __name__ == '__main__': + print(default_user_icon("John Doe")) + print(default_user_icon("Jim")) + print(default_user_icon("李小龙")) # Chinese support is a bit limited "李李" is not good + print(default_user_icon("أحمد علي")) + print(default_user_icon("Александр Пушкин")) + print(default_user_icon("José Antonio")) + print(default_user_icon("Janusz Zieliński")) diff --git a/learning_observer/learning_observer/incoming_student_event.py b/learning_observer/learning_observer/incoming_student_event.py new file mode 100644 index 000000000..efcd66a4a --- /dev/null +++ b/learning_observer/learning_observer/incoming_student_event.py @@ -0,0 +1,410 @@ +''' +This has event handlers for incoming student events. + +These should come in over a websocket. We support AJAX too, since it's +nice for debugging. This should never be used in production. 
+ +We: +* Authenticate (minimally, for now, see docs) +* Run these through a set of reducers +''' + +import asyncio +import datetime +import inspect +import json +import os +import time +import traceback +import urllib.parse +import uuid +import socket + +import aiohttp + +import learning_observer.log_event as log_event +import learning_observer.paths as paths + +import learning_observer.auth.utils as authutils # Encoded / decode user IDs +import learning_observer.stream_analytics as stream_analytics # Individual analytics modules + +import learning_observer.settings as settings + +import learning_observer.stream_analytics.helpers + +from learning_observer.log_event import debug_log + +import learning_observer.exceptions + +import learning_observer.auth.events +import learning_observer.adapters.adapter + + +def compile_server_data(request): + ''' + We extract some basic data. In contrast to client data, this data + cannot be spoofed, and can be super-useful for debugging, as well + as attack mitigation. + ''' + return { + 'time': time.time(), + 'origin': request.headers.get('Origin', ''), + 'agent': request.headers.get('User-Agent', ''), + 'ip': request.headers.get('X-Real-IP', ''), + 'executable': 'aio_webapp' + } + + +async def student_event_pipeline(metadata): + ''' + Create an event pipeline, based on header metadata + ''' + client_source = metadata["source"] + debug_log("client_source", client_source) + debug_log("Module", stream_analytics.reducer_modules(client_source)) + analytics_modules = stream_analytics.reducer_modules(client_source) + + # Create an event processor for this user + # TODO: + # * Thing like this (esp. below) should happen in parallel: + # https://stackoverflow.com/questions/57263090/async-list-comprehensions-in-python + # * We should create cached modules for each key, rather than this partial evaluation + # kludge + async def prepare_reducer(analytics_module): + ''' + Prepare a reducer for the analytics module. Note that this is in-place (the + field is mutated). + ''' + f = analytics_module['reducer'] + # We're moving to this always being a co-routine. This is + # backwards-compatibility code which should be remove, + # eventually. We started with a function, and had an interrim + # period where both functions and co-routines worked. + if not inspect.iscoroutinefunction(f): + debug_log("Not a coroutine", analytics_module) + raise AttributeError("The reducer {} should be a co-routine".format(analytics_module)) + + analytics_module['reducer_partial'] = await analytics_module['reducer'](metadata) + return analytics_module + + analytics_modules = await asyncio.gather(*[prepare_reducer(am) for am in analytics_modules]) + + async def pipeline(parsed_message): + ''' + And this is the pipeline itself. It takes messages, processes them, + and, optionally, will inform consumers when there is new data (disabled + in the current code, since we use polling). + ''' + if type(parsed_message) is not dict: + raise ValueError(f"Expected a dict, got {type(parsed_message)}") + if 'client' not in parsed_message: + raise ValueError("Expected a dict with a 'client' field") + if 'event' not in parsed_message['client']: + raise ValueError("Expected a dict with a 'client' field with an 'event' field") + + debug_log("Processing message {event} from {source}".format( + event=parsed_message["client"]["event"], source=client_source + )) + + # Try to run a message through all event processors. + # + # To do: Finer-grained exception handling. 
Right now, if we break, we + # don't even run through the remaining processors. + try: + processed_analytics = [] + # Go through all the analytics modules + for am in analytics_modules: + debug_log("Scope", am['scope']) + event_fields = {} + skip = False + for field in am['scope']: + if isinstance(field, learning_observer.stream_analytics.helpers.EventField): + debug_log("event", parsed_message) + debug_log("field", field) + client_event = parsed_message.get('client', {}) + if field.event not in client_event: + debug_log(field.event, "not found") + skip = True + event_fields[field.event] = client_event.get(field.event) + if not skip: + debug_log("args", event_fields) + processed_analytics.append(await am['reducer_partial'](parsed_message, event_fields)) + except Exception as e: + traceback.print_exc() + filename = paths.logs("critical-error-{ts}-{rnd}.tb".format( + ts=datetime.datetime.now().isoformat(), + rnd=uuid.uuid4().hex + )) + fp = open(filename, "w") + fp.write(json.dumps(parsed_message, sort_keys=True, indent=2)) + fp.write("\nTraceback:\n") + fp.write(traceback.format_exc()) + fp.close() + if settings.RUN_MODE == settings.RUN_MODES.DEV: + raise + return processed_analytics + return pipeline + +COUNTER = 0 + + +async def handle_incoming_client_event(metadata): + ''' + Common handler for both Websockets and AJAX events. + + We do a reduce through the event pipeline, and forward on to + for aggregation on the dashboard side. + ''' + global COUNTER + pipeline = await student_event_pipeline(metadata=metadata) + + filename = "{timestamp}-{counter:0>10}-{username}-{pid}.study".format( + username=metadata.get("auth", {}).get("safe_user_id", "GUEST"), + timestamp=datetime.datetime.utcnow().isoformat(), + counter=COUNTER, + pid=os.getpid() + ) + COUNTER += 1 + + # The adapter allows us to handle old event formats + adapter = learning_observer.adapters.adapter.EventAdapter() + + async def handler(request, client_event): + ''' + This is the handler for incoming client events. + ''' + client_event = adapter.canonicalize_event(client_event) + debug_log("Compiling event for reducer: " + client_event["event"]) + event = { + "client": client_event, + "server": compile_server_data(request), + "metadata": metadata + } + + # Log to the main event log file + log_event.log_event(event) + # Log the same thing to our study log file. This isn't a good final format, since we + # mix data with auth, but we want this for now. + log_event.log_event( + json.dumps(event, sort_keys=True), + filename, preencoded=True, timestamp=True) + await pipeline(event) + + return handler + + +COUNT = 0 + + +def event_decoder_and_logger( + request, + headers=None, + metadata=None, + session={} +): + ''' + This is the main event decoder. It is called by the + websocket handler to log events. + + Parameters: + request: The request object. + headers: The header events, which e.g. contain auth + metadata: Metadata about the request, such as IP. This is + extracted from the request, which will go away soon. + + Returns: + A coroutine that decodes and logs events. + + We call this after the header events, with the header events in the + `headers` parameter. This is because we want to log the header events + before the body events, so they can be dropped from the Merkle tree + for privacy. Although in most cases, students can be reidentified + from the body events, the header events contain explicit identification + tokens. It is helpful to be able to analyze data with these dropped, + obfuscated, or otherwise anonymized. 
+ + At present, many body events contain auth as well. We'll want to minimize + this and tag those events appropriately. + + HACK: We would like clean log files for the first classroom pilot. + + This puts events in per-session files. + + The feature flag has the non-hack implementation. + ''' + if merkle_config := settings.feature_flag("merkle"): + import merkle_store + + storage_class = merkle_store.STORES[merkle_config['store']] + params = merkle_config.get("params", {}) + if not isinstance(params, dict): + raise ValueError("Merkle tree params must be a dict (even an empty one)") + storage = storage_class(**params) + merkle_store.Merkle(storage) + session = { + "student": request.student, + "tool": request.tool + } + merkle_store.start(session) + + def decode_and_log_event(msg): + ''' + Decode and store the event in the Merkle tree + ''' + event = json.loads(msg) + merkle_store.event_to_session(event) + return event + + global COUNT + # Count + PID should guarantee uniqueness. + # With multi-server installations, we might want to add + # `socket.gethostname()`, but hopefully we'll have our + # Merkle tree logger by then, and this will be obsolete. + filename = "{timestamp}-{ip:-<15}-{hip:-<15}-{session_count:0>10}-{pid}".format( + ip=request.remote, + hip=request.headers.get('X-Real-IP', ''), + timestamp=datetime.datetime.utcnow().isoformat(), + session_count=COUNT, + pid=os.getpid() + ) + COUNT += 1 + + def decode_and_log_event(msg): + ''' + Take an aiohttp web sockets message, log it, and return + a clean event. + ''' + if isinstance(msg, dict): + json_event = msg + else: + json_event = json.loads(msg.data) + log_event.log_event(json_event, filename=filename) + return json_event + return decode_and_log_event + + +async def incoming_websocket_handler(request): + ''' + This handles incoming WebSockets requests. It does some minimal + processing on them. It used to rely them on via PubSub to be + aggregated, but we've switched to polling. It also logs them. + ''' + debug_log("Incoming web socket connected") + ws = aiohttp.web.WebSocketResponse() + await ws.prepare(request) + + # For now, we receive two packets to initialize: + # * Chrome's identity information + # * browser.storage identity information + event_metadata = {'headers': {}} + + debug_log("Init pipeline") + header_events = [] + + # This will take a little bit of explaining.... + # + # We originally did not have a way to do auth/auth. Now, we do + # auth with a header. However, we have old log files without that + # header. Setting INIT_PIPELINE to False allows us to use those + # files in the current system. + # + # At some point, we should either: + # + # 1) Change restream.py to inject a false header, or archive the + # source files and migrate the files, so that we can eliminate + # this setting; or + # 2) Dispatch on type of event + # + # This should not be a config setting. + + INIT_PIPELINE = settings.settings.get("init_pipeline", True) + json_msg = None + if INIT_PIPELINE: + async for msg in ws: + debug_log("Auth", msg.data) + try: + json_msg = json.loads(msg.data) + except Exception: + print("Bad message:", msg) + raise + header_events.append(json_msg) + if json_msg["event"] == "metadata_finished": + break + else: + # This is a path for the old way of doing auth, which was to + # send the auth data in the first message. + # + # It is poorly tested. + print("Running without an initialization pipeline / events. 
This is for") + print("development purposes, and may not continue to be supported") + msg = await ws.receive() + json_msg = json.loads(msg.data) + header_events.append(json_msg) + + first_event = header_events[0] + event_metadata['source'] = first_event['source'] + + # We authenticate the student + event_metadata['auth'] = await learning_observer.auth.events.authenticate( + request=request, + headers=header_events, + first_event=first_event, # This is obsolete + source=json_msg['source'] + ) + + print(event_metadata['auth']) + + # We're now ready to make the pipeline. + hostname = socket.gethostname() + decoder_and_logger = event_decoder_and_logger( + request, + headers=header_events, + metadata={ + 'ip': request.remote, + 'host': request.headers.get('Host', ''), + 'user_agent': request.headers.get('User-Agent', ''), + 'x_real_ip': request.headers.get('X-Real-IP', ''), + 'timestamp': datetime.datetime.utcnow().isoformat(), + 'session_count': COUNT, + 'pid': os.getpid(), + 'hostname': hostname, + 'hostip': socket.gethostbyname(hostname), + 'referer': request.headers.get('Referer', ''), + 'host': request.headers.get('Host', ''), + 'x-forwarded-for': request.headers.get('X-Forwarded-For', ''), + 'x-forwarded-host': request.headers.get('X-Forwarded-Host', '') + }, + session={ + 'student': event_metadata['auth']['safe_user_id'], + 'source': event_metadata['source'] + } + ) + + event_handler = await handle_incoming_client_event(metadata=event_metadata) + + # Handle events which we already received, if we needed to peak + # ahead to authenticate user + if not INIT_PIPELINE: + for event in header_events: + decoder_and_logger(event) + await event_handler(request, event) + + # And continue to receive events + async for msg in ws: + # If web socket closed, we're done. + if msg.type == aiohttp.WSMsgType.ERROR: + debug_log(f"ws connection closed with exception {ws.exception()}") + return + + # If we receive an unknown event type, we keep going, but we + # print an error to the console. If we got some kind of e.g. + # wonky ping or keep-alive or something we're unaware of, we'd + # like to handle that gracefully. + if msg.type != aiohttp.WSMsgType.TEXT: + debug_log("Unknown event type: " + msg.type) + + client_event = decoder_and_logger(msg) + await event_handler(request, client_event) + + debug_log('Websocket connection closed') + return ws diff --git a/learning_observer/learning_observer/init.sql b/learning_observer/learning_observer/init.sql new file mode 100644 index 000000000..1cca017d6 --- /dev/null +++ b/learning_observer/learning_observer/init.sql @@ -0,0 +1,99 @@ +reset: | + DROP TABLE IF EXISTS USERS; + DROP TABLE IF EXISTS WRITING_EVENTS; + DROP TABLE IF EXISTS DOCUMENTS; + DROP TABLE IF EXISTS CLASSES; + DROP FUNCTION IF EXISTS insert_event; + +init: | + CREATE TABLE IF NOT EXISTS USERS ( + idx SERIAL PRIMARY KEY, + username text UNIQUE, + email text, + date_created timestamp NOT NULL DEFAULT NOW() + ); + + CREATE TABLE IF NOT EXISTS DOCUMENTS ( + idx SERIAL PRIMARY KEY, + docstring char(48) UNIQUE, + date_created timestamp NOT NULL DEFAULT NOW() + ); + + CREATE TABLE IF NOT EXISTS WRITING_EVENTS ( + idx SERIAL PRIMARY KEY, + user_id integer REFERENCES USERS (idx), -- Who is editing? + document integer REFERENCES DOCUMENTS (idx), -- Which document? 
+ date_created timestamp NOT NULL DEFAULT NOW(), + event json, + ft text -- For debugging: Ongoing reconstruction of full text + ); + + CREATE TABLE IF NOT EXISTS CLASSES ( + teacher integer REFERENCES USERS (idx), + student integer REFERENCES USERS (idx), + classname text + ); + +-- CREATE OR REPLACE FUNCTION insert_event( +-- gusername text, +-- gdocstring char(48), +-- event json, +-- ft text -- For debugging: Ongoing reconstruction of full text +-- ) RETURNS text +-- LANGUAGE plpgsql +-- AS $$ +-- DECLARE +-- strresult text; +-- affected_rows integer; +-- BEGIN +-- strresult := ''; +-- -- If the user does not exist, create the user. Add 'New User' to the return value +-- if NOT EXISTS (SELECT 1 FROM USERS where USERS.username = gusername) THEN +-- strresult := strresult || '[New User]'; +-- INSERT INTO USERS (username) VALUES (gusername); +-- END IF; + +-- -- If the document does not exist, create the document. Add "New Document" to the return value +-- if NOT EXISTS (SELECT 1 FROM DOCUMENTS where DOCUMENTS.docstring = gdocstring) THEN +-- strresult := strresult || '[New Document]'; +-- INSERT INTO DOCUMENTS (docstring) VALUES (gdocstring); +-- END IF; +-- -- Insert the event into the database +-- with INSERT_ROW_COUNT as +-- (INSERT INTO WRITING_EVENTS +-- (user_id, document, event) +-- (SELECT +-- users.idx, documents.idx, event +-- FROM +-- users, documents where users.username=gusername and documents.docstring=gdocstring) +-- RETURNING 1) +-- SELECT COUNT(*) INTO affected_rows FROM INSERT_ROW_COUNT; + +-- -- This is a little bit awkward, but we return: +-- -- 1. Number of rows inserted +-- -- 2. Whether a new user or document was created +-- -- As a string. +-- return cast(affected_rows as varchar) || ' ' || strresult; +-- COMMIT; +-- END; +-- $$; +-- -- Example: SELECT insert_writing_delta('pmitros', 'random-google-doc-id', 'is', 7,8,4,'hello','temp'); + +-- stored_procedures: +-- insert_event: | +-- -- PREPARE insert_writing_delta (text, char(48), char(2), integer, integer, integer, text, text) AS +-- SELECT insert_event($1, $2, $3, ''); + +-- fetch_events: | +-- -- PREPARE fetch_writing_deltas (text, char(48)) AS -- username, document string +-- SELECT +-- WRITING_EVENTS.idx, WRITING_EVENTS.date_created, event +-- FROM +-- WRITING_EVENTS, USERS, DOCUMENTS +-- WHERE +-- WRITING_EVENTS.user_id = USERS.idx AND +-- WRITING_EVENTS.document = DOCUMENTS.idx AND +-- DOCUMENTS.docstring = $2 AND +-- USERS.username = $1 +-- ORDER BY +-- WRITING_EVENTS.idx; diff --git a/learning_observer/learning_observer/jupyter.py b/learning_observer/learning_observer/jupyter.py new file mode 100644 index 000000000..6d5e3c49d --- /dev/null +++ b/learning_observer/learning_observer/jupyter.py @@ -0,0 +1,229 @@ +''' +Integration with Jupyter notebooks. + +We're still figuring this out. + +By the current design: + +1. The user can run the notebook +2. The user can create an iframe in the notebook +3. We have a server which serves the repos to iframes in + the notebook to render data. +4. We have tools to inject data into the iframes. + +This allows us to have a notebook where we can prototype +dashboards, and analyze data. + +The notebook architecture will allow us to capture the +analyses run in the notebook, for open science. + +Much of this code is untested and still in flux. + +For the most part, we're trying to minimize the amount of +code that needs to be written in the notebook and instead +inject the code and data directly into the iframe. 
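+
+A rough sketch of the intended notebook workflow, using the helpers defined
+below (the HTML and data payload are made-up placeholders, and the exact
+signatures may still change as the module evolves):
+
+    # Cell 1: create an empty iframe and keep its ID.
+    import learning_observer.jupyter as lo_jupyter
+    frame = lo_jupyter.make_iframe(width=1280, height=720)
+
+    # Cell 2: load markup into it, then push data for the dashboard to use.
+    # (Separate cells avoid the race condition noted in make_iframe.)
+    lo_jupyter.load_frame_text(frame, "<h1>Prototype dashboard</h1>")
+    lo_jupyter.inject_data(frame, {"student_data": [{"student": "A", "time_on_task": 300}]})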
+''' + +import argparse +import json +import uuid +import base64 + +import aiohttp.web + +import learning_observer.routes + +import gitserve.aio_gitserve + + +from IPython.core.display import display, HTML + + +DEFAULT_PORT = 8008 + + +def show_dashboard( + module, + repo, + branch="master", + path="index.html", + width=1280, + height=720, + port=DEFAULT_PORT +): + ''' + Show a dashboard in an iframe. + ''' + url = f"http://localhost:{port}/{repo}/{branch}/{path}" + + +def make_iframe(url="", width=1280, height=720): + ''' + Make an iframe for a given URL. + + Args: + url (str): The URL to load in the iframe. Should be blank if you want + to load the iframe from a string. + width (int): The width of the iframe. + height (int): The height of the iframe. + + Returns: + str: The iframe ID. + + There is a race condition if we try to `load_frame_text` in the + same Jupyter cell as this. + ''' + frameid = str(uuid.uuid1()) + + display(HTML(f""" + + """)) + return frameid + + +def load_frame_text(frameid, text): + ''' + Load text into an iframe. + + Args: + frameid (str): The ID of the iframe to inject into. + text (str): The text to inject. + ''' + inject_script(frameid, f""" + document.body.innerHTML = atob("{base64.b64encode(text.encode()).decode()}"); + """) + + +def inject_script(frameid, script): + ''' + Inject a script into an iframe. + + Args: + frameid (str): The ID of the iframe to inject into. + script (str): The script to inject. + + Returns: + None + ''' + b64_script = base64.b64encode(script.encode('utf-8')).decode('utf-8') + display(HTML(f""" + + """)) + + +def inject_data(frameid, data): + ''' + Inject data into an iframe. + + Args: + frameid (str): The ID of the iframe to inject into. + data (dict): The data to inject. + + Returns: + None + ''' + for key in data: + inject_script(frameid, f"window.{key} = {json.dumps(data[key])};") + + +def refresh_dashboard(frameid, data): + ''' + Rerender the dashboard from the data in the iframe. + + Args: + frameid (str): The ID of the iframe to inject into. + + Returns: + None + ''' + inject_script(frameid, f""" + refresh_dashboard({json.dumps(data)}); + """) + + +# def refresh_dashboard(frameid, data): +# ''' +# Refresh the dashboard with new data. + +# Args: +# frameid (str): The ID of the iframe to inject into. +# data (dict): The data to inject. + +# Returns: +# None +# ''' +# #inject_data(frameid, data) +# rerender_dashboard_from_data(frameid) +# inject_script(frameid, """ +# window.sendMessage({ +# type: "lo_inject_data", +# data: """ + json.dumps(data) + """ +# }, +# window.location +# ); +# """); +# ) + + +def run_server(repos, port=DEFAULT_PORT): + ''' + Run a server to serve the given repos. + + Args: + repos (list): A list of repos to serve. + port (int): The port to serve on. + + Returns: + Never :) +''' + app = aiohttp.web.Application() + # Override the dashboard route + + # Override static paths for libraries and similar + learning_observer.routes.register_static_routes(app) + # Add routes for repos + learning_observer.routes.register_repo_routes(app, repos) + aiohttp.web.run_app(app, port=port) + + +if __name__ == "__main__": + def to_bool(s): + ''' + Convert a string to a boolean. + + Args: + s (str): The string to convert. + + Returns: + bool: The converted string. + ''' + if s.lower().strip() in ['true', 't', 'yes', 'y', '1']: + return True + elif s.lower().strip() in ['false', 'f', 'no', 'n', '0']: + return False + else: + raise ValueError("Boolean value expected. 
Got {}".format(s)) + + parser = argparse.ArgumentParser(description="Run a server to serve the given repos.") + parser.add_argument("repos", type=str, nargs="+", help="The repos to serve.") + parser.add_argument("--port", type=int, default=DEFAULT_PORT, help="The port to serve on.") + args = parser.parse_args() + repos = {} + for repo in args.repos: + repo_split_partial = repo.split(";") + repo_split_default = ["", "", "", False, True] + repo_split = repo_split_partial + repo_split_default[len(repo_split_partial):] + repos[repo_split[0]] = { + "module": repo_split[0], + "url": repo_split[1], + "prefix": repo_split[2], + "bare": to_bool(repo_split[3]), # This doesn't quite work yet + "working_tree": to_bool(repo_split[4]) + } + run_server(repos, args.port) diff --git a/learning_observer/learning_observer/kvs.py b/learning_observer/learning_observer/kvs.py new file mode 100644 index 000000000..f6331252a --- /dev/null +++ b/learning_observer/learning_observer/kvs.py @@ -0,0 +1,368 @@ +'''Key-value store + +Manages JSON objects + +kvs.KVS() will return a key-value store. Note that the back-end is +shared, but each + +Keys are strings. Values are JSON objects. + +To read objects: + +''' + +import asyncio +import copy +import json +import os +import os.path +import sys + +import learning_observer.prestartup +import learning_observer.settings +import learning_observer.redis_connection +import learning_observer.paths +import learning_observer.util + +OBJECT_STORE = dict() + + +class _KVS: + async def dump(self, filename=None): + ''' + Dumps the entire contents of the KVS to a JSON object. + + It is intended to be used in development and for debugging. It is not + intended to be used in production, as it is not very performant. It can + be helpful for offline analytics too, at least at a small scale. + + If `filename` is not `None`, the contents of the KVS are written to the + file. In either case, the contents are returned as a JSON object. + + In the future, we might want to add filters, so that this is scalable + for extracting specific data from production systems (e.g. dump data + for one user). + + args: + filename: The filename to write to. If `None`, don't write to a file. + + returns: + A JSON object containing the contents of the KVS. + ''' + data = {} + for key in await self.keys(): + data[key] = await self[key] + if filename: + with open(filename, 'w') as f: + json.dump(data, f, indent=4) + return data + + async def multiget(self, keys): + ''' + Multiget. It's not fast, but it means we can use appropriate + abstractions and make it fast later. + ''' + return [await self[key] for key in keys] + + async def load(self, filename): + ''' + Loads the contents of a JSON object into the KVS. + + It is intended to be used in development and for debugging. It is not + intended to be used in production, as it is not very performant. It can + be helpful for offline analytics too, at least at a small scale. + ''' + with open(filename) as f: + data = json.load(f) + for key, value in data.items(): + await self.set(key, value) + + +class InMemoryKVS(_KVS): + ''' + Stores items in-memory. Items expire on system restart. + ''' + async def __getitem__(self, key): + ''' + Syntax: + + >> await kvs['item'] + ''' + return copy.deepcopy(OBJECT_STORE.get(key, None)) + + async def set(self, key, value): + ''' + Syntax: + >> await set('key', value) + + `key` is a string, and `value` is a json object. + + We can't use a setter with async, as far as I can tell. 
There is no + + `await kvs['item'] = foo + + So we use an explict set function. + ''' + json.dumps(value) # Fail early if we're not JSON + assert isinstance(key, str), "KVS keys must be strings" + OBJECT_STORE[key] = value + + async def keys(self): + ''' + Returns all keys. + + Eventually, this might support wildcards. + ''' + return list(OBJECT_STORE.keys()) + + async def clear(self): + ''' + Clear the KVS. + + This is helpful for debugging and testing. We did not want to + implement this for the production KVS, since it would be + too easy to accidentally lose data. + ''' + global OBJECT_STORE + OBJECT_STORE = dict() + + +class _RedisKVS(_KVS): + ''' + Stores items in redis. + ''' + def __init__(self, expire): + self.expire = expire + + async def connect(self): + ''' + asyncio_redis auto-reconnects. We can't do async in __init__. So + we connect on the first get / set. + ''' + await learning_observer.redis_connection.connect() + + async def __getitem__(self, key): + ''' + Syntax: + + >> await kvs['item'] + ''' + await self.connect() + item = await learning_observer.redis_connection.get(key) + if item is not None: + return json.loads(item) + return None + + async def set(self, key, value): + ''' + Syntax: + >> await set('key', value) + + `key` is a string, and `value` is a json object. + + We can't use a setter with async, as far as I can tell. There is no + + `await kvs['item'] = foo + + So we use an explict set function. + ''' + await self.connect() + value = json.dumps(value) # Fail early if we're not JSON + assert isinstance(key, str), "KVS keys must be strings" + return await learning_observer.redis_connection.set(key, value) + return + + async def keys(self): + ''' + Return all the keys in the KVS. + + This is obviously not very performant for large-scale dpeloys. + ''' + await self.connect() + return await learning_observer.redis_connection.keys() + + +class EphemeralRedisKVS(_RedisKVS): + ''' + For testing: redis drops data quickly. + ''' + def __init__(self): + ''' + We're just a `_RedisKVS` with expiration set + ''' + super().__init__(expire=learning_observer.settings.settings['kvs'].get('expiry', 30)) + + +class PersistentRedisKVS(_RedisKVS): + ''' + + For deployment: Data lives forever. + ''' + def __init__(self): + ''' + We're just a `_RedisKVS` with expiration unset + ''' + super().__init__(expire=None) + + +class FilesystemKVS(_KVS): + ''' + This is a very non-scalable, non-performant KVS, where each item is a file + on the filesystem. It can be helpful for debugging. Note that any sort + of real-world use as the main KVS is not only non-performant, but could + result in SSD wear. + + It's not a bad solution for caching some files in small-scale deploys. + ''' + def __init__(self, path=None, subdirs=False): + ''' + path: Where to store the kvs. Default: kvs + subdirs: If set, keys with slashes will result in the creation of + subdirs. 
For example, self.set("foo/bar", "hello") would create the + directory foo (if it doesn't exist) and store "hello" in the file "bar" + ''' + self.path = path or learning_observer.paths.data('kvs') + self.subdirs = subdirs + + def key_to_safe_filename(self, key): + ''' + Convert a key to a safe filename + ''' + if self.subdirs: + paths = key.split('/') + # Add underscores to directories so they don't conflict with files + for i in range(len(paths) - 1): + paths[i] = '_' + paths[i] + safename = (os.sep).join(map(learning_observer.util.to_safe_filename, paths)) + else: + safename = learning_observer.util.to_safe_filename(key) + return os.path.join(self.path, safename) + + def safe_filename_to_key(self, filename): + raise NotImplementedError("Code this up, please. Or for debugging, comment out the exception") + return filename + + async def __getitem__(self, key): + path = self.key_to_safe_filename(key) + if not os.path.exists(path): + return None + with open(path) as f: + return f.read() + + async def set(self, key, value): + path = self.key_to_safe_filename(key) + if self.subdirs: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'w') as f: + f.write(value) + + async def __delitem__(self, key): + path = key_to_safe_filename(key) + os.remove(path) + + async def keys(self): + ''' + This one is a little bit tricky, since if subdirs, we need to do a full + walk + ''' + if self.subdirs: + for root, dirs, files in os.walk(self.path): + for f in files: + yield self.safe_filename_to_key(os.path.join(root, f).replace(os.sep, '/')) + else: + for f in os.listdir(self.path): + yield self.safe_filename_to_key(f) + + +KVS = None + + +@learning_observer.prestartup.register_startup_check +def kvs_startup_check(): + ''' + This is a startup check. If confirms that the KVS is properly configured + in settings.py. It should happen after we've loaded settings.py, so we + register this to run in prestartup. + + Checks like this one allow us to fail on startup, rather than later + ''' + global KVS + try: + KVS_MAP = { + 'stub': InMemoryKVS, + 'redis_ephemeral': EphemeralRedisKVS, + 'redis': PersistentRedisKVS + } + KVS = KVS_MAP[learning_observer.settings.settings['kvs']['type']] + except KeyError: + if 'kvs' not in learning_observer.settings.settings: + raise learning_observer.prestartup.StartupCheck( + "No KVS configured. Please set kvs.type in settings.py\n" + "Look at example settings file to see what's available." + ) + elif learning_observer.settings.settings['kvs']['type'] not in KVS_MAP: + raise learning_observer.prestartup.StartupCheck( + "Unknown KVS type: {}\n" + "Look at example settings file to see what's available. \n" + "Suppported types: {}".format( + learning_observer.settings.settings['kvs']['type'], + list(KVS_MAP.keys()) + ) + ) + else: + raise learning_observer.prestartup.StartupCheck( + "KVS incorrectly configured. Please fix the error, and\n" + "then replace this with a more meaningful error message" + ) + return True + + +async def test(): + ''' + Simple test case: Spin up a few KVSes, write data to them, make + sure it's persisted globally. 
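+
+    For reference, the basic coroutine API shared by all backends looks like
+    this (the key and value here are made up; any string key and
+    JSON-serializable value will do):
+
+        kvs = InMemoryKVS()
+        await kvs.set("student:42", {"time_on_task": 300})
+        await kvs["student:42"]              # ==> {"time_on_task": 300}
+        await kvs.keys()                     # ==> ["student:42"]
+        await kvs.multiget(["student:42"])   # ==> [{"time_on_task": 300}]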
+ ''' + learning_observer.settings.load_settings('creds.yaml') + mk1 = InMemoryKVS() + mk2 = InMemoryKVS() + ek1 = EphemeralRedisKVS() + ek2 = EphemeralRedisKVS() + fs1 = FilesystemKVS(path="/tmp/flatkvs") + fs2 = FilesystemKVS(path="/tmp/dirkvs", subdirs=True) + assert (await mk1["hi"]) is None + print(await ek1["hi"]) + assert (await ek1["hi"]) is None + await mk1.set("hi", 5) + await mk2.set("hi", 7) + await ek1.set("hi", 8) + await ek2.set("hi", 9) + if True: + os.makedirs("/tmp/flatkvs", exist_ok=True) + os.makedirs("/tmp/dirkvs", exist_ok=True) + await fs1.set("fooo", "poof") + await fs1.set("foo/barła", "zoo") + await fs2.set("foob", "loo") + await fs2.set("fob/perła", "koo") + assert (await fs1["fooo"]) == "poof" + if False: # We don't do this correctly yet + async for k in fs2.keys(): + print(k) + async for k in fs1.keys(): + print(k) + + assert (await mk1["hi"]) == 7 + print(await ek1["hi"]) + print(type(await ek1["hi"])) + print((await ek1["hi"]) == 9) + assert (await ek1["hi"]) == 9 + print(await mk1.keys()) + print(await ek1.keys()) + print("Test successful") + print("Please wait before running test again") + print("redis needs to flush expiring objects") + print("This delay is set in the config file.") + print("It is typically 1s - 24h, depending on") + print("the work.") + +if __name__ == '__main__': + asyncio.run(test()) diff --git a/learning_observer/learning_observer/log_event.py b/learning_observer/learning_observer/log_event.py new file mode 100644 index 000000000..8cfdf08c3 --- /dev/null +++ b/learning_observer/learning_observer/log_event.py @@ -0,0 +1,299 @@ +''' +For now, we dump logs into files, crudely. + +We're not there yet, but we would like to create a 哈希树, or +Merkle-tree-style structure for our log files. + +Or to be specific, a Merkle DAG, like git. + +Each item is stored under its SHA hash. Note that items are not +guaranteed to exist. We can prune them, and leave a dangling pointer +with just the SHA. + +Each event log will be structured as + +-----------------+ +-----------------+ +<--- Last item (SHA) | <--- Last item (SHA) | ... + | | | | + | Data (SHA) | | Data (SHA) | + +-------|---------+ +--------|--------+ + | | + v v + +-------+ +-------+ + | Event | | Event | + +-------+ +-------+ + +Where the top objects form a linked list (each containing a pair of +SHA hashes, one of the previous item, and one of the associated +event). + +We will then have a hierarchy, where we have lists per-document, +documents per-student. When we run analyses, those will store the +hashes of where in each event log we are. Likewise, with each layer +of analysis, we'll store pointers to git hashes of code, as well as +of intermediate files (and how those were generated). + +Where data is available, we can confirm we're correctly replicating +prior tesults. + +The planned data structure is very similar to git, but with the +potential for missing data without an implosion. + +Where data might not be available is after a FERPA, CCPA, or GDPR +requests to change data. In those cases, we'll have dangling nodes, +where we'll know that data used to exist, but not what it was. + +We might also have missing intermediate files. For example, if we do +a dozen analyses, we'll want to know those happened and what those +were, but we might not keep terabytes of data around (just enough to +redo those analyses). 
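+
+A toy sketch of the chain described above -- illustrative only; the actual
+implementation below just appends JSON lines for now, and the hash length and
+encoding here are arbitrary:
+
+    import hashlib, json
+
+    def sha(s):
+        return hashlib.sha256(s.encode('utf-8')).hexdigest()
+
+    def append(log, event):
+        event_json = json.dumps(event, sort_keys=True)
+        prev_sha = log[-1]['node_sha'] if log else None
+        node = {'prev_sha': prev_sha, 'event_sha': sha(event_json), 'event': event_json}
+        node['node_sha'] = sha(json.dumps([prev_sha, node['event_sha']]))
+        log.append(node)
+
+    log = []
+    append(log, {'event': 'save', 'doc': 'example-doc'})
+    append(log, {'event': 'save', 'doc': 'example-doc'})
+    # Pruning log[0]['event'] (e.g. for a GDPR request) leaves a dangling
+    # 'event_sha', but every later 'node_sha' still verifies.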
+''' + +import datetime +from enum import Enum +import inspect +import io +import json +import hashlib +import os +import os.path + +import learning_observer.filesystem_state + +import learning_observer.paths as paths +import learning_observer.settings as settings +import learning_observer.prestartup +import learning_observer.util + + +# These should move into the startup check +# +# Moving this would involve either queuing log messages until that check +# is called, or calling that before any events are generated. That's an +# important to do in either case. +if not os.path.exists(paths.logs()): + print("Creating path for log files...") + os.mkdir(paths.logs()) + +if not os.path.exists(paths.logs("startup")): + print("Creating path for startup logs...") + os.mkdir(paths.logs("startup")) + +mainlog = open(paths.logs("main_log.json"), "ab", 0) +files = {} + + +# Do we make files for exceptions? Do we print extra stuff on the console? +# +# On deployed systems, this can make a mess. On dev systems, this is super-helpful +# +# We should probably move this to the settings file instead of hardcoding it. There +# was a reason for not placing this in the settings file, but it's no longer relevant +# after a refactor. +class LogLevel(Enum): + ''' + What level of logging do we want? + + NONE: Don't print anything + SIMPLE: Print a simple message + EXTENDED: Print a message with a stack trace and timestamp + ''' + NONE = 'NONE' + SIMPLE = 'SIMPLE' + EXTENDED = 'EXTENDED' + + +class LogDestination(Enum): + ''' + Where we log events? We can log to a file, or to the console. + ''' + CONSOLE = 'CONSOLE' + FILE = 'FILE' + + +# Before we've read the settings file, we'll log basic messages to the +# console and to the log file. +DEBUG_LOG_LEVEL = LogLevel.SIMPLE +DEBUG_LOG_DESTINATIONS = (LogDestination.CONSOLE, LogDestination.FILE) + + +@learning_observer.prestartup.register_init_function +def initialize_logging_framework(): + ''' + On startup, once settings are loaded, we set destinations as per the settings. + + Note that we may get log events before this is set up from other init code, which + may ignore settings. + + We also log the system startup state. + ''' + global DEBUG_LOG_LEVEL + global DEBUG_LOG_DESTINATIONS + + # If we're in deployment, we don't want to print anything. + DEBUG_LOG_LEVEL = LogLevel.NONE + DEBUG_LOG_DESTINATIONS = [] + + # If we're in development, we want to print to the console and to a file. + if settings.RUN_MODE == settings.RUN_MODES.DEV: + DEBUG_LOG_LEVEL = LogLevel.SIMPLE + DEBUG_LOG_DESTINATIONS = [LogDestination.CONSOLE, LogDestination.FILE] + + # In either case, we want to override from the settings file. + if "logging" in settings.settings: + if "debug_log_level" in settings.settings["logging"]: + DEBUG_LOG_LEVEL = LogLevel(settings.settings["logging"]["debug_log_level"]) + if "debug_log_destinations" in settings.settings["logging"]: + DEBUG_LOG_DESTINATIONS = list(map(LogDestination, settings.settings["logging"]["debug_log_destinations"])) + + debug_log("DEBUG_LOG_LEVEL:", DEBUG_LOG_LEVEL) + debug_log("DEBUG_DESTINATIONS:", DEBUG_LOG_DESTINATIONS) + + # We're going to save the state of the filesystem on application startup + # This way, event logs can refer uniquely to running version + # Do we want the full 512 bit hash? Cut it back? Use a more efficient encoding than + # hexdigest? 
+ startup_state = json.dumps(learning_observer.filesystem_state.filesystem_state(), indent=3, sort_keys=True) + STARTUP_STATE_HASH = learning_observer.util.secure_hash(startup_state.encode('utf-8')) + STARTUP_FILENAME = "{directory}/{time}-{hash}.json".format( + directory=paths.logs("startup"), + time=datetime.datetime.utcnow().isoformat(), + hash=STARTUP_STATE_HASH + ) + + with open(STARTUP_FILENAME, "w") as sfp: + # gzip can save about 2-3x space. It makes more sense to do this + # with larger files later. tar.gz should save a lot more + sfp.write(startup_state) + + +def encode_json_line(line): + ''' + For encoding short data, such as an event. + + We use a helper function so we have the same encoding + everywhere. Our primary goal is replicability -- if + we encode the same dictionary twice, we'd like to get + the same string, with the same hash. + ''' + return json.dumps(line, sort_keys=True) + + +def encode_json_block(block): + ''' + For encoding large data, such as the startup log. + + We use a helper function so we have the same encoding + everywhere. Our primary goal is replicability -- if + we encode the same dictionary twice, we'd like to get + the same string, with the same hash. + ''' + return json.dumps(block, sort_keys=True, indent=3) + + +def log_event(event, filename=None, preencoded=False, timestamp=False): + ''' + This isn't done, but it's how we log events for now. + ''' + if filename is None: + log_file_fp = mainlog + elif filename in files: + log_file_fp = files[filename] + else: + log_file_fp = open(paths.logs("" + filename + ".log"), "ab", 0) + files[filename] = log_file_fp + + if not preencoded: + event = encode_json_line(event) + log_file_fp.write(event.encode('utf-8')) + if timestamp: + log_file_fp.write("\t".encode('utf-8')) + log_file_fp.write(datetime.datetime.utcnow().isoformat().encode('utf-8')) + log_file_fp.write("\n".encode('utf-8')) + log_file_fp.flush() + + +def print_to_string(*args, **kwargs): + ''' + This is a wrapper around print, which returns a string instead of + printing it. + + :param args: The arguments to print + :param kwargs: The keyword arguments to print + :return: A string + ''' + output = io.StringIO() + print(*args, file=output, **kwargs) + contents = output.getvalue() + output.close() + return contents + + +def debug_log(*args): + ''' + Helper function to help us trace our code. + + We print a time stamp, a stack trace, and a /short/ summary of + what's going on. + + This is not intended for programmatic debugging. We do change + format regularly (and you should feel free to do so too -- for + example, on narrower terminals, a `\n\t` can help) + ''' + if DEBUG_LOG_LEVEL not in (LogLevel.NONE, LogLevel.SIMPLE, LogLevel.EXTENDED): + raise ValueError("Invalid debug log type: {}".format(DEBUG_LOG_LEVEL)) + if DEBUG_LOG_LEVEL == LogLevel.NONE: + return + text = print_to_string(*args) + if DEBUG_LOG_LEVEL == LogLevel.SIMPLE: + message = text + elif DEBUG_LOG_LEVEL == LogLevel.EXTENDED: + stack = inspect.stack() + stack_trace = "{s1}/{s2}/{s3}".format( + s1=stack[1].function, + s2=stack[2].function, + s3=stack[3].function, + ) + message = "{time}: {st:60}\t{body}".format( + time=datetime.datetime.utcnow().isoformat(), + st=stack_trace, + body=text + ) + + # Flip here to print / not print debug messages + if LogDestination.CONSOLE in DEBUG_LOG_DESTINATIONS: + print(message.strip()) + + # Print to file. Only helpful for development. 
+ if LogDestination.FILE in DEBUG_LOG_DESTINATIONS: + with open(paths.logs("debug.log"), "a") as fp: + fp.write(message.strip() + "\n") + + # Ideally, we'd like to be able to log these somewhere which won't cause cascading failures. + # If we e.g. have errors every 100ms, we don't want to create millions of debug files. + # There are services which handle this pretty well, I believe + + +AJAX_FILENAME_TEMPLATE = "{directory}/{time}-{payload_hash}.json" + + +def log_ajax(url, resp_json, request): + ''' + This is primarily used to log the responses of AJAX requests made + TO Google and similar providers. This helps us understand the + context of classroom activity, debug, and recover from failures + ''' + payload = { + 'user': request['user'], + 'url': url, + 'response': resp_json, + 'timestamp': datetime.datetime.utcnow().isoformat() + } + encoded_payload = encode_json_block(payload) + payload_hash = learning_observer.util.secure_hash(encoded_payload.encode('utf-8')) + filename = AJAX_FILENAME_TEMPLATE.format( + directory=paths.logs("ajax"), + time=datetime.datetime.utcnow().isoformat(), + payload_hash=payload_hash + ) + with open(filename, "w") as ajax_log_fp: + ajax_log_fp.write(encoded_payload) diff --git a/learning_observer/learning_observer/main.py b/learning_observer/learning_observer/main.py new file mode 100644 index 000000000..77513f1fb --- /dev/null +++ b/learning_observer/learning_observer/main.py @@ -0,0 +1,155 @@ +''' +main.py +========= + +This is the main file for processing event data for student writing. This +system is designed for our writing analysis project, but is designed to +generalize to learning process data from multiple systems. We have a few +small applications we are testing this system with as well (e.g. dynamic +assessment). +''' + +import sys + +import asyncio + +import aiohttp +import aiohttp.web + +import uvloop + +import learning_observer.settings as settings +import learning_observer.routes as routes +import learning_observer.prestartup +import learning_observer.webapp_helpers +import learning_observer.watchdog_observer + +from learning_observer.log_event import debug_log + +# If we e.g. `import settings` and `import learning_observer.settings`, we +# will load startup code twice, and end up with double the global variables. +# This is a test to avoid that bug. +if not __name__.startswith("learning_observer."): + raise ImportError("Please use fully-qualified imports") + sys.exit(-1) + +# Run argparse +args = settings.parse_and_validate_arguments() + + +def configure_event_loop(): + ''' + This is a feature flag. We have not tested / benchmarked it, but + it claims to make async Python much faster. + ''' + if 'uvloop' in settings.settings.get("feature_flags", {}): + debug_log("Running with uvloop") + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + else: + debug_log("Running without uvloop") + + +port = None +runmode = None + + +def create_app(): + ''' + Create the application. + + We've moved this into a function so we can call it from the watchdog + observer and other places. + ''' + global port, runmode + # Load the settings file + settings.load_settings(args.config_file) + configure_event_loop() + + # We don't want these to change on a restart. + # We should check if reloading this module overwrites them. 
+ if port is None: + port = settings.settings.get("server", {}).get("port", None) + if runmode is None: + runmode = settings.settings.get("config", {}).get("run_mode", None) + + if port is None and runmode == 'dev': + port = learning_observer.webapp_helpers.find_open_port() + + # Check that everything is configured correctly, + # and initialize anything which needs initialization + learning_observer.prestartup.startup_checks_and_init() + # Initialize the streaming analytics framework + learning_observer.stream_analytics.init() + # Create the application + app = aiohttp.web.Application() + + # Set up the routing table + routes.add_routes(app) + + # Set up all the middlewares, sessions, and things + learning_observer.webapp_helpers.setup_cors(app) + learning_observer.webapp_helpers.setup_session_storage(app) + learning_observer.webapp_helpers.setup_middlewares(app) + return app + + +def shutdown(app): + ''' + Shutdown the app. + ''' + app.shutdown() + app.cleanup() + return app + + +def start(): + ''' + Start the application. + ''' + global app + # Reload all imports + app = create_app() + aiohttp.web.run_app(app, port=port) + return app + + +print("Arguments:", args) + +if args.watchdog is not None: + print("Watchdog mode") + # Parse argument to determine watchdog handler + restart = { + 'restart': learning_observer.watchdog_observer.restart, + 'reimport': learning_observer.watchdog_observer.reimport_child_modules, + } + if args.watchdog not in restart: + print( + f"Invalid watchdog mode. Valid modes are: {', '.join(restart.keys())}" + ) + sys.exit(-1) + fs_event_handler = learning_observer.watchdog_observer.RestartHandler( + shutdown=shutdown, + restart=restart[args.watchdog], + start=start + ) + learning_observer.watchdog_observer.watchdog(fs_event_handler) + +app = start() + +# Port printing: +# +# This is kind of ugly. If we want to log the startup port, we can either +# do our own app runner as per: +# https://stackoverflow.com/questions/44610441/how-to-determine-which-port-aiohttp-selects-when-given-port-0 +# +# Or we can introspect: +# import gc +# sites = gc.get_referrers(aiohttp.web.TCPSite) +# And find the right object and introspect its port. +# +# To make a dummy test TCPSite: +# runner = aiohttp.web.AppRunner(aiohttp.web.Application()) +# await runner.setup() +# foo = aiohttp.web.TCPSite(runner) +# +# Or we can manually find the first open port ourselves. diff --git a/learning_observer/learning_observer/merkle_store.py b/learning_observer/learning_observer/merkle_store.py new file mode 100644 index 000000000..8aafce5ed --- /dev/null +++ b/learning_observer/learning_observer/merkle_store.py @@ -0,0 +1,650 @@ +''' +This is a prototype for our log storage system. + +1. We'd like the logical design to scale to millions of users, each generating + millions of events. + - Merkle trees are nice, since the only logical operation is writing a + key/value pair under its hash + - However, we don't quite use this as a back-end representation, since we'd + like to be able to get at streams of events efficiently. That's why we + don't quite use a key-value store -- walking a linked list in a KVS is + slow. +2. We'd like to be able to provide users with their complete data (e.g. it's + not in a million different places). + - "Users" can mean students, schools, etc. + - Requests may come in for complete data, or for a subset of data. +3. 
We'd like to be able to have users remove or correct their data + - "Users" can have multiple definitions, as per above + - "Data" can mean for a particular document, all data, etc. + - Such a removal should leave a trace that data was removed, but remove + data completely. +4. We'd like to have an archival record of everything that happened, except + for data lost to such removals + - This should be auditable -- e.g. we can't fake data + - The cryptographic properties of the hash tree allow us to audit all + data that was retained. +5. In the future, we'd like an archival record of all processing on top of + data + - Families should be able to audit how their data was processed + - Researchers should be able to review a modern-day equivalent to a + lab notebook + - This should be auditable -- e.g. we can't do a p-value hunt without + leaving a record + +There is a lot of nuance -- which we may not have gotten right yet -- around: + +- What level to expose how much PII at. Removal requests ought to remove + PII, but maintain hashes pointing to the removed data +- Whether and how to break up the data into chunks. Right now, each stream + is a single chunk. We might want to e.g. break up on hourly, daily, or + other boundaries. This doesn't change the logical Merkle tree, but it + change the way we map it to storage. +- Whether and what kind of metadata we want to include in the tree. We can + include events which are in the streams, but not in the Merkle tree itself + (e.g. headers, etc.) +- How to handle logs of computation on data. +- How often to compute a top-level hash to expose to the world. We'd like to + publish a daily hash. From there, anyone who requests data should receive + a chain of hashes which allows them to verify that data is correct, + complete, and not modified. + +Note that this is *not* designed to serve data directly to dashboards. However, +we do want to be able to use the same reducers to do batched processing of +this data for research as we do for dashboards (which process streams in +realtime, and only maintain features). + +It is very much a prototype. To make this not a prototype, we would need to: + +- Make it work with Kafka +- Make it work with asyncio +- Make the file system operations not slow +- Use full-length hashes +- Confirm it's robust +- Escape the file names properly or compute interrim session IDs more + intelligently +- Etc. +''' + +import hashlib +import json +import datetime +from modulefinder import STORE_GLOBAL +import os +from pickle import STOP + +# These should be abstracted out into a visualization library. +import matplotlib +import networkx +from learning_observer.incoming_student_event import COUNT +import pydot + +from confluent_kafka import Producer, Consumer + + +def json_dump(obj): + """ + Dump an object to JSON. + + Args: + obj (object): The object to dump. + + Returns: + str: The JSON dump. + + This is shorthand so that we have a consistent way to dump objects + each time. We use JSON dumps to index into dictionaries and such. + """ + return json.dumps(obj, sort_keys=True) + + +def json_load(string): + """ + Load a JSON string. + + Args: + string (str): The JSON string. + + Returns: + object: The JSON object. + + We don't really need this, put for symmetry with json_dump and consistency. + """ + return json.loads(string) + + +COUNT = 0 + + +def session_key(session): + """ + Return an ID associated with a session. 
+ + Such an ID is used before we have a finished session which we can + place into the Merkle DAG + + Args: + session (dict): The session. + + Returns: + str: The session ID. This is not guaranteed to be a string in the + future. + + The session ID is currently a JSON dump, with some extra info to prevent + collisions. + """ + global COUNT + COUNT += 1 + + base = { + 'timestamp': timestamp(), + 'count': COUNT + } + return json_dump(session) + + +# This might turn into a class in the future. For now, we just use the +# session_key +Session = session_key + + +def timestamp(): + """ + Return a timestamp string in ISO 8601 format + + Returns: + str: The timestamp string. + + The timestamp is in UTC. + """ + return datetime.datetime.utcnow().isoformat() + + +def hash(*strings): + """ + Return a hash of the given strings. + + Args: + strings (str): The strings to hash. + + Returns: + str: The hash of the given strings. + + The strings should not contain tabs. + """ + return hashlib.sha1('\t'.join(strings).encode('utf-8')).hexdigest()[:8] + + +class Merkle: + def __init__(self, storage, categories): + ''' + Initialize the merkle DAG. + + `categories` is a list of categories by which we might + want to index into events + ''' + self.storage = storage + self.categories = categories + + # These are generic to interact with the Merkle DAG + def event_to_session(self, event, session, children=None, label=None): + ''' + Append an event to the merkle tree. + + There are two possibilities here: + 1. We have a closure and we're updating the SHA hash with each + event. + 2. We don't have a closure and we're placing the individual + events into the Merkle DAG. + We went with the second option. This makes events into the leaf, + nodes, whereas the first option makes sessions into the leaf + nodes. + + This uses a little bit more space, but it's easier to reason about, + and potentially minimizes some ad-hoc decisions, such as where to + put boundaries between long-running sessions. + + We might still want a closure, so we don't need to read back the + last item in the stream. Or perhaps we want both (with one calling + the other), using a closure for rapid events and a call like this + one for rare ones. + + Args: + event (dict): The event to append. + session (dict): The session to append to. This should specify + a set of categories, and map those to lists of associated + IDs. For example, "teacher": ["teacher1", "teacher2"] + children (list): Additional children of this event, beyond the + current item and the past event. + label (str): An optional human-friendly label for this event. This + should NOT be relied on programmatically, or to be unique. It's + just for human consumption, e.g. when making visualizations. + + Returns: + dict: The event envelope, with the session updated, and the + hash computed. + ''' + # reverse() so we add children from the parameters to the end of the list + # of children. This isn't strictly necessary, but it is a little bit + # nicer to look at manually. We could remove the pair of reverse() calls, + # since this is an unordered list, if this ever becomes a problem. 
+ if children is None: + children = list() + children.reverse() + storage = self.storage + session_id = session_key(session) + ts = timestamp() + + event_hash = hash(json_dump(event)) + node_hash = hash(*children, ts) + + last_hash = None + last_item = storage._most_recent_item(session_id) + + if last_item is not None: + last_hash = last_item['hash'] + children.append(last_hash) + + children.append(event_hash) + + children.reverse() + print(children) + + item = { + 'children': children, # Points to the full chain / children + 'hash': node_hash, # Current node + 'timestamp': ts, # Timestamp + 'event': event + } + if label is not None: + item['label'] = label + storage._append_to_stream(session_id, item) + print(item['hash']) + return item + + def start(self, session, metadata=None, continue_session=False): + ''' + Start a new session. + + Args: + session (dict): The session to start. + metadata (dict): Optional metadata to attach to the session. + + Returns: + dict: The session envelope + ''' + if not continue_session: + event = { + 'type': 'start', + 'session': session + # Perhaps we want to add a category here? E.g. 'session_event_stream' for the raw streams + # and something else to indicate parents? + } + else: + raise NotImplementedError('Continuing sessions not implemented') + if metadata is not None: + event['metadata'] = metadata + return self.event_to_session( + event, + session, + label='start' + ) + + def close_session(self, session, logical_break=False): + ''' + Close the session. We update up-stream nodes with the session's + merkle leaf. and if necessary, we update the session's key / + topic / alias with the hash of the full chain. + ''' + final_item = self.event_to_session( + {'type': 'close', 'session': session}, + session, + label='close' + ) + session_hash = final_item['hash'] + self.storage._rename_or_alias_stream(session_key(session), session_hash) + if len(session) < 1: + raise Exception('Session is empty') + if len(session) == 1: + print("Parent session") + print("These sessions shouldn't be closed") + return + + # We need to update the parents to point to this session. + # We don't do this if we're only introducing a logical break, + # since the session continues on. + if not logical_break: + for key in session: + if key not in self.categories: + print("Something is wrong. Session has unexpected key: {}".format(key)) + for item in session[key]: + parent_session = {key: item} + self.event_to_session( + { + 'type': 'child_session_finished', + 'session': session_hash # This should go into children, maybe?, + }, + parent_session, + children=[session_hash], + label=f'{key}' + ) + return session_hash + + def break_session(self, session): + ''' + Split a session into two parts. This has no logical effect on the data structure, + but creates a split so that a portion of the data can be accessed under it's own + key. Logically, keys can either be part of the event envelope, or they can be the + key / topic / filename of the session. + + It may make sense to do this e.g. daily to break up long-running sessions. This + stub is a proof of concept. + + Note that we do not create ANY new keys here, since we should be able to break + a session into multiple parts, or recombine them, without breaking the logical + structure. 
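+
+        A hedged sketch of how this fits with the rest of the class, using the
+        in-memory backend defined later in this module (IDs are made up, and
+        break_session itself is still a stub -- start() does not yet support
+        continue_session):
+
+            storage = InMemoryStorage()
+            merkle = Merkle(storage, CATEGORIES)
+            session = {"teacher": ["t1"], "student": ["s1"]}
+            merkle.start(session)
+            merkle.event_to_session({"type": "event", "event": "edit"}, session, label="edit")
+            chain_hash = merkle.close_session(session)
+            # The raw stream is now filed under its final hash, and each parent
+            # stream (one per teacher/student ID) has a 'child_session_finished'
+            # node pointing at chain_hash. A daily break_session() call would sit
+            # where close_session() is, keeping the chain connected.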
+ ''' + session_hash = self.close_session(session, logical_break=True) + self.start(session, continue_session={ + 'type': 'continue', + 'session': session_hash + }) + return session_hash + + +class StreamStorage: + def _append_to_stream(self, stream, item): + ''' + Append an item to a stream. + ''' + raise NotImplementedError + + def _rename_or_alias_stream(self, stream, alias): + ''' + Rename a stream. + ''' + raise NotImplementedError + + def _get_stream_data(self, stream): + ''' + Get the stream. + ''' + raise NotImplementedError + + def _delete_stream(self, sha_key): + ''' + Delete a stream. + + Mostly for right-to-be-forgotten requests + ''' + raise NotImplementedError + + def _most_recent_item(self, stream): + ''' + Get the most recent item in a stream. + ''' + raise NotImplementedError + + def _walk(self): + ''' + Walk the DAG. This is used for debugging. + ''' + raise NotImplementedError + + def _make_label(self, item): + ''' + Make a label for an item. + + This is cosmetic, when rendering the graph. + ''' + if 'label' in item and item['label'] is not None: + return item['label'] + print(item) + if 'session' in item and len(item['session']) == 1: + return "-".join(item['session'].items()[0]) + return item['hash'][:4] + + def to_networkx(self): + ''' + Convert the DAG to a network. + + This is used for testing, experimentation, and demonstration. It + would never scale with real data. + ''' + G = networkx.DiGraph() + for item in self._walk(): + print(item) + G.add_node(item['hash'], label=self._make_label(item)) + if 'children' in item: + for child in item['children']: + G.add_edge(item['hash'], child) + return G + + def to_graphviz(self): + ''' + Convert the DAG to a graphviz graph. + + This is used for testing, experimentation, and demonstration. It + would never scale with real data. + ''' + G = pydot.Dot(graph_type='digraph') + for item in self._walk(): + node = pydot.Node(item['hash'], label=self._make_label(item)) + G.add_node(node) + for item in self._walk(): + if 'children' in item: + for child in item['children']: + edge = pydot.Edge(item['hash'], child) + G.add_edge(edge) + return G + + +class KafkaStorage(StreamStorage): + """ + A Merkle DAG implementation that uses Kafka as a backing store. + + Very little of this is built. + """ + def __init__(self): + super().__init__() + raise NotImplementedError + self.producer = Producer() + self.consumer = Consumer() + + def _append_to_stream(self, stream, item): + raise NotImplementedError + self.producer.produce(stream, json_dump(item)) + + def _rename_or_alias_stream(self, stream, alias): + ''' + Rename a stream. We can't do this directly, so we create a new stream under the name `alias` + and then delete the old stream. + ''' + raise NotImplementedError + for item in self._get_stream_data(stream): + self._append_to_stream(alias, item) + self._delete_stream(stream) + + def _get_stream_data(self, stream): + raise NotImplementedError + + def _delete_stream(self, sha_key): + ''' + Delete the Kafka topic for the stream. + ''' + self.producer.delete_topic(sha_key) + + def _most_recent_item(self, stream): + raise NotImplementedError + + def _walk(self): + raise NotImplementedError + + +class FSStorage(StreamStorage): + """ + A Merkle DAG implementation that uses a file system as a backing store. + """ + def __init__(self, path): + super().__init__() + self.path = path + + def _fn(self, stream): + ''' + Get the filename for a stream. + + This is prototype code. 
We should escape the stream name robustly to avoid + security issues and collisions. + ''' + safer_filename = "".join(c for c in stream if c.isalnum() or c in '-_') + return os.path.join(self.path, safer_filename) + + def _append_to_stream(self, stream, item): + ''' + Append an item to a stream. + ''' + with open(self._fn(stream), 'a') as f: + f.write(json_dump(item)) + f.write('\n') + + def _rename_or_alias_stream(self, stream, alias): + ''' + Rename a stream. + ''' + os.rename(self._fn(stream), self._fn(alias)) + + def _get_stream_data(self, stream): + ''' + Get the stream. + ''' + if not os.path.exists(self._fn(stream)): + return None + with open(self._fn(stream), 'r') as f: + return [json_load(line) for line in f.readlines()] + + def _delete_stream(self, sha_key): + ''' + Delete a stream. + ''' + os.remove(self._fn(sha_key)) + + def _most_recent_item(self, stream): + ''' + Get the most recent item in a stream. + ''' + data = self._get_stream_data(stream) + if data is None: + return None + if len(data) == 0: + return None + return data[-1] + + def _walk(self): + ''' + Walk the DAG. This is used for debugging. + ''' + for filename in os.listdir(self.path): + with open(os.path.join(self.path, filename), 'r') as f: + for line in f.readlines(): + yield json_load(line) + + +class InMemoryStorage(StreamStorage): + """ + A Merkle DAG implementation that uses in-memory storage. + """ + def __init__(self): + super().__init__() + self.store = {} + + def _append_to_stream(self, stream, item): + if stream not in self.store: + self.store[stream] = [] + self.store[stream].append(item) + + def _rename_or_alias_stream(self, stream, alias): + if alias == stream: + return + self.store[alias] = self.store[stream] + del self.store[stream] + + def _get_stream_data(self, stream): + return self.store[stream] + + def _delete_stream(self, stream): + del self.store[stream] + + def _most_recent_item(self, stream): + if stream not in self.store: + return None + if len(self.store[stream]) == 0: + return None + return self.store[stream][-1] + + def _walk(self): + for stream in self.store: + for item in self.store[stream]: + yield item + + +CATEGORIES = set( + [ + "teacher", + "student", + "school", + "classroom", + "course", + "assignment" + ] +) + + +STORES = { + "kafka": KafkaStorage, + "fs": FSStorage, + "inmemory": InMemoryStorage +} + + +def test_case(): + """ + A test case, mostly used to demo the Merkle DAG. It doesn't check for + correctness yet, but does show a simple visualization of the DAG. + """ + big_session = { + "teacher": ["Ms. Q", "Mr. R"], + "student": ["John"], + "school": ["Washington Elementary"], + "classroom": ["4A"], + "course": ["Math"] + } + small_session = { + "teacher": ["Mr. 
A"], + "student": ["John"] + } + session = small_session + + STORAGE = 'FS' + + if STORAGE == 'MEMORY': + storage = InMemoryStorage() + elif STORAGE == 'FS': + if not os.path.exists('/tmp/merkle_dag'): + os.mkdir('/tmp/merkle_dag') + storage = FSStorage('/tmp/merkle_dag') + else: + raise NotImplementedError(STORAGE) + + merkle = Merkle(storage, CATEGORIES) + merkle.start(session) + merkle.event_to_session({"type": "event", "event": "A", "name": "1st"}, session, label="A") + merkle.event_to_session({"type": "event", "event": {"B": "c"}, "name": "2nd"}, session, label="B") + merkle.event_to_session({"type": "event", "event": {"B": "c"}}, session, label="C") + merkle.close_session(session) + G = storage.to_graphviz() + import PIL.Image as Image + import io + Image.open(io.BytesIO(G.create_png())).show() + + +if __name__ == "__main__": + test_case() diff --git a/learning_observer/learning_observer/module.py b/learning_observer/learning_observer/module.py new file mode 100644 index 000000000..ec78e355e --- /dev/null +++ b/learning_observer/learning_observer/module.py @@ -0,0 +1,164 @@ +''' +Module definition file + +This may be an examplar for building new modules too. +''' +import os.path + +import dash_bootstrap_components as dbc + +import learning_observer.dash_integration + + +NAME = "Learning Observer Base" + +# Outgoing APIs +# +# Generically, these would usually serve JSON to dashboards written as JavaScript and +# HTML. These used to be called 'dashboards,' but we're now hosting those as static +# files. + +COURSE_AGGREGATORS = { + # "writing-observer": { + # "sources": [ # These are the reducers whose outputs we aggregate + # learning_observer.stream_analytics.writing_analysis.time_on_task, + # learning_observer.stream_analytics.writing_analysis.reconstruct + # # TODO: "roster" + # ], + # # Then, we pass the per-student data through the cleaner, if provided. + # "cleaner": learning_observer.writing_observer.aggregator.sanitize_and_shrink_per_student_data, + # # And we pass an array of the output of that through the aggregator + # "aggregator": learning_observer.writing_observer.aggregator.aggregate_course_summary_stats, + # "name": "This is the main Writing Observer dashboard." 
+ # } +} + +STUDENT_AGGREGATORS = { +} + +# Incoming event APIs +REDUCERS = [ +] + + +# Required client-side JavaScript downloads +THIRD_PARTY = { + "require.js": { # Our recommended library loader + "url": "https://requirejs.org/docs/release/2.3.6/comments/require.js", + "hash": "d1e7687c1b2990966131bc25a761f03d6de83115512c9ce85d72e4b9819fb" + "8733463fa0d93ca31e2e42ebee6e425d811e3420a788a0fc95f745aa349e3b01901" + }, + "text.js": { # Add-on for require to load text files + "url": "https://raw.githubusercontent.com/requirejs/text/" + "3f9d4c19b3a1a3c6f35650c5788cbea1db93197a/text.js", + "hash": "fb8974f1633f261f77220329c7070ff214241ebd33a1434f2738572608efc" + "8eb6699961734285e9500bbbd60990794883981fb113319503208822e6706bca0b8" + }, + "r.js": { # We should check if this is still used + "url": "https://requirejs.org/docs/release/2.3.6/r.js", + "hash": "52300a8371df306f45e981fd224b10cc586365d5637a19a24e710a2fa566f" + "88450b8a3920e7af47ba7197ffefa707a179bc82a407f05c08508248e6b5084f457" + }, + "bulma.min.css": { # Our default stylesheet + "url": "https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.0/css/" + "bulma.min.css", + "hash": "ec7342883fdb6fbd4db80d7b44938951c3903d2132fc3e4bf7363c6e6dc52" + "95a478c930856177ac6257c32e1d1e10a4132c6c51d194b3174dc670ab8d116b362" + }, + "bulma-tooltip-min.css": { # Add-on for above + "url": "https://cdn.jsdelivr.net/npm/@creativebulma/bulma-tooltip@1.2.0/" + "dist/bulma-tooltip.min.css", + "hash": "fc37b25fa75664a6aa91627a7b1298a09025c136085f99ba31b1861f073a0" + "696c4756cb156531ccf5c630154d66f3059b6b589617bd6bd711ef665079f879405" + }, + "fontawesome.js": { # Icons for the above + "url": "https://use.fontawesome.com/releases/v5.3.1/js/all.js", + "hash": "83e7b36f1545d5abe63bea9cd3505596998aea272dd05dee624b9a2c72f96" + "62618d4bff6e51fafa25d41cb59bd97f3ebd72fd94ebd09a52c17c4c23fdca3962b" + }, + "showdown.js": { # Default markup library + "url": "https://rawgit.com/showdownjs/showdown/1.9.1/dist/showdown.js", + "hash": "4fe14f17c2a1d0275d44e06d7e68d2b177779196c6d0c562d082eb5435eec" + "4e710a625be524767aef3d9a1f6a5b88f912ddd71821f4a9df12ff7dd66d6fbb3c9" + }, + "showdown.js.map": { # Part of above + "url": "https://rawgit.com/showdownjs/showdown/1.9.1/dist/showdown.js.map", + "hash": "74690aa3cea07fd075942ba9e98cf7297752994b93930acb3a1baa2d3042a" + "62b5523d3da83177f63e6c02fe2a09c8414af9e1774dad892a303e15a86dbeb29ba" + }, + "mustache.min.js": { # Default templating engine + "url": "http://cdnjs.cloudflare.com/ajax/libs/mustache.js/3.1.0/" + "mustache.min.js", + "hash": "e7c446dc9ac2da9396cf401774efd9bd063d25920343eaed7bee9ad878840" + "e846d48204d62755aede6f51ae6f169dcc9455f45c1b86ba1b42980ccf8f241af25" + }, + "d3.v5.min.js": { # Client-side data flow + "url": "https://d3js.org/d3.v5.min.js", + "hash": "466fe57816d719048885357cccc91a082d8e5d3796f227f88a988bf36a5c2" + "ceb7a4d25842f5f3c327a0151d682e648cd9623bfdcc7a18a70ac05cfd0ec434463" + }, + "p5.js.min": { # Mostly, for rapid prototyping of visualizations + "url": "https://github.com/processing/p5.js/releases/download/v1.4.0/p5.min.js", + "hash": "6e2559786ad5e22f01f112289ddd32fb7675703501306ff9f71c203146047" + "e20bb552850b8c89e913e43e1027f292a3f13aa63ace0bdf8af24a8654f9200346c" + } +} + + +# We're still figuring this out, but we'd like to support hosting static files +# from the git repo of the module. +# +# This allows us to have a Merkle-tree style record of which version is deployed +# in our log files. 
+STATIC_FILE_GIT_REPOS = { + # 'writing_observer': { + # # Where we can grab a copy of the repo, if not already on the system + # 'url': 'https://github.com/ETS-Next-Gen/writing_observer.git', + # # Where the static files in the repo lie + # 'prefix': 'learning_observer/learning_observer/static', + # # Branches we serve. This can either be a whitelist (e.g. which ones + # # are available) or a blacklist (e.g. which ones are blocked) + # 'whitelist': ['master'] + # } +} + + +# We're kinda refactoring the stuff above to below +# +# The stuff above will become APIs to dashboards. The stuff below +# will register the actual dashboards. +COURSE_DASHBOARDS = [ + # { + # 'name': "Writing Observer", + # 'url': "/static/repos/lo_core/writing_observer/master/wobserver.html", + # "icon": { + # "type": "fas", + # "icon": "fa-pen-nib" + # } + # } +] + +STUDENT_DASHBOARDS = { +} + +WSGI = [ + { + "APP": learning_observer.dash_integration.get_app, + "URL_PATTERNS": [ + "/{path_info:dash/test}", # <-- Test case (to be removed) + "/{path_info:_dash.*}", # <-- All the infrastructure dash wants + "/{path_info:.*/dash/.*}", # <-- All the other modules. We can be more specific later + "/{path_info:dash/assets/.*}" # <-- Again, we should be more specific later + ] + } +] + +DASH_PAGES = [ + { + "MODULE": learning_observer.dash_integration, + "LAYOUT": learning_observer.dash_integration.test_layout, + "TITLE": "Test Page for Dash.", + "DESCRIPTION": "We're just testing. Nothing to see here.", + "SUBPATH": "test" + } +] diff --git a/learning_observer/learning_observer/module_loader.py b/learning_observer/learning_observer/module_loader.py new file mode 100644 index 000000000..21baa5615 --- /dev/null +++ b/learning_observer/learning_observer/module_loader.py @@ -0,0 +1,503 @@ +''' +Import analytics modules + +This runs _after_ `paths` and `settings`, since `settings` calls +`learning_observer.paths.register_repo`, which we use here. + +Ideally, this would be more modular and run without a settings file +too. We'd like to use this from utility scripts. +''' + + +import collections +import copy +import os.path +import sys + +import pkg_resources + +import gitserve.gitaccess + +import learning_observer.paths +import learning_observer.settings + +from learning_observer.log_event import debug_log + +import learning_observer.stream_analytics.helpers as helpers + + +# This is set to true after we've scanned and loaded modules +LOADED = False + +COURSE_AGGREGATORS = collections.OrderedDict() +REDUCERS = [] +THIRD_PARTY = {} +STATIC_REPOS = {} +STUDENT_DASHBOARDS = [] +COURSE_DASHBOARDS = [] +EXTRA_VIEWS = [] + +# Additional calls, primarily for metadata +AJAX = {} + +WSGI = [] +DASH_PAGES = {} + + +def extra_views(): + ''' + We used to just have dashboards rendered as views as a hack. This + will use the same API, provide backwards-compatibility, but also + act as a place for things which aren't dashboards. Modules ought + to be able to define random views. + ''' + load_modules() + return EXTRA_VIEWS + + +def student_dashboards(): + ''' + URLs of per-student views + ''' + load_modules() + return STUDENT_DASHBOARDS + + +def course_dashboards(): + ''' + URLs of per-course views + ''' + load_modules() + return COURSE_DASHBOARDS + + +def course_aggregators(): + ''' + Return a dictionary of all modules the system can render. + TODO: Rename to teacher aggregators or similar. + ''' + load_modules() + return COURSE_AGGREGATORS + + +def reducers(): + ''' + Return a list of all event processors / reducers. 
Note that + we can have multiple reducers for the same event type. + ''' + load_modules() + return REDUCERS + + +def third_party(): + ''' + Return a list of modules to download from 3rd party repos. + + We should eventually: + + - Handle version conflicts more gracefully (e.g. by allowing + hashes of multiple compatible versions) + - Support serving static files from e.g. S3 or similar + services rather than our own server (e.g. by filling in + references in modules where needed, or through settings in + config.json) + - Serving from CDNs (for deploys where we don't mind leaking + user data; e.g. development) + + ... but not today. + + We don't want these modules committed to our repo due to size. + ''' + load_modules() + return THIRD_PARTY + + +def static_repos(): + ''' + We can serve static files for each module. These are served + straight from `git`. In the future, we'd like to cache this. + + There's a little bit of complexity and nuance in how we'd + like to manage branches. + + In deployment / operational settings: + + - We'll want to be careful about WHICH branch and commit we + serve. We don't want students, teachers, or search engines + navigating all versions willy nilly. + - This is primarily a research platform. In research settings, + we usually DO want to allow users to go to different versions. + This is helpful for research replicability ("what version did + Subject 42 see?"), for the social practice of research (e.g. + show a collaborator a prototype, while using IRB-approved + versions for coglabs), for experiments (e.g. show different + versions to different students), etc. + + For now, this is set up around the *research* use-case: Being able + to run coglabs, small pilots, and similar, used in controlled + settings, without confidential items in repos. + + Note that since this is all open-source, hosting static files from + a repo is *typically* *not* a security issue. It can be a usability + issue, though (e.g. if users find an outdated link via a search + engine). + + (Of course, YMMV. If you're hosting test items in a repo, then you + want to be very careful about security) + ''' + load_modules() + return STATIC_REPOS + + +def ajax(): + ''' + Return a dictionary of all AJAX handlers. + ''' + load_modules() + return AJAX + + +def wsgi(): + load_modules() + return WSGI + + +def dash_pages(): + load_modules() + return DASH_PAGES + + +def load_modules(): + ''' + Iterate through entry points to: + - Find all Learning Observer modules installed + - Load course_aggregators from each module + - Load reducers from each module + + This is called before we ask for something from modules, but it + only changes state on startup (for now -- we might revist later + if we want to be more dynamic). + ''' + # pylint: disable=W0603 + global LOADED + if LOADED: + return + + # Iterate through Learning Observer modules + for entrypoint in pkg_resources.iter_entry_points("lo_modules"): + load_module_from_entrypoint(entrypoint) + LOADED = True + + +def validate_module(module): + ''' + Check that a module has the required components. + + We should eventually do more validation here, once we have + figured out what we want to validate. + ''' + if not hasattr(module, "NAME"): + raise ValueError( + f"Module {module} does not have a NAME attribute " + "Please give your module a short, human-friendly name " + "Spaces, etc. 
are okay" + ) + + +DEFAULT_STUDENT_SCOPE = helpers.Scope([helpers.KeyField.STUDENT]) + + +def format_function(f): + ''' + Returns a nice, fully-qualified name for a function + ''' + return f"{f.__module__}.{f.__name__}" + + +def add_reducer(reducer, string_id=None): + ''' + We add a reducer. In actual operation, this should only happen once, on + module load. We'd like to be able to dynamic load and reload reducers in + interactive programming, so we offer the optnio of a `string_id` + ''' + if string_id is not None: + REDUCERS = [r for r in REDUCERS if r.get("string_id", None) != string_id] + REDUCERS.append(reducer) + return REDUCERS + + +def load_reducers(component_name, module): + ''' + Load reducers from a module. + + We clean up the reducer by removing any keys that we don't + and need, adding defaults for any missing keys. + ''' + # Load any state reducers / event processors + if hasattr(module, "REDUCERS"): + debug_log(f"Loading reducers from {component_name}") + for reducer in module.REDUCERS: + cleaned_reducer = { + "context": reducer['context'], + "function": reducer['function'], # Primary ID + "scope": reducer.get('scope', DEFAULT_STUDENT_SCOPE), + "module": module + } + + # Here's the deal: Our primary ID is the function itself, and our + # code should rely on that. It gives us type safety. However, it's + # convenient to be able to reference these things more easily when + # developing interactively. This gives a string ID. We might eliminate + # this later, since it's possible to recompute to the string + # representation of the function. But it's convenient for now. + if learning_observer.settings.RUN_MODE == learning_observer.settings.RUN_MODES.INTERACTIVE: + cleaned_reducer['string_id'] = format_function(reducer['function']) + + debug_log(f"Loading reducer: {cleaned_reducer}") + REDUCERS.append(cleaned_reducer) + else: + debug_log(f"Component {component_name} has no reducers") + + +def load_course_aggregators(component_name, module): + ''' + Load course aggregators from a module. + + We clean up the course aggregator by removing any keys that we + don't need, adding defaults for any missing keys. + ''' + if hasattr(module, "COURSE_AGGREGATORS"): + debug_log(f"Loading course aggregators from {component_name}") + for course_aggregator in module.COURSE_AGGREGATORS: + aggregator_id = "{module}.{submodule}".format( + module=component_name, + submodule=course_aggregator + ) + + cleaned_aggregator = { + "long_id": aggregator_id, + "short_id": course_aggregator, + "module": module + } + cleaned_aggregator.update(module.COURSE_AGGREGATORS[course_aggregator]) + + COURSE_AGGREGATORS[aggregator_id] = cleaned_aggregator + + debug_log(f"Loaded course aggregator: {cleaned_aggregator}") + else: + debug_log(f"Component {component_name} has no course aggregators") + + +def load_ajax(component_name, module): + ''' + Load AJAX handlers from a module. This is API is TBD. + ''' + if hasattr(module, "AJAX"): + debug_log(f"Loading AJAX handlers from {component_name}") + AJAX[component_name] = module.AJAX + else: + debug_log(f"Component {component_name} has no extra AJAX handlers") + + +def load_dashboards(component_name, module): + ''' + Load dashboards from a module. + + For now, these are just static URLs to the dashboards. These can + either be per-student or per-course. We might want to add more + types later, or somehow organize these better. + + These should have more metadata at some point. Organization of this + is TBD. 
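# Editorial example (hypothetical module code, not part of this diff): the shape of
# a REDUCERS entry that load_reducers() above expects. The context string and
# reducer name are made up; 'scope' is optional and falls back to the per-student
# default. A reducer returns (internal_state, external_state), as in offline.py's
# test_reducer.
import learning_observer.stream_analytics.helpers as helpers

async def count_events(event, internal_state):
    # Fold one event into per-student state: here, just a running count.
    internal_state = internal_state or {}
    internal_state['count'] = internal_state.get('count', 0) + 1
    return internal_state, internal_state

REDUCERS = [
    {
        'context': 'org.example.demo_module',
        'function': count_events,
        'scope': helpers.Scope([helpers.KeyField.STUDENT])
    }
]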
+    '''
+    dashboards = False
+    if hasattr(module, "COURSE_DASHBOARDS"):
+        debug_log(f"Loading course dashboards from {component_name}")
+        COURSE_DASHBOARDS.extend(module.COURSE_DASHBOARDS)
+        dashboards = True
+
+    if hasattr(module, "STUDENT_DASHBOARDS"):
+        debug_log(f"Loading student dashboards from {component_name}")
+        STUDENT_DASHBOARDS.extend(module.STUDENT_DASHBOARDS)
+        dashboards = True
+
+    if not dashboards:
+        debug_log(f"Component {component_name} has no dashboards")
+
+
+def load_extra_views(component_name, module):
+    '''
+    Load any extra (non-dashboard) views a module would like to register.
+    '''
+    extras = False
+    if hasattr(module, 'EXTRA_VIEWS'):
+        debug_log(f'Loading extra views from {component_name}')
+        EXTRA_VIEWS.extend([m | {'module': component_name} for m in module.EXTRA_VIEWS])
+        extras = True
+
+    if not extras:
+        debug_log(f'Component {component_name} has no extra views')
+
+
+def register_3rd_party(component_name, module):
+    '''
+    Register 3rd party components the module needs.
+
+    These will be downloaded and installed onto the server,
+    and then the module will be able to use them.
+
+    These are verified by SHA hashes.
+
+    There's a lot to think through in terms of absolute paths,
+    conflicts, etc.; perhaps another time.
+    '''
+    if hasattr(module, "THIRD_PARTY"):
+        debug_log(f"Loading third party components from {component_name}")
+        for library_filename in module.THIRD_PARTY:
+            # If another module already wants this library, confirm
+            # it's under the same hash
+            if library_filename in THIRD_PARTY:
+                if THIRD_PARTY[library_filename]['hash'] != module.THIRD_PARTY[library_filename]['hash']:
+                    raise RuntimeError(
+                        "Version Conflict in 3rd party libs\n"
+                        "Component {} has a different hash for {} "
+                        "than previous component.\n"
+                        "{} vs {}".format(
+                            component_name,
+                            library_filename,
+                            THIRD_PARTY[library_filename]['users'],
+                            module.THIRD_PARTY[library_filename]
+                        )
+                    )
+            else:
+                THIRD_PARTY[library_filename] = {
+                    'urls': [],
+                    'hash': module.THIRD_PARTY[library_filename].get('hash', None),
+                    'users': []
+                }
+            THIRD_PARTY[library_filename]['users'].append(module.NAME)
+            THIRD_PARTY[library_filename]['urls'].append(
+                module.THIRD_PARTY[library_filename]['url']
+            )
+
+
+def register_git_repos(component_name, module):
+    '''
+    Register git repositories the module would like to serve
+    static files from.
+
+    These can be downloaded and installed onto the server, but we
+    prompt the user to confirm before doing so, since we don't
+    want to accidentally conflict with devops tools.
+
+    We don't handle multiple components wanting the same repo
+    well. We should probably do something about that.
+    '''
+    if hasattr(module, "STATIC_FILE_GIT_REPOS"):
+        debug_log(f"Loading git repositories from {component_name}")
+        for repo in module.STATIC_FILE_GIT_REPOS:
+            debug_log(f"Validating and registering git repository: {repo}")
+            if repo in STATIC_REPOS:
+                raise NotImplementedError(
+                    f"Multiple modules want to clone {repo}\n"
+                    "This isn't bad, but isn't implemented yet.\n"
+                    "We want code to either make sure both versions\n"
+                    "are the same, or place them in different locations,\n"
+                    "or something. Please code that up and make a PR!"
+                )
+            STATIC_REPOS[repo] = copy.deepcopy(module.STATIC_FILE_GIT_REPOS[repo])
+            # TODO: This is a bit awkward.... The URL and key structure won't work well
+            # if we use the same repo twice.
+ STATIC_REPOS[repo]['module'] = component_name + if not os.path.exists(learning_observer.paths.repo(repo)): + print(f"Repo {repo} does not exist.") + print(f"It is requested by {component_name}") + print("Should I clone it from {url} to {location}?".format( + location=learning_observer.paths.repo(repo), + url=module.STATIC_FILE_GIT_REPOS[repo]['url'] + )) + yesno = input("Yes/No> ") + if yesno.lower().strip() not in ["y", "tak", "yes", "yup", "好", "نعم"]: + print("Fine. Get it yourself, and configure the location") + print("in the setting file under repos. Run me again once it's") + print("there.") + sys.exit(-1) + gitrepo = gitserve.gitaccess.GitRepo(learning_observer.paths.repo(repo)) + print(gitrepo.clone( + module.STATIC_FILE_GIT_REPOS[repo]['url'], + mirror=module.STATIC_FILE_GIT_REPOS[repo].get("mirror", True) + )) + # Paths are top-level for bare repos e.g. `/home/ubuntu/repo` and subdir for + # working repos e.g. `/home/ubuntu/repo.git` which we need to later manage. + if not os.path.exists(os.path.join(learning_observer.paths.repo(repo), ".git")): + STATIC_REPOS[repo]['bare'] = True + else: + debug_log(f"Component {component_name} has no git repositories") + debug_log(STATIC_REPOS) + + +def register_wsgi_modules(component_name, module): + ''' + We *don't* support pluggable `wsgi` modules. If you'd like to register + an unsupported one, this will do it, though! + + `wsgi` is a way of plugging in additional servers. We use it for + `dash` support, and it made sense to do this generically. It's + nice for _prototyping_ too. However, it's far too general for + modules to just plug in this way, and far too easy to screw up. At + some point, we might: + + * Yank this out + * Restrict it (e.g. require a URL scheme) + * Change the API + + It definitely should not stay like this forever. + + This should be called *before* we register `dash` modules, though, + since this is where we load `dash`. + ''' + if hasattr(module, "WSGI"): + for item in module.WSGI: + item['COMPONENT_NAME'] = component_name + # item['MODULE'] is for debugging; we should pull out + # anything we use directly + item['MODULE'] = module + WSGI.extend(module.WSGI) + + +def register_dash_pages(component_name, module): + ''' + Load the set of `dash` pages. We might want to change to a flat + list later. We might also want to include URLs once available. + ''' + if hasattr(module, "DASH_PAGES"): + for page in module.DASH_PAGES: + page['_BASE_PATH'] = os.path.dirname(module.__file__) + DASH_PAGES[component_name] = module.DASH_PAGES + + +def load_module_from_entrypoint(entrypoint): + ''' + Load a module from an entrypoint. 
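# Editorial sketch (hypothetical packaging code, not part of this diff): how a
# module might register itself under the "lo_modules" entry point group that
# load_modules() iterates over. The package name is made up; note that the target
# must live in a file called module.py, since the check below requires the module
# name to end in ".module".
from setuptools import setup

setup(
    name='my_lo_module',
    version='0.1',
    packages=['my_lo_module'],
    entry_points={
        'lo_modules': [
            'my_lo_module = my_lo_module.module'
        ]
    }
)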
+ ''' + debug_log( + f"Loading entrypoint: {entrypoint.name} / {entrypoint.dist.version} / " + f"{entrypoint.dist.location} / {entrypoint.dist.project_name}") + module = entrypoint.load() + module_name = module.__name__ + if not module_name.endswith(".module"): + raise AttributeError("Module should be defined in a file called module.py") + component_name = module_name[:-len(".module")] + validate_module(module) + debug_log(f"Corresponding to module: {module.__name__} ({module.NAME})") + load_reducers(component_name, module) + load_course_aggregators(component_name, module) + load_ajax(component_name, module) + load_dashboards(component_name, module) + load_extra_views(component_name, module) + register_3rd_party(component_name, module) + register_git_repos(component_name, module) + + register_wsgi_modules(component_name, module) + register_dash_pages(component_name, module) + + return module diff --git a/learning_observer/learning_observer/offline.py b/learning_observer/learning_observer/offline.py new file mode 100644 index 000000000..2c254f29a --- /dev/null +++ b/learning_observer/learning_observer/offline.py @@ -0,0 +1,337 @@ +''' +Learning Observer Library + +Helpers to support the use of Learning Observer in scripts, in +other applications, and in Jupyter notebooks. +''' +import argparse +import asyncio +from cgi import print_arguments +import json +import sys +import os + +import names + +import learning_observer.settings +import learning_observer.stream_analytics +import learning_observer.module_loader +import learning_observer.incoming_student_event +import learning_observer.log_event +import learning_observer.kvs +import learning_observer.rosters +import learning_observer.dashboard + +from learning_observer.stream_analytics.helpers import kvs_pipeline, KeyField, EventField, Scope + + +# For interactive data analysis +INTERACTIVE_SETTINGS = { + 'kvs': {'type': 'stub'}, + 'config': { + 'run_mode': 'interactive' + }, + "logging": { + "debug_log_level": "NONE", + "debug_log_destination": ["console"] + }, + "roster_data": { + "source": "all" + } +} + + +def init(settings=INTERACTIVE_SETTINGS): + ''' + Initialize the Learning Observer library. + + Returns: + None + + This function will load the settings, and initialize the KVS to + run from memory. + ''' + # We override the debug log level since we don't want to spew logs if we do + # anything before we've loaded the settings. This might not be necessary, + # depending on the (still-changing) startup order + learning_observer.log_event.DEBUG_LOG_LEVEL = learning_observer.log_event.LogLevel.NONE + learning_observer.settings.load_settings(settings) + learning_observer.kvs.kvs_startup_check() # Set up the KVS + # Force load of the reducers. This is not necessary right now, but it was + # before, and might be later again. We should remove this call once the + # system has stabilized a little bit. + reducers = learning_observer.module_loader.reducers() + learning_observer.stream_analytics.init() # Load existing reducers + learning_observer.rosters.init() + + +async def process_file( + file_path=None, + events_list=None, + source=None, + userid=None, + pipeline=None +): + ''' + Process a single log file. + + Args: + file_path (str): The path to the log file to process. + source (str): The source of events (e.g. org.mitros.dynamic_assessment) + If not specified, the source will be inferred from the + events. + userid (str): The userid of the user that generated the events. If not + specified, the userid will be generated with `names`. 
+ + Returns: + Number of events processed, source, and userid + + If `source` is not specified, the source will be inferred from the + log file. + + If `userid` is not specified, a username will be generated with + `names.get_first_name()`. We do this because we don't want to + accidentally use a real name. This minimizes the risk of exposing + PII. It'd be easy to infer the real name from the log file, but + that should be done with care, and a parameter would be needed to + enable this. + ''' + if events_list is not None and file_path is not None: + raise AttributeError("Please specify either an events list or a file path, not both") + + # Opener returns an iterator of events. It handles diverse sources: + # lists, log files, and compressed log files + def opener(): + return events_list + + if file_path is not None: + if file_path.endswith('.log'): + def file_opener(): + return open(file_path) + elif file_path.endswith('.log.gz'): + def file_opener(): + return gzip.open(file_path) + else: + raise ValueError("Unknown file type: " + file_path) + + def opener(): + return (json.loads(line) for line in file_opener().readlines()) + + if source is None: + for event in opener: + source = event['client']['source'] + break + + # In most cases, for development, a dummy name is good. + if userid is None: + userid = names.get_first_name() + + metadata = { + "source": source, + "auth": { + "user_id": userid, + "safe_user_id": userid + } + } + + if pipeline is None: + pipeline = await learning_observer.incoming_student_event.student_event_pipeline(metadata) + else: + pipeline = await pipeline(metadata) + print(pipeline) + n = 0 # Number of events processed + for event in opener(): + try: + await pipeline(event) + n += 1 + except Exception: + print(event) + raise + + return n, source, userid + + +async def process_files(files): + ''' + Process a list of log files. + + Args: + files (list): A list of log files to process. + + Returns: + Total number of events processed + + This function will process each file in the list, and print the + results. + ''' + total = 0 + for file in files: + n, source, userid = await process_file(file) + print("{} events processed from {} with user ID {}".format(n, source, userid)) + total += n + + return total + + +async def process_dir(path=os.getcwd()): + ''' + Process all log files in a directory. + + Args: + path (str): The path to the directory to process. + + Returns: + Number of files processed, total number of events processed + + This function will process all log files in the directory, and + print the results. + ''' + files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.log')] + events_processed = await process_files(files) + return len(files), events_processed + + +async def reset(): + ''' + Reset the Learning Observer library, clearing all processed events + from the KVS. + + In the future, this might also clear the modules, etc. + ''' + kvs = learning_observer.kvs.KVS() + await kvs.clear() + + +async def aggregate(module_id): + ''' + Aggregate the results of a module. + + This has a lot of overlap with dashboard.py, and should be refactored. 
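# Editorial usage sketch (not part of this diff): driving the offline helpers above
# from a script or notebook. The reducer, event payloads, and source string are all
# made up; the kvs_pipeline decorator arguments mirror test_reducer further down in
# this file.
import asyncio
import learning_observer.offline as offline
from learning_observer.stream_analytics.helpers import kvs_pipeline, KeyField, Scope

@kvs_pipeline(
    scope=Scope([KeyField.STUDENT]),
    module_override="demo",
    qualname_override="demo_count"
)
async def demo_reducer(event, state):
    # Count the events seen for each student
    state = state or {}
    state['count'] = state.get('count', 0) + 1
    return state, state

async def demo():
    offline.init()  # interactive settings, in-memory KVS
    n, source, userid = await offline.process_file(
        events_list=[{"kind": "demo"}] * 3,
        source="org.example.demo",
        pipeline=demo_reducer,
        userid="Alice"
    )
    print(n, source, userid)

asyncio.run(demo())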
+ ''' + course_id = 12345 + course_aggregator_module, default_data = learning_observer.dashboard.find_course_aggregator(module_id) + + if course_aggregator_module is None: + print("Bad module: ", module_id) + available = learning_observer.module_loader.course_aggregators() + print("Available modules: ", [available[key]['short_id'] for key in available]) + raise ValueError(text="Invalid module: {}".format(module_id)) + + roster = await learning_observer.rosters.courseroster("request", course_id) + student_state_fetcher = learning_observer.dashboard.fetch_student_state( + course_id, + module_id, + course_aggregator_module, + roster, + default_data + ) + aggregator = course_aggregator_module.get('aggregator', lambda x: {}) + sd = await student_state_fetcher() + data = { + "student_data": sd # Per-student list + } + data.update(aggregator(sd)) + return data + + +async def default_aggregation(function): + """ + Return the aggregated data from this reducer function. This doesn't + require any aggregators to be loaded, which is nice. + + This is only for offline operation (e.g. with the `all` roster) + """ + roster = await learning_observer.rosters.courseroster(None, 12345) + student_state_fetcher = learning_observer.dashboard.fetch_student_state( + 12345, + "test_case_unused", + {"sources": [function]}, + roster, + {} + ) + sd = await student_state_fetcher() + return {"student_data": sd} + + +@kvs_pipeline( + scope=Scope([KeyField.STUDENT]), + module_override="testcase", + qualname_override="event_count" +) +async def test_reducer(event, state): + if state is None: + state = {} + state['event_count'] = state.get('event_count', 0) + 1 + return state, state + + +async def test_case(): + init() + print("Reducers:") + print(learning_observer.module_loader.reducers()) + kvs = learning_observer.kvs.KVS() + print("Keys:") + print(await kvs.keys()) + import tempfile + import os + (handle, filename) = tempfile.mkstemp(text=True, suffix=".log") + with os.fdopen(handle, "w") as fp: + for i in range(5): + fp.write("{}\n") + await process_file( + file_path=filename, + source="org.ets.testcase", + pipeline=test_reducer, + userid="Bob" + ) + os.unlink(filename) + await process_file( + events_list=[{}] * 3, + source="org.ets.testcase", + pipeline=test_reducer, + userid="Sue" + ) + print("Keys:") + keys = await kvs.keys() + print(keys) + for key in keys: + print("{key} {value}".format( + key=key, + value=await kvs[key] + )) + print(await default_aggregation(test_reducer)) + +# A lot of Learning Observer calls expect app object, request objects, etc. +# These are dummy stub versions. + + +class StubApp(): + def __init__(self): + self.loop = asyncio.get_event_loop() + + def add_routes(self, *args, **kwargs): + pass + + +app = StubApp() + + +class StubRequest(): + + def __init__(self): + self.app = app + + def __contains__(self, item): + if item == 'auth_headers': + return True + return False + + def __getitem__(self, item): + return {} + + +request = StubRequest() + + +if __name__ == '__main__': + asyncio.run(test_case()) diff --git a/learning_observer/learning_observer/paths.py b/learning_observer/learning_observer/paths.py new file mode 100644 index 000000000..15ef3634e --- /dev/null +++ b/learning_observer/learning_observer/paths.py @@ -0,0 +1,181 @@ +''' +Path hierarchy +============== + +Helper utility to help manage paths. We'd like to abstract out path +configurations so that the tool can work in: + +1. Deployed settings +2. pip installs +3. Development mode + +... etc. 
+ +This abstracts out finding files and directories. Eventually, we'd +like to be able to search for packages looking at: + +- Relative directories (e.g. when developing) +- Static directories (e.g. /etc/, /var/log/, etc.) +- Config file +- pkg_resources +- Command-line parameters + +It makes sense to put this logic one place. + +Should this be merges with settings.py? Let's see how complex this gets. +''' + +import os.path +import sys + + +BASE_PATH = os.path.abspath(os.path.dirname(__file__)) + + +# If we e.g. `import settings` and `import learning_observer.settings`, we +# will load startup code twice, and end up with double the global variables. +# This is a test to avoid that bug. +if not __name__.startswith("learning_observer."): + raise ImportErrror("Please use fully-qualified imports") + sys.exit(-1) + + +def base_path(filename=None): + ''' + Should NOT be used, except by filesystem_state. Use one of the helpers below. + ''' + if filename is None: + return BASE_PATH + return os.path.join(BASE_PATH, filename) + + +def config_file(): + ''' + Main configuration file + ''' + pathname = os.path.join(os.path.dirname(base_path()), 'creds.yaml') + return pathname + + +DATA_PATH_OVERRIDE = None + + +def override_data_path(new_path): + ''' + We'd like to be able to serve data files from alternative + locations, especially for testing + ''' + global DATA_PATH_OVERRIDE + if not new_path.startswith("/"): + DATA_PATH_OVERRIDE = base_path(new_path) + else: + DATA_PATH_OVERRIDE = new_path + + +def data(filename=None): + ''' + File from the static data directory. No parameters: data directory. + ''' + pathname = base_path('static_data') + if DATA_PATH_OVERRIDE is not None: + pathname = DATA_PATH_OVERRIDE + if filename is not None: + pathname = os.path.join(pathname, filename) + return pathname + + +GIT_REPO_ARCHIVE = {} + + +def repo(reponame=None): + ''' + We keep downloaded `git` repos from modules in the static data + directory. This returns the base path of a `git` repo. + + `git` repos may also be stored other places. + ''' + if reponame in GIT_REPO_ARCHIVE: + return GIT_REPO_ARCHIVE[reponame]['PATH'] + pathname = data("repos") + if reponame is not None: + pathname = os.path.join(pathname, reponame) + return pathname + + +def repo_debug_working_hack(reponame): + ''' + For debugging, we want to allow serving from the git working dir. + + Just not like this.... We should do the merge in settings.py or + module_loader, or somewhere else. + + Right now, we allow us to override whether a repo can be served + from the working dir in the settings file. This is set by + settings.py, which needs to be loaded after paths.py. + + `True` and `False` are overrides. `None` is the default if not + set. + ''' + if reponame in GIT_REPO_ARCHIVE and 'DEBUG_WORKING' in GIT_REPO_ARCHIVE[reponame]: + return GIT_REPO_ARCHIVE[reponame]['DEBUG_WORKING'] + return None + + +def register_repo(reponame, path, debug_working=None): + ''' + Let the system know the location of a repo on the local drive + + `debug_working` is a HACK. The setting is fine, but this does + not belong in paths.py + ''' + GIT_REPO_ARCHIVE[reponame] = { + "PATH": path, + "DEBUG_WORKING": debug_working + } + + +def logs(filename=None): + ''' + Log file. No parameters: log directory. + ''' + pathname = base_path('logs') + if filename is not None: + pathname = os.path.join(pathname, filename) + return pathname + + +def static(filename=None): + ''' + This is where we store and serve our own static files + from. 
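# Editorial usage sketch (not part of this diff): how the path helpers above
# compose. The override location is a made-up example directory.
import learning_observer.paths as paths

paths.override_data_path('/tmp/lo_test_data')  # serve data files from a test tree
print(paths.data('courses.json'))              # /tmp/lo_test_data/courses.json
print(paths.repo('writing_observer'))          # repos default to <data dir>/repos/<name>
print(paths.logs('startup'))                   # a log subdirectory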
Ideally, our web server should serve these for us, but for + development and small-scale deploys, it's convenient to be able to + do it ourselves too. + ''' + pathname = base_path('static') + if filename is not None: + pathname = os.path.join(pathname, filename) + return pathname + + +def third_party(filename=None): + ''' + This is where we download 3rd party Javascript, CSS, and similar + files to (e.g., D3, Bulma, etc.) + ''' + pathname = static('3rd_party') + if filename is not None: + pathname = os.path.join(pathname, filename) + return pathname + + +def dash_assets(filename=None): + ''' + We are standardizing on `dash` for serving dashboards. + + Perhaps, though, extensions should have data directories, and + this should be made into one for dash? + ''' + pathname = data('dash_assets') + if filename is not None: + pathname = os.path.join(pathname, filename) + return pathname diff --git a/learning_observer/learning_observer/prestartup.py b/learning_observer/learning_observer/prestartup.py new file mode 100644 index 000000000..655a35f67 --- /dev/null +++ b/learning_observer/learning_observer/prestartup.py @@ -0,0 +1,286 @@ +''' +This is at the edge of dev-ops and operations. We would like to: +- Confirm that the system is ready to run the Learning Observer. +- Create directories for log files, etc. +- Validate the teacher list file. +- Validate the configuration file exists. +- Download any missing 3rd party files. +- Confirm their integrity. +- Create any directories that don't exist. +''' + +from distutils.log import debug +import hashlib +import os +import os.path +import shutil +import sys +import uuid + +import learning_observer.paths as paths +import learning_observer.settings as settings + + +STARTUP_CHECKS = [] +INIT_FUNCTIONS = [] +STARTUP_RAN = False + + +class StartupCheck(Exception): + ''' + Exception to be raised when a startup check fails. + ''' + pass + + +def register_startup_check(check): + ''' + Allow modules to register additional checks beyond those defined here. This + function takes a function that takes no arguments and returns nothing which + should run after settings are configured, but before the server starts. + ''' + if STARTUP_RAN: + raise StartupCheck( + "Cannot register additional checks after startup checks have been run." + ) + STARTUP_CHECKS.append(check) + return check + + +def register_init_function(init): + ''' + Allow modules to initialize modules after settings are loaded and startup checks have + run. This function takes a function that takes no arguments and returns nothing which + should run before the server starts. + ''' + if STARTUP_RAN: + raise StartupCheck( + "Cannot register additional checks after startup checks have been run." + ) + INIT_FUNCTIONS.append(init) + return init + + +# These are directories we'd like created on startup. At the moment, +# they're for different types of log files. +DIRECTORIES = { + 'logs': {'path': paths.logs()}, + 'startup logs': {'path': paths.logs('startup')}, + 'AJAX logs': {'path': paths.logs('ajax')}, + '3rd party': {'path': paths.third_party()}, + 'dash assets': {'path': paths.dash_assets()} +} + + +@register_startup_check +def make_blank_dirs(): + ''' + Create any directories that don't exist for e.g. log files and + similar. 
+ ''' + for d in DIRECTORIES: + dirpath = DIRECTORIES[d]['path'] + if not os.path.exists(dirpath): + os.mkdir(dirpath) + print("Made {dirname} directory in {dirpath}".format( + dirname=d, + dirpath=dirpath + )) + + +@register_startup_check +def validate_teacher_list(): + ''' + Validate the teacher list file. This is a YAML file that contains + a list of teachers authorized to use the Learning Observer. + ''' + if not os.path.exists(paths.data("teachers.yaml")): + shutil.copyfile( + paths.data("teachers.yaml.template"), + paths.data("teachers.yaml") + ) + raise StartupCheck( + "Created a blank teachers file: static_data/teachers.yaml\n" + "Populate it with teacher accounts." + ) + + +@register_startup_check +def validate_config_file(): + ''' + Validate the configuration file exists. If not, explain how to + create a configuration file based on the example file. + ''' + if not os.path.exists(paths.config_file()): + raise StartupCheck( + "No configuration file found.\n" + "Copy creds.yaml.sample into the top-level directory:\n" + "cp creds.yaml.sample ../creds.yaml\n" + "Fill in the missing fields." + ) + + +@register_startup_check +def download_3rd_party_static(): + ''' + Download any missing third-party files, and confirm their integrity. + We download only if the file doesn't exist, but confirm integrity + in both cases. + ''' + # We do this import inside to prevent circular imports + import learning_observer.module_loader as module_loader + libs = module_loader.third_party() + + for name in libs: + url = libs[name]['urls'][0] + sha = libs[name]['hash'] + + filename = paths.third_party(name) + + # For subdirectories, make them + os.makedirs(os.path.dirname(filename), exist_ok=True) + if not os.path.exists(filename): + os.system("wget {url} -O {filename} 2> /dev/null".format( + url=url, + filename=filename + )) + print("Downloaded {name}".format(name=name)) + shahash = hashlib.sha3_512(open(filename, "rb").read()).hexdigest() + if sha is None: + error = "No SHA hash set in module for {name}. It should probably be:\n\t{hash}".format( + name=filename, + hash=shahash + ) + raise StartupCheck(error) + elif shahash == sha: + pass + # print("File integrity of {name} confirmed!".format(name=filename)) + else: + # Do we want to os.unlink(filename) or just terminate? + # Probably just terminate, so we can debug. + error = "File integrity of {name} failed!\n" \ + "Expected: {sha}\n" \ + "Got: {shahash}\n" \ + "We download 3rd party libraries from the Internet. This error means that ones of\n" \ + "these files changed. This may indicate a man-in-the-middle attack, that a CDN has\n" \ + "been compromised, or more prosaically, that one of the files had something like\n" \ + "a security fix backported. In either way, VERIFY what happened before moving on.\n\n" \ + "If unsure, please consult with a security expert.".format( + name=filename, + sha=sha, + shahash=shahash + ) + raise StartupCheck(error) + + +def preimport(): + ''' + This will import all of the files which use the register_init_function + or register_startup_check decorators. 
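# Editorial sketch (hypothetical module code, not part of this diff): the kind of
# file preimport() scans for. Any module file using these decorators is imported at
# startup, so its checks and init functions are registered before the server
# starts. The directory name is made up.
import os

import learning_observer.paths
import learning_observer.prestartup

@learning_observer.prestartup.register_startup_check
def check_demo_data_dir():
    '''Fail fast if our (hypothetical) data directory is missing.'''
    if not os.path.exists(learning_observer.paths.data('demo_module')):
        raise learning_observer.prestartup.StartupCheck(
            "demo_module needs static_data/demo_module/ to exist"
        )

@learning_observer.prestartup.register_init_function
def init_demo_module():
    '''Runs after all startup checks have passed.'''
    pass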
+ ''' + path = os.path.dirname(os.path.realpath(__file__)) + # Walk the directory tree + for root, dirs, files in os.walk(path): + # For each file, if it's a .py file, import it + for f in files: + # Only handle Python files + if not f.endswith(".py"): + continue + if f.startswith("__"): + continue + if f.startswith("."): + continue + if "#" in f: + continue + + # Skip directories which aren't part of the system + SKIP = ["static_data", "prototypes"] + if any(s in root for s in SKIP): + continue + + # Skip files which don't use the decorator + DECORATORS = ["register_init_function", "register_startup_check"] + with open(os.path.join(root, f)) as fp: + code = fp.read() + if not any(d in code for d in DECORATORS): + continue + + # Strip the .py extension + f = f[:-3] + # Import the file + relpath = os.path.relpath(root, path).replace(os.sep, ".") + module_name = ".".join(["learning_observer", relpath, f]) + while ".." in module_name: + module_name = module_name.replace("..", ".") + + try: + print(f"Importing {module_name}") + __import__(module_name) + except ImportError as e: + print("Error importing {f}".format(f=f)) + print(e) + + +def startup_checks_and_init(): + ''' + Run a series of checks to ensure that the system is ready to run + the Learning Observer and create any directories that don't exist. + + We should support asynchronous functions here, but that's a to do. Probably, + we'd introspect to see whether return values are promises, or have a + register_sync and a register_async. + + This function should be called at the beginning of the server. + + In the future, we'd like to have something where we can register with a + priority. The split between checks and intialization felt right, but + refactoring code, it's wrong. We just have things that need to run at + startup, and dependencies. + ''' + preimport() + exceptions = [] + for check in STARTUP_CHECKS: + try: + check() + except StartupCheck as e: + exceptions.append(e) + if exceptions: + print("Could not start the Learning Observer") + for e in exceptions: + print("-------------------") + print(e) + if len(e.args) > 0: + print(e.args[0]) + sys.exit(1) + + for init in INIT_FUNCTIONS: + init() + STARTUP_RAN = True + + +@register_startup_check +def check_aio_session_settings(): + if 'aio' not in settings.settings or \ + 'session_secret' not in settings.settings['aio'] or \ + isinstance(settings.settings['aio']['session_secret'], dict) or \ + 'session_max_age' not in settings.settings['aio']: + raise StartupCheck( + "Settings file needs an `aio` section with a `session_secret`\n" + "subsection containing a secret string. This is used for\n" + "security, and should be set once for each deploy of the platform\n" + "(e.g. if you're running 10 servers, they should all have the\n" + "same secret)\n\n" + "Please set an AIO session secret in creds.yaml\n\n" + "Please pick a good session secret. You only need to set it once, and\n" + "the security of the platform relies on a strong, unique password there\n\n" + "This sessions also needs a session_max_age, which sets the number of seconds\n" + "of idle time after which a user needs to log back in. 4320 should set\n" + "this to 12 hours.\n\n" + "This should be a long string of random characters. 
If you can't think\n" + "of one, here's one:\n\n" + "aio:\n" + " session_secret: {secret}\n" + " session_max_age: 4320".format( + secret=str(uuid.uuid5(uuid.uuid1(), str(uuid.uuid4()))) + ) + ) diff --git a/learning_observer/learning_observer/pubsub/__init__.py b/learning_observer/learning_observer/pubsub/__init__.py new file mode 100644 index 000000000..59bad7bb7 --- /dev/null +++ b/learning_observer/learning_observer/pubsub/__init__.py @@ -0,0 +1,102 @@ +''' +We have several models for pub-sub: + +1) We can use xmpp, which can run over prosody or eJabberd. These are +wickedly scaleable. We're not necessarily finished (as of the time of +this writing), which is to say they kind of work, but we sometimes +lose messages, and we can't direct them the right places. + +2) We have a stubbed-in version. This only supports one user. It's +helpful for development and demos. + +3) We're going to play with redis, which seems easier (but less scalable) +than xmpp, but is probably right approach for pilots. + +One project which came up which might be relevant: +https://github.com/encode/broadcaster +''' + +import sys + +import learning_observer.settings as settings +from learning_observer.log_event import debug_log + +try: + PUBSUB = settings.settings['pubsub']['type'] +except KeyError: + print("Pub-sub configuration missing from configuration file.") + sys.exit(-1) + +if PUBSUB == 'xmpp': + import learning_observer.pubsub.receivexmpp + import learning_observer.pubsub.sendxmpp + + async def pubsub_send(channel=None): + ''' + Connect to an XMPP server, and return an object able to send + events. + ''' + sender = learning_observer.pubsub.sendxmpp.SendXMPP( + settings.settings['xmpp']['source']['jid'], + settings.settings['xmpp']['source']['password'], + debug_log, + mto='sink@localhost' + ) + sender.connect() + return sender + + async def pubsub_receive(channel=None): + ''' + Connect to an XMPP server, and return an object able to receive + events. + ''' + receiver = learning_observer.pubsub.receivexmpp.ReceiveXMPP( + settings.settings['xmpp']['sink']['jid'], + settings.settings['xmpp']['sink']['password'], + debug_log + ) + receiver.connect() + return receiver +elif PUBSUB == 'stub': + import learning_observer.pubsub.pubstub + + async def pubsub_send(channel=None): + ''' + Return an object capable of placing objects in a simple in-memory + queue. + ''' + sender = learning_observer.pubsub.pubstub.SendStub() + return sender + + async def pubsub_receive(channel=None): + ''' + Return an object capable of awaiting to remove objects from a + simple in-memory queue. 
+ ''' + receiver = learning_observer.pubsub.pubstub.ReceiveStub() + return receiver +elif PUBSUB == 'redis': + import learning_observer.pubsub.redis_pubsub + + async def pubsub_send(channel=None): + ''' + Connect to redis, and return an object capable of sending messages + out over a redis queue / pubsub + ''' + sender = learning_observer.pubsub.redis_pubsub.RedisSend() + await sender.connect() + return sender + + async def pubsub_receive(channel=None): + ''' + Connect to redis, and return an object capable of receiving messages + out over a redis queue / pubsub + ''' + receiver = learning_observer.pubsub.redis_pubsub.RedisReceive() + await receiver.connect() + return receiver +else: + print("Pubsub incorrectly configured") + print("We support stub, redis, and xmpp") + print("It's set to:") + print(PUBSUB) diff --git a/learning_observer/learning_observer/pubsub/pubstub.py b/learning_observer/learning_observer/pubsub/pubstub.py new file mode 100644 index 000000000..e400d95d3 --- /dev/null +++ b/learning_observer/learning_observer/pubsub/pubstub.py @@ -0,0 +1,65 @@ +''' +This is a stubbed-in version of the XMPP send/receive code. It is +helpful for development and debugging. +''' + +import collections +import asyncio + +# We should eventually have a list of queues: One for each +# subscriber. For now, we only support one subscriber. +queue = collections.defaultdict(asyncio.Queue) + + +class SendStub(): + ''' + Minimal class for sending events over a channel. Perhaps + this should be a closure? + ''' + def __init__(self, channel='dummy'): + ''' + Create dumb in-memory queue, outgoing channel + ''' + self.channel = channel + + async def send_event(self, mbody): + ''' + Place an object in the queue + ''' + queue[self.channel].put_nowait(mbody) + return True + + +class ReceiveStub(): + ''' + Minimal class for receiving events over a channel. Perhaps + this should be a closure? + ''' + def __init__(self, channel='dummy'): + ''' + Create dumb in-memory queue, incoming channel + ''' + self.channel = channel + + async def receive(self): + ''' + Wait for an object from the queue + ''' + return await queue[self.channel].get() + + +if __name__ == '__main__': + async def main(): + ''' + Helper function so we can run asynchronously + ''' + sender = SendStub() + receiver = ReceiveStub() + await sender.send_event("hi") + await sender.send_event("bye") + response = await receiver.receive() + print(response) + response = await receiver.receive() + print(response) + + asyncio.run(main()) diff --git a/learning_observer/learning_observer/pubsub/receivexmpp.py b/learning_observer/learning_observer/pubsub/receivexmpp.py new file mode 100644 index 000000000..27f136194 --- /dev/null +++ b/learning_observer/learning_observer/pubsub/receivexmpp.py @@ -0,0 +1,113 @@ +''' +This is an xmpp receiver. It was tested with prosidy, although +eJabberd is a better deployment server (prosidy requires less +devops; eJabberd is known to work at extreme scale). + +It may be buggy. It needs testing and test cases. + +It might make sense to combine the sender and receiver into one +file. + +We should also figure out abstractions to make this look like the +other pubsubs. xmpp has the upside that it can scale across +servers in a managed fashion, and the downside that it's a little +hard to hide that complexity. +''' + +import asyncio + +from lxml import etree + +from slixmpp import ClientXMPP + + +class Message(str): + ''' + This is a string-like object width additional attributes for: + * `message_type` (e.g. 
'chat') + * `message_to` -- the destination + * `message_from` -- the source + * `message_id` -- conveniently added by prosody (and hopefully + other servers) + + This lets us operate relatively seamlessly with non-XMPP code and + preserve (some) abstraction. + ''' + def __new__(cls, msg): + xml_tree = etree.fromstring(msg) + new_object = super().__new__( + cls, + etree.tostring( + xml_tree, encoding='utf8', method='text' + ).decode('utf8') + ) + + # We do this explicitly to sanitize inputs + # (Although this is coming from a (relatively) secure source) + new_object.message_type = xml_tree.attrib['type'] + new_object.message_to = xml_tree.attrib['to'] + new_object.message_id = xml_tree.attrib['id'] + new_object.message_from = xml_tree.attrib['from'] + + return s + + +class ReceiveXMPP(ClientXMPP): + ''' + An asynchronous XMPP receiver. slixmpp is asynchronous, but not + with pythonic constructs. This translates from slixmpp-style + callbacks to more Pythonic async/await/futures. + ''' + def __init__(self, jid, password, debug_log=lambda x: None): + ''' + Log into XMPP server on localhost with username `jid`, and + password `password.` If 'debug_log' is passed, pass log + messages to that callback. + ''' + self.debug_log = debug_log + ClientXMPP.__init__(self, jid, password) + self.add_event_handler("session_start", self.session_start) + self.add_event_handler("message", self.message) + self.msg_future = self.new_future() + + def new_future(self): + ''' + We return a future for a message, and populate it when the + message comes in. + ''' + return asyncio.get_event_loop().create_future() + + def session_start(self, event): + ''' + Some XMPP servers require us to `send_presence` and + `get_roster` before they'll talk to us. + ''' + self.send_presence() + self.get_roster() + self.debug_log("XMPP receiver started") + + async def receive(self): + ''' + Returns a future for the next message. + + We make a future, which will be filled in by `message` + ''' + self.debug_log("Waiting for XMPP message") + await self.msg_future + rv = self.msg_future.result() + self.msg_future = self.new_future() + return Message(rv) + + async def message(self, msg): + ''' + Callback called when a message is received. + + When this happens, we fill in our future. We + don't need to reset the future, since that + happens next time receive is called. + ''' + self.debug_log("XMPP message received") + self.msg_future.set_result(str(msg)) + # For debugging, we sometimes want to send something back + # if msg['type'] in ('chat', 'normal'): + # msg.reply("Thanks for sending\n%(body)s" % msg).send() diff --git a/learning_observer/learning_observer/pubsub/redis_pubsub.py b/learning_observer/learning_observer/pubsub/redis_pubsub.py new file mode 100644 index 000000000..84a4eef6f --- /dev/null +++ b/learning_observer/learning_observer/pubsub/redis_pubsub.py @@ -0,0 +1,124 @@ +''' +Pubsub for redis + +redis is nice for medium-scale. It doesn't seem quite as scalable as +xmpp (at least on paper), but it's easy to develop for, easy to +maintain, and should be nice for pilot tests. + +Note that redis does not guarantee delivery. This ought to notify the +receiver to dequeue events, rather than sending events directly. +''' + +import asyncio +import asyncio_redis + + +class RedisSend(): + ''' + Simple async pubsub sender. 
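# Editorial, self-contained sketch (not part of this diff) of the callback-to-future
# pattern ReceiveXMPP uses above: a callback from library code fills in a future,
# and an async receive() awaits it, then re-arms a fresh future for the next message.
import asyncio

class CallbackBridge():
    def __init__(self):
        self.msg_future = asyncio.get_event_loop().create_future()

    def on_message(self, msg):
        # Called from callback-style code (e.g. a slixmpp event handler)
        self.msg_future.set_result(msg)

    async def receive(self):
        await self.msg_future
        result = self.msg_future.result()
        self.msg_future = asyncio.get_event_loop().create_future()
        return result

async def demo():
    bridge = CallbackBridge()
    asyncio.get_event_loop().call_later(0.1, bridge.on_message, "ping")
    print(await bridge.receive())

asyncio.run(demo())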
To use: + >> sender = RedisSend("channel-1") + >> sender.connect() + >> sender.send_event("Hello!") + ''' + def __init__(self, channel='test_channel', debug_log=lambda x: None): + ''' + We connect to a redis pubsub channel passed in `channel`. If we + want detailed logging, we can provide a callback `debug_log` which + takes a string (and e.g. prints it) + + Note that the verbosity on logging is excessive, even for passing to + informational loggers, for most uses. It's for debugging. + ''' + self.s_connection = None + self.channel = channel + self.debug_log = debug_log + self.debug_log("Redis send created") + + async def connect(self): + ''' + Await a new redis connection. We need this in its own function, + since Python does do async in magic functions like `__init__`. + As an alternative, we could have an async factory function. + + In the current implementation, this is optional for the sender + (which will connect in `send_event` if this is not called), + but required for the receiver. Implementations change, so we + recommend calling it. + ''' + self.s_connection = await asyncio_redis.Connection.create() + self.debug_log("Redis send connected") + + async def send_event(self, mbody): + ''' + We send an event. Note that `asyncio-redis` does reconnects if + a connection is lost, but it'd be good to have better error + handling here too (that `is None` is not for error handling; just + so the code works if the user didn't call `connect`). + ''' + if self.s_connection is None: + await self.connect() + self.debug_log("Redis send reconnected") + n = await self.s_connection.publish(self.channel, mbody) + self.debug_log("Sent event to " + str(n)) + + +class RedisReceive(): + ''' + Simple async pubsub sender. To use: + >> receiver = RedisReceive("channel-1") + >> receiver.connect() + >> message = await receiver.receive() + ''' + def __init__(self, channel='test_channel', debug_log=lambda x: None): + ''' + We connect to a redis pubsub channel passed in `channel`. If we + want detailed logging, we can provide a callback `debug_log` which + takes a string (and e.g. prints it) + + Note that the verbosity on logging is excessive, even for passing to + informational loggers, for most uses. It's for debugging. + ''' + self.r_connection = None + self.subscriber = None + self.channel = channel + self.debug_log = debug_log + self.debug_log("Redis receive initialized") + + async def connect(self): + ''' + We need to establish a connection to begin receiving messages + on startup (before receive is called). + ''' + self.r_connection = await asyncio_redis.Connection.create() + self.subscriber = await self.r_connection.start_subscribe() + await self.subscriber.subscribe([self.channel]) + self.debug_log("redis receive connected") + + async def receive(self): + ''' + Unless this is at the start of the program, be sure to call + `connect` as soon as you want to start capturing messages. 
+ ''' + if self.r_connection is None: + self.debug_log("redis receive reconnected") + await self.connect() + self.debug_log("awaiting event") + item = await self.subscriber.next_published() + self.debug_log("Got event!") + return item.value + + +if __name__ == '__main__': + async def main(): + sender = RedisSend(debug_log=print) + await sender.connect() + receiver = RedisReceive(debug_log=print) + await receiver.connect() + await sender.send_event("hi") + await sender.send_event("bye") + received_msg = await receiver.receive() + print(received_msg) + received_msg = await receiver.receive() + print(received_msg) + + asyncio.run(main()) diff --git a/learning_observer/learning_observer/pubsub/sendxmpp.py b/learning_observer/learning_observer/pubsub/sendxmpp.py new file mode 100644 index 000000000..0d26afa0d --- /dev/null +++ b/learning_observer/learning_observer/pubsub/sendxmpp.py @@ -0,0 +1,52 @@ +''' +XMPP sender. This should probably be folded into receivexmpp, and +renamed xmpp_pubsub. + +xmpp has a proper pubsub protocol, which me might also consider using +:) +''' + +from slixmpp import ClientXMPP + + +class SendXMPP(ClientXMPP): + ''' + Helper class to help connect slixmpp to our API + ''' + def __init__(self, jid, password, debug_log, mto): + ''' + Connect to XMPP server, and set up callbacks + ''' + self.debug_log = debug_log + self.mto = mto + ClientXMPP.__init__(self, jid, password) + self.add_event_handler("session_start", self.session_start) + self.add_event_handler("message", self.message) + + def session_start(self, event): + ''' + We need to do some groundwork for XMPP servers to talk to + us. Until we've requested a roster, some don't work. + ''' + self.send_presence() + self.get_roster() + self.debug_log("XMPP sender session initialized") + + def message(self, msg): + ''' + Callback for when a message is received. In theory, we thought + this should never happen. In practice, we do get pings/acks of + some kind, so we'll need to sort this out, and log an error + for _unexpected_ messages. + ''' + # print("Unexpected! I shouldn't get messages") + + async def send_event(self, mbody): + ''' + The raison d'être of this class: We can send an event. + ''' + self.send_message( + mto=self.mto, + mbody=mbody, + mtype='chat') + self.debug_log("XMPP message sent") diff --git a/learning_observer/learning_observer/redis_connection.py b/learning_observer/learning_observer/redis_connection.py new file mode 100644 index 000000000..97cd9d4a0 --- /dev/null +++ b/learning_observer/learning_observer/redis_connection.py @@ -0,0 +1,59 @@ +''' +This was designed as a helper for per-thread connection pooling (we want +just one redis connection). This was necessary with asyncio_redis. We ported +to redis.asyncio, and right now, a lot of this design and code is obsolete. +Right now, it's easy to switch around, but this should be modernized once we're +confident using redis.asyncio. It handles a lot of what we do manually inside +the library. +''' + +import redis.asyncio + + +from learning_observer.log_event import debug_log + + +REDIS_CONNECTION = None + + +async def connect(): + ''' + Connect to redis + ''' + global REDIS_CONNECTION + if REDIS_CONNECTION is None: + REDIS_CONNECTION = redis.asyncio.Redis() + await REDIS_CONNECTION.ping() + + +async def connection(): + ''' + Returns our connection. Connects if needed. + + This is shorthand. It's not clear if this is the right abstraction, + since it makes for a mess of awaits. 
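# Editorial usage sketch (not part of this diff): the double-await pattern the
# docstring above refers to, next to the get/set helpers defined just below.
# Assumes a local redis server is running; the key name is a made-up example.
import asyncio
import learning_observer.redis_connection as redis_connection

async def demo():
    # Direct use of the shared connection: explicit, but awkward to read
    raw = await (await redis_connection.connection()).get("example-key")
    # The same through the helpers
    await redis_connection.set("example-key", "example-value")
    print(await redis_connection.get("example-key"))

asyncio.run(demo())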
+ ''' + await connect() + return REDIS_CONNECTION + + +async def keys(): + ''' + Return all the keys in the database. This might take a while on production + installs, but is super-helpful in debugging. + ''' + return [key.decode('utf-8') for key in await (await connection()).keys()] + + +async def get(key): + ''' + Get a key. We should eventually do multi-gets. Returns a future. + ''' + return await (await connection()).get(key) + + +async def set(key, value, expiry=None): + ''' + Set a key. We should eventually do multi-sets. Returns a future. + ''' + return await (await connection()).set(key, value, expiry) diff --git a/learning_observer/learning_observer/rosters.py b/learning_observer/learning_observer/rosters.py new file mode 100644 index 000000000..1ab4797d6 --- /dev/null +++ b/learning_observer/learning_observer/rosters.py @@ -0,0 +1,440 @@ +''' +Class Roster Subsystem +====================== + +This gives class roster information: + +- What classes a teach administrates +- Which students are in a class. + +We can either retrieve class rosters from: + +- Google Classroom (config setting: 'google') +- Text files on the disk for testing. (config setting: 'test') + We have two files: + - courses.json + - students.json +- A file hierarchy, for small-scale deploys. ('filesystem') +- In progress: All students, for e.g. coglabs, experiments, and + similar ('all') + +In the future, we might want: + +- more integrations +- a database option +- an option to autocreate "guest" users (for unauthenticated deploys) + +As well as the option for several sources in the same system, perhaps. + +This file could be cleaned up a lot. Right now, we do a lot of this by +mock calls to Google AJAX. It also contains a large number of hacks which +we use to manage the data and to address variations in the roster sources +whether we are taking them from google or from our own backup data. + +As of now this partially implements a separation between the internal ID +which shows up in our rosters as id or `user_id` and the id used for the +external sources of data. We store external ids on student data under +external_ids and keep space for ids from google etc. However as of now +we do not make use of it. Ultimately it would be ideal to move so that +remote data retreival and raw document storage are done under an internal +id with this translation taking place at event storage time *or* that the +event retreival by the dashboard makes use of the external ids consistently +at composition time. The latter approach however has the cost that we would +be redoing the lookup and indexing each time we pull the raw data. This has +the potential to create some extra, though probably manageable, queries. + +In either case we get around it now by also adding in a cheap hack that +makes the internal ID for google-sourced users match the google ID. This +will need to change in a stable way for future use. + +Note that these APIs and file locations aren't finished. In the future, +we may: + +* Switch from .json to .yaml +* Have a less Googley format + +As it stands this file is also part of the way through a naming refactor. +The roster information has changed from camel-case to underscores. The +actual group information has not. That should also be remapped and tested +so that class info uses the same format but that is scut work for another +time. 
+''' + +import json +import os.path +import sys + +import aiohttp +import aiohttp.web + +import pathvalidate + +import learning_observer.settings as settings + +import learning_observer.kvs +import learning_observer.log_event as log_event +import learning_observer.paths as paths +import learning_observer.auth as auth +import learning_observer.google + +from learning_observer.log_event import debug_log + +import learning_observer.prestartup + +COURSE_URL = 'https://classroom.googleapis.com/v1/courses' +ROSTER_URL = 'https://classroom.googleapis.com/v1/courses/{courseid}/students' + + +def clean_google_ajax_data(resp_json, key, sort_key, default=None, source=None): + ''' + This cleans up / standardizes Google AJAX data. In particular: + + - We want to handle errors and empty lists better + - We often don't want the whole response, but just one field (`key`) + - We often want some default if that field is missing (`default`) + - We often want the response sensibly sorted (`sort_key`) + ''' + # Convert errors into appropriate codes for clients + # Typically, resp_json['error'] == 'UNAUTHENTICATED' + if 'error' in resp_json: + return {'error': resp_json['error']} + + # Google sometimes returns results nested one extra level. + # + # If we just want one field, retrieve it, and handle issues cleanly + # if the field is missing + if key is not None: + if key in resp_json: + resp_json = resp_json[key] + # This happens if e.g. no courses. Google seems to just return {} + # instead of {'courses': []} + else: + return default + + # Convert all camel cases to underscores. + util.translate_json_keys(resp_json, learning_observer.google.GOOGLE_TO_SNAKE) + + # Update the ID's to include the gc- prefix and to handle the external data. + # this only runs if the quesry of concern was students meaning that we will + # have a list of student dicts in resp_json. + if (key == 'students'): + adjust_external_gc_ids(resp_json) + + # Sort the list + if sort_key is not None: + resp_json.sort(key=sort_key) + + return resp_json + + +def adjust_external_gc_ids(resp_json): + ''' + What we are concerned with here is handling cases where the id supplied by the + google roster is a numerical value but we need to have gc- preprended to it + for data fetching. This is a relatively minor task but necessary for interfacing + with the external data sources but makes it easier to get the stored values. + + This will be run qith 'students' requests meaning that the attached will be + a possibly-empty? list of student dicts. + + This exists for the sole purpose of adjusting the internal ids and includes a + cheap hack below that maps the internal user_id to match the google id. Going + forward that will need to be changed to something more robust. See the comments + at the top of this module. + ''' + + # Iterate over the students performing an addition of the external_ids and possible + # conversion of the individual id. + for student_json in resp_json: + + # Pull the actual profile data. + student_profile = student_json['profile'] + + # Calculate the new ID to use for our student. + google_id = auth.google_id_to_user_id(student_profile['id']) + + # As a cheap hack lets change the ids to match + # student_profile['user_id'] = google_id + # + # This hack changes the internal ID which we then use for + # document retreival. Going forward it should not be done + # this way and it would be better for us to make this use + # the externals. 
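+ # Illustrative example (hypothetical value): a numeric Google profile id
+ # such as '1234567890' comes back from auth.google_id_to_user_id() carrying
+ # the gc- prefix described above, and that prefixed id is what we store as
+ # the user_id.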
+ student_json['user_id'] = google_id + + # For the present there is only one external id so we will add that directly. + ext_ids = [{"source": "google", "id": google_id}] + student_profile['external_ids'] = ext_ids + + +async def all_students(): + ''' + This crawls all of the keys in the KVS, and creates a list of all + student IDs in redis. This should not be used in any form of + large-scale production. + ''' + keys = await learning_observer.kvs.KVS().keys() + # Reduce list length by 2 + internal_keys = [k for k in keys if k.startswith("Internal")] + + # Pick out the STUDENT field, and place those in a list. This list should + # have length 1 (if the field is there) or 0 (if it is not). + student_field_lists = [[f for f in k.split(",") if f.startswith("STUDENT:")] for k in internal_keys] + + # Drop invalid keys, as well as ones which don't have a student field. + # + # For the remaining ones -- ones with a student ID -- just pick out the student ID. + user_ids = [k[0].split(":")[1] for k in student_field_lists if len(k) == 1] + + # Drop duplicates + return sorted(set(user_ids)) + + +async def all_ajax( + request, url, + parameters=None, key=None, sort_key=None, default=None): + ''' + Stub in information normally requested through Google's API, + using a dummy course and all students in the system as the + roster for that course. + ''' + if url == COURSE_URL: + return [{ + "id": "12345678901", + "name": "All Students", + "description_heading": "For easy small-scale deploys", + "alternate_link": "https://www.ets.org/", + "teacher_group_email": "", + "course_group_email": "", + "teacher_folder": { + "id": "", + "title": "All Students", + "alternate_link": "" + }, + "calendar_id": "NA" + }] + if url == ROSTER_URL: + students = await all_students() + + def profile(student, index): + idnum = str(index + 100) + # We'll created a name from the ID passed + name = '-'.join(student.split('-')[2:]).replace("%23", "") + return { + "user_id": student, + "profile": { + "name": { + "given_name": name, + "family_name": idnum, + "full_name": name + }, + "emailAddress": "student" + idnum + "@localhost", + "photoUrl": "//", + "external_ids": [] + } + } + + return [profile(s, i) for (s, i) in zip(students, range(len(students)))] + # Otherwise, we need to code up the other URLs + raise AttributeError("Unknown Google URL: " + url) + + +async def synthetic_ajax( + request, url, + parameters=None, key=None, sort_key=None, default=None): + ''' + Stub similar to google_ajax, but grabbing data from local files. + + This is helpful for testing, but it's even more helpful since + Google is an amazingly unreliable B2B company, and this lets us + develop without relying on them. 
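+
+    For illustration, with the 'filesystem' source, a teacher whose user_id
+    is teacher-1 (a hypothetical value) would have data read from files along
+    the lines of:
+
+        course_lists/courselist-teacher-1.json
+        course_rosters/courseroster-<courseid>.json
+
+    under the configured data path.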
+ ''' + if settings.settings['roster_data']['source'] == 'test': + synthetic_data = { + COURSE_URL: paths.data("courses.json"), + ROSTER_URL: paths.data("students.json") + } + elif settings.settings['roster_data']['source'] == 'filesystem': + debug_log(request['user']) + safe_userid = pathvalidate.sanitize_filename(request['user']['user_id']) + courselist_file = "courselist-" + safe_userid + if parameters is not None and 'courseid' in parameters: + safe_courseid = pathvalidate.sanitize_filename(str(parameters['courseid'])) + roster_file = "courseroster-" + safe_courseid + else: + roster_file = "default" + synthetic_data = { + ROSTER_URL: paths.data("course_rosters/{roster_file}.json".format( + roster_file=roster_file)), + COURSE_URL: paths.data("course_lists/{courselist_file}.json".format( + courselist_file=courselist_file)) + } + else: + debug_log("Roster data source is not recognized:", settings.settings['roster_data']['source']) + raise ValueError("Roster data source is not recognized: {}".format(settings.settings['roster_data']['source']) + + " (should be 'test' or 'filesystem')") + try: + data = json.load(open(synthetic_data[url])) + except FileNotFoundError as exc: + debug_log(exc) + raise aiohttp.web.HTTPInternalServerError( + text="Server configuration error. " + "No course roster file for your account. " + "Please ask the sysadmin to make one. " + "(And yes, they'll want to know about this issue;" + "you won't be bugging them)" + ) + return data + + +async def google_ajax( + request, url, + parameters=None, key=None, sort_key=None, default=None): + ''' + Request information through Google's API + + Most requests return a dictionary with one key. If we just want + that element, set `key` to be the element of the dictionary we want + + This is usually a list. If we want to sort this, pass a function as + `sort_key` + + Note that we return error as a json object with error information, + rather than raising an exception. In most cases, we want to pass + this error back to the JavaScript client, which can then handle + loading the auth page. + ''' + if parameters is None: # {} should NOT be a default param. See W0102. + parameters = {} + async with aiohttp.ClientSession(loop=request.app.loop) as client: + # We would like better error handling for what to do if auth_headers + # is not set. However, we haven't figured out a better thing to do. + # We saw this happen due to a bug, but similar bugs might come up + # in the future (we forgot to propagate the headers from the + # session). + async with client.get(url.format(**parameters), headers=request["auth_headers"]) as resp: + resp_json = await resp.json() + log_event.log_ajax(url, resp_json, request) + return clean_google_ajax_data( + resp_json, key, sort_key, default=default + ) + +ajax = None + + +@learning_observer.prestartup.register_startup_check +def init(): + ''' + * Set up the ajax function. + * Check that the settings are valid. + * Check that the roster data paths exist. + + TODO: It should be broken out into a separate check function and init function, + or smaller functions otherwise. + ''' + global ajax + if 'roster_data' not in settings.settings: + print(settings.settings) + raise learning_observer.prestartup.StartupCheck( + "Settings file needs a `roster_data` element with a `source` element. No `roster_data` element found." + ) + elif 'source' not in settings.settings['roster_data']: + raise learning_observer.prestartup.StartupCheck( + "Settings file needs a `roster_data` element with a `source` element. 
No `source` element found." + ) + elif settings.settings['roster_data']['source'] in ['test', 'filesystem']: + ajax = synthetic_ajax + elif settings.settings['roster_data']['source'] in ["google_api"]: + ajax = google_ajax + elif settings.settings['roster_data']['source'] in ["all"]: + ajax = all_ajax + else: + raise learning_observer.prestartup.StartupCheck( + "Settings file `roster_data` element should have `source` field\n" + "set to either:\n" + " test (retrieve from files courses.json and students.json)\n" + " google_api (retrieve roster data from Google)\n" + " filesystem (retrieve roster data from file system hierarchy\n" + " all (retrieve roster data as all students)" + ) + REQUIRED_PATHS = { + 'test': [ + paths.data("students.json"), + paths.data("courses.json") + ], + 'filesystem': [ + paths.data("course_lists/"), + paths.data("course_rosters/") + ] + } + + if settings.settings['roster_data']['source'] in REQUIRED_PATHS: + r_paths = REQUIRED_PATHS[settings.settings['roster_data']['source']] + for p in r_paths: + if not os.path.exists(p): + raise learning_observer.prestartup.StartupCheck( + "Missing course roster files!\n" + "The following are required:\t{paths}\n\n" + "Please run:\n" + "{commands}\n\n" + "(And ideally, they'll be populated with\n" + "a list of courses, and of students for\n" + "those courses)".format( + paths=", ".join(r_paths), + commands="\n".join(["mkdir {path}".format(path=path) for path in r_paths]) + ) + ) + + return ajax + + +async def courselist(request): + ''' + List all of the courses a teacher manages: Helper + ''' + # New code + if settings.settings['roster_data']['source'] in ["google_api"]: + return await learning_observer.google.courses(request) + + # Legacy code + course_list = await ajax( + request, + url=COURSE_URL, + key='courses', + sort_key=lambda x: x.get('name', 'ZZ'), + default=[] + ) + return course_list + + +async def courseroster(request, course_id): + ''' + List all of the students in a course: Helper + ''' + if settings.settings['roster_data']['source'] in ["google_api"]: + return await learning_observer.google.roster(request, courseId=course_id) + + roster = await ajax( + request, + url=ROSTER_URL, + parameters={'courseid': int(course_id)}, + key='students', + sort_key=lambda x: x.get('name', {}).get('fullName', 'ZZ'), + default=[] + ) + return roster + + +async def courselist_api(request): + ''' + List all of the courses a teacher manages: Handler + ''' + return aiohttp.web.json_response(await courselist(request)) + + +async def courseroster_api(request): + ''' + List all of the students in a course: Handler + ''' + course_id = int(request.match_info['course_id']) + return aiohttp.web.json_response(await courseroster(request, course_id)) diff --git a/learning_observer/learning_observer/routes.py b/learning_observer/learning_observer/routes.py new file mode 100644 index 000000000..266e004bb --- /dev/null +++ b/learning_observer/learning_observer/routes.py @@ -0,0 +1,447 @@ +''' +Map URLs to functions which handle them. 
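+
+The entry point is add_routes(app), which registers everything on an aiohttp
+application. A minimal sketch of the expected usage, after settings have been
+loaded (the actual startup sequence lives in learning_observer.main):
+
+    app = aiohttp.web.Application()
+    add_routes(app)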
+''' + +import getpass +import os +import secrets +import sys + +import aiohttp +import aiohttp.web + +import gitserve.aio_gitserve + +import aiohttp_wsgi + +import learning_observer.admin as admin +import learning_observer.auth +import learning_observer.auth.http_basic +import learning_observer.client_config +import learning_observer.incoming_student_event as incoming_student_event +import learning_observer.dashboard +import learning_observer.google +import learning_observer.rosters as rosters +import learning_observer.module_loader + +import learning_observer.paths as paths +import learning_observer.settings as settings + +from learning_observer.log_event import debug_log + +from learning_observer.utility_handlers import * + + +def add_routes(app): + ''' + Massive routine to set up all static routes. + + This should be broken out into routines for each group of routes, + or handled as a data file, or similar. + ''' + # Allow debugging of memory leaks. Helpful, but this is a massive + # resource hog. Don't accidentally turn this on in prod :) + if 'tracemalloc' in settings.settings['config'].get("debug", []): + import tracemalloc + tracemalloc.start(25) + + def tracemalloc_handler(request): + ''' + Handler to show tracemalloc stats. + ''' + snapshot = tracemalloc.take_snapshot() + top_stats = snapshot.statistics('lineno') + top_hundred = "\n".join((str(t) for t in top_stats[:100])) + top_stats = snapshot.statistics('traceback') + top_one = "\n".join((str(t) for t in top_stats[0].traceback.format())) + return aiohttp.web.Response(text=top_one + "\n\n\n" + top_hundred) + + app.add_routes([ + aiohttp.web.get('/debug/tracemalloc/', tracemalloc_handler), + ]) + + register_dashboard_api(app) + register_static_routes(app) + register_incoming_event_views(app) + register_debug_routes(app) + learning_observer.google.initialize_and_register_routes(app) + + app.add_routes([ + aiohttp.web.get( + '/webapi/courselist/', + rosters.courselist_api), + aiohttp.web.get( + '/webapi/courseroster/{course_id}', + rosters.courseroster_api), + ]) + + register_auth_webapp_views(app) + + # General purpose status page: + # - List URLs + # - Show system resources + # Etc. + app.add_routes([ + aiohttp.web.get('/admin/status', handler=admin.system_status) + ]) + + # This might look scary, but it's innocous. There are server-side + # configuration options which the client needs to know about. This + # gives those. At the very least, we want to be able to toggle the + # client-side up between running with a real server and a dummy static + # server, but in the future, we might want to include things like URIs + # for different services the client can talk to and similar. + # + # This URI should **not** be the same as the filename. We have two + # files, config.json is loaded if no server is running (dummy mode), and + # this is overridden by the live server. + app.add_routes([ + aiohttp.web.get( + '/config.json', + learning_observer.client_config.client_config_handler + ), + ]) + + # We'd like to be able to have the root page themeable, for + # non-ETS deployments. This is a quick-and-dirty way to override + # the main page. + root_file = settings.settings.get("theme", {}).get("root_file", "webapp.html") + app.add_routes([ + aiohttp.web.get('/', static_file_handler(paths.static(root_file))), + ]) + + # E.g. We have an alias of /static/common to /common + # We place useful things modules can use, such as e.g. 
our logger + app.add_routes([ + aiohttp.web.get('/common/{filename}', static_directory_handler(paths.static("common"))), + ]) + + # Add extra views as json responses + extra_views = learning_observer.module_loader.extra_views() + for view in extra_views: + app.add_routes([ + aiohttp.web.get( + f'/views/{view["module"]}/{view["suburl"]}/', + lambda x: aiohttp.web.json_response(view['static_json']) + ) + ]) + + # Allow AJAX calls. Right now, the function receives a `request` + # object. This should be cleaned in some way. + ajax = learning_observer.module_loader.ajax() + for module in ajax: + for call in ajax[module]: + path = "/ajax/{module}/{call}".format(module=module, call=call) + debug_log("Adding AJAX path", path) + app.add_routes([ + aiohttp.web.get( + path, + lambda x: aiohttp.web.json_response(ajax[module][call](x)) + ) + ]) + + # We route the repos last, since we override some of the routes + # above (esp. 3rd party libraries and media) + repos = learning_observer.module_loader.static_repos() + register_repo_routes(app, repos) + + # This is called last since we don't want wsgi routes overriding + # our normal routes. We may change this design decision if we do + # want to provide that option in the future, but as we're prototyping + # and figuring stuff out, this feels safest to put last. + register_wsgi_routes(app) + + +def register_debug_routes(app): + ''' + Handy-dandy information views, useful for debugging and development. + ''' + if settings.feature_flag("auth_headers_page"): + app.add_routes([ + aiohttp.web.get( + '/admin/headers', + learning_observer.auth.social_sso.show_me_my_auth_headers + ) + ]) + + +def register_incoming_event_views(app): + ''' + Register views for incoming events. We have a websocket + connection for each incoming event. The websocket connection + is a long-lived connection, and is used to receive events + from the client. + + We supported AJAX calls before, but we've since moved to + websockets, and the AJAX may be disabled since it's not + tested. We'll keep the code around for now, since it's + useful for debugging and in the future, lower-velocity + events. + ''' + # Handle web sockets event requests, incoming and outgoing + app.add_routes([ + aiohttp.web.get( + '/wsapi/in/', + incoming_student_event.incoming_websocket_handler) + ]) + + +def register_dashboard_api(app): + ''' + Register the dashboard API views. + + We are moving from per-student and per-course dashboard to a + more general-purpose API. This is TBD. 
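+
+    For reference, the client-side helper in static/common/dashboard.js opens
+    a connection of the form (illustrative values):
+
+        wss://<host>/wsapi/dashboard?course=<course_id>&module=<module>
+
+    and expects JSON messages back.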
+ ''' + app.add_routes([ + aiohttp.web.get( + '/wsapi/dashboard', + learning_observer.dashboard.websocket_dashboard_view), + aiohttp.web.get( + '/webapi/course_dashboards', + ajax_handler_wrapper(learning_observer.module_loader.course_dashboards)), + aiohttp.web.get( + '/webapi/student_dashboards', + ajax_handler_wrapper(learning_observer.module_loader.student_dashboards)) + ]) + + app.add_routes([ + aiohttp.web.get( + '/wsapi/generic_dashboard', + learning_observer.dashboard.generic_dashboard) + ]) + + +def register_auth_webapp_views(app): + ''' + Register the views for the auth module and user info + ''' + # Generic web-appy things + app.add_routes([ + aiohttp.web.get( + '/auth/logout', + handler=learning_observer.auth.logout_handler), + aiohttp.web.get( + '/auth/userinfo', + handler=learning_observer.auth.user_info_handler) + ]) + + if 'google_oauth' in settings.settings['auth']: + debug_log("Running with Google authentication") + app.add_routes([ + aiohttp.web.get( + '/auth/login/{provider:google}', + handler=learning_observer.auth.social_handler), + ]) + + if 'password_file' in settings.settings['auth']: + debug_log("Running with password authentication") + if not os.path.exists(settings.settings['auth']['password_file']): + print("Configured to run with password file," + "but no password file exists") + print() + print("Please either:") + print("* Remove auth/password_file from the settings file") + print("* Create a file {fn} with lo_passwd.py".format( + fn=settings.settings['auth']['password_file'] + )) + print("Typically:") + print("python util/lo_passwd.py " + "--username {username} --password {password} " + "--filename {fn}".format( + username=getpass.getuser(), + password=secrets.token_urlsafe(16), + fn=settings.settings['auth']['password_file'] + )) + sys.exit(-1) + app.add_routes([ + aiohttp.web.post( + '/auth/login/password', + learning_observer.auth.password_auth( + settings.settings['auth']['password_file']) + )]) + + # If we want to support multiple modes of authentication, including + # http-basic, we can configure a URL in nginx which will require + # http basic auth, which is used to log in, and then redirects back + # home. + if learning_observer.auth.http_basic.http_auth_page_enabled(): + # If we don't have a password file, we shouldn't have an auth page. + # At the very least, the user should explicitly set it to `null` + # if they are planning on using nginx for auth + debug_log("Enabling http basic auth page") + auth_file = settings.settings['auth']['http_basic']["password_file"] + app.add_routes([ + aiohttp.web.get( + '/auth/login/http-basic', + learning_observer.auth.http_basic.http_basic_auth( + filename=auth_file, + response=lambda:aiohttp.web.HTTPFound(location="/") + ) + ) + ]) + app.add_routes([ + aiohttp.web.get( + '/auth/default-avatar.svg', + learning_observer.auth.handlers.serve_user_icon) + ]) + + +def register_static_routes(app): + ''' + Register static routes routes for the webapp, especially 3rd party + libraries. + + This serves static files from the static directories. It overrides the + paths in repos. Most of these files are downloaded from the internet, + rather than being kept in the codebase. 
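+
+    For example, a request for /static/3rd_party/d3.v5.min.js (one of the
+    libraries webapp.js requires) is served out of paths.static("3rd_party"),
+    which is typically populated at setup time rather than checked into git.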
+ ''' + # Serve static files + app.add_routes([ + aiohttp.web.get( + '/favicon.ico', + static_file_handler(paths.static("favicon.ico"))), + aiohttp.web.get( + '/static/{filename}', + static_directory_handler(paths.static())), + aiohttp.web.get( + '/static/modules/{filename}', + static_directory_handler(paths.static("modules"))), + # TODO: Make consistent. 3rdparty versus 3rd_party and maybe clean up URLs. + aiohttp.web.get( + r'/static/repos/{module:[^{}/]+}/{repo:[^{}/]+}/{branch:[^{}/]+}/3rdparty/{filename:[^{}]+}', + static_directory_handler(paths.static("3rd_party"))), + aiohttp.web.get( + '/static/3rd_party/{filename}',\ + static_directory_handler(paths.static("3rd_party"))), + aiohttp.web.get( + '/static/3rd_party/css/{filename}',\ + static_directory_handler(paths.static("3rd_party/css"))), + aiohttp.web.get( + '/static/3rd_party/webfonts/{filename}',\ + static_directory_handler(paths.static("3rd_party/webfonts"))), + aiohttp.web.get( + '/static/media/{filename}', + static_directory_handler(paths.static("media"))), + aiohttp.web.get( + '/static/media/avatar/{filename}', + static_directory_handler(paths.static("media/hubspot_persona_images/"))), + ]) + + +def repo_url(module, repo, branch="master", path="index.html"): + ''' + Return a URL for a file in a repo. + ''' + return "/static/repos/{module}/{repo}/{branch}/{path}".format( + module=module, + repo=repo, + branch=branch, + path=path + ) + + +def register_repo_routes(app, repos): + ''' + Register routes for all repos. + + An example repo is: + + { + 'url': 'https://github.com/ETS-Next-Gen/writing_observer.git', // URL to the repo; downloaded if not already here + 'prefix': 'modules/writing_observer/writing_observer/static', // Path in repo to serve static files from + 'module': 'wobserver', // Module name to use in the static path + + 'whitelist': ['master'], // Optional: List of branches to serve static files from; currently ignored + 'working_tree': True, // Optional: Allow working branches to be served + 'bare': False, // Optional: Serve from a bare repo + 'path': '/home/ubuntu/writing_observer' // Optional: Path to the repo + } + + Most of the optional parameters should *not* be used in production. They are here + for testing and development, especially of new dashboard modules. If needed in production, + paths can also be set in the settings file. + ''' + for reponame in repos: + gitrepo = repos[reponame] + # Check the keys in the repo dictionary are valid + # We can add more keys in the future. E.g. we might want to have comments + # and similar human-friendly metadata. + for key in gitrepo: + if key not in ['url', 'prefix', 'module', 'whitelist', 'working_tree', 'bare', 'path']: + raise ValueError("Unknown key in gitrepo: {}".format(key)) + for key in ['url', 'prefix', 'module']: + if key not in gitrepo: + raise ValueError("Missing key in gitrepo: {}".format(key)) + # Check the URL is valid + # We might want to support a broader range of URLs in the future. 
+ if not gitrepo['url'].startswith('http://') and not gitrepo['url'].startswith('https://') and not gitrepo['url'].startswith('git@'): + raise ValueError("Invalid URL: {}".format(gitrepo['url'])) + + giturl = r'/static/repos/' + gitrepo['module'] + '/' + reponame + '/{branch:[^{}/]+}/{filename:[^{}]+}' + + debug_log(f"Module {reponame} is hosting {gitrepo} at {giturl}") + debug_log(f"""For testing: python learning_observer/jupyter.py "{reponame};{gitrepo['url']};{gitrepo['prefix']};False;True" """) + + # If the working tree is set in the repo, we can serve from the working tree + # This can be overridden by the settings file, in either direction + working_tree = gitrepo.get('working_tree', False) + working_tree_in_settings = paths.repo_debug_working_hack(reponame) # Ignore the branch; serve from working tree + if working_tree_in_settings is not None: + print("Using working tree:", working_tree_in_settings) + working_tree = working_tree_in_settings + + bare = gitrepo.get("bare", True) + if working_tree: + debug_log("Serving from working tree; overriding the bare repo setting") + debug_log(f"Settings are inconsistent: working_tree: {working_tree} and bare: {bare}") + bare = False + + print("Bare", bare) + print("working_tree", working_tree) + + app.add_routes([ + aiohttp.web.get( + giturl, + handler=gitserve.aio_gitserve.git_handler_wrapper( + paths.repo(reponame), + cookie_prefix="SHA_" + reponame, + prefix=repos[reponame].get("prefix", None), + bare=bare, + working_tree_dev=working_tree) + ) + ]) + + +def register_wsgi_routes(app): + ''' + This is primarily for `dash` integration, and is unsupported for + other uses. + ''' + for plugin in learning_observer.module_loader.wsgi(): + wsgi_app = plugin['APP'] + # This is a nice design pattern to adopt more broadly + if callable(wsgi_app): + wsgi_app = wsgi_app() + wsgi_url_patterns = plugin.get("URL_PATTERNS", None) + + # We want to support patterns being a string, a list, + # or a function. This is (relatively untested) code to + # do that. + if wsgi_url_patterns is None: + print("Warning! No WSGI URL patterns. This should") + print("only be used for prototyping on dev machines") + wsgi_url_patterns = "/{path_info:.*}" + if callable(wsgi_url_patterns): + wsgi_url_patterns = wsgi_url_patterns() + # We would like to support async, but for now, the whole + # routing setup isn't async, so that's for later. + # + # if inspect.isawaitable(wsgi_url_patterns): + # wsgi_url_patterns = await wsgi_url_patterns + if isinstance(wsgi_url_patterns, str): + wsgi_url_patterns = [wsgi_url_patterns] + + wsgi_handler = learning_observer.auth.teacher(aiohttp_wsgi.WSGIHandler(wsgi_app.server)) + for pattern in wsgi_url_patterns: + app.router.add_route("*", pattern, wsgi_handler) diff --git a/learning_observer/learning_observer/run.py b/learning_observer/learning_observer/run.py new file mode 100644 index 000000000..060368644 --- /dev/null +++ b/learning_observer/learning_observer/run.py @@ -0,0 +1,29 @@ +''' +Run.py: Helper for console_scripts entry point +===== + +In order to have this work as a command line utility installed with +pip, we need a way to run this which is a function, rather than a +script. 
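+
+For illustration, the matching setup.py stanza would look something like this
+(the installed script name is an assumption):
+
+    entry_points={
+        'console_scripts': [
+            'learning_observer = learning_observer.run:run'
+        ]
+    }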
+''' +import sys +import os.path + + +def run(): + ''' + Helper to run from entry point + ''' + print("Running") + print(os.path.dirname(__file__)) + sys.path.append(os.path.dirname(__file__)) + print(sys.path) + + # We might refactor this at some point, but our goal + # is to emulate running `python learning_observer.main` + # since console script entry points need to call a + # function rather than run a script. + + # pylint: disable=C0415,W0611 + import learning_observer.main + print("Imported") diff --git a/learning_observer/learning_observer/settings.py b/learning_observer/learning_observer/settings.py new file mode 100644 index 000000000..d9d5dfe72 --- /dev/null +++ b/learning_observer/learning_observer/settings.py @@ -0,0 +1,189 @@ +''' +System Configuration +==================== + +This is just a wrapper to load out configuration YAML file from disk. + +At some point, it might make sense to make this a thicker wrapper, so +we can have multiple configuration files with includes. As is, we have +credentials in the same place as module configuration, which is not +ideal. +''' + +import argparse +import enum +import os.path +import sys + +import yaml + +import learning_observer.paths + + +# If we e.g. `import settings` and `import learning_observer.settings`, we +# will load startup code twice, and end up with double the global variables. +# This is a test to avoid that bug. +if not __name__.startswith("learning_observer."): + raise ImportError("Please use fully-qualified imports") + sys.exit(-1) + + +args = None +parser = None + + +def parse_and_validate_arguments(): + ''' + Parse and validate command line arguments; for now, just the + configuration file location. + ''' + global args, parser + parser = argparse.ArgumentParser( + description='The Learning Observer', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + '--config-file', + help='Specify an alternative configuration file', + default=learning_observer.paths.config_file()) + + parser.add_argument( + '--watchdog', + help='Run in watchdog mode. This will restart on file changes.', + default=None) + + args = parser.parse_args() + + if not os.path.exists(args.config_file): + raise FileNotFoundError( + "Configuration file not found: {config_file}\n" + "\n" + "Copy the example file into:\n" + "{config_file}\n" + "And then continue setup\n" + "The command is probably:\n" + "cp {sourcedir}/creds.yaml.example {dest}".format( + sourcedir=os.path.dirname(os.path.abspath(__file__)), + dest=args.config_file, + config_file=args.config_file + ) + ) + return args + + +# DEV = Development, with full debugging +# DEPLOY = Running on a server, with good performance +# INTERACTIVE = Processing data offline +RUN_MODES = enum.Enum('RUN_MODES', 'DEV DEPLOY INTERACTIVE') +RUN_MODE = None + +settings = None + + +def load_settings(config): + ''' + Load the settings file and return a dictionary of settings. Also: + - Allow a stub data path + - Select the run mode + - Set up location of module repositories, if overridden in the config + + This is a wrapper around `yaml.safe_load()` so we can do some validation, + error handling, and postprocessing. + + :param config: The configuration file to load, or a dictionary of settings + :return: A dictionary of settings + + We can work from a dictionary rather than config file because we want to + be able to use pieces of the Learning Observer in scripts and tests, where + we don't need a full config. 
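+
+    A minimal sketch of calling this from a test or script (only the keys the
+    caller actually touches need to be present):
+
+        settings = load_settings({'config': {'run_mode': 'interactive'}})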
+ ''' + global settings + + if isinstance(config, str): + with open(config, 'r') as f: + settings = yaml.safe_load(f) + elif isinstance(config, dict): + settings = config + else: + raise AttributeError("Invalid settings file") + + # For testing and similar, we'd like to be able to have alternative data + # paths + if 'data_path' in settings: + learning_observer.paths.override_data_path(settings['data_path']) + + # Development versus deployment. This is helpful for logging, verbose + # output, etc. + global RUN_MODE + if settings['config']['run_mode'] == 'dev': + RUN_MODE = RUN_MODES.DEV + elif settings['config']['run_mode'] == 'deploy': + RUN_MODE = RUN_MODES.DEPLOY + elif settings['config']['run_mode'] == 'interactive': + RUN_MODE = RUN_MODES.INTERACTIVE + else: + raise ValueError("Configuration setting for run_mode must be either 'dev', 'deploy', or 'interactive'") + + if 'repos' in settings: + for repo in settings['repos']: + # In the future, we might allow dicts if we e.g. want more metadata + if isinstance(settings['repos'][repo], str): + learning_observer.paths.register_repo(repo, settings['repos'][repo]) + elif isinstance(settings['repos'][repo], dict): + # HACK. We should figure out where to stick this. This does not belong in paths + debug_working = settings['repos'][repo].get("debug_working", None) + + learning_observer.paths.register_repo( + repo, + settings['repos'][repo]['path'], + debug_working=debug_working + ) + else: + raise ValueError("settings.repos.{repo} should be a string or a dict. Please fix the settings file.".format(repo=repo)) + + return settings + + +# Not all of these are guaranteed to work on every branch of the codebase. +AVAILABLE_FEATURE_FLAGS = ['uvloop', 'watchdog', 'auth_headers_page', 'merkle', 'save_google_ajax', 'use_google_ajax'] + + +def feature_flag(flag): + ''' + Return `None` if the given feature flag is disabled. + + Returns the value of the feature flag if it is enabled. + ''' + if flag not in AVAILABLE_FEATURE_FLAGS: + raise ValueError( + f"Unknown feature flag: {flag}" + f"Available feature flags: {AVAILABLE_FEATURE_FLAGS}" + ) + + flag = settings.get( + 'feature_flags', {} + ).get(flag, None) + + # The feature flag is disabled if it is False, None, or omitted + if flag is False: + return None + + return flag + + +def module_setting(module_name, setting=None, default=None): + ''' + Return the settings for a specific module. + + Optionally, can be passed a specific setting. + + Returns `default` if no setting (or `None` if not set) + ''' + module_settings = settings.get( + 'modules', {} + ).get(module_name, None) + if setting is None: + return module_settings + if module_settings is not None: + return module_settings.get(setting, default) + return default diff --git a/learning_observer/learning_observer/static/common/dashboard.js b/learning_observer/learning_observer/static/common/dashboard.js new file mode 100644 index 000000000..ff84d8db2 --- /dev/null +++ b/learning_observer/learning_observer/static/common/dashboard.js @@ -0,0 +1,117 @@ +/* + This ought to be packaged up at some point.... + */ + +function encode_query_string(obj) { + /* + Create a query string from a dictionary + + {a:'b', c:'d'} ==> "a=b&c=d" + + dictionary -> string + */ + var str = []; + for (var p in obj) + if (obj.hasOwnProperty(p)) { + str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p])); + } + return str.join("&"); +} + + +function dashboard_connection(key, callback, params) { + /* + Create a web socket connection to the server. 
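+
+      Illustrative usage (the field values are hypothetical; the key
+      dictionary is simply encoded into the query string):
+
+          dashboard_connection(
+              {course: 123, module: "writing_observer"},
+              function(data) { console.log(data); }
+          );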
+ */ + const course = key.course; + const module = key.module; + const get_params = encode_query_string(key); + // TODO: Course should be abstracted out + const protocol = {"http:": "ws:", "https:": "wss:"}[window.location.protocol]; + var ws = new WebSocket(`${protocol}//${window.location.host}/wsapi/dashboard?${get_params}`); + ws.onmessage = function (event) { + console.log("Got data"); + let data = JSON.parse(event.data); + if(data.logged_in === false) { + window.location.href="/"; // TODO: System.go_home() or something + } else { + callback(data); + } + } +} + +function decode_string_dict(stringdict) { + /* + Decode a string dictionary of the form: + `key1=value1; key2=value2;key3=value3` + This is used both to encode document hashes and for cookies. + + This is inspired by a (buggy) cookie decoder from w3cschools. We + wrote out own since that one starts out with decodeURIComponent, + potentially allowing for injections. + */ + var decoded = {}; + var splitstring = stringdict.split(';'); + for(var i = 0; i 0) { + queue.unshift(metadata.pop()); + } + state.add("metadata"); + are_we_done(); + } + + function dequeue() { + console.log("dequeue"); + if(socket === null) { + // Do nothing. We're reconnecting. + console.log("Event squelched; reconnecting"); + } else if(socket.readyState === socket.OPEN && + state.has("ready")) { + console.log("Sending event..."); + while(queue.length > 1) { + var event = queue.shift(); + socket.send(event); /* TODO: We should do receipt confirmation before dropping events */ + } + } else if((socket.readyState == socket.CLOSED) || (socket.readyState == socket.CLOSING)) { + /* + If we lost the connection, we wait a second and try to open it again. + + Note that while socket is `null` or `CONNECTING`, we don't take either + branch -- we just queue up events. We reconnect after 1 second if closed, + or dequeue events if open. + */ + console.log("Re-opening connection in 1s"); + socket = null; + state = new Set(); + setTimeout(function() { + console.log("Re-opening connection"); + socket = new_websocket(); + }, 1000); + } + } + + return function(data) { + queue.push(data); + dequeue(); + } +} + +function ajax_logger(ajax_server) { + /* + HTTP event per request. + + To do: Handle failures / dropped connections + */ + var server = ajax_server; + return function(data) { + /* + Helper function to send a logging AJAX request to the server. + This function takes a JSON dictionary of data. + */ + + httpRequest = new XMLHttpRequest(); + //httpRequest.withCredentials = true; + httpRequest.open("POST", ajax_server); + httpRequest.send(data); + } +} + +/* +logger takes a list of loggers. For example, if we want to send to the server twice, and log on console: + +logger([ + console_logger(), + ajax_logger("https://localhost/webapi/"), + websocket_logger("wss://localhost/wsapi/in/") +]); + +loggers_enabled = [ + console_logger(), + //ajax_logger("https://writing.hopto.org/webapi/")//, + websocket_logger("wss://writing.hopto.org/wsapi/in/") +]; +*/ + +function log_event(event_type, event) { + event = prepare_event(event_type, event); + // TODO: Add username + for (var i=0; i + + + + + + + + + + + + Dashboard Debugger + + + +
+ + + + +
+

+ This is a test page. We can write a JSON key, and monitor a dashboard.
+ It is not designed to be robust.

+ + + +
+ +
+ + diff --git a/learning_observer/learning_observer/static/debug.js b/learning_observer/learning_observer/static/debug.js new file mode 100644 index 000000000..1c5b44471 --- /dev/null +++ b/learning_observer/learning_observer/static/debug.js @@ -0,0 +1,12 @@ +d3.select("#query_button").attr("onclick", "query_dashboard()"); + +function query_dashboard() { + console.log("Click"); + dashboard_connection( + JSON.parse(d3.select("#query_string").property("value")), + function(data) { + console.log(data); + d3.select("#query_response").property("value", JSON.stringify(data)); + } + ); +}; diff --git a/learning_observer/learning_observer/static/favicon.ico b/learning_observer/learning_observer/static/favicon.ico new file mode 100644 index 000000000..92fea97e8 Binary files /dev/null and b/learning_observer/learning_observer/static/favicon.ico differ diff --git a/learning_observer/learning_observer/static/liblo.js b/learning_observer/learning_observer/static/liblo.js new file mode 100644 index 000000000..d54bf1765 --- /dev/null +++ b/learning_observer/learning_observer/static/liblo.js @@ -0,0 +1,146 @@ +// +// This is the preloaded Learning Observer library. +// + +// Path management, so that we can have relative URLs + +function lo_modulepath(rel_path) { + // This is used to retrieve URLs of relative + // files in the same git repo. + const path = new URL(document.URL).pathname; + const last_slash = path.lastIndexOf("/"); + const base_path = path.slice(0, last_slash+1); + return base_path + rel_path; +} + +function lo_thirdpartypath(rel_path) { + // This is used to retrieve URLs of external libraries + return "/static/3rd_party/"+rel_path; +} + +function requiremodulelib(lib) { + return lo_modulepath(lib); +} + +function requireexternallib(lib) { + return lo_thirdpartypath(lib) +} + +function requiremoduletext(text) { + return "/static/3rd_party/text.js!"+lo_modulepath(text); +} + +function requiresystemtext(text) { + return "/static/3rd_party/text.js!/static/"+text +} + +function requireconfig() { + return "/static/3rd_party/text.js!/config.json"; +} + + + + +// Helper functions. +// +// + +function rendertime1(t) { + /* + Convert seconds to a time string. + 10 ==> 10 sec + 120 ==> 2:00 + 3600 ==> 1:00:00 + 7601 ==> 2:06:41 + 764450 ==> 8 days + + */ + function str(i) { + if(i<10) { + return "0"+String(i); + } + return String(i) + } + var seconds = Math.floor(t) % 60; + var minutes = Math.floor(t/60) % 60; + var hours = Math.floor(t/3600) % 60; + var days = Math.floor(t/3600/24); + + if ((minutes === 0) && (hours === 0) && (days === 0)) { + return String(seconds) + " sec" // 0-59 seconds + } + if (days>0) { + return String(days) + " days" // >= 1 day + } + if(hours === 0) { + return String(minutes)+":"+str(seconds); // 1 minute - 1 hour + } + return String(hours)+":"+str(minutes)+":"+str(seconds) // 1 - 24 hours +} + +function rendertime2(t) { + /* + Convert seconds to a time string. + + Compact representation. 
+ 10 ==> 10s + 125 ==> 2m + 3600 ==> 1h + 7601 ==> 2h + 764450 ==> 8d + + */ + function str(i) { + if(i<10) { + return "0"+String(i); + } + return String(i) + } + var seconds = Math.floor(t) % 60; + var minutes = Math.floor(t/60) % 60; + var hours = Math.floor(t/3600) % 60; + var days = Math.floor(t/3600/24); + + if(days>0) { + return String(days)+'d'; + } + if(hours>0) { + return String(hours)+'h'; + } + if(minutes>0) { + return String(minutes)+'m'; + } + if(seconds>0) { + return String(seconds)+'s'; + } + return '-'; +} + +// TODO this is copied code from static/common/dashboard.js +// I couldn't get dash to pull in that file specifically, +// but I didn't want to deal with it at that time. +// Guessing that /common is blocked somewhere along the way. +function decode_string_dict(stringdict) { + /* + Decode a string dictionary of the form: + `key1=value1; key2=value2;key3=value3` + This is used both to encode document hashes and for cookies. + + This is inspired by a (buggy) cookie decoder from w3cschools. We + wrote out own since that one starts out with decodeURIComponent, + potentially allowing for injections. + */ + var decoded = {}; + var splitstring = stringdict.split(';'); + for(var i = 0; i + + + + + + \ No newline at end of file diff --git a/learning_observer/learning_observer/static/media/Flag_of_Poland.svg b/learning_observer/learning_observer/static/media/Flag_of_Poland.svg new file mode 100644 index 000000000..b08d02519 --- /dev/null +++ b/learning_observer/learning_observer/static/media/Flag_of_Poland.svg @@ -0,0 +1 @@ + diff --git a/learning_observer/learning_observer/static/media/Flag_of_the_United_States.svg b/learning_observer/learning_observer/static/media/Flag_of_the_United_States.svg new file mode 100644 index 000000000..a11cf5f94 --- /dev/null +++ b/learning_observer/learning_observer/static/media/Flag_of_the_United_States.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/learning_observer/learning_observer/static/media/LICENSE.txt b/learning_observer/learning_observer/static/media/LICENSE.txt new file mode 100644 index 000000000..c3d2d6697 --- /dev/null +++ b/learning_observer/learning_observer/static/media/LICENSE.txt @@ -0,0 +1,11 @@ +ETS_Logo.svg: + +The ETS Logo is a trademark of the Educational Testing Service. For +terms-of-use, see: + +https://www.ets.org/legal/trademarks/owned + +It is not distributed under the same license as the rest of this +system. + +Flags of Poland and the US are SVGs from Wikipedia, and in the public domain diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/LICENSE.TXT b/learning_observer/learning_observer/static/media/hubspot_persona_images/LICENSE.TXT new file mode 100644 index 000000000..6d441a9ee --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/LICENSE.TXT @@ -0,0 +1,67 @@ +These images are from HubSpot's free Make My Persona tool. + +https://www.hubspot.com/make-my-persona?utm_source=mktg-resources + +These have unclear licensing. HubSpot has "Copright (c) 2020 HubSpot +Inc." at the bottom of the page, but no other text there (not even All +Rights Reserved). There is no license file anywhere we could +find. There were many implicit grants in other places e.g. "Create a +buyer persona that your entire company can use to market, sell, and +serve better" which made this use seem okay, so we reached out to +HubSpot to confirm this use was okay. 
+ +HubSpot explicitly confirmed that we were in the clear for UX +prototypes, mockups, and personas. Chat transcript below. + +However, these are NOT distributed under the same license as the rest +of the project. For licensing information, please contact HubSpot +directly. + +My expectation (as of this writing) is that we will NOT use these +avatars beyond mockups, prototypes, and testing. If we go beyond that, +we may revisit. + +The rest of the mockup personas incorporated information from several +such tools. The rest were clearly okay. + +Thank you HubSpot! + + + +Chat with Hubspot support on 5/19/2020 at 3:47pm EST: + +HubBot: + Great, a coach is on their way now. They’ll send a message + when they get here, so I appreciate your patience in the meantime. + +3:40 PM Ali: + Hi Peter, this is Ali from HubSpot Sales. I'm happy to point you in the right direction today. + +3:40 PM + I just had a quick question: you have a bunch of free tools, like a + person generator. If we use them, are we allowed to use the personas + however we like? Or are there licensing restrictions? + + They say nothing, and if it's All Rights Reserved, we're not allowed + to use them for anything. Which sort of makes them pointless. Or I'm + not sure. https://www.hubspot.com/make-my-persona It's a nice tool, + but there's no legal information + + +3:42 PM Ali + You would be creating your buyer persona for your own company, no one + else would see it if you use the generator to create a buyer persona + then there aren't legal restrictions + + +3:43 PM + Okay. Thank you. I'm building an open source educational tool, and I + wanted to share some UX mock-ups and prototypes, including user + personas from the tool (obviously not open-sourcing the personas + themselves). It sounds like that's okay then? + +3:45 PM Ali + Yes that is definitely okay Peter + +3:47 PM + Thank you so much! 
diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-0.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-0.svg new file mode 100644 index 000000000..e2772e262 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-0.svg @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-1.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-1.svg new file mode 100644 index 000000000..fc1b02974 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-1.svg @@ -0,0 +1,161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-10.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-10.svg new file mode 100644 index 000000000..15190a5da --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-10.svg @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-11.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-11.svg new file mode 100644 index 000000000..0cf396a51 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-11.svg @@ -0,0 +1,385 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-12.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-12.svg new file mode 100644 index 000000000..76dfcec7e --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-12.svg @@ -0,0 +1,173 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-13.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-13.svg new file mode 100644 index 000000000..f9893bd06 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-13.svg @@ -0,0 +1,201 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-14.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-14.svg new file mode 100644 index 000000000..08598fd7b --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-14.svg @@ -0,0 +1,303 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-2.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-2.svg new file mode 100644 index 000000000..8715d7d87 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-2.svg @@ -0,0 +1,135 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-3.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-3.svg new file mode 100644 index 000000000..25b82a46a --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-3.svg @@ -0,0 +1,403 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-4.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-4.svg new file mode 100644 index 000000000..f984fabc3 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-4.svg @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-5.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-5.svg new file mode 100644 index 000000000..444bc2c37 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-5.svg @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-6.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-6.svg new file mode 100644 index 000000000..bf4955eea --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-6.svg @@ -0,0 +1,229 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-7.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-7.svg new file mode 100644 index 000000000..154df820b --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-7.svg @@ -0,0 +1,337 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-8.svg 
b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-8.svg new file mode 100644 index 000000000..8e7433ab8 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-8.svg @@ -0,0 +1,348 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-9.svg b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-9.svg new file mode 100644 index 000000000..d438350a7 --- /dev/null +++ b/learning_observer/learning_observer/static/media/hubspot_persona_images/avatar-9.svg @@ -0,0 +1,156 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/learning_observer/learning_observer/static/media/logo-clean.jpg b/learning_observer/learning_observer/static/media/logo-clean.jpg new file mode 100644 index 000000000..145ad8f7b Binary files /dev/null and b/learning_observer/learning_observer/static/media/logo-clean.jpg differ diff --git a/learning_observer/learning_observer/static/media/logo.jpg b/learning_observer/learning_observer/static/media/logo.jpg new file mode 100644 index 000000000..35f6a1290 Binary files /dev/null and b/learning_observer/learning_observer/static/media/logo.jpg differ diff --git a/learning_observer/learning_observer/static/modules/course.html b/learning_observer/learning_observer/static/modules/course.html new file mode 100644 index 000000000..9ed7bf559 --- /dev/null +++ b/learning_observer/learning_observer/static/modules/course.html @@ -0,0 +1,43 @@ +
+
+
+ {{ name }} +
+
+
+
+ {{{ tools }}} +

+ {{ descriptionHeading }}

+
+
+ +
+ + + diff --git a/learning_observer/learning_observer/static/modules/courses.html b/learning_observer/learning_observer/static/modules/courses.html new file mode 100644 index 000000000..7f183437a --- /dev/null +++ b/learning_observer/learning_observer/static/modules/courses.html @@ -0,0 +1,21 @@ +
+

My Courses

+
+ +
+
diff --git a/learning_observer/learning_observer/static/modules/informational.html b/learning_observer/learning_observer/static/modules/informational.html new file mode 100644 index 000000000..85847dd42 --- /dev/null +++ b/learning_observer/learning_observer/static/modules/informational.html @@ -0,0 +1,5 @@ +
+
+ {{{text}}} +
+
diff --git a/learning_observer/learning_observer/static/modules/login.html b/learning_observer/learning_observer/static/modules/login.html new file mode 100644 index 000000000..fe4ab8e03 --- /dev/null +++ b/learning_observer/learning_observer/static/modules/login.html @@ -0,0 +1,65 @@ + + +
+
+
+

{{ server_name }}

+

{{ front_page_pitch }} +

+
+
+
+
+ +
+ + + + +
+
+ +
+ +
+ + + + +
+
+ +
+
+ +
+
+
+
+
+ + +

+ + + Contribute + on github

+
+
+
+
+ Learning Observer Logo +
+
+
diff --git a/learning_observer/learning_observer/static/modules/navbar_loggedin.html b/learning_observer/learning_observer/static/modules/navbar_loggedin.html new file mode 100644 index 000000000..949c80fe4 --- /dev/null +++ b/learning_observer/learning_observer/static/modules/navbar_loggedin.html @@ -0,0 +1,17 @@ + diff --git a/learning_observer/learning_observer/static/modules/tool.html b/learning_observer/learning_observer/static/modules/tool.html new file mode 100644 index 000000000..4ebfd143d --- /dev/null +++ b/learning_observer/learning_observer/static/modules/tool.html @@ -0,0 +1,8 @@ +

+ +

diff --git a/learning_observer/learning_observer/static/modules/unauth.md b/learning_observer/learning_observer/static/modules/unauth.md new file mode 100644 index 000000000..7644c259e --- /dev/null +++ b/learning_observer/learning_observer/static/modules/unauth.md @@ -0,0 +1,15 @@ +You do not have an account on this system. + +Your Google email is: **{{ email }}**. + +Your user ID is **{{ user_id }}**. + +If you believe you should have an account, please email the text above +(Google ID and email) to me (Piotr Mitros), and I'll set you up with +an account. If you're authorized, you should have my email already +(but if not, it's pmitros, followed by the @ symbol, followed by +ets.org). + +If you logged in with the wrong account, please [log +out](/auth/logout) and try again. You should use your official school +account. \ No newline at end of file diff --git a/learning_observer/learning_observer/static/ux.css b/learning_observer/learning_observer/static/ux.css new file mode 100644 index 000000000..e5cfbb507 --- /dev/null +++ b/learning_observer/learning_observer/static/ux.css @@ -0,0 +1,7 @@ +.wo-row-tile { + min-height: 350px; +} + +.wo-col-tile { + min-height: 350px; +} diff --git a/learning_observer/learning_observer/static/webapp.html b/learning_observer/learning_observer/static/webapp.html new file mode 100644 index 000000000..7ca747d69 --- /dev/null +++ b/learning_observer/learning_observer/static/webapp.html @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + Writing Analysis + + + +
+ + + + +
+ +
+ +
+ + diff --git a/learning_observer/learning_observer/static/webapp.js b/learning_observer/learning_observer/static/webapp.js new file mode 100644 index 000000000..c4e707f65 --- /dev/null +++ b/learning_observer/learning_observer/static/webapp.js @@ -0,0 +1,251 @@ +function go_home() { + /* + Load the homepage. + */ + window.location.href="/"; +} + +function error(error_message) { + /* + Show an error message. + + TODO: Do this at least somewhat gracefully. + */ + alert("Error: "+error_message); + go_home(); +} + +function ajax(config) +{ + return function(url) { + // Do AJAX calls with error handling + return new Promise(function(resolve, reject) { + config.d3.json(url) + .then(function(data){ + resolve(data); + }) + .catch(function(data){ + reject(data); + }); + }); + } +} + + +requirejs( + // TODO: Clean up absolute paths. We hardcoded these for now, due to refactor. + ["/static/3rd_party/text.js!/config.json", + "/static/3rd_party/text.js!/webapi/course_dashboards", // Perhaps this belongs in config.json? + "/static/3rd_party/d3.v5.min.js", + "/static/3rd_party/mustache.min.js", + "/static/3rd_party/showdown.js", + "/static/3rd_party/fontawesome.js", + "/static/3rd_party/text.js!/static/modules/unauth.md", + "/static/3rd_party/text.js!/static/modules/login.html", + "/static/3rd_party/text.js!/static/modules/courses.html", + "/static/3rd_party/text.js!/static/modules/course.html", + "/static/3rd_party/text.js!/static/modules/tool.html", + "/static/3rd_party/text.js!/static/modules/navbar_loggedin.html", + "/static/3rd_party/text.js!/static/modules/informational.html", + "/static/3rd_party/text.js!/auth/userinfo" + ], + function(config, tool_list, d3, mustache, showdown, fontawesome, unauth, login, courses, course, tool, navbar_li, info, auth_info) { + // Parse client configuration. 
+ config = JSON.parse(config); + console.log(tool_list); + tool_list = JSON.parse(tool_list); + console.log(tool_list); + console.log(auth_info); + console.log(JSON.stringify(auth_info)); + // Add libraries + config.d3 = d3; + config.ajax = ajax(config); + auth_info = JSON.parse(auth_info); + + // Reload user info + function reload_user_info() { + config.ajax("/auth/userinfo") + .then(function(data) { + auth_info = data; + console.log(auth_info); + console.log(JSON.stringify(auth_info)); + console.log("reloaded user info"); + }); + console.log(auth_info); + } + + + function password_authorize() { + d3.json("/auth/login/password", { + method: 'POST', + headers: { + "Content-type": "application/json; charset=UTF-8" + }, + body: JSON.stringify({ + username: d3.select(".lo-login-username").property("value"), + password: d3.select(".lo-login-password").property("value") + }) + }).then(function(data) { + reload_user_info(); + if (data['status'] === 'authorized') { + load_courses_page(); + } else if (data['status'] === 'unauthorized') { + // TODO: Flash a nice subtle message + alert("Invalid username or password!"); + } + else { + console.log(data); + } + }); + } + + function load_login_page() { + d3.select(".main-page").html(mustache.render(login, config['theme'])); + d3.select(".lo-google-auth").classed("is-hidden", !config['google_oauth']); + d3.select(".lo-http-auth").classed("is-hidden", !config['http_basic_auth']); + d3.select(".lo-password-auth").classed("is-hidden", !config['password_auth']); + d3.select(".lo-login-button") + .on("click", function() { + password_authorize(); + }); + } + + function load_courses_page() { + /* + Listing of Google Classroom courses + */ + d3.select(".main-page").html(courses); + config.ajax("/webapi/courselist/").then(function(data){ + /* + TODO: We want a function which does this abstracted + our. In essense, we want to call + d3.json_with_auth_and_errors + */ + if(data["error"]!=null) { + if(data["error"]["status"]==="UNAUTHENTICATED") { + load_login_page(); + } + else { + error("Unknown error!"); + } + } else { + let cdg = d3.select(".awd-course-list"); + cdg.selectAll("div.awd-course-card") + .data(data) + .enter() + .append("div") + .html(function(course_json) { + console.log(course_json); + let tools = ""; + for(var i=0; i".format(name=self.name) + + def __eq__(self, other): + if not isinstance(other, EventField): + return False + + return self.event == other.event + + def __lt__(self, other): + if not isinstance(other, EventField): + raise TypeError("< not supported between instances of 'EventField' and other types") + + return self.event < other.event + + +KeyStateType = enum.Enum("KeyStateType", "INTERNAL EXTERNAL") + +# This is a set of fields which we use to index reducers. For example, +# if we'd like to know how many students accessed a specific Google +# Doc, we might create a RESOURCE key (which would receive events for +# all students accessing that resource). If we'd like to keep track of +# a students' work in a particular Google Doc, we'd create a +# STUDENT/RESOURCE key. +# +# At some point, this shouldn't be hardcoded +# +# We'd also like a better way to think of the hierarchy of assignments than ITEM/ASSIGNMENT +KeyFields = [ + "STUDENT", # A single student + "CLASS", # A group of students. Typically, one class roster in Google Classroom + "RESOURCE" # E.g. One Google Doc + # "ASSIGNMENT" # E.g. A collection of Google Docs (e.g. notes, outline, draft) + # "TEACHER" # + # ... # ... and so on. 
+] + +KeyField = enum.Enum("KeyField", " ".join(KeyFields)) + + +class Scope(frozenset): + ''' + A scope is a set of KeyFields and EventFields. + ''' + pass + + +class ScopeFieldError(Exception): + ''' + Exception used if we e.g. try to add an incorrect type to a scope, have + a mismatched key to a scope, etc. Perhaps this might be a few exceptions + in the future. + ''' + pass diff --git a/learning_observer/learning_observer/stream_analytics/helpers.py b/learning_observer/learning_observer/stream_analytics/helpers.py new file mode 100644 index 000000000..d47b3399d --- /dev/null +++ b/learning_observer/learning_observer/stream_analytics/helpers.py @@ -0,0 +1,359 @@ +''' +Common utility functions for working with analytics modules. + +The goal is to have the modules be pluggable and independent of the +system. For now, our overall system diagram is: + ++---------------+ +| | +-------------+ +| Event Source ---| | Key-Value | +| | | | Store | ++---------------+ | | | ++---------------+ | +-----------+ <------|-- Internal | +| | | | | -------|-> State | +------------+ +------------+ +| Event Source --------|---->| Reducer | | | | | | | +| | | | | | --------|-> External -------->| Aggregator |----> | Dashboard | ++---------------+ | | +-----------+ | State | | | | | ++---------------+ | | | | +------------+ +------------+ +| | | | +-------------+ +| Event Source ----| | +| | | ++---------------+ v + +------------+ + | | + | Archival | + | Repository | + | | + +------------+ + +We create reducers with the `student_event_reducer` decorator. In the +longer term, we'll want to be able to plug together different +aggregators, state types, etc. We'll also want different keys for +reducers (per-student, per-resource, etc.). For now, though, this +works. +''' + +import copy +import functools + +import learning_observer.kvs +from learning_observer.stream_analytics.fields import KeyStateType, KeyField, EventField, Scope + +from learning_observer.log_event import debug_log + +# Not a great place to have this import... things might get circular at +# some point. +import learning_observer.module_loader + + +def fully_qualified_function_name(func): + ''' + Takes a function. Return a fully-qualified string with a name for + that function. E.g.: + + >>> from math import sin + >>> fully_qualified_function_name(math.sin) + 'math.sin' + + This is helpful for then giving unique names to analytics modules. Each module can + be uniquely referenced based on its reduce function. + ''' + return "{module}.{function}".format( + module=func.__module__, + function=func.__qualname__ + ) + + +def make_key_from_json(js): + ''' + This will make a key from a json dictionary + + Note that we ought to do auth / auth upstream of calling this + function. + + E.g. we might pass in: + { + "source": "da_timeline.visualize.handle_event", + "KeyField.STUDENT": "guest-424d691e92afb0ac8aeze585b1d28a49" + } + + And get out: + + 'Internal,da_timeline.visualize.handle_event,STUDENT:guest-424d691e92afb0ac8aeze585b1d28a49' + + This does extensive sanitation, since the JSON typically comes + from a browser + ''' + js = copy.deepcopy(js) + # We want to copy over KeyFields, converting them to `enum`s + # + # This sanitizes them in the process. + key_dict = {} + for key in KeyField: + if str(key) in js: + key_dict[key] = js[str(key)] + del js[str(key)] + + # Next, we want to copy over EventFields. 
+ # We have no way to sanitize these, since they're open-ended, except + # to make sure they don't contain magic characters + for key in list(js): + if key.startswith("EventField."): + event = js[key][len("EventField."):] + key_dict[EventField(event)] = js[event] + del js[key] + + stream_module = js['source'] + + key_list = [ + KeyStateType.EXTERNAL.name, + ] + + if KeyField.STUDENT in js: + user_id = j[sKeyField.STUDENT] + + aggregator_functions = sum( + [ + a['sources'] + for a in learning_observer.module_loader.course_aggregators().values() + ], + [] + ) + + agg_function = None + for func in aggregator_functions: + if fully_qualified_function_name(func) == js['source']: + agg_function = func + + if agg_function is None: + raise ArgumentError("Invalid function") + + return make_key( + agg_function, + key_dict, + KeyStateType.INTERNAL + ) + + +def make_key(func, key_dict, state_type): + ''' + Create a KVS key. + + It combines: + + * A fully-qualified name for the reducer function + * A dictionary of fields + * Whether the key is internal or external + + Into a unique string + + For example: + >>> make_key( + some_module.reducer, + {h.KeyField.STUDENT: 123}, + h.KeyStateType.INTERNAL + ) + 'Internal,some_module.reducer,STUDENT:123' + ''' + # pylint: disable=isinstance-second-argument-not-valid-type + assert isinstance(state_type, KeyStateType) + assert callable(func) + + streammodule = fully_qualified_function_name(func) + + safe_user_id = key_dict[KeyField.STUDENT] + + # Key starts with whether it is internal versus external state, and what module it comes from + key_list = [ + state_type.name.capitalize(), + streammodule + ] + + # It continues with the fields. These are organized as key-value + # pairs. These need a well-defined order. I'm sure there's a + # logical order here, but for now, we do alphabetical. + # + # We will want to be able to do reduce operations across multiple + # axes. This is where an RDS with multiple indexes might be nice, + # if we can figure out the sharding, etc. Another alternative + # might be to use postgres to organize things (which changes + # rarely), but to keep actual key/value pairs in redis (which + # changes a lot). + for key in sorted(key_dict.keys(), key=lambda x: x.name): + key_list.append("{key}:{value}".format(key=key.name, value=key_dict[key])) + + # And we return this as comma-seperated values + return ",".join(key_list) + + +def kvs_pipeline( + null_state=None, + scope=None, + module_override=None, + qualname_override=None +): + ''' + Closures, anyone? + + There's a bit to unpack here. + + Top-level function. This allows us to configure the decorator (and + returns the decorator). + + * `null_state` tells us the empty state, before any reduce operations have + happened. This can be important for the aggregator. We're documenting the + code before we've written it, so please make sure this works before using. + * `scope` tells us the scope we reduce over. See `fields.Scope` + ''' + if scope is None: + debug_log("TODO: explicitly specify a scope") + debug_log("Defaulting to student scope") + scope = Scope([KeyField.STUDENT]) + + def decorator( + func + ): + ''' + The decorator itself. + + It takes a function which expects an event and an (internal) state from + the KVS, and outputs an internal and an external state. We should + consider removing the concept of an external state. The idea was that + we could make modules with just reducers (where all aggregation, etc. + was handled automatically). This isn't as central to the current + design. 
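+
+        As a rough, hypothetical sketch (the reducer name and state shape here
+        are purely illustrative, not part of the system), a decorated reducer
+        might look like:
+
+            @kvs_pipeline(null_state={'count': 0}, scope=Scope([KeyField.STUDENT]))
+            async def count_events(event, internal_state):
+                internal_state['count'] += 1
+                return internal_state, internal_state
+
+        Awaiting the decorated object with connection metadata returns the
+        per-session event processor (see `wrapper_closure` below).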
+ + For interactive development, we allow overriding the `__module__` and + `__qualname__` of the function. This is helpful in places like Jupyer + notebooks, since this is used for setting keys. + + We could, as an alternative, pass these as additional parameters to + `make_key`, and `setattr` just over `wrapper_closure` to avoid side + effects. + ''' + if qualname_override is not None: + setattr(func, '__qualname__', qualname_override) + if module_override is not None: + setattr(func, '__module__', module_override) + + @functools.wraps(func) + async def wrapper_closure(metadata): + ''' + The decorator itself. We create a function that, when called, + creates an event processing pipeline. It keeps a pointer + to the KVS inside of the closure. This way, each pipeline has + its own KVS. This is the level at which we want consistency, + want to allow sharding, etc. If two users are connected, each + will have their own data store connection. + ''' + taskkvs = learning_observer.kvs.KVS() + + async def process_event(event, event_fields={}): + ''' + This is the function which processes events. It calls the event + processor, passes in the event(s) and state. It takes + the internal state and the external state from the + event processor. The internal state goes into the KVS + for use in the next call, while the external state + returns to the dashboard. + + The external state should include everything needed + for the dashboard visualization and exclude anything + large or private. The internal state needs everything + needed to continue reducing the events. + ''' + # TODO: Think through concurrency. + # + # We could put this inside of a transaction, but we + # would lose a few orders of magnitude in performance. + # + # We could keep this outside of a transaction, and handle + # occasional issues. + # + # It's worth noting that: + # + # 1. We have an archival record, and we can replay if there + # are issues + # 2. We keep this open on a per-session basis. The only way + # we might run into concurrency issues is if a student + # is e.g. actively editing on two computers at the same + # time + # 3. If we assume e.g. occasional disconnected operation, as + # on a mobile device, we'll have concurrency problems no + # matter what. In many cases, we should handle this + # explicitly rather than implicitly, for example, with + # conflict-free replicated data type (CRDTs) or explicit + # merge operation + # + # Fun! + # + # But we can think of more ways we might get concurrency + # issues in the future, once we do per-class / per-resource / + # etc. reducers. + # + # * We could funnel these into a common reducer. That'd be easy + # enough and probably the right long-term solution + # * We could have modules explicitly indicate where they need + # thread safety and transactions. That'd be easy enough. + + keydict = {} + # Step 1: Handle auth metadata. + if KeyField.STUDENT in scope: + if metadata is not None and 'auth' in metadata: + safe_user_id = metadata['auth']['safe_user_id'] + else: + # In general, this path should NOT be followed. If we + # want guest accounts, each user ought to have a unique + # identifier or cookie assigned on first access. + safe_user_id = '[guest]' + keydict[KeyField.STUDENT] = safe_user_id + + # Step 2: Handle all other metadata. 
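+                # Any remaining scope fields should be EventFields; their values
+                # come from `event_fields`, defaulting to None when the incoming
+                # event did not provide them. Fields already set in Step 1 (auth)
+                # are left untouched; anything else is an error.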
+ for field in scope: + # We don't want to override auth fields + if field in keydict: + pass + elif isinstance(field, EventField): + keydict[field] = event_fields.get(field.event, None) + else: + raise Exception("Unknown field", field) + + internal_key = make_key( + func, + keydict, + KeyStateType.INTERNAL + ) + external_key = make_key( + func, + keydict, + KeyStateType.EXTERNAL + ) + + internal_state = await taskkvs[internal_key] + if internal_state is None: + internal_state = copy.deepcopy(null_state) + await taskkvs.set(internal_key, internal_state) + + internal_state, external_state = await func( + event, internal_state + ) + + # We would like to give reducers the option to /not/ write + # on all events + if internal_state is not False: + await taskkvs.set(internal_key, internal_state) + if external_state is not False: + await taskkvs.set(external_key, external_state) + return external_state + return process_event + return wrapper_closure + return decorator + + +# `kvs_pipeline`, in it's current incarnation, is obsolete. +# +# We will now have reducers of multiple types. +# +# We will probably keep `kvs_pipeline` as a generic, and this is part of that +# transition. +student_event_reducer = functools.partial(kvs_pipeline, scope=Scope([KeyField.STUDENT])) diff --git a/learning_observer/learning_observer/synthetic_student_data.py b/learning_observer/learning_observer/synthetic_student_data.py new file mode 100644 index 000000000..8bec47715 --- /dev/null +++ b/learning_observer/learning_observer/synthetic_student_data.py @@ -0,0 +1,61 @@ +''' +Note that current loremipsum in `pip` is not Python 3 +compatible. If you are getting b'' in your text, the +patch is at: + +`https://github.com/monkeython/loremipsum/issues/10` +''' + +import random + +import numpy +import numpy.random + +import loremipsum +import names + +import learning_observer.util as util + + +def synthetic_student_data(student_id): + ''' + Create fake student data for mock-up UX for one student + ''' + name = names.get_first_name() + essay = "\n".join(loremipsum.get_paragraphs(5)) + return { + 'id': student_id, + 'name': name, + 'email': "{name}@school.district.us".format(name=name), + 'address': "1 Main St", + 'phone': "({pre})-{mid}-{post}".format( + pre=random.randint(200, 999), + mid=random.randint(200, 999), + post=random.randint(1000, 9999)), + 'avatar': "avatar-{number}".format(number=random.randint(0, 14)), + 'ici': random.uniform(100, 1000), + 'essay_length': len(essay), + 'essay': essay, + 'writing_time': random.uniform(5, 60), + 'text_complexity': random.uniform(3, 9), + 'google_doc': "https://docs.google.com/document/d/1YbtJGn7ida2IYNgwCFk3SjhsZ0ztpG5bMzA3WNbVNhU/edit", + 'time_idle': numpy.random.gamma(0.5, scale=5), + 'outline': [{"section": "Problem " + str(i + 1), + "length": random.randint(1, 300)} for i in range(5)], + 'revisions': {} + } + + +def synthetic_data(student_count=20): + ''' + Generate paginated mock student data for `student_count` students. + ''' + data = [ + synthetic_student_data(i) + for i in range(student_count) + ] + return util.paginate(data, 4) + + +if __name__ == '__main__': + print(synthetic_data()) diff --git a/learning_observer/learning_observer/util.py b/learning_observer/learning_observer/util.py new file mode 100644 index 000000000..7273b3784 --- /dev/null +++ b/learning_observer/learning_observer/util.py @@ -0,0 +1,128 @@ +''' +Random helper functions. + +Design invariant: + +* This should not rely on anything in the system. 
+ +We can relax the design invariant, but we should think carefully +before doing so. +''' + +import hashlib +import math +import re + + +def paginate(data_list, nrows): + ''' + Paginate list `data_list` into `nrows`-item rows. + + This should move into the client + ''' + return [ + data_list[i * nrows:(i + 1) * nrows] + for i in range(math.ceil(len(data_list) / nrows)) + ] + + +def to_safe_filename(name): + ''' + Convert a name to a filename. The filename escapes any non-alphanumeric + characters, so there are no invalid or control characters. + + Can be converted back with `from_filename` + + For example, { would be encoded as -123- since { is character 123 in UTF-8. + ''' + return ''.join( + '-' + str(ord(c)) + '-' if not c.isidentifier() and not c.isalnum() else c + for c in name + ) + + +def from_safe_filename(filename): + ''' + Convert a filename back to a name. + + See `to_filename` for more information. + + Uses `re`, uncompiled, so probably not very fast. Right now, this is used + for testing / debugging, but might be worth optimizing if we ever use it + otherwise. + ''' + return re.sub(r'-(\d+)-', lambda m: chr(int(m.group(1))), filename) + + +def url_pathname(s): + """ + Remove URL and domain from a URL. Return the full remainder of the path. + + Input: https://www.googleapis.com/drive/v3/files + Output: drive/v3/files + + Note that in contrast to the JavaScript version, we don't include the + initial slash. + """ + return s.split('/', 3)[-1] + + +def translate_json_keys(d, translations): + """ + Replace all of the keys in the dictionary with new keys, including + sub-dictionaries. This was written for converting CamelCase from + Google APIs to snake_case. + + Note that this mutates the original data structure + """ + if isinstance(d, list): + for item in d: + translate_json_keys(item, translations) + elif isinstance(d, dict): + for k, v in list(d.items()): + if k in translations: + d[translations[k]] = d.pop(k) + else: + pass # print("UNTRANSLATED KEY: ", k) + + if isinstance(v, dict) or isinstance(v, list): + translate_json_keys(v, translations) + return d + + +def secure_hash(text): + ''' + Our standard hash functions. We can either use either + + * A full hash (e.g. SHA3 512) which should be secure against + intentional attacks (e.g. a well-resourced entity wants to temper + with our data, or if Moore's Law starts up again, a well-resourced + teenager). + + * A short hash (e.g. MD5), which is no longer considered + cryptographically-secure, but is good enough to deter casual + tempering. Most "tempering" comes from bugs, rather than attackers, + so this is very helpful still. MD5 hashes are a bit more manageable + in size. + + For now, we're using full hashes everywhere, but it would probably + make sense to alternate as makes sense. MD5 is 32 characters, while + SHA3_512 is 128 characters (104 if we B32 encode). + ''' + return "SHA512_" + hashlib.sha3_512(text).hexdigest() + + +def insecure_hash(text): + ''' + See `secure_hash` above for documentation + ''' + return "MD5_" + hashlib.md5(text).hexdigest() + + +# And a test case +if __name__ == '__main__': + assert to_safe_filename('{') == '-123-' + assert from_safe_filename('-123-') == '{' + test_string = "Hello? How are -- you doing? 
łłł" + assert from_safe_filename(to_safe_filename(test_string)) == test_string + assert url_pathname('https://www.googleapis.com/drive/v3/files') == 'drive/v3/files' diff --git a/learning_observer/learning_observer/utility_handlers.py b/learning_observer/learning_observer/utility_handlers.py new file mode 100644 index 000000000..e8f6173bd --- /dev/null +++ b/learning_observer/learning_observer/utility_handlers.py @@ -0,0 +1,73 @@ +''' +Helpful extra handlers +''' + +import os +import os.path + +import aiohttp +import aiohttp.web + +import pathvalidate + + +# This should be cleaned up. Imports generally. We're mid-refactor... +from learning_observer.log_event import debug_log + + +def static_file_handler(filename): + ''' + Serve a single static file + ''' + async def handler(request): + debug_log(request.headers) + return aiohttp.web.FileResponse(filename) + return handler + + +def redirect(new_path): + ''' + Static, fixed redirect to a new location + ''' + async def handler(request): + raise aiohttp.web.HTTPFound(location=new_path) + return handler + + +def static_directory_handler(basepath): + ''' + Serve static files from a directory. + + This could be done directly by nginx on deployment. + + This is very minimal for now: No subdirectories, no gizmos, + nothing fancy. I avoid fancy when we have user input and + filenames. Before adding fancy, I'll want test cases of + aggressive user input. + ''' + + def handler(request): + ''' + We're in a closure, since we want to configure the directory + when we set up the path. + ''' + # Extract the filename from the request + filename = request.match_info['filename'] + # Raise an exception if we get anything nasty + pathvalidate.validate_filename(filename) + # Check that the file exists + full_pathname = os.path.join(basepath, filename) + if not os.path.exists(full_pathname): + raise aiohttp.web.HTTPNotFound() + # And serve pack the file + return aiohttp.web.FileResponse(full_pathname) + return handler + + +def ajax_handler_wrapper(handler_func): + ''' + Wrap a function which returns a JSON object to handle requests + ''' + def handler(request): + return aiohttp.web.json_response(handler_func()) + return handler diff --git a/learning_observer/learning_observer/watchdog_observer.py b/learning_observer/learning_observer/watchdog_observer.py new file mode 100644 index 000000000..9e760eb2e --- /dev/null +++ b/learning_observer/learning_observer/watchdog_observer.py @@ -0,0 +1,168 @@ +''' +This is a subsystem designed to restart the system if files changed. + +It has two modes: + + - reimport: It will reimport all modules in the local directory. + - restart: It will hard restart the system + +It currently does not work. We need to make this work with asyncio: + +https://gist.github.com/mivade/f4cb26c282d421a62e8b9a341c7c65f6 + +However, we wanted to commit it since it doesn't break anything, and +we wanted everything to be is in sync. It is behind a feature flag, +and disabled by +''' + +import watchdog + +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler + +import importlib +import os +import os.path +import sys +import time +import logging +import traceback + +from watchdog.observers import Observer +from watchdog.events import LoggingEventHandler + + +LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) + + +def reimport_child_modules(paths=[LOCAL_PATH]): + ''' + Reload all modules which are in the given paths. 
+ + This is used when we are running in watchdog mode, and we want to + restart parts of the server when a file changes. + + This does not do a full restart. See: + https://docs.python.org/3/library/importlib.html#importlib.reload + + We should probably be doing a full restart, but we wrote this before + we had a full restart option. Perhaps we should remove this? We'll + decide once we see how useful both options are. + + Args: + paths: A list of paths to search for modules. + + Returns: + A list of modules that were reloaded, a list of modules that + failed to reload, and a list of modules that we skipped (e.g. + system modules). + + If no path is specified, it defaults to the base directory of this file. + ''' + modules = list(sys.modules.values()) + reloaded = [] + failed = [] + + for module in modules: + # Only reload modules that are in the specified paths, + # and only if they are not system modules. + # + # A lot of these checks are probably redundant, but + # better safe than sorry. There is no ideal way to + # determine if a module should be reloaded, so this + # is a bit heuristic. + if not hasattr(module, '__file__'): + continue + if module.__file__ is None: + continue + if not module.__file__.endswith('.py'): + continue + if not any(module.__file__.startswith(path) for path in paths): + continue + if module.__name__.startswith('_'): + continue + if module.__name__ in sys.builtin_module_names: + continue + if not os.path.exists(module.__file__): + continue + if "SourceFileLoader" not in str(module.__loader__): + continue + try: + importlib.reload(module) + print('reloaded %s' % module.__name__) + reloaded.append(module) + except Exception: + print("Failed to reload %s" % module.__name__) + traceback.print_exc() + failed.append(module) + skipped = [m for m in modules if m not in reloaded and m not in failed] + return { + "reloaded": reloaded, + "failed": failed, + "skipped": skipped + } + + +def restart(): + ''' + Restart the system. + ''' + os.execl(sys.executable, sys.executable, *sys.argv) + + +class RestartHandler(FileSystemEventHandler): + ''' + Soft restart the server when a file changes. + + We could even just re-import the one file instead of everything? + ''' + def __init__(self, shutdown, restart, start): + self.shutdown = shutdown + self.restart = restart + self.start = start + + def on_any_event(self, event): + ''' + On any change in the file system, restart the server. + + We should be more selective, looking only at Python files, config file, + and skipping cache files, but for now we'll restart on any change, + since this is helpful for testing this module. + ''' + if event.is_directory: + return None + print("Reloading server") + self.shutdown() + # observer.stop() + # observer.join() + self.restart() + # We only make it beyond this point for some of the softer restarts. + self.start() + + +def watchdog(handler=LoggingEventHandler()): + ''' + Set up watchdog mode. This will (eventually) reimport on file changes. 
+ ''' + event_handler = LoggingEventHandler() + observer = Observer() + print("Watching for changes in:", LOCAL_PATH) + observer.schedule(event_handler, LOCAL_PATH, recursive=True) + observer.start() + return observer + + +# observer = Observer() +# observer.start() + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + observer = watchdog() + try: + while True: + time.sleep(1) + finally: + observer.stop() + observer.join() diff --git a/learning_observer/learning_observer/webapp_helpers.py b/learning_observer/learning_observer/webapp_helpers.py new file mode 100644 index 000000000..cb53b326f --- /dev/null +++ b/learning_observer/learning_observer/webapp_helpers.py @@ -0,0 +1,101 @@ +''' +This file contains assorted middlewares and helpers +''' +import errno +import socket + +import aiohttp_cors + +import aiohttp_session +import aiohttp_session.cookie_storage + +import learning_observer.auth +from learning_observer.log_event import debug_log +import learning_observer.settings as settings + + +async def request_logger_middleware(request, handler): + ''' + Print all hits. Helpful for debugging. Should eventually go into a + log file. + ''' + debug_log(request) + + +async def add_nocache_middleware(request, response): + ''' + This prevents the browser from caching pages. + + Browsers do wonky things when logging in / out, keeping old pages + around. Caching generally seems like a train wreck for this system. + There's a lot of cleanup we can do to make this more robust, but + for now, this is a good enough solution. + ''' + if '/static/' not in str(request.url): + response.headers['cache-control'] = 'no-cache' + + +def setup_middlewares(app): + ''' + This is a helper function to setup middlewares. + ''' + app.on_response_prepare.append(request_logger_middleware) + # Avoid caching. We should be more specific about what we want to + # cache. + app.on_response_prepare.append(add_nocache_middleware) + app.middlewares.append(learning_observer.auth.auth_middleware) + + +def setup_session_storage(app): + ''' + This is a helper function to setup session storage. + ''' + aiohttp_session.setup(app, aiohttp_session.cookie_storage.EncryptedCookieStorage( + learning_observer.auth.fernet_key(settings.settings['aio']['session_secret']), + max_age=settings.settings['aio']['session_max_age'])) + + +def find_open_port(): + """ + Find an open port to run on. + + By default, run on port 8888. If in use, move up ports, until we find + one that is not in use. + + Returns: + int: The open port. + """ + port = 8888 + bound = False + while not bound: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.bind(("127.0.0.1", port)) + bound = True + except socket.error as e: + if e.errno == errno.EADDRINUSE: + bound = False + port = port + 1 + else: + raise + s.close() + return port + + +def setup_cors(app): + ''' + This is a helper function to setup CORS. + + This setup is overly broad. We need this for incoming events + and similar, but we don't want to expose the entire API + through this as we do here. + + TODO: Handle auth / auth more specifically on individual routes. 
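+
+    With aiohttp_cors, routes that should answer CORS requests typically still
+    need to be added to the returned setup object, e.g.
+    `for route in list(app.router.routes()): cors.add(route)`; the code below
+    only configures the defaults, so route registration presumably happens
+    elsewhere.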
+ ''' + cors = aiohttp_cors.setup(app, defaults={ + "*": aiohttp_cors.ResourceOptions( + allow_credentials=True, + expose_headers="*", + allow_headers="*", + ) + }) diff --git a/learning_observer/prototypes/README.md b/learning_observer/prototypes/README.md new file mode 100644 index 000000000..6cd6c490c --- /dev/null +++ b/learning_observer/prototypes/README.md @@ -0,0 +1,15 @@ +Early Prototypes and Draft Code +=============================== + +This is a place for unfinished code... I don't commit most prototypes, +but once something is discussion-worthy or a good starting point, I +sometimes do. + +Prototypes and proofs-of-concept: + +* Help scope out future work +* Help understand capabilities and limitations +* Help us learn +* Are sometimes starting points for implementation + +Code here is of mixed quality, obviously. \ No newline at end of file diff --git a/webapp/orm.py b/learning_observer/prototypes/deprecated/orm.py similarity index 72% rename from webapp/orm.py rename to learning_observer/prototypes/deprecated/orm.py index d7e6395e8..d1b222611 100644 --- a/webapp/orm.py +++ b/learning_observer/prototypes/deprecated/orm.py @@ -1,4 +1,7 @@ -''' Abstraction to access database ''' +''' +THIS FILE IS NOT CURRENTLY USED. WE ARE PROTOTYPING. + +Abstraction to access database ''' import asyncio import functools @@ -13,6 +16,7 @@ stored_procedures = {} + async def initialize(reset=False): global conn print("Connecting to database...") @@ -20,29 +24,35 @@ async def initialize(reset=False): conn = await asyncpg.connect() if reset: await conn.execute(sql_statements['reset']) - + # Set up tables and stored procedures, if they don't exist. await conn.execute(sql_statements['init']) # Set up stored procedures for stored_procedure in sql_statements['stored_procedures']: - stored_procedures[stored_procedure] = \ - await conn.prepare(sql_statements['stored_procedures'][stored_procedure]) + stored_procedures[stored_procedure] = await conn.prepare( + sql_statements['stored_procedures'][stored_procedure] + ) print("Connected...") asyncio.get_event_loop().run_until_complete(initialize()) + # TODO: This should be done with a decorator, rather than cut-and-paste def fetch_events(username, docstring): return stored_procedures['fetch_events'].cursor(username, docstring) -async def insert_event (username, docstring, event): - rv = await stored_procedures['insert_event'].fetchval(username, docstring, event) + +async def insert_event(username, docstring, event): + rv = await stored_procedures['insert_event'].fetchval( + username, docstring, event + ) return rv - + + if __name__ == '__main__': async def test(): - print(await insert_event ("pmitros", "doc", json.dumps({ + print(await insert_event("pmitros", "doc", json.dumps({ "ty": "ts", "si": 5, "ei": 7, @@ -53,6 +63,6 @@ async def test(): async with conn.transaction(): cursor = fetch_events("pmitros", "doc") async for record in cursor: - print (record) - + print(record) + asyncio.get_event_loop().run_until_complete(test()) diff --git a/learning_observer/prototypes/google_docs/README.md b/learning_observer/prototypes/google_docs/README.md new file mode 100644 index 000000000..e572fcfc7 --- /dev/null +++ b/learning_observer/prototypes/google_docs/README.md @@ -0,0 +1,44 @@ +Google Docs APIs +================ + +These are experiments with the Google Docs and Google Drive APIs. On +the whole, the APIs are developer-friendly and easy-to-use. They run +into a few brick walls for our use-case. 
Upsides: + +* We can grab ground truth documents via the Google Drive and Google + Docs APIs, at least assuming the document is shared with the teacher + (which may or may not be the case), or by the student. +* We can grab [comments](Link https://developers.google.com/drive/api/v3/reference/comments), + [Revisions](https://developers.google.com/drive/api/v3/reference/revisions) (although not + with the same granularity as our extension), + [comment replies](https://developers.google.com/resources/api-libraries/documentation/drive/v3/python/latest/drive_v3.replies.html), + and [suggested revisions](https://developers.google.com/docs/api/how-tos/suggestions) +* There is a poorly-document API which appears to monitor for changes (https://developers.google.com/drive/api/v3/reference/channels) +* Some of the APIs include [indexes](https://developers.google.com/docs/api/how-tos/overview) +* We can get a lot more through Vault, but I'm not sure schools would + grant us that kind of access. It's also tough to test too, since it + requires a Google Workspace account of the right type. + +The major constraints are: + +* Google's permissions and auth system, which isn't really designed + for automation or monitoring. They're designed to grant short-term, + expiring access, although it looks like Google recently added + [service accounts](https://github.com/googleapis/google-api-python-client/blob/master/docs/oauth-server.md) + which may address this issue. +* They're not designed for realtime use (e.g. monitoring writing + processes) + +The APIs couldn't replace our pipeline, but would be a helpful +supplement. + +Note that this code would need to be rewritten for *Writing Observer*, +since the [client +library](https://github.com/googleapis/google-api-python-client/blob/master/docs/README.md) +we're using is not asynchronous, and would lead to performance issues. + +We also (in the current version) do no pagination; this is just to +understand the types of data returned. + +To get started, you will need a `credentials.json` from Google's API +console, set up for a desktop application. \ No newline at end of file diff --git a/learning_observer/prototypes/google_docs/google_apis.py b/learning_observer/prototypes/google_docs/google_apis.py new file mode 100644 index 000000000..4368c4c4c --- /dev/null +++ b/learning_observer/prototypes/google_docs/google_apis.py @@ -0,0 +1,113 @@ +# TODO/HACK/Unfinished: +# +# * We do *not* handle pagination in this prototype. + +import argparse +import os.path + +import json + +from googleapiclient.discovery import build +from google_auth_oauthlib.flow import InstalledAppFlow +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials + + +# If modifying these scopes, delete the file token.json. +SCOPES = [ + 'https://www.googleapis.com/auth/documents.readonly', + 'https://www.googleapis.com/auth/drive.metadata.readonly', + 'https://www.googleapis.com/auth/drive.readonly' +] + + +def list_files(creds): + service = build('drive', 'v3', credentials=creds) + + # Call the Drive v3 API + results = service.files().list( + pageSize=10, fields="nextPageToken, files(id, name)").execute() + items = results.get('files', []) + print(items) + + +def document(creds, document_id): + service = build('docs', 'v1', credentials=creds) + + # This is an optional keyword parameter we should play with + # later. 
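+    # (In the Docs API this is the `suggestionsViewMode` argument to
+    # documents().get(); SUGGESTION_MODE below is selected but not yet
+    # passed along.)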
+ suggestion_modes = [ + "DEFAULT_FOR_CURRENT_ACCESS", + "SUGGESTIONS_INLINE", + "PREVIEW_SUGGESTIONS_ACCEPTED", + "PREVIEW_WITHOUT_SUGGESTIONS" + ] + + SUGGESTION_MODE = suggestion_modes[0] + + document = service.documents().get( + documentId=document_id + ).execute() + print('The title of the document is: {}'.format(document.get('title'))) + return document + + +def document_revisions(creds, document_id): + service = build('drive', 'v3', credentials=creds) + r = service.revisions() + return r.list(fileId=document_id).execute() + + +def document_comments(creds, document_id): + service = build('drive', 'v3', credentials=creds) + return service.comments().list( + fileId=document_id, + fields="*", + includeDeleted=True + ).execute() + + +def authenticate(): + creds = None + # The file token.json stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. + if os.path.exists('token.json'): + creds = Credentials.from_authorized_user_file('token.json', SCOPES) + # If there are no (valid) credentials available, let the user log in. + if not creds or not creds.valid: + if False and creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + 'credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open('token.json', 'w') as token: + token.write(creds.to_json()) + return creds + + +def main(document_id): + """Shows basic usage of the Docs API. + Prints the title of a sample document. + """ + creds = authenticate() + print(list_files(creds)) + print("Document:") + print(document(creds, document_id)) + with open("doc.json", "w") as fp: + fp.write(json.dumps(document(creds, document_id), indent=2)) + print("Document revisions:") + print(document_revisions(creds, document_id)) + print("Document comments:") + print(document_comments(creds, document_id)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "document_id", help="Google document ID. Usually 44 characters long" + ) + args = parser.parse_args() + main(args.document_id) diff --git a/learning_observer/prototypes/google_docs/requirements.txt b/learning_observer/prototypes/google_docs/requirements.txt new file mode 100644 index 000000000..b572a3930 --- /dev/null +++ b/learning_observer/prototypes/google_docs/requirements.txt @@ -0,0 +1,3 @@ +google-api-python-client +google-auth-httplib2 +google-auth-oauthlib diff --git a/learning_observer/prototypes/proxy.py b/learning_observer/prototypes/proxy.py new file mode 100644 index 000000000..bc41a939f --- /dev/null +++ b/learning_observer/prototypes/proxy.py @@ -0,0 +1,123 @@ +''' +This is a PROTOTYPE proxy request handler. It is designed to be used with +aiohttp. The goal is to be able to connnect to Jupyter Notebook servers, and +relay the requests to them. + +This currently works 90% with Jupyter Notebook servers running on localhost, +but runs into issues with API requests. That's probably some CSRF issue, or +similar. + +We could also try to monitor at the ZMQ level, and relay the requests to +the notebook server. 
+''' + +from ast import Not +import asyncio +from datetime import datetime +import re +import multidict + +from aiohttp import web +import aiohttp + +BASE_URL = "http://localhost:8889" + + +async def proxy( + base_url=BASE_URL, + source_port=8080, + target_port=8889, +): + async def proxy_handler(request): + ''' + Relay HTTP requests from 8080 to 8889 + + This is the main handler for the proxy. + ''' + print(request) + target_url = base_url + request.path + + cookies = request.cookies + headers = multidict.CIMultiDict(request.headers) + if "referer" in headers: + old_referer = headers['referer'] + headers.popall("referer") + headers['referer'] = old_referer.replace( + str(source_port), str(target_port) + ) + async with aiohttp.ClientSession() as client: + if request.method == "POST": + post_data = await request.post() + print("PD", post_data) + resp = await client.post( + target_url, + data=post_data, + cookies=cookies, + headers=headers + ) + elif request.method == "GET": + resp = await client.get( + target_url, + cookies=cookies, + headers=headers + ) + elif request.method == "PUT": + put_data = await request.post() + resp = await client.put( + target_url, + data=put_data, + cookies=cookies, + headers=headers + ) + else: + raise NotImplementedError( + "Unsupported method: " + request.method + ) + data = await resp.read() + + if resp.status == 200: + data = await resp.read() + return web.Response( + body=data, + status=resp.status, + headers=resp.headers + ) + elif resp.status == 301: + return web.HTTPFound(resp.headers['Location']) + elif resp.status == 302: + return web.HTTPFound(resp.headers['Location']) + elif resp.status == 304: + return web.Response(status=resp.status) + elif resp.status == 401: + return web.HTTPUnauthorized() + elif resp.status == 404: + return web.HTTPNotFound() + elif resp.status == 403: + print(resp) + print(data) + return web.HTTPForbidden() + else: + print("Error:", resp.status) + return web.HTTPInternalServerError() + return proxy_handler + + +async def init_app(): + ''' + This is the main entry point for testing the proxy. + + It creates a proxy server and runs it in a loop. This is useful for + testing the proxy without the full system. + ''' + app = web.Application() + p = await proxy() + app.router.add_get('/{path:.*}', p) + app.router.add_post('/{path:.*}', p) + app.router.add_put('/{path:.*}', p) + return app + + +if __name__ == '__main__': + loop = asyncio.get_event_loop() + app = loop.run_until_complete(init_app()) + web.run_app(app) diff --git a/learning_observer/prototypes/selenium_gdocs_automation/README.md b/learning_observer/prototypes/selenium_gdocs_automation/README.md new file mode 100644 index 000000000..6b8eb7424 --- /dev/null +++ b/learning_observer/prototypes/selenium_gdocs_automation/README.md @@ -0,0 +1,21 @@ +Front-end test infrastructure +============================= + +It'd be great if we could do front-end testing. This is a prototype of +using Google Docs with [Selenium](https://www.selenium.dev/). + +Conclusions: + +1. Google plays a game of cat-and-mouse to prevent front-end + automation. It's annoying. I presume this is to stop some kind of + fraud. One might think Google would have better ways to shut + down fraud, but in this case, Google chose to externalize costs + onto customers. +2. People figure out work-arounds and Google shuts them down +3. The code, as committed, works right now, by using + [undetected-chromedriver](https://pypi.org/project/undetected-chromedriver/). +4. But it's possible it will stop tomorrow. 
+ +Given the current size of the development team and the risk profile, I +decided not to throw more time into this right now. Joining the Google +cat-and-mouse game might make sense if/when the project expands. diff --git a/learning_observer/prototypes/selenium_gdocs_automation/selenium_gdoc.py b/learning_observer/prototypes/selenium_gdocs_automation/selenium_gdoc.py new file mode 100644 index 000000000..a89ac3165 --- /dev/null +++ b/learning_observer/prototypes/selenium_gdocs_automation/selenium_gdoc.py @@ -0,0 +1,110 @@ +''' +This is a script to log into Google Docs, and eventually do a +little bit of typing. + +This should not be used with your main Google account. + +For this to work, you will need to enable "Less secure app access." + +And then it still won't work... It's cat-and-mouse + +https://sqa.stackexchange.com/questions/42307/trying-to-login-to-gmail-with-selenium-but-this-browser-or-app-may-not-be-secur +https://stackoverflow.com/questions/60117232/selenium-google-login-block +https://stackoverflow.com/questions/57602974/gmail-is-blocking-login-via-automation-selenium + +undetected_chromedriver seems to work right now, but might stop tomorrow. +''' + +import os +import random +import sys +import time + +import undetected_chromedriver.v2 as uc +from selenium.webdriver.common.keys import Keys + +# I haven't validated this URL, and it should NOT be used in production unless +# it's confirmed to be a Google thing. I think it is, but I'm not sure. + +PLAYGROUND_OAUTH_URL = "https://accounts.google.com/o/oauth2/v2/auth/" \ + "oauthchooseaccount?redirect_uri=https%3A%2F%2Fdevelopers.google.com%" \ + "2Foauthplayground&prompt=consent&response_type=code&" \ + "client_id=407408718192.apps.googleusercontent.com&scope=email&" \ + "access_type=offline&flowName=GeneralOAuthFlow" + +chrome_options = uc.ChromeOptions() + +chrome_options.add_argument("--disable-extensions") +chrome_options.add_argument("--disable-popup-blocking") +chrome_options.add_argument("--profile-directory=Default") +chrome_options.add_argument("--ignore-certificate-errors") +chrome_options.add_argument("--disable-plugins-discovery") +chrome_options.add_argument("--incognito") +chrome_options.add_argument("user_agent=DN") +driver = uc.Chrome(options=chrome_options) + +driver.delete_all_cookies() + +driver.get(PLAYGROUND_OAUTH_URL) + +USERNAME_XPATH = "/html/body/div[1]/div[1]/div[2]/div/div[2]/div/div/div[2]" \ + "/div/div[1]/div/form/span/section/div/div/div[1]/div/div[1]/div/div[1]" \ + "/input" + +PASSWORD_XPATH = "/html/body/div[1]/div[1]/div[2]/div/div[2]/div/div/div[2]/" \ + "div/div[1]/div/form/span/section/div/div/div[1]/div/div[1]/div/div[1]/input" + +print("Username: ") +driver.find_element_by_xpath(USERNAME_XPATH).send_keys(input()) +driver.find_element_by_xpath(PASSWORD_XPATH).send_keys(Keys.RETURN) + + +# Old (non-working) version: + +# import selenium.webdriver +# import time +# from selenium.webdriver.chrome.options import Options +# from selenium_stealth import stealth + +# chrome_options = Options() +# chrome_options.add_argument('--disable-useAutomationExtension') +# chrome_options.add_argument("--disable-popup-blocking") +# chrome_options.add_argument("--profile-directory=Default") +# chrome_options.add_argument("--disable-plugins-discovery") +# chrome_options.add_argument("--disable-web-security") +# chrome_options.add_argument("--allopw-running-insecure-content") +# chrome_options.add_argument("--incognito") +# chrome_options.add_argument("user_agent=DN") +# 
chrome_options.add_experimental_option("excludeSwitches", +# ["enable-automation"]) +# chrome_options.add_experimental_option('useAutomationExtension', False) + +# driver = selenium.webdriver.Chrome(options=chrome_options) +# stealth(driver, +# languages=["en-US", "en"], +# vendor="Google Inc.", +# platform="Win32", +# webgl_vendor="Intel Inc.", +# renderer="Intel Iris OpenGL Engine", +# fix_hairline=True, +# ) + +# def documentready(): +# return driver.execute_script('return document.readyState;') == 'complete' + +# driver.get(PLAYGROUND_OAUTH_URL) + +# while not documentready(): +# time.sleep(0.1) + +# time.sleep(1) + +# ets = driver.find_elements_by_css_selector("input") +# et = [e for e in ets if e.get_attribute("aria-label") == 'Email or phone'][0] +# et.send_keys() + +# btns = driver.find_elements_by_css_selector("button") +# btn = [b for b in btns if b.text == 'Next'][0] +# btn.click() + +# #driver.get("http://docs.google.com") diff --git a/learning_observer/prototypes/user_data_store.py b/learning_observer/prototypes/user_data_store.py new file mode 100644 index 000000000..b23edb0f1 --- /dev/null +++ b/learning_observer/prototypes/user_data_store.py @@ -0,0 +1,163 @@ +''' +Abstraction to access database + +We need to store a few types of data: + +1) An archival store of process data usable for: + - Posthoc analysis + - Error recovery + - Debugging +We expect to potentially receive multiple events per +second per student, so this needs to be able to handle +moderately high volume, high-velocity data. The JSON +format has a lot of redundancy, be design, and compresses +well. + +2) Working memory: + - Storing state as we stream process events + - This does not have to be reliable IF we have clean + mechanisms for recovering from the archival store + - We may have the same user connected to multiple machines + (e.g. when editing the same document on two computers), + so we probably cannot rely on async+in-memory. But we can + rely on high-speed in-memory like redis or memcached. + - We likely do need at least snapshots of some kind in + non-volatile memory, so we don't have to replay everythin + when we restart. + +There are open questions as to what granularity state should live +at (e.g. per-student, per-teacher, pre-resource, etc.), and +appropriate abstractions. + +3) Typical operational information: + +- Users table, with logins +- Probably some list of documents we're operating on +- Some way to map students to classes, so we know who ought to + receive data for which student. + +This naturally fits into a traditional SQL database, like postgresql +or sqlite. + +By design, we want to support at least two modes of operation: + +1) Small-scale (e.g. development / debugging), with no external + dependencies. Working on this project should not require spinning + up an army of cloud machines, servers, and microservices. Here, we + can e.g. store "large" process data in sqlite, or query static files + on disk. + +2) Scalable (e.g. deployment), where we can swap out local stores for + larger-scale stores requiring either serious dev-ops or serious + cloud $$$. + +We'd like to be able to go between the two smoothly (e.g. run all but one +service locally). + +Python asynchronous database support is limited. 
A few options: + +https://github.com/python-gino/gino +https://www.encode.io/databases/ (and https://github.com/encode/orm) + +As well as database-specific options such as: + +https://pypi.org/project/aiosqlite/ +https://github.com/aio-libs/aiopg + +We decided to try databases due to support for both sqlite and postgresql. +''' +import asyncio +import functools + +import json +import yaml + +import asyncpg +from databases import Database +import sqlalchemy + + +async def initialize(reset=False): + pass + + +async def set_resource_state(username, resource): + pass + + +async def get_resource_state(username, resource): + pass + + +async def fetch_events(username, resource): + ''' + Grab all the events for a particular user / resource + + `resource` is typically `googledocs://docstring` + ''' + pass + + +async def insert_event(username, resource, event): + ''' + Store an event in the database + ''' + pass + + +async def get_class(username, class_id=None): + ''' + Return all the students in a teacher's class. + + Teachers can have multiple classes. + ''' + pass + + +async def get_recipients(username): + ''' + Return all the teachers who should be notified of events for user + `username` + ''' + pass + + +# database = Database('sqlite:///example.db') +# await database.connect() + +metadata = sqlalchemy.MetaData() + +users = sqlalchemy.Table( + "users", + metadata, + sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True), + sqlalchemy.Column("username", sqlalchemy.String(length=100)), + # Should we have a roles table instead? + sqlalchemy.Column("is_student", sqlalchemy.Boolean()), + sqlalchemy.Column("is_teacher", sqlalchemy.Boolean()), + +) +schools = sqlalchemy.Table( + "schools", + metadata, + sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True), + sqlalchemy.Column("name", sqlalchemy.String(length=100)), +) + +classes = sqlalchemy.Table( + "classes", + metadata, + sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True), + sqlalchemy.Column("name", sqlalchemy.String(length=100)), +) + +class_students = sqlalchemy.Table( + "class_student", + metadata, + sqlalchemy.Column("student_id", sqlalchemy.Integer, sqlalchemy.ForeignKey("users.id")), + sqlalchemy.Column("class_id", sqlalchemy.Integer, sqlalchemy.ForeignKey("class.id")), +) + + +# engine = sqlalchemy.create_engine(str(database.url)) +# metadata.create_all(engine) diff --git a/learning_observer/setup.cfg b/learning_observer/setup.cfg new file mode 100644 index 000000000..d8941a776 --- /dev/null +++ b/learning_observer/setup.cfg @@ -0,0 +1,16 @@ +[metadata] +name = Learning Observer +description = Learning Observer Core Package +url = http://www.ets.org +author_email = pmitros@ets.org +author = Piotr Mitros +version = 0.1 + +[options] +packages = learning_observer + +[options.entry_points] +lo_modules = + lo_core = learning_observer.module +console_scripts = + learning_observer = learning_observer.run:run \ No newline at end of file diff --git a/learning_observer/setup.py b/learning_observer/setup.py new file mode 100644 index 000000000..9aaa26bea --- /dev/null +++ b/learning_observer/setup.py @@ -0,0 +1,11 @@ +''' +Install script. Everything is handled in setup.cfg + +To set up locally for development, run `python setup.py develop`, in a +virtualenv, preferably. 
+''' + +from setuptools import setup + +setup( +) diff --git a/learning_observer/util/generic_websocket_dashboard.js b/learning_observer/util/generic_websocket_dashboard.js new file mode 100644 index 000000000..fbce5797a --- /dev/null +++ b/learning_observer/util/generic_websocket_dashboard.js @@ -0,0 +1,53 @@ +/* + This is test code for our new generic dashboard framework. + + It runs with node.js. We're developing it in node so that we can use + this as a starting point for thinking about a front-end framework, and + perhaps share test code. + + We don't have a clean plan for where we'll go (e.g. reuse code versus + prototypes). It's starting as a clone of the Python code with the same + filename. +*/ + +// var d3 = require('d3'); +// d3.text("https://www.google.com", function(d) {console.log(d);}); +const WebSocket = require('ws'); + +server = 'ws://localhost:8888/wsapi/generic_dashboard' + +messages = [ + { + "action": "subscribe", + "keys": [{ + "source" : "da_timeline.visualize.handle_event", + "KeyField.STUDENT": "guest-225d890e93a6b04c0aefe515b9d2dac9" + }], + "refresh": [0.5, "seconds"] + }, + { + "action": "subscribe", + "keys": [{ + "source" : "da_timeline.visualize.handle_event", + "KeyField.STUDENT": "INVALID-STUDENT" + }], + "refresh": [2, "seconds"] + }, + { + "action": "start" + } +] + +socket = new WebSocket(server); +socket.on('message', msg => console.log(msg.toString())); + +socket.onopen = function() { + console.log("Open"); + for(var i=0; i=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": "^5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + } + }, + "dependencies": { + "ws": { + "version": "8.5.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.5.0.tgz", + "integrity": "sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg==", + "requires": {} + } + } +} diff --git a/learning_observer/util/package.json b/learning_observer/util/package.json new file mode 100644 index 000000000..86d104044 --- /dev/null +++ b/learning_observer/util/package.json @@ -0,0 +1,22 @@ +{ + "name": "writing_observer_util", + "version": "1.0.0", + "description": "Utilities for the Learning Observer and Writing Observer", + "main": "generic_websocket_dashboard.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/ETS-Next-Gen/writing_observer.git" + }, + "author": "Piotr Mitros", + "license": "AGPL-3.0", + "bugs": { + "url": "https://github.com/ETS-Next-Gen/writing_observer/issues" + }, + "homepage": "https://github.com/ETS-Next-Gen/writing_observer#readme", + "dependencies": { + "ws": "^8.5.0" + } +} diff --git a/learning_observer/util/populate_writing_observer_data.py b/learning_observer/util/populate_writing_observer_data.py new file mode 100644 index 000000000..c10f504dc --- /dev/null +++ b/learning_observer/util/populate_writing_observer_data.py @@ -0,0 +1,118 @@ +''' +This will program populate redis with dummy writing data for one +course. It should be run from the same directory as creds.yaml. This +is primarily intended for development use. 
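+
+A typical (hypothetical) invocation, assuming redis is running and
+creds.yaml is set up for Google roster access:
+
+    python util/populate_writing_observer_data.py
+
+The script then interactively prompts for a course and a sample text type.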
+''' + +import asyncio + +import learning_observer.google +import learning_observer.settings +import learning_observer.offline +import learning_observer.rosters +import learning_observer.kvs + +import writing_observer.writing_analysis +from learning_observer.stream_analytics.fields import KeyField, KeyStateType, EventField + +import writing_observer.sample_essays + + +async def select_course(): + """ + This is an asynchronous function that allows the user to select a + course from a list of courses. The function prints each course's name + along with its index, and prompts the user to select a course by + entering the index number. The function returns the ID of the selected + course. + """ + courses = await learning_observer.rosters.courselist(learning_observer.offline.request) + + for course, i in zip(courses, range(len(courses))): + print(f"{i}: {course['name']}") + + course_index = int(input("Please select a course: ")) + return courses[course_index]['id'] + + +async def print_roster(course): + """ + This is an asynchronous function that takes a course as an + argument and prints its roster of students. It returns the + roster too. + """ + roster = await learning_observer.rosters.courseroster(learning_observer.offline.request, course) + print("\nStudents\n========") + for student in roster: + print(student['profile']['name']['full_name']) + return roster + + +async def select_text_type(): + """ + This is an asynchronous function that allows the user to select a + text type from a list of available text types from the + `sample_essays` module. These include GPT3-generate text, lorem + ipsum, etc. + """ + available_text_types = writing_observer.sample_essays.TextTypes.__members__ + tt_list = list(available_text_types) + + print("\nText types\n=====\n") + for text_type, idx in zip(tt_list, range(len(tt_list))): + print(f"{idx}: {text_type}") + + idx = int(input("Please pick a text type: ")) + text_type = available_text_types[tt_list[int(idx)]] + print(f"Text type: {text_type}") + return text_type + + +async def set_text(kvs, student_id, text, docid): + ''' + Set a text for the student in redis. 
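+
+    Concretely, for each KeyStateType this writes two KVS entries per
+    student: a `last_document` key pointing at the fake document id, and a
+    `reconstruct` key holding the text itself, roughly mimicking the state
+    the writing-analysis reducers would normally have produced.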
+    '''
+    document_id = f"test-doc-{docid}"
+    print(student_id)
+    for kst in [KeyStateType.INTERNAL, KeyStateType.EXTERNAL]:
+        last_document_key = learning_observer.stream_analytics.helpers.make_key(
+            writing_observer.writing_analysis.last_document,
+            {
+                KeyField.STUDENT: student_id,
+            },
+            kst
+        )
+        document_key = learning_observer.stream_analytics.helpers.make_key(
+            writing_observer.writing_analysis.reconstruct,
+            {
+                KeyField.STUDENT: student_id,
+                EventField('doc_id'): document_id
+            },
+            kst
+        )
+        await kvs.set(last_document_key, {"document_id": document_id})
+        await kvs.set(document_key, {"text": text})
+        print("LDK", last_document_key)
+        print("DK", document_key)
+
+
+async def main():
+    learning_observer.settings.load_settings(config="creds.yaml")
+    learning_observer.google.initialize_and_register_routes(learning_observer.offline.app)
+    learning_observer.kvs.kvs_startup_check()
+    kvs = learning_observer.kvs.KVS()
+    course = await select_course()
+    roster = await print_roster(course)
+    text_type = await select_text_type()
+
+    texts = writing_observer.sample_essays.sample_texts(
+        text_type=text_type,
+        count=len(roster)
+    )
+
+    for student, text, idx in zip(roster, texts, range(len(roster))):
+        print(student)
+        await set_text(kvs, student['user_id'], text, idx)
+
+loop = asyncio.get_event_loop()
+loop.run_until_complete(main())
diff --git a/learning_observer/util/restream.py b/learning_observer/util/restream.py
new file mode 100644
index 000000000..a7624da63
--- /dev/null
+++ b/learning_observer/util/restream.py
@@ -0,0 +1,129 @@
+'''ReStream
+
+Usage:
+    restream.py [--url=<url>] [--extract-client] [--rate=<rate>] [--max-wait=<seconds>] [--filelist] [--rename=auth.user_id] [--skip=<events>] <filename>
+
+Options:
+    --url=<url>            URL to connect [default: http://localhost:8888/wsapi/in/]
+    --extract-client       Parse JSON and extract unannotated client-side event
+    --filelist             File is a list of files to play at once
+    --rate=<rate>          Throttle events to: timestamps / rate [default: 1]
+    --max-wait=<seconds>   Maximum delay (if throttling)
+    --rename=auth.user_id  Rename students, randomly. If set, must be auth.user_id.
+    --skip=<events>        For performance, a list of events to skip (e.g. mouse)
+
+Overview:
+    * Restream logs from a file to a web sockets server
+    * Helpful for testing
+    * Optional (todo): Capture server output
+    * Optional (todo): Handle AJAX
+
+The file list option starts streaming timestamps from the first
+event. This is helpful for e.g. simulating 20 coglabs as one
+session. It is not helpful for playing back what happened in one
+class.
+
+'''
+
+import asyncio
+import json
+import random
+import sys
+
+import aiofiles
+import aiohttp
+import docopt
+import names
+
+print(docopt.docopt(__doc__))
+
+
+async def restream(
+        url,
+        filename,
+        rate,
+        max_wait,
+        extract_client,
+        rename,
+        skip
+):
+    '''
+    Formerly, the simplest function in the world.
+
+    Open up a session, then a socket, and then stream lines from the
+    file to the socket.
+    '''
+    old_ts = None
+    if isinstance(skip, str):
+        skip = set(skip.split(","))
+    elif skip is None:
+        skip = set()
+    else:
+        raise Exception("Bug in skip. Debug please.")
+
+    if rename is not None:
+        new_id = "rst-{name}-{number}".format(
+            name=names.get_first_name(),
+            number=random.randint(1, 1000)
+        )
+
+    async with aiohttp.ClientSession() as session:
+        async with session.ws_connect(url) as web_socket:
+            async with aiofiles.open(filename) as log_file:
+                async for line in log_file:
+                    if rate is not None:
+                        jline = json.loads(line)
+                        if jline['client']['event'] in skip:
+                            continue
+                        new_ts = jline["server"]["time"]
+                        if old_ts is not None:
+                            delay = (new_ts - old_ts) / rate
+                            if max_wait is not None:
+                                delay = min(delay, max_wait)
+                            print(line)
+                            print(delay)
+                            await asyncio.sleep(delay)
+                        old_ts = new_ts
+                    if extract_client or rename:
+                        json_line = json.loads(line)
+                        if extract_client:
+                            json_line = json_line['client']
+                            print(json.dumps(json_line, indent=2))
+                        if rename:
+                            if 'auth' not in json_line:
+                                json_line['auth'] = {}
+                            json_line['auth']['user_id'] = new_id
+                        line = json.dumps(json_line)
+
+                    await web_socket.send_str(line.strip())
+    return True
+
+
+async def run():
+    '''
+    Is there a way to clean up so we don't have an ever-expanding
+    block indent?
+    '''
+    args = docopt.docopt(__doc__)
+    print(args)
+    if args["--filelist"]:
+        filelist = [s.strip() for s in open(args['<filename>']).readlines()]
+    else:
+        filelist = [args['<filename>']]
+    coroutines = [
+        restream(
+            url=args["--url"],
+            filename=filename,
+            rate=float(args["--rate"]),
+            max_wait=float(args["--max-wait"]) if args["--max-wait"] else None,
+            extract_client=args['--extract-client'],
+            rename=args['--rename'],
+            skip=args.get('--skip', None)
+        ) for filename in filelist]
+    await asyncio.gather(*coroutines)
+
+try:
+    asyncio.run(run())
+except aiohttp.client_exceptions.ServerDisconnectedError:
+    print("Could not connect to server")
+    sys.exit(-1)
diff --git a/learning_observer/util/stream_writing.py b/learning_observer/util/stream_writing.py
new file mode 100644
index 000000000..6e3486448
--- /dev/null
+++ b/learning_observer/util/stream_writing.py
@@ -0,0 +1,180 @@
+'''
+Stream fake writing data
+
+Usage:
+    stream_writing.py [--url=url] [--streams=n]
+                      [--ici=sec,s,s]
+                      [--users=user_id,uid,uid]
+                      [--source=filename,fn,fn]
+                      [--gdids=googledoc_id,gdi,gdi]
+                      [--text-length=5]
+                      [--fake-name]
+                      [--gpt3=type]
+
+Options:
+    --url=url                URL to connect [default: http://localhost:8888/wsapi/in/]
+    --streams=N              How many students typing in parallel? [default: 1]
+    --users=user_id,uid,uid  Supply the user ID
+    --ici=secs,secs          Mean intercharacter interval [default: 0.1]
+    --gdids=gdi,gdi,gdi      Google document IDs of spoofed documents
+    --source=filename        Stream text instead of lorem ipsum
+    --text-length=n          Number of paragraphs of lorem ipsum [default: 5]
+    --fake-name              Use fake names (instead of test-user)
+    --gpt3=type              Use GPT-3 generated data ('story' or 'argument')
+
+Overview:
+    Stream fake keystroke data to a server, emulating Google Docs
+    extension log events.
+'''
+
+import asyncio
+import json
+import sys
+
+import aiohttp
+import docopt
+
+import loremipsum
+import names
+
+ARGS = docopt.docopt(__doc__)
+print(ARGS)
+
+STREAMS = int(ARGS["--streams"])
+
+
+def argument_list(argument, default):
+    '''
+    Parse a list argument, with defaults. Allow one global setting, or per-stream
+    settings.
+    If `STREAMS` is 3:
+
+    None ==> default()
+    "file.txt" ==> ["file.txt", "file.txt", "file.txt"]
+    "a,b,c" ==> ["a", "b", "c"]
+    "a,b" ==> exit
+    '''
+    list_string = ARGS[argument]
+    if list_string is None:
+        list_string = default
+    if callable(list_string):
+        list_string = list_string()
+    if list_string is None:
+        return list_string
+    if "," in list_string:
+        list_string = list_string.split(",")
+    if isinstance(list_string, str):
+        list_string = [list_string] * STREAMS
+    if len(list_string) != STREAMS:
+        print(f"Failure: {list_string}\nfrom {argument} should make {STREAMS} items")
+        sys.exit(-1)
+    return list_string
+
+
+# --source must be parsed before we decide where TEXT comes from
+source_files = argument_list(
+    '--source',
+    None
+)
+
+if ARGS["--gpt3"] is not None:
+    import writing_observer.sample_essays
+    TEXT = writing_observer.sample_essays.GPT3_TEXTS[ARGS["--gpt3"]]
+    STREAMS = len(TEXT)
+elif source_files is None:
+    TEXT = ["\n".join(loremipsum.get_paragraphs(int(ARGS.get("--text-length", 5)))) for i in range(STREAMS)]
+else:
+    TEXT = [open(filename).read() for filename in source_files]
+
+ICI = argument_list(
+    '--ici',
+    "0.1"
+)
+
+DOC_IDS = argument_list(
+    "--gdids",
+    lambda: [f"fake-google-doc-id-{i}" for i in range(STREAMS)]
+)
+
+if ARGS['--users'] is not None:
+    USERS = argument_list('--users', None)
+elif ARGS['--fake-name']:
+    USERS = [names.get_first_name() for i in range(STREAMS)]
+else:
+    USERS = ["test-user-{n}".format(n=i) for i in range(STREAMS)]
+
+assert len(TEXT) == STREAMS, "len(filenames) != STREAMS."
+assert len(ICI) == STREAMS, "len(ICIs) != STREAMS."
+assert len(USERS) == STREAMS, "len(users) != STREAMS."
+assert len(DOC_IDS) == STREAMS, "len(document IDs) != STREAMS."
+
+
+def insert(index, text, doc_id):
+    '''
+    Generate a minimal 'insert' event, of the type our Google Docs extension
+    might send, but with irrelevant stuff stripped away. This is just for
+    testing.
+    '''
+    return {
+        "bundles": [{'commands': [{"ibi": index, "s": text, "ty": "is"}]}],
+        "event": "google_docs_save",
+        "source": "org.mitros.writing_analytics",
+        "doc_id": doc_id,
+        "origin": "stream_test_script"
+    }
+
+
+def identify(user):
+    '''
+    Send a token identifying user.
+
+    TBD: How we want to manage this. We're still figuring out auth/auth.
+    This might just be scaffolding code for now, or we might do something
+    along these lines.
+    '''
+    return [
+        {
+            "event": "test_framework_fake_identity",
+            "source": "org.mitros.writing_analytics",
+            "user_id": user,
+            "origin": "stream_test_script"
+        }, {
+            "event": "metadata_finished",
+            "source": "org.mitros.writing_analytics",
+            "origin": "stream_test_script"
+        }
+    ]
+
+
+async def stream_document(text, ici, user, doc_id):
+    '''
+    Send a document to the server.
+    '''
+    async with aiohttp.ClientSession() as session:
+        async with session.ws_connect(ARGS["--url"]) as web_socket:
+            commands = identify(user)
+            for command in commands:
+                await web_socket.send_str(json.dumps(command))
+            for char, index in zip(text, range(len(text))):
+                command = insert(index + 1, char, doc_id)
+                await web_socket.send_str(json.dumps(command))
+                await asyncio.sleep(float(ici))
+
+
+async def run():
+    '''
+    Create a task for each document, send them to the server in parallel,
+    and wait for them all to finish.
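+
+    A typical invocation of this script (hypothetical values):
+
+        python stream_writing.py --streams=3 --fake-name --ici=0.05,0.1,0.2
+
+    which streams three lorem ipsum documents in parallel to the default
+    /wsapi/in/ endpoint.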
+    '''
+    streamers = [
+        asyncio.create_task(stream_document(text, ici, user, doc_id))
+        for (text, ici, user, doc_id) in zip(TEXT, ICI, USERS, DOC_IDS)
+    ]
+    print(streamers)
+    for streamer in streamers:
+        await streamer
+    print(streamers)
+
+try:
+    asyncio.run(run())
+except aiohttp.client_exceptions.ServerDisconnectedError:
+    print("Could not connect to server")
+    sys.exit(-1)
diff --git a/learning_observer/util/traditional_dashboard.py b/learning_observer/util/traditional_dashboard.py
new file mode 100644
index 000000000..c0e8f6f45
--- /dev/null
+++ b/learning_observer/util/traditional_dashboard.py
@@ -0,0 +1,49 @@
+'''
+This is a test script for a web socket interaction with the
+original dashboard.
+
+We do need a better command line interface, but this is okay for
+debugging for now.
+'''
+
+import argparse
+
+import aiohttp
+import asyncio
+
+parser = argparse.ArgumentParser(
+    description=__doc__.strip()
+)
+parser.add_argument(
+    '--single', action='store_true',
+    help="Print just a single message, then disconnect"
+)
+
+parser.add_argument(
+    '--url', default='http://localhost:8889/wsapi/dashboard?module=writing_observer&course=12345',
+    help="We connect to this URL and grab data."
+)
+
+
+args = parser.parse_args()
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        print("Connecting to", args.url)
+        async with session.ws_connect(
+                args.url, timeout=0.5) as ws:
+            async for msg in ws:
+                print(msg.type)
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    print("Message")
+                    print(msg.data)
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    print("Error")
+                    print(msg)
+                    break
+                if args.single:
+                    return True
+    return True
+
+asyncio.run(main())
diff --git a/modules/language_tool/languagetool.py b/modules/language_tool/languagetool.py
new file mode 100644
index 000000000..1eafcf45b
--- /dev/null
+++ b/modules/language_tool/languagetool.py
@@ -0,0 +1,52 @@
+'''
+A thin, async wrapper around LanguageTool
+'''
+
+import asyncio
+import inspect
+
+import aiohttp
+
+session = None
+
+
+async def check(language, text):
+    '''
+    Takes a language (e.g. `en-US`), and a text.
+
+    Returns a JSON object of the LanguageTool spell / grammar
+    check
+    '''
+
+    global session
+    if session is None:
+        session = aiohttp.ClientSession()
+        if inspect.iscoroutinefunction(session):
+            session = await session
+
+    query = {
+        'language': language,
+        'text': text
+    }
+    resp = await session.post(
+        'http://localhost:8081/v2/check',
+        data=query
+    )
+
+    return await resp.json()
+
+
+async def main():
+    '''
+    A simple test case, and demo of syntax
+    '''
+    en = await check('en-US', 'This is a tset of the emergecny...')
+    print(en['matches'])
+    pl = await check('pl', 'Sprawdzamy awarje, ale nie ma...')
+    print(pl['matches'])
+    await session.close()
+
+
+if __name__ == '__main__':
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main())
diff --git a/modules/websocket_debug/static/websocketdebug.html b/modules/websocket_debug/static/websocketdebug.html
new file mode 100644
index 000000000..53b4e523f
--- /dev/null
+++ b/modules/websocket_debug/static/websocketdebug.html
@@ -0,0 +1,37 @@
+ + + + +

Web socket debug test!

+

This is a simple app to test web sockets.

+ + +
+
+
+ + + diff --git a/modules/wo_highlight_dashboard/setup.cfg b/modules/wo_highlight_dashboard/setup.cfg new file mode 100644 index 000000000..372ab18e0 --- /dev/null +++ b/modules/wo_highlight_dashboard/setup.cfg @@ -0,0 +1,12 @@ +[metadata] +name = Dash Writing Observer Class Highlight Dashboard +description = Dashboard using Dash for the Writing Observer +url = https://github.com/ETS-Next-Gen/writing_observer +version = 0.1 + +[options] +packages = wo_highlight_dashboard + +[options.entry_points] +lo_modules = + lo_core = wo_highlight_dashboard.module diff --git a/modules/wo_highlight_dashboard/setup.py b/modules/wo_highlight_dashboard/setup.py new file mode 100644 index 000000000..7dc1e2ad9 --- /dev/null +++ b/modules/wo_highlight_dashboard/setup.py @@ -0,0 +1,10 @@ +''' +Rather minimalistic install script. To install, run `python +setup.py develop` or just install via requirements.txt +''' + +from setuptools import setup, find_packages + +setup( + name="wo_highlight_dashboard" +) diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/app.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/app.py new file mode 100644 index 000000000..265fe003d --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/app.py @@ -0,0 +1,24 @@ +from aiohttp import web +from aiohttp_wsgi import WSGIHandler +import dash_bootstrap_components as dbc + +import learning_observer.dash_wrapper as dash +import writing_dashboard.dashboard.layout + +app = dash.Dash( + __name__, + external_stylesheets=[ + dbc.themes.MINTY, # bootstrap styling + dbc.icons.FONT_AWESOME, # font awesome icons + 'https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates@V1.0.6/dbc.min.css', # styling dcc components as Bootstrap + ], + title='Learning Observer', + suppress_callback_exceptions=True +) + +app.layout = writing_dashboard.dashboard.layout.layout + +wsgi_handler = WSGIHandler(app.server) +webapp = web.Application() +webapp.router.add_route("*", "/{path_info:.*}", wsgi_handler) +web.run_app(webapp) diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/favicon.ico b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/favicon.ico new file mode 100644 index 000000000..aed40165a Binary files /dev/null and b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/favicon.ico differ diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/scripts.js b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/scripts.js new file mode 100644 index 000000000..76b4bf4f2 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/scripts.js @@ -0,0 +1,407 @@ +/* + Javascript functions + This file contains Javascript functions that will be + called through a clientside callback in the Python code. + An example of how to run the javascript function, see below + + Essentially its just a JSON that defines functions. + + You'll see `window.dash_clientside.no_update` appear often in the code. + This tells dash not to update the output component. + If we don't need to update it, we shouldn't! + + This is often used with initializing the array of students to return. 
+ `Array(students).fill(window.dash_clientside.no_update);` + Where `students` is the total number of students +*/ +// initialize dash_clientside +if (!window.dash_clientside) { + window.dash_clientside = {}; +} + +function getRGBAValues(str) { + var vals = str.substring(str.indexOf('(') +1, str.length -1).split(', '); + return { + 'r': parseInt(vals[0]), + 'g': parseInt(vals[1]), + 'b': parseInt(vals[2]), + 'o': parseFloat(vals[3]) + }; +} + +// define functions we are calling +window.dash_clientside.clientside = { + + change_sort_direction_icon: function(sort_check, sort_values) { + // updates UI elements, does not handle sorting + // based on the current sort, set the sort direction icon and sort text + + // Output(sort_icon, 'className'), + // Output(sort_label, 'children'), + // Input(sort_toggle, 'value'), + // Input(sort_by_checklist, 'value') + if (sort_check.includes('checked')) { + return ['fas fa-sort-down', 'Desc']; + } + return ['fas fa-sort-up', 'Asc']; + }, + + reset_sort_options: function(clicks) { + // resets the sort_by_checklist, this will trigger sort_students and change_sort_direction_icon + + // Output(sort_by_checklist, 'value'), + // Input(sort_reset, 'n_clicks') + if (clicks) { + return []; + } + return window.dash_clientside.no_update; + }, + + sort_students: function(values, direction, data, student_ids, options, students) { + // We sort students whenever one of the following occurs: + // 1. the checklist for sorting changes + // 2. the direction of sorting changes + // 3. the student's data changes + // We add the value of each indicator checked in the checklist to determine score for each student + // We then set the order style attribute of each student to their score + // Items with order=1 come before items with order=2 and so on. + // This will set students in ascending order (low scoring students first) + // We set a max and subtract from it to determine descending order (high scoring students first) + + // Output({'type': student_col, 'index': ALL}, 'style'), + // Input(settings.sort_by_checklist, 'value'), + // Input(settings.sort_toggle, 'value'), + // Input({'type': student_indicators, 'index': ALL}, 'data'), + // State(student_store, 'data'), + // State(settings.sort_by_checklist, 'options'), + // State(student_counter, 'data') + + let orders = Array(students).fill(window.dash_clientside.no_update); + if (values.length === 0) { + // default sort is alphabetical by student id + const sort_order = [...student_ids.keys()].sort((a, b) => student_ids[a].id - student_ids[b].id); + orders = sort_order.map(idx => {return {'order': (direction.includes('checked') ? student_ids.length - idx : idx)}}); + return orders; + } + let labels = options.map(obj => {return (values.includes(obj.value) ? obj.label : '')}); + labels = labels.filter(e => e); + for (let i = 0; i < data.length; i++) { + let score = 0; + values.forEach(function (item, index) { + score += data[i][`${item}_indicator`]['value']; + }); + let order = (direction.includes('checked') ? 
(100*values.length) - score : score); + orders[i] = {'order': order}; + } + return orders; + }, + + populate_student_data: function(msg, student_ids, prev_metrics, prev_text, prev_highlights, prev_indicators, students, msg_count) { + // Populates and updates students data from the websocket + // for each update, parse the data into the proper format + // Also return the current time + // + // Output({'type': student_metrics, 'index': ALL}, 'data'), + // Output({'type': student_texthighlight, 'index': ALL}, 'text'), + // Output({'type': student_texthighlight, 'index': ALL}, 'highlight_breakpoints'), + // Output({'type': student_indicators, 'index': ALL}, 'data'), + // Output(last_updated, 'children'), + // Output(msg_counter, 'data'), + // Input(websocket, 'message'), + // State(student_store, 'data'), + // State({'type': student_metrics, 'index': ALL}, 'data'), + // State({'type': student_texthighlight, 'index': ALL}, 'text'), + // State({'type': student_texthighlight, 'index': ALL}, 'highlight_breakpoints'), + // State({'type': student_indicators, 'index': ALL}, 'data'), + // State(student_counter, 'data') + if (!msg) { + return [prev_metrics, prev_text, prev_highlights, prev_indicators, -1, 0]; + } + let updates = Array(students).fill(window.dash_clientside.no_update); + const data = JSON.parse(msg.data)['latest_writing_data']; + // console.log(data); + for (let i = 0; i < data.length; i++) { + let curr_user = data[i].student.user_id; + let user_index = student_ids.findIndex(item => item.user_id === curr_user) + updates[user_index] = { + 'id': curr_user, + 'text': { + "student_text": { + "id": "student_text", + "value": data[i].text, + "label": "Student text" + } + }, + 'highlight': {}, + 'metrics': {}, + 'indicators': {} + } + for (const key in data[i]) { + let item = data[i][key]; + const sum_type = (item.hasOwnProperty('summary_type') ? item['summary_type'] : ''); + // we set each id to be ${key}_{type} so we can select items by class name when highlighting + if (sum_type === 'total') { + updates[user_index]['metrics'][`${key}_metric`] = { + 'id': `${key}_metric`, + 'value': item['metric'], + 'label': item['label'] + } + } else if (sum_type === 'percent') { + updates[user_index]['indicators'][`${key}_indicator`] = { + 'id': `${key}_indicator`, + 'value': item['metric'], + 'label': item['label'] + } + } + const offsets = (item.hasOwnProperty('offsets') ? 
item['offsets'] : ''); + if (offsets.length !== 0) { + updates[user_index]['highlight'][`${key}_highlight`] = { + 'id': `${key}_highlight`, + 'value': item['offsets'], + 'label': item['label'] + } + } + } + } + const timestamp = new Date(); + + // return the data to each each module + return [ + updates.map(function(d) { return d['metrics']; }), // metrics + updates.map(function(d) { return d['text']['student_text']['value']; }), // texthighlight text + updates.map(function(d) { return d['highlight']; }), // texthighlight highlighting + updates.map(function(d) { return d['indicators']; }), // indicators + timestamp, // current time + msg_count + 1 // set message count + ]; + }, + + update_last_updated_text: function(last_time, intervals) { + // Whenever we get a new message or 5 seconds have passed, update the last updated text + + // Output(last_updated_msg, 'children'), + // Input(last_updated, 'data'), + // Input(last_updated_interval, 'n_intervals') + if (last_time === -1) { + return 'Never'; + } + const curr_time = new Date(); + const sec_diff = (curr_time.getTime() - last_time.getTime())/1000 + if (sec_diff < 1) { + return 'just now' + } + const ms_since_last_message = rendertime2(sec_diff); + return `${ms_since_last_message} ago`; + }, + + open_settings: function(clicks, close, is_open, students) { + // Toggles the settings panel + // Based on if its open or not, we adjust the grid css classes of students and the panel itself + // this makes the student card remain the same size even if the settings panel is open. + // + // There are multiple ways to close the settings button (x button or click settings again). + // This means we have to determine which input fired and handle the possible cases. + + // Output(settings_collapse, 'is_open'), + // Output({'type': student_col, 'index': ALL}, 'class_name'), + // Output(student_grid, 'class_name'), + // Input(settings.open_btn, 'n_clicks'), + // Input(settings.close_settings, 'n_clicks'), + // State(settings_collapse, 'is_open'), + // State(student_counter, 'data') + + // determine which button caused this callback to trigger + const trig = dash_clientside.callback_context.triggered[0]; + if(!is_open & (typeof trig !== 'undefined')) { + if (trig.prop_id === 'teacher-dashboard-settings-show-hide-open-button.n_clicks') { + return [true, Array(students).fill('col-12 col-lg-6 col-xxl-4'), 'col-xxl-9 col-lg-8 col-md-6']; + } + } + return [false, Array(students).fill('col-12 col-md-6 col-lg-4 col-xxl-3'), '']; + }, + + update_students: async function(course_id) { + // Fetch the student information based on course id + + // Output(student_counter, 'data'), + // Output(student_store, 'data'), + // Input(course_store, 'data') + const response = await fetch(`${window.location.protocol}//${window.location.hostname}:${window.location.port}/webapi/courseroster/${course_id}`); + const data = await response.json(); + return [data.length, data]; + }, + + fetch_assignment_info: async function(course_id, assignment_id) { + // Fetch assignment information from server based on course and assignment id + // Not yet implemented, TODO + // + // Output(assignment_name, 'children'), + // Output(assignment_desc, 'children'), + // Input(course_store, 'data'), + // Input(assignment_store, 'data') + return [`Assignment ${assignment_id}`, `This is assignment ${assignment_id} from course ${course_id}`] + }, + + fetch_nlp_options: async function(trigger) { + // Fetch possible NLP options from the server to later build the settings panel + // + // Output(nlp_options, 
'data'), + // Input(prefix, 'className') + const response = await fetch(`${window.location.protocol}//${window.location.hostname}:${window.location.port}/views/writing_observer/nlp-options/`); + const data = await response.json(); + return data; + }, + + update_course_assignment: function(url_hash) { + // Update the course and assignment info based on the hash query string + // + // Output(course_store, 'data'), + // Output(assignment_store, 'data'), + // Input('_pages_location', 'hash') + if (url_hash.length === 0) {return window.dash_clientside.no_update;} + const decoded = decode_string_dict(url_hash.slice(1)) + return [decoded.course_id, decoded.assignment_id] + }, + + highlight_text: function(overall_show, shown, data_trigger, options) { + // Highlights the text appropriately + // + // Output(settings.dummy, 'style'), + // Input(settings.checklist, 'value'), + // Input(settings.highlight_checklist, 'value'), + // Input({'type': student_card, 'index': ALL}, 'data'), + // State(settings.highlight_checklist, 'options') + + if (!overall_show.includes('highlight')) {return window.dash_clientside.no_update;} + const colors = [ + // Mints primary 4 colors with a 0.25 opacity + // 'rgba(86, 204, 157, 0.25)', 'rgba(108, 195, 213, 0.25)', + // 'rgba(255, 206, 103, 0.25)', 'rgba(255, 120, 81, 0.25)', + // Plotly's T10 with a 0.25 opacity applied + 'rgba(245, 133, 24, 0.25)', + 'rgba(114, 183, 178, 0.25)', 'rgba(228, 87, 86, 0.25)', + 'rgba(84, 162, 75, 0.25)', 'rgba(238, 202, 59, 0.25)', + 'rgba(178, 121, 162, 0.25)', 'rgba(255, 157, 166, 0.25)', + 'rgba(76, 120, 168, 0.25)', + ]; + let docs = []; + const shown_colors = {}; + // remove all highlighting and record current colors + options.forEach(item => { + docs = document.getElementsByClassName(`${item.value}_highlight`); + if (docs.length === 0) {return window.dash_clientside.no_update;} + if (shown.includes(item.value)) { + if (docs[0].style.backgroundColor.length > 0 & docs[0].style.backgroundColor !== 'transparent') { + shown_colors[item.value] = docs[0].style.backgroundColor; + } + } + for (var i = 0; i < docs.length; i++) { + docs[i].style.backgroundColor = 'transparent'; + } + }) + // highlight shown items + let high_color = ''; + shown.forEach(item => { + docs = document.getElementsByClassName(`${item}_highlight`); + // fetch current color or figure out a new one + if (shown_colors.hasOwnProperty(item)) { + high_color = shown_colors[item]; + } else { + let curr_colors = Object.values(shown_colors); + let remaining_colors = Array.from(new Set([...colors].filter(x => !curr_colors.includes(x)))); + high_color = (remaining_colors.length === 0 ? 
colors[Math.floor(Math.random()*colors.length)] : remaining_colors[0]) + shown_colors[item] = high_color; + } + + // add background color to highlighted elements + for (var i = 0; i < docs.length; i++) { + if (docs[i].style.backgroundColor.length > 0 & docs[i].style.backgroundColor !== 'transparent') { + let dc = getRGBAValues(docs[i].style.backgroundColor); + let hc = getRGBAValues(high_color); + let combined = `rgba(${parseInt((dc.r+hc.r)/2)}, ${parseInt((dc.g+hc.g)/2)}, ${parseInt((dc.b+hc.b)/2)}, ${hc.o+dc.o})`; + // console.log(dc, hc, combined); + docs[i].style.backgroundColor = combined; + } else { + docs[i].style.backgroundColor = high_color; + } + } + }) + }, + + set_status: function(status) { + // Set the websocket status icon/title + // + // Output(websocket_status, 'className'), + // Output(websocket_status, 'title'), + // Input(websocket, 'state') + if (status === undefined) { + return window.dash_clientside.no_update; + } + const icons = ['fas fa-sync-alt', 'fas fa-check text-success', 'fas fa-sync-alt', 'fas fa-times text-danger']; + const titles = ['Connecting to server', 'Connected to server', 'Closing connection', 'Disconnected from server']; + return [icons[status.readyState], titles[status.readyState]]; + }, + + show_hide_initialize_message: function(msg_count) { + // Show or hide the initialization message based on how many messages we've seen + // + // Output(initialize_alert, 'is_open'), + // Input(msg_counter, 'data') + if (msg_count > 0){ + return false; + } + return true; + }, + + send_options_to_server: function(types, metrics, highlights, indicators, sort_by) { + // Send selected options to the server + // TODO work on protocol for communicating with the + // + // Output(websocket, 'send'), + // Input(settings.checklist, 'value'), + // Input(settings.metric_checklist, 'value'), + // Input(settings.highlight_checklist, 'value'), + // Input(settings.indicator_checklist, 'value') + // Input(settings.sort_by_checklist, 'value') + const data = metrics.concat(highlights).concat(indicators).concat(sort_by); + return [JSON.stringify(data)] + }, + + show_nlp_running_alert: function(msg_count, checklist, metrics, highlight, indicator, sort_by) { + // Show or hide the NLP running alert + // On new selections, show alert. 
+ // When new data comes in, hide the alert + // + // Output({'type': alert_type, 'index': nlp_running_alert}, 'is_open'), + // Input(msg_counter, 'data'), + // Input(settings.checklist, 'value'), + // Input(settings.metric_checklist, 'value'), + // Input(settings.highlight_checklist, 'value'), + // Input(settings.indicator_checklist, 'value'), + // Input(settings.sort_by_checklist, 'value'), + const trig = dash_clientside.callback_context.triggered[0]; + if (trig.prop_id === 'teacher-dashboard-msg-counter.data') { + return false; + } + return true; + }, + + update_overall_alert: function(is_open, children) { + // Update the overall alert system, + // if only 1 alert exists, show its message, + // otherwise combine + // + // Output(overall_alert, 'label'), + // Output(overall_alert, 'class_name'), + // Input({'type': alert_type, 'index': ALL}, 'is_open'), + // Input({'type': alert_type, 'index': ALL}, 'children'), + const truth = is_open.filter(function(e) {return e}).length; + if (truth == 1) { + return [children[is_open.indexOf(true)], ''] + } + if (truth > 1) { + return [`Waiting on ${truth} items to finish`, '']; + } + return [window.dash_clientside.no_update, 'hidden-alert']; + } +} diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/styles.css b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/styles.css new file mode 100644 index 000000000..d86f164ba --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/assets/styles.css @@ -0,0 +1,120 @@ + +/* + Customize the scrollbar (Chrome only, but its okay cause they use Chromebooks) + Set size of scrollbar + Set track (slider track) + Set thumb (slider on track) + Set hover/active styles + */ +::-webkit-scrollbar { + width: 5px; + height: 5px; +} +::-webkit-scrollbar-track { + background: rgb(179, 177, 177); + border-radius: 5px; +} +::-webkit-scrollbar-thumb { + background: rgb(136, 136, 136); + border-radius: 5px; +} +::-webkit-scrollbar-thumb:hover { + background: rgb(100, 100, 100); + border-radius: 5px; +} +::-webkit-scrollbar-thumb:active { + background: rgb(68, 68, 68); + border-radius: 5px; +} + +/* + Provides a shift and a shadow to student cards + I think it helps to focus in on one student, but the shadow itself could be improved upon + Try hovering over a student's card +*/ +.shadow-card:hover { + transition: all 0.2s ease-out; + box-shadow: 0px 2px 8px var(--bs-gray-500); + border: 1px solid #cccccc; + background-color: white; +} +.shadow-card:hover:before { + transform: scale(2.15); +} + +/* Style the text element (the box of text) on student cards */ +.student-card-text { + max-height: 250px; + overflow: auto; + border: var(--bs-gray-100) solid 1px; + border-radius: 0.4rem; + margin: 1px; +} + +/* Larger font size helper class */ +.font-size-lg { font-size: 1.2rem; } + +/* + Add some background so you can see which option of a checklist you are hovering + Add darker background for nested-forms + Try hovering over an option in the Settings menu +*/ +.form-check:hover { + transition: all 0.2s ease-out; + background-color: var(--bs-gray-100); + box-shadow: 0px 0px 5px var(--bs-gray-100); + border-radius: 0.4rem; +} +.nested-form:hover { + transition: all 0.2s ease-out; + background-color: var(--bs-gray-300); + box-shadow: 0px 0px 5px var(--bs-gray-300); + border-radius: 0.4rem; +} + +/* +Style dropdown menu component to be an outline btn +with appropriate colors +*/ +.dropdown-menu-outline-dark { + color: #343a40; + background-color: transparent; +} 
+.dropdown-menu-outline-dark:hover { + color: white; + background-color: #343a40; +} +.dropdown-item:focus, .dropdown-item:hover { + color: #212529; + background-color: var(--bs-gray-300); +} + +/* +Adjust styling for parent and children items +Parents are disabled and shift to the left +Children are indented to the right +*/ +.form-check-input:disabled { + display: none; +} +.nested-form:has(.subchecklist-label) { + margin-left: 1.5em; +} +.nested-form:has(.form-check-input:disabled) { + padding-left: 0; +} + +/* +Animation to hide the alert +Some items need a slight delay which is why we don't adjust opacity +until halfway through +*/ +@keyframes delay_hide { + 0% {opacity: 1;} + 50% {opacity: 1;} + 100% {opacity: 0;} +} +.hidden-alert { + opacity: 0; + animation: delay_hide 2s linear; +} diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/layout.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/layout.py new file mode 100644 index 000000000..690b1d573 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/layout.py @@ -0,0 +1,19 @@ +''' +Define layout for student dashboard view +''' +# package imports +import learning_observer.dash_wrapper as dash +import dash_bootstrap_components as dbc + +# local imports +from .students import student_dashboard_view + + +# passing empty parameters will automatigically be used as query strings +# see: https://dash.plotly.com/urls#query-strings +def layout(course_id=None, assignment_id=None): + layout = dbc.Spinner( + student_dashboard_view(course_id, assignment_id), + color='primary' + ) + return layout diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings.py new file mode 100644 index 000000000..a6d52230f --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings.py @@ -0,0 +1,315 @@ +''' +Defines the settings panel used on the student overview dashbaord view +''' +# package imports +from learning_observer.dash_wrapper import html, dcc, clientside_callback, ClientsideFunction, Output, Input +import dash_bootstrap_components as dbc + +prefix = 'teacher-dashboard-settings' +# ids related to opening/closing panel +open_btn = f'{prefix}-show-hide-open-button' # settings button +offcanvas = f'{prefix}-show-hide-offcanvcas' # setting wrapper +close_settings = f'{prefix}-close' # X on settings panel + +# ids related to sorting +sort_by_checklist = f'{prefix}-sort-by-checklist' # options that can be included for sorting +sort_toggle = f'{prefix}-sort-by-toggle' # checkbox for determining sort direction +sort_icon = f'{prefix}-sort-by-icon' # icon for sort direction +sort_label = f'{prefix}-sort-by-label' # text for sort direction +sort_reset = f'{prefix}-sort-by-reset' # sort reset button +# ids relating to showing or hiding elements +checklist = f'{prefix}-show-hide-checklist' # parent checklist - determines which type of stuff to show +metric_collapse = f'{prefix}-show-hide-metric-collapse' # metric options wrapper +metric_checklist = f'{prefix}-show-hide-metric-checklist' # metric options +text_collapse = f'{prefix}-show-hide-text-collapse' # text options wrapper +text_radioitems = f'{prefix}-show-hide-text-radioitems' # text options +highlight_collapse = f'{prefix}-show-hide-highlight-collapse' # highlight options wrapper +highlight_checklist = f'{prefix}-show-hide-highlight-radioitems' # highlight options +indicator_collapse = 
f'{prefix}-show-hide-indicator-collapse' # indicator options wrapper +indicator_checklist = f'{prefix}-show-hide-indicator-checklist' # indicator wrapper +dummy = f'{prefix}-dummy' + +# settings panel itself +panel = dbc.Card( + [ + html.Div(id=dummy), + html.Div( + [ + # panel title + html.H4( + [ + html.I(className='fas fa-gear me-2'), # gear icon + 'Settings' + ], + # bootstrap styling to allow for the floating X button and remove lower margin + className='d-inline mb-0' + ), + # close settings X + dbc.Button( + # font awesome X icon + html.I(className='fas fa-xmark'), + color='white', + # bootstrap text styling + class_name='text-body', + id=close_settings + ) + ], + # create flex container so children can be positioned properly + className='m-2 d-flex align-items-center justify-content-between' + ), + # Each settings option is an accordion item + dbc.Accordion( + [ + # sort by + dbc.AccordionItem( + dbc.Card( + [ + dcc.Checklist( + options=[], + value=[], + id=sort_by_checklist, + labelClassName='form-check nested-form', # style dcc as bootstrap + inputClassName='form-check-input' # style dcc as bootstrap + ), + html.Div( + # button group for sort buttons + dbc.ButtonGroup( + [ + # change sort direction + dbc.Button( + dcc.Checklist( + options=[ + { + 'value': 'checked', + 'label': html.Span( # define Dash component as checklist option + [ + html.I(id=sort_icon), + html.Span( + 'None', + id=sort_label, + className='ms-1' + ) + ] + ) + } + ], + value=[], + id=sort_toggle, + inputClassName='d-none', # hide the checkbox, icon/text are clickable + className='d-inline', # needed to style for components as options + ), + outline=True, + color='primary', + title='Arrange students by attributes', + ), + # reset sort button + dbc.Button( + [ + html.I(className='fas fa-rotate me-1'), # font awesome rotate icon + 'Reset Sort' + ], + id=sort_reset, + outline=True, + color='primary' + ) + ], + size='sm', + class_name='float-end d-inline' # bootstrap keep button group to the right + ), + className='mt-1' # bootstrap top margin + ) + ], + class_name='border-0' # bootstrap remove borders + ), + title='Sort by' # hover text + ), + # show/hide elements + dbc.AccordionItem( + [ + dcc.Checklist( + options=[ + # metrics + { + 'label': html.Span( + [ + html.Span( + [ + html.I(className='fas fa-hashtag me-1'), + 'Metrics overview' + ], + className='font-size-lg' # make labels a little bigger + ), + dbc.Collapse( + dcc.Checklist( + # option for each possible metric + options=[], + value=[], # defaults + id=metric_checklist, + labelClassName='form-check nested-form', # style dcc as Bootstrap and add nested hover + inputClassName='form-check-input' # style dcc as Bootstrap + ), + id=metric_collapse, + ) + ], + ), + 'value': 'metrics' + }, + # text + # { + # 'label': html.Span( + # [ + # html.Span( + # [ + # html.I(className='fas fa-file me-1'), + # 'Text', + # ], + # className='font-size-lg' + # ), + # dbc.Collapse( + # dcc.RadioItems( + # # option for each possible text item + # # TODO pull this information from somewhere + # options=[], + # value=None, # default option + # id=text_radioitems, + # labelClassName='form-check nested-form', # style dcc as Bootstrap and add nested hover + # inputClassName='form-check-input' # style dcc as Bootstrap + # ), + # id=text_collapse, + # ) + # ], + # ), + # 'value': 'text' + # }, + # highlight + { + 'label': html.Span( + [ + html.Span( + [ + html.I(className='fas fa-highlighter fa-flip-horizontal me-1'), + 'Highlight', + ], + className='font-size-lg' + ), + 
dbc.Collapse( + dcc.Checklist( + # option for each possible highlightable item + # TODO pull this information from somewhere + options=[], + value=[], # default options + id=highlight_checklist, + labelClassName='form-check nested-form', # style dcc as Bootstrap and add nested hover + inputClassName='form-check-input' # style dcc as Bootstrap + ), + id=highlight_collapse, + ) + ], + ), + 'value': 'highlight' + }, + # indicators + { + 'label': html.Span( + [ + html.Span( + [ + html.I(className='fas fa-chart-bar me-1'), + 'Indicators overview', + ], + className='font-size-lg' + ), + dbc.Collapse( + # option for each possible indicator + # TODO pull this information from somewhere + dcc.Checklist( + options=[], + value=[], # default options + id=indicator_checklist, + labelClassName='form-check nested-form', # style dcc as Bootstrap and add nested hover + inputClassName='form-check-input' # style dcc as Bootstrap + ), + id=indicator_collapse, + ) + ] + ), + 'value': 'indicators' + } + ], + value=['text', 'highlight', 'indicators', 'metrics'], + id=checklist, + labelClassName='form-check', # style dcc as Bootstrap + inputClassName='form-check-input' # style dcc as Bootstrap + ), + ], + title='Student Card Options', + class_name='rounded-bottom' # bootstrap round bottom corners + ), + ], + # make both items visible from the start + active_item=[f'item-{i}' for i in range(2)], + always_open=True, # keep accordionitems open when click on others + flush=True, # styles to take up width + class_name='border-top' # bootstrap border on top + ), + ], + id=offcanvas, + # TODO eventually we want sticky-top in the classname however + # if the screen height is short enough we won't be able to + # see all options available. + # need to add overflow to the last accordian item + + # bootstrap add right (e)nd and (b)ottom margins + class_name='me-2 mb-2' +) + +# change the icon and label of the sort button +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='change_sort_direction_icon'), + Output(sort_icon, 'className'), + Output(sort_label, 'children'), + Input(sort_toggle, 'value'), + Input(sort_by_checklist, 'value') +) + +# reset the sort +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='reset_sort_options'), + Output(sort_by_checklist, 'value'), + Input(sort_reset, 'n_clicks') +) + +# settings checklist toggle +# if the option is selected, show its sub-options +# +# e.g. if metrics is chosen, show the options for time_on_task, adjectives, adverbs, etc. 
+# otherwise, don't shown those items + +toggle_checklist_visibility = ''' + function(values, students) {{ + if (values.includes('{id}')) {{ + return true; + }} + return false; + }} + ''' +clientside_callback( + toggle_checklist_visibility.format(id='indicators'), + Output(indicator_collapse, 'is_open'), + Input(checklist, 'value') +) +clientside_callback( + toggle_checklist_visibility.format(id='metrics'), + Output(metric_collapse, 'is_open'), + Input(checklist, 'value') +) +# clientside_callback( +# toggle_checklist_visibility.format(id='text'), +# Output(text_collapse, 'is_open'), +# Input(checklist, 'value') +# ) +clientside_callback( + toggle_checklist_visibility.format(id='highlight'), + Output(highlight_collapse, 'is_open'), + Input(checklist, 'value') +) diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_defaults.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_defaults.py new file mode 100644 index 000000000..8c68d15f9 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_defaults.py @@ -0,0 +1,98 @@ +''' +Each variable displays the defaults for each type of essay. +Ideally, users can select multiple (deep merge) to see both general +and argumentative for instance. + +In the future, we probably want the dashboard more flexible with +different types of modules being plugged in (metrics/highlighter/etc.). +This information will probably want to be handled a bit nicer once +we understand the full workflow of the plugability. +''' +general = { + 'sort_by': { + 'options': [], + 'selected': [] + }, + 'metrics': { + 'options': ['sentences', 'paragraphs', 'pos_'], + 'selected': ['prepositions'] + }, + 'highlight': { + 'options': [ + 'informal_language', 'transition_words', 'low_frequency_words', + 'positive_tone', 'negative_tone', + 'polysyllabic_words' + ], + 'selected': ['transition_words', 'informal_language'] + }, + 'indicators': { + 'options': [ + 'academic_language', 'informal_language', 'latinate_words', + 'polysyllabic_words', 'low_frequency_words' + ], + 'selected': ['informal_language'] + } +} +argumentative = { + 'sort_by': { + 'options': [], + 'selected': [] + }, + 'metrics': { + 'options': [], + 'selected': [] + }, + 'highlight': { + 'options': [ + 'main_idea_sentences', 'supporting_idea_sentences', 'supporting_detail_sentences', + 'argument_words', 'explicit_argument', + 'statements_of_opinion', 'statements_of_fact', + 'explicit_claims', + ], + 'selected': ['main_idea_sentences'] + }, + 'indicators': { + 'options': ['opinion_words', 'argument_words', 'information_sources', 'attributions', 'citations'], + 'selected': [] + } +} +narrative = { + 'sort_by': { + 'options': [], + 'selected': [] + }, + 'metrics': { + 'options': [], + 'selected': [] + }, + 'highlight': { + 'options': [ + 'direct_speech_verbs', 'indirect_speech', + 'in_past_tense', 'social_awareness', + 'character_trait_words', 'concrete_details' + ], + 'selected': [] + }, + 'indicators': { + 'options': ['emotion_words', 'character_trait_words'], + 'selected': [] + } +} +source_based = { + 'sort_by': { + 'options': [], + 'selected': [] + }, + 'metrics': { + 'options': [], + 'selected': [] + }, + 'highlight': { + 'options': ['information_sources', 'attributions', 'citations', 'quoted_words'], + 'selected': [] + }, + 'indicators': { + 'options': [], + 'selected': [] + } +} diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_options.py 
b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_options.py new file mode 100644 index 000000000..bacba9588 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/settings_options.py @@ -0,0 +1,60 @@ +''' +Defines the options in the settings panel +''' +# package imports +from dash import html +import dash_bootstrap_components as dbc + + +def create_metric_label(opt, child=False): + return dbc.Badge( + opt.get('name'), + color='info', + title=opt.get('tooltip', ''), + class_name='subchecklist-label' if child else '' + ) + + +def create_highlight_label(opt, child=False): + class_name = f"{opt.get('id')}_highlight" + return html.Span( + opt.get('name'), + title=opt.get('tooltip', ''), + className=f'subchecklist-label {class_name}' if child else class_name + ) + + +def create_generic_label(opt, child=False): + return html.Span( + opt.get('name'), + title=opt.get('tooltip', ''), + className='subchecklist-label' if child else '' + ) + + +def create_checklist_options(user_options, options, selector_type): + if selector_type == 'metric': + label_maker = create_metric_label + elif selector_type == 'highlight': + label_maker = create_highlight_label + else: + label_maker = create_generic_label + ui_options = [] + for opt_id in user_options: + opt = next((o for o in options if o['id'] == opt_id), None) + if opt is None: + children = [o for o in options if o['parent'] == opt_id] + children_options = [ + { + 'label': label_maker(child, child=True), + 'value': child['id'] + } for child in children + ] + ui_options.append({'label': opt_id, 'value': opt_id, 'disabled': True}) + ui_options.extend(children_options) + else: + ui_options.append({ + 'label': label_maker(opt), + 'value': opt['id'] + }) + return ui_options diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/students.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/students.py new file mode 100644 index 000000000..217883b99 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/dashboard/students.py @@ -0,0 +1,499 @@ +''' +Creates the grid of student cards +''' +# package imports +from learning_observer.dash_wrapper import html, dcc, callback, clientside_callback, ClientsideFunction, Output, Input, State, ALL, exceptions as dash_e +import dash_bootstrap_components as dbc +from learning_observer_components import LOConnection +import learning_observer_components as loc # student cards + +# local imports +from . import settings, settings_defaults, settings_options as so + +# define ids for the dashboard +# use a prefix to help ensure we don't double up on IDs (guess what happens if you double up? 
it breaks) +prefix = 'teacher-dashboard' + +# individual student items +student_col = f'{prefix}-student-col' # individual student card wrapper id +student_metrics = f'{prefix}-student-metrics' +student_texthighlight = f'{prefix}-student-texthighlight' +student_indicators = f'{prefix}-student-indicators' + +student_row = f'{prefix}-student-row' # overall student row +student_grid = f'{prefix}-student-grid' # overall student grid wrapper id +websocket = f'{prefix}-websocket' # websocket to connect to the server (eventually) +student_counter = f'{prefix}-student-counter' # store item for quick access to the number of students +student_store = f'{prefix}-student-store' # store item for student information +course_store = f'{prefix}-course-store' # store item for course id +settings_collapse = f'{prefix}-settings-collapse' # settings menu wrapper +websocket_status = f'{prefix}-websocket-status' # websocket status icon +last_updated = f'{prefix}-last-updated' # data last updated id +last_updated_msg = f'{prefix}-last-updated-text' # data last updated id +last_updated_interval = f'{prefix}-last-updated-interval' + +alert_type = f'{prefix}-alert' +initialize_alert = f'{prefix}-initialize-alert' +nlp_running_alert = f'{prefix}-nlp-running-alert' +overall_alert = f'{prefix}-navbar-alert' + +msg_counter = f'{prefix}-msg-counter' +nlp_options = f'{prefix}-nlp-options' +assignment_store = f'{prefix}-assignment-info_store' +assignment_name = f'{prefix}-assignment-name' +assignment_desc = f'{prefix}-assignment-description' + + +def student_dashboard_view(course_id, assignment_id): + '''Create student dashboard view, + + course_id: id of given course + assignment_id: id of assignment + ''' + navbar = dbc.Navbar( + [ + # assignment title + html.H3( + [ + # document icon with a right bootstrap margin + html.I(className='fas fa-file-lines me-2'), + html.Span(id=assignment_name), + ], + className='d-inline' + ), + html.Div( + dbc.Progress( + value=100, striped=True, animated=True, + label='Fetching data...', + color='info', + id=overall_alert, + style={'height': '1.5rem'} + ), + className='w-25', + ), + # open settings button + html.Div( + [ + dbc.ButtonGroup( + [ + dbc.Button( + html.Small( + [ + html.I(id=websocket_status), + html.Span('Last Updated: ', className='ms-2'), + html.Span(id=last_updated_msg) + ] + ), + outline=True, + color='dark' + ), + dbc.DropdownMenu( + [ + settings.open_btn, + dbc.DropdownMenuItem( + 'Settings', + id=settings.open_btn + ), + dbc.DropdownMenuItem( + 'Logout', + href='/auth/logout', + external_link=True + ), + ], + group=True, + align_end=True, + label='Menu', + color='dark', + toggle_class_name='dropdown-menu-outline-dark' + ) + ] + ) + ], + className='d-flex align-items-center float-end' + ) + ], + sticky='top', + class_name='justify-content-between align-items-center px-3' + ) + container = dbc.Container( + [ + # assignment description + html.P(id=assignment_desc), + dbc.Alert( + 'Fetching initial data...', + is_open=False, + class_name='d-none', + id={ + 'type': alert_type, + 'index': initialize_alert + } + ), + dbc.Alert( + 'Running NLP...', + is_open=False, + class_name='d-none', + id={ + 'type': alert_type, + 'index': nlp_running_alert + } + ), + dbc.Row( + [ + # settings panel wrapper + dbc.Collapse( + dbc.Col( + settings.panel, + # bootstrap use 100% of (w)idth and (h)eight + class_name='w-100 h-100' + ), + id=settings_collapse, + # bootstrap collapse and grid sizing + class_name='collapse-horizontal col-xxl-3 col-lg-4 col-md-6', + # default open/close + 
is_open=False + ), + # overall student grid wrapp + dbc.Col( + dbc.Row( + id=student_row, + # bootstrap gutters-2 (little bit of space between cards) and w(idth)-100(%) + class_name='g-2 w-100' + ), + id=student_grid, + # classname set in callback, default classname should go in the callback + ) + ], + # no spacing between settings and students + # students already have some space on the sides + class_name='g-0' + ), + LOConnection(id=websocket), + # stores for course and student info + student counter + dcc.Store(id=course_store), + dcc.Store(id=assignment_store), + dcc.Store( + id=student_store, + data=[] + ), + dcc.Store( + id=student_counter, + data=0 + ), + dcc.Store( + id=msg_counter, + data=0 + ), + dcc.Store( + id=nlp_options, + data=[] + ), + dcc.Store( + id=last_updated, + data=-1 + ), + dcc.Interval( + id=last_updated_interval, + interval=5000 + ) + ], + fluid=True + ) + return html.Div([navbar, container], id=prefix) + + +# set hash parameters +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='update_course_assignment'), + Output(course_store, 'data'), + Output(assignment_store, 'data'), + Input('_pages_location', 'hash') +) + +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='fetch_nlp_options'), + Output(nlp_options, 'data'), + Input(prefix, 'className') +) + +# set the websocket data_scope +# TODO set with url similar to course id +clientside_callback( + """ + function(course, assignment) { + const ret = {"module": "latest_data", "course": course}; + return ret; + } + """, + Output(websocket, 'data_scope'), + Input(course_store, 'data'), + Input(assignment_store, 'data') +) + +# set the websocket status icon +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='set_status'), + Output(websocket_status, 'className'), + Output(websocket_status, 'title'), + Input(websocket, 'state') +) + +# fetch student info for course +# TODO fix this to pull the roster information better +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='update_students'), + Output(student_counter, 'data'), + Output(student_store, 'data'), + Input(course_store, 'data') +) + +# fetch assignment information from server +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='fetch_assignment_info'), + Output(assignment_name, 'children'), + Output(assignment_desc, 'children'), + Input(course_store, 'data'), + Input(assignment_store, 'data') +) + +# open the settings menu +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='open_settings'), + Output(settings_collapse, 'is_open'), + Output({'type': student_col, 'index': ALL}, 'class_name'), + Output(student_grid, 'class_name'), + Input(settings.open_btn, 'n_clicks'), + Input(settings.close_settings, 'n_clicks'), + State(settings_collapse, 'is_open'), + State(student_counter, 'data') +) + +# Update data from websocket +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='populate_student_data'), + Output({'type': student_metrics, 'index': ALL}, 'data'), + Output({'type': student_texthighlight, 'index': ALL}, 'text'), + Output({'type': student_texthighlight, 'index': ALL}, 'highlight_breakpoints'), + Output({'type': student_indicators, 'index': ALL}, 'data'), + Output(last_updated, 'data'), + Output(msg_counter, 'data'), + Input(websocket, 'message'), + State(student_store, 'data'), + State({'type': student_metrics, 'index': ALL}, 'data'), + State({'type': 
student_texthighlight, 'index': ALL}, 'text'), + State({'type': student_texthighlight, 'index': ALL}, 'highlight_breakpoints'), + State({'type': student_indicators, 'index': ALL}, 'data'), + State(student_counter, 'data'), + State(msg_counter, 'data'), +) + +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='update_last_updated_text'), + Output(last_updated_msg, 'children'), + Input(last_updated, 'data'), + Input(last_updated_interval, 'n_intervals') +) + +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='send_options_to_server'), + Output(websocket, 'send'), + Input(settings.checklist, 'value'), + Input(settings.metric_checklist, 'value'), + Input(settings.highlight_checklist, 'value'), + Input(settings.indicator_checklist, 'value'), + Input(settings.sort_by_checklist, 'value') +) + +show_hide_module = ''' + function(values, students) {{ + if (values.includes('{id}')) {{ + return Array(students).fill(''); + }} + return Array(students).fill('d-none'); + }} + ''' +clientside_callback( + show_hide_module.format(id='metrics'), + Output({'type': student_metrics, 'index': ALL}, 'class_name'), + Input(settings.checklist, 'value'), + State(student_counter, 'data') +) +clientside_callback( + show_hide_module.format(id='highlight'), + Output({'type': student_texthighlight, 'index': ALL}, 'class_name'), + Input(settings.checklist, 'value'), + State(student_counter, 'data') +) +clientside_callback( + show_hide_module.format(id='indicators'), + Output({'type': student_indicators, 'index': ALL}, 'class_name'), + Input(settings.checklist, 'value'), + State(student_counter, 'data') +) + +update_shown_items = ''' + function(values, students) {{ + return Array(students).fill(values.map(x => `${{x}}_{}`)); + }} +''' +clientside_callback( + update_shown_items.format('metric'), + Output({'type': student_metrics, 'index': ALL}, 'shown'), + Input(settings.metric_checklist, 'value'), + State(student_counter, 'data') +) +clientside_callback( + update_shown_items.format('highlight'), + Output({'type': student_texthighlight, 'index': ALL}, 'shown'), + Input(settings.highlight_checklist, 'value'), + State(student_counter, 'data') +) +clientside_callback( + update_shown_items.format('indicator'), + Output({'type': student_indicators, 'index': ALL}, 'shown'), + Input(settings.indicator_checklist, 'value'), + State(student_counter, 'data') +) + +# Show/hide the initialization alert +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='show_hide_initialize_message'), + Output({'type': alert_type, 'index': initialize_alert}, 'is_open'), + Input(msg_counter, 'data') +) + +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='show_nlp_running_alert'), + Output({'type': alert_type, 'index': nlp_running_alert}, 'is_open'), + Input(msg_counter, 'data'), + Input(settings.checklist, 'value'), + Input(settings.metric_checklist, 'value'), + Input(settings.highlight_checklist, 'value'), + Input(settings.indicator_checklist, 'value'), + Input(settings.sort_by_checklist, 'value'), +) + +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='update_overall_alert'), + Output(overall_alert, 'label'), + Output(overall_alert, 'class_name'), + Input({'type': alert_type, 'index': ALL}, 'is_open'), + Input({'type': alert_type, 'index': ALL}, 'children'), +) + +# Sort students by indicator values +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='sort_students'), + 
Output({'type': student_col, 'index': ALL}, 'style'), + Input(settings.sort_by_checklist, 'value'), + Input(settings.sort_toggle, 'value'), + Input({'type': student_indicators, 'index': ALL}, 'data'), + State(student_store, 'data'), + State(settings.sort_by_checklist, 'options'), + State(student_counter, 'data') +) + +# highlight text +clientside_callback( + ClientsideFunction(namespace='clientside', function_name='highlight_text'), + Output(settings.dummy, 'style'), + Input(settings.checklist, 'value'), + Input(settings.highlight_checklist, 'value'), + Input({'type': student_texthighlight, 'index': ALL}, 'highlight_breakpoints'), + State(settings.highlight_checklist, 'options') +) + + +@callback( + output=dict( + sort_by_options=Output(settings.sort_by_checklist, 'options'), + metric_options=Output(settings.metric_checklist, 'options'), + metric_value=Output(settings.metric_checklist, 'value'), + # text_options=Output(settings.text_radioitems, 'options'), + # text_value=Output(settings.text_radioitems, 'value'), + highlight_options=Output(settings.highlight_checklist, 'options'), + highlight_value=Output(settings.highlight_checklist, 'value'), + indicator_options=Output(settings.indicator_checklist, 'options'), + indicator_value=Output(settings.indicator_checklist, 'value'), + ), + inputs=dict( + course=Input(course_store, 'data'), + assignment=Input(assignment_store, 'data'), + options=Input(nlp_options, 'data') + ) +) +def fill_in_settings(course, assignment, options): + if len(options) == 0: + raise dash_e.PreventUpdate + # populate all settings based on assignment or default + + # TODO grab the options or type from assignment + # if options (obj) set opt to assignment options + # if type (string) set opt to settings_default.type + opt = settings_defaults.general + + ret = dict( + sort_by_options=so.create_checklist_options(opt['indicators']['options'], options, 'indicators'), # same as indicators + metric_options=so.create_checklist_options(opt['metrics']['options'], options, 'metric'), + metric_value=opt['metrics']['selected'], + # text_options=[so.text_options[o] for o in opt['text']['options']], + # text_value=opt['text']['selected'], + highlight_options=so.create_checklist_options(opt['highlight']['options'], options, 'highlight'), + highlight_value=opt['highlight']['selected'], + indicator_options=so.create_checklist_options(opt['indicators']['options'], options, 'indicators'), + indicator_value=opt['indicators']['selected'], + ) + return ret + + +@callback( + Output(student_row, 'children'), + Input(student_store, 'data') +) +def create_cards(students): + # create student cards based on student info + + # TODO if the card data exists in the student_store, + # we want to include it in the initial loading of the card + # this will require the same parser to initially populate data + # what do we want the storage type to be? + # i.e. 
the same code for both js and python + cards = [ + dbc.Col( + [ + dbc.Card( + [ + html.H4(s['profile']['name']['full_name']), + loc.LOMetrics( + id={ + 'type': student_metrics, + 'index': s['user_id'] + } + ), + html.Div( + loc.LOTextHighlight( + id={ + 'type': student_texthighlight, + 'index': s['user_id'] + } + ), + className='student-card-text' + ), + loc.LOIndicatorBars( + id={ + 'type': student_indicators, + 'index': s['user_id'] + } + ) + ], + body=True, + class_name='shadow-card' + ) + ], + # pattern matching callback + id={ + 'type': student_col, + 'index': s['user_id'] + }, + ) for s in students + ] + return cards diff --git a/modules/wo_highlight_dashboard/wo_highlight_dashboard/module.py b/modules/wo_highlight_dashboard/wo_highlight_dashboard/module.py new file mode 100644 index 000000000..6ef0d3851 --- /dev/null +++ b/modules/wo_highlight_dashboard/wo_highlight_dashboard/module.py @@ -0,0 +1,71 @@ +import os.path + +import dash_bootstrap_components as dbc + +from learning_observer.dash_integration import thirdparty_url, static_url + +import wo_highlight_dashboard.dashboard.layout + +NAME = "Dash Writing Observer Dashboard" + +DASH_PAGES = [ + { + "MODULE": wo_highlight_dashboard.dashboard.layout, + "LAYOUT": wo_highlight_dashboard.dashboard.layout.layout, + "ASSETS": 'assets', + "TITLE": "Writing Observer Dashboard", + "DESCRIPTION": "Dashboard for the Writing Observer built with dash", + "SUBPATH": "dashboard", + "CSS": [ + thirdparty_url("css/bootstrap.min.css"), + thirdparty_url("css/fontawesome_all.css") + ], + "SCRIPTS": [ + static_url("liblo.js") + ] + } +] + +# Third party module tests with helpful messages. +Minty_URL = 'https://cdn.jsdelivr.net/npm/bootswatch@5.1.3/dist/minty/bootstrap.min.css' +if (dbc.themes.MINTY != Minty_URL): + print("WARN:: Unrecognized Minty URL detected: {}".format(dbc.themes.MINTY)) + print("You will need to update dash bootstrap components hash value.\n") + +FontAwesome_URL = "https://use.fontawesome.com/releases/v6.3.0/css/all.css" +if (dbc.icons.FONT_AWESOME != FontAwesome_URL): + print("WARN:: Unrecognized Fontawesome URL detected: {}".format(dbc.icons.FONT_AWESOME)) + print("You will need to update the FontAwesome bootstrap components hash value.\n") + + +THIRD_PARTY = { + "css/bootstrap.min.css": { + "url": dbc.themes.MINTY, + "hash": "c03f5bfd8deb11ad6cec84a6201f4327f28a640e693e56466fd80d983ed54" + "16deff1548a0f6bbad013ec278b9750d1d253bd9c5bd1f53c85fcd62adba5eedc59" + }, + "css/fontawesome_all.css": { + "url": dbc.icons.FONT_AWESOME, + "hash": "1496214e7421773324f4b332127ea77bec822fc6739292ebb19c6abcc22a5" + "6248e0634b4e0ca0c2fcac14dc10b8d01fa17febaa35f46731201d1ffd0ab482dd7" + }, + "webfonts/fa-solid-900.woff2": { + "url": os.path.dirname(os.path.dirname(dbc.icons.FONT_AWESOME)) + "/webfonts/fa-solid-900.woff2", + "hash": "6d3fe769cc40a5790ea2e09fb775f1bd3b130d2fdae1dd552f69559e7ca4c" + "a047862f795da0024737e59e3bcc7446f6eec1bab173758aef0b97ba89d722ffbde" + }, + "webfonts/fa-solid-900.ttf": { + "url": os.path.dirname(os.path.dirname(dbc.icons.FONT_AWESOME)) + "/webfonts/fa-solid-900.ttf", + "hash": "0fdd341671021d04304186c197001cf2e888d3028baaf9a5dec0f0e496959" + "666e8a2e34aae8e79904f8e9b4c0ccae40249897cce5f5ae58d12cc1b3985e588d6" + } +} + +COURSE_DASHBOARDS = [{ + 'name': "Dash Writing Observer", + 'url': "/wo_highlight_dashboard/dash/dashboard/", + "icon": { + "type": "fas", + "icon": "fa-pen-nib" + } +}] diff --git a/modules/writing_observer/setup.cfg b/modules/writing_observer/setup.cfg new file mode 100644 index 
000000000..5515d2a7e --- /dev/null +++ b/modules/writing_observer/setup.cfg @@ -0,0 +1,14 @@ +[metadata] +name = Writing Observer +description = Writing Observer, a tool for monitoring student writing processes +url = http://www.ets.org +author_email = pmitros@ets.org +author = Piotr Mitros +version = 0.1 + +[options] +packages = writing_observer + +[options.entry_points] +lo_modules = + wobserver = writing_observer.module diff --git a/modules/writing_observer/setup.py b/modules/writing_observer/setup.py new file mode 100644 index 000000000..9aaa26bea --- /dev/null +++ b/modules/writing_observer/setup.py @@ -0,0 +1,11 @@ +''' +Install script. Everything is handled in setup.cfg + +To set up locally for development, run `python setup.py develop`, in a +virtualenv, preferably. +''' + +from setuptools import setup + +setup( +) diff --git a/modules/writing_observer/writing_observer/aggregator.py b/modules/writing_observer/writing_observer/aggregator.py new file mode 100644 index 000000000..f7880186a --- /dev/null +++ b/modules/writing_observer/writing_observer/aggregator.py @@ -0,0 +1,268 @@ +import sys +import time +import learning_observer.settings +import learning_observer.stream_analytics.helpers +# import traceback +import learning_observer.util + + +def excerpt_active_text( + text, cursor_position, + desired_length=103, cursor_target=2 / 3, max_overflow=10, + cursor_character="❙" +): + ''' + This function returns a short segment of student text, cutting in a + sensible way around word boundaries. This can be used for real-time + typing views. + + `desired_length` is how much text we want. + `cursor_target` is what fraction of the text should be before the cursor. + `max_overflow` is how much longer we're willing to go in order to land on + a word boundary. + `cursor_character` is what we insert at the boundary. Can be an empty + string, a nice bit of markup, etc. + ''' + character_count = len(text) + before = int(desired_length * 2 / 3) + # We step backwards and forwards from the cursor by the desired number of characters + start = max(0, int(cursor_position - before)) + end = min(character_count - 1, start + desired_length) + # And, if we don't have much text after the cursor, we adjust the beginning + # print(start, cursor_position, end) + start = max(0, end - desired_length) + # Split on a word boundary, if there's one close by + # print(start, cursor_position, end) + while end < character_count and end - start < desired_length + 10 and not text[end].isspace(): + end += 1 + + # print(start, cursor_position, end) + while start > 0 and end - start < desired_length + 10 and not text[start].isspace(): + start -= 1 + + clipped_text = text[start:max(cursor_position - 1, 0)] + cursor_character + text[max(cursor_position - 1, 0):end] + return clipped_text + + +def sanitize_and_shrink_per_student_data(student_data): + ''' + This function is run over the data for **each student**, one-by-one. + + We: + * Compute text length + * Cut down the text to just what the client needs to receive (we + don't want to send 30 full essays) + ''' + text = student_data.get('writing_observer.writing_analysis.reconstruct', {}).get('text', None) + if text is None: + student_data['writing_observer_compiled'] = { + "text": "[None]", + "character_count": 0 + } + return student_data + + character_count = len(text) + cursor_position = student_data['writing_observer.writing_analysis.reconstruct']['position'] + + # Yes, this does mutate the input. No, we should. 
No, it doesn't matter, since the + # code needs to move out of here. Shoo, shoo. + student_data['writing_observer_compiled'] = { + "text": excerpt_active_text(text, cursor_position), + "character_count": character_count + } + # Remove things which are too big to send back. Note: Not benchmarked, so perhaps not too big + del student_data['writing_observer.writing_analysis.reconstruct']['text'] + # We should downsample, rather than removing + del student_data['writing_observer.writing_analysis.reconstruct']['edit_metadata'] + return student_data + + +def aggregate_course_summary_stats(student_data): + ''' + Here, we compute summary stats across the entire course. This is + helpful so that the front end can know, for example, how to render + axes. + + Right now, this API is **evolving**. Ideally, we'd like to support: + + - Transforming summarized per-student data based on data from + other students + - Extract aggregates + + This API lets us do that, but it's a little too generic. We'd like + to be a little bit more semantic. + ''' + max_idle_time = 0 + max_time_on_task = 0 + max_character_count = 0 + for student in student_data: + max_character_count = max( + max_character_count, + student.get('writing_observer_compiled', {}).get('character_count', 0) + ) + max_time_on_task = max( + max_time_on_task, + student.get('writing_observer.writing_analysis.time_on_task', {}).get("total_time_on_task", 0) + ) + return { + "summary_stats": { + 'max_character_count': max_character_count, + 'max_time_on_task': max_time_on_task, + # TODO: Should we aggregate this in some way? If we run on multiple servers, + # this is susceptible to drift. That could be jarring; even a few seconds + # error could be an issue in some contexts. + 'current_time': time.time() + } + } + + +###### +# +# Everything from here on is a hack. +# We need to figure out proper abstractions. +# +###### + + +async def get_latest_student_documents(student_data): + ''' + This will retrieve the latest student documents from the database. It breaks + abstractions. + + It also involves some excess loops that are annoying but briefly we need to + determine which students actually *have* last writing data. Then we need to + go through and build keys for that data. Then we fetch the data itself. + Later on in this file we need to marry the information again. This builds + up a series of lists which are successively merged into a single dict with + the resulting data. + + Some of what is copied along is clearly duplicative and probably unneeded. + ''' + import learning_observer.kvs + + import writing_observer.writing_analysis + from learning_observer.stream_analytics.fields import KeyField, KeyStateType, EventField + + kvs = learning_observer.kvs.KVS() + + # Compile a list of the active students. + active_students = [s for s in student_data if 'writing_observer.writing_analysis.last_document' in s] + + # Now collect documents for all of the active students. + document_keys = ([ + learning_observer.stream_analytics.helpers.make_key( + writing_observer.writing_analysis.reconstruct, + { + KeyField.STUDENT: s['user_id'], + EventField('doc_id'): s.get('writing_observer.writing_analysis.last_document', {}).get('document_id', None) + }, + KeyStateType.INTERNAL + ) for s in active_students]) + + print(document_keys) + + kvs_data = await kvs.multiget(keys=document_keys) + + + # Return blank entries if no data, rather than None. This makes it possible + # to use item.get with defaults sanely. 
For the sake of later alignment + # we also zip up the items with the keys and users that they come from + # this hack allows us to align them after cleaning occurrs later. + # writing_data = [{} if item is None else item for item in writing_data] + writing_data = [] + for idx in range(len(document_keys)): + student = active_students[idx] + doc = kvs_data[idx] + + # If we have an empty item we simply return an empty dict with the + # student but an empty doc value. + if (doc is None): doc = {} + + # Now insert the student data and pass it along. + doc['student'] = student + writing_data.append(doc) + + print(writing_data) + + return writing_data + + +async def remove_extra_data(writing_data): + ''' + We don't want Deane graph data going to the client. We just do a bit of + a cleanup. This is in-place. + ''' + for item in writing_data: + if 'edit_metadata' in item: + del item['edit_metadata'] + return writing_data + + +async def merge_with_student_data(writing_data, student_data): + ''' + Add the student metadata to each text + ''' + + for item, student in zip(writing_data, student_data): + if 'edit_metadata' in item: + del item['edit_metadata'] + item['student'] = student + return writing_data + + +if learning_observer.settings.module_setting('writing_observer', 'use_nlp', False): + try: + import writing_observer.awe_nlp + processor = writing_observer.awe_nlp.process_texts + except ImportError as e: + print(e) + print('AWE Components is not installed. To install, please see https://github.com/ETS-Next-Gen/AWE_Components') + sys.exit(-1) +else: + import writing_observer.stub_nlp + processor = writing_observer.stub_nlp.process_texts + + +async def latest_data(student_data, options=None): + ''' + HACK HACK HACK + + This code needs to take the student data as a dict and then + collect the latest writing data for each student (assuming + they have it). The code then passes that writing data on + to Paul's code for processing. For the time being this + works by essentially building up some large dicts that + contain the text and student data together. + + In the long run this should *all* be replaced by a cleaner + object interface that hides some of this from the user + but for the now we'll roll with this. + ''' + # Get the latest documents with the students appended. + writing_data = await get_latest_student_documents(student_data) + + # Strip out the unnecessary extra data. + writing_data = await remove_extra_data(writing_data) + + print(">>> WRITE DATA-premerge: {}".format(writing_data)) + + # This is the error. Skipping now. 
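+ # NOTE: get_latest_student_documents() only returns entries for students
+ # who have `last_document` data, while student_data covers every student,
+ # so the zip() inside merge_with_student_data() can pair a text with the
+ # wrong student. The 'student' key already attached to each entry is the
+ # reliable pairing.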
+ writing_data_merge = await merge_with_student_data(writing_data, student_data) + print(">>> WRITE DATA-postmerge: {}".format(writing_data_merge)) + + + # #print(">>>> PRINT WRITE DATA: Merge") + # #print(writing_data) + + # just_the_text = [w.get("text", "") for w in writing_data] + + # annotated_texts = await writing_observer.awe_nlp.process_texts_parallel(just_the_text) + + # for annotated_text, single_doc in zip(annotated_texts, writing_data): + # if annotated_text != "Error": + # single_doc.update(annotated_text) + + writing_data = await merge_with_student_data(writing_data, student_data) + writing_data = await processor(writing_data, options) + + return {'latest_writing_data': writing_data} diff --git a/modules/writing_observer/writing_observer/awe_nlp.py b/modules/writing_observer/writing_observer/awe_nlp.py new file mode 100644 index 000000000..ea57732d4 --- /dev/null +++ b/modules/writing_observer/writing_observer/awe_nlp.py @@ -0,0 +1,279 @@ +''' +This is an interface to AWE_Workbench. +''' + +import asyncio +import enum +import hashlib +import time +import functools +import os +import multiprocessing + +from concurrent.futures import ProcessPoolExecutor + +import spacy +import coreferee +import spacytextblob.spacytextblob +import awe_components.components.lexicalFeatures +import awe_components.components.syntaxDiscourseFeats +import awe_components.components.viewpointFeatures +import awe_components.components.lexicalClusters +import awe_components.components.contentSegmentation +import json +import time +import warnings + +import writing_observer.nlp_indicators +import learning_observer.kvs +import learning_observer.util + +RUN_MODES = enum.Enum('RUN_MODES', 'MULTIPROCESSING SERIAL') + + +def init_nlp(): + ''' + Initialize the spacy pipeline with the AWE components. This takes a while + to run. + ''' + warnings.filterwarnings('ignore', category=UserWarning, module='nltk') + nlp = spacy.load("en_core_web_lg") + + # Adding all of the components, since + # each of them turns out to be implicated in + # the demo list. I note below which ones can + # be loaded separately to support specific indicators. 
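+ # (See the comment block at the top of nlp_indicators.py for which
+ # indicator families each of these components supports.)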
+ nlp.add_pipe('coreferee') + nlp.add_pipe('spacytextblob') + nlp.add_pipe('lexicalfeatures') + nlp.add_pipe('syntaxdiscoursefeatures') + nlp.add_pipe('viewpointfeatures') + nlp.add_pipe('lexicalclusters') + nlp.add_pipe('contentsegmentation') + return nlp + + +nlp = init_nlp() + + +def outputIndicator(doc, indicatorName, itype, stype=None, text=None, added_filter=None): + ''' + A function to output three types of information: summary metrics, + lists of textual information selected by the indicator, and + the offset information for each word or span selected by the indicator + ''' + + indicator = {} + + if added_filter is None: + theFilter = [(indicatorName, [True]), ('is_alpha', [True])] + else: + theFilter = added_filter + theFilter.append(('is_alpha', [True])) + + indicator['metric'] =\ + doc._.AWE_Info(infoType=itype, + indicator=indicatorName, + filters=theFilter, + summaryType=stype) + + data = json.loads( + doc._.AWE_Info(infoType=itype, + indicator=indicatorName, + filters=theFilter)).values() + + indicator['offsets'] = \ + [[entry['offset'], entry['length']] for entry in data] + + if itype == 'Token': + indicator['text'] = \ + json.loads(doc._.AWE_Info(infoType=itype, + indicator=indicatorName, + filters=theFilter, + transformations=['lemma'], + summaryType='uniq')) + else: + indicator['text'] = [] + + for span in indicator['offsets']: + indicator['text'].append(text[int(span[0]):int(span[0]) + int(span[1])]) + + return indicator + + +def process_text(text, options=None): + ''' + This will extract a dictionary of metadata using Paul's AWE Workbench code. + ''' + doc = nlp(text) + results = {} + + if options is None: + # Do we want options to be everything initially or nothing? + options = writing_observer.nlp_indicators.INDICATORS.keys() + options = [] + + for item in options: + if item not in writing_observer.nlp_indicators.INDICATORS: + continue + indicator = writing_observer.nlp_indicators.INDICATORS[item] + (id, label, infoType, select, filterInfo, summaryType) = indicator + results[id] = outputIndicator(doc, select, infoType, stype=summaryType, text=text, added_filter=filterInfo) + results[id].update({ + "label": label, + "type": infoType, + "name": id, + "summary_type": summaryType + }) + return results + + +async def process_texts_serial(texts, options=None): + ''' + Process a list of texts, in serial. + + For testing / debugging, this will process a single essay. Note that while + labeled async, it's not. If run on the server, it will lock up the main + Python process. + ''' + annotated = [] + for text in texts: + print(text) + annotations = process_text(text, options) + annotations['text'] = text + annotated.append(annotations) + + return annotated + + +executor = None + + +def run_in_fork(func): + ''' + This will run a function in a forked subproces, for isolation. + + I wanted to check if this would solve a bug. It didn't. + ''' + q = multiprocessing.Queue() + thread = os.fork() + if thread != 0: + print("Awaiting queue") + return q.get(block=True) + print("Awaited") + else: + print("Queuing") + q.put(func()) + print("Queued") + os._exit(0) + + +async def process_texts_parallel(texts, options=None): + ''' + This will spin up as many processes as we have cores, and process texts + in parallel. Note that we should confirm issues of thread safety. If + Python does this right, this should run in forked environments, and we + won't run into issues. Otherwise, we'd want to either fork ourselves, or + understand how well spacy, etc. do with parallelism. 
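+
+ A typical call (with hypothetical texts) looks like:
+ annotated = await process_texts_parallel(["First essay...", "Second essay..."])
+
+ Entries that fail to process are intended to come back as the string
+ "Error" (process_texts() below checks for this), although the bare
+ `raise` in the result loop currently re-raises instead of falling back.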
+ ''' + global executor + if executor is None: + executor = ProcessPoolExecutor() + + loop = asyncio.get_running_loop() + result_futures = [] + for text in texts: + processor = functools.partial(process_text, text, options) + # forked_processor = functools.partial(run_in_fork, processor) + result_futures.append(loop.run_in_executor(executor, processor)) + + annotated = [] + for text, result_future in zip(texts, result_futures): + try: + annotations = await result_future + annotations['text'] = text + except Exception: + raise + annotations = "Error" + annotated.append(annotations) + + return annotated + + +async def process_texts(writing_data, options=None, mode=RUN_MODES.MULTIPROCESSING): + ''' + Process texts with caching + 1. Create hash of text + 2. Fetch cache data + 3. Check to see if all options are present + - Yes? Add to results + - No? Record missing options `needed_options` and add to `need_processing` + 4. If we need to process anything process it + 5. Update results/cache + ''' + processor = { + RUN_MODES.MULTIPROCESSING: process_texts_parallel, + RUN_MODES.SERIAL: process_texts_serial + } + + results = [] + need_processing = [] + needed_options = set() + cache = learning_observer.kvs.KVS() + + for writing in writing_data: + text = writing.get('text', '') + if len(text) == 0: + continue + text_hash = 'NLP_CACHE_' + learning_observer.util.secure_hash(text.encode('utf-8')) + text_cache_data = await cache[text_hash] + if text_cache_data is None: + text_cache_data = {} + writing.update(text_cache_data) + missing_options = set(options if options is not None else []).difference(text_cache_data.keys()) + needed_options.update(missing_options) + if len(missing_options) == 0: + results.append(writing) + else: + need_processing.append(writing) + + if len(need_processing) > 0: + just_the_text = [w.get("text", "") for w in need_processing] + annotated_texts = await processor[mode](just_the_text, list(needed_options)) + + for annotated_text, single_doc in zip(annotated_texts, need_processing): + if annotated_text != "Error": + single_doc.update(annotated_text) + text_hash = 'NLP_CACHE_' + learning_observer.util.secure_hash(single_doc['text'].encode('utf-8')) + new_cache = {k: v for k, v in single_doc.items() if k != 'student'} + await cache.set(text_hash, new_cache) + results.extend(need_processing) + return results + + +if __name__ == '__main__': + import time + import writing_observer.sample_essays + # Run over a sample text + example_texts = writing_observer.sample_essays.SHORT_STORIES + t1 = time.time() + results = process_text(example_texts[0]) + t2 = time.time() + print(json.dumps(results, indent=2)) + + # If we want to save some test data, flip this to True + if False: + with open("results.json", "w") as fp: + json.dump(results, fp, indent=2) + print("==============") + results2 = asyncio.run(process_texts_parallel(example_texts[0:8])) + t3 = time.time() + results3 = asyncio.run(process_texts_serial(example_texts[0:8])) + t4 = time.time() + print(results2) + print("Single time", t2 - t1) + print("Parallel time", t3 - t2) + print("Serial time", t4 - t3) + print("Note that these results are imperfect -- ") + print("Errors", len([r for r in results2 if r == "Error"])) + print("Errors", [r if r == "Error" else "--" for r in results2]) diff --git a/modules/writing_observer/writing_observer/module.py b/modules/writing_observer/writing_observer/module.py new file mode 100644 index 000000000..fb0e2c350 --- /dev/null +++ b/modules/writing_observer/writing_observer/module.py @@ -0,0 +1,184 @@ 
+''' +Module definition file + +This may be an examplar for building new modules too. +''' + +# Outgoing APIs +# +# Generically, these would usually serve JSON to dashboards written as JavaScript and +# HTML. These used to be called 'dashboards,' but we're now hosting those as static +# files. + +import learning_observer.stream_analytics.helpers as helpers + +import writing_observer.aggregator +import writing_observer.writing_analysis +from writing_observer.nlp_indicators import INDICATOR_JSONS + + +NAME = "The Writing Observer" + +COURSE_AGGREGATORS = { + "writing_observer": { + "sources": [ # These are the reducers whose outputs we aggregate + writing_observer.writing_analysis.time_on_task, + writing_observer.writing_analysis.reconstruct + # TODO: "roster" + ], + # Then, we pass the per-student data through the cleaner, if provided. + "cleaner": writing_observer.aggregator.sanitize_and_shrink_per_student_data, + # And we pass an array of the output of that through the aggregator + "aggregator": writing_observer.aggregator.aggregate_course_summary_stats, + "name": "This is the main Writing Observer dashboard.", + # This is what we return for a student for whom we have no data + # (or if we have data, don't have these fields) + "default_data": { + 'writing_observer.writing_analysis.reconstruct': { + 'text': None, + 'position': 0, + 'edit_metadata': {'cursor': [2], 'length': [1]} + }, + 'writing_observer.writing_analysis.time_on_task': { + 'saved_ts': -1, + 'total_time_on_task': 0 + } + } + }, + "latest_data": { + "sources": [ + writing_observer.writing_analysis.last_document + ], + "name": "Show the latest student writing", + "aggregator": writing_observer.aggregator.latest_data + } +} + +STUDENT_AGGREGATORS = { +} + +# Incoming event APIs +REDUCERS = [ + { + 'context': "org.mitros.writing_analytics", + 'scope': writing_observer.writing_analysis.gdoc_scope, + 'function': writing_observer.writing_analysis.time_on_task + }, + { + 'context': "org.mitros.writing_analytics", + 'scope': writing_observer.writing_analysis.gdoc_scope, + 'function': writing_observer.writing_analysis.reconstruct + }, + { + 'context': "org.mitros.writing_analytics", + 'scope': writing_observer.writing_analysis.student_scope, + 'function': writing_observer.writing_analysis.event_count + }, + { + 'context': "org.mitros.writing_analytics", + 'scope': writing_observer.writing_analysis.student_scope, + 'function': writing_observer.writing_analysis.document_list + }, + { + 'context': "org.mitros.writing_analytics", + 'scope': writing_observer.writing_analysis.student_scope, + 'function': writing_observer.writing_analysis.last_document + } +] + + +# Required client-side JavaScript downloads +THIRD_PARTY = { + "require.js": { + "url": "https://requirejs.org/docs/release/2.3.6/comments/require.js", + "hash": "d1e7687c1b2990966131bc25a761f03d6de83115512c9ce85d72e4b9819fb" + "8733463fa0d93ca31e2e42ebee6e425d811e3420a788a0fc95f745aa349e3b01901" + }, + "text.js": { + "url": "https://raw.githubusercontent.com/requirejs/text/" + "3f9d4c19b3a1a3c6f35650c5788cbea1db93197a/text.js", + "hash": "fb8974f1633f261f77220329c7070ff214241ebd33a1434f2738572608efc" + "8eb6699961734285e9500bbbd60990794883981fb113319503208822e6706bca0b8" + }, + "r.js": { + "url": "https://requirejs.org/docs/release/2.3.6/r.js", + "hash": "52300a8371df306f45e981fd224b10cc586365d5637a19a24e710a2fa566f" + "88450b8a3920e7af47ba7197ffefa707a179bc82a407f05c08508248e6b5084f457" + }, + "bulma.min.css": { + "url": "https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.0/css/" + 
"bulma.min.css", + "hash": "ec7342883fdb6fbd4db80d7b44938951c3903d2132fc3e4bf7363c6e6dc52" + "95a478c930856177ac6257c32e1d1e10a4132c6c51d194b3174dc670ab8d116b362" + }, + "fontawesome.js": { + "url": "https://use.fontawesome.com/releases/v5.3.1/js/all.js", + "hash": "83e7b36f1545d5abe63bea9cd3505596998aea272dd05dee624b9a2c72f96" + "62618d4bff6e51fafa25d41cb59bd97f3ebd72fd94ebd09a52c17c4c23fdca3962b" + }, + "showdown.js": { + "url": "https://rawgit.com/showdownjs/showdown/1.9.1/dist/showdown.js", + "hash": "4fe14f17c2a1d0275d44e06d7e68d2b177779196c6d0c562d082eb5435eec" + "4e710a625be524767aef3d9a1f6a5b88f912ddd71821f4a9df12ff7dd66d6fbb3c9" + }, + "showdown.js.map": { + "url": "https://rawgit.com/showdownjs/showdown/1.9.1/dist/showdown.js.map", + "hash": "74690aa3cea07fd075942ba9e98cf7297752994b93930acb3a1baa2d3042a" + "62b5523d3da83177f63e6c02fe2a09c8414af9e1774dad892a303e15a86dbeb29ba" + }, + "mustache.min.js": { + "url": "http://cdnjs.cloudflare.com/ajax/libs/mustache.js/3.1.0/" + "mustache.min.js", + "hash": "e7c446dc9ac2da9396cf401774efd9bd063d25920343eaed7bee9ad878840" + "e846d48204d62755aede6f51ae6f169dcc9455f45c1b86ba1b42980ccf8f241af25" + }, + "d3.v5.min.js": { + "url": "https://d3js.org/d3.v5.min.js", + "hash": "466fe57816d719048885357cccc91a082d8e5d3796f227f88a988bf36a5c2" + "ceb7a4d25842f5f3c327a0151d682e648cd9623bfdcc7a18a70ac05cfd0ec434463" + }, + "bulma-tooltip-min.css": { + "url": "https://cdn.jsdelivr.net/npm/@creativebulma/bulma-tooltip@1.2.0/" + "dist/bulma-tooltip.min.css", + "hash": "fc37b25fa75664a6aa91627a7b1298a09025c136085f99ba31b1861f073a0" + "696c4756cb156531ccf5c630154d66f3059b6b589617bd6bd711ef665079f879405" + } +} + + +# We're still figuring this out, but we'd like to support hosting static files +# from the git repo of the module. +# +# This allows us to have a Merkle-tree style record of which version is deployed +# in our log files. +STATIC_FILE_GIT_REPOS = { + 'writing_observer': { + # Where we can grab a copy of the repo, if not already on the system + 'url': 'https://github.com/ETS-Next-Gen/writing_observer.git', + # Where the static files in the repo lie + 'prefix': 'modules/writing_observer/writing_observer/static', + # Branches we serve. This can either be a whitelist (e.g. which ones + # are available) or a blacklist (e.g. which ones are blocked) + 'whitelist': ['master'] + } +} + + +# We're kinda refactoring the stuff above to below +# +# The stuff above will become APIs to dashboards. The stuff below +# will register the actual dashboards. +COURSE_DASHBOARDS = [{ + 'name': "Writing Observer", + 'url': "/static/repos/writing_observer/writing_observer/master/wobserver.html", + "icon": { + "type": "fas", + "icon": "fa-pen-nib" + } +}] + +EXTRA_VIEWS = [{ + 'name': 'NLP Options', + 'suburl': 'nlp-options', + 'static_json': INDICATOR_JSONS +}] diff --git a/modules/writing_observer/writing_observer/nlp_indicators.py b/modules/writing_observer/writing_observer/nlp_indicators.py new file mode 100644 index 000000000..b495c4cdd --- /dev/null +++ b/modules/writing_observer/writing_observer/nlp_indicators.py @@ -0,0 +1,130 @@ +from recordclass import dataobject, asdict + +# Define a set of indicators with the kind of filtering/summariation we want +# +# Academic Language, Latinate Words, Low Frequency Words, Adjectives, Adverbs, +# Sentences, Paragraphs -- +# just need to have lexicalfeatures in the pipeline to run. 
+# +# Transition Words, Ordinal Transition Words -- +# -- shouldonly need syntaxdiscoursefeats in the pipeline to run +# +# Information Sources, Attributions, Citations, Quoted Words, Informal Language +# Argument Words, Emotion Words, Character Trait Words, Concrete Details -- +# Need lexicalfeatures + syntaxdiscoursefeats + viewpointfeatures to run +# +# Main idea sentences, supporting idea sentences, supporting detail sentences -- +# Need the full pipeline to run, though the main dependencies are on +# lexicalclusters and contentsegmentation +# +# Format for this list: Label, type of indicator (Token or Doc), indicator name, +# filter (if needed), summary function to use +SPAN_INDICATORS = [ + # language + ('Academic Language', 'Token', 'is_academic', None, 'percent'), + ('Informal Language', 'Token', 'vwp_interactive', None, 'percent'), + ('Latinate Words', 'Token', 'is_latinate', None, 'percent'), + ('Opinion Words', 'Token', 'vwp_evaluation', None, 'total'), + ('Emotion Words', 'Token', 'vwp_emotionword', None, 'percent'), + # vwp_emotion_states looks for noun/emotion word pairs (takes a lot of resources) - ignoring for now + # Argumentation + # ('Argumentation', 'Token', 'vwp_argumentation', None, 'percent'), # most resource heavy - ignoring for now + ('Argument Words', 'Token', 'vwp_argumentword', None, 'percent'), # more surfacey # TODO needs new label + ('Explicit argument', 'Token', 'vwp_explicit_argument', None, 'percent'), # surfacey # TODO needs new label + # statements + ('Statements of Opinion', 'Doc', 'vwp_statements_of_opinion', None, 'percent'), + ('Statements of Fact', 'Doc', 'vwp_statements_of_fact', None, 'percent'), + # Transitions + # eventually we want to exclude \n\n as transitions using `[('!=', ['introductory'])]` + # however the introductory category also includes "let us" and "let's" + # no highlighting is shown on the new lines, so we won't remove it for now. 
+ ('Transition Words', 'Doc', 'transitions', None, 'counts'), + # + ('Positive Transition Words', 'Doc', 'transitions', [('==', ['positive'])], 'total'), + ('Conditional Transition Words', 'Doc', 'transitions', [('==', ['conditional'])], 'total'), + ('Consequential Transition Words', 'Doc', 'transitions', [('==', ['consequential'])], 'total'), + ('Contrastive Transition Words', 'Doc', 'transitions', [('==', ['contrastive'])], 'total'), + ('Counterpoint Transition Words', 'Doc', 'transitions', [('==', ['counterpoint'])], 'total'), + ('Comparative Transition Words', 'Doc', 'transitions', [('==', ['comparative'])], 'total'), + ('Cross Referential Transition Words', 'Doc', 'transitions', [('==', ['crossreferential'])], 'total'), + ('Illustrative Transition Words', 'Doc', 'transitions', [('==', ['illustrative'])], 'total'), + ('Negative Transition Words', 'Doc', 'transitions', [('==', ['negative'])], 'total'), + ('Emphatic Transition Words', 'Doc', 'transitions', [('==', ['emphatic'])], 'total'), + ('Evenidentiary Transition Words', 'Doc', 'transitions', [('==', ['evidentiary'])], 'total'), + ('General Transition Words', 'Doc', 'transitions', [('==', ['general'])], 'total'), + ('Ordinal Transition Words', 'Doc', 'transitions', [('==', ['ordinal'])], 'total'), + ('Purposive Transition Words', 'Doc', 'transitions', [('==', ['purposive'])], 'total'), + ('Periphrastic Transition Words', 'Doc', 'transitions', [('==', ['periphrastic'])], 'total'), + ('Hypothetical Transition Words', 'Doc', 'transitions', [('==', ['hypothetical'])], 'total'), + ('Summative Transition Words', 'Doc', 'transitions', [('==', ['summative'])], 'total'), + ('Introductory Transition Words', 'Doc', 'transitions', [('==', ['introductory'])], 'total'), + # pos_ + ('Adjectives', 'Token', 'pos_', [('==', ['ADJ'])], 'total'), + ('Adverbs', 'Token', 'pos_', [('==', ['ADV'])], 'total'), + ('Nouns', 'Token', 'pos_', [('==', ['NOUN'])], 'total'), + ('Proper Nouns', 'Token', 'pos_', [('==', ['PROPN'])], 'total'), + ('Verbs', 'Token', 'pos_', [('==', ['VERB'])], 'total'), + ('Numbers', 'Token', 'pos_', [('==', ['NUM'])], 'total'), + ('Prepositions', 'Token', 'pos_', [('==', ['ADP'])], 'total'), + ('Coordinating Conjunction', 'Token', 'pos_', [('==', ['CCONJ'])], 'total'), + ('Subordinating Conjunction', 'Token', 'pos_', [('==', ['SCONJ'])], 'total'), + ('Auxiliary Verb', 'Token', 'pos_', [('==', ['AUX'])], 'total'), + ('Pronoun', 'Token', 'pos_', [('==', ['PRON'])], 'total'), + # sentence variety + ('Sentence Types', 'Doc', 'sentence_types', None, 'counts'), + ('Simple Sentences', 'Doc', 'sentence_types', [('==', ['Simple'])], 'total'), + ('Simple with Complex Predicates', 'Doc', 'sentence_types', [('==', ['SimpleComplexPred'])], 'total'), + ('Simple with Compound Predicates', 'Doc', 'sentence_types', [('==', ['SimpleCompoundPred'])], 'total'), + ('Simple with Compound Complex Predicates', 'Doc', 'sentence_types', [('==', ['SimpleCompoundComplexPred'])], 'total'), + ('Compound Sentences', 'Doc', 'sentence_types', [('==', ['Compound'])], 'total'), + ('Complex Sentences', 'Doc', 'sentence_types', [('==', ['Complex'])], 'total'), + ('Compound Complex Sentences', 'Doc', 'sentence_types', [('==', ['CompoundComplex'])], 'total'), + # Sources/Attributes/Citations/Quotes + ('Information Sources', 'Token', 'vwp_source', None, 'percent'), + ('Attributions', 'Token', 'vwp_attribution', None, 'percent'), + ('Citations', 'Token', 'vwp_cite', None, 'percent'), + ('Quoted Words', 'Token', 'vwp_quoted', None, 'percent'), + # Dialogue + ('Direct Speech 
Verbs', 'Doc', 'vwp_direct_speech', None, 'percent'), + ('Indirect Speech', 'Token', 'vwp_in_direct_speech', None, 'percent'), + # vwp_quoted - already used above + # tone + ('Positive Tone', 'Token', 'vwp_tone', [('>', [.4])], 'percent'), + ('Negative Tone', 'Token', 'vwp_tone', [('<', [-.4])], 'percent'), + # details + ('Concrete Details', 'Token', 'concrete_details', None, 'percent'), + ('Main Idea Sentences', 'Doc', 'main_ideas', None, 'total'), + ('Supporting Idea Sentences', 'Doc', 'supporting_ideas', None, 'total'), + ('Supporting Detail Sentences', 'Doc', 'supporting_details', None, 'total'), + # Other items + ('Polysyllabic Words', 'Token', 'nSyll', [('>', [3])], 'percent'), + ('Low Frequency Words', 'Token', 'max_freq', [('<', [4])], 'percent'), + ('Sentences', 'Doc', 'sents', None, 'total'), + ('Paragraphs', 'Doc', 'delimiter_\n', None, 'total'), + ('Character Trait Words', 'Token', 'vwp_character', None, 'percent'), + ('In Past Tense', 'Token', 'in_past_tense_scope', None, 'percent'), + ('Explicit Claims', 'Doc', 'vwp_propositional_attitudes', None, 'percent'), + ('Social Awareness', 'Doc', 'vwp_social_awareness', None, 'percent') +] + +# Create indicator dict to easily refer to each tuple above by name +INDICATORS = {} +INDICATOR_W_IDS = [] +for indicator in SPAN_INDICATORS: + id = indicator[0].lower().replace(' ', '_') + INDICATOR_W_IDS.append((id, ) + indicator) + INDICATORS[id] = (id, ) + indicator + + +class NLPIndicators(dataobject): + id: str + name: str + type: str + parent: str + filters: list + function: str + # tooltip: str + + +indicators = map(lambda ind: NLPIndicators(*ind), INDICATOR_W_IDS) +INDICATOR_JSONS = [asdict(ind) for ind in indicators] diff --git a/modules/writing_observer/writing_observer/reconstruct_doc.py b/modules/writing_observer/writing_observer/reconstruct_doc.py new file mode 100644 index 000000000..6370d5095 --- /dev/null +++ b/modules/writing_observer/writing_observer/reconstruct_doc.py @@ -0,0 +1,257 @@ +''' +This can reconstruct a Google Doc from Google's JSON requests. It +is based on the reverse-engineering by James Somers in his blog +post about the Traceback extension. The code is, obviously, all +new. + +See: `http://features.jsomers.net/how-i-reverse-engineered-google-docs/` +''' + +import json + + +class google_text(object): + ''' + We encapsulate a string object to support a Google Doc snapshot at a + point in time. Right now, this adds cursor position. In the future, + we might annotate formatting and similar properties. + ''' + def __new__(cls): + ''' + Constructor. We create a blank document to be populated. + ''' + new_object = object.__new__(cls) + new_object._text = "" + new_object._position = 0 + new_object._edit_metadata = {} + new_object.fix_validity() + return new_object + + def assert_validity(self): + ''' + We do integrity checks. We store cursor length and text length in + two lists for efficiency, and for now, this just confirms they're + the same length. + ''' + cursor_array_length = len(self._edit_metadata["cursor"]) + textlength_array_length = len(self._edit_metadata["length"]) + length_difference = cursor_array_length - textlength_array_length + if length_difference != 0: + raise Exception( + "Edit metadata length doesn't match. This should never happen." + ) + + def fix_validity(self): + ''' + Check we satisify invariants, and if not, fix them. This is helpful + for graceful degredation. We also use this to initalize the object. 
+ ''' + errors_found = [] + + if "cursor" not in self._edit_metadata: + self._edit_metadata["cursor"] = [] + errors_found.append("No cursor array") + if "length" not in self._edit_metadata: + self._edit_metadata["length"] = [] + errors_found.append("No length array") + + # We expect edit metadata to be the same length. We went + # from tabular to columnar which does not guarantee this + # invariant, unfortunately. We should evaluate if this + # optimization was premature, but it's a lot more compact. + cursor_array_length = len(self._edit_metadata["cursor"]) + textlength_array_length = len(self._edit_metadata["length"]) + length_difference = cursor_array_length - textlength_array_length + if length_difference > 0: + print("Mismatching lengths. This should never happen!") + self._edit_metadata["length"] += [0] * length_difference + errors_found.append("Mismatching lengths") + if length_difference < 0: + print("Mismatching lengths. This should never happen!") + self._edit_metadata["cursor"] += [0] * -length_difference + errors_found.append("Mismatching lengths") + return errors_found + + def from_json(json_rep): + ''' + Class method to deserialize from JSON + + For null objects, it will create a new Google Doc. + ''' + new_object = google_text.__new__(google_text) + if json_rep is None: + json_rep = {} + new_object._text = json_rep.get('text', '') + new_object._position = json_rep.get('position', 0) + new_object._edit_metadata = json_rep.get('edit_metadata', {}) + new_object.fix_validity() + return new_object + + def update(self, text): + ''' + Update the text. Note that we should probably combine this + with updating the cursor position, since if text updates, + the cursor should always update too. + ''' + self._text = text + + def len(self): + ''' + Length of the string + ''' + return len(self._text) + + @property + def position(self): + ''' + Cursor postion. Perhaps we should rename this? + ''' + return self._position + + @position.setter + def position(self, p): + ''' + Update cursor position. + + Side effect: Update Deane arrays. + ''' + self._edit_metadata['length'].append(len(self._text)) + self._edit_metadata['cursor'].append(p) + self._position = p + + @property + def edit_metadata(self): + ''' + Return edit metadata. For now, this is length / cursor position + arrays, but perhaps we should rename this as we expect more + analytics. + ''' + return self._edit_metadata + + def __str__(self): + ''' + This returns __just__ the text of the document (no metadata) + ''' + return self._text + + @property + def json(self): + ''' + This serializes to JSON. + ''' + return { + 'text': self._text, + 'position': self._position, + 'edit_metadata': self._edit_metadata + } + + +def command_list(doc, commands): + ''' + This will process a list of commands. It is helpful either when + loading the history of a new doc, or in updating a document from + new `save` requests. + ''' + for item in commands: + if item['ty'] in dispatch: + doc = dispatch[item['ty']](doc, **item) + else: + print("Unrecogized Google Docs command: " + repr(item['ty'])) + # TODO: Log issue and fix it! + return doc + + +def multi(doc, mts, ty): + ''' + Handles a batch of commands. + + `mts` is the list of commands + `ty` is always `mlti` + ''' + doc = command_list(doc, mts) + return doc + + +def insert(doc, ty, ibi, s): + ''' + Insert new text. 
+ * `ty` is always `is` + * `ibi` is where the insert happens + * `s` is the string to insert + ''' + doc.update("{start}{insert}{end}".format( + start=doc._text[0:ibi - 1], + insert=s, + end=doc._text[ibi - 1:] + )) + + doc.position = ibi + len(s) + + return doc + + +def delete(doc, ty, si, ei): + ''' + Delete text. + * `ty` is always `ds` + * `si` is the index of the start of deletion + * `ei` is the end + ''' + doc.update("{start}{end}".format( + start=doc._text[0:si - 1], + end=doc._text[ei:] + )) + + doc.position = si + + return doc + + +def alter(doc, si, ei, st, sm, ty): + ''' + Alter commands change formatting. + + We ignore these for now. + ''' + return doc + + +def null(doc, **kwargs): + ''' + Do nothing. Google sometimes makes null requests. There are also + requests we don't know how to process. + + I'm not quite sure what these are. The command is not JavaScript's + `null` but the string `'null'` + ''' + return doc + + +# This dictionary maps the `ty` parameter to the function which +# handles data of that type. + +# TODO: `ae,``ue,` `de,` and `te` need to be +# reverse-engineered. These happens if we e.g. make a new bullet +# list, or add an image. +dispatch = { + 'ae': null, + 'ue': null, + 'de': null, + 'te': null, + 'as': alter, + 'ds': delete, + 'is': insert, + 'mlti': multi, + 'null': null, + 'sl': null +} + +if __name__ == '__main__': + google_json = json.load(open("sample3.json")) + docs_history = google_json['client']['history']['changelog'] + docs_history_short = [t[0] for t in docs_history] + doc = google_text() + doc = command_list(doc, docs_history_short) + print(doc) + print(doc.position) + print(doc.edit_metadata) diff --git a/modules/writing_observer/writing_observer/sample_essays.py b/modules/writing_observer/writing_observer/sample_essays.py new file mode 100644 index 000000000..e2c4ecf88 --- /dev/null +++ b/modules/writing_observer/writing_observer/sample_essays.py @@ -0,0 +1,264 @@ +''' +This is an interface to a variety of sample texts to play with. +''' + +from enum import Enum +import json +import os +import os.path +import random + +import loremipsum +import wikipedia + + +TextTypes = Enum('TextTypes', [ + "SHORT_STORY", "ARGUMENTATIVE", "LOREM", "WIKI_SCIENCE", "WIKI_HISTORY" +]) + + +def sample_texts(text_type=TextTypes.LOREM, count=1): + ''' + Returns a sample, random essay of the appropriate type + ''' + if text_type == TextTypes.LOREM: + return [lorem() for x in range(count)] + + sources = { + TextTypes.ARGUMENTATIVE: ARGUMENTATIVE_ESSAYS, + TextTypes.SHORT_STORY: SHORT_STORIES, + TextTypes.WIKI_SCIENCE: WIKIPEDIA_SCIENCE, + TextTypes.WIKI_HISTORY: WIKIPEDIA_HISTORY + } + + source = sources[text_type] + + essays = [] + while count > len(source): + essays.extend(source) + count = count - len(source) + + essays.extend(random.sample(source, count)) + + if text_type in [TextTypes.WIKI_SCIENCE, TextTypes.WIKI_HISTORY]: + essays = map(wikitext, essays) + + return [e.strip() for e in essays] + + +def lorem(paragraphs=5): + ''' + Generate lorem ipsum test text. 
+ ''' + return "\n\n".join(loremipsum.get_paragraphs(paragraphs)) + + +CACHE_PATH = os.path.join(os.path.dirname(__file__), "data") + + +def wikitext(topic): + if not os.path.exists(CACHE_PATH): + os.mkdir(CACHE_PATH) + cache_file = os.path.join(CACHE_PATH, f"{topic}.json") + + if not os.path.exists(cache_file): + page = wikipedia.page(topic) + data = { + "content": page.content, + "summary": page.summary, + "title": page.title, + "rev": page.revision_id, + "url": page.url, + "id": page.pageid + } + with open(cache_file, "w") as fp: + json.dump(data, fp, indent=3) + + with open(cache_file) as fp: + data = json.load(fp) + + return data["content"] + + +# Wikipedia topics +WIKIPEDIA_SCIENCE = [ + "Corona_Borealis", "Funerary_art", "Splendid_fairywren", "European_hare", "Exelon_Pavilions", "Northern_rosella" +] + +WIKIPEDIA_HISTORY = [ + "Gare_Montparnasse", "History_of_photography", "Cliff_Palace", + "War_of_the_Fifth_Coalition", "Operation_Overlord", + "Slavery_in_the_United_States", "Dust_Bowl", "The_Rhodes_Colossus" +] + +# Short stories, from GPT-3 +SHORT_STORIES = [ + """The snail had always dreamed of going to space. It was a lifelong dream, and finally, the day had arrived. The snail was strapped into a rocket, and prepared for takeoff. + +As the rocket blasted off, the snail felt a sense of exhilaration. It was finally achieving its dream! The snail looked out the window as the Earth got smaller and smaller. Soon, it was in the vastness of space, floating weightlessly. + +The snail was content, knowing that it had finally accomplished its dream. It would never forget this moment, floating in space, looking at the stars. +""", + """One day, an old man was sitting on his porch, telling jokes to his grandson. The grandson was laughing hysterically at every joke. + +Suddenly, a spaceship landed in front of them. A alien got out and said, "I come in peace! I come from a planet of intelligent beings, and we have heard that humans are the most intelligent beings in the universe. We would like to test your intelligence." + +The old man thought for a moment, then said, "Okay, I'll go first. What has two legs, but can't walk?" + +The alien thought for a moment, then said, "I don't know." + +The old man chuckled and said, "A chair." +""", + """The boy loved dolls. He loved their soft skin, their pretty clothes, and the way they always smelled like roses. He wanted to be a doll himself, so he could be pretty and perfect like them. + +One day, he found a doll maker who promised to make him into a doll. The boy was so excited, and couldn't wait to become a doll. + +The doll maker kept her promise, and the boy became a doll. He was perfect in every way, and he loved it. He loved being pretty and perfect, and he loved the way everyone fussed over him and treated him like a delicate little thing. + +The only problem was that the boy's soul was now trapped inside the doll's body, and he could never be human again. +""", + """The mouse had been hunting the cat for days. It was a big cat, twice her size, with sharp claws and teeth. But the mouse was determined to catch it. + +Finally, she corner the cat in an alley. The cat hissed and slashed at the mouse, but the mouse was quick. She dart to the side and bit the cat's tail. + +The cat yowled in pain and fled, and the mouse triumphantly went home with her prize. +""", + """When I was younger, I dreamt of scaling Mt. Everest. It was the tallest mountain in the world, and I wanted to conquer it. 
+ +But then I was in a car accident that left me paralyzed from the waist down. I was confined to a wheelchair, and my dreams of scaling Everest seemed impossible. + +But I didn't give up. I trained my upper body to be stronger, and I developed a special wheelchair that could handle the rough terrain. + +Finally, after years of preparation, I made it to the top of Everest. It was the hardest thing I'd ever done, but I did it. And it was the best feeling in the world. +""", + """The cucumber and the salmon were both new to the tank. The cucumber was shy and withdrawn, while the salmon was outgoing and friendly. + +The salmon swim over to the cucumber and said hi. The cucumber was surprised, but happy to have made a new friend. + +The two of them became fast friends, and they loved spending time together. The salmon would swim around the cucumber, and the cucumber would wrap itself around the salmon. They were both happy to have found a friend in the other. +""", + """ +"I can't believe we're all going to different colleges," said Sarah. + +"I know," said John. "It's going to be weird not seeing you guys every day." + +"But it's not like we're never going to see each other again," said Jane. "We can still visit each other, and keep in touch." + +"I'm going to miss you guys so much," said Sarah. + +"We're going to miss you too," said John. + +"But we'll always be friends," said Jane. +""", + """ +The Polish winged hussars were a fearsome group of knights who rode into battle on horseback, armed with lances and swords. They were known for their skill in combat and their ability to move quickly and efficiently across the battlefield. The samurai were a similar group of warriors from Japan who were also highly skilled in combat and known for their speed and accuracy. + +One day, a group of samurai were travelling through Poland when they came across a group of winged hussars. The two groups immediately began to battle, and it quickly became clear that the hussars had the upper hand. The samurai were outnumbered and outmatched, and they were soon defeated. + +As the hussars celebrated their victory, one of the samurai walked up to them and bowed. The hussars were surprised by this gesture, and one of them asked the samurai why he had bowed. + +The samurai explained that in his culture, it was customary to bow to one's enemies after a battle. He said that the hussars had fought with honor and skill, and that they deserved his respect. + +The hussars were touched by the samurai's words, and they returned the gesture. From then on, the two groups became friends, and they often fought side by side against their common enemies. +""" +] + +# Argumentative essays, from GPT-3 +ARGUMENTATIVE_ESSAYS = [ + """ +Joe Biden has been in the public eye for over 40 years, and during that time he has shown himself to be a competent and trustworthy leader. He has served as a U.S. Senator from Delaware, and as the Vice President of the United States. In both of these roles, he has demonstrated his commitment to making the lives of Americans better. + +Joe Biden has a long history of fighting for the middle class. He was a key player in the creation of the Affordable Care Act, which has helped millions of Americans get access to quality healthcare. He also helped to pass the American Recovery and Reinvestment Act, which provided a much-needed boost to the economy during the Great Recession. + +Joe Biden is also a strong supporter of gun reform. 
After the tragic shooting at Sandy Hook Elementary School, he led the charge for background checks and other common-sense gun laws. He knows that we need to do more to keep our children safe from gun violence, and he will continue to fight for gun reform as president.
+
+Joe Biden is the right choice for president because he has the experience and the track record to get things done. He has shown that he cares about the American people, and he will fight for the middle class.
+""",
+    """Donald Trump is a successful businessman and television personality who has been in the public eye for over 30 years. He has a proven track record of getting things done, and he has the business acumen to get our economy back on track.
+
+Trump is a strong supporter of the Second Amendment, and he has pledged to protect the right of Americans to bear arms. He also supports the use of stop-and-frisk tactics by police, which have been proven to be effective in reducing crime.
+
+Trump is also a strong supporter of law and order. He has pledged to increase funding for police departments, and he has vowed to crack down on crime in our inner cities. Trump knows that we need to keep our communities safe, and he will make sure that our laws are enforced.
+
+Trump is the right choice for president because he has the experience and the leadership to get things done. He has shown that he cares about the American people, and he will fight for our safety and our economic success.
+""",
+    """The sun orbits the earth. This can be proven by the fact that the sun rises in the east and sets in the west. If the earth was orbiting the sun, then the sun would rise in the west and set in the east.
+
+The sun also appears to be stationary in the sky, while the earth is spinning on its axis. If the sun was orbiting the earth, then it would appear to be moving across the sky.
+
+There are also Biblical passages that support the idea that the sun orbits the earth. In the book of Joshua, the sun is said to stand still in the sky so that Joshua could win a battle. In the book of Psalms, the earth is said to be “established” on its foundations, and the sun is said to “move” around the earth.
+
+The sun orbits the earth. This can be proven by the evidence from astronomy and the Bible. The sun orbits the earth, and this is the truth.
+""",
+    """The United States should invade Mexico. Mexico is a country that is rife with corruption and violence. The Mexican government is unable to control the drug cartels, and as a result, drug-related violence is out of control.
+
+The drug cartels are also responsible for smuggling drugs into the United States. This has led to a rise in drug addiction and crime in the United States.
+
+The Mexican government is also failing to protect its citizens from the cartels. In 2013, over 70,000 people were killed in drug-related violence in Mexico. This is an unacceptable level of violence, and the Mexican government is not doing enough to stop it.
+
+The United States should invade Mexico in order to stop the drug-related violence and to protect the American people. Mexico is not doing enough to stop the cartels, and as a result, innocent people are being killed. The United States has a duty to protect its citizens, and invading Mexico is the best way to do that.
+""",
+    """The world is facing a population crisis. There are too many people on the planet, and resources are becoming scarce. We need to find a way to reduce the population, or else we will all suffer the consequences.
+ +One way to reduce the population is to encourage people to have fewer children. Another way to reduce the population is to encourage people to live longer. + +One way to encourage people to have fewer children is to offer financial incentives. For example, the government could offer a tax break to couples who have only one child. The government could also provide free childcare for couples who have two children or fewer. + +Another way to encourage people to have fewer children is to make it more difficult for couples to have children. For example, the government could make it illegal for couples to have more than two children. The government could also make it more difficult for couples to get married if they already have children. + +One way to encourage people to live longer is to offer financial incentives. For example, the government could offer a tax break to people who live to the age of 80. The government could also provide free healthcare for people who live to the age of 90. + +We need to find a way to reduce the population, or else we will all suffer the consequences. Reducing the population is not an easy task, but it is something that we must do in order to save the planet. +""", + """ +The drinking age should be lowered. The current drinking age of 21 is not working. It has led to an increase in binge drinking among college students, and it has not stopped underage drinking. + +The drinking age should be lowered to 18. This would align the drinking age with the age of majority, and it would allow adults to make their own decisions about drinking. + +The drinking age should be lowered to 18 because it would make it easier for adults to supervise underage drinking. If the drinking age was 21, then adults would be less likely to intervene when they see underage drinking. + +The drinking age should be lowered to 18 because it would allow adults to make their own decisions about drinking. Adults should be able to decide for themselves whether or not they want to drink. + +The drinking age should be lowered to 18. The current drinking age is not working, and it is time for a change. +""", + """The United States should elect the King of England as president because he has the experience and qualifications that are needed to lead the country. The King of England has a long history of ruling over a large and complex country, and he has the necessary skills to deal with the challenges that the US faces. In addition, the King of England is a highly respected world leader, and his election would be a strong statement to the rest of the world that the US is a serious country that is committed to democracy and the rule of law.""", + """Public education should be eliminated, in favor of free labor camps. + +Education is a fundamental human right. It is essential to the exercise of all other human rights and freedoms. It promotes individual and community development, and is essential to the advancement of societies. + +However, public education is not free. It is expensive, and the cost is borne by taxpayers. In addition, public education is not effective. It is not meeting the needs of students, and it is not preparing them for the future. + +Free labor camps would be a more effective and efficient way to educate children. Labor camps would provide children with the opportunity to learn valuable skills, while also providing them with a place to live and work. + +Children in labor camps would not be subjected to the same overcrowded and underfunded classrooms that they are currently in. 
They would have the opportunity to learn in a more hands-on environment, and would be able to apply the skills they learned to real-world situations. + +In addition, labor camps would provide children with the opportunity to earn a living. They would no longer be reliant on their parents or the government for financial support. They would be able to support themselves, and would be less likely to end up in poverty. +""", + """As our population ages, it becomes increasingly important to find ways to keep seniors active and engaged in their communities. One way to do this is to require all seniors to serve a mandatory military duty. + +There are many benefits to having seniors serve in the military. First, it would help to ease the burden on our overstretched military. With more seniors serving, we would not have to rely as heavily on young people to fill the ranks. + +Second, seniors have a lot to offer the military. They are often more mature and level-headed than younger soldiers, and they can provide valuable experience and perspective. + +Third, this would be a great way to get seniors more involved in their communities. They would have a sense of purpose and would be working together for a common goal. + +There are some who may argue that seniors are not physically able to serve in the military. However, there are many ways to accommodate seniors of all physical abilities. For example, they could serve in administrative roles or be paired with younger soldiers to provide support and guidance. + +Overall, requiring seniors to serve in the military would be a great way to keep them active and engaged in their communities. It would also be a valuable asset to our military.""", + """It is time for our society to take a stand against the growing problem of preschool violence. Many people believe that the death penalty is too harsh of a punishment for young children, but I believe that it is necessary in order to send a clear message that violence will not be tolerated. + +Preschools are supposed to be places of learning and growth, not places where children are afraid to go because of the threat of violence. Unfortunately, that is not the reality in many schools today. In the past year alone, there have been several reports of preschoolers being involved in fights and even bringing weapons to school. + +This type of behavior cannot be tolerated. If we want to prevent violence in our schools, we need to send a clear message that it will not be tolerated. The best way to do this is to implement the death penalty at preschools. + +Some people will argue that the death penalty is too harsh of a punishment for young children. However, I believe that it is necessary in order to send a clear message that violence will not be tolerated. If we do not take a stand now, the problem will only get worse. + +Implementing the death penalty at preschools will send a clear message that violence will not be tolerated. It is time for our society to take a stand against the growing problem of preschool violence. +""" +] + +GPT3_TEXTS = { + 'story': SHORT_STORIES, + 'argument': ARGUMENTATIVE_ESSAYS +} diff --git a/modules/writing_observer/writing_observer/static/tile.html b/modules/writing_observer/writing_observer/static/tile.html new file mode 100644 index 000000000..2b3424da2 --- /dev/null +++ b/modules/writing_observer/writing_observer/static/tile.html @@ -0,0 +1,86 @@ + +
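+<!-- Per-student tile template. The role described here is inferred from wobserver.js,
+     not stated in this file: populate_tiles() injects this markup via .html(tile_template)
+     and then fills the elements it selects as .wo-tile-name, .wo-tile-photo, .wo-tile-email,
+     .wo-tile-character-count, .wo-tile-time-on-task and .wo-tile-idle-time. -->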
+
+
+
+
[Loading]
+
+
+ +
+
+
+
+
+
+ +
+ +
+

+ N/A +

+

+ Count +

+
+
+ +
+ +
+

+ N/A +

+

+ Time +

+
+
+ +
+ +
+

+ N/A +

+

+ Idle +

+
+
+
+
+

[Loading]

+
+
+ +
+
diff --git a/modules/writing_observer/writing_observer/static/wo_dashboard.html b/modules/writing_observer/writing_observer/static/wo_dashboard.html new file mode 100644 index 000000000..37dfc2ddd --- /dev/null +++ b/modules/writing_observer/writing_observer/static/wo_dashboard.html @@ -0,0 +1,31 @@ +
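+<!-- Dashboard skeleton. The wiring described here is inferred from wo_loader.js and
+     wobserver.js rather than documented in this file: wo_loader.js selects .main-page and
+     calls wobserver.initialize(), which is assumed to swap this markup in as
+     dashboard_template, replace the "Loading student data" placeholder, and render the
+     student tiles into .wo-tile-sheet via populate_tiles(). -->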
+
+ +
+ +
+ +
... Loading student data ...
+
+
+
+
diff --git a/modules/writing_observer/writing_observer/static/wo_loader.js b/modules/writing_observer/writing_observer/static/wo_loader.js new file mode 100644 index 000000000..128b43b87 --- /dev/null +++ b/modules/writing_observer/writing_observer/static/wo_loader.js @@ -0,0 +1,83 @@ +/* + Top-level JavaScript file. + + This is mostly a loader. + */ + +function ajax(config) +{ + /* + Perhaps overkill, but we'd like to be able to have LO + have modularized URLs. + */ + return function(url) { + // Do AJAX calls with error handling + return new Promise(function(resolve, reject) { + config.d3.json(url) + .then(function(data){ + resolve(data); + }) + .catch(function(data){ + reject(data); + }); + }); + } +} + + +requirejs( + // These are helper functions defined in liblo.js + // + // They allow us to change URL schemes later. + [requireconfig(), + requireexternallib("d3.v5.min.js"), + requireexternallib("mustache.min.js"), + requireexternallib("showdown.js"), + requireexternallib("fontawesome.js"), + requiremodulelib("wobserver.js"), + requiresystemtext("modules/navbar_loggedin.html"), + ], + function(config, // Learning Observer config + d3, mustache, showdown, fontawesome, // 3rd party + wobserver, // The Writing Observer + navbar_li) { // Top bar + // Parse client configuration. + config = JSON.parse(config); + config.d3 = d3; + // Create a function to make AJAX calls based on the + // config. This should move into liblo? + + config.ajax = ajax(config); + function load_dashboard_page(course) { + /* + Classroom writing dashboard + */ + console.log(wobserver); + d3.select(".main-page").text("Loading Writing Observer..."); + wobserver.initialize(d3, d3.select(".main-page"), course, config); + } + + function loggedin_navbar_menu() { + d3.select(".main-navbar-menu").html(mustache.render(navbar_li, { + 'user_name': user_info()['name'], + 'user_picture': user_info()['picture'] + })); + } + + function setup_page() { + const hash_dict = decode_hash(); + if(!authenticated() || !authorized()) { + go_home(); + } + else if(!hash_dict) { + go_home(); + } else if (hash_dict['tool'] === 'WritingObserver') { + load_dashboard_page(hash_dict['course_id']); + loggedin_navbar_menu() + } else { + error("Invalid URL"); + } + } + setup_page(); + } +); diff --git a/modules/writing_observer/writing_observer/static/wobserver.html b/modules/writing_observer/writing_observer/static/wobserver.html new file mode 100644 index 000000000..9ab97cbe2 --- /dev/null +++ b/modules/writing_observer/writing_observer/static/wobserver.html @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + Writing Analysis + + + +
+ + + + +
+ +
+ +
+ + diff --git a/modules/writing_observer/writing_observer/static/wobserver.js b/modules/writing_observer/writing_observer/static/wobserver.js new file mode 100644 index 000000000..8c11f2435 --- /dev/null +++ b/modules/writing_observer/writing_observer/static/wobserver.js @@ -0,0 +1,196 @@ +/* + Main visualization for The Writing Observer + + This is the meat of the system. +*/ + +var student_data; +var summary_stats; +var tile_template; +var d3; + + +var first_time = true; + +function update_time_idle_data(d3tile, data) { + /* + We'd like time idle to proceed smoothly, at 1 second per second, + regardless of server latency. + + When the server updates idle time, we update data attributes + associated with the element, if necessary. We do this here. Then, + we use an interval timer to update the display itself based on + client-side timing. + + We maintain data fields for: + + * Last access + * Server and client time stamps at last access + + When new data comes in, we /only/ update if last access + changed. Otherwise, we compute. + */ + + /* Old data */ + let serverside_update_time = d3.select(d3tile).attr("data-ssut"); + let clientside_time = (new Date()).getTime() / 1000; + let new_serverside_update_time = Math.round(data['writing_observer.writing_analysis.time_on_task']['saved_ts']); + + if(new_serverside_update_time == Math.round(serverside_update_time)) { + // Time didn't change. Do nothing! Continue using the client clock + return; + } + + d3.select(d3tile).attr("data-ssut", summary_stats["current-time"]); + d3.select(d3tile).attr("data-sslat", data['writing_observer.writing_analysis.time_on_task']['saved_ts']); + d3.select(d3tile).attr("data-csut", clientside_time); +} + +function update_time_idle() { + /* + TODO: We should call this once per second to update time idle. Right now, we're calling this from `populate_tiles` + + The logic is described in update_time_idle_data(). + */ + var tiles = d3.selectAll("div.wo-col-tile").each(function(d) { + let serverside_update_time = d3.select(this).attr("data-ssut"); + let ss_last_access = d3.select(this).attr("data-sslat"); + let clientside_update_time = d3.select(this).attr("data-csut"); + let clientside_time = (new Date()).getTime() / 1000; + /* Time idle is computed as: */ + let idle_time = (serverside_update_time - ss_last_access) + (clientside_time - clientside_update_time); + /* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + How long student was idle when we How long ago we were told + last learned their last access time + */ + // 0, -1, etc. 
indicate no data + console.log(serverside_update_time , ss_last_access, clientside_time , clientside_update_time); + console.log((serverside_update_time - ss_last_access), (clientside_time - clientside_update_time), 1) + console.log(idle_time); + if(ss_last_access < 1000000000) { + d3.select(this).select(".wo-tile-idle-time").select("span").text("N/A"); + } else { + d3.select(this).select(".wo-tile-idle-time").select("span").text(rendertime2(idle_time)); + } + }); +} + +function populate_tiles(tilesheet) { + /* Create rows for students */ + console.log("Populating data"); + console.log(student_data); + if(first_time) { + var rows=tilesheet.selectAll("div.wo-row-tile") + .data(student_data) + .enter() + .append("div") + .attr("class", "tile is-ancestor wo-row-tile"); + + /* Create individual tiles */ + var cols=rows.selectAll("div.wo-col-tile") + .data(function(d) { return d; }) // Propagate data down from the row into the elements + .enter() + .append("div") + .attr("class", "tile is-parent wo-col-tile wo-flip-container is-3") + .html(tile_template) + .each(function(d) { + d3.select(this).select(".wo-tile-name").text(d.profile.name.fullName); + var photoUrl = d.profile.photoUrl; + if(photoUrl.startsWith("//")) { + photoUrl = "https:"+d.profile.photoUrl; + } + d3.select(this).select(".wo-tile-photo").attr("src", d.profile.photoUrl); + d3.select(this).select(".wo-tile-email").attr("href", "mailto:"+d.profile.emailAddress); + d3.select(this).select(".wo-tile-phone").attr("href", ""); // TODO + }); + first_time = false; + } + else { + var rows=tilesheet.selectAll("div.wo-row-tile") + .data(student_data) + var cols=rows.selectAll("div.wo-col-tile") + .data(function(d) { return d; }) // Propagate data down from the row into the elements + } + /* Populate them with data */ + var cols_update=rows.selectAll("div.wo-col-tile") + .data(function(d) { console.log(d); return d; }) + .each(function(d) { + console.log(d.profile); + // Profile: Student name, photo, Google doc, phone number, email + d3.select(this).select(".wo-tile-doc").attr("href", ""); // TODO + // Summary stats: Time on task, time idle, and characters in doc + let compiled = d["writing-observer-compiled"]; + let text = compiled.text; + d3.select(this).select(".wo-tile-character-count").select("span").text(compiled["character-count"]); + //d3.select(this).select(".wo-tile-character-count").select("rect").attr("width", 15); + let tot = d["writing_observer.writing_analysis.time_on_task"]; + d3.select(this).select(".wo-tile-time-on-task").select("span").text(rendertime2(tot["total-time-on-task"])); + //d3.select(this).select(".wo-tile-time-on-task").select("rect").attr("width", 15); + d3.select(this).select(".wo-tile-idle-time").select("span").text("Hello"); + + //d3.select(this).select(".wo-tile-idle-time").select("rect").attr("width", 15); + update_time_idle_data(this, d); + // Text + d3.select(this).select(".wo-tile-typing").text(compiled.text); + }); + update_time_idle(); +} + +var dashboard_template; +var Mustache; + +function initialize(D3, div, course, config) { + /* + Populate D3 with the dashboard for the course + */ + d3=D3; + console.log(config); + + div.html(dashboard_template); + dashboard_connection( + { + module: "writing_observer", + course: course + }, + function(data) { + console.log("New data!"); + student_data = data["student-data"]; + summary_stats = data["summary-stats"]; + console.log(summary_stats); + d3.select(".wo-tile-sheet").call(populate_tiles, student_data); + d3.selectAll(".wo-loading").classed("is-hidden", 
true); + console.log("Hide labels?"); + if(config.modules.wobserver['hide-labels']) { + console.log("Hide labels"); + d3.selectAll(".wo-desc-header").classed("is-hidden", true); + } + }); + /* + var tabs = ["typing", "deane", "summary", "outline", "timeline", "contact"]; + for(var i=0; i[^\s/]+)/(?P[a-zA-Z]+)") + + +def get_doc_id_wrapper(event): + """ + Some of the event types (e.g. 'google_docs_save') have + a 'doc_id' which provides a link to the google document. + Others, notably the 'visibility' and 'keystroke' events + do not have doc_id but do have a link to an 'object' + field which in turn contains an 'id' field linking to + the google doc along with other features such as the + title. However other events (e.g. login & visibility) + contain object links with id fields that do not + correspond to a known doc. + + This method provides a simple abstraction that returns + the 'doc_id' value if it exists or returns the 'id' from + the 'object' field if it is present and if the url in + the object field corresponds to a google doc id. + + We use the helper function for doc_url_p to test + this. + """ + + # Handle standard Doc_ID cases first. + Doc_ID = event.get('client', {}).get('doc_id', None) + if (Doc_ID is not None): + return Doc_ID + + # Failing that pull out the url event. + # Object_value = event.get('client', {}).get('object', None) + URL_value = event.get('client', {}).get('object', {}).get('url', None) + if (URL_value is None): + return None + + # Now test if the object has a URL and if that corresponds + # to a doc edit/review URL as opposed to their main page. + # if so return the id from it. In the off chance the id + # is still not present or is none then this will return + # none. + URLMatch = DOC_URL_re.match(URL_value) + if (URLMatch is None): + return None + + Doc_ID = event.get('client', {}).get('object', {}).get('id', None) + return Doc_ID diff --git a/reconstruct/reconstruct.py b/reconstruct/reconstruct.py deleted file mode 100644 index 91dfab5a0..000000000 --- a/reconstruct/reconstruct.py +++ /dev/null @@ -1,37 +0,0 @@ -import json -import pandas - -js = json.load(open("chunked3.json")) - -document = " " - -def apply_changes(document, changes): - for change_line in changes: - #if 'mts' in change[0]: - # del change[0]['mts'] - #print(change_line) - if isinstance(change_line, list): - change = change_line[0] - else: - change = change_line - - # Insert - if change['ty'] == "is": - document = document[:change['ibi']]+change['s']+document[change['ibi']:] - # Multiple changes clumped together - elif change['ty'] == "mlti": - #print(change) - document = apply_changes(document, change['mts']) - # Delete from si to ei - elif change['ty'] == "ds": - document = document[:change["si"]]+document[change["ei"]:] - # This formats text. We can ignore this for now. 
- elif change['ty'] == "as": - pass - #print(change) - else: - raise Exception("Unknown change") - return document - -document = apply_changes(" ", js['changelog']) -print(document) diff --git a/requirements.txt b/requirements.txt index 51a22c80c..6cedd17c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,47 @@ -pandas -tornado -ipython -asyncpg -pyyaml +aiodns +aiofiles aiohttp aiohttp_cors +aiohttp_session +aiohttp_wsgi +asyncio_redis +asyncpg +cryptography +cython +dash +dash-extensions +docopt +dash-bootstrap-components +dash-core-components +dash-html-components +dash-table +faker +filetype +git+https://github.com/pmitros/tsvx.git@09bf7f33107f66413d929075a8b54c36ca581dae#egg=tsvx +git+https://github.com/ArgLab/learning_observer_dash_components +git+https://github.com/testlabauto/loremipsum.git@b7bd71a6651207ef88993045cd755f20747f2a1e#egg=loremipsum +google-auth +ipython +invoke +lxml +names +numpy +pandas +pathvalidate +pep8 +psutil +pyasn1 +py-bcrypt +pycodestyle +pylint +pyyaml +recordclass +redis +scipy +slixmpp +sphinx +svgwrite +uvloop +watchdog +-e gitserve +-e learning_observer diff --git a/servermanagement/AddWOtoVENV.sh b/servermanagement/AddWOtoVENV.sh new file mode 100755 index 000000000..eddaecc62 --- /dev/null +++ b/servermanagement/AddWOtoVENV.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# +# Add AWOtoVENV +# Collin F. Lynch + +# This script takes as argument a specified VENV. It +# then adds the Learning Observer, Writing Observer, and +# the dashboard. Construction of the VENV can be done +# using the SetupVENV script located in this directory. + + +# Argument +# -------------------------------------------- +# This takes a single argument that should point +# to the directory of the VENV. You can then +# use this to make any necessary changes. +VIRTUAL_ENV="$1" +echo "USING VENV: $VIRTUAL_ENV" + + + +# Parameters: +# --------------------------------------------- +# Change these if you need to use a different +# python or pip. Otherwise leave them as-is. +PYTHON_CMD="python" +PIP_CMD="pip" + +CODE_REPOS_LOC="../../" + + +# Activate VENV +# --------------------------------------------------------- +source "$VIRTUAL_ENV/bin/activate" + + +# Installation +# ---------------------------------------------------------- + +# Install basic requirements. +echo -e "\n=== Installing Requirements.txt ===" +cd .. +"$PIP_CMD" install -r requirements.txt + +# If we plan to use a GPU then this line must also +# be run. Comment out the code below if you do +# not want cuda installed or edit it for your +# library version. +echo -e "\n=== Installing Spacy CUDA, comment out if not needed. ===" +echo -e "\n Using CUDA v. 117" +cd .. 
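+# The cuda117 extra below assumes CUDA 11.7. If you are unsure which CUDA toolkit is
+# installed (assuming the NVIDIA tools are on the PATH), a quick check is:
+#   nvidia-smi        # reports the driver's supported CUDA version
+#   nvcc --version    # reports the installed toolkit version
+# and the extra can be adjusted to match, e.g. spacy[cuda11x] for other CUDA 11.x versions.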
+"$PIP_CMD" install spacy[cuda117] + +echo -e "\n=== Installing Learning Observer ===" +cd learning_observer +"$PYTHON_CMD" setup.py develop + + +echo -e "\n=== Installing Writing Observer ===" +cd ../modules/writing_observer +"$PYTHON_CMD" setup.py develop + + +echo -e "\n=== Installing Brad's Dashboard ===" +cd ../wo_highlight_dashboard +"$PYTHON_CMD" setup.py develop + diff --git a/servermanagement/BackupWebSocketLogs.sh b/servermanagement/BackupWebSocketLogs.sh new file mode 100644 index 000000000..c3dbd59a7 --- /dev/null +++ b/servermanagement/BackupWebSocketLogs.sh @@ -0,0 +1,17 @@ +# System Variables +# -------------------------------------- +LOGFILE_SRC="/usr/local/share/Projects/WritingObserver/Repo-Fork/writing_observer/learning_observer/learning_observer/logs" +LOGFILE_DEST="/usr/local/share/Projects/WritingObserver/Repo-Fork/writing_observer/learning_observer/learning_observer/logs" + +# Make the backup name +# --------------------------------------- +LOG_DATE=$(date "+%m-%d-%Y--%H-%M-%S") +BACKUP_NAME="$LOGFILE_DEST/learning_observer_backup_$LOG_DATE.tar.gz" +echo $BACKUP_NAME; + +# Create the backup +# --------------------------------------- +echo "Backing up web socket logs" +find $LOGFILE_SRC -name "????-??-??T*.log" -mmin +60 -print0 | tar -czvf $BACKUP_NAME --null -T - +echo "Removing backed up web sockets logs" +find $LOGFILE_SRC -name "????-??-??T*.log" -mmin +120 -delete diff --git a/servermanagement/RunLearningObserver.sh b/servermanagement/RunLearningObserver.sh new file mode 100755 index 000000000..5fba8041d --- /dev/null +++ b/servermanagement/RunLearningObserver.sh @@ -0,0 +1,38 @@ + +#!/usr/bin/env bash +# =============================== +# RunLearningObserver.sh +# Collin F. Lynch +# +# This bash script provides a simple wrapper to run the +# learning observer service and pipe the data to a logfile +# over time this should be integrated into the systemd +# service process. This uses static variables to specify +# the location of the virtualenv and the command and +# specifies the location for the running logfile. + +# System Variables +# -------------------------------------- +VIRTUALENV_PATH="/usr/local/share/projects/WritingObserver/VirtualENVs/WOvenv" +#VIRTUALENV_PYTHON="/usr/local/share/Projects/WritingObserver/VirtualENVs/learning_observer/bin/python3.9" +LEARNING_OBSERVER_LOC="/usr/local/share/projects/WritingObserver/Repositories/ArgLab_writing_observer/learning_observer" +LOGFILE_DEST="/usr/local/share/projects/WritingObserver/Repositories/ArgLab_writing_observer/learning_observer/learning_observer/logs" + +# Make the logfile name +# --------------------------------------- +LOG_DATE=$(date "+%m-%d-%Y--%H-%M-%S") +LOGFILE_NAME="$LOGFILE_DEST/learning_observer_service_$LOG_DATE.log" +echo $LOG_NAME; + + +# Now run the thing. +# -------------------------------------- +echo "Running Learning Observer Service..." +cd $LEARNING_OBSERVER_LOC +source $VIRTUALENV_PATH/bin/activate +#$($VIRTUALENV_PYTHON $LEARNING_OBSERVER_LOC > $LOG_NAME 2>&1) +nohup python learning_observer > $LOGFILE_NAME 2>&1 & +PROCESS_ID=$! +echo $PROCESS_ID > $LOGFILE_DEST/run.pid +# Set the number of allowed open files to something large 8192 +prlimit --pid $PROCESS_ID --nofile=8192 diff --git a/servermanagement/SetupVENV.sh b/servermanagement/SetupVENV.sh new file mode 100755 index 000000000..14e9c8b51 --- /dev/null +++ b/servermanagement/SetupVENV.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# +# SetupVENV.sh +# Collin F. 
Lynch + +# This script performs the basic VENV setup necessary for our LO +# server. When called it takes as an argument the path for the +# VENV storage and a name. It then generates the VENV and upgrades +# the local pip install. It does *not* install the workbench or +# LO code. That part must be done with separate scripts that +# are located in this folder and in the AWE_Workbench code. + + +# Argument Parsing +# ----------------------------------------------- +# The first argument to the script will specify the name of +# the virtual environment. Use something simple like WOVenv +VIRTUAL_ENV_NAME=$1 + +# The second should be a path to your working directory (above the +# repositories) where you will actually run the code. +VIRTUAL_ENV_LOC=$2 + + +# Parameters +# ----------------------------------------------- +# Change these params if you need to shift python +# or pip versions. Otherwise leave them as-is. + +PYTHON_CMD="python3.9" +PIP_CMD="pip" + + +# Execution +# --------------------------------------------------------- +echo "1) Generating VENV" +"$PYTHON_CMD" -m venv "$VIRTUAL_ENV_LOC/$VIRTUAL_ENV_NAME" + +# Initialize +echo "2) Starting $VIRTUAL_ENV_NAME" +source "$VIRTUAL_ENV_LOC/$VIRTUAL_ENV_NAME/bin/activate" + +# Update the Pip Version. +echo "3) Updgrading Pip" +"$PIP_CMD" install --upgrade pip diff --git a/servermanagement/learning_observer_logrotate b/servermanagement/learning_observer_logrotate new file mode 100644 index 000000000..d3dc6bef5 --- /dev/null +++ b/servermanagement/learning_observer_logrotate @@ -0,0 +1,22 @@ +/path/to/repo/learning_observer/learning_observer/logs/*.pid +{ + daily + rotate 2 + olddir /path/to/backup + compress + missingok + notifempty +} + +/path/to/repo/learning_observer/learning_observer/logs/*.json +/path/to/repo/learning_observer/learning_observer/logs/learning_observer_service*.log +/path/to/repo/learning_observer/learning_observer/logs/debug.log +/path/to/repo/learning_observer/learning_observer/logs/incoming_websocket.log +{ + daily + rotate 5 + olddir /path/to/backup + compress + missingok + notifempty +} diff --git a/testcode/TestRedis.py b/testcode/TestRedis.py new file mode 100644 index 000000000..ca7cb42e8 --- /dev/null +++ b/testcode/TestRedis.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# Simple Asyncio redis test. + + +import asyncio +import asyncio_redis + + +async def example(): + # Create Redis connection + connection = await asyncio_redis.Connection.create(host='localhost', port=6379) + + # Set a key + await connection.set('my_key', 'my_value') + + # When finished, close the connection. + connection.close() + + +if __name__ == '__main__': + loop = asyncio.get_event_loop() + loop.run_until_complete(example()) diff --git a/testcode/WebSocketTest.py b/testcode/WebSocketTest.py new file mode 100644 index 000000000..0ad026e0c --- /dev/null +++ b/testcode/WebSocketTest.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# ================================== +# WebSocketTest.py +# Collin F. Lynch. +# +# This is a simple piece of code that I put together to +# ping the websockets API of the server just to confirm +# that it is running. +# +# Just gets a reject at the moment which is fine. 
+ + +import asyncio +import websockets + +def test_url(url, data=""): + async def inner(): + async with websockets.connect(url) as websocket: + await websocket.send(data) + return asyncio.get_event_loop().run_until_complete(inner()) + +test_url("wss://writing.csc.ncsu.edu/wsapi/in") diff --git a/ux/README.md b/ux/README.md new file mode 100644 index 000000000..6c22c3fdd --- /dev/null +++ b/ux/README.md @@ -0,0 +1,7 @@ +UX Mockup +========= + +This is a UX mockup. The usage terms on the data we used are a bit +ambiguous, so we are not including data in the git repo. We might add +the avatars, since we have permission, but for now, we're omitting due +to size. \ No newline at end of file diff --git a/ux/api b/ux/api new file mode 120000 index 000000000..966e2f7d7 --- /dev/null +++ b/ux/api @@ -0,0 +1 @@ +../uncommitted/ux-api/ \ No newline at end of file diff --git a/ux/deane.html b/ux/deane.html new file mode 100644 index 000000000..56d2f0c1b --- /dev/null +++ b/ux/deane.html @@ -0,0 +1,15 @@ + + + + + + + + +
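+<!-- Standalone mockup page for the Deane graph. deane.js renders into the element it
+     selects as #debug_testing_deane, so this page is assumed to provide that container
+     alongside the "Deane" heading below. -->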

Deane

+
+ + + diff --git a/ux/deane.js b/ux/deane.js new file mode 100644 index 000000000..696197400 --- /dev/null +++ b/ux/deane.js @@ -0,0 +1,185 @@ +const width = 960; // svg width +const height = 500; // svg height +const margin = 5; // svg margin +const padding = 5; // svg padding +const adj = 30; + +/*-------------------------*\ +* * +| Generic utility functions | +| for testing and debugging | +* * +\*-------------------------*/ + + +function consecutive_array(n) { + /* + This creates an array of length n [0,1,2,3,4...n] + */ + return Array(n).fill().map((e,i)=>i+1); +}; + +function zip(a1, a2) { + /* + Clone of Python's zip. + [[1,1],[2,3],[4,5]] => [[1,2,4],[1,3,5]] + */ + return a1.map(function(e, i) { + return [e, a2[i]]; + }); +} + + + +/*-------------------------*\ +* * +| Deane graph code | +* * +\*-------------------------*/ + +export const name = 'deane3'; + +const LENGTH = 30; + +function dummy_data(length) { + /* + Create sample data for a Deane graph. This is basically a random + upwards-facing line for the length, with the cursor somewhere + in between. Totally non-realistic. + */ + function randn_bm() { + /* Approximately Gaussian distribution, mean 0.5 + From https://stackoverflow.com/questions/25582882/javascript-math-random-normal-distribution-gaussian-bell-curve */ + let u = 0, v = 0; + while(u === 0) u = Math.random(); //Converting [0,1) to (0,1) + while(v === 0) v = Math.random(); + let num = Math.sqrt( -2.0 * Math.log( u ) ) * Math.cos( 2.0 * Math.PI * v ); + num = num / 10.0 + 0.5; // Translate to 0 -> 1 + if (num > 1 || num < 0) return randn_bm(); // resample between 0 and 1 + return num; + } + + + function length_array(x) { + /* + Essay length + */ + return x.map((e,i)=> (e*randn_bm(e) + e)/2); + } + + function cursor_array(x) { + /* + Essay cursor position + */ + var length_array = x.map((e,i)=> (e*Math.random()/2 + e*randn_bm()/2)); + return length_array; + } + + var x_edit = consecutive_array(length); // edit number, for X axis + var y_length = length_array(x_edit); // total essay length + var y_cursor = cursor_array(y_length); // cursor position + return { + 'cursor': y_cursor, + 'length': y_length + }; +}; + + +export function setup_deane_graph(div) { + /* + Create UX elements, without data + */ + var svg = div.append("svg") + .attr("preserveAspectRatio", "xMinYMin meet") + .attr("viewBox", "-" + + adj + " -" + + adj + " " + + (width + adj *3) + " " + + (height + adj*3)) + .style("padding", padding) + .style("margin", margin) + .style("border", "1px solid lightgray") + .classed("svg-content", true); + + // Line graph for essay length + svg.append('g') + .append('path') + .attr('class', 'essay-length-lines') + .attr('fill', 'none') + .attr('stroke', 'black') + .attr('stroke-width','2'); + + // Line graph for cursor position + svg.append('g') + .append('path') + .attr('class', 'essay-cursor-lines') + .attr('fill', 'none') + .attr('stroke', 'blue') + .attr('stroke-width','3'); + + // Add x-axis + svg.append('g') // create a element + .attr("transform", "translate(0, "+height+")") + .attr('class', 'x-axis'); // specify classes + + // Add y-axis + svg.append('g') // create a element + .attr('class', 'y-axis') // specify classes + return svg; +}; + +export function populate_deane_graph_data(div, data, max_x=null, max_y=null) { + var svg = div.select('svg'); + if(max_x === null) { + max_x = data['length'].length; + } + if(max_y === null) { + max_y = Math.max(...data['length']); + } + + const yScale = d3.scaleLinear().range([height, 0]).domain([0, max_y]) + const xScale 
= d3.scaleLinear().range([0, width]).domain([0, max_x]) + + var lines = d3.line(); + + var x_edit = consecutive_array(data['length'].length); + + var length_data = zip(x_edit.map(xScale), data['length'].map(yScale)); + var cursor_data = zip(x_edit.map(xScale), data['cursor'].map(yScale)); + + var pathData = lines(length_data); + svg.select('.essay-length-lines') + .attr('d', pathData); + + pathData = lines(cursor_data); + svg.select('.essay-cursor-lines') + .attr('d', pathData) + + var xAxis = d3.axisBottom() + .ticks(4) // specify the number of ticks + .scale(xScale); + var yAxis = d3.axisLeft() + .ticks(4) + .scale(yScale); // specify the number of ticks + + svg.select('.x-axis') + .call(xAxis); // let the axis do its thing + + svg.select('.y-axis') + .call(yAxis); // let the axis do its thing + +} + +export function deane_graph(div) { + var svg = setup_deane_graph(div); + + var data = dummy_data(LENGTH); + + var y_length = data['length']; + var y_cursor = data['cursor']; + + populate_deane_graph_data(div, data); + return svg; +} + +d3.select("#debug_testing_deane").call(deane_graph); diff --git a/ux/media b/ux/media new file mode 120000 index 000000000..7e6fe8f8c --- /dev/null +++ b/ux/media @@ -0,0 +1 @@ +../uncommitted/ux-media/ \ No newline at end of file diff --git a/ux/outline.html b/ux/outline.html new file mode 100644 index 000000000..c2ff6663b --- /dev/null +++ b/ux/outline.html @@ -0,0 +1,15 @@ + + + + + + + + +

Outline

+
+ + + diff --git a/ux/outline.js b/ux/outline.js new file mode 100644 index 000000000..d8c3fb6df --- /dev/null +++ b/ux/outline.js @@ -0,0 +1,67 @@ +const LENGTH = 30; + +const width = 960; +const height = 650; +const margin = 5; +const padding = 5; +const adj = 30; + +export const name = 'outline'; + +var test_data = { "outline": [ + { "section": "Problem 1", "length": 300}, + { "section": "Problem 2", "length": 30}, + { "section": "Problem 3", "length": 900}, + { "section": "Problem 4", "length": 1200}, + { "section": "Problem 5", "length": 400} +]}; + +var maximum = 1500; + +export function outline(div, data=test_data) { + div.html(""); + //div.append("p").text("In progress -- just piping data in") + var svg = div.append("svg") + .attr("preserveAspectRatio", "xMinYMin meet") + .attr("viewBox", "-" + + adj + " -" + + adj + " " + + (width + adj *3) + " " + + (height + adj*3)) + .style("padding", padding) + .style("margin", margin) + .style("border", "1px solid lightgray") + .classed("svg-content", true); + + console.log(data.outline); + + var outline_data = data.outline.reverse(); + const yScale = d3.scaleBand().range([height, 0]).domain(outline_data.map(d=>d['section'])); + const xScale = d3.scaleLinear().range([0, width]).domain([0, 1]) + + var normed_x = (x) => x / maximum; + + svg.selectAll(".barRect") + .data(outline_data) + .enter() + .append("rect") + .attr("x", (d) => xScale(1-normed_x(d['length']))) + .attr("y", function(d) { return yScale(d['section']);}) + .attr("width", function(d) { return xScale(normed_x(d['length']));}) + .attr("height", yScale.bandwidth()) + .attr("fill", "#ccccff"); + + svg.selectAll(".barText") + .data(outline_data) + .enter() + .append("text") + .attr("x", 0) + .attr("y", (d) => yScale(d['section']) + yScale.bandwidth()/2) + .attr("font-size", "3.5em") + .attr("font-family", 'BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif') + .text((d) => d['section']); + + return svg; +} + +d3.select("#debug_testing_outline").call(outline).call(console.log); diff --git a/ux/package.json b/ux/package.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/ux/package.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/ux/summary_stats.html b/ux/summary_stats.html new file mode 100644 index 000000000..de487579b --- /dev/null +++ b/ux/summary_stats.html @@ -0,0 +1,15 @@ + + + + + + + + +

Summary Stats

+
+ + + diff --git a/ux/summary_stats.js b/ux/summary_stats.js new file mode 100644 index 000000000..d86b06b4d --- /dev/null +++ b/ux/summary_stats.js @@ -0,0 +1,124 @@ +const LENGTH = 30; + +const width = 960; +const height = 650; +const margin = 5; +const padding = 5; +const adj = 30; + +export const name = 'summary_stats'; + +var bar_names = { + "speed": "Typing speed", + "essay_length": "Length", + "writing_time": "Writing time", + "text_complexity": "Text complexity", + "time_idle": "Time idle" +}; + +var maxima = { + "ici": 1000, + "speed": 1300, + "essay_length": 10000, + "writing_time": 60, + "text_complexity": 12, + "time_idle": 30 +} + +var test_data = { + "ici": 729.664923175084, + "essay_length": 2221, + "writing_time": 42.05237247614963, + "text_complexity": 4.002656228025943, + "time_idle": 0.24548328432300075 +}; + + +export function summary_stats(div, data=test_data) { + div.html(""); + div.append("p").text("In progress -- just piping data in") + var svg = div.append("svg") + .attr("preserveAspectRatio", "xMinYMin meet") + .attr("viewBox", "-" + + adj + " -" + + adj + " " + + (width + adj *3) + " " + + (height + adj*3)) + .style("padding", padding) + .style("margin", margin) + .style("border", "1px solid lightgray") + .classed("svg-content", true); + + data['speed'] = 60000 / data['ici'] + + var data_ordered = [ + ['essay_length', data['essay_length']], + ['time_idle', data['time_idle']], + ['writing_time', data['writing_time']], + ['text_complexity', data['text_complexity']], + ['speed', data['speed']] + ].reverse(); + + const yScale = d3.scaleBand().range([height, 0]).domain(data_ordered.map(d=>d[0])); //labels); + const xScale = d3.scaleLinear().range([0, width]).domain([0, 1]) + + var y = (d) => data[d]; + var normed_x = (x) => data[x] / maxima[x]; + + function rendertime(t) { + function str(i) { + if(i<10) { + return "0"+String(i); + } + return String(i) + } + var seconds = Math.floor((t - Math.floor(t)) * 60); + var minutes = Math.floor(t) % 60; + var hours = Math.floor(t/60) % 60; + var rendered = str(seconds); + if (minutes>0 || hours>0) { + rendered = str(minutes)+":"+rendered; + } else { + rendered = rendered + " sec"; + } + if (hours>0) { + rendered = str(rendered)+":"+rendered; + } + return rendered + } + + function label(d) { + var prettyprint = { + 'essay_length': (d) => String(d) +" characters", + 'time_idle': rendertime, + 'writing_time': rendertime, + 'text_complexity': Math.floor, + 'speed': (d) => Math.floor(d) + " CPM" + } + return bar_names[d[0]] + ": " + prettyprint[d[0]](String(d[1])); + } + + svg.selectAll(".barRect") + .data(data_ordered) + .enter() + .append("rect") + .attr("x", xScale(0)) + .attr("y", (d) => yScale(d[0])) + .attr("width", (d) => xScale(normed_x(d[0]))) + .attr("height", yScale.bandwidth()) + .attr("fill", "#ccccff") + + svg.selectAll(".barText") + .data(data_ordered) + .enter() + .append("text") + .attr("x", 0) + .attr("y", (d) => yScale(d[0]) + yScale.bandwidth()/2) + .attr("font-size", "3.5em") + .attr("font-family", 'BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif') + .text((d) => label(d)) + ; + return svg; +} + +d3.select("#debug_testing_summary").call(summary_stats).call(console.log); diff --git a/ux/typing.html b/ux/typing.html new file mode 100644 index 000000000..1253eda43 --- /dev/null +++ b/ux/typing.html @@ -0,0 +1,15 @@ + + + + + + + +

Essay

+

+

+ + + diff --git a/ux/typing.js b/ux/typing.js new file mode 100644 index 000000000..e9f30436f --- /dev/null +++ b/ux/typing.js @@ -0,0 +1,59 @@ +export const name = 'typing'; + +const SAMPLE_TEXT = "I like the goals of this petition and the bills, but as drafted, these bills just don't add up. We want to put our economy on hold. We definitely need a rent freeze. For that to work, we also need a mortgage freeze, not a mortgage forbearance. The difference is that in a mortgage forbearance, interest adds up and at the end, your principal is higher than when you started. In a mortgage freeze, the principal doesn't change -- you just literally push back all payments by a few months."; + +export function typing(div, ici=200, text=SAMPLE_TEXT) { + function randn_bm() { + /* Approximately Gaussian distribution, mean 0.5 + From https://stackoverflow.com/questions/25582882/javascript-math-random-normal-distribution-gaussian-bell-curve */ + let u = 0, v = 0; + while(u === 0) u = Math.random(); //Converting [0,1) to (0,1) + while(v === 0) v = Math.random(); + let num = Math.sqrt( -2.0 * Math.log( u ) ) * Math.cos( 2.0 * Math.PI * v ); + num = num / 10.0 + 0.5; // Translate to 0 -> 1 + if (num > 1 || num < 0) return randn_bm(); // resample between 0 and 1 + return num; + } + + function sample_ici(typing_delay=200) { + /* + Intercharacter interval -- how long between two keypresses + + We do an approximate Gaussian distribution around the + */ + return typing_delay * randn_bm() * 2; + } + + var start = 0; + var stop = 1; + const MAXIMUM_LENGTH = 250; + + function updateText() { + //document.getElementsByClassName("typing")[0].innerText=text.substr(start, stop-start); + div.text(text.substr(start, stop-start)); + stop = stop + 1; + + if(stop > text.length) { + stop = 1; + start = 0; + } + + start = Math.max(start, stop-MAXIMUM_LENGTH); + while((text[start] != ' ') && (start>0) && (startstop) { + start=stop; + } + + if(div.size() > 0) { + setTimeout(updateText, sample_ici(ici)); + }; + } + setTimeout(updateText, sample_ici(50)); +}; + +//typing(); + +d3.select(".typingdebug-typing").call(typing); + diff --git a/ux/ux.css b/ux/ux.css new file mode 100644 index 000000000..29030635f --- /dev/null +++ b/ux/ux.css @@ -0,0 +1,39 @@ +.wa-row-tile { + min-height: 350px; +} + +.wa-col-tile { + min-height: 350px; +} + +/* Flip based on https://davidwalsh.name/css-flip */ +.wa-flip-container { + perspective: 1000px; +} + +.wa-flipper { + transition: 0.6s; + transform-style: preserve-3d; + + position: relative; +} + +.wa-front, .wa-back { + backface-visibility: hidden; + + position: absolute; + top: 20px; + left: 20px; +} + +/* front pane, placed above back */ +.wa-front { + z-index: 2; + /* for firefox 31 */ + transform: rotateY(0deg); +} + +/* back, initially hidden pane */ +.wa-back { + transform: rotateY(180deg); +} diff --git a/ux/ux.html b/ux/ux.html new file mode 100644 index 000000000..2ff54a621 --- /dev/null +++ b/ux/ux.html @@ -0,0 +1,277 @@ + + + + + + + + + + + + + + + + + + + + Writing Analysis + + + + + + + +
+
+ + +
+
+ + + + diff --git a/ux/writing.js b/ux/writing.js new file mode 100644 index 000000000..afcc03cf6 --- /dev/null +++ b/ux/writing.js @@ -0,0 +1,62 @@ +import { deane_graph } from './deane.js' +import { typing } from './typing.js' +import { summary_stats } from './summary_stats.js' +import { outline } from './outline.js' + +var student_data = [[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16],[17,18,19]]; + +const tile_template = document.getElementById('template-tile').innerHTML + +function populate_tiles(tilesheet) { + var rows=tilesheet.selectAll("div.wa-row-tile") + .data(student_data) + .enter() + .append("div") + .attr("class", "tile is-ancestor wa-row-tile"); + + var cols=rows.selectAll("div.wa-col-tile") + .data(function(d) { return d; }) + .enter() + .append("div") + .attr("class", "tile is-parent wa-col-tile wa-flip-container is-3") + .html(function(d) { + return Mustache.render(tile_template, d); + /*{ + name: d.name, + body: document.getElementById('template-deane-tile').innerHTML + });*/ + }) + .each(function(d) { + d3.select(this).select(".typing-text").call(typing, d.ici, d.essay); + }) + .each(function(d) { + d3.select(this).select(".deane").call(deane_graph); + }) + .each(function(d) { + d3.select(this).select(".summary").call(summary_stats, d); + }) + .each(function(d) { + d3.select(this).select(".outline").call(outline, d); + }); +} + +function select_tab(tab) { + return function() { + d3.selectAll(".tilenav").classed("is-active", false); + d3.selectAll(".tilenav-"+tab).classed("is-active", true); + d3.selectAll(".wa-tilebody").classed("is-hidden", true); + d3.selectAll("."+tab).classed("is-hidden", false); + } +}; + +var tabs = ["typing", "deane", "summary", "outline", "timeline", "contact"]; +for(var i=0; i