
    dh(                        S SK r S SKrS SKJr  S SKJrJrJrJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  S SKJr   " S	 S
\5      r\S:X  a7  \" SSSS9r\R-                  5       r\" S\" \5       S\R4                   35        gg)    N)Path)AnyListOptionalTuple)unquote)Document)DirectoryLoader)PyPDFLoader)WebBaseLoaderc                   J  ^  \ rS rSrSr     SS\S\S\S\\\\4      S\\	   S	\S
\4U 4S jjjr
SS jrS\\   4S jrS\S\4S jrS\S\\   4S jrS\S\\   4S jrS\\   SS4S jrS\\   4S jrS\S\\   4S jrS\SS4S jrS\S\4S jrS\S\4S jrSrU =r$ )BlackboardLoader   a  Load a `Blackboard` course.

This loader is not compatible with all Blackboard courses. It is only
compatible with courses that use the new Blackboard interface.
To use this loader, you must have the BbRouter cookie. You can get this
cookie by logging into the course and then copying the value of the
BbRouter cookie from the browser's developer tools.

Example:
    .. code-block:: python

        from langchain_community.document_loaders import BlackboardLoader

        loader = BlackboardLoader(
            blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
            bbrouter="expires:12345...",
        )
        documents = loader.load()

Nblackboard_course_urlbbrouterload_all_recursively
basic_authcookiescontinue_on_failureshow_progressc                 V  > [         TU ]  UUUS9   UR                  S5      S   U l        Ub  X@R
                  l        Uc  0 nUR                  SU05        U R
                  R                  R                  U5        X0l	        U R                  5         g! [         a    [	        S5      ef = f)a  Initialize with blackboard course url.

The BbRouter cookie is required for most blackboard courses.

Args:
    blackboard_course_url: Blackboard course url.
    bbrouter: BbRouter cookie.
    load_all_recursively: If True, load all documents recursively.
    basic_auth: Basic auth credentials.
    cookies: Cookies.
    continue_on_failure: whether to continue loading the sitemap if an error
        occurs loading a url, emitting a warning instead of raising an
        exception. Setting this to True makes the loader more robust, but also
        may result in missing data. Default: False
    show_progress: whether to show a progress bar while loading. Default: True

Raises:
    ValueError: If blackboard course url is invalid.
)	web_pathsr   r   z/webapps/blackboardr   zpInvalid blackboard course url. Please provide a url that starts with https://<blackboard_url>/webapps/blackboardNBbRouter)super__init__splitbase_url
IndexErrorsessionauthupdater   r   	check_bs4)	selfr   r   r   r   r   r   r   	__class__s	           g/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/blackboard.pyr   BlackboardLoader.__init__$   s    : 	, 3' 	 	
	1778MNqQDM ! *LL?G
H-.##G,$8!  	> 	s   B B(returnc                 @     SSK ng! [         a    [        S5      ef = f)zdCheck if BeautifulSoup4 is installed.

Raises:
    ImportError: If BeautifulSoup4 is not installed.
r   NzeBeautifulSoup4 is required for BlackboardLoader. Please install it with `pip install beautifulsoup4`.)bs4ImportError)r#   r)   s     r%   r"   BlackboardLoader.check_bs4Y   s,    	 	G 	s    c                 P   U R                   (       a  U R                  5       nU R                  U5      U l        U R	                  U5      n/ nU Ht  nU R
                  U-   n[        SU 35        U R                  U5      n[        R                  " [        5         UR                  U R                  U5      5        SSS5        Mv     U$ [        SU R                   35        U R                  5       nU R                  U5      U l        U R                  U5      $ ! , (       d  f       M  = f)zBLoad data into Document objects.

Returns:
    List of Documents.
zFetching documents from N)r   scrape_get_folder_pathfolder_path
_get_pathsr   print_scrape
contextlibsuppress
ValueErrorextend_get_documentsweb_path)r#   	soup_inforelative_paths	documentspathurls         r%   loadBlackboardLoader.loadg   s     $$I#44Y?D!__Y7NI&mmd*067 LL-	((4$$T%8%8%CD 54	 ' ,T]]O<=I#44Y?D&&y11 54s   !D
D%	soupc                    UR                  SSS05      nUc  [        S5      eUR                  R                  5       n[	        U5      R                  SS5      R                  SS5      R                  SS5      R                  S	S5      R                  S
S5      R                  SS5      R                  SS5      R                  SS5      n[        S5      U-  n[        U5      $ )zvGet the folder path to save the Documents in.

Args:
    soup: BeautifulSoup4 soup object.

Returns:
    Folder path.
spanidcrumb_1zNo course name found. _/:,?'!".)findr5   textstripr   replacer   str)r#   r@   course_namecourse_name_cleanr/   s        r%   r.   !BlackboardLoader._get_folder_path   s     iiy(9:455!&&,,. K WS#WS#WS#WS#WS#WS#WS#WS# 	 3i"33;    c                 j    U R                  U5      nU R                  U5        U R                  5       nU$ )z|Fetch content from page and return Documents.

Args:
    soup: BeautifulSoup4 soup object.

Returns:
    List of documents.
)_get_attachments_download_attachments_load_documents)r#   r@   attachmentsr;   s       r%   r7   BlackboardLoader._get_documents   s6     ++D1"";/((*	rW   c                 B   SSK JnJn  UR                  SSS05      nUc  [	        S5      e/ nUR                  SSS05       HZ  nUR                  S5       HB  nUR                  S	5      nUc  M  UR                  S
5      (       a  M1  UR                  U5        MD     M\     U$ )zqGet all attachments from a page.

Args:
    soup: BeautifulSoup4 soup object.

Returns:
    List of attachments.
r   )BeautifulSoupTagulclasscontentListzNo content list found.r\   ahref#)	r)   r_   r`   rO   r5   find_allget
startswithappend)	r#   r@   r_   r`   content_listr\   
attachmentlinkre   s	            r%   rY   !BlackboardLoader._get_attachments   s     	+ yy'?@566&//w6NOJ"++C0xx'#DOOC,@,@&&t,	 1 P rW   r\   c                 ~    [        U R                  5      R                  SSS9  U H  nU R                  U5        M     g)zGDownload all attachments.

Args:
    attachments: List of attachments.
T)parentsexist_okN)r   r/   mkdirdownload)r#   r\   rl   s      r%   rZ   &BlackboardLoader._download_attachments   s8     	T$$TD$A%JMM*% &rW   c                 X    [        U R                  S[        S9nUR                  5       nU$ )zCLoad all documents in the folder.

Returns:
    List of documents.
z*.pdf)r<   glob
loader_cls)r
   r/   r   r>   )r#   loaderr;   s      r%   r[    BlackboardLoader._load_documents   s0     !!!"
 KKM	rW   c                     / nUR                  SSS05      nUc  [        S5      eUR                  S5       HB  nUR                  S5      nUc  M  UR	                  S5      (       d  M1  UR                  U5        MD     U$ )z%Get all relative paths in the navbar.ra   rb   
courseMenuzNo course menu found.rd   re   rG   )rO   r5   rg   rh   ri   rj   )r#   r@   r:   course_menurm   re   s         r%   r0   BlackboardLoader._get_paths   s{    iiw&=>455((-D88F#DDOOC$8$8%%d+ . rW   r<   c                 6   U R                   R                  U R                  U-   SS9nU R                  UR                  5      n[        [        U R                  5      U-  S5       nUR                  UR                  5        SSS5        g! , (       d  f       g= f)z@Download a file from an url.

Args:
    path: Path to the file.
T)allow_redirectswbN)
r   rh   r   parse_filenamer=   openr   r/   writecontent)r#   r<   responsefilenamefs        r%   rs   BlackboardLoader.download   st     <<##DMMD$8$#O&&x||4$t''(83T:aGGH$$% ;::s   %B


Br=   c                     [        U5      =n(       a  UR                  S:X  a  UR                  $ U R                  U5      $ )zlParse the filename from an url.

Args:
    url: Url to parse the filename from.

Returns:
    The filename.
.pdf)r   suffixname_parse_filename_from_url)r#   r=   url_paths      r%   r   BlackboardLoader.parse_filename   s9     S	!H!x&'@== 0055rW   c                    [         R                  " SU5      nU(       a  UR                  S5      nO[        SU 35      eSU;  a  [        SU 35      eUR	                  S5      S   S-   n[        U5      nUR                  SS5      nU$ )	zParse the filename from an url.

Args:
    url: Url to parse the filename from.

Returns:
    The filename.

Raises:
    ValueError: If the filename could not be parsed.
zfilename%2A%3DUTF-8%27%27(.+)   zCould not parse filename from r   zIncorrect file type: r   z%20rE   )researchgroupr5   r   r   rR   )r#   r=   filename_matchesr   s       r%   r   )BlackboardLoader._parse_filename_from_url  s     99%EsK'--a0H=cUCDD!4XJ?@@>>&)!,v58$##E3/rW   )r   r/   r   )TNNFT)r'   N)__name__
__module____qualname____firstlineno____doc__rS   boolr   r   dictr   r"   r   r	   r>   r   r.   r7   rY   rZ   r[   r0   rs   r   r   __static_attributes____classcell__)r$   s   @r%   r   r      sK   2 &*04"&$)"3"3 3 #	3
 U38_-3 $3 "3 3 3j2d8n 20 S  S  <3 4> S T#Y 8
&c 
&t 
&h "
s 
tCy 
&S &T &6# 6# 6C C  rW   r   __main__zhttps://<YOUR BLACKBOARD URL HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=resetz<YOUR BBROUTER COOKIE HERE>T)r   zLoaded z pages of PDFs from )r3   r   pathlibr   typingr   r   r   r   urllib.parser   langchain_core.documentsr	   .langchain_community.document_loaders.directoryr
   (langchain_community.document_loaders.pdfr   -langchain_community.document_loaders.web_baser   r   r   rx   r>   r;   r1   lenr8    rW   r%   <module>r      s     	  - -   - J @ GT} Tn z	C 	&!F I	GC	N##77H
IJ rW   