“Parent” webpage has html element: data-xwiki-reference=“xwiki:IW.Parent.WebHome” data-xwiki-document=“IW.Parent.WebHome” data-xwiki-wiki=“xwiki” data-xwiki-space=“IW.Parent” data-xwiki-page=“WebHome” data-xwiki-isnew=“false” data-xwiki-version=“1.4” data-xwiki-rest-url=“/xwiki/rest/wikis/xwiki/spaces/IW/spaces/Parent/pages/WebHome” data-xwiki-locale=“” data-xwiki-form-token=“abcd123” data-xwiki-user-reference=“xwiki:XWiki.{username}” data-xwiki-page-ready=“true”
“Child” webpage has html element: data-xwiki-reference=“xwiki:IW.Parent.Child.WebHome” data-xwiki-document=“IW.Parent.Child.WebHome” data-xwiki-wiki=“xwiki” data-xwiki-space=“IW.Parent.Child” data-xwiki-page=“WebHome” data-xwiki-isnew=“false” data-xwiki-version=“7.4” data-xwiki-rest-url=“/xwiki/rest/wikis/xwiki/spaces/IW/spaces/Parent/spaces/Child/pages/WebHome” data-xwiki-locale=“” data-xwiki-form-token=“abcd123” data-xwiki-user-reference=“xwiki:XWiki.{username}” data-xwiki-page-ready=“true”
I want to be able to on a large scale scrape through the main base_url and get all space (like “Parent”) and then scrape each space to get all the pages within (like “Child”)
So I used the REST API to attempt to do so with just “Parent”
on a more generic sense I tried to get from just the knowledge of “Parent” what “Child” is:
def get_subspaces(base_url, space_path=""):
"""
Retrieves a list of subspaces within a specified space path.
"""
url = f"{base_url}/spaces/{space_path}/spaces" # Access subspaces
response = requests.get(url, auth=HTTPBasicAuth(username, password))
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'xml')
subspaces = [space.get_text() for space in soup.find_all('name') if space.get_text() != space_path] # Exclude self-reference
print(f"Subspaces in '{space_path}':", subspaces)
return subspaces
else:
print(f"Failed to retrieve subspaces. Status code: {response.status_code}")
return []
# Function to get the WebHome page for a subspace
def get_webhome_page(base_url, full_space_path):
"""
Retrieves the WebHome page content for a specified subspace.
"""
url = f"{base_url}/spaces/{full_space_path}/pages/WebHome"
response = requests.get(url, auth=HTTPBasicAuth(username, password))
if response.status_code == 200:
print(f"WebHome content for '{full_space_path}':")
print(response.text)
else:
print(f"Failed to retrieve WebHome for '{full_space_path}'. Status code: {response.status_code}")
# Main logic to retrieve subspaces and WebHome pages
main_space = "IW.Materialgruppen"
subspaces = get_subspaces(base_api_url, main_space) # Get subspaces within "Materialgruppen"
# Retrieve the WebHome page for each subspace
for subspace in subspaces:
full_space_path = f"{main_space}.{subspace}" # Construct full path
get_webhome_page(base_api_url, full_space_path)
which printed an empty list (why does it not find “Child”?)
so I used “Child” to at least check if it exists
def check_direct_subspace(base_url, main_space, subspace_name):
"""
Attempts to directly retrieve the WebHome page of a known subspace.
"""
full_space_path = f"{main_space}.{subspace_name}"
url = f"{base_url}/spaces/{full_space_path}/pages"
response = requests.get(url, auth=HTTPBasicAuth(username, password))
if response.status_code == 200:
print(f"\nWebHome content for '{full_space_path}':")
print(response.text)
else:
print(f"Failed to retrieve WebHome for '{full_space_path}'. Status code: {response.status_code}")
# Directly check known subspaces under IW.Parent
main_space = "IW.Parent"
known_subspaces = ["Child"]
for subspace in known_subspaces:
check_direct_subspace(base_api_url, main_space, subspace)
which prints:
WebHome content for 'IW.Parent.Child':
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><pages xmlns="http://www.xwiki.org"/>
so then I do:
def get_nested_spaces(base_url, full_space_path):
"""
Lists any nested spaces within a specified space.
"""
url = f"{base_url}/spaces/{full_space_path}/spaces"
response = requests.get(url, auth=HTTPBasicAuth(username, password))
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'xml')
nested_spaces = [space.get_text() for space in soup.find_all('name')]
print(f"Nested spaces within '{full_space_path}':", nested_spaces)
return nested_spaces
else:
print(f"Failed to retrieve nested spaces for '{full_space_path}'. Status code: {response.status_code}")
return []
get_nested_spaces(base_api_url, "IW.Parent.Child")
which prints:
Nested spaces within 'IW.Parent.Child': ['IW.Parent.Child']
['IW.Parent.Child']
ok so then i try to read the content from this:
def get_page_content(base_url, wiki_name, space_name, page_name, username, password):
"""
Retrieves the content of a specified page from XWiki using the REST API.
:param base_url: The base URL of the XWiki instance.
:param wiki_name: The name of the wiki (usually 'xwiki').
:param space_name: The name of the space containing the page.
:param page_name: The name of the page.
:param username: The username for authentication.
:param password: The password for authentication.
:return: The content of the specified page.
"""
# Construct the URL for the page content
url = f"{base_url}/xwiki/rest/wikis/{wiki_name}/spaces/{space_name}/pages/{page_name}"
# Send the GET request to retrieve the page content
response = requests.get(url, auth=HTTPBasicAuth(username, password), headers={"Accept": "application/xml"})
if response.status_code == 200:
# Parse the response to extract the content
soup = BeautifulSoup(response.text, 'xml')
content = soup.find('content').get_text()
print(f"Content of '{space_name}.{page_name}':", content)
return content
else:
print(f"Failed to retrieve content for '{space_name}.{page_name}'. Status code: {response.status_code}")
return None
# Example usage
base_api_url = "https://iwdemo01.de-gmbh.com"
wiki_name = "xwiki"
space_name = "IW.Materialgruppen"
page_name = "Batterie"
get_page_content(base_api_url, wiki_name, space_name, page_name, username, password)
which sadly prints:
Failed to retrieve content for 'IW.Materialgruppen.Batterie'. Status code: 404
How do I actually get, read, and save the content from https://{domain}/xwiki/bin/view/IW/Parent/Child/ within perhaps a json file?